# COMP615 - Assignment Two Notebook

## Imports
This cell contains all necessary imports for the entire notebook.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # Added for completeness, as per assignment

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.exceptions import ConvergenceWarning
import warnings

# Ignore warnings for cleaner output
# Both filters are process-wide: every later cell runs with RuntimeWarning and
# ConvergenceWarning suppressed.
# NOTE(review): this presumably hides scikit-learn's non-convergence warnings
# for the small max_iter values tried in Part B - confirm that is intended.
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Data Loading

In [None]:
# Read the two CSV files (paths are relative to the notebook's working directory).
# `df` is used as the training data and `dftest` as the held-out test data by
# the cells below; both are expected to contain a 'class' column.
df = pd.read_csv('training.csv')
dftest = pd.read_csv('testing.csv')

# Part A: K-Nearest Neighbors (KNN) and Naïve Bayes

## A2. Perform Exploratory Data Analysis (EDA)

### Dataset Overview and Integrity Check
First, let's get a basic understanding of the dataset's structure, data types, and check for any missing or duplicate values.

In [None]:
# Structural summary (columns, dtypes, non-null counts) of each split.
for banner, frame in (
    ("--- Training Set Information ---", df),
    ("\n\n--- Testing Set Information ---", dftest),
):
    print(banner)
    frame.info()

# Integrity check: total number of missing cells and of fully duplicated rows.
print('\n\n--- Null and Duplicate Value Check ---')
for split_heading, frame in (
    ('\nTraining Set:', df),
    ('\nTesting Set:', dftest),
):
    null_total = frame.isnull().sum().sum()
    duplicate_total = frame.duplicated().sum()
    print(split_heading)
    print(f"Total null values: {null_total}")
    print(f"Total duplicate values: {duplicate_total}")

### Class Distribution
Visualize the distribution of the target variable (`class`) to check for any class imbalance.

In [None]:
# Bars are ordered from the most to the least frequent class.
class_order = df['class'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='class', data=df, order=class_order, ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency (Count)')
# Slant the class names so long labels do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

### Data Visualization: Feature Characteristics

#### Histogram of Feature Variances
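Low-variance features may not be very informative. Let's visualize the spread of variances across all features. Note that these variances are computed on the raw, unscaled features, so they also reflect each feature's units and range.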

In [None]:
# Everything except the target column is treated as a numeric feature.
numeric_df = df.drop(columns='class')
feature_variances = numeric_df.var()

# One histogram entry per feature: how many features fall in each variance bin.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(feature_variances, bins=30, edgecolor='black')
ax.set_title('Histogram of Feature Variances')
ax.set_xlabel('Variance')
ax.set_ylabel('Number of Features (Frequency)')
ax.grid(axis='y', alpha=0.75)
plt.show()

#### Correlation Heatmap
A correlation heatmap helps identify multicollinearity, where features are highly correlated with each other. This is especially important for models that assume feature independence, like Naïve Bayes.

In [None]:
# Pairwise correlation between all feature columns.
correlation_matrix = numeric_df.corr()

# Fixed, symmetric colour scale so that 0 correlation sits at the colormap centre.
heatmap_style = dict(annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_style)
ax.set_title('Correlation Heatmap of Features', fontsize=15)
fig.tight_layout()
plt.show()

#### Boxplots for Selected Features by Class
Let's examine the distributions of a few potentially important features across the different classes.

In [None]:
target_column_name = 'class'
selected_features_for_boxplot = ['NDVI', 'Mean_NIR', 'GLCM2', 'Bright', 'Mean_R']

# One panel per feature on a 2x3 grid (the sixth slot stays empty).
fig = plt.figure(figsize=(18, 12))
for panel_number, feature_name in enumerate(selected_features_for_boxplot, start=1):
    ax = fig.add_subplot(2, 3, panel_number)
    sns.boxplot(x=target_column_name, y=feature_name, data=df, ax=ax)
    ax.set_title(f'Boxplot of {feature_name}\nby {target_column_name}')
    ax.set_xlabel(target_column_name)
    ax.set_ylabel(feature_name)

fig.tight_layout()
plt.show()

## A3. Feature Selection and Analysis
We use the ANOVA F-test (`f_classif`) to identify the top 5 features that have the strongest relationship with the target class.

In [None]:
# Rank features with the ANOVA F-test, report the strongest ones together with
# their scores, and draw one per-class boxplot for each of them.
X = df.drop('class', axis=1)
y = df['class']

# NOTE(review): y_encoded is not read by any cell visible in this notebook
# (the selector below is fitted on the raw labels); the two names are kept so
# the notebook namespace is unchanged.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Never ask for more features than the dataset has.
n_top_features = min(5, X.shape[1])
selector = SelectKBest(score_func=f_classif, k=n_top_features)
selector.fit(X, y)

# get_support() reports the selected columns in their original column order,
# not by score, so rank them explicitly before presenting them as a "top" list.
selected_features_indices = selector.get_support(indices=True)
f_scores = pd.Series(selector.scores_, index=X.columns)
top_feature_scores = f_scores.iloc[selected_features_indices].sort_values(ascending=False)
top_features_names = top_feature_scores.index.tolist()

print("Top 5 features by F score:")
for name, score in top_feature_scores.items():
    print(f"- {name} (F = {score:.2f})")

# X and y both come from df and share its index, so a plain assignment keeps
# every label attached to its own row whatever the index looks like.
df_plot_selected = X[top_features_names].copy()
df_plot_selected['class'] = y
class_order_for_plot = df['class'].value_counts().index

# One figure per selected feature, strongest first.
for feature_name in top_features_names:
    plt.figure(figsize=(8, 5))
    sns.boxplot(x='class', y=feature_name, data=df_plot_selected, order=class_order_for_plot)
    plt.title(f'Distribution of  {feature_name} by Class')
    plt.ylabel(f'{feature_name}')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## Data Preprocessing for Models
Here, we define our training and test sets and scale the features using `StandardScaler`. Scaling is crucial for distance-based algorithms like KNN and helps with the convergence of models like MLP.

In [None]:
# Separate features from the target in both splits.
X_train, y_train = df.drop(columns='class'), df['class']
X_test, y_test = dftest.drop(columns='class'), dftest['class']

# The scaler learns mean/std from the training features only; the same
# statistics are then applied to the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

train_shape, test_shape = X_train_scaled.shape, X_test_scaled.shape
print(f"Training data scaled. Shape: {train_shape}")
print(f"Test data scaled. Shape: {test_shape}")

## A6. K-Nearest Neighbors (KNN) Model Building and Evaluation

### Finding the Optimal 'k' using Cross-Validation
We'll test a range of `k` values and use 5-fold cross-validation to find the value that yields the highest average accuracy.

In [None]:
# Candidate neighbourhood sizes: 1 through 20.
k_range = range(1, 21)
cv_scores = []

print("--- Finding Best K using 5-Fold Cross-Validation ---")
for n_neighbors in k_range:
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
    fold_accuracies = cross_val_score(
        candidate, X_train_scaled, y_train, cv=5, scoring='accuracy'
    )
    mean_accuracy = fold_accuracies.mean()
    cv_scores.append(mean_accuracy)
    print(f"k={n_neighbors}, CV Mean Accuracy: {mean_accuracy:.4f}")

# argmax returns the first maximum, so ties resolve to the smallest k.
best_position = np.argmax(cv_scores)
best_k_cv = k_range[best_position]
print(f"\nBest k based on cross-validation: {best_k_cv} with CV accuracy: {max(cv_scores):.4f}")

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, cv_scores, marker='o', linestyle='dashed')
ax.set_title('KNN Performance for different k values (Cross-Validation)')
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Mean Cross-Validated Accuracy')
ax.set_xticks(k_range)
ax.grid(True)
plt.show()

### Final KNN Model Evaluation
Now we train the KNN model using the best `k` found and evaluate its performance on the unseen test data.

In [None]:
# Refit KNN on the full scaled training set with the cross-validated k.
k = best_k_cv
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X_train_scaled, y_train)

# Score the fitted model on the test split.
y_pred_knn = knn_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_knn)
print(f"KNN Model Accuracy (k={k}): {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn))

# Use every label that occurs in either the truth or the predictions so the
# matrix rows and columns line up with their names.
observed_labels = np.concatenate((y_test, y_pred_knn))
class_labels = np.unique(observed_labels)
cm = confusion_matrix(y_test, y_pred_knn, labels=class_labels)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm_df,
    annot=True,        # write the count inside each cell
    fmt='d',           # counts are integers
    cmap='Blues',
    linewidths=.5,
    linecolor='gray',
    cbar=True,
    ax=ax,
)
ax.set_title(f'Confusion Matrix for KNN (k={best_k_cv})', fontsize=15)
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()
plt.show()

# Part B: Exploring Artificial Neural Networks

## B2 & B3. Baseline Model (Single Hidden Layer) and Loss Tracking
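We use `GridSearchCV` with 10-fold cross-validation to find the best number of hidden neurons (k) and maximum number of iterations for a single-hidden-layer MLP. We then train a new model with these parameters, this time with early stopping enabled (10% of the training data is held out for validation), and plot its training loss and validation accuracy over the epochs.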

In [None]:
print("--- Part B(b): Finding Best Parameters for Single-Layer MLP ---")

# Search space: a single hidden layer of 5, 10, 15, 20 or 25 neurons, crossed
# with five iteration budgets (50 to 250 in steps of 50).
single_layer_shapes = [(n_neurons,) for n_neurons in (5, 10, 15, 20, 25)]
param_grid = {
    'hidden_layer_sizes': single_layer_shapes,
    'max_iter': list(range(50, 251, 50)),
}

base_mlp = MLPClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_mlp,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

print("--- Starting GridSearchCV to find best hyperparameters ---")
grid_search.fit(X_train_scaled, y_train)

print("\n--- Grid Search Results ---")
best_params_single_layer = grid_search.best_params_
baseline_accuracy = grid_search.best_score_
print(f"Best parameters found: {best_params_single_layer}")
print(f"Highest 10-fold CV Accuracy: {baseline_accuracy:.4f}")

print("\n--- Part B(c): Creating and training the optimal model for plotting ---")

# The model below is trained and scored on integer-encoded class labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Same hyperparameters as the grid-search winner, plus early stopping on a
# 10% validation split so that validation scores are recorded per iteration.
early_stopping_options = dict(
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=15,
)
optimal_mlp = MLPClassifier(**best_params_single_layer, **early_stopping_options)
optimal_mlp.fit(X_train_scaled, y_train_encoded)

final_accuracy = optimal_mlp.score(X_test_scaled, y_test_encoded)
print(f"Final test accuracy of the optimal model: {final_accuracy:.4f}")
print(f"Number of iterations run (due to early stopping): {optimal_mlp.n_iter_}")

# Left axis: training loss. Right (twin) axis: validation accuracy.
fig, loss_ax = plt.subplots(figsize=(12, 7))
loss_ax.plot(optimal_mlp.loss_curve_, label='Training Loss', color='blue')
k_mlp = optimal_mlp.hidden_layer_sizes[0]
loss_ax.set_title(f'Optimal MLP (k={k_mlp}) - Loss & Validation Accuracy vs. Iterations', fontsize=14)
loss_ax.set_xlabel('Iteration (Epoch)', fontsize=12)
loss_ax.set_ylabel('Training Loss', color='blue', fontsize=12)
loss_ax.legend(loc='upper left')
loss_ax.grid(True)

ax2 = loss_ax.twinx()
ax2.plot(optimal_mlp.validation_scores_, label='Validation Accuracy', color='green', linestyle='--')
ax2.set_ylabel('Validation Accuracy', color='green', fontsize=12)
ax2.legend(loc='upper right')

plt.show()

## B4. Experimenting with Two Hidden Layers
Here, we experiment with splitting the total number of neurons (`k` from the best single-layer model) across two hidden layers to see if a deeper architecture improves performance.

In [None]:
# Split the best single-layer neuron count across two hidden layers in every
# possible way, score each split with 10-fold CV, and compare the best split
# against the single-layer baseline.
best_k_from_single = best_params_single_layer['hidden_layer_sizes'][0]
iterations_to_use = best_params_single_layer['max_iter']

print(f"\n--- Part B(d): Experimenting with Two Hidden Layers ---")
print(f"Total neurons to distribute: {best_k_from_single}")
print(f"Iterations for each model: {iterations_to_use}\n")

results_table_data = []

# n2 runs from 1 to k-1, so both layers always have at least one neuron.
for n2_neurons in range(1, best_k_from_single):
    n1_neurons = best_k_from_single - n2_neurons
    layer_config = (n1_neurons, n2_neurons)

    mlp_model = MLPClassifier(
        hidden_layer_sizes=layer_config,
        max_iter=iterations_to_use,
        random_state=42
    )

    # Deliberately not named `cv_scores`: that name holds the list of KNN mean
    # accuracies from Part A and should not be silently replaced by an array.
    fold_scores = cross_val_score(mlp_model, X_train_scaled, y_train, cv=10, scoring='accuracy', n_jobs=1)
    avg_accuracy = np.mean(fold_scores)

    results_table_data.append({
        'config': f"({n1_neurons}, {n2_neurons})",
        'accuracy': avg_accuracy
    })
    print(f"Configuration: {layer_config}, Avg 10-fold CV Accuracy: {avg_accuracy:.4f}")


# Header, separator and data rows share one column width so the table lines up.
column_width = 18
table_header = f"| {'Neuron Combination':<{column_width}} | {'Avg CV Accuracy':<{column_width}} |"
table_rule = "-" * len(table_header)
column_separator = "-" * (column_width + 2)

print("\n--- Summary Table for Two-Layer Configurations ---")
print(table_rule)
print(table_header)
print(f"|{column_separator}|{column_separator}|")
for result in results_table_data:
    print(f"| {result['config']:<{column_width}} | {result['accuracy']:<{column_width}.4f} |")
print(table_rule)

# Pick the best configuration; NaN scores are skipped, and the placeholders
# ("" / 0.0) are kept when there is nothing to choose from.
scored_results = [r for r in results_table_data if not np.isnan(r['accuracy'])]
best_result = max(scored_results, key=lambda r: r['accuracy'], default=None)
if best_result is None:
    best_two_layer_config = ""
    best_two_layer_accuracy = 0.0
else:
    best_two_layer_config = best_result['config']
    best_two_layer_accuracy = best_result['accuracy']

print("\n--- Final Comparison ---")
print(f"Best single-layer baseline accuracy (from Part B(b)): {baseline_accuracy:.4f}")
print(f"Best two-layer configuration found: {best_two_layer_config} with an accuracy of {best_two_layer_accuracy:.4f}")

if best_two_layer_accuracy > baseline_accuracy:
    print("Conclusion: Adding a second hidden layer provided an improvement.")
else:
    print("Conclusion: The single-layer architecture remains the best performing model.")