# CSCI218 Group Project: Dry Bean Classification
## V3 — SMOTE + PCA + GridSearchCV + Advanced Models

**Enhancements over V2:**
1. **SMOTE** — Synthetic Minority Over-sampling via `imblearn` Pipeline (no data leakage)
2. **PCA** — Dimensionality reduction to remove correlated features
3. **GridSearchCV** — Systematic hyperparameter tuning for SVM (C, gamma) and KNN (k, weights)
4. **Model Expansion** — XGBoost, LightGBM, and MLP (Neural Network) for complex non-linear patterns
5. **Macro-F1** as primary metric for fair 7-class evaluation

---
## 0. Install Dependencies

In [None]:
!pip install ucimlrepo imbalanced-learn xgboost lightgbm

---
## 1. Imports & Configuration

In [None]:
import os
import warnings
import numpy as np
import pandas as pd

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

# Use imblearn Pipeline (not sklearn) so SMOTE runs inside each CV fold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Configuration
# Base directory: the folder of this file when __file__ is defined,
# otherwise (e.g. inside a notebook kernel) the current working directory.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()

# Every figure and report produced below is written into this folder.
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Seed, hold-out fraction and number of CV folds shared by all later cells.
RANDOM_STATE, TEST_SIZE, N_SPLITS = 42, 0.2, 5

print("Imports and configuration complete.")

---
## 2. Load Dataset

In [None]:
# Load the Dry Bean features (X) and string class labels (y).
# Primary source: the UCI repository via ucimlrepo (dataset id 602).
# Fallback: a local "Dry_Bean_Dataset.csv" whose last column is the label.
X, y = None, None

try:
    from ucimlrepo import fetch_ucirepo
    print("Loading Dry Bean dataset from UCI ML Repository...")
    dataset = fetch_ucirepo(id=602)
    X = dataset.data.features
    y = dataset.data.targets.values.ravel()
    print("Loaded successfully via ucimlrepo.")
except Exception as fetch_err:
    # Deliberately broad: a missing package, a network failure or an
    # unexpected payload should all fall through to the local CSV.  The cause
    # is reported (not swallowed) so a broken primary path stays visible.
    print(
        f"ucimlrepo load failed ({type(fetch_err).__name__}: {fetch_err}); "
        "falling back to local CSV."
    )
    local_path = os.path.join(BASE_DIR, "Dry_Bean_Dataset.csv")
    if not os.path.exists(local_path):
        # Chain the original failure so the traceback shows both problems.
        raise FileNotFoundError(
            "Could not load dataset via ucimlrepo and local CSV not found.\n"
            "Place Dry_Bean_Dataset.csv next to this notebook, or install ucimlrepo."
        ) from fetch_err
    print(f"Loading from local CSV: {local_path}")
    df = pd.read_csv(local_path)
    X = df.iloc[:, :-1]        # every column except the last is a feature
    y = df.iloc[:, -1].values  # last column holds the class label

print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")
print(f"Classes: {np.unique(y)}")

---
## 3. Basic Checks & Class Distribution

In [None]:
# Missing values: count NaNs over the whole feature table and, if any are
# present, impute each column with its median.
features_frame = pd.DataFrame(X)
missing = int(features_frame.isnull().sum().sum())
print(f"Missing values: {missing}")
if missing > 0:
    X = features_frame.fillna(features_frame.median())
    print("Filled missing values with median.")

# Class distribution: samples per label, ordered by label.
class_counts_before = pd.Series(y).value_counts().sort_index()
print("\nClass distribution (before SMOTE):")
for bean_class, n_samples in class_counts_before.items():
    print(f"  {bean_class}: {n_samples}")

---
## 4. Encode Labels & Train/Test Split

In [None]:
# Map the string labels to integer codes; `class_names[i]` is the label
# behind code i.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
class_names = le.classes_
print(f"Encoded classes: { {label: code for code, label in enumerate(class_names)} }")

# Stratified hold-out split so the test set keeps the class proportions.
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "stratify": y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)
print(f"Train: {X_train.shape[0]} samples")
print(f"Test:  {X_test.shape[0]} samples")

---
## 5. Address Imbalance — SMOTE Visualisation

SMOTE (Synthetic Minority Over-sampling Technique) generates synthetic samples for minority classes such as **BOMBAY** (only 522 samples vs 3546 for DERMASON in the full dataset). The cell below resamples the whole training split purely to visualise the effect; in the modelling pipelines SMOTE is applied only to the training folds — the test set remains untouched.

In [None]:
# Scale the training features, then oversample them with SMOTE.  These
# objects are only used for the plots/analysis in this section.
scaler_vis = StandardScaler()
X_train_scaled_vis = scaler_vis.fit_transform(X_train)
smote_vis = SMOTE(random_state=RANDOM_STATE)
X_train_smote_vis, y_train_smote_vis = smote_vis.fit_resample(X_train_scaled_vis, y_train)

# Per-class sample counts before and after resampling (indexed by class code).
counts_before = pd.Series(y_train).value_counts().sort_index()
counts_after  = pd.Series(y_train_smote_vis).value_counts().sort_index()
labels = [class_names[code] for code in counts_before.index]

# One bar chart per stage, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    (axes[0], counts_before, "Before", "steelblue"),
    (axes[1], counts_after,  "After",  "seagreen"),
)
for panel_ax, class_counts, stage, bar_colour in panels:
    panel_ax.bar(labels, class_counts.values, edgecolor="black", color=bar_colour)
    panel_ax.set_title(f"Class Distribution \u2014 {stage} SMOTE (Train Set)")
    panel_ax.set_xlabel("Bean Class")
    panel_ax.set_ylabel("Count")
    panel_ax.tick_params(axis="x", rotation=30)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "smote_class_distribution.png"), dpi=150)
plt.show()

print(f"Training samples before SMOTE: {X_train.shape[0]}")
print(f"Training samples after  SMOTE: {X_train_smote_vis.shape[0]}")

---
## 6. Advanced Feature Engineering — PCA (Dimensionality Reduction)

Many of the 16 features are highly correlated (e.g. Area vs Perimeter, MajorAxisLength vs MinorAxisLength). PCA projects the data onto orthogonal principal components, removing redundancy while retaining >= 95% of the variance.

In [None]:
# Fit an unrestricted PCA on the scaled (non-SMOTE) training data so the
# variance contribution of every component can be inspected.
pca_full = PCA(random_state=RANDOM_STATE)
pca_full.fit(X_train_scaled_vis)

# Running total of explained variance; the first position reaching 0.95
# (1-based) is the number of components kept by the later pipelines.
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
reaches_95 = cumulative_variance >= 0.95
n_components_95 = int(reaches_95.argmax() + 1)

print(f"Original features:          {X_train.shape[1]}")
print(f"Components for 95% variance: {n_components_95}")
print(f"Variance retained:          {cumulative_variance[n_components_95 - 1]:.4f}")

In [None]:
# Explained variance plot: per-component bars, cumulative step line, and
# guide lines marking the 95% threshold and the chosen component count.
variance_ratios = pca_full.explained_variance_ratio_
component_ids = range(1, len(variance_ratios) + 1)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(
    component_ids, variance_ratios,
    alpha=0.6, label="Individual", color="steelblue", edgecolor="black",
)
ax.step(
    component_ids, cumulative_variance,
    where="mid", label="Cumulative", color="darkorange", linewidth=2,
)
ax.axhline(y=0.95, color="red", linestyle="--", label="95% threshold")
ax.axvline(
    x=n_components_95, color="green", linestyle="--", alpha=0.7,
    label=f"n_components = {n_components_95}",
)
ax.set_xlabel("Principal Component")
ax.set_ylabel("Explained Variance Ratio")
ax.set_title("PCA \u2014 Explained Variance Analysis")
ax.legend()
ax.set_xticks(component_ids)
ax.grid(axis="y", linestyle="--", alpha=0.4)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "pca_explained_variance.png"), dpi=150)
plt.show()

In [None]:
# Feature correlation heatmap (before PCA) — shows why PCA helps
feature_names = X.columns
tick_positions = range(len(feature_names))
corr_matrix = pd.DataFrame(X_train_scaled_vis, columns=feature_names).corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Fix the colour scale to the full correlation range [-1, 1].
im = ax.imshow(corr_matrix.values, cmap="coolwarm", vmin=-1, vmax=1)
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=45, ha="right", fontsize=8)
ax.set_yticklabels(feature_names, fontsize=8)
ax.set_title("Feature Correlation Matrix (before PCA)")
plt.colorbar(im, ax=ax)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "correlation_matrix_before_pca.png"), dpi=150)
plt.show()

---
## 7. Systematic Optimisation — GridSearchCV

We use `GridSearchCV` with `imblearn.Pipeline` (SMOTE inside each CV fold) to find:
- **SVM**: optimal `C` and `gamma`
- **KNN**: optimal `k` (n_neighbors) and weighting scheme

In [None]:
# Shared stratified, shuffled K-fold splitter; reused by every search and
# cross-validation run below.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# --- SVM GridSearch ---
print("SVM GridSearchCV (C, gamma)...")
svm_steps = [
    ("scaler", StandardScaler()),
    ("pca",    PCA(n_components=n_components_95, random_state=RANDOM_STATE)),
    ("smote",  SMOTE(random_state=RANDOM_STATE)),
    ("clf",    SVC(kernel="rbf", random_state=RANDOM_STATE)),
]
svm_pipeline = Pipeline(svm_steps)

# The "clf__" prefix routes each parameter to the pipeline's final SVC step.
svm_param_grid = dict(
    clf__C=[0.1, 1, 10, 100],
    clf__gamma=["scale", "auto", 0.01, 0.1],
)

svm_grid = GridSearchCV(
    svm_pipeline,
    svm_param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
svm_grid.fit(X_train, y_train)

print(f"  Best params:      {svm_grid.best_params_}")
print(f"  Best CV macro-F1: {svm_grid.best_score_:.4f}")

In [None]:
# --- KNN GridSearch ---
print("KNN GridSearchCV (n_neighbors, weights)...")
knn_steps = [
    ("scaler", StandardScaler()),
    ("pca",    PCA(n_components=n_components_95, random_state=RANDOM_STATE)),
    ("smote",  SMOTE(random_state=RANDOM_STATE)),
    ("clf",    KNeighborsClassifier()),
]
knn_pipeline = Pipeline(knn_steps)

# Candidate neighbourhood sizes crossed with both weighting schemes.
knn_param_grid = dict(
    clf__n_neighbors=[3, 5, 7, 9, 11, 15, 21],
    clf__weights=["uniform", "distance"],
)

knn_grid = GridSearchCV(
    knn_pipeline,
    knn_param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
knn_grid.fit(X_train, y_train)

print(f"  Best params:      {knn_grid.best_params_}")
print(f"  Best CV macro-F1: {knn_grid.best_score_:.4f}")

In [None]:
# GridSearch results visualisation
# Left panel: heatmap of mean CV macro-F1 over the SVM (C, gamma) grid.
# Right panel: mean CV macro-F1 (+/- one std) against k for each KNN weighting.
svm_results = pd.DataFrame(svm_grid.cv_results_)
C_values = svm_param_grid["clf__C"]
# gamma mixes strings and floats, so compare on the string form.
gamma_values = [str(g) for g in svm_param_grid["clf__gamma"]]

# scores_matrix[i, j] = mean CV score for C_values[i] and gamma_values[j].
scores_matrix = np.zeros((len(C_values), len(gamma_values)))
for c_val, g_val, score in zip(
    svm_results["param_clf__C"],
    svm_results["param_clf__gamma"],
    svm_results["mean_test_score"],
):
    scores_matrix[C_values.index(c_val), gamma_values.index(str(g_val))] = score

fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# SVM heatmap
im0 = axes[0].imshow(scores_matrix, cmap="YlOrRd", aspect="auto")
axes[0].set_xticks(range(len(gamma_values)))
axes[0].set_yticks(range(len(C_values)))
axes[0].set_xticklabels(gamma_values)
axes[0].set_yticklabels(C_values)
axes[0].set_xlabel("gamma"); axes[0].set_ylabel("C")
axes[0].set_title("SVM GridSearchCV \u2014 macro-F1 Scores")
# Annotate every cell with its score (x = column index, y = row index).
for (i, j), cell_score in np.ndenumerate(scores_matrix):
    axes[0].text(j, i, f"{cell_score:.3f}",
                 ha="center", va="center", fontsize=9, fontweight="bold")
plt.colorbar(im0, ax=axes[0])

# KNN line plot
knn_results = pd.DataFrame(knn_grid.cv_results_)
# Depending on the scikit-learn version, the param_* columns can come back
# with object dtype; cast k to int so sorting is numeric and Matplotlib
# (notably fill_between) receives a proper numeric x axis.
knn_results["param_clf__n_neighbors"] = knn_results["param_clf__n_neighbors"].astype(int)
for weight in ["uniform", "distance"]:
    mask = knn_results["param_clf__weights"] == weight
    subset = knn_results[mask].sort_values("param_clf__n_neighbors")
    k_values = subset["param_clf__n_neighbors"]
    mean_scores = subset["mean_test_score"]
    std_scores = subset["std_test_score"]
    axes[1].plot(k_values, mean_scores,
                 marker="o", label=f"weights={weight}", linewidth=2)
    # Shaded band: mean +/- one standard deviation across the CV folds.
    axes[1].fill_between(
        k_values,
        mean_scores - std_scores,
        mean_scores + std_scores,
        alpha=0.15
    )
axes[1].set_xlabel("k (n_neighbors)"); axes[1].set_ylabel("CV macro-F1")
axes[1].set_title("KNN GridSearchCV \u2014 macro-F1 vs k")
axes[1].legend()
axes[1].grid(True, linestyle="--", alpha=0.5)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "gridsearch_results.png"), dpi=150)
plt.show()

---
## 8. Model Training — All Models (PCA + SMOTE + Tuned Hyperparameters)

We train 7 models total:
1. **Logistic Regression** — linear baseline
2. **SVM (RBF, tuned)** — best C/gamma from GridSearchCV
3. **KNN (tuned)** — best k/weights from GridSearchCV
4. **Random Forest** — ensemble of decision trees
5. **XGBoost** — gradient boosting
6. **LightGBM** — fast gradient boosting
7. **MLP (Neural Network)** — deep learning for non-linear patterns

In [None]:
# Best hyperparameters selected by the two grid searches.
best_svm_C     = svm_grid.best_params_["clf__C"]
best_svm_gamma = svm_grid.best_params_["clf__gamma"]
best_knn_k     = knn_grid.best_params_["clf__n_neighbors"]
best_knn_w     = knn_grid.best_params_["clf__weights"]


def _make_pipeline(classifier):
    """Wrap *classifier* in the shared scaler -> PCA -> SMOTE chain.

    Every call builds fresh StandardScaler, PCA and SMOTE instances, so no
    fitted state is shared between models.  The final step is named "clf".
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("pca",    PCA(n_components=n_components_95, random_state=RANDOM_STATE)),
        ("smote",  SMOTE(random_state=RANDOM_STATE)),
        ("clf",    classifier),
    ])


# Display name -> unfitted pipeline.  Insertion order is the order in which
# the models are trained and reported later.
models = {
    "Logistic Regression": _make_pipeline(
        LogisticRegression(max_iter=3000, random_state=RANDOM_STATE)
    ),
    "SVM (RBF, tuned)": _make_pipeline(
        SVC(kernel="rbf", C=best_svm_C, gamma=best_svm_gamma,
            random_state=RANDOM_STATE)
    ),
    f"KNN (k={best_knn_k}, tuned)": _make_pipeline(
        KNeighborsClassifier(n_neighbors=best_knn_k, weights=best_knn_w)
    ),
    "Random Forest": _make_pipeline(
        RandomForestClassifier(
            n_estimators=300,
            class_weight="balanced",
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
    ),
    # `use_label_encoder` is intentionally not passed: it is deprecated in
    # recent XGBoost releases (reported as an unused parameter by 2.x), and
    # the targets are already integer-encoded by LabelEncoder.
    "XGBoost": _make_pipeline(
        XGBClassifier(
            n_estimators=300,
            learning_rate=0.1,
            max_depth=6,
            eval_metric="mlogloss",
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
    ),
    "LightGBM": _make_pipeline(
        LGBMClassifier(
            n_estimators=300,
            learning_rate=0.1,
            max_depth=6,
            class_weight="balanced",
            random_state=RANDOM_STATE,
            n_jobs=-1,
            verbose=-1,
        )
    ),
    "MLP (Neural Network)": _make_pipeline(
        MLPClassifier(
            hidden_layer_sizes=(128, 64, 32),
            activation="relu",
            solver="adam",
            max_iter=500,
            early_stopping=True,
            validation_fraction=0.1,
            random_state=RANDOM_STATE,
        )
    ),
}

print(f"Models to train: {len(models)}")
for name in models:
    print(f"  - {name}")

In [None]:
# Model name -> CV statistics, test metrics, test predictions and the
# fitted pipeline.
results = {}

for name, model in models.items():
    print(f"\nTraining: {name}")

    # Cross-validate on the training split only; SMOTE is a pipeline step,
    # so it is re-applied inside every fold.
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="f1_macro")

    # Refit on the full training split, then predict the held-out test set.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    entry = {
        "cv_f1_macro_mean":     float(cv_scores.mean()),
        "cv_f1_macro_std":      float(cv_scores.std()),
        "test_accuracy":        float(accuracy_score(y_test, y_pred)),
        "test_f1_macro":        float(f1_score(y_test, y_pred, average="macro")),
        "test_f1_weighted":     float(f1_score(y_test, y_pred, average="weighted")),
        "test_precision_macro": float(
            precision_score(y_test, y_pred, average="macro", zero_division=0)
        ),
        "test_recall_macro":    float(
            recall_score(y_test, y_pred, average="macro", zero_division=0)
        ),
        "y_pred":               y_pred,
        "model":                model,
    }
    results[name] = entry

    print(f"  CV macro-F1:        {entry['cv_f1_macro_mean']:.4f} (+/- {entry['cv_f1_macro_std']:.4f})")
    print(f"  Test Accuracy:      {entry['test_accuracy']:.4f}")
    print(f"  Test F1 (macro):    {entry['test_f1_macro']:.4f}")
    print(f"  Test F1 (weighted): {entry['test_f1_weighted']:.4f}")

In [None]:
# Best model selection: highest mean cross-validated macro-F1 (the test
# scores are only reported, not used for the choice).
best_model_name, best = max(
    results.items(),
    key=lambda item: item[1]["cv_f1_macro_mean"],
)

banner = "=" * 60
print(banner)
print(f"BEST MODEL (by CV macro-F1): {best_model_name}")
print(f"CV macro-F1:   {best['cv_f1_macro_mean']:.4f}")
print(f"Test Accuracy: {best['test_accuracy']:.4f}")
print(f"Test macro-F1: {best['test_f1_macro']:.4f}")
print(banner)

---
## 9. Confusion Matrices — All Models

In [None]:
# One confusion-matrix heatmap per model, laid out on a grid with a fixed
# number of columns; rows are added as needed (ceiling division).
n_models = len(results)
n_cols = 4
n_rows = (n_models + n_cols - 1) // n_cols

# squeeze=False always yields a 2-D array of Axes, so ravel() gives a flat
# sequence of individual Axes for any model count (including a single model,
# where the grid is still 1 x n_cols).
fig, axes = plt.subplots(n_rows, n_cols, figsize=(7 * n_cols, 6 * n_rows),
                         squeeze=False)
axes_flat = axes.ravel()

tick_positions = range(len(class_names))
for idx, (name, r) in enumerate(results.items()):
    ax = axes_flat[idx]
    cm = confusion_matrix(y_test, r["y_pred"])
    im = ax.imshow(cm, cmap="Blues")
    ax.set_title(f"{name}", fontsize=10, fontweight="bold")
    ax.set_xlabel("Predicted"); ax.set_ylabel("Actual")
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(class_names, rotation=30, ha="right", fontsize=7)
    ax.set_yticklabels(class_names, fontsize=7)
    # Cells darker than half the maximum count get white text for contrast.
    contrast_threshold = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, str(cm[i, j]), ha="center", va="center", fontsize=6,
                    color="white" if cm[i, j] > contrast_threshold else "black")
    plt.colorbar(im, ax=ax)

# Hide unused subplots
for unused_ax in axes_flat[n_models:]:
    unused_ax.set_visible(False)

plt.suptitle("Confusion Matrices \u2014 All Models (PCA + SMOTE + Tuned)",
             fontsize=14, fontweight="bold")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrices_all.png"), dpi=150)
plt.show()

---
## 10. Classification Report (Best Model)

In [None]:
# Per-class precision/recall/F1 of the selected model on the test set.
report = classification_report(
    y_test,
    best["y_pred"],
    target_names=class_names,
    zero_division=0,
)

print(f"Best Model: {best_model_name}\n")
print(report)

# Persist a short header with the headline metrics, followed by the report.
report_path = os.path.join(OUTPUT_DIR, "classification_report_best.txt")
header_lines = [
    f"Best Model (by CV macro-F1): {best_model_name}\n",
    f"CV macro-F1 mean: {best['cv_f1_macro_mean']:.4f} (std {best['cv_f1_macro_std']:.4f})\n",
    f"Test Accuracy: {best['test_accuracy']:.4f}\n",
    f"Test macro-F1: {best['test_f1_macro']:.4f}\n",
    f"Test weighted-F1: {best['test_f1_weighted']:.4f}\n\n",
]
with open(report_path, "w") as f:
    f.writelines(header_lines)
    f.write(report)
print("Saved: classification_report_best.txt")

---
## 11. Model Comparison Chart

In [None]:
# Grouped bar chart: three metrics per model, side by side.
model_names = list(results)
cv_means, test_f1, test_acc = (
    [results[n][metric] for n in model_names]
    for metric in ("cv_f1_macro_mean", "test_f1_macro", "test_accuracy")
)

x = np.arange(len(model_names))
width = 0.25

fig, ax = plt.subplots(figsize=(14, 6))
# (signed multiple of the bar width, values, legend label, colour)
series = (
    (-1, cv_means, "CV macro-F1 (mean)", "steelblue"),
    (0,  test_f1,  "Test macro-F1",      "seagreen"),
    (1,  test_acc, "Test Accuracy",      "darkorange"),
)
all_bars = []
for shift, values, legend_label, bar_colour in series:
    group = ax.bar(x + shift * width, values, width, label=legend_label,
                   color=bar_colour, edgecolor="black")
    all_bars.extend(group)

ax.set_title("Model Comparison (PCA + SMOTE + GridSearchCV)",
             fontsize=13, fontweight="bold")
ax.set_ylabel("Score")
ax.set_xticks(x)
ax.set_xticklabels(model_names, fontsize=8, rotation=15, ha="right")
ax.set_ylim(0.85, 1.01)
ax.legend()
ax.grid(axis="y", linestyle="--", alpha=0.5)

# Print each bar's value just above it.
for bar in all_bars:
    bar_top = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, bar_top + 0.001,
            f"{bar_top:.3f}", ha="center", va="bottom", fontsize=7)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "model_comparison.png"), dpi=150)
plt.show()

---
## 12. Results Summary Table

In [None]:
# One row per model with its CV statistics and test metrics.
summary_df = pd.DataFrame([
    {
        "Model":                name,
        "CV_MacroF1_Mean":      r["cv_f1_macro_mean"],
        "CV_MacroF1_Std":       r["cv_f1_macro_std"],
        "Test_Accuracy":        r["test_accuracy"],
        "Test_Precision_Macro": r["test_precision_macro"],
        "Test_Recall_Macro":    r["test_recall_macro"],
        "Test_F1_Macro":        r["test_f1_macro"],
        "Test_F1_Weighted":     r["test_f1_weighted"],
    }
    for name, r in results.items()
])

summary_csv_path = os.path.join(OUTPUT_DIR, "results_summary_v3.csv")
summary_df.to_csv(summary_csv_path, index=False)

# Display table
# `display` only exists when running under IPython/Jupyter.  When this file
# is executed as a plain script the name is undefined, so fall back to a
# plain-text table instead of failing with NameError.
try:
    show_table = display
except NameError:
    show_table = None

if show_table is not None:
    # Highlight the best value in each of the headline metric columns.
    show_table(summary_df.style.highlight_max(
        subset=["CV_MacroF1_Mean", "Test_Accuracy", "Test_F1_Macro"],
        color="lightgreen"
    ))
else:
    print(summary_df.to_string(index=False))
print(f"\nSaved: results_summary_v3.csv")

---
## 13. Enhancement Summary

In [None]:
# Final recap: one printed line per list entry, in order.
rule = "=" * 60
summary_lines = [
    rule,
    "ENHANCEMENT SUMMARY",
    rule,
    # 1. Class balancing
    "\n1. SMOTE (Address Imbalance):",
    f"   Augmented minority classes (e.g. BOMBAY {counts_before.min()} -> {counts_after.max()} samples)",
    "   Applied inside imblearn Pipeline — no data leakage",
    # 2. Dimensionality reduction
    "\n2. PCA (Feature Engineering):",
    f"   Reduced {X_train.shape[1]} correlated features -> {n_components_95} orthogonal components",
    f"   Variance retained: {cumulative_variance[n_components_95-1]*100:.1f}%",
    # 3. Hyperparameter search results
    "\n3. GridSearchCV (Systematic Optimisation):",
    f"   SVM:  C={best_svm_C}, gamma={best_svm_gamma}",
    f"   KNN:  k={best_knn_k}, weights={best_knn_w}",
    # 4. Additional model families
    "\n4. Model Expansion (Advanced Models):",
    "   Added XGBoost, LightGBM, and MLP (Neural Network)",
    f"   Total models evaluated: {len(results)}",
    # Headline result
    "\n" + rule,
    f"BEST MODEL: {best_model_name}",
    f"CV macro-F1:   {best['cv_f1_macro_mean']:.4f}",
    f"Test macro-F1: {best['test_f1_macro']:.4f}",
    f"Test Accuracy: {best['test_accuracy']:.4f}",
    rule,
    f"\nAll outputs saved to: {OUTPUT_DIR}",
    "Done!",
]
for line in summary_lines:
    print(line)