# 04_train_models.ipynb

**Objective:**  
1. Remount Drive & set paths  
2. Load pre-computed feature arrays (`.npz`) for **train**  
3. Perform stratified 5-fold CV grid-search over a lightweight classifier (scikit-learn `SVC`, varying `C` and the kernel)  
4. Save the best model to Drive  
5. Display CV results table  


In [None]:
# Cell Tag: imports
import numpy as np
import joblib
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score


In [None]:
# Cell Tag: parameters
import os
from pathlib import Path

from google.colab import drive

# Attach Google Drive first: every path defined below lives under the mount point.
drive.mount('/content/drive', force_remount=True)

# Project directory layout on Drive
ROOT = Path("/content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project")
FEATURE_DIR = ROOT.joinpath("features")
MODEL_DIR = ROOT.joinpath("models")
# Create the output directory (and any missing parents); no-op if it already exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Run configuration: input feature file, output model artifact, CV settings
FEATURE_FN = FEATURE_DIR.joinpath("train_resnet50_gap_95var.npz")
OUT_MODEL = MODEL_DIR.joinpath("resnet50_gap_95var_svc.pkl")
CV_FOLDS, SEED = 5, 42

print("Feature file:", FEATURE_FN)
print("Will save model to:", OUT_MODEL)


Mounted at /content/drive
Feature file: /content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project/features/train_resnet50_gap_95var.npz
Will save model to: /content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project/models/resnet50_gap_95var_svc.pkl


In [None]:
# Cell Tag: load-data
# 1. Load features
# np.load on an .npz returns a lazy NpzFile that keeps the archive open.
# The `with` block closes it once both arrays have been read into memory;
# `data` stays bound afterwards but is closed and should not be indexed again.
with np.load(FEATURE_FN) as data:
    X_train, y_train = data["X"], data["y"]

# Fail fast on a mismatched feature file instead of erroring inside the grid search.
assert len(X_train) == len(y_train), (
    f"feature/label count mismatch: {len(X_train)} samples vs {len(y_train)} labels"
)

print("X_train shape:", X_train.shape)
print("y_train distribution:\n", pd.Series(y_train).value_counts())

X_train shape: (1067, 199)
y_train distribution:
 1    480
2    331
0    256
Name: count, dtype: int64


In [None]:
# Cell Tag: setup-cv
# 2. Build the CV splitter, the base estimator, the search space, and the grid search

# Shuffled stratified folds; the fixed seed keeps the fold assignment repeatable.
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)

# Base estimator; probability estimates are switched off.
svc = SVC(probability=False, random_state=SEED)

# 5 values of C x 2 kernels = 10 candidates.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    kernel=["linear", "rbf"],
)

# Search over the grid, scoring by accuracy and using all available cores.
gs = GridSearchCV(
    svc,
    param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    return_train_score=False,
)


In [None]:
# Cell Tag: execute
# 3. Run grid search
# Fits the search object `gs` on the training features and labels, then prints
# the parameter combination it selected and that combination's CV score
# (formatted to four decimals).
gs.fit(X_train, y_train)
print(f"Best params: {gs.best_params_}")
print(f"CV best accuracy: {gs.best_score_:.4f}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params: {'C': 10, 'kernel': 'rbf'}
CV best accuracy: 0.9616


In [None]:
# Cell Tag: save-model
# 4. Persist the winning estimator to Drive
# NOTE(review): presumably this is the model refit on the full training set
# (GridSearchCV's documented default refit behaviour) — confirm if that matters downstream.
best_model = gs.best_estimator_
joblib.dump(best_model, OUT_MODEL)
print("Saved trained model to:", OUT_MODEL)


Saved trained model to: /content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project/models/resnet50_gap_95var_svc.pkl


In [None]:
# Cell Tag: results-table
# 5. Display mean CV accuracy for every (C, kernel) candidate
# The grid varies both C and the kernel, so the kernel column is kept.
# Without it, each C value appears twice and the rows cannot be told apart.
results_df = (
    pd.DataFrame(gs.cv_results_)[
        ["param_C", "param_kernel", "mean_test_score", "std_test_score"]
    ]
    # Sort on both searched parameters so the row order is fully determined.
    .sort_values(["param_C", "param_kernel"])
    # Chained rename (no inplace) keeps the cell idempotent on re-run.
    .rename(columns={
        "param_C": "C",
        "param_kernel": "kernel",
        "mean_test_score": "mean_cv_acc",
        "std_test_score": "std_cv_acc",
    })
)
# Last expression: the styled table is the cell's rich output.
results_df.style.format({
    "C": "{:.2f}",
    "mean_cv_acc": "{:.4f}",
    "std_cv_acc": "{:.4f}"
})


Unnamed: 0,C,mean_cv_acc,std_cv_acc
0,0.01,0.9531,0.0155
1,0.01,0.4499,0.001
2,0.1,0.954,0.0211
3,0.1,0.9447,0.0087
4,1.0,0.955,0.0196
5,1.0,0.9606,0.0125
6,10.0,0.955,0.0196
7,10.0,0.9616,0.0147
8,100.0,0.955,0.0196
9,100.0,0.9616,0.0147
