# Model Testing

Questions to be answered:

- Remove outliers?
- Which Sampling method to use?


## 1. Imports

In [None]:
import sys
sys.path.append('..')

from src.utils.preprocessing import (
    prepare_mitbih, 
    prepare_ptbdb,
    resample_training
)
from src.visualization import plot_confusion_matrix
from src.utils import create_model_saver
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint, uniform
import numpy as np
import re

# Init model saver
model_saver = create_model_saver("../src/models/exploration_phase")

In [4]:
def eval_model(model, X_tr, y_tr, X_va, y_va, X_te, y_te):
    model.fit(X_tr, y_tr)
    yv = model.predict(X_va)
    yt = model.predict(X_te)

    # Choose a consistent label order (dynamic)
    labels = np.unique(np.concatenate([y_tr, y_va, y_te]))

    # Validation
    acc_v = accuracy_score(y_va, yv)
    p_v_m, r_v_m, f1_v_m, _ = precision_recall_fscore_support(
        y_va, yv, average='macro', zero_division=0
    )
    p_v_c, r_v_c, f1_v_c, sup_v = precision_recall_fscore_support(
        y_va, yv, average=None, labels=labels, zero_division=0
    )
    cm_v = confusion_matrix(y_va, yv, labels=labels)

    # Test
    acc_t = accuracy_score(y_te, yt)
    p_t_m, r_t_m, f1_t_m, _ = precision_recall_fscore_support(
        y_te, yt, average='macro', zero_division=0
    )
    p_t_c, r_t_c, f1_t_c, sup_t = precision_recall_fscore_support(
        y_te, yt, average=None, labels=labels, zero_division=0
    )
    cm_t = confusion_matrix(y_te, yt, labels=labels)

    return {
        'labels': labels,  # order for per-class arrays below
        'val': {
            'accuracy': acc_v,
            'precision_macro': p_v_m,
            'recall_macro': r_v_m,
            'f1_macro': f1_v_m,
            'precision_per_class': p_v_c,
            'recall_per_class': r_v_c,
            'f1_per_class': f1_v_c,
            'support_per_class': sup_v,
            'confusion_matrix': cm_v,
        },
        'test': {
            'accuracy': acc_t,
            'precision_macro': p_t_m,
            'recall_macro': r_t_m,
            'f1_macro': f1_t_m,
            'precision_per_class': p_t_c,
            'recall_per_class': r_t_c,
            'f1_per_class': f1_t_c,
            'support_per_class': sup_t,
            'confusion_matrix': cm_t,
        },
    }

results = {}


In [None]:
# Prepare datasets
mitbih = prepare_mitbih(remove_outliers=False)

print("MITBIH dataset prepared:")
print(f"  Training size: {mitbih.X_train.shape}")
print(f"  Validation size: {mitbih.X_val.shape if mitbih.X_val is not None else 'None'}")
print(f"  Test size: {mitbih.X_test.shape if mitbih.X_test is not None else 'None'}")


MITBIH dataset prepared:
  Training size: (78798, 187)
  Validation size: (8756, 187)
  Test size: (21892, 187)

PTBDB dataset prepared:
  Training size: (10472, 187)
  Validation size: (1164, 187)
  Test size: (2909, 187)


## 2. Load Data

In [6]:
X_train, X_val, X_test = mitbih.X_train.values, mitbih.X_val.values, mitbih.X_test.values
y_train = mitbih.y_train.astype(int).values
y_val = mitbih.y_val.astype(int).values
y_test = mitbih.y_test.astype(int).values

# Scale features using train fit only
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)


## 3. Test models with Randomized Search CV

In [7]:
scoring = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}

### 3.1 Without outlier removal or sampling

#### 3.1.1 Logistic Regression

In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Check if model already exists
classifier_name = "LogisticRegression"
experiment_name = "no_sampling"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_logreg = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    logreg = LogisticRegression(max_iter=10000, multi_class='multinomial', solver='lbfgs')

    param_dist_logreg = {
        'C': loguniform(1e-3, 1e3), # Big C = less penalty on large weights (more freedom, risk of overfitting). 
                                    # Small C = more penalty (more discipline, less overfitting).
                                    # loguniform = means we try values spread across tiny to big scales (e.g., 0.001 up to 100), not just small steps.
        'penalty': ['l2'], # gently pushes weights toward zero, which keeps the model simpler and more stable.
        'solver': ['lbfgs'],
    }

    rs_logreg = RandomizedSearchCV(
        estimator=logreg,
        param_distributions=param_dist_logreg,
        n_iter=20,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2
    )
    rs_logreg.fit(X_train_s, y_train)
    
    # Save the trained model
    metadata = {
        'best_params': rs_logreg.best_params_,
        'best_score': rs_logreg.best_score_,
        'cv_results': rs_logreg.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_logreg, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")

INFO:src.models.exploration_phase.model_saver:Model loaded: src/models/exploration_phase/LogisticRegression_no_sampling.joblib


Model LogisticRegression already exists for experiment no_sampling. Loading...
Model loaded successfully!


In [9]:
best_logreg = rs_logreg.best_estimator_
results['LogisticRegression'] = eval_model(
    best_logreg,
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
)

In [10]:
display(best_logreg)
results['LogisticRegression']

{'labels': array([0, 1, 2, 3, 4]),
 'val': {'accuracy': 0.9176564641388762,
  'precision_macro': 0.800141874074688,
  'recall_macro': 0.6105422563887755,
  'f1_macro': 0.6765546028672075,
  'precision_per_class': array([0.92888831, 0.81679389, 0.65064103, 0.65625   , 0.94813614]),
  'recall_per_class': array([0.98220199, 0.48198198, 0.35060449, 0.328125  , 0.90979782]),
  'f1_per_class': array([0.9548015 , 0.60623229, 0.45566779, 0.4375    , 0.92857143]),
  'support_per_class': array([7248,  222,  579,   64,  643]),
  'confusion_matrix': array([[7119,   18,   91,    1,   19],
         [ 106,  107,    7,    1,    1],
         [ 350,    5,  203,    9,   12],
         [  38,    0,    5,   21,    0],
         [  51,    1,    6,    0,  585]])},
 'test': {'accuracy': 0.9151288141786954,
  'precision_macro': 0.7885019903691001,
  'recall_macro': 0.5967700974401475,
  'f1_macro': 0.6633702561560696,
  'precision_per_class': array([0.92469552, 0.82105263, 0.66344828, 0.57843137, 0.95488215]),
 

#### 3.1.2 KNN

In [11]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Check if model already exists
classifier_name = "KNN"
experiment_name = "no_sampling"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_knn = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    knn = KNeighborsClassifier()
    param_dist_knn = {
        'n_neighbors': randint(1, 51),
        'weights': ['uniform', 'distance'],           # helps with imbalance; 'distance' often better
        'metric': ['minkowski', 'manhattan', 'euclidean'],
        'p': [1,2],                           # used only for minkowski, if left out it defaults to euclidean
    }

    rs_knn = RandomizedSearchCV(
        estimator=knn,
        param_distributions=param_dist_knn,
        n_iter=20,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
    )
    rs_knn.fit(X_train_s, y_train)
    
    # Save the trained model
    metadata = {
        'best_params': rs_knn.best_params_,
        'best_score': rs_knn.best_score_,
        'cv_results': rs_knn.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_knn, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")

INFO:src.models.exploration_phase.model_saver:Model loaded: src/models/exploration_phase/KNN_no_sampling.joblib


Model KNN already exists for experiment no_sampling. Loading...
Model loaded successfully!


In [12]:
best_knn = rs_knn.best_estimator_
results['KNN'] = eval_model(
    best_knn,
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
)

In [13]:
display(best_knn)
results['KNN']

{'labels': array([0, 1, 2, 3, 4]),
 'val': {'accuracy': 0.9798994974874372,
  'precision_macro': 0.9391433113355262,
  'recall_macro': 0.8599185488349104,
  'f1_macro': 0.8949289961165755,
  'precision_per_class': array([0.98375205, 0.88826816, 0.95087719, 0.87755102, 0.99526814]),
  'recall_per_class': array([0.99406733, 0.71621622, 0.93609672, 0.671875  , 0.98133748]),
  'f1_per_class': array([0.98888279, 0.79301746, 0.94342907, 0.76106195, 0.98825372]),
  'support_per_class': array([7248,  222,  579,   64,  643]),
  'confusion_matrix': array([[7205,   18,   20,    3,    2],
         [  61,  159,    2,    0,    0],
         [  33,    0,  542,    3,    1],
         [  16,    0,    5,   43,    0],
         [   9,    2,    1,    0,  631]])},
 'test': {'accuracy': 0.9775260369084597,
  'precision_macro': 0.920137365015206,
  'recall_macro': 0.8528821323680518,
  'f1_macro': 0.8832154533279242,
  'precision_per_class': array([0.98254894, 0.88167053, 0.94080338, 0.80141844, 0.99424552]),
 

#### 3.1.3 Random Forest


In [14]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Check if model already exists
classifier_name = "RandomForest"
experiment_name = "no_sampling"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_rf = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    param_dist_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20], # prevent overfitting majority class
        
        'min_samples_split': [2, 5, 10, 20, 50],
        'min_samples_leaf': [1, 2, 4, 8], # higher = better regularization
        
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True], # better generalization
        
        'class_weight': ['balanced', None], # for imbalanced data
        
        # Split criterion: entropy can help with imbalanced classes
        'criterion': ['gini', 'entropy'],
    }

    rs_rf = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist_rf,
        n_iter=20,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )

    rs_rf.fit(X_train, y_train) # using unscaled data - RF is not sensitive to feature scaling
    
    # Save the trained model
    metadata = {
        'best_params': rs_rf.best_params_,
        'best_score': rs_rf.best_score_,
        'cv_results': rs_rf.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_rf, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")

INFO:src.models.exploration_phase.model_saver:Model loaded: src/models/exploration_phase/RandomForest_no_sampling.joblib


Model RandomForest already exists for experiment no_sampling. Loading...
Model loaded successfully!


In [15]:
best_rf = rs_rf.best_estimator_
results['RandomForest'] = eval_model( 
    best_rf, 
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)

In [16]:
display(best_rf)
results['RandomForest']

{'labels': array([0, 1, 2, 3, 4]),
 'val': {'accuracy': 0.9785290086797624,
  'precision_macro': 0.9160207060087867,
  'recall_macro': 0.8648953216324944,
  'f1_macro': 0.8885240457117929,
  'precision_per_class': array([0.98467013, 0.88172043, 0.94210526, 0.78571429, 0.98589342]),
  'recall_per_class': array([0.99254967, 0.73873874, 0.92746114, 0.6875    , 0.97822706]),
  'f1_per_class': array([0.9885942 , 0.80392157, 0.93472585, 0.73333333, 0.98204528]),
  'support_per_class': array([7248,  222,  579,   64,  643]),
  'confusion_matrix': array([[7194,   20,   20,    8,    6],
         [  54,  164,    3,    0,    1],
         [  34,    2,  537,    4,    2],
         [  14,    0,    6,   44,    0],
         [  10,    0,    4,    0,  629]])},
 'test': {'accuracy': 0.9748309884889458,
  'precision_macro': 0.901553466600747,
  'recall_macro': 0.8709987482601733,
  'f1_macro': 0.8848821016625263,
  'precision_per_class': array([0.98157362, 0.8516129 , 0.94247159, 0.74556213, 0.98654709]),
 

#### 3.1.4 SVM

In [17]:
from sklearn.svm import SVC

# Check if model already exists
classifier_name = "SVM"
experiment_name = "no_sampling"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_svm = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    svm = SVC()
    param_dist_svm = {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10],
        'gamma': [0.001, 0.01, 0.1, 0.5, 1],
    }
    rs_svm = RandomizedSearchCV(
        estimator=svm,
        param_distributions=param_dist_svm,
        n_iter=15,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )
    rs_svm.fit(X_train_s, y_train)
    
    # Save the trained model
    metadata = {
        'best_params': rs_svm.best_params_,
        'best_score': rs_svm.best_score_,
        'cv_results': rs_svm.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_svm, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")

INFO:src.models.exploration_phase.model_saver:Model loaded: src/models/exploration_phase/SVM_no_sampling.joblib


Model SVM already exists for experiment no_sampling. Loading...
Model loaded successfully!


In [18]:
best_svm = rs_svm.best_estimator_
results['SVM'] = eval_model(
    best_svm,
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
)

In [19]:
display(best_svm)
results['SVM']

{'labels': array([0, 1, 2, 3, 4]),
 'val': {'accuracy': 0.9745317496573778,
  'precision_macro': 0.9163739251346034,
  'recall_macro': 0.8540245675024011,
  'f1_macro': 0.8822902628187215,
  'precision_per_class': array([0.98194995, 0.84615385, 0.91872792, 0.8490566 , 0.98598131]),
  'recall_per_class': array([0.99075607, 0.69369369, 0.89810017, 0.703125  , 0.9844479 ]),
  'f1_per_class': array([0.98633336, 0.76237624, 0.90829694, 0.76923077, 0.98521401]),
  'support_per_class': array([7248,  222,  579,   64,  643]),
  'confusion_matrix': array([[7181,   23,   35,    3,    6],
         [  65,  154,    3,    0,    0],
         [  46,    5,  520,    5,    3],
         [  12,    0,    7,   45,    0],
         [   9,    0,    1,    0,  633]])},
 'test': {'accuracy': 0.9740544491138315,
  'precision_macro': 0.9049136882358036,
  'recall_macro': 0.8394617444559591,
  'f1_macro': 0.8684037795992783,
  'precision_per_class': array([0.98054602, 0.87347932, 0.92642857, 0.75675676, 0.98735777]),


#### 3.1.5 Decision Tree Classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier

# Check if model already exists
classifier_name = "DecisionTree"
experiment_name = "no_sampling"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_dt = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    dt = DecisionTreeClassifier(random_state=42)

    param_dist_dt = {
        'max_depth': [None, 5, 10, 15, 20, 25, 30],
        'min_samples_split': [2, 5, 10, 20, 50],
        'min_samples_leaf': [1, 2, 4, 8, 16],
        'max_features': ['sqrt', 'log2', None],
        'criterion': ['gini', 'entropy'],  
        'class_weight': ['balanced', None],
        'splitter': ['best', 'random'],  # Split strategy
    }

    rs_dt = RandomizedSearchCV(
        estimator=dt,
        param_distributions=param_dist_dt,
        n_iter=100,  
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )

    rs_dt.fit(X_train, y_train)  # Using unscaled data - DT doesn't need scaling
    
    # Save the trained model
    metadata = {
        'best_params': rs_dt.best_params_,
        'best_score': rs_dt.best_score_,
        'cv_results': rs_dt.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_dt, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")

INFO:src.models.exploration_phase.model_saver:Model loaded: src/models/exploration_phase/DecisionTree_no_sampling.joblib


Model DecisionTree already exists for experiment no_sampling. Loading...
Model loaded successfully!


In [21]:
best_dt = rs_dt.best_estimator_
results['DecisionTree'] = eval_model(
    best_dt,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)

In [22]:
display(best_dt)
results['DecisionTree']

{'labels': array([0, 1, 2, 3, 4]),
 'val': {'accuracy': 0.9611694837825491,
  'precision_macro': 0.8286714991952492,
  'recall_macro': 0.8113420712229086,
  'f1_macro': 0.8192777361500603,
  'precision_per_class': array([0.97892852, 0.7357513 , 0.86677909, 0.61290323, 0.94899536]),
  'recall_per_class': array([0.98068433, 0.63963964, 0.88773748, 0.59375   , 0.95489891]),
  'f1_per_class': array([0.97980564, 0.68433735, 0.87713311, 0.6031746 , 0.95193798]),
  'support_per_class': array([7248,  222,  579,   64,  643]),
  'confusion_matrix': array([[7108,   41,   58,   15,   26],
         [  71,  142,    7,    0,    2],
         [  47,    5,  514,    9,    4],
         [  16,    1,    8,   38,    1],
         [  19,    4,    6,    0,  614]])},
 'test': {'accuracy': 0.9594372373469761,
  'precision_macro': 0.8282860612296581,
  'recall_macro': 0.7983514845333289,
  'f1_macro': 0.8124014481179902,
  'precision_per_class': array([0.97427195, 0.72995781, 0.8837535 , 0.59119497, 0.96225208]),


#### 3.1.6 XGBoost / Gradien Boosting

In [23]:
import xgboost as xgb

# Check if model already exists
classifier_name = "XGBoost"
experiment_name = "no_sampling"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_xgb = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    xgb_model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=5,  # no of classes
        random_state=42,
        n_jobs=-1,
        eval_metric='mlogloss'
    )

    param_dist_xgb = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 regularization
        'reg_lambda': [0, 0.1, 0.5, 1.0],  # L2 regularization
        'min_child_weight': [1, 3, 5, 7],
        'gamma': [0, 0.1, 0.2, 0.3],  # Minimum loss reduction
    }

    rs_xgb = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_dist_xgb,
        n_iter=30,  
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )

    rs_xgb.fit(X_train, y_train)  # XGBoost handles scaling internally
    
    # Save the trained model
    metadata = {
        'best_params': rs_xgb.best_params_,
        'best_score': rs_xgb.best_score_,
        'cv_results': rs_xgb.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_xgb, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")

INFO:src.models.exploration_phase.model_saver:Model loaded: src/models/exploration_phase/XGBoost_no_sampling.joblib


Model XGBoost already exists for experiment no_sampling. Loading...
Model loaded successfully!


In [24]:
best_xgb = rs_xgb.best_estimator_
results['XGBoost'] = eval_model(
    best_xgb,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)

In [25]:
display(best_xgb)
results['XGBoost']

{'labels': array([0, 1, 2, 3, 4]),
 'val': {'accuracy': 0.9816126084970306,
  'precision_macro': 0.9503427741288547,
  'recall_macro': 0.8653017509227453,
  'f1_macro': 0.9026234500019875,
  'precision_per_class': array([0.98499318, 0.92397661, 0.94903339, 0.9       , 0.99371069]),
  'recall_per_class': array([0.99613687, 0.71171171, 0.93264249, 0.703125  , 0.98289269]),
  'f1_per_class': array([0.99053368, 0.80407125, 0.94076655, 0.78947368, 0.98827209]),
  'support_per_class': array([7248,  222,  579,   64,  643]),
  'confusion_matrix': array([[7220,   11,   13,    2,    2],
         [  62,  158,    1,    0,    1],
         [  34,    1,  540,    3,    1],
         [   7,    0,   12,   45,    0],
         [   7,    1,    3,    0,  632]])},
 'test': {'accuracy': 0.9818198428649735,
  'precision_macro': 0.9545382548756202,
  'recall_macro': 0.8654063021130722,
  'f1_macro': 0.9046987238778078,
  'precision_per_class': array([0.98292922, 0.95588235, 0.97383721, 0.86764706, 0.99239544]),


#### 3.1.7 Results Summary and Comparison

In [30]:
def _safe_col(label):
    # Make safe column names like "val_f1_cls_0" or "val_f1_cls_N"
    return re.sub(r'[^0-9a-zA-Z_]+', '_', str(label)).strip('_')

# Mapping of model names to their RandomizedSearchCV objects
models_and_searchers = {
    "LogisticRegression": rs_logreg,
    "KNN": rs_knn, 
    "RandomForest": rs_rf,
    "SVM": rs_svm,
    "DecisionTree": rs_dt,
    "XGBoost": rs_xgb  
}

rows = []
for name, res in results.items():
    row = {
        'model': name,
        'val_accuracy': round(res['val']['accuracy'], 2),
        'val_f1_macro': round(res['val']['f1_macro'], 2),
        'test_accuracy': round(res['test']['accuracy'], 2),
        'test_f1_macro': round(res['test']['f1_macro'], 2),
    }

    # Add best parameters from RandomizedSearchCV
    if name in models_and_searchers:
        searcher = models_and_searchers[name]
        best_params = searcher.best_params_
        best_cv_score = searcher.best_score_
        row['best_cv_score'] = round(best_cv_score, 2)
        row['best_parameters'] = str(best_params)
    else:
        row['best_cv_score'] = None
        row['best_parameters'] = None

    labels = res['labels']
    f1_v = res['val']['f1_per_class']
    f1_t = res['test']['f1_per_class']

    # Add per-class F1 columns
    for lbl, f1 in zip(labels, f1_v):
        row[f'val_f1_cls_{_safe_col(lbl)}'] = round(f1, 2)
    for lbl, f1 in zip(labels, f1_t):
        row[f'test_f1_cls_{_safe_col(lbl)}'] = round(f1, 2)

    rows.append(row)

comparison_df = (
    pd.DataFrame(rows)
      .sort_values(by=['val_f1_macro','test_f1_macro'], ascending=False)
      .reset_index(drop=True)
)

best_model_name = comparison_df.iloc[0]['model']
best_model_results = results[best_model_name]

comparison_df_display = comparison_df.copy()
comparison_df_display['best_parameters'] = comparison_df_display['best_parameters'].apply(
    lambda x: json.dumps(x, indent=2) if isinstance(x, dict) else x
)
import os 

comparison_df_display.to_csv("../src/data/03_model_testing_results/model_comparison_without_resampling.csv", index=False)

# Display the comparison table with best parameters
print("=" * 100)
print("MODEL COMPARISON WITH BEST PARAMETERS FROM RANDOMIZEDSEARCHCV")
print("=" * 100)
display(comparison_df_display)


MODEL COMPARISON WITH BEST PARAMETERS FROM RANDOMIZEDSEARCHCV


Unnamed: 0,model,val_accuracy,val_f1_macro,test_accuracy,test_f1_macro,best_cv_score,best_parameters,val_f1_cls_0,val_f1_cls_1,val_f1_cls_2,val_f1_cls_3,val_f1_cls_4,test_f1_cls_0,test_f1_cls_1,test_f1_cls_2,test_f1_cls_3,test_f1_cls_4
0,XGBoost,0.98,0.9,0.98,0.9,0.9,"{'subsample': 0.8, 'reg_lambda': 0, 'reg_alpha...",0.99,0.8,0.94,0.79,0.99,0.99,0.81,0.95,0.79,0.98
1,KNN,0.98,0.89,0.98,0.88,0.88,"{'metric': 'manhattan', 'n_neighbors': 4, 'p':...",0.99,0.79,0.94,0.76,0.99,0.99,0.77,0.93,0.75,0.98
2,RandomForest,0.98,0.89,0.97,0.88,0.88,"{'n_estimators': 200, 'min_samples_split': 20,...",0.99,0.8,0.93,0.73,0.98,0.99,0.78,0.93,0.76,0.97
3,SVM,0.97,0.88,0.97,0.87,0.87,"{'kernel': 'poly', 'gamma': 0.01, 'C': 10}",0.99,0.76,0.91,0.77,0.99,0.99,0.74,0.91,0.72,0.98
4,DecisionTree,0.96,0.82,0.96,0.81,0.83,"{'splitter': 'best', 'min_samples_split': 20, ...",0.98,0.68,0.88,0.6,0.95,0.98,0.67,0.88,0.59,0.95
5,LogisticRegression,0.92,0.68,0.92,0.66,0.66,"{'C': 4.0428727350273315, 'penalty': 'l2', 'so...",0.95,0.61,0.46,0.44,0.93,0.95,0.56,0.44,0.45,0.92


In [None]:
# Check saved models
print("=" * 80)
print("SAVED MODELS INFORMATION")
print("=" * 80)

saved_models = model_saver.list_saved_models()
if saved_models:
    for model_key, info in saved_models.items():
        print(f"\nModel: {model_key}")
        print(f"  Exists: {info['exists']}")
        print(f"  Path: {info['model_path']}")
        if info['exists']:
            print(f"  Size: {info['size_bytes']} bytes")
            print(f"  Modified: {info['modified_time']}")
        
        # Load and display metadata if available
        if info['metadata_exists']:
            try:
                metadata = model_saver.load_metadata(model_key.split('_')[0], model_key.split('_')[1] if '_' in model_key else 'default')
                if metadata:
                    print(f"  Best Score: {metadata.get('best_score', 'N/A')}")
                    print(f"  Best Params: {metadata.get('best_params', 'N/A')}")
            except Exception as e:
                print(f"  Error loading metadata: {e}")
else:
    print("No saved models found.")


SAVED MODELS INFORMATION

Model: DecisionTree_no_sampling
  Exists: True
  Path: ../src/models/exploration_phase/DecisionTree_no_sampling.joblib
  Size: 232415 bytes
  Modified: 1760415820.9521165

Model: SVM_no_sampling
  Exists: True
  Path: ../src/models/exploration_phase/SVM_no_sampling.joblib
  Size: 12504219 bytes
  Modified: 1760415063.7689745

Model: KNN_no_sampling
  Exists: True
  Path: ../src/models/exploration_phase/KNN_no_sampling.joblib
  Size: 118528799 bytes
  Modified: 1760396388.739042

Model: RandomForest_no_sampling
  Exists: True
  Path: ../src/models/exploration_phase/RandomForest_no_sampling.joblib
  Size: 48327727 bytes
  Modified: 1760399598.0772789

Model: LogisticRegression_no_sampling
  Exists: True
  Path: ../src/models/exploration_phase/LogisticRegression_no_sampling.joblib
  Size: 26175 bytes
  Modified: 1760395829.1684663

Model: XGBoost_no_sampling
  Exists: True
  Path: ../src/models/exploration_phase/XGBoost_no_sampling.joblib
  Size: 2872222 bytes
  

### 3.2. With Sampling Methods

Quick run - Using the best models from above

In [38]:
## Test Models with Different Sampling Methods

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from sklearn.preprocessing import StandardScaler

sampling_methods = {
    'No_Sampling': None,
    'RandomOverSampler': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, k_neighbors=5),
    'ADASYN': ADASYN(random_state=42, n_neighbors=5),
    'SMOTETomek': SMOTETomek(random_state=42, smote=SMOTE(random_state=42, k_neighbors=5)),
    'SMOTEENN': SMOTEENN(random_state=42, smote=SMOTE(random_state=42, k_neighbors=5)),
}

sampling_results = {}

best_models = {
    'KNN': best_knn,
    'RandomForest': best_rf,
    'XGBoost': best_xgb,
}

scale_sensitive = ['LogisticRegression', 'SVM', 'KNN']

print("Testing sampling methods on best models...")
print("=" * 80)

for sampling_name, sampler in sampling_methods.items():
    print(f"\nTesting {sampling_name}...")
    sampling_results[sampling_name] = {}
    
    for model_name, model in best_models.items():
        print(f"  - {model_name}")
        
        try:
            if sampler is None:
                # No sampling - use original, only scaled data
                result = eval_model(model,
                                    X_train_s if model_name in scale_sensitive else X_train , y_train,
                                    X_val_s if model_name in scale_sensitive else X_val, y_val,
                                    X_test_s if model_name in scale_sensitive else X_test, y_test)
            else:
                # Apply sampling on unscaled data
                X_train_sampled, y_train_sampled = sampler.fit_resample(X_train, y_train)
                
                # Re-scale if needed for models that require scaling
                if model_name in scale_sensitive:
                    scaler_sampling = StandardScaler()
                    X_train_sampled = scaler_sampling.fit_transform(X_train_sampled)
                    X_val_sampled = scaler_sampling.transform(X_val)
                    X_test_sampled = scaler_sampling.transform(X_test)
                else: # e.g. RF, XGBoost
                    X_val_sampled = X_val
                    X_test_sampled = X_test
            
                result = eval_model(
                    model,
                    X_train_sampled, y_train_sampled,
                    X_val_sampled, y_val,
                    X_test_sampled, y_test,
                )
            
            sampling_results[sampling_name][model_name] = result
            
            # Printing statistics
            if sampler is not None:
                unique, counts = np.unique(y_train_sampled, return_counts=True)
                print(f"    Class distribution after {sampling_name}:")
                for cls, count in zip(unique, counts):
                    print(f"      Class {cls}: {count:,} samples")
            
        except Exception as e:
            print(f"    ERROR with {sampling_name} + {model_name}: {str(e)}")
            sampling_results[sampling_name][model_name] = None


Testing sampling methods on best models...

Testing No_Sampling...
  - KNN
  - RandomForest
  - XGBoost

Testing RandomOverSampler...
  - KNN
    Class distribution after RandomOverSampler:
      Class 0: 65,223 samples
      Class 1: 65,223 samples
      Class 2: 65,223 samples
      Class 3: 65,223 samples
      Class 4: 65,223 samples
  - RandomForest
    Class distribution after RandomOverSampler:
      Class 0: 65,223 samples
      Class 1: 65,223 samples
      Class 2: 65,223 samples
      Class 3: 65,223 samples
      Class 4: 65,223 samples
  - XGBoost
    Class distribution after RandomOverSampler:
      Class 0: 65,223 samples
      Class 1: 65,223 samples
      Class 2: 65,223 samples
      Class 3: 65,223 samples
      Class 4: 65,223 samples

Testing SMOTE...
  - KNN
    Class distribution after SMOTE:
      Class 0: 65,223 samples
      Class 1: 65,223 samples
      Class 2: 65,223 samples
      Class 3: 65,223 samples
      Class 4: 65,223 samples
  - RandomForest
    Cl

In [None]:

# Create comprehensive comparison table
print("\n" + "=" * 100)
print("SAMPLING METHODS COMPARISON")
print("=" * 100)

# Prepare comparison data
comparison_rows = []

for sampling_name, models_results in sampling_results.items():
    for model_name, result in models_results.items():
        if result is not None:
            row = {
                'sampling_method': sampling_name,
                'model': model_name,
                'val_accuracy': round(result['val']['accuracy'],2),
                'val_f1_macro': round(result['val']['f1_macro'],2),
                'test_accuracy': round(result['test']['accuracy'],2),
                'test_f1_macro': round(result['test']['f1_macro'],2)
            }
            
            # Add per-class F1 scores
            labels = result['labels']
            f1_v = result['val']['f1_per_class']
            f1_t = result['test']['f1_per_class']
            
            for lbl, f1 in zip(labels, f1_v):
                row[f'val_f1_cls_{_safe_col(lbl)}'] = round(f1,2)
            for lbl, f1 in zip(labels, f1_t):
                row[f'test_f1_cls_{_safe_col(lbl)}'] = round(f1,2)
            
            comparison_rows.append(row)

# Create and display comparison DataFrame
sampling_comparison_df = (
    pd.DataFrame(comparison_rows)
    .sort_values(by=['test_f1_macro', 'val_f1_macro'], ascending=False)
    .reset_index(drop=True)
)

In [None]:
sampling_comparison_df.to_csv("../src/data/03_model_testing_results/model_comparison_with_sampling_on_best_models.csv", index=False)

# Find best combination
best_sampling_model = sampling_comparison_df.iloc[0]
print(f"\nBEST COMBINATION:")
print(f"Sampling Method: {best_sampling_model['sampling_method']}")
print(f"Model: {best_sampling_model['model']}")
print(f"Test F1-Macro: {best_sampling_model['test_f1_macro']:.4f}")
print(f"Validation F1-Macro: {best_sampling_model['val_f1_macro']:.4f}")

# Summary statistics
print(f"\nSUMMARY STATISTICS:")
print(f"Total combinations tested: {len(comparison_rows)}")
print(f"Best test F1-macro: {sampling_comparison_df['test_f1_macro'].max():.4f}")
print(f"Best validation F1-macro: {sampling_comparison_df['val_f1_macro'].max():.4f}")

# Show top 5 combinations
print(f"\nTOP 5 COMBINATIONS:")
top_5 = sampling_comparison_df.head(5)[['sampling_method', 'model', 'test_f1_macro', 'val_f1_macro']]
display(top_5)


BEST COMBINATION:
Sampling Method: RandomOverSampler
Model: XGBoost
Test F1-Macro: 0.9200
Validation F1-Macro: 0.9200

SUMMARY STATISTICS:
Total combinations tested: 18
Best test F1-macro: 0.9200
Best validation F1-macro: 0.9200

TOP 5 COMBINATIONS:


Unnamed: 0,sampling_method,model,test_f1_macro,val_f1_macro
0,RandomOverSampler,XGBoost,0.92,0.92
1,SMOTE,XGBoost,0.91,0.91
2,SMOTETomek,XGBoost,0.91,0.91
3,No_Sampling,XGBoost,0.9,0.9
4,RandomOverSampler,RandomForest,0.9,0.9


## 4. Possible improvements - Not Tested

- Implement Leak-free scaling
    - current: scale X_train once then run CV on X_train_s -> leaks validation-fold info into scaling
    - solution: Pipeline(StandardScaler(), model) so scaling is fit per CV fold
- RepeatedStratifiedKFold for more stable estimates

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=10000, multi_class='multinomial'))
])

param_dist_logreg = {
    'C': loguniform(1e-3, 1e3), # Big C = less penalty on large weights (more freedom, risk of overfitting). 
                                # Small C = more penalty (more discipline, less overfitting).
                                # loguniform = means we try values spread across tiny to big scales (e.g., 0.001 up to 100), not just small steps.
    'penalty': ['l2'], # gently pushes weights toward zero, which keeps the model simpler and more stable.
    'solver': ['lbfgs'],
}

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
scoring = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}

rs_logreg = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist_logreg,
    n_iter=20,
    scoring=scoring,
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rs_logreg.fit(X_train, y_train) # not fitting on X_train_s because Pipeline will do it for us