# Model Testing

Questions to be answered:

- Remove outliers?
- Which Sampling method to use?


In [None]:
# Metrics computed by every RandomizedSearchCV run in this notebook; the
# searches refit their best estimator on the 'f1_macro' entry.
scoring = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}

# Shared seed for CV splitters, estimators, samplers and the searches.
random_state = 42

# Per-model evaluation results, keyed by model name. The evaluation cells
# assign into this dict (results['<model>'] = eval_model(...)), so it has to
# exist before the first of them runs; without it that cell raises NameError.
results = {}

## 1. Imports

In [None]:
import sys
import os 
sys.path.append('..')

from src.utils.preprocessing import (
    prepare_mitbih, 
    prepare_ptbdb,
    resample_training
)
from src.utils.evaluation import eval_model
from src.visualization import plot_confusion_matrix
from src.utils.model_saver import create_model_saver

# external 
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint, uniform
import numpy as np
import re
import json

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# Samplers

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from sklearn.preprocessing import StandardScaler

# Init model saver.
# The returned object is used throughout the notebook via model_exists(),
# load_model(), save_model(), load_metadata() and list_saved_models().
# NOTE(review): presumably the path argument is the directory where artifacts
# are persisted - confirm in src.utils.model_saver.
model_saver = create_model_saver("../src/models/exploration_phase")

In [None]:
# Build the MIT-BIH dataset object with outlier removal switched off and
# report the shapes of its train/test feature tables.
mitbih = prepare_mitbih(remove_outliers=False)

test_shape = mitbih.X_test.shape if mitbih.X_test is not None else 'None'

print("MITBIH dataset prepared:")
print(f"  Training size: {mitbih.X_train.shape}")
print(f"  Test size: {test_shape}")
print("Note: No validation set - using train/test split only. Cross-validation handles train/val splitting.")


## 2. Load Data

In [None]:
# Pull plain arrays out of the dataset object; labels are cast to int.
X_train = mitbih.X_train.values
X_test = mitbih.X_test.values
y_train, y_test = (
    mitbih.y_train.astype(int).values,
    mitbih.y_test.astype(int).values,
)

# Standardise features: statistics are learned on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_s = scaler.fit(X_train).transform(X_train)
X_test_s = scaler.transform(X_test)


### Create Param Spaces

In [None]:
# Central registry of search configurations, keyed by model name.
# Each entry provides:
#   "estimator" - the unfitted base estimator handed to RandomizedSearchCV
#   "params"    - the search space (a dict, or a list of dicts for LDA)
#   "cv"        - the StratifiedKFold splitter for that model
#   "n_iter"    - number of parameter settings to sample
# The LogisticRegression entry additionally carries "create_new_model", which
# the 3.1.1 cell reads to decide whether a saved search may be reused.
# The 3.2.2 training loop iterates over every entry of this dict; apart from
# logistic regression, the 3.1.x cells define their own spaces inline, and
# some of those values differ from the ones listed here (e.g. n_iter).
param_spaces = {
    "LogisticRegression": {
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases - confirm against the version this project pins.
        "estimator": LogisticRegression(max_iter=10000, multi_class='multinomial', solver='lbfgs', n_jobs=-1),
        "params": {
            "C": loguniform(1e-3, 1e3),      # Big C = less penalty on large weights (more freedom, risk of overfitting).
                                             # Small C = more penalty (more discipline, less overfitting).
                                             # loguniform = values are sampled evenly on a log scale between 1e-3 and 1e3.
            "penalty": ["l2"], # gently pushes weights toward zero, which keeps the model simpler and more stable.
            "solver": ["lbfgs"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        "n_iter": 5,
        "create_new_model": False,
    },
    "KNN": {
        "estimator": KNeighborsClassifier(n_jobs=-1),
        "params": {
            "n_neighbors": randint(1, 51),  # scipy randint excludes the upper bound: 1..50
            "weights": ["uniform", "distance"],
            "metric": ["minkowski", "manhattan", "euclidean"],
            "p": [1, 2],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        "n_iter": 5,
    },
    "RandomForest": {
        "estimator": RandomForestClassifier(random_state=random_state, n_jobs=-1),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [10, 15, 20],
            "min_samples_split": [2, 5, 10, 20, 50],
            "min_samples_leaf": [1, 2, 4, 8],
            "max_features": ["sqrt", "log2", None],
            "bootstrap": [True],
            "class_weight": ["balanced", None],
            "criterion": ["gini", "entropy"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        "n_iter": 5,
    },
    "SVM": {
        "estimator": SVC(),
        "params": {
            "kernel": ["rbf", "poly"],
            "C": [0.1, 1, 10],
            "gamma": [0.001, 0.01, 0.1, 0.5, 0.9],
        },
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state),
        "n_iter": 5,
    },
    "DecisionTree": {
        "estimator": DecisionTreeClassifier(random_state=random_state),
        "params": {
            "max_depth": [None, 5, 10, 15, 20, 25, 30],
            "min_samples_split": [2, 5, 10, 20, 50],
            "min_samples_leaf": [1, 2, 4, 8, 16],
            "max_features": ["sqrt", "log2", None],
            "criterion": ["gini", "entropy"],
            "class_weight": ["balanced", None],
            "splitter": ["best", "random"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        "n_iter": 100,
    },
    "XGBoost": {
        "estimator": xgb.XGBClassifier(
            objective="multi:softmax",
            num_class=5,
            random_state=random_state,
            n_jobs=-1,
            eval_metric="mlogloss",
        ),
        "params": {
            "n_estimators": [100, 200, 300, 500],
            "max_depth": [3, 4, 5, 6, 7, 8],
            "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
            "subsample": [0.8, 0.9, 1.0],
            "colsample_bytree": [0.8, 0.9, 1.0],
            "reg_alpha": [0, 0.1, 0.5, 1.0],
            "reg_lambda": [0, 0.1, 0.5, 1.0],
            "min_child_weight": [1, 3, 5, 7],
            "gamma": [0, 0.1, 0.2, 0.3],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        "n_iter": 40,
    },
    "LDA": {
        "estimator": LinearDiscriminantAnalysis(),
        # Two sub-spaces: "shrinkage" is only offered together with the
        # lsqr/eigen solvers, "store_covariance" only together with svd.
        "params": [
            {"solver": ["svd"], "store_covariance": [False, True], "tol": [1e-4, 1e-3, 1e-2]},
            {"solver": ["lsqr", "eigen"], "shrinkage": [None, "auto", 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], "tol": [1e-4, 1e-3, 1e-2]},
        ],
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state),
        "n_iter": 50,
    },
    "ANN": {
        "estimator": MLPClassifier(
            max_iter=300,
            early_stopping=True,
            random_state=random_state,
            n_iter_no_change=10,
            solver="adam",
        ),
        "params": {
            "hidden_layer_sizes": [(64,), (128,), (128, 64)],
            "activation": ["relu"],
            "alpha": loguniform(1e-4, 1e-2),
            "learning_rate_init": loguniform(1e-3, 1e-2),
            "batch_size": randint(64, 129),  # 64..128 (upper bound excluded)
            "beta_1": uniform(0.9, 0.09),    # scipy uniform(loc, scale): 0.90 to 0.99
            "beta_2": uniform(0.95, 0.049),  # 0.95 to 0.999
            "validation_fraction": [0.1, 0.15],
        },
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state),
        "n_iter": 100,
    },
}

## 3. Test models with Randomized Search CV

### 3.1 Without outlier removal or sampling

#### 3.1.1 Logistic Regression

In [None]:
# Logistic regression, "no_sampling" experiment, fitted on the scaled features.
# A previously saved search is reused unless the param_spaces entry asks for
# a fresh model through its 'create_new_model' flag.
classifier_name = "LogisticRegression"
experiment_name = "no_sampling"
create_new = param_spaces[classifier_name].get('create_new_model', False)

if create_new or not model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} not found. Training new model...")

    # Estimator, search space, budget and splitter all come from param_spaces.
    logreg = param_spaces[classifier_name]['estimator']
    param_dist_logreg = param_spaces[classifier_name]['params']

    rs_logreg = RandomizedSearchCV(
        logreg,
        param_dist_logreg,
        n_iter=param_spaces[classifier_name]['n_iter'],
        cv=param_spaces[classifier_name]['cv'],
        scoring=scoring,
        refit='f1_macro',
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )
    rs_logreg.fit(X_train_s, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = dict(
        best_params=rs_logreg.best_params_,
        best_score=rs_logreg.best_score_,
        cv_results=rs_logreg.cv_results_,
        experiment=experiment_name,
        classifier=classifier_name,
    )
    model_saver.save_model(classifier_name, rs_logreg, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")
else:
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_logreg = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

In [None]:
# Evaluate the search's refit best estimator on the scaled train/test splits
# and record the outcome under the model's name.
best_logreg = rs_logreg.best_estimator_
results['LogisticRegression'] = eval_model(best_logreg, X_train_s, y_train, X_test_s, y_test)

In [None]:
# Render the tuned estimator, then show its evaluation results as the cell output.
display(best_logreg)
results['LogisticRegression']

#### 3.1.2 KNN

In [None]:
# k-nearest neighbours, "no_sampling" experiment, fitted on the scaled features.
# The search is only run when no saved search exists for this pair.
classifier_name = "KNN"
experiment_name = "no_sampling"

if not model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} not found. Training new model...")

    knn = KNeighborsClassifier()
    # Search space defined inline (this cell does not read param_spaces).
    param_dist_knn = dict(
        n_neighbors=randint(1, 51),  # upper bound is exclusive: 1..50
        weights=['uniform', 'distance'],
        metric=['minkowski', 'manhattan', 'euclidean'],
        p=[1, 2],  # Minkowski power parameter
    )

    rs_knn = RandomizedSearchCV(
        knn,
        param_dist_knn,
        n_iter=20,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        scoring=scoring,
        refit='f1_macro',
        random_state=random_state,
        n_jobs=-1,
    )
    rs_knn.fit(X_train_s, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = dict(
        best_params=rs_knn.best_params_,
        best_score=rs_knn.best_score_,
        cv_results=rs_knn.cv_results_,
        experiment=experiment_name,
        classifier=classifier_name,
    )
    model_saver.save_model(classifier_name, rs_knn, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")
else:
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_knn = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

In [None]:
# Evaluate the tuned KNN on the scaled train/test splits and record the outcome.
best_knn = rs_knn.best_estimator_
results['KNN'] = eval_model(best_knn, X_train_s, y_train, X_test_s, y_test)

In [None]:
# Render the tuned estimator, then show its evaluation results as the cell output.
display(best_knn)
results['KNN']

#### 3.1.3 Random Forest


In [None]:
# Random forest, "no_sampling" experiment, fitted on the unscaled features.
# The search is only run when no saved search exists for this pair.
classifier_name = "RandomForest"
experiment_name = "no_sampling"

if not model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} not found. Training new model...")

    rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)
    # Search space defined inline (this cell does not read param_spaces).
    param_dist_rf = dict(
        n_estimators=[100, 200, 300],
        max_depth=[10, 15, 20],                # bounded tree depth
        min_samples_split=[2, 5, 10, 20, 50],
        min_samples_leaf=[1, 2, 4, 8],         # larger leaves regularise more
        max_features=['sqrt', 'log2', None],
        bootstrap=[True],
        class_weight=['balanced', None],       # optional re-weighting of classes
        criterion=['gini', 'entropy'],
    )

    rs_rf = RandomizedSearchCV(
        rf,
        param_dist_rf,
        n_iter=20,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        scoring=scoring,
        refit='f1_macro',
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )
    # Raw features: the forest is trained without standardisation.
    rs_rf.fit(X_train, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = dict(
        best_params=rs_rf.best_params_,
        best_score=rs_rf.best_score_,
        cv_results=rs_rf.cv_results_,
        experiment=experiment_name,
        classifier=classifier_name,
    )
    model_saver.save_model(classifier_name, rs_rf, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")
else:
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_rf = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

In [None]:
# Evaluate the tuned random forest on the unscaled train/test splits and
# record the outcome.
best_rf = rs_rf.best_estimator_
results['RandomForest'] = eval_model(best_rf, X_train, y_train, X_test, y_test)

In [None]:
# Render the tuned estimator, then show its evaluation results as the cell output.
display(best_rf)
results['RandomForest']

#### 3.1.4 SVM

In [None]:
# Support vector classifier, "no_sampling" experiment, fitted on the scaled features.
# The search is only run when no saved search exists for this pair.
classifier_name = "SVM"
experiment_name = "no_sampling"

if not model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} not found. Training new model...")

    svm = SVC()
    # Search space defined inline (this cell does not read param_spaces).
    param_dist_svm = dict(
        kernel=['rbf', 'poly'],
        C=[0.1, 1, 10],
        gamma=[0.001, 0.01, 0.1, 0.5, 1],
    )

    rs_svm = RandomizedSearchCV(
        svm,
        param_dist_svm,
        n_iter=15,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state),
        scoring=scoring,
        refit='f1_macro',
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )
    rs_svm.fit(X_train_s, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = dict(
        best_params=rs_svm.best_params_,
        best_score=rs_svm.best_score_,
        cv_results=rs_svm.cv_results_,
        experiment=experiment_name,
        classifier=classifier_name,
    )
    model_saver.save_model(classifier_name, rs_svm, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")
else:
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_svm = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

In [None]:
# Evaluate the tuned SVM on the scaled train/test splits and record the outcome.
best_svm = rs_svm.best_estimator_
results['SVM'] = eval_model(best_svm, X_train_s, y_train, X_test_s, y_test)

In [None]:
# Render the tuned estimator, then show its evaluation results as the cell output.
display(best_svm)
results['SVM']

#### 3.1.5 Decision Tree Classifier

In [None]:
# Decision tree, "no_sampling" experiment, fitted on the unscaled features.
# The search is only run when no saved search exists for this pair.
classifier_name = "DecisionTree"
experiment_name = "no_sampling"

if not model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} not found. Training new model...")

    dt = DecisionTreeClassifier(random_state=random_state)
    # Search space defined inline (this cell does not read param_spaces).
    param_dist_dt = dict(
        max_depth=[None, 5, 10, 15, 20, 25, 30],
        min_samples_split=[2, 5, 10, 20, 50],
        min_samples_leaf=[1, 2, 4, 8, 16],
        max_features=['sqrt', 'log2', None],
        criterion=['gini', 'entropy'],
        class_weight=['balanced', None],
        splitter=['best', 'random'],  # how candidate splits are chosen
    )

    rs_dt = RandomizedSearchCV(
        dt,
        param_dist_dt,
        n_iter=100,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        scoring=scoring,
        refit='f1_macro',
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )
    # Raw features: the tree is trained without standardisation.
    rs_dt.fit(X_train, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = dict(
        best_params=rs_dt.best_params_,
        best_score=rs_dt.best_score_,
        cv_results=rs_dt.cv_results_,
        experiment=experiment_name,
        classifier=classifier_name,
    )
    model_saver.save_model(classifier_name, rs_dt, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")
else:
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_dt = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

In [None]:
# Evaluate the tuned decision tree on the unscaled train/test splits and
# record the outcome.
best_dt = rs_dt.best_estimator_
results['DecisionTree'] = eval_model(best_dt, X_train, y_train, X_test, y_test)

In [None]:
# Render the tuned estimator, then show its evaluation results as the cell output.
display(best_dt)
results['DecisionTree']

#### 3.1.6 XGBoost / Gradient Boosting

In [None]:
# XGBoost, "no_sampling" experiment, fitted on the unscaled features.
# The search is only run when no saved search exists for this pair.
classifier_name = "XGBoost"
experiment_name = "no_sampling"

if not model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} not found. Training new model...")

    # Multi-class softmax objective configured for 5 classes.
    xgb_model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=5,
        eval_metric='mlogloss',
        random_state=random_state,
        n_jobs=-1,
    )

    # Search space defined inline (this cell does not read param_spaces).
    param_dist_xgb = dict(
        n_estimators=[100, 200, 300, 500],
        max_depth=[3, 4, 5, 6, 7, 8],
        learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
        subsample=[0.8, 0.9, 1.0],
        colsample_bytree=[0.8, 0.9, 1.0],
        reg_alpha=[0, 0.1, 0.5, 1.0],    # L1 regularisation strength
        reg_lambda=[0, 0.1, 0.5, 1.0],   # L2 regularisation strength
        min_child_weight=[1, 3, 5, 7],
        gamma=[0, 0.1, 0.2, 0.3],        # minimum loss reduction for a split
    )

    rs_xgb = RandomizedSearchCV(
        xgb_model,
        param_dist_xgb,
        n_iter=30,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        scoring=scoring,
        refit='f1_macro',
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )
    # Raw features: the booster is trained without standardisation.
    rs_xgb.fit(X_train, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = dict(
        best_params=rs_xgb.best_params_,
        best_score=rs_xgb.best_score_,
        cv_results=rs_xgb.cv_results_,
        experiment=experiment_name,
        classifier=classifier_name,
    )
    model_saver.save_model(classifier_name, rs_xgb, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")
else:
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_xgb = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

In [None]:
# Evaluate the tuned XGBoost model on the unscaled train/test splits and
# record the outcome.
best_xgb = rs_xgb.best_estimator_
results['XGBoost'] = eval_model(best_xgb, X_train, y_train, X_test, y_test)

In [None]:
# Render the tuned estimator, then show its evaluation results as the cell output.
display(best_xgb)
results['XGBoost']

#### 3.1.7 Linear Discriminant Analysis

In [None]:
# Linear discriminant analysis, "no_sampling" experiment, fitted on the scaled features.
# The search is only run when no saved search exists for this pair.
classifier_name = "LDA"
experiment_name = "no_sampling"

if not model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} not found. Training new model...")

    # Two sub-spaces: shrinkage is only offered with the lsqr/eigen solvers,
    # store_covariance only with svd. n_components is left at its default.
    svd_space = {
        "solver": ["svd"],
        "store_covariance": [False, True],
        "tol": [1e-4, 1e-3, 1e-2],
    }
    shrinkage_space = {
        "solver": ["lsqr", "eigen"],
        "shrinkage": [None, "auto", 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9],
        "tol": [1e-4, 1e-3, 1e-2],
    }
    param_distributions = [svd_space, shrinkage_space]

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    lda = LinearDiscriminantAnalysis()

    rs_lda = RandomizedSearchCV(
        lda,
        param_distributions,
        n_iter=50,
        cv=cv,
        scoring=scoring,
        refit="f1_macro",
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )
    rs_lda.fit(X_train_s, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = dict(
        best_params=rs_lda.best_params_,
        best_score=rs_lda.best_score_,
        cv_results=rs_lda.cv_results_,
        experiment=experiment_name,
        classifier=classifier_name,
    )
    model_saver.save_model(classifier_name, rs_lda, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")
else:
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_lda = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

In [None]:
# Evaluate the tuned LDA on the scaled train/test splits and record the outcome.
best_lda = rs_lda.best_estimator_
results['LDA'] = eval_model(best_lda, X_train_s, y_train, X_test_s, y_test)

In [None]:
# Render the tuned estimator and the winning parameter set, then show the
# evaluation results as the cell output.
display(best_lda)
display(rs_lda.best_params_)
results['LDA']

#### 3.1.8 Artificial Neural Networks

In [None]:
# MLP classifier ("ANN"), "no_sampling" experiment, fitted on the scaled features.
# A saved search is reused when one exists; otherwise a new search is run and saved.
classifier_name = "ANN"
experiment_name = "no_sampling"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    # Must bind rs_ann: the following cells read rs_ann.best_estimator_ and
    # rs_ann.best_score_. Binding rs_lda here would overwrite the LDA search
    # and leave rs_ann undefined whenever a saved ANN model is found.
    rs_ann = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")

else:
    print(f"Model {classifier_name} not found. Training new model...")

    # Adam-trained MLP with early stopping enabled.
    ann = MLPClassifier(
        max_iter=300,
        early_stopping=True,
        random_state=random_state,
        n_iter_no_change=10,
        solver="adam",
    )

    # Search space defined inline (this cell does not read param_spaces).
    param_distributions = {
        "hidden_layer_sizes": [
            (64,),
            (128,),
            (128, 64),
        ],
        "activation": ["relu"],
        "alpha": loguniform(1e-4, 1e-2),  # L2 penalty
        "learning_rate_init": loguniform(1e-3, 1e-2),
        "batch_size": randint(64, 129),   # 64..128 (upper bound excluded)
        "beta_1": uniform(0.9, 0.09),     # scipy uniform(loc, scale): 0.90 to 0.99
        "beta_2": uniform(0.95, 0.049),   # 0.95 to 0.999
        "validation_fraction": [0.1, 0.15],
    }

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    rs_ann = RandomizedSearchCV(
        estimator=ann,
        param_distributions=param_distributions,
        n_iter=20,
        scoring=scoring,
        refit="f1_macro",
        cv=cv,
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )

    rs_ann.fit(X_train_s, y_train)

    # Persist the fitted search together with a summary of its outcome.
    metadata = {
        "best_params": rs_ann.best_params_,
        "best_score": rs_ann.best_score_,
        "cv_results": rs_ann.cv_results_,
        "experiment": experiment_name,
        "classifier": classifier_name,
    }
    model_saver.save_model(classifier_name, rs_ann, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")


In [None]:
# Evaluate the tuned MLP on the scaled train/test splits and record the outcome.
best_ann = rs_ann.best_estimator_
results['ANN'] = eval_model(best_ann, X_train_s, y_train, X_test_s, y_test)

In [None]:
# Render the tuned estimator and the best cross-validated score of the search,
# then show the evaluation results as the cell output.
display(best_ann)
display(rs_ann.best_score_)
results['ANN']

#### 3.1.9 Results Summary and Comparison

In [None]:
def _safe_col(label):
    # Make safe column names like "val_f1_cls_0" or "val_f1_cls_N"
    return re.sub(r'[^0-9a-zA-Z_]+', '_', str(label)).strip('_')

import os  # kept local so this cell also runs on its own

# Model name -> RandomizedSearchCV object from the 3.1.x cells. Used to attach
# the best CV score and the best parameters to each comparison row.
models_and_searchers = {
    "LogisticRegression": rs_logreg,
    "KNN": rs_knn,
    "RandomForest": rs_rf,
    "SVM": rs_svm,
    "DecisionTree": rs_dt,
    "XGBoost": rs_xgb,
    "LDA": rs_lda,
    "ANN": rs_ann,
}

# One row per evaluated model: test metrics, search outcome, per-class test F1.
rows = []
for name, res in results.items():
    row = {
        'model': name,
        'test_accuracy': round(res['test']['accuracy'], 2),
        'test_f1_macro': round(res['test']['f1_macro'], 2),
    }

    # Add the search outcome when a searcher is registered for this model.
    if name in models_and_searchers:
        searcher = models_and_searchers[name]
        best_params = searcher.best_params_
        best_cv_score = searcher.best_score_
        row['best_cv_score'] = round(best_cv_score, 2)
        row['best_parameters'] = str(best_params)
    else:
        row['best_cv_score'] = None
        row['best_parameters'] = None

    labels = res['labels']
    f1_t = res['test']['f1_per_class']

    # Per-class F1 columns for the test set only.
    for lbl, f1 in zip(labels, f1_t):
        row[f'test_f1_cls_{_safe_col(lbl)}'] = round(f1, 2)

    rows.append(row)

# Best test macro-F1 first.
comparison_df = (
    pd.DataFrame(rows)
      .sort_values(by=['test_f1_macro'], ascending=False)
      .reset_index(drop=True)
)

best_model_name = comparison_df.iloc[0]['model']
best_model_results = results[best_model_name]

comparison_df_display = comparison_df.copy()
# NOTE(review): 'best_parameters' is already stringified above, so the
# json.dumps branch only applies to entries that are still dicts.
comparison_df_display['best_parameters'] = comparison_df_display['best_parameters'].apply(
    lambda x: json.dumps(x, indent=2) if isinstance(x, dict) else x
)

# Create the output directory first so that to_csv does not fail on a fresh
# checkout where it is missing.
# NOTE(review): this writes under ../src/data while the sampling comparison
# is written under ../reports - confirm which root is intended.
comparison_csv_path = "../src/data/03_model_testing_results/03_01_model_comparison_without_resampling.csv"
os.makedirs(os.path.dirname(comparison_csv_path), exist_ok=True)
comparison_df_display.to_csv(comparison_csv_path, index=False)

# Display the comparison table with best parameters
print("=" * 100)
print("MODEL COMPARISON WITH BEST PARAMETERS FROM RANDOMIZEDSEARCHCV")
print("=" * 100)
display(comparison_df_display)


In [None]:
# List every artifact known to the model saver and print its file details
# plus, when present, the best score/params recorded in its metadata.
print("=" * 80)
print("SAVED MODELS INFORMATION")
print("=" * 80)

saved_models = model_saver.list_saved_models()
if saved_models:
    for model_key, info in saved_models.items():
        print(f"\nModel: {model_key}")
        print(f"  Exists: {info['exists']}")
        print(f"  Path: {info['model_path']}")
        if info['exists']:
            print(f"  Size: {info['size_bytes']} bytes")
            print(f"  Modified: {info['modified_time']}")

        # Load and display metadata if available
        if info['metadata_exists']:
            try:
                # Split on the FIRST underscore only: the experiment names used
                # in this notebook ("no_sampling", "with_sampling_<sampler>")
                # contain underscores themselves, so taking split('_')[1]
                # would truncate them (e.g. to "no").
                # NOTE(review): assumes keys look like "<classifier>_<experiment>"
                # - confirm against model_saver.list_saved_models().
                saved_classifier, separator, saved_experiment = model_key.partition('_')
                metadata = model_saver.load_metadata(
                    saved_classifier,
                    saved_experiment if separator else 'default',
                )
                if metadata:
                    print(f"  Best Score: {metadata.get('best_score', 'N/A')}")
                    print(f"  Best Params: {metadata.get('best_params', 'N/A')}")
            except Exception as e:
                # Best effort: a broken metadata file must not abort the listing.
                print(f"  Error loading metadata: {e}")
else:
    print("No saved models found.")


### 3.2. With Sampling Methods

but without
- Feature Engineering ( RR-Interval! )
- baseline wandering removal
- denoising
- Leak-Free Scaling
- RepeatedStratifiedKFold


#### 3.2.1 Quick run - Using the best models from above

In [None]:
# Quick run: re-evaluate three of the tuned models under each resampling
# strategy. Hyperparameters are not re-tuned here.
sampling_methods = {
    'No_Sampling': None,
    'RandomOverSampler': RandomOverSampler(random_state=random_state),
    'SMOTE': SMOTE(random_state=random_state, k_neighbors=5),
    'ADASYN': ADASYN(random_state=random_state, n_neighbors=5),
    'SMOTETomek': SMOTETomek(random_state=random_state, smote=SMOTE(random_state=random_state, k_neighbors=5)),
    'SMOTEENN': SMOTEENN(random_state=random_state, smote=SMOTE(random_state=random_state, k_neighbors=5)),
}

# sampling_results[<sampling name>][<model name>] -> eval_model output, or None on failure.
sampling_results = {}

best_models = {
    'KNN': best_knn,
    'RandomForest': best_rf,
    'XGBoost': best_xgb,
}

# Models in this list are evaluated on standardised features.
scale_sensitive = ['LogisticRegression', 'SVM', 'KNN']

print("Testing sampling methods on best models...")
print("=" * 80)

for sampling_name, sampler in sampling_methods.items():
    print(f"\nTesting {sampling_name}...")
    sampling_results[sampling_name] = {}

    for model_name, model in best_models.items():
        print(f"  - {model_name}")
        needs_scaling = model_name in scale_sensitive

        try:
            if sampler is not None:
                # Resample the unscaled training data.
                X_train_sampled, y_train_sampled = sampler.fit_resample(X_train, y_train)

                if needs_scaling:
                    # Fit a fresh scaler on the resampled training data and
                    # apply it to the untouched test set.
                    scaler_sampling = StandardScaler()
                    X_train_sampled = scaler_sampling.fit_transform(X_train_sampled)
                    X_test_sampled = scaler_sampling.transform(X_test)
                else:
                    X_test_sampled = X_test

                result = eval_model(
                    model,
                    X_train_sampled, y_train_sampled,
                    X_test_sampled, y_test,
                )
            elif needs_scaling:
                # Baseline without resampling, on the already-scaled splits.
                result = eval_model(model, X_train_s, y_train, X_test_s, y_test)
            else:
                # Baseline without resampling, on the raw splits.
                result = eval_model(model, X_train, y_train, X_test, y_test)

            sampling_results[sampling_name][model_name] = result

            # Report the class balance produced by the sampler.
            if sampler is not None:
                unique, counts = np.unique(y_train_sampled, return_counts=True)
                print(f"    Class distribution after {sampling_name}:")
                for cls, count in zip(unique, counts):
                    print(f"      Class {cls}: {count:,} samples")

        except Exception as e:
            # A failing combination is recorded as None so the remaining ones still run.
            print(f"    ERROR with {sampling_name} + {model_name}: {str(e)}")
            sampling_results[sampling_name][model_name] = None


In [None]:

# Flatten sampling_results into one table row per successful
# (sampling method, model) combination.
print("\n" + "=" * 100)
print("SAMPLING METHODS COMPARISON")
print("=" * 100)

comparison_rows = []

for sampling_name, models_results in sampling_results.items():
    for model_name, result in models_results.items():
        if result is None:
            # Combination was recorded as failed; leave it out of the table.
            continue

        row = {
            'sampling_method': sampling_name,
            'model': model_name,
            'test_accuracy': round(result['test']['accuracy'], 2),
            'test_f1_macro': round(result['test']['f1_macro'], 2),
        }

        # Per-class F1 columns for the test set only.
        labels = result['labels']
        f1_t = result['test']['f1_per_class']
        row.update(
            (f'test_f1_cls_{_safe_col(lbl)}', round(f1, 2))
            for lbl, f1 in zip(labels, f1_t)
        )

        comparison_rows.append(row)

# Best test macro-F1 first, with a clean 0..n-1 index.
sampling_comparison_df = pd.DataFrame(comparison_rows)
sampling_comparison_df = sampling_comparison_df.sort_values(by=['test_f1_macro'], ascending=False)
sampling_comparison_df = sampling_comparison_df.reset_index(drop=True)

In [None]:
# Persist the sampling comparison and print a short summary of it.
# The output directory is created first so that to_csv does not fail on a
# fresh checkout where it is missing (os is imported in the imports cell).
sampling_csv_path = "../reports/03_model_testing_results/03_02_model_comparison_with_sampling_on_best_models.csv"
os.makedirs(os.path.dirname(sampling_csv_path), exist_ok=True)
sampling_comparison_df.to_csv(sampling_csv_path, index=False)

# Best combination: the table is sorted by test macro-F1, descending, so it is row 0.
best_sampling_model = sampling_comparison_df.iloc[0]
print(f"\nBEST COMBINATION:")
print(f"Sampling Method: {best_sampling_model['sampling_method']}")
print(f"Model: {best_sampling_model['model']}")
print(f"Test F1-Macro: {best_sampling_model['test_f1_macro']:.4f}")

# Summary statistics
print(f"\nSUMMARY STATISTICS:")
print(f"Total combinations tested: {len(comparison_rows)}")
print(f"Best test F1-macro: {sampling_comparison_df['test_f1_macro'].max():.4f}")

# Show top 5 combinations
print(f"\nTOP 5 COMBINATIONS:")
top_5 = sampling_comparison_df.head(5)[['sampling_method', 'model', 'test_f1_macro']]
display(top_5)

#### 3.2.2 Extended Run: Sampling + RandomizedSearchCV

In [None]:

# Configuration for the extended run (3.2.2): the resampling strategies to
# search under, and the models whose inputs get standardised. This cell only
# defines settings; the searches themselves run in the next cell.

sampling_methods = dict(
    RandomOverSampler=RandomOverSampler(random_state=random_state),
    SMOTE=SMOTE(random_state=random_state, k_neighbors=5),
    ADASYN=ADASYN(random_state=random_state, n_neighbors=5),
    SMOTETomek=SMOTETomek(random_state=random_state, smote=SMOTE(random_state=random_state, k_neighbors=5)),
    SMOTEENN=SMOTEENN(random_state=random_state, smote=SMOTE(random_state=random_state, k_neighbors=5)),
)

# Models that need scaling (no pipeline is used; the scaler is fitted on the
# resampled training set).
scale_sensitive = ["LogisticRegression", "KNN", "SVM", "LDA", "ANN"]



In [None]:
# 3.2.2 training phase.
# For every sampling method the original training set is resampled once, then
# a RandomizedSearchCV is run for each model in `param_spaces` and persisted
# through `model_saver`. Searches that are already saved are skipped, so the
# cell can be re-run after an interruption.
print("Starting 3.2.2 TRAINING PHASE: Full RandomizedSearchCV for each model, per sampling method")
print("=" * 80)

for sampling_name, sampler in sampling_methods.items():
    print(f"\n=== Training with Sampling: {sampling_name} ===")
    experiment_name = f"with_sampling_{sampling_name}"

    # Apply sampling on original training set (before CV).
    # NOTE: this is leak-prone by design here - resampled data ends up in both
    # the training and validation part of the CV folds.
    try:
        if sampler is None:
            X_train_res, y_train_res = X_train, y_train
        else:
            X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    except Exception as e:
        # Best effort: one failing sampler must not abort the whole run.
        print(f"  Skipping sampling '{sampling_name}' due to error: {e}")
        continue

    # Show distribution if sampling applied
    if sampler is not None:
        unique, counts = np.unique(y_train_res, return_counts=True)
        print("  Class distribution after sampling:")
        for cls, cnt in zip(unique, counts):
            print(f"    Class {cls}: {cnt:,} samples")

    # Standardized copy of the resampled training set. It is identical for all
    # scale-sensitive models of this sampler, so it is built at most once and
    # only when a model that still has to be trained actually needs it.
    X_train_res_scaled = None

    # For each model, run RS-CV on the resampled dataset (TRAINING ONLY)
    for model_name, spec in param_spaces.items():
        classifier_name = model_name

        try:
            # Skip early (before any scaling work) if an RS-CV object already
            # exists for this sampler+model.
            if model_saver.model_exists(classifier_name, experiment_name):
                print(f"  [{model_name}] Exists for {experiment_name}. Skipping to next one...")
                continue

            # Prepare data: scaler is fit on the full resampled training set
            # (no per-fold pipeline in this run).
            if model_name in scale_sensitive:
                if X_train_res_scaled is None:
                    scaler = StandardScaler()
                    X_train_res_scaled = scaler.fit_transform(X_train_res)
                X_tr_fit = X_train_res_scaled
            else:
                X_tr_fit = X_train_res

            print(f"  [{model_name}] Training RS-CV for {experiment_name}...")
            rs = RandomizedSearchCV(
                estimator=spec["estimator"],
                param_distributions=spec["params"],
                n_iter=spec["n_iter"],
                scoring=scoring,
                refit="f1_macro",  # best_estimator_/best_score_ follow macro F1
                cv=spec["cv"],
                random_state=random_state,
                n_jobs=-1,
                verbose=2,
            )
            rs.fit(X_tr_fit, y_train_res)

            metadata = {
                "best_params": rs.best_params_,
                "best_score": rs.best_score_,
                "cv_results": rs.cv_results_,
                "experiment": experiment_name,
                "classifier": classifier_name,
                "sampling_method": sampling_name,
            }
            model_saver.save_model(classifier_name, rs, experiment_name, metadata)
            print(f"  [{model_name}] Saved for {experiment_name}.")

        except Exception as e:
            # Best effort: report the failure and continue with the next model.
            print(f"  [{model_name}] ERROR for {experiment_name}: {e}")

print("\n" + "=" * 80)
print("TRAINING PHASE COMPLETED")
print("=" * 80)



In [None]:
# Step 2: Model Evaluation Phase
# This cell only evaluates the RandomizedSearchCV objects saved by the training
# cell. Each result row is appended to the CSV immediately, and rows already
# present in the CSV are skipped, so the cell can be re-run after an
# interruption without repeating finished work.

print("Starting 3.2.2 EVALUATION PHASE: Evaluating trained models")
print("=" * 80)

# Path to results file
out_path = "../reports/03_model_testing_results/03_03_model_comparison_with_sampling_randomized_search.csv"

# Load existing results if CSV already exists
if os.path.exists(out_path):
    existing_df = pd.read_csv(out_path)
    print(f"Loaded existing results with {len(existing_df)} rows from {out_path}")
else:
    existing_df = pd.DataFrame()
    print(f"No existing results found. Will create {out_path}")

# Make sure the report directory exists so the first append cannot fail on it.
os.makedirs(os.path.dirname(out_path), exist_ok=True)


def _to_builtin(value):
    """json.dumps fallback that unwraps NumPy scalars/arrays.

    Sampled hyper-parameters can be NumPy scalars (e.g. np.int64), which the
    json module refuses to serialize. Anything else still raises TypeError,
    exactly as json.dumps would without this hook.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Evaluation and saving
sampling_results = {}

for sampling_name, sampler in sampling_methods.items():
    print(f"\n=== Evaluating with Sampling: {sampling_name} ===")
    sampling_results[sampling_name] = {}
    experiment_name = f"with_sampling_{sampling_name}"

    # Work out which models still need evaluating for this sampler, so the
    # (potentially slow) resampling is skipped when nothing is pending.
    if existing_df.empty:
        done_models = set()
    else:
        same_sampler = existing_df["sampling_method"] == sampling_name
        done_models = set(existing_df.loc[same_sampler, "model"])

    pending_models = []
    for model_name in param_spaces:
        if model_name in done_models:
            print(f"  [{model_name}] Skipping (already in CSV)")
        else:
            pending_models.append(model_name)

    if not pending_models:
        continue

    # Apply sampling on original training set (same procedure as in training)
    try:
        if sampler is None:
            X_train_res, y_train_res = X_train, y_train
        else:
            X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    except Exception as e:
        # Best effort: one failing sampler must not abort the whole run.
        print(f"  Skipping sampling '{sampling_name}' due to error: {e}")
        continue

    # (scaled train, scaled test) for this sampler; identical for every
    # scale-sensitive model, so it is computed at most once and on demand.
    scaled_data = None

    for model_name in pending_models:
        classifier_name = model_name

        try:
            if not model_saver.model_exists(classifier_name, experiment_name):
                print(f"  [{model_name}] No trained model found for {experiment_name}")
                sampling_results[sampling_name][model_name] = None
                continue

            # Prepare scaled data if necessary (scaler fit on train only)
            if model_name in scale_sensitive:
                if scaled_data is None:
                    scaler = StandardScaler()
                    scaled_data = (
                        scaler.fit_transform(X_train_res),
                        scaler.transform(X_test),
                    )
                X_tr_fit, X_te_fit = scaled_data
            else:
                X_tr_fit, X_te_fit = X_train_res, X_test

            # Load the trained model
            print(f"  [{model_name}] Loading trained model for evaluation...")
            rs = model_saver.load_model(classifier_name, experiment_name)

            # Evaluate best estimator on test set
            print(f"  Evaluating model [{model_name}]")
            best_est = rs.best_estimator_
            res = eval_model(
                best_est,
                X_tr_fit, y_train_res,
                X_te_fit, y_test,
            )

            sampling_results[sampling_name][model_name] = {"rs": rs, "eval": res}

            # Prepare single row result
            row = {
                "sampling_method": sampling_name,
                "model": model_name,
                "test_accuracy": round(res["test"]["accuracy"], 2),
                "test_f1_macro": round(res["test"]["f1_macro"], 2),
                "best_cv_score": round(rs.best_score_, 2),
                "best_parameters": json.dumps(rs.best_params_, default=_to_builtin),
            }

            # Add per-class F1 scores
            labels = res["labels"]
            for lbl, f1 in zip(labels, res["test"]["f1_per_class"]):
                row[f"test_f1_cls_{lbl}"] = round(float(f1), 2)

            # Convert to DataFrame and append immediately, so a later failure
            # or interruption does not lose this result.
            new_df = pd.DataFrame([row])
            header = not os.path.exists(out_path)
            new_df.to_csv(out_path, mode="a", index=False, header=header)
            print(f"  ✅ Result saved for [{model_name}] ({sampling_name})")

            # Update in-memory record too
            existing_df = pd.concat([existing_df, new_df], ignore_index=True)

        except Exception as e:
            # Best effort: report the failure and continue with the next model.
            print(f"  [{model_name}] ERROR for {experiment_name}: {e}")
            sampling_results[sampling_name][model_name] = None


# Final check summary
if not existing_df.empty:
    print("\n" + "=" * 100)
    print("CURRENT MODEL EVALUATION SUMMARY")
    print("=" * 100)
    display(
        existing_df[["sampling_method", "model", "test_f1_macro"]]
        .sort_values(by=["test_f1_macro"], ascending=False)
        .head(10)
    )
else:
    print("No results to display.")

print("\n" + "=" * 80)
print("EVALUATION PHASE COMPLETED")
print("=" * 80)


## 4. GridSearch - Final run on 3 best models

- Target models: XGBoost, ANN, SVM
- Test with and without outlier removal
- GridSearchCV on same parameter spaces
- using common train test dataset
- RepeatedStratifiedKFold:
    - a single CV split can be "lucky" or "unlucky" --> the result depends on how the data is shuffled
    - repeating stratified k-fold with different shuffles averages out randomness
    - more stable, less noisy estimates of performance
- Implement Leak-free scaling
    - current: without a pipeline, the scaler is fit once on the full training set and CV is then run on the already scaled data --> the scaler "saw" all CV folds, including each fold's validation part --> data leakage
    - does this make the CV scores too optimistic?
    - a pipeline fits the scaler only on each training fold, then applies it to that fold's validation split.
    - solution: Pipeline(StandardScaler(), model) so scaling is fit per CV fold



# Last Open Point

- Try to optimize the signal - run best models with the optimized signal and new features
    - add RR-Interval as feature
    - add new Target "not_normal" in MIT to compare to PTB
    - baseline wandering removal
    - denoising