# Model Testing - PTBDB Dataset

Questions to be answered:

- Remove outliers?
- Which Sampling method to use?


## 1. Imports


In [None]:
import sys
sys.path.append('..')

from src.utils.preprocessing import (
    prepare_mitbih,
    prepare_ptbdb,
    resample_training
)
from src.visualization import plot_confusion_matrix
from src.models.exploration_phase import create_model_saver

import pandas as pd
import numpy as np
import re
import json

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint

# Initialize model saver
model_saver = create_model_saver()


In [None]:
def eval_model(model, X_tr, y_tr, X_va, y_va, X_te, y_te):
    model.fit(X_tr, y_tr)
    yv = model.predict(X_va)
    yt = model.predict(X_te)

    # Choose a consistent label order (dynamic)
    labels = np.unique(np.concatenate([y_tr, y_va, y_te]))

    # Validation
    acc_v = accuracy_score(y_va, yv)
    p_v_m, r_v_m, f1_v_m, _ = precision_recall_fscore_support(
        y_va, yv, average='macro', zero_division=0
    )
    p_v_c, r_v_c, f1_v_c, sup_v = precision_recall_fscore_support(
        y_va, yv, average=None, labels=labels, zero_division=0
    )
    cm_v = confusion_matrix(y_va, yv, labels=labels)

    # Test
    acc_t = accuracy_score(y_te, yt)
    p_t_m, r_t_m, f1_t_m, _ = precision_recall_fscore_support(
        y_te, yt, average='macro', zero_division=0
    )
    p_t_c, r_t_c, f1_t_c, sup_t = precision_recall_fscore_support(
        y_te, yt, average=None, labels=labels, zero_division=0
    )
    cm_t = confusion_matrix(y_te, yt, labels=labels)

    return {
        'labels': labels,  # order for per-class arrays below
        'val': {
            'accuracy': acc_v,
            'precision_macro': p_v_m,
            'recall_macro': r_v_m,
            'f1_macro': f1_v_m,
            'precision_per_class': p_v_c,
            'recall_per_class': r_v_c,
            'f1_per_class': f1_v_c,
            'support_per_class': sup_v,
            'confusion_matrix': cm_v,
        },
        'test': {
            'accuracy': acc_t,
            'precision_macro': p_t_m,
            'recall_macro': r_t_m,
            'f1_macro': f1_t_m,
            'precision_per_class': p_t_c,
            'recall_per_class': r_t_c,
            'f1_per_class': f1_t_c,
            'support_per_class': sup_t,
            'confusion_matrix': cm_t,
        },
    }

results = {}


## 2. Load Data


In [None]:
# Prepare datasets
ptbdb = prepare_ptbdb(remove_outliers=False)

print("PTBDB dataset prepared:")
print(f"  Training size: {ptbdb.X_train.shape}")
print(f"  Validation size: {ptbdb.X_val.shape if ptbdb.X_val is not None else 'None'}")
print(f"  Test size: {ptbdb.X_test.shape if ptbdb.X_test is not None else 'None'}")

X_train, X_val, X_test = ptbdb.X_train.values, ptbdb.X_val.values, ptbdb.X_test.values
y_train = ptbdb.y_train.astype(int).values
y_val = ptbdb.y_val.astype(int).values
y_test = ptbdb.y_test.astype(int).values

# Scale features using train fit only
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)


## 3. Test models with Randomized Search CV


In [None]:
scoring = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}


### 3.1 Without outlier removal or sampling


#### 3.1.1 Logistic Regression


In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Check if model already exists
classifier_name = "LogisticRegression"
experiment_name = "no_sampling_ptbdb"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_logreg = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    logreg = LogisticRegression(max_iter=10000, solver='lbfgs')

    param_dist_logreg = {
        'C': loguniform(1e-3, 1e3),
        'penalty': ['l2'],
        'solver': ['lbfgs'],
    }

    rs_logreg = RandomizedSearchCV(
        estimator=logreg,
        param_distributions=param_dist_logreg,
        n_iter=20,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2
    )
    rs_logreg.fit(X_train_s, y_train)
    
    # Save the trained model
    metadata = {
        'best_params': rs_logreg.best_params_,
        'best_score': rs_logreg.best_score_,
        'cv_results': rs_logreg.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_logreg, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")


In [None]:
best_logreg = rs_logreg.best_estimator_
results['LogisticRegression'] = eval_model(
    best_logreg,
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
)


#### 3.1.2 KNN


In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Check if model already exists
classifier_name = "KNN"
experiment_name = "no_sampling_ptbdb"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_knn = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    knn = KNeighborsClassifier()
    param_dist_knn = {
        'n_neighbors': randint(1, 51),
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'manhattan', 'euclidean'],
        'p': [1, 2],
    }

    rs_knn = RandomizedSearchCV(
        estimator=knn,
        param_distributions=param_dist_knn,
        n_iter=20,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
    )
    rs_knn.fit(X_train_s, y_train)
    
    # Save the trained model
    metadata = {
        'best_params': rs_knn.best_params_,
        'best_score': rs_knn.best_score_,
        'cv_results': rs_knn.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_knn, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")


In [None]:
best_knn = rs_knn.best_estimator_
results['KNN'] = eval_model(
    best_knn,
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
)


#### 3.1.3 Random Forest


In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Check if model already exists
classifier_name = "RandomForest"
experiment_name = "no_sampling_ptbdb"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_rf = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    param_dist_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20],
        'min_samples_split': [2, 5, 10, 20, 50],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True],
        'class_weight': ['balanced', None],
        'criterion': ['gini', 'entropy'],
    }

    rs_rf = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist_rf,
        n_iter=20,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )

    rs_rf.fit(X_train, y_train)
    
    metadata = {
        'best_params': rs_rf.best_params_,
        'best_score': rs_rf.best_score_,
        'cv_results': rs_rf.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_rf, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")


In [None]:
best_rf = rs_rf.best_estimator_
results['RandomForest'] = eval_model( 
    best_rf, 
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)


#### 3.1.4 SVM


In [None]:
from sklearn.svm import SVC

# Check if model already exists
classifier_name = "SVM"
experiment_name = "no_sampling_ptbdb"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_svm = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    svm = SVC()
    param_dist_svm = {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10],
        'gamma': [0.001, 0.01, 0.1, 0.5, 1],
    }
    rs_svm = RandomizedSearchCV(
        estimator=svm,
        param_distributions=param_dist_svm,
        n_iter=15,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )
    rs_svm.fit(X_train_s, y_train)
    
    # Save the trained model
    metadata = {
        'best_params': rs_svm.best_params_,
        'best_score': rs_svm.best_score_,
        'cv_results': rs_svm.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_svm, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")


In [None]:
best_svm = rs_svm.best_estimator_
results['SVM'] = eval_model(
    best_svm,
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
)


#### 3.1.5 Decision Tree Classifier


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Check if model already exists
classifier_name = "DecisionTree"
experiment_name = "no_sampling_ptbdb"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_dt = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    dt = DecisionTreeClassifier(random_state=42)

    param_dist_dt = {
        'max_depth': [None, 5, 10, 15, 20, 25, 30],
        'min_samples_split': [2, 5, 10, 20, 50],
        'min_samples_leaf': [1, 2, 4, 8, 16],
        'max_features': ['sqrt', 'log2', None],
        'criterion': ['gini', 'entropy'],
        'class_weight': ['balanced', None],
        'splitter': ['best', 'random'],
    }

    rs_dt = RandomizedSearchCV(
        estimator=dt,
        param_distributions=param_dist_dt,
        n_iter=100,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )

    rs_dt.fit(X_train, y_train)
    
    metadata = {
        'best_params': rs_dt.best_params_,
        'best_score': rs_dt.best_score_,
        'cv_results': rs_dt.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_dt, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")


In [None]:
best_dt = rs_dt.best_estimator_
results['DecisionTree'] = eval_model(
    best_dt,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)


#### 3.1.6 XGBoost / Gradient Boosting


In [None]:
import xgboost as xgb

# Check if model already exists
classifier_name = "XGBoost"
experiment_name = "no_sampling_ptbdb"

if model_saver.model_exists(classifier_name, experiment_name):
    print(f"Model {classifier_name} already exists for experiment {experiment_name}. Loading...")
    rs_xgb = model_saver.load_model(classifier_name, experiment_name)
    print("Model loaded successfully!")
else:
    print(f"Model {classifier_name} not found. Training new model...")
    
    xgb_model = xgb.XGBClassifier(
        objective='binary:logistic',
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss'
    )

    param_dist_xgb = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.5, 1.0],
        'reg_lambda': [0, 0.1, 0.5, 1.0],
        'min_child_weight': [1, 3, 5, 7],
        'gamma': [0, 0.1, 0.2, 0.3],
    }

    rs_xgb = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_dist_xgb,
        n_iter=30,
        scoring=scoring,
        refit='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )

    rs_xgb.fit(X_train, y_train)
    
    metadata = {
        'best_params': rs_xgb.best_params_,
        'best_score': rs_xgb.best_score_,
        'cv_results': rs_xgb.cv_results_,
        'experiment': experiment_name,
        'classifier': classifier_name
    }
    model_saver.save_model(classifier_name, rs_xgb, experiment_name, metadata)
    print(f"Model {classifier_name} saved successfully!")


In [None]:
best_xgb = rs_xgb.best_estimator_
results['XGBoost'] = eval_model(
    best_xgb,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)


#### 3.1.7 Results Summary and Comparison


In [None]:
def _safe_col(label):
    # Make safe column names like "val_f1_cls_0" or "val_f1_cls_N"
    return re.sub(r'[^0-9a-zA-Z_]+', '_', str(label)).strip('_')

# Mapping of model names to their RandomizedSearchCV objects
models_and_searchers = {
    "LogisticRegression": rs_logreg,
    "KNN": rs_knn,
    "RandomForest": rs_rf,
    "SVM": rs_svm,
    "DecisionTree": rs_dt,
    "XGBoost": rs_xgb
}

rows = []
for name, res in results.items():
    row = {
        'model': name,
        'val_accuracy': res['val']['accuracy'],
        'val_f1_macro': res['val']['f1_macro'],
        'test_accuracy': res['test']['accuracy'],
        'test_f1_macro': res['test']['f1_macro'],
    }

    # Add best parameters from RandomizedSearchCV
    if name in models_and_searchers:
        searcher = models_and_searchers[name]
        best_params = searcher.best_params_
        best_cv_score = searcher.best_score_
        row['best_cv_score'] = best_cv_score
        row['best_parameters'] = str(best_params)
    else:
        row['best_cv_score'] = None
        row['best_parameters'] = None

    labels = res['labels']
    f1_v = res['val']['f1_per_class']
    f1_t = res['test']['f1_per_class']

    # Add per-class F1 columns
    for lbl, f1 in zip(labels, f1_v):
        row[f'val_f1_cls_{_safe_col(lbl)}'] = f1
    for lbl, f1 in zip(labels, f1_t):
        row[f'test_f1_cls_{_safe_col(lbl)}'] = f1

    rows.append(row)

comparison_df = (
    pd.DataFrame(rows)
      .sort_values(by=['val_f1_macro','test_f1_macro'], ascending=False)
      .reset_index(drop=True)
)

best_model_name = comparison_df.iloc[0]['model']
best_model_results = results[best_model_name]

comparison_df_display = comparison_df.copy()
comparison_df_display['best_parameters'] = comparison_df_display['best_parameters'].apply(
    lambda x: json.dumps(x, indent=2) if isinstance(x, dict) else x
)
comparison_df_display.to_csv("src/data/03_model_testing_results/model_comparison_ptbdb_without_resampling.csv", index=False)

# Display the comparison table with best parameters
print("=" * 100)
print("MODEL COMPARISON WITH BEST PARAMETERS FROM RANDOMIZEDSEARCHCV - PTBDB")
print("=" * 100)
display(comparison_df_display)


# Model Testing - PTBDB Dataset

- **Binary Classification**: 2 classes (Normal vs Abnormal)

Questions to be answered:

- Remove outliers?
- Which Sampling method to use?


In [4]:
import sys
sys.path.append('..')

from src.utils.preprocessing import (
    prepare_mitbih, 
    prepare_ptbdb,
    resample_training
)
from src.visualization import plot_confusion_matrix
from src.models.exploration_phase import create_model_saver
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint, uniform
import numpy as np
import re
import json
import matplotlib.pyplot as plt

# Initialize model saver
model_saver = create_model_saver()


In [5]:
def eval_model(model, X_tr, y_tr, X_va, y_va, X_te, y_te):
    model.fit(X_tr, y_tr)
    yv = model.predict(X_va)
    yt = model.predict(X_te)

    # Choose a consistent label order (dynamic)
    labels = np.unique(np.concatenate([y_tr, y_va, y_te]))

    # Validation
    acc_v = accuracy_score(y_va, yv)
    p_v_m, r_v_m, f1_v_m, _ = precision_recall_fscore_support(
        y_va, yv, average='macro', zero_division=0
    )
    p_v_c, r_v_c, f1_v_c, sup_v = precision_recall_fscore_support(
        y_va, yv, average=None, labels=labels, zero_division=0
    )
    cm_v = confusion_matrix(y_va, yv, labels=labels)

    # Test
    acc_t = accuracy_score(y_te, yt)
    p_t_m, r_t_m, f1_t_m, _ = precision_recall_fscore_support(
        y_te, yt, average='macro', zero_division=0
    )
    p_t_c, r_t_c, f1_t_c, sup_t = precision_recall_fscore_support(
        y_te, yt, average=None, labels=labels, zero_division=0
    )
    cm_t = confusion_matrix(y_te, yt, labels=labels)

    return {
        'labels': labels,  # order for per-class arrays below
        'val': {
            'accuracy': acc_v,
            'precision_macro': p_v_m,
            'recall_macro': r_v_m,
            'f1_macro': f1_v_m,
            'precision_per_class': p_v_c,
            'recall_per_class': r_v_c,
            'f1_per_class': f1_v_c,
            'support_per_class': sup_v,
            'confusion_matrix': cm_v,
        },
        'test': {
            'accuracy': acc_t,
            'precision_macro': p_t_m,
            'recall_macro': r_t_m,
            'f1_macro': f1_t_m,
            'precision_per_class': p_t_c,
            'recall_per_class': r_t_c,
            'f1_per_class': f1_t_c,
            'support_per_class': sup_t,
            'confusion_matrix': cm_t,
        },
    }

results = {}


In [6]:
# Prepare datasets
ptbdb = prepare_ptbdb(remove_outliers=False)

print("MITBIH dataset prepared:")
print(f"  Training size: {mitbih.X_train.shape}")
print(f"  Validation size: {mitbih.X_val.shape if mitbih.X_val is not None else 'None'}")
print(f"  Test size: {mitbih.X_test.shape if mitbih.X_test is not None else 'None'}")

print("\nPTBDB dataset prepared:")
print(f"  Training size: {ptbdb.X_train.shape}")
print(f"  Validation size: {ptbdb.X_val.shape if ptbdb.X_val is not None else 'None'}")
print(f"  Test size: {ptbdb.X_test.shape if ptbdb.X_test is not None else 'None'}")

# Check class distribution in PTBDB
print("\nPTBDB Class Distribution:")
print(f"  Training classes: {np.unique(ptbdb.y_train, return_counts=True)}")
print(f"  Validation classes: {np.unique(ptbdb.y_val, return_counts=True)}")
print(f"  Test classes: {np.unique(ptbdb.y_test, return_counts=True)}")


MITBIH dataset prepared:


NameError: name 'mitbih' is not defined

In [7]:
scoring = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}


#### 3.1.6 XGBoost / Gradient Boosting
