# A 02 I: MIT Hyperparameter tuning for baseline models using RandomizedSearchCV without sampling

Establish baseline models via randomized search without any feature engineering or resampling, mainly to verify the preprocessing and to provide a reference for comparison with the DL models.

## Content

A) MIT-BIH Arrhythmia Dataset

1. Train/test split: 80% / 20%, as defined at the beginning of the project to ensure reproducible results; no duplicates or missing values are present.
2. Hyperparameter tuning using RandomizedSearchCV with cross-validation for the baseline models defined below, without any oversampling techniques.
 


## 1. Imports

In [None]:
import os 
from typing import Dict, Optional
import random 

from src.utils import eval_model, evaluate_model
from src.visualization import save_cv_diagnostics, save_overfit_diagnostic, save_model_diagnostics, save_roc_curve
from src.utils.model_saver import create_model_saver

# external 
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint, uniform
import numpy as np
import re
import json

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# Samplers

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from src.utils.preprocessing import (
    _normalize_sampling_method_name,
    _SAMPLING_REGISTRY
)


In [None]:
# Saver used later to persist (and detect already-persisted) tuned models.
model_saver = create_model_saver("src/models/MIT_02_01_baseline_models_randomized_search_no_sampling")

# Load the MIT-BIH train/test CSV files; they are read without a header row,
# so columns are addressed by integer position.
df_mitbih_train = pd.read_csv('data/original/mitbih_train.csv', header=None)
df_mitbih_test = pd.read_csv('data/original/mitbih_test.csv', header=None)

# Column 187 is used as the target; every other column is a feature.
X_train, y_train = df_mitbih_train.drop(columns=187), df_mitbih_train[187]
X_test, y_test = df_mitbih_test.drop(columns=187), df_mitbih_test[187]

# Report the shapes of both splits.
print("MITBIH dataset")
for split_label, features, target in (
    ("Training", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"\t{split_label} size: {features.shape}, {target.shape}")

In [None]:
# Debug switch: set to True to run the rest of the notebook on a small
# stratified subsample of both splits (much faster, but results are NOT
# comparable to a full run). Disabled by default.
USE_SUBSAMPLE = False

# Fraction of each split that is kept when USE_SUBSAMPLE is enabled.
SUBSAMPLE_FRACTION = 0.05

if USE_SUBSAMPLE:
    from sklearn.model_selection import train_test_split

    # Stratified subsample of the training set (class proportions preserved).
    X_train_small, _, y_train_small, _ = train_test_split(
        X_train, y_train,
        train_size=SUBSAMPLE_FRACTION,
        stratify=y_train,
        random_state=42
    )

    # Stratified subsample of the test set, using the same fraction.
    X_test_small, _, y_test_small, _ = train_test_split(
        X_test, y_test,
        train_size=SUBSAMPLE_FRACTION,
        stratify=y_test,
        random_state=42
    )

    print("Reduced MIT-BIH dataset")
    print(f"\tTraining size: {X_train_small.shape}, {y_train_small.shape}")
    print(f"\tTest size: {X_test_small.shape}, {y_test_small.shape}")

    # Rebind the working variables so all later cells use the reduced data.
    X_train, y_train = X_train_small, y_train_small
    X_test,  y_test  = X_test_small,  y_test_small

## 2. Constants & Param Spaces

In [None]:
# Single seed used for the global RNGs, the CV splitters and every estimator
# that accepts a random_state.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Multi-metric scoring passed to RandomizedSearchCV. The keys become part of
# the cv_results_ column names (e.g. "mean_test_f1_macro", "mean_test_bal_acc").
SCORING = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}

# Search configuration per baseline model:
#   "estimator": unfitted estimator instance to tune
#   "params":    parameter space (a dict, or a list of dicts for conditional spaces)
#   "cv":        cross-validation splitter
#   "n_iter":    number of sampled parameter settings
PARAM_SPACES = {
    "LogisticRegression": {
        "estimator": LogisticRegression(max_iter=10000, solver='lbfgs', n_jobs=-1),
        "params": {
            "C": loguniform(1e-3, 1e3),      # Big C = less penalty on large weights (more freedom, risk of overfitting).
                                             # Small C = more penalty (more discipline, less overfitting).
                                             # loguniform = values are spread evenly across orders of magnitude
                                             # (here 0.001 up to 1000), not in small linear steps.
            "penalty": ["l2"], # gently pushes weights toward zero, which keeps the model simpler and more stable.
            "solver": ["lbfgs"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
        # NOTE(review): this key is not read anywhere in the visible code - confirm whether it is still needed.
        "create_new_model": False,
    },
    "KNN": {
        "estimator": KNeighborsClassifier(n_jobs=-1),
        "params": {
            "n_neighbors": randint(1, 51),   # integers 1..50 (upper bound is exclusive)
            "weights": ["uniform", "distance"],
            "metric": ["minkowski", "manhattan", "euclidean"],
            "p": [1, 2],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    "RandomForest": {
        "estimator": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [10, 15, 20],
            "min_samples_split": [2, 5, 10, 20, 50],
            "min_samples_leaf": [1, 2, 4, 8],
            "max_features": ["sqrt", "log2", None],
            "bootstrap": [True],
            "class_weight": ["balanced", None],
            "criterion": ["gini", "entropy"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    "SVM": {
        # probability=True fits an internal cross-validated calibration that
        # shuffles the data; seeding it keeps the probability estimates
        # reproducible, in line with the other seeded estimators.
        "estimator": SVC(probability=True, random_state=RANDOM_STATE),
        "params": {
            "kernel": ["rbf", "poly"],
            "C": [0.1, 1, 10],
            "gamma": [0.001, 0.01, 0.1, 0.5, 0.9],
        },
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    "DecisionTree": {
        "estimator": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "params": {
            "max_depth": [None, 5, 10, 15, 20, 25, 30],
            "min_samples_split": [2, 5, 10, 20, 50],
            "min_samples_leaf": [1, 2, 4, 8, 16],
            "max_features": ["sqrt", "log2", None],
            "criterion": ["gini", "entropy"],
            "class_weight": ["balanced", None],
            "splitter": ["best", "random"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 100,
    },
    "XGBoost": {
        "estimator": xgb.XGBClassifier(
            objective="multi:softprob",
            num_class=5,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            eval_metric="mlogloss",
        ),
        "params": {
            "n_estimators": [100, 200, 300, 500],
            "max_depth": [3, 4, 5, 6, 7, 8],
            "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
            "subsample": [0.8, 0.9, 1.0],
            "colsample_bytree": [0.8, 0.9, 1.0],
            "reg_alpha": [0, 0.1, 0.5, 1.0],
            "reg_lambda": [0, 0.1, 0.5, 1.0],
            "min_child_weight": [1, 3, 5, 7],
            "gamma": [0, 0.1, 0.2, 0.3],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 40,
    },
    "LDA": {
        "estimator": LinearDiscriminantAnalysis(),
        # List of dicts: a separate sub-space per solver family, so that
        # "shrinkage" is only sampled together with the "lsqr"/"eigen" solvers.
        "params": [
            {"solver": ["svd"], "store_covariance": [False, True], "tol": [1e-4, 1e-3, 1e-2]},
            {"solver": ["lsqr", "eigen"], "shrinkage": [None, "auto", 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], "tol": [1e-4, 1e-3, 1e-2]},
        ],
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    "ANN": {
        "estimator": MLPClassifier(
            max_iter=300,
            early_stopping=True,
            random_state=RANDOM_STATE,
            n_iter_no_change=10,
            solver="adam",
        ),
        "params": {
            "hidden_layer_sizes": [(64,), (128,), (128, 64)],
            "activation": ["relu"],
            "alpha": loguniform(1e-4, 1e-2),
            "learning_rate_init": loguniform(1e-3, 1e-2),
            "batch_size": randint(64, 129),      # integers 64..128 (upper bound is exclusive)
            # scipy's uniform(loc, scale) samples from [loc, loc + scale]:
            "beta_1": uniform(0.9, 0.09),        # -> [0.90, 0.99]
            "beta_2": uniform(0.95, 0.049),      # -> [0.95, 0.999]
            "validation_fraction": [0.1, 0.15],
        },
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 100,
    },
}

## 3. Methods

In [None]:
def create_leak_free_pipeline(
    estimator,
    sampling_method: Optional[str] = "none",
    sampler_kwargs: Optional[Dict] = None,
    random_state: Optional[int] = 42,
) -> Pipeline:
    """
    Build a leak-free pipeline consisting of an optional sampler and the classifier.

    Using imblearn.Pipeline ensures fit/transform of SAMPLER happen within each
    CV fold on TRAIN only.

    Args:
        estimator: Classifier placed in the final pipeline step, named "classifier".
        sampling_method: Name of the sampling method. It is normalized with
            `_normalize_sampling_method_name` and then looked up in
            `_SAMPLING_REGISTRY`.
        sampler_kwargs: Extra keyword arguments for the sampler constructor.
            The caller's dict is copied and never mutated.
        random_state: Seed handed to the sampler as `random_state` unless
            `sampler_kwargs` already contains one. Pass None to leave it unset.

    Returns:
        The assembled imblearn Pipeline: a "sampler" step (when the registry
        provides a sampler class) followed by the "classifier" step.

    Raises:
        KeyError: If the normalized sampling method is not in the registry.
    """
    # Work on a copy so the caller's dict is not modified.
    sampler_kwargs = dict(sampler_kwargs or {})

    # Provide a default random_state to samplers if not overridden
    if random_state is not None:
        sampler_kwargs.setdefault("random_state", random_state)

    internal_name = _normalize_sampling_method_name(sampling_method)
    SamplerClass = _SAMPLING_REGISTRY[internal_name]

    steps = []
    # NOTE(review): the registry is defined outside this file. If its
    # "no sampling" entry maps to None (the default sampling_method here is
    # "none"), calling it would raise TypeError - skip the sampler step
    # instead. Confirm against src.utils.preprocessing.
    if SamplerClass is not None:
        steps.append(("sampler", SamplerClass(**sampler_kwargs)))

    steps.append(("classifier", estimator))
    # Show the assembled steps in the notebook output.
    display(steps)
    return Pipeline(steps)

In [None]:
def _prefix_param_space(params, prefix):
    """Return *params* with *prefix* prepended to every parameter name.

    Handles both forms accepted by RandomizedSearchCV: a single dict, or a
    list of dicts (conditional sub-spaces).
    """
    if isinstance(params, dict):
        return {f"{prefix}{name}": values for name, values in params.items()}
    return [_prefix_param_space(space, prefix) for space in params]


def run_randomized_search(
    model_name,
    estimator,
    params,
    X_train,
    y_train,
    X_test,
    y_test,
    cv,
    results_path,
    sampling_method="No_Sampling",
    sampler_kwargs=None,
    remove_outliers=False,
    model_saver=None,
    scoring=None,
    n_iter=20,
    refit_metric="f1_macro",
    random_state=42,
):
    """Tune one model with RandomizedSearchCV, evaluate it and persist the results.

    Steps: skip if the model was already saved, optionally wrap the estimator
    in a sampler pipeline, run the search, evaluate the refitted best
    estimator on train and test data, append a one-row summary to
    `results_path`, write the full CV results and diagnostics, and finally
    save the fitted search through `model_saver`.

    Args:
        model_name: Label used in log output, result rows and file names.
        estimator: Unfitted estimator to tune.
        params: Parameter space; a dict or a list of dicts.
        X_train, y_train: Data the search is fitted on.
        X_test, y_test: Hold-out data used only for the final evaluation.
        cv: Cross-validation splitter (or anything RandomizedSearchCV accepts).
        results_path: CSV file the summary row is appended to. The per-model
            CV results file is derived from this path.
        sampling_method: "No_Sampling" tunes the bare estimator; any other
            value wraps it via `create_leak_free_pipeline`.
        sampler_kwargs: Keyword arguments forwarded to the sampler.
        remove_outliers: Only used for labelling (experiment name, summary
            row, log output); this function does not remove outliers itself.
        model_saver: Optional saver providing `model_exists`, `load_metadata`
            and `save_model`. When None, nothing is skipped or saved.
        scoring: Multi-metric scoring dict. It must contain the keys
            "f1_macro" and "bal_acc", because the summary reads the
            corresponding cv_results_ columns. Defaults to exactly those two.
        n_iter: Number of sampled parameter settings.
        refit_metric: Scoring key used to select and refit the best model.
        random_state: Seed for the parameter sampling (and for the sampler,
            when a sampling method is used), so that a run is reproducible
            regardless of the global RNG state or cell execution order.

    Returns:
        The summary dict written to `results_path`, or None when the model
        was skipped because it already exists.
    """
    print(f"\n{'='*80}")
    print(f"Running RandomizedSearchCV for {model_name} ({sampling_method})")
    print(f"Outlier removal: {remove_outliers}")
    print(f"{'='*80}")

    # Identifies this configuration in saved models and output file names.
    experiment_name = f"{sampling_method.lower()}_outliers_{remove_outliers}"

    # --- SKIP if model already exists ---
    if model_saver and model_saver.model_exists(model_name, experiment_name):
        print(f"  Skipping {model_name} ({experiment_name}) - model already saved.")
        # Printing the stored metadata is best-effort: a failure here must
        # not abort the surrounding loop over models.
        try:
            meta = model_saver.load_metadata(model_name, experiment_name)
            if meta:
                print(f"    Existing model best_score={meta.get('best_score'):.4f}, "
                      f"params={meta.get('best_params')}")
        except Exception as e:
            print(f"  (Could not load metadata: {e})")
        return None
    # ---------------------------------------------------------------

    # The summary below reads the "f1_macro" and "bal_acc" result columns, so
    # fall back to a scoring dict that provides them instead of failing with
    # a KeyError only after the (expensive) search has finished.
    if scoring is None:
        scoring = {"f1_macro": "f1_macro", "bal_acc": "balanced_accuracy"}

    # Create leak-free pipeline - only applies for sampling methods
    if sampling_method != "No_Sampling":
        estimator = create_leak_free_pipeline(
            estimator,
            sampling_method,
            sampler_kwargs=sampler_kwargs,
            random_state=random_state,
        )
        # Inside the pipeline the estimator lives in the "classifier" step,
        # so its parameters must be addressed as "classifier__<name>".
        params = _prefix_param_space(params, "classifier__")

    # Run the search
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=params,
        scoring=scoring,
        refit=refit_metric,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=2,
        return_train_score=True,
        random_state=random_state,
    )
    search.fit(X_train, y_train)

    # Evaluate best model
    eval_results = evaluate_model(search.best_estimator_, X_train, y_train, X_test, y_test)
    train_f1_macro = eval_results["train"]["f1_macro"]
    test_f1_macro = eval_results["test"]["f1_macro"]
    test_roc_auc = eval_results["test"]["roc_auc"]

    # Summary table (1 row per model)
    summary = {
        "model": model_name,
        "sampling_method": sampling_method,
        "remove_outliers": remove_outliers,
        "best_cv_score": round(search.best_score_, 4), # Best mean validation score from CV (based on refit_metric), higher better!
        "best_params": json.dumps(search.best_params_),
        "train_f1_macro": round(train_f1_macro, 4), # Macro-F1 on training - how well model fits seen data across all classes
        "test_f1_macro": round(test_f1_macro, 4), # Macro-F1 on test data - balanced generalization to all classes?
        "test_accuracy": round(eval_results["test"]["accuracy"], 4), # overall proportion of correct predictions
        "train_test_diff": round(train_f1_macro - test_f1_macro, 4), # Gap between train and test: Over/Underfitting indicator: smaller better!
        "roc_auc": round(test_roc_auc, 4) if not np.isnan(test_roc_auc) else None, # ROC-AUC on test data: Class separation: closer to 1 better separation!
    }

    # Log cross-fold metrics for best model
    cv_df = pd.DataFrame(search.cv_results_)
    best_idx = search.best_index_

    def best_value(column):
        """Return the cv_results_ entry of the best candidate for *column*."""
        return cv_df[column][best_idx]

    summary["cv_mean_train_f1_macro"] = round(best_value("mean_train_f1_macro"), 4) # High: model fits training folds well, too high vs validation: possible overfitting
    summary["cv_std_train_f1_macro"] = round(best_value("std_train_f1_macro"), 4) # should be low: stable learning across folds
    summary["cv_mean_val_f1_macro"] = round(best_value("mean_test_f1_macro"), 4) # balanced per class performance
    summary["cv_std_val_f1_macro"] = round(best_value("std_test_f1_macro"), 4) # should be low
    summary["cv_diff_train_val_f1_macro"] = round(best_value("mean_train_f1_macro") - best_value("mean_test_f1_macro"), 4)
    summary["cv_mean_val_bal_acc"] = round(best_value("mean_test_bal_acc"), 4) # Higher better: accounts for class imbalance by averaging recall per class
    summary["cv_std_val_bal_acc"] = round(best_value("std_test_bal_acc"), 4) # should be low
    summary["mean_fit_time"] = round(best_value("mean_fit_time"), 4)
    summary["std_fit_time"] = round(best_value("std_fit_time"), 4)

    # Per-class F1 scores, first for the test split, then for the train split.
    for lbl, f1_val in zip(eval_results["labels"], eval_results["test"]["f1_per_class"]):
        summary[f"test_f1_class_{lbl}"] = round(float(f1_val), 4)

    for lbl, f1_val in zip(eval_results["labels"], eval_results["train"]["f1_per_class"]):
        summary[f"train_f1_class_{lbl}"] = round(float(f1_val), 4)

    # dirname is "" for a bare file name, and os.makedirs("") would raise.
    results_dir = os.path.dirname(results_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    # Append to the shared summary file; write the header only on first creation.
    pd.DataFrame([summary]).to_csv(results_path, mode="a", header=not os.path.exists(results_path), index=False)

    # Save full CV results for analysis. Splitting off the extension (rather
    # than str.replace on ".csv") guarantees a path distinct from results_path.
    results_root, _ = os.path.splitext(results_path)
    cv_full_path = results_root + '_' + model_name + '_' + experiment_name + "_cv_results.csv"
    cv_df.to_csv(cv_full_path, index=False)

    # Generate diagnostics / graphics
    save_overfit_diagnostic(cv_df, model_name, sampling_method, results_path)
    save_cv_diagnostics(cv_df, model_name, sampling_method, results_path)
    save_model_diagnostics(eval_results, model_name, sampling_method, results_path)
    save_roc_curve(search.best_estimator_, X_test, y_test, model_name, sampling_method, results_path)

    print(f"Saved unified results to {results_path}")

    if model_saver:
        meta = {
            "best_params": search.best_params_,
            "best_score": search.best_score_,
            "cv_results": search.cv_results_,
            "experiment": experiment_name,
            "classifier": model_name,
        }
        model_saver.save_model(model_name, search, experiment_name, meta)
        # Only report a saved model when one was actually saved.
        print(f"Saved model {model_name} ({experiment_name})!")

    return summary

## 4. Test models with Randomized Search CV

#### 4.1.1 Run the randomized search CV without sampling

In [None]:
# CSV file that collects one summary row per tuned model.
results_path = "reports/03_baseline_models/MIT_02_01_RANDOMIZED_SEARCH/A_02_01.csv"

# Arguments that are identical for every model in this experiment.
shared_search_kwargs = dict(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    results_path=results_path,
    sampling_method="No_Sampling",
    remove_outliers=False,
    model_saver=model_saver,
    scoring=SCORING,
    refit_metric="f1_macro",
)

# Tune each configured baseline model in turn; the estimator, parameter
# space, CV splitter and iteration budget come from its PARAM_SPACES entry.
for model_name, param_dict in PARAM_SPACES.items():
    run_randomized_search(
        model_name,
        estimator=param_dict["estimator"],
        params=param_dict["params"],
        cv=param_dict["cv"],
        n_iter=param_dict["n_iter"],
        **shared_search_kwargs,
    )