# 4 Find best "simple" model for given dataset

This part provides a pipeline for heartbeat classification based on the requirements from `notebooks/03_model_testing_example_mit.ipynb`.

Steps:
1. Use the train/validation datasets that were created beforehand and shared within the group
2. GridSearchCV with optimized parameter spaces, based on the previous notebook
3. Target models: XGBoost, ANN, SVM
4. GridSearch with and without outlier removal
5. RepeatedStratifiedKFold cross-validation
6. Leak-free scaling using Pipeline


## 1. Imports

In [None]:

import sys
import os
from pathlib import Path
import json
import warnings
from typing import Dict, List, Optional, Tuple, Union

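# Make the project root (two levels up) importable and switch the working
# directory to it. NOTE: both paths are relative, so re-running this cell
# moves two more levels up.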
sys.path.append('../..')
os.chdir('../..')

import pandas as pd
import numpy as np

# ML libraries
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, confusion_matrix
)

# Models
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# Sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline

# Custom utilities
from src.utils.preprocessing import (
    load_processed_dataset,
    DatasetSplit,
    build_full_suffix as pp_build_full_suffix,
    generate_all_processed_datasets,
)
from src.utils.evaluation import eval_model
from src.utils.model_saver import create_model_saver

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

## 2. Constants & Param Spaces

In [None]:
# Seed shared by the estimators and the CV splitters below.
RANDOM_STATE = 42
# Metrics computed for every grid point; run_grid_search refits on 'f1_macro'.
SCORING = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}
# CSV file to which run_grid_search appends one row per newly trained model.
results_csv = "reports/03_model_testing_results/model_comparison_best_models.csv"

# Per-model search configuration, keyed by model name:
#   "estimator"     - unfitted base estimator
#   "params"        - parameter grid with bare parameter names (run_grid_search
#                     prefixes them with 'classifier__' for the pipeline)
#   "cv"            - cross-validation splitter
#   "needs_scaling" - whether a StandardScaler step is put in front of the model
PARAM_SPACES = {
    "XGBoost": {
        "estimator": xgb.XGBClassifier(
            objective="multi:softmax",
            num_class=5,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            eval_metric="mlogloss",
        ),
        "params": {
            "n_estimators": [150, 200, 250],
            "max_depth": [8, 9],
            "learning_rate": [0.2],
            "subsample": [0.7, 0.8],
            "colsample_bytree": [0.9],
            "reg_alpha": [0.1, 0.2],
            "reg_lambda": [0.0, 0.05],
            "min_child_weight": [5],
            "gamma": [0.0, 0.05],
        },
        # Single repetition here (the other models use n_repeats=3).
        "cv": RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=RANDOM_STATE),
        "needs_scaling": False,
    },
    "ANN": {
        "estimator": MLPClassifier(
            max_iter=300,
            early_stopping=True,
            random_state=RANDOM_STATE,
            n_iter_no_change=10,
            solver="adam",
        ),
        "params": {
            "hidden_layer_sizes": [(128, 64)],
            "activation": ["relu"],
            "alpha": [3e-4],
            "learning_rate_init": [0.001, 0.0015],
            "batch_size": [96, 128],
            "beta_1": [0.9, 0.91],
            "beta_2": [0.97, 0.974],
            "validation_fraction": [0.1],
        },
        "cv": RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_STATE),
        "needs_scaling": True,
    },
    # best: {'clf__kernel': 'rbf', 'clf__gamma': 0.5, 'clf__C': 10}
    "SVM": {
        "estimator": SVC(),
        "params": {
            "kernel": ["rbf"],
            "C": [10],
            "gamma": [0.4, 0.5, 0.6],
        },
        "cv": RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_STATE),
        "needs_scaling": True,
    },
}

# Default directory from which prepare_dataset_with_sampling loads datasets.
DATA_DIR = "data/processed/mitbih"


## 3. Methods used

In [None]:
def create_leak_free_pipeline(model_name: str, estimator, needs_scaling: bool = True) -> Pipeline:
    """Wrap ``estimator`` in a Pipeline, optionally preceded by a StandardScaler.

    Because the scaler is a pipeline step, it is fitted together with the
    classifier on whatever data the pipeline is fitted on (e.g. each CV
    training fold) rather than on the full dataset up front.

    Args:
        model_name: Name of the model; currently not used by this function.
        estimator: Estimator placed in the final ``'classifier'`` step.
        needs_scaling: When True, a ``'scaler'`` step is added before the
            classifier.

    Returns:
        The assembled (unfitted) pipeline.
    """
    steps = [('classifier', estimator)]
    if needs_scaling:
        # Scaling must come first so the classifier sees standardized features.
        steps.insert(0, ('scaler', StandardScaler()))
    return Pipeline(steps)


def prepare_dataset_with_sampling(
    data_dir: str = DATA_DIR,
    sampling_method: str = "No_Sampling",
    remove_outliers: bool = False
) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray, Optional[np.ndarray]]:
    """Load the processed dataset matching a sampling/outlier configuration.

    ``generate_all_processed_datasets`` is invoked first with
    ``only_once=True`` (presumably a no-op when the datasets already exist;
    verify against the preprocessing utilities). The split is then loaded
    via the suffix built from ``sampling_method`` and ``remove_outliers``.

    Args:
        data_dir: Directory holding the processed datasets.
        sampling_method: Sampling variant to load.
        remove_outliers: Whether to load the outlier-removed variant.

    Returns:
        ``(X_train, X_val, y_train, y_val)``; the validation entries are
        None when the loaded split has no validation part.
    """
    generate_all_processed_datasets(data_dir=data_dir, only_once=True)

    split = load_processed_dataset(
        data_dir=data_dir,
        sampling_suffix=pp_build_full_suffix(sampling_method, remove_outliers),
    )

    def _values_or_none(frame):
        # Validation parts are optional; pass a missing part through as None.
        return None if frame is None else frame.values

    train_features = split.X_train.values
    train_targets = split.y_train.values
    val_features = _values_or_none(split.X_val)
    val_targets = _values_or_none(split.y_val)

    return train_features, val_features, train_targets, val_targets


def _append_result_row(row: Dict) -> None:
    """Append one row to ``results_csv``, writing the header only for a new file."""
    os.makedirs(os.path.dirname(results_csv), exist_ok=True)
    header = not os.path.exists(results_csv)
    pd.DataFrame([row]).to_csv(results_csv, mode='a', index=False, header=header)


def _build_csv_row(model_name, sampling_method, remove_outliers, grid_search,
                   accuracy=None, f1_macro=None) -> Dict:
    """Build the CSV row for one experiment; validation scores stay None if absent."""
    return {
        'model': model_name,
        'sampling_method': sampling_method,
        'remove_outliers': remove_outliers,
        'val_accuracy': None if accuracy is None else round(float(accuracy), 4),
        'val_f1_macro': None if f1_macro is None else round(float(f1_macro), 4),
        'best_cv_score': round(float(grid_search.best_score_), 4),
        'best_parameters': json.dumps(grid_search.best_params_),
    }


def _validation_metrics(best_model, X_val, y_val, labels) -> Dict:
    """Predict on the validation set and compute macro and per-class metrics."""
    y_pred = best_model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_val, y_pred, average='macro', zero_division=0
    )
    # Passing ``labels`` keeps the per-class arrays and the confusion matrix
    # aligned even if a class is missing from the predictions.
    precision_per_class, recall_per_class, f1_per_class, support_per_class = (
        precision_recall_fscore_support(
            y_val, y_pred, average=None, labels=labels, zero_division=0
        )
    )

    return {
        'validation_accuracy': accuracy,
        'validation_f1_macro': f1_macro,
        'validation_precision_macro': precision_macro,
        'validation_recall_macro': recall_macro,
        'validation_f1_per_class': f1_per_class,
        'validation_precision_per_class': precision_per_class,
        'validation_recall_per_class': recall_per_class,
        'validation_support_per_class': support_per_class,
        'confusion_matrix': confusion_matrix(y_val, y_pred, labels=labels),
        'labels': labels,
    }


def run_grid_search(
    model_name: str,
    sampling_method: str = "No_Sampling",
    remove_outliers: bool = False,
    model_saver=None,
    results_dir: str = "reports/comprehensive_model_testing"
) -> Optional[Dict]:
    """
    Run GridSearchCV for a specific model and sampling method.

    The search uses the estimator, grid and CV splitter registered under
    ``model_name`` in ``PARAM_SPACES``, scores with ``SCORING`` and refits on
    ``'f1_macro'``. A newly trained model is optionally saved, evaluated on
    the validation set (when one is available) and appended as one row to
    ``results_csv``.

    Args:
        model_name: Key into ``PARAM_SPACES``.
        sampling_method: Sampling method of the dataset to load.
        remove_outliers: Whether to load the outlier-removed dataset.
        model_saver: Optional saver; when it reports that a model already
            exists for this experiment, nothing is trained or written.
        results_dir: Currently unused; kept for interface compatibility.

    Returns:
        None when the model already exists and training was skipped.
        Otherwise a dictionary with the CV results, extended by validation
        metrics when a validation set is available.

    Raises:
        KeyError: If ``model_name`` is not a key of ``PARAM_SPACES``.
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"Running GridSearchCV for {model_name} with {sampling_method}")
    print(f"Outlier removal: {remove_outliers}")
    print(f"{banner}")

    # Get model configuration
    model_config = PARAM_SPACES[model_name]
    estimator = model_config["estimator"]
    params = model_config["params"]
    cv = model_config["cv"]
    needs_scaling = model_config["needs_scaling"]

    experiment_name = f"{sampling_method.lower()}_outliers_{remove_outliers}"

    # Skip finished experiments before loading any data.
    if model_saver and model_saver.model_exists(model_name, experiment_name):
        print(f"Model {model_name} already exists for experiment {experiment_name}. Skipping training and CSV append.")
        return None

    # NOTE(review): for an oversampling ``sampling_method`` the training data
    # appears to be resampled before cross-validation, so CV folds may share
    # synthetic neighbours and CV scores may be optimistic - confirm against
    # the preprocessing utilities.
    X_train, X_val, y_train, y_val = prepare_dataset_with_sampling(
        sampling_method=sampling_method,
        remove_outliers=remove_outliers
    )

    # Scaling (if any) lives inside the pipeline so it is fitted per CV fold.
    pipeline = create_leak_free_pipeline(model_name, estimator, needs_scaling)
    pipeline_params = {
        f'classifier__{param_name}': param_values
        for param_name, param_values in params.items()
    }

    print(f"Training new model for {model_name}...")
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=pipeline_params,
        scoring=SCORING,
        refit='f1_macro',
        cv=cv,
        n_jobs=-1,
        verbose=3
    )
    grid_search.fit(X_train, y_train)

    if model_saver:
        metadata = {
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'cv_results': grid_search.cv_results_,
            'experiment': experiment_name,
            'classifier': model_name,
            'sampling_method': sampling_method,
            'remove_outliers': remove_outliers,
        }
        model_saver.save_model(model_name, grid_search, experiment_name, metadata)
        print(f"Model {model_name} saved successfully!")

    summary = {
        'model_name': model_name,
        'sampling_method': sampling_method,
        'remove_outliers': remove_outliers,
        'best_cv_score': grid_search.best_score_,
        'best_params': grid_search.best_params_,
    }

    if X_val is None or y_val is None:
        print("No validation set available for evaluation")
        # Only the CV information is available for the CSV row.
        _append_result_row(
            _build_csv_row(model_name, sampling_method, remove_outliers, grid_search)
        )
        return summary

    print(f"Evaluating {model_name} on validation set...")
    # With refit='f1_macro', GridSearchCV has already refitted best_estimator_
    # on the full training data, so it is used as-is. Fitting it again would
    # only repeat the training and modify the already-saved search object.
    best_model = grid_search.best_estimator_

    labels = np.unique(np.concatenate([y_train, y_val]))
    metrics = _validation_metrics(best_model, X_val, y_val, labels)
    accuracy = metrics['validation_accuracy']
    f1_macro = metrics['validation_f1_macro']

    print(f"Validation F1-Macro: {f1_macro:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    # Append to canonical results CSV (only for newly trained models)
    _append_result_row(
        _build_csv_row(
            model_name, sampling_method, remove_outliers, grid_search,
            accuracy=accuracy, f1_macro=f1_macro,
        )
    )

    return {**summary, **metrics}



## 4. Run

In [4]:
print("Starting Model Testing")
print("="*80)

# Saver passed to run_grid_search; it stores fitted searches and lets
# already-finished experiments be skipped.
model_saver = create_model_saver("src/models/best_simple_models_testing")

# Models included in this run. "SVM" has a parameter space defined but is
# currently left out of the experiment list.
active_models = ["XGBoost", "ANN"]

# (sampling_method, remove_outliers) settings in execution order:
# plain data, outliers removed, SMOTE, SMOTE with outliers removed.
data_settings = [
    ("No_Sampling", False),
    ("No_Sampling", True),
    ("SMOTE", False),
    ("SMOTE", True),
]

# Every active model is run on each data setting before moving to the next.
experiments = [
    (model, sampling, drop_outliers)
    for sampling, drop_outliers in data_settings
    for model in active_models
]

# Collect the result dictionaries of newly trained models only
# (run_grid_search returns None for experiments it skipped).
all_results = []

for model_name, sampling_method, remove_outliers in experiments:
    try:
        result = run_grid_search(
            model_name=model_name,
            sampling_method=sampling_method,
            remove_outliers=remove_outliers,
            model_saver=model_saver
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the next experiment.
        print(f"Error running {model_name} with {sampling_method}: {e}")
    else:
        if result is not None:
            all_results.append(result)

KeyboardInterrupt: 

In [None]:
print(f"\n{'='*100}")
print("BEST OVERALL RESULT (FROM ALL RUNS)")
print(f"{'='*100}")

# Load existing results from CSV to find truly best overall result
existing_csv = "reports/03_model_testing_results/model_comparison_best_models.csv"
if os.path.exists(existing_csv):
    df_all_results = pd.read_csv(existing_csv)
    # Remove rows with missing validation scores: runs without a validation
    # set are stored with an empty val_f1_macro, and a column holding only
    # missing values would make idxmax/.loc fail instead of reaching the
    # "no valid results" branch below.
    df_valid_results = df_all_results.dropna(subset=['val_f1_macro'])

    if not df_valid_results.empty:
        best_idx = df_valid_results['val_f1_macro'].idxmax()
        best_result = df_valid_results.loc[best_idx]
        print(f"Best overall model: {best_result['model']}")
        print(f"Sampling Method: {best_result['sampling_method']}")
        print(f"Outlier Removal: {best_result['remove_outliers']}")
        print(f"Validation F1-Macro: {best_result['val_f1_macro']:.4f}")
        print(f"Validation Accuracy: {best_result['val_accuracy']:.4f}")
        print(f"Best CV Score: {best_result['best_cv_score']:.4f}")
        print(f"Best Parameters: {best_result['best_parameters']}")
    else:
        print("No valid results found in existing CSV.")
else:
    print("No existing results CSV found.")
