# A 02 II: MIT Hyperparameter tuning for baseline models using RandomizedSearch sampling

Establish baseline models via randomized search without any feature engineering but with sampling techniques, in order to identify the best sampling technique for the dataset.

## Content

A) MIT-BIH Arrhythmia Dataset

1. train/test split: 80%, 20% -> as defined at the beginning of the project to ensure result reproducibility, no duplicates or missing values present
2. Hyperparameter tuning using RandomizedSearch with cross validation for the mentioned baseline models and oversampling techniques
 


## 1. Imports

In [9]:
import os 
from typing import Dict, Optional
import random 

from src.utils import eval_model, evaluate_model
from src.visualization import save_cv_diagnostics, save_overfit_diagnostic, save_model_diagnostics, save_roc_curve
from src.utils.model_saver import create_model_saver

# external 
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint, uniform
import numpy as np
import re
import json

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Samplers

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from src.utils.preprocessing import (
    _normalize_sampling_method_name,
    _SAMPLING_REGISTRY
)
import mlflow
from mlflow.tracking import MlflowClient

# Single seed reused by the CV splitters, estimators and samplers configured below.
RANDOM_STATE = 42
# Also seed the global NumPy and Python RNGs.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


## 2. Param Spaces

In [10]:
# Search configuration per baseline model. Each entry holds:
#   "estimator": the unfitted model instance handed to the search,
#   "params":    the distributions / value lists to sample from,
#   "cv":        the cross-validation splitter,
#   "n_iter":    the number of sampled candidates.
PARAM_SPACES = {
    "LogisticRegression": {
        "estimator": LogisticRegression(max_iter=10000, solver='lbfgs', n_jobs=-1),
        "params": {
            "C": loguniform(1e-3, 1e3),      # Large C = weaker penalty on large weights (more freedom, risk of overfitting).
                                             # Small C = stronger penalty (more regularization, less overfitting).
                                             # loguniform = values are spread evenly across orders of magnitude (here 0.001 up to 1000), not in small linear steps.
            "penalty": ["l2"], # gently pushes weights toward zero, which keeps the model simpler and more stable.
            "solver": ["lbfgs"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
        # NOTE(review): this key only exists for LogisticRegression and is not read by the code visible in this notebook — confirm whether it is still needed.
        "create_new_model": False,
    },
    "KNN": {
        "estimator": KNeighborsClassifier(n_jobs=-1),
        "params": {
            "n_neighbors": randint(1, 51),  # scipy randint excludes the upper bound: integers 1..50
            "weights": ["uniform", "distance"],
            "metric": ["minkowski", "manhattan", "euclidean"],
            "p": [1, 2],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    "RandomForest": {
        "estimator": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [10, 15, 20],
            "min_samples_split": [2, 5, 10, 20, 50],
            "min_samples_leaf": [1, 2, 4, 8],
            "max_features": ["sqrt", "log2", None],
            "bootstrap": [True],
            "class_weight": ["balanced", None],
            "criterion": ["gini", "entropy"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    # NOTE(review): the SVM search space below is disabled (commented out); the reason is not recorded here.
    #"SVM": {
    #    "estimator": SVC(probability=True),
    #    "params": {
    #        "kernel": ["rbf", "poly"],
    #        "C": [0.1, 1, 10],
    #        "gamma": [0.001, 0.01, 0.1, 0.5, 0.9],
    #    },
    #    "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
    #    "n_iter": 10,
    #},
    "DecisionTree": {
        "estimator": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "params": {
            "max_depth": [None, 5, 10, 15, 20, 25, 30],
            "min_samples_split": [2, 5, 10, 20, 50],
            "min_samples_leaf": [1, 2, 4, 8, 16],
            "max_features": ["sqrt", "log2", None],
            "criterion": ["gini", "entropy"],
            "class_weight": ["balanced", None],
            "splitter": ["best", "random"],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 20,
    },
    "XGBoost": {
        "estimator": xgb.XGBClassifier(
            objective="multi:softprob",
            num_class=5,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            eval_metric="mlogloss",
        ),
        "params": {
            "n_estimators": [100, 200, 300, 500],
            "max_depth": [3, 4, 5, 6, 7, 8],
            "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
            "subsample": [0.8, 0.9, 1.0],
            "colsample_bytree": [0.8, 0.9, 1.0],
            "reg_alpha": [0, 0.1, 0.5, 1.0],
            "reg_lambda": [0, 0.1, 0.5, 1.0],
            "min_child_weight": [1, 3, 5, 7],
            "gamma": [0, 0.1, 0.2, 0.3],
        },
        "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 40,
    },
    # LDA is split into two entries because `shrinkage` is only searched for the lsqr/eigen solvers.
    "LDA_svd": {
        "estimator": LinearDiscriminantAnalysis(),
        # Only 1 * 2 * 3 = 6 distinct combinations exist, fewer than n_iter below.
        "params": {"solver": ["svd"], "store_covariance": [False, True], "tol": [1e-4, 1e-3, 1e-2]},
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    "LDA_lsqr": {
        "estimator": LinearDiscriminantAnalysis(),
        # NOTE(review): per the scikit-learn docs `tol` is only used by the 'svd' solver, so it presumably has no effect here — confirm.
        "params": {"solver": ["lsqr", "eigen"], "shrinkage": [None, "auto", 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], "tol": [1e-4, 1e-3, 1e-2]},
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 10,
    },
    "ANN": {
        "estimator": MLPClassifier(
            max_iter=300,
            early_stopping=True,
            random_state=RANDOM_STATE,
            n_iter_no_change=10,
            solver="adam",
        ),
        "params": {
            "hidden_layer_sizes": [(64,), (128,), (128, 64)],
            "activation": ["relu"],
            "alpha": loguniform(1e-4, 1e-2),
            "learning_rate_init": loguniform(1e-3, 1e-2),
            "batch_size": randint(64, 129),  # integers 64..128 (upper bound excluded)
            "beta_1": uniform(0.9, 0.09),    # scipy uniform(loc, scale): samples from [0.9, 0.99]
            "beta_2": uniform(0.95, 0.049),  # samples from [0.95, 0.999]
            "validation_fraction": [0.1, 0.15],
        },
        "cv": StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
        "n_iter": 100,
    },
}

## 3. Methods

In [11]:
def create_leak_free_pipeline(
    estimator,
    sampling_method: Optional[str] = "none",
    sampler_kwargs: Optional[Dict] = None,
    random_state: Optional[int] = 42,
) -> Pipeline:
    """
    Assemble a two-step imblearn pipeline: sampler first, classifier second.

    Wrapping the sampler in an imblearn ``Pipeline`` keeps resampling inside
    each CV fold, so it is fitted on that fold's training part only.

    Args:
        estimator: Classifier placed in the final ``"classifier"`` step.
        sampling_method: Sampler name; it is normalized and then looked up in
            the sampling registry.
        sampler_kwargs: Keyword arguments for the sampler constructor. The
            caller's dict is copied, never mutated.
        random_state: Seed injected into the sampler arguments unless the
            caller already supplied a ``random_state``; ``None`` disables the
            injection.

    Returns:
        Pipeline with the steps ``"sampler"`` and ``"classifier"``.
    """
    # Work on a private copy so the caller's dict stays untouched.
    sampler_options = dict(sampler_kwargs) if sampler_kwargs else {}
    if random_state is not None:
        # An explicitly passed random_state wins over the default seed.
        sampler_options.setdefault("random_state", random_state)

    registry_key = _normalize_sampling_method_name(sampling_method)
    sampler = _SAMPLING_REGISTRY[registry_key](**sampler_options)

    return Pipeline([("sampler", sampler), ("classifier", estimator)])

In [12]:
def configure_mlflow_for_minio(experiment_name, mlflow_tracking_uri, minio_endpoint_url):
    """
    Point MLflow at the tracking server and the MinIO artifact store.

    Credentials are read from the ``[default]`` profile of ``.aws/credentials``
    (project directory first, then the home directory) and exported as
    ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY``. Values missing from the
    file no longer overwrite variables that are already set in the environment.

    Args:
        experiment_name: Name of the MLflow experiment to look up or create.
        mlflow_tracking_uri: URI of the MLflow tracking server.
        minio_endpoint_url: Value exported as ``MLFLOW_S3_ENDPOINT_URL``.

    Returns:
        The ID of the existing or newly created experiment.

    Raises:
        LookupError: If no credentials file exists in either location.
    """
    import os
    from configparser import ConfigParser
    from pathlib import Path

    # Prefer a project-local credentials file over the one in the home directory.
    project_creds = Path('.aws/credentials')
    creds_file = project_creds if project_creds.exists() else Path.home() / '.aws' / 'credentials'
    if not creds_file.exists():
        raise LookupError("No MINIO Credentials found!")

    config = ConfigParser()
    config.read(creds_file)
    if 'default' in config:
        profile = config['default']
        for env_name, option in (
            ('AWS_ACCESS_KEY_ID', 'aws_access_key_id'),
            ('AWS_SECRET_ACCESS_KEY', 'aws_secret_access_key'),
        ):
            value = profile.get(option, '')
            if value:
                os.environ[env_name] = value
            else:
                # Exporting '' would clobber credentials that are already
                # present in the environment, so leave the variable alone.
                print(f"Warning: '{option}' missing in [default] of {creds_file}; {env_name} left unchanged.")
    else:
        print(f"Warning: no [default] profile in {creds_file}; AWS credentials not loaded from file.")

    os.environ['MLFLOW_S3_ENDPOINT_URL'] = minio_endpoint_url

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient(tracking_uri=mlflow_tracking_uri)

    # Reuse the experiment when it already exists, otherwise create it.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment:
        experiment_id = experiment.experiment_id
        print(f"Found experiment: {experiment_name} (ID: {experiment_id})")
    else:
        print(f"Experiment '{experiment_name}' not found, creating new experiment!")
        experiment_id = client.create_experiment(experiment_name)

    return experiment_id

In [13]:
def _mlflow_param_str(value, objects_as_class_name: bool = False) -> str:
    """Render one parameter value as the string that is stored in MLflow."""
    if isinstance(value, (list, dict)):
        return json.dumps(value)
    if value is None:
        return "None"
    # Plain scalars are logged by value; only genuine objects (e.g. a nested
    # sampler instance) are reduced to their class name.
    if objects_as_class_name and not isinstance(value, (str, int, float, np.generic)):
        return value.__class__.__name__
    return str(value)


def log_results_to_mlflow(
    model_name: str,
    search,
    eval_results: dict,
    summary: dict,
    sampling_method: str,
    remove_outliers: bool,
    results_path: str,
    experiment_id: str,
    n_iter: int,
    cv,
    refit_metric: str,
    random_state: int,
    X_train,
    dataset_name: str,
    dataset_source: str,
    mlflow_tracking_uri: str = "http://mlflow.home.lan",
    sampler_kwargs: Optional[Dict] = None,
):
    """
    Log one randomized-search result (params, metrics, artifacts, model) to MLflow.

    Args:
        model_name: Name of the model/classifier
        search: Fitted RandomizedSearchCV object
        eval_results: Dictionary with train/test evaluation results; its
            "labels" entry drives the per-class metric names
        summary: Dictionary with summary metrics
        sampling_method: Name of sampling method used
        remove_outliers: Whether outliers were removed
        results_path: Path to results CSV file (used to find plot files)
        experiment_id: MLflow experiment ID
        n_iter: Number of iterations in RandomizedSearchCV
        cv: Cross-validation object (its ``n_splits`` is logged)
        refit_metric: Metric used for refitting
        random_state: Random state used
        X_train: Training features; logged as the input dataset and used for
            the model's input example
        dataset_name: Dataset name used in the logged dataset title
        dataset_source: Source (path/URI) recorded for the logged dataset
        mlflow_tracking_uri: MLflow tracking server URI (only used to print
            the link to the run)
        sampler_kwargs: Dictionary with sampler parameters (e.g., k_neighbors, n_neighbors)
    """
    run_name = f"{model_name}_{sampling_method}_outliers_{remove_outliers}"

    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        # Dataset logging is best effort: a failure must not abort the run.
        try:
            dataset = mlflow.data.from_pandas(
                X_train,
                source=dataset_source,
                name=f"{dataset_name} Training Dataset"
            )
            mlflow.log_input(dataset, context="training")
        except Exception as e:
            print(f"Warning: Could not log dataset: {e}")

        # Log tags
        mlflow.set_tags({
            "dataset": "MIT-BIH",
            "phase": "baseline_models",
            "model_type": model_name,
            "sampling_method": sampling_method,
            "outlier_removal": str(remove_outliers),
        })

        # Log hyperparameters (best_params); MLflow stores params as strings.
        mlflow.log_params({
            key: _mlflow_param_str(value)
            for key, value in search.best_params_.items()
        })

        # Log experiment configuration parameters
        mlflow.log_params({
            "sampling_method": sampling_method,
            "remove_outliers": str(remove_outliers),
            "n_iter": str(n_iter),
            "cv_n_splits": str(cv.n_splits),
            "refit_metric": refit_metric,
            "random_state": str(random_state),
        })

        # Log sampler parameters if available
        if sampler_kwargs:
            sampler_params = {}
            for key, value in sampler_kwargs.items():
                if key == "smote":
                    # 'smote' is a sampler instance (SMOTETomek, SMOTEENN):
                    # log its relevant settings instead of the object itself.
                    sampler_params["sampler_smote_k_neighbors"] = str(getattr(value, "k_neighbors", "N/A"))
                    sampler_params["sampler_smote_random_state"] = str(getattr(value, "random_state", "N/A"))
                else:
                    sampler_params[f"sampler_{key}"] = _mlflow_param_str(value, objects_as_class_name=True)

            mlflow.log_params(sampler_params)

        # Log summary metrics
        metric_keys = (
            "best_cv_score",
            "test_f1_macro",
            "train_f1_macro",
            "test_accuracy",
            "train_test_diff",
            "cv_mean_val_f1_macro",
            "cv_std_val_f1_macro",
            "cv_mean_train_f1_macro",
            "cv_std_train_f1_macro",
            "cv_diff_train_val_f1_macro",
            "cv_mean_val_bal_acc",
            "cv_std_val_bal_acc",
            "mean_fit_time",
            "std_fit_time",
        )
        mlflow.log_metrics({key: summary[key] for key in metric_keys})

        # Log ROC-AUC if available
        if summary["roc_auc"] is not None:
            mlflow.log_metric("test_roc_auc", summary["roc_auc"])

        # Log per-class F1 scores as metrics
        for lbl in eval_results["labels"]:
            mlflow.log_metric(f"test_f1_class_{lbl}", summary[f"test_f1_class_{lbl}"])
            mlflow.log_metric(f"train_f1_class_{lbl}", summary[f"train_f1_class_{lbl}"])

        # Log diagnostic plots; files that were not generated are skipped.
        base = results_path.replace(".csv", "")
        plot_suffixes = (
            "cv_tradeoff",
            "cv_spread",
            "cv_learning_curve",
            "overfit_diag",
            "roc_curve",
            "model_diagnostics",
        )
        for suffix in plot_suffixes:
            plot_path = f"{base}_{model_name}_{sampling_method}_{suffix}.png"
            if os.path.exists(plot_path):
                mlflow.log_artifact(plot_path, "diagnostics")

        # Log CSV files
        cv_full_path = results_path.replace(".csv", '_'+model_name+'_'+sampling_method.lower()+'_outliers_'+str(remove_outliers)+"_cv_results.csv")
        if os.path.exists(cv_full_path):
            mlflow.log_artifact(cv_full_path, "data")
        if os.path.exists(results_path):
            mlflow.log_artifact(results_path, "data")

        # Log the trained model
        mlflow.sklearn.log_model(
            search.best_estimator_,
            name="model",
            input_example=X_train.iloc[:1].values if hasattr(X_train, 'iloc') else X_train[:1],
            registered_model_name=f"{model_name}_{sampling_method}",
        )

        run_id = mlflow.active_run().info.run_id
        print(f"✅ Logged to MLflow: {run_name}")
        print(f"   Run ID: {run_id}")
        print(f"   View at: {mlflow_tracking_uri}/#/experiments/{experiment_id}/runs/{run_id}")

In [14]:
def run_randomized_search(
    model_name,
    estimator,
    params,
    X_train,
    y_train,
    X_test,
    y_test,
    cv,
    results_path,
    sampling_method="No_Sampling",
    sampler_kwargs=None,
    remove_outliers=False,
    model_saver=None,
    scoring=None,
    n_iter=20,
    refit_metric="f1_macro",
    verbose=1,
    log_to_mlflow=True,
    dataset_name=None,
    dataset_source=None,
    mlflow_experiment_id=None,
    mlflow_tracking_uri="http://mlflow.home.lan"
):
    """
    Run RandomizedSearchCV for one model / sampling-method combination.

    The best estimator is evaluated on train and test data, one summary row is
    appended to ``results_path``, the full CV table and diagnostic plots are
    written next to it, and the result is optionally logged to MLflow and
    persisted through ``model_saver``.

    Args:
        model_name: Name used in logs, file names and the summary row.
        estimator: Unfitted classifier to tune.
        params: Parameter distributions, keyed by the classifier's own
            parameter names.
        X_train, y_train: Training data used for the search.
        X_test, y_test: Hold-out data used for the final evaluation.
        cv: Cross-validation splitter.
        results_path: CSV file that collects one summary row per run.
        sampling_method: Sampler name; "No_Sampling" tunes the bare estimator.
        sampler_kwargs: Keyword arguments for the sampler.
        remove_outliers: Flag recorded in names and results.
        model_saver: Optional saver; when given, combinations that are already
            saved are skipped.
        scoring: Scorer dict for the search. The summary reads the
            ``f1_macro`` and ``bal_acc`` columns, so a dict providing exactly
            these two is used when ``None`` is passed.
        n_iter: Number of sampled candidates.
        refit_metric: Scorer key used to pick and refit the best candidate.
        verbose: Verbosity passed to RandomizedSearchCV.
        log_to_mlflow: Log to MLflow (also requires ``mlflow_experiment_id``).
        dataset_name, dataset_source: Dataset description forwarded to MLflow.
        mlflow_experiment_id: Target MLflow experiment.
        mlflow_tracking_uri: Tracking server URI forwarded to the MLflow logger.

    Returns:
        The summary dict, or ``None`` when the combination was skipped.
    """
    print(f"\n{'='*80}")
    print(f"Running RandomizedSearchCV for {model_name} ({sampling_method})")
    print(f"Outlier removal: {remove_outliers}")
    print(f"{'='*80}")

    # Key under which the model saver stores this combination.
    experiment_name = f"{sampling_method.lower()}_outliers_{remove_outliers}"

    # --- SKIP if model already exists ---
    if model_saver and model_saver.model_exists(model_name, experiment_name):
        print(f"  Skipping {model_name} ({experiment_name}) - model already saved.")
        try:
            meta = model_saver.load_metadata(model_name, experiment_name)
            if meta:
                print(f"    Existing model best_score={meta.get('best_score'):.4f}, "
                      f"params={meta.get('best_params')}")
        except Exception as e:
            print(f"  (Could not load metadata: {e})")
        return None
    # ---------------------------------------------------------------

    # The summary below reads the f1_macro and bal_acc CV columns; without a
    # scorer dict they would not exist and the run would fail after fitting.
    if scoring is None:
        scoring = {"f1_macro": "f1_macro", "bal_acc": "balanced_accuracy"}

    # Create leak-free pipeline - only applies for sampling methods
    if sampling_method != "No_Sampling":
        estimator = create_leak_free_pipeline(estimator, sampling_method, sampler_kwargs)
        # Inside the pipeline the classifier's parameters carry a step prefix.
        params = {f'classifier__{param_name}': param_values
                  for param_name, param_values in params.items()}

    # Run the search
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=params,
        scoring=scoring,
        refit=refit_metric,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=verbose,
        return_train_score=True,
        # Seed the candidate sampling so reruns draw the same candidates; this
        # is also the seed reported to MLflow below.
        random_state=RANDOM_STATE,
    )
    search.fit(X_train, y_train)

    # Evaluate best model
    eval_results = evaluate_model(search.best_estimator_, X_train, y_train, X_test, y_test)
    train_scores = eval_results["train"]
    test_scores = eval_results["test"]
    roc_auc = test_scores["roc_auc"]

    # Summary table (1 row per model). Key order defines the CSV column order.
    summary = {
        "model": model_name,
        "sampling_method": sampling_method,
        "remove_outliers": remove_outliers,
        "best_cv_score": round(search.best_score_, 4),  # best mean validation score (refit_metric), higher is better
        "best_params": json.dumps(search.best_params_),
        "train_f1_macro": round(train_scores["f1_macro"], 4),  # fit on seen data across all classes
        "test_f1_macro": round(test_scores["f1_macro"], 4),  # balanced generalization across all classes
        "test_accuracy": round(test_scores["accuracy"], 4),  # overall proportion of correct predictions
        "train_test_diff": round(train_scores["f1_macro"] - test_scores["f1_macro"], 4),  # over-/underfitting gap, smaller is better
        # Missing or NaN ROC-AUC is stored as None.
        "roc_auc": round(roc_auc, 4) if roc_auc is not None and not np.isnan(roc_auc) else None,
    }

    # Cross-fold metrics of the best candidate
    cv_df = pd.DataFrame(search.cv_results_)
    best_idx = search.best_index_

    def best_value(column):
        return cv_df[column][best_idx]

    summary["cv_mean_train_f1_macro"] = round(best_value("mean_train_f1_macro"), 4)  # much higher than validation: possible overfitting
    summary["cv_std_train_f1_macro"] = round(best_value("std_train_f1_macro"), 4)  # low = stable learning across folds
    summary["cv_mean_val_f1_macro"] = round(best_value("mean_test_f1_macro"), 4)  # balanced per-class performance
    summary["cv_std_val_f1_macro"] = round(best_value("std_test_f1_macro"), 4)  # should be low
    summary["cv_diff_train_val_f1_macro"] = round(
        best_value("mean_train_f1_macro") - best_value("mean_test_f1_macro"), 4
    )
    summary["cv_mean_val_bal_acc"] = round(best_value("mean_test_bal_acc"), 4)  # average recall per class, higher is better
    summary["cv_std_val_bal_acc"] = round(best_value("std_test_bal_acc"), 4)  # should be low
    summary["mean_fit_time"] = round(best_value("mean_fit_time"), 4)
    summary["std_fit_time"] = round(best_value("std_fit_time"), 4)

    for lbl, f1_val in zip(eval_results["labels"], test_scores["f1_per_class"]):
        summary[f"test_f1_class_{lbl}"] = round(float(f1_val), 4)

    for lbl, f1_val in zip(eval_results["labels"], train_scores["f1_per_class"]):
        summary[f"train_f1_class_{lbl}"] = round(float(f1_val), 4)

    # A bare filename has no directory part; os.makedirs("") would raise.
    results_dir = os.path.dirname(results_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    pd.DataFrame([summary]).to_csv(results_path, mode="a", header=not os.path.exists(results_path), index=False)

    # Save full CV results for analysis
    cv_full_path = results_path.replace(".csv", '_'+model_name+'_'+experiment_name+"_cv_results.csv")
    cv_df.to_csv(cv_full_path, index=False)

    # Generate diagnostics / graphics
    save_overfit_diagnostic(cv_df, model_name, sampling_method, results_path)
    save_cv_diagnostics(cv_df, model_name, sampling_method, results_path)
    save_model_diagnostics(eval_results, model_name, sampling_method, results_path)
    save_roc_curve(search.best_estimator_, X_test, y_test, model_name, sampling_method, results_path)

    print(f"Saved unified results to {results_path}")

    # Log to MLflow if enabled
    if log_to_mlflow and mlflow_experiment_id is not None:
        log_results_to_mlflow(
            model_name=model_name,
            search=search,
            eval_results=eval_results,
            summary=summary,
            sampling_method=sampling_method,
            sampler_kwargs=sampler_kwargs,
            remove_outliers=remove_outliers,
            results_path=results_path,
            experiment_id=mlflow_experiment_id,
            n_iter=n_iter,
            cv=cv,
            refit_metric=refit_metric,
            random_state=RANDOM_STATE,
            dataset_name=dataset_name,
            dataset_source=dataset_source,
            X_train=X_train,
            mlflow_tracking_uri=mlflow_tracking_uri,
        )

    if model_saver:
        meta = {
            "best_params": search.best_params_,
            "best_score": search.best_score_,
            "cv_results": search.cv_results_,
            "experiment": experiment_name,
            "classifier": model_name,
        }
        model_saver.save_model(model_name, search, experiment_name, meta)
        # Only report a saved model when one was actually written.
        print(f"Saved model {model_name} ({experiment_name})!")

    return summary

## 4. Test models with Randomized Search CV

#### 4.1 Run the randomized search CV with sampling

The search is run with sampling techniques, but without:
- Feature engineering (RR interval!)
- Baseline wander removal
- Denoising

In [15]:

REDUCED_DATASET = False  # True: run on a 5% stratified subsample for quick testing
EXPERIMENT_NAME = "MIT_02_02_RS_SAMPLING"
if REDUCED_DATASET:
    # Keep reduced test runs apart from the full runs. The suffix is applied
    # before RESULTS_PATH is derived so that the report directory follows it.
    # NOTE(review): the model saver directory below is shared with full runs.
    EXPERIMENT_NAME += '_RED'

SCORING = {'f1_macro': 'f1_macro', 'bal_acc': 'balanced_accuracy', 'f1_weighted': 'f1_weighted'}
RESULTS_PATH = f"reports/03_baseline_models/{EXPERIMENT_NAME}/A_02_02.csv"
minio_endpoint_url = "http://192.168.178.78:9500"
MLFLOW_TRACKING_URI = "http://mlflow.home.lan"
dataset_name = "MIT-BIH"
dataset_train = "data/original/mitbih_train.csv"

# Import MIT data (the files have no header row)
df_mitbih_train = pd.read_csv(dataset_train, header=None)
df_mitbih_test = pd.read_csv('data/original/mitbih_test.csv', header=None)

# Define train and test set: column 187 is the target, the rest are features
X_train = df_mitbih_train.drop(187, axis=1)
y_train = df_mitbih_train[187]

X_test = df_mitbih_test.drop(187, axis=1)
y_test = df_mitbih_test[187]

print("MITBIH dataset")
print(f"\tTraining size: {X_train.shape}, {y_train.shape}")
print(f"\tTest size: {X_test.shape}, {y_test.shape}")

if REDUCED_DATASET:
    # Subsample training set to 5 % (stratified, keeping all classes)
    X_train_small, _, y_train_small, _ = train_test_split(
        X_train, y_train,
        train_size=0.05,
        stratify=y_train,
        random_state=RANDOM_STATE
    )

    # Subsample test set to 5 % as well
    X_test_small, _, y_test_small, _ = train_test_split(
        X_test, y_test,
        train_size=0.05,
        stratify=y_test,
        random_state=RANDOM_STATE
    )

    print("Reduced MIT-BIH dataset")
    print(f"\tTraining size: {X_train_small.shape}, {y_train_small.shape}")
    print(f"\tTest size: {X_test_small.shape}, {y_test_small.shape}")

    # Assign back so the rest of the notebook uses the reduced data
    X_train, y_train = X_train_small, y_train_small
    X_test, y_test = X_test_small, y_test_small


experiment_id = configure_mlflow_for_minio(EXPERIMENT_NAME, MLFLOW_TRACKING_URI, minio_endpoint_url)

model_saver = create_model_saver("src/models/MIT_02_02_baseline_models_randomized_search_sampling")

# Sampler name -> constructor keyword arguments
sampling_methods = {
    'RandomOverSampler': {"random_state": RANDOM_STATE},
    'SMOTE': {"random_state": RANDOM_STATE, "k_neighbors": 5},
    'ADASYN': {"random_state": RANDOM_STATE, "n_neighbors": 5},
    'SMOTETomek': {"random_state": RANDOM_STATE, "smote": SMOTE(random_state=RANDOM_STATE, k_neighbors=5)},
    'SMOTEENN': {"random_state": RANDOM_STATE, "smote": SMOTE(random_state=RANDOM_STATE, k_neighbors=5)},
}

# NOTE(review): not used by the cells visible here; "SVM" is currently disabled in PARAM_SPACES.
best_models = ["XGBoost", "SVM", "KNN"]


MITBIH dataset
	Training size: (87554, 187), (87554,)
	Test size: (21892, 187), (21892,)
Found experiment: MIT_02_02_RS_SAMPLING (ID: 10)


In [16]:
# Tune every model in PARAM_SPACES once per sampling method.
for model_name, param_dict in PARAM_SPACES.items():
    # Arguments that are identical for all samplers tried with this model.
    shared_kwargs = dict(
        estimator=param_dict["estimator"],
        params=param_dict["params"],
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        cv=param_dict["cv"],
        results_path=RESULTS_PATH,
        remove_outliers=False,
        model_saver=model_saver,
        scoring=SCORING,
        verbose=3,
        n_iter=param_dict["n_iter"],
        refit_metric="f1_macro",
        log_to_mlflow=True,
        dataset_name=dataset_name,
        dataset_source=dataset_train,
        mlflow_experiment_id=experiment_id,
    )
    for sampler_name, sampler_kwargs in sampling_methods.items():
        print(model_name, param_dict, sampler_name, sampler_kwargs)
        run_randomized_search(
            model_name,
            sampling_method=sampler_name,
            sampler_kwargs=sampler_kwargs,
            **shared_kwargs,
        )

INFO:src.utils.model_saver:Metadata loaded: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LogisticRegression_randomoversampler_outliers_False_metadata.pkl
INFO:src.utils.model_saver:Metadata loaded: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LogisticRegression_smote_outliers_False_metadata.pkl
INFO:src.utils.model_saver:Metadata loaded: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LogisticRegression_adasyn_outliers_False_metadata.pkl
INFO:src.utils.model_saver:Metadata loaded: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LogisticRegression_smotetomek_outliers_False_metadata.pkl
INFO:src.utils.model_saver:Metadata loaded: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LogisticRegression_smoteenn_outliers_False_metadata.pkl
INFO:src.utils.model_saver:Metadata loaded: src/models/MIT_02_02_baseline_models_randomized_search_sampling/KNN_randomoversampler_outliers_False_metadata.pkl
INFO:src.utils.m

LogisticRegression {'estimator': LogisticRegression(max_iter=10000, n_jobs=-1), 'params': {'C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f63f3abb810>, 'penalty': ['l2'], 'solver': ['lbfgs']}, 'cv': StratifiedKFold(n_splits=5, random_state=42, shuffle=True), 'n_iter': 10, 'create_new_model': False} RandomOverSampler {'random_state': 42}

Running RandomizedSearchCV for LogisticRegression (RandomOverSampler)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LogisticRegression_randomoversampler_outliers_False.joblib')
  Skipping LogisticRegression (randomoversampler_outliers_False) - model already saved.
    Existing model best_score=0.4843, params={'classifier__C': 1.40779231399724, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
LogisticRegression {'estimator': LogisticRegression(max_iter=10000, n_jobs=-1), 'params': {'C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x



[CV 2/3] END classifier__solver=svd, classifier__store_covariance=False, classifier__tol=0.0001; bal_acc: (train=0.765, test=0.768) f1_macro: (train=0.504, test=0.503) f1_weighted: (train=0.760, test=0.759) total time=  29.9s
[CV 1/3] END classifier__solver=svd, classifier__store_covariance=False, classifier__tol=0.0001; bal_acc: (train=0.772, test=0.761) f1_macro: (train=0.512, test=0.511) f1_weighted: (train=0.770, test=0.771) total time=  37.5s
[CV 3/3] END classifier__solver=svd, classifier__store_covariance=False, classifier__tol=0.01; bal_acc: (train=0.773, test=0.752) f1_macro: (train=0.510, test=0.500) f1_weighted: (train=0.765, test=0.760) total time= 1.0min
[CV 3/3] END classifier__solver=svd, classifier__store_covariance=False, classifier__tol=0.001; bal_acc: (train=0.773, test=0.752) f1_macro: (train=0.510, test=0.500) f1_weighted: (train=0.765, test=0.760) total time= 1.1min
[CV 2/3] END classifier__solver=svd, classifier__store_covariance=False, classifier__tol=0.01; bal_

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
INFO:botocore.credentials:Found credentials in environment variables.
Successfully registered model 'LDA_svd_RandomOverSampler'.
2025/11/11 22:24:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_svd_RandomOverSampler, version 1
Created version '1' of model 'LDA_svd_RandomOverSampler'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_randomoversampler_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_randomoversampler_outliers_False_metadata.pkl


✅ Logged to MLflow: LDA_svd_RandomOverSampler_outliers_False
   Run ID: 89432e445f4b4d9f9672f88090c57679
   View at: http://mlflow.home.lan/#/experiments/10/runs/89432e445f4b4d9f9672f88090c57679
🏃 View run LDA_svd_RandomOverSampler_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/89432e445f4b4d9f9672f88090c57679
🧪 View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_svd (randomoversampler_outliers_False)!
LDA_svd {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['svd'], 'store_covariance': [False, True], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} SMOTE {'random_state': 42, 'k_neighbors': 5}

Running RandomizedSearchCV for LDA_svd (SMOTE)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smote_outliers_False.joblib')
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 3/3] END clas

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_svd_SMOTE'.
2025/11/11 22:26:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_svd_SMOTE, version 1
Created version '1' of model 'LDA_svd_SMOTE'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smote_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smote_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: LDA_svd_SMOTE_outliers_False
   Run ID: c3deab6e97324e8ca3086aa1c413aa4d
   View at: http://mlflow.home.lan/#/experiments/10/runs/c3deab6e97324e8ca3086aa1c413aa4d
üèÉ View run LDA_svd_SMOTE_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/c3deab6e97324e8ca3086aa1c413aa4d
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_svd (smote_outliers_False)!
LDA_svd {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['svd'], 'store_covariance': [False, True], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} ADASYN {'random_state': 42, 'n_neighbors': 5}

Running RandomizedSearchCV for LDA_svd (ADASYN)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_adasyn_outliers_False.joblib')
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 2/3] END classifier__solver=svd, classifier__s

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_svd_ADASYN'.
2025/11/11 22:28:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_svd_ADASYN, version 1
Created version '1' of model 'LDA_svd_ADASYN'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_adasyn_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_adasyn_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: LDA_svd_ADASYN_outliers_False
   Run ID: 56c83070b76f4a84917d3848d8752043
   View at: http://mlflow.home.lan/#/experiments/10/runs/56c83070b76f4a84917d3848d8752043
üèÉ View run LDA_svd_ADASYN_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/56c83070b76f4a84917d3848d8752043
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_svd (adasyn_outliers_False)!
LDA_svd {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['svd'], 'store_covariance': [False, True], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} SMOTETomek {'random_state': 42, 'smote': SMOTE(random_state=42)}

Running RandomizedSearchCV for LDA_svd (SMOTETomek)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smotetomek_outliers_False.joblib')
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 2/3] END classif

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_svd_SMOTETomek'.
2025/11/11 22:47:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_svd_SMOTETomek, version 1
Created version '1' of model 'LDA_svd_SMOTETomek'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smotetomek_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smotetomek_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: LDA_svd_SMOTETomek_outliers_False
   Run ID: 37c5fca2ca354d95a52481c4f165b3d0
   View at: http://mlflow.home.lan/#/experiments/10/runs/37c5fca2ca354d95a52481c4f165b3d0
üèÉ View run LDA_svd_SMOTETomek_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/37c5fca2ca354d95a52481c4f165b3d0
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_svd (smotetomek_outliers_False)!
LDA_svd {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['svd'], 'store_covariance': [False, True], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} SMOTEENN {'random_state': 42, 'smote': SMOTE(random_state=42)}

Running RandomizedSearchCV for LDA_svd (SMOTEENN)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smoteenn_outliers_False.joblib')
Fitting 3 folds for each of 6 candidates, totalling 18 fits




[CV 2/3] END classifier__solver=svd, classifier__store_covariance=False, classifier__tol=0.001; bal_acc: (train=0.767, test=0.769) f1_macro: (train=0.499, test=0.498) f1_weighted: (train=0.751, test=0.750) total time=13.3min
[CV 1/3] END classifier__solver=svd, classifier__store_covariance=True, classifier__tol=0.0001; bal_acc: (train=0.775, test=0.762) f1_macro: (train=0.509, test=0.507) f1_weighted: (train=0.764, test=0.765) total time=13.8min
[CV 2/3] END classifier__solver=svd, classifier__store_covariance=True, classifier__tol=0.001; bal_acc: (train=0.767, test=0.769) f1_macro: (train=0.499, test=0.498) f1_weighted: (train=0.751, test=0.750) total time=13.8min
[CV 2/3] END classifier__solver=svd, classifier__store_covariance=False, classifier__tol=0.0001; bal_acc: (train=0.767, test=0.769) f1_macro: (train=0.499, test=0.498) f1_weighted: (train=0.751, test=0.750) total time=14.0min
[CV 2/3] END classifier__solver=svd, classifier__store_covariance=True, classifier__tol=0.01; bal_ac

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_svd_SMOTEENN'.
2025/11/11 23:06:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_svd_SMOTEENN, version 1
Created version '1' of model 'LDA_svd_SMOTEENN'.


‚úÖ Logged to MLflow: LDA_svd_SMOTEENN_outliers_False
   Run ID: fb8ba4751a4149a1812c92d7f92c3503
   View at: http://mlflow.home.lan/#/experiments/10/runs/fb8ba4751a4149a1812c92d7f92c3503
üèÉ View run LDA_svd_SMOTEENN_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/fb8ba4751a4149a1812c92d7f92c3503
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10


INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smoteenn_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_svd_smoteenn_outliers_False_metadata.pkl


Saved model LDA_svd (smoteenn_outliers_False)!
LDA_lsqr {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} RandomOverSampler {'random_state': 42}

Running RandomizedSearchCV for LDA_lsqr (RandomOverSampler)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_randomoversampler_outliers_False.joblib')
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 2/3] END classifier__shrinkage=None, classifier__solver=lsqr, classifier__tol=0.0001; bal_acc: (train=0.765, test=0.768) f1_macro: (train=0.504, test=0.503) f1_weighted: (train=0.760, test=0.759) total time=   5.1s
[CV 1/3] END classifier__shrinkage=None, classifier__solver=lsqr, classifier__tol=0.0001; bal_acc: (train=0.772, test=

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_lsqr_RandomOverSampler'.
2025/11/11 23:07:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_lsqr_RandomOverSampler, version 1
Created version '1' of model 'LDA_lsqr_RandomOverSampler'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_randomoversampler_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_randomoversampler_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: LDA_lsqr_RandomOverSampler_outliers_False
   Run ID: eeea34d43243408289be80dff7ade230
   View at: http://mlflow.home.lan/#/experiments/10/runs/eeea34d43243408289be80dff7ade230
üèÉ View run LDA_lsqr_RandomOverSampler_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/eeea34d43243408289be80dff7ade230
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_lsqr (randomoversampler_outliers_False)!
LDA_lsqr {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} SMOTE {'random_state': 42, 'k_neighbors': 5}

Running RandomizedSearchCV for LDA_lsqr (SMOTE)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smote_outliers_False.joblib')
Fitti

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_lsqr_SMOTE'.
2025/11/11 23:08:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_lsqr_SMOTE, version 1
Created version '1' of model 'LDA_lsqr_SMOTE'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smote_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smote_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: LDA_lsqr_SMOTE_outliers_False
   Run ID: 17c60d66db1746fd893afec36cadae1e
   View at: http://mlflow.home.lan/#/experiments/10/runs/17c60d66db1746fd893afec36cadae1e
üèÉ View run LDA_lsqr_SMOTE_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/17c60d66db1746fd893afec36cadae1e
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_lsqr (smote_outliers_False)!
LDA_lsqr {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} ADASYN {'random_state': 42, 'n_neighbors': 5}

Running RandomizedSearchCV for LDA_lsqr (ADASYN)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_adasyn_outliers_False.joblib')
Fitting 3 folds for each of 10 candida

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_lsqr_ADASYN'.
2025/11/11 23:09:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_lsqr_ADASYN, version 1
Created version '1' of model 'LDA_lsqr_ADASYN'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_adasyn_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_adasyn_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: LDA_lsqr_ADASYN_outliers_False
   Run ID: 76137a8b55b14d14a87d5c50983b4db4
   View at: http://mlflow.home.lan/#/experiments/10/runs/76137a8b55b14d14a87d5c50983b4db4
üèÉ View run LDA_lsqr_ADASYN_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/76137a8b55b14d14a87d5c50983b4db4
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_lsqr (adasyn_outliers_False)!
LDA_lsqr {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} SMOTETomek {'random_state': 42, 'smote': SMOTE(random_state=42)}

Running RandomizedSearchCV for LDA_lsqr (SMOTETomek)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smotetomek_outliers_False.joblib')
Fitting 

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_lsqr_SMOTETomek'.
2025/11/11 23:39:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_lsqr_SMOTETomek, version 1
Created version '1' of model 'LDA_lsqr_SMOTETomek'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smotetomek_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smotetomek_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: LDA_lsqr_SMOTETomek_outliers_False
   Run ID: 9db04f5ecba642ed9dbb4a85e0bedc90
   View at: http://mlflow.home.lan/#/experiments/10/runs/9db04f5ecba642ed9dbb4a85e0bedc90
üèÉ View run LDA_lsqr_SMOTETomek_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/9db04f5ecba642ed9dbb4a85e0bedc90
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model LDA_lsqr (smotetomek_outliers_False)!
LDA_lsqr {'estimator': LinearDiscriminantAnalysis(), 'params': {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.0, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.9], 'tol': [0.0001, 0.001, 0.01]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 10} SMOTEENN {'random_state': 42, 'smote': SMOTE(random_state=42)}

Running RandomizedSearchCV for LDA_lsqr (SMOTEENN)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smoteenn_outliers_False.joblib')
Fi

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'LDA_lsqr_SMOTEENN'.
2025/11/12 00:11:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LDA_lsqr_SMOTEENN, version 1
Created version '1' of model 'LDA_lsqr_SMOTEENN'.


‚úÖ Logged to MLflow: LDA_lsqr_SMOTEENN_outliers_False
   Run ID: dea98dce941040188f93f1d0d0b35253
   View at: http://mlflow.home.lan/#/experiments/10/runs/dea98dce941040188f93f1d0d0b35253
üèÉ View run LDA_lsqr_SMOTEENN_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/dea98dce941040188f93f1d0d0b35253
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10


INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smoteenn_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/LDA_lsqr_smoteenn_outliers_False_metadata.pkl


Saved model LDA_lsqr (smoteenn_outliers_False)!
ANN {'estimator': MLPClassifier(early_stopping=True, max_iter=300, random_state=42), 'params': {'hidden_layer_sizes': [(64,), (128,), (128, 64)], 'activation': ['relu'], 'alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f64aa989390>, 'learning_rate_init': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f63f3b6ef50>, 'batch_size': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7f63f3b6e6d0>, 'beta_1': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f6409282890>, 'beta_2': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f63f3a52990>, 'validation_fraction': [0.1, 0.15]}, 'cv': StratifiedKFold(n_splits=3, random_state=42, shuffle=True), 'n_iter': 100} RandomOverSampler {'random_state': 42}

Running RandomizedSearchCV for ANN (RandomOverSampler)
Outlier removal: False
model_path=PosixPath('src/models/MIT_02_02_baseline_models_randomize



[CV 2/3] END classifier__activation=relu, classifier__alpha=0.005934180807121819, classifier__batch_size=115, classifier__beta_1=0.9337151353163824, classifier__beta_2=0.9806671358699975, classifier__hidden_layer_sizes=(128,), classifier__learning_rate_init=0.003169858557151913, classifier__validation_fraction=0.15; bal_acc: (train=0.987, test=0.920) f1_macro: (train=0.861, test=0.799) f1_weighted: (train=0.963, test=0.950) total time= 2.1min
[CV 1/3] END classifier__activation=relu, classifier__alpha=0.00015428154401494676, classifier__batch_size=125, classifier__beta_1=0.9633292073442021, classifier__beta_2=0.9732345176252789, classifier__hidden_layer_sizes=(128,), classifier__learning_rate_init=0.005799151077727614, classifier__validation_fraction=0.1; bal_acc: (train=0.994, test=0.899) f1_macro: (train=0.931, test=0.837) f1_weighted: (train=0.984, test=0.964) total time= 2.3min
[CV 3/3] END classifier__activation=relu, classifier__alpha=0.0018741622886871703, classifier__batch_size

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'ANN_RandomOverSampler'.
2025/11/12 00:49:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ANN_RandomOverSampler, version 1
Created version '1' of model 'ANN_RandomOverSampler'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_randomoversampler_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_randomoversampler_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: ANN_RandomOverSampler_outliers_False
   Run ID: be90e0f96f1c48959f4089bbcad8605b
   View at: http://mlflow.home.lan/#/experiments/10/runs/be90e0f96f1c48959f4089bbcad8605b
üèÉ View run ANN_RandomOverSampler_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/be90e0f96f1c48959f4089bbcad8605b
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model ANN (randomoversampler_outliers_False)!
ANN {'estimator': MLPClassifier(early_stopping=True, max_iter=300, random_state=42), 'params': {'hidden_layer_sizes': [(64,), (128,), (128, 64)], 'activation': ['relu'], 'alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f64aa989390>, 'learning_rate_init': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f63f3b6ef50>, 'batch_size': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7f63f3b6e6d0>, 'beta_1': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f6409282890>, 'b



[CV 3/3] END classifier__activation=relu, classifier__alpha=0.0044174104255696916, classifier__batch_size=105, classifier__beta_1=0.9518864827962829, classifier__beta_2=0.963428705813673, classifier__hidden_layer_sizes=(64,), classifier__learning_rate_init=0.0025424246351022427, classifier__validation_fraction=0.15; bal_acc: (train=0.973, test=0.883) f1_macro: (train=0.880, test=0.810) f1_weighted: (train=0.971, test=0.957) total time= 1.5min
[CV 1/3] END classifier__activation=relu, classifier__alpha=0.00038808916604864797, classifier__batch_size=87, classifier__beta_1=0.9102814133328357, classifier__beta_2=0.9777230543783614, classifier__hidden_layer_sizes=(128, 64), classifier__learning_rate_init=0.007844264458775055, classifier__validation_fraction=0.15; bal_acc: (train=0.979, test=0.899) f1_macro: (train=0.895, test=0.833) f1_weighted: (train=0.976, test=0.962) total time= 3.4min
[CV 1/3] END classifier__activation=relu, classifier__alpha=0.005955537519654592, classifier__batch_si

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'ANN_SMOTE'.
2025/11/12 01:33:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ANN_SMOTE, version 1
Created version '1' of model 'ANN_SMOTE'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_smote_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_smote_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: ANN_SMOTE_outliers_False
   Run ID: ea4cc5291a1647c8b54aff28d1014976
   View at: http://mlflow.home.lan/#/experiments/10/runs/ea4cc5291a1647c8b54aff28d1014976
üèÉ View run ANN_SMOTE_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/ea4cc5291a1647c8b54aff28d1014976
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model ANN (smote_outliers_False)!
ANN {'estimator': MLPClassifier(early_stopping=True, max_iter=300, random_state=42), 'params': {'hidden_layer_sizes': [(64,), (128,), (128, 64)], 'activation': ['relu'], 'alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f64aa989390>, 'learning_rate_init': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f63f3b6ef50>, 'batch_size': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7f63f3b6e6d0>, 'beta_1': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f6409282890>, 'beta_2': <scipy.stats._distn_infrastr



[CV 1/3] END classifier__activation=relu, classifier__alpha=0.008046278403424575, classifier__batch_size=69, classifier__beta_1=0.9131736592946025, classifier__beta_2=0.968009186692285, classifier__hidden_layer_sizes=(64,), classifier__learning_rate_init=0.0010612358361428505, classifier__validation_fraction=0.1; bal_acc: (train=0.973, test=0.889) f1_macro: (train=0.816, test=0.760) f1_weighted: (train=0.954, test=0.940) total time= 2.6min
[CV 2/3] END classifier__activation=relu, classifier__alpha=0.008046278403424575, classifier__batch_size=69, classifier__beta_1=0.9131736592946025, classifier__beta_2=0.968009186692285, classifier__hidden_layer_sizes=(64,), classifier__learning_rate_init=0.0010612358361428505, classifier__validation_fraction=0.1; bal_acc: (train=0.975, test=0.900) f1_macro: (train=0.822, test=0.761) f1_weighted: (train=0.953, test=0.939) total time= 2.6min
[CV 2/3] END classifier__activation=relu, classifier__alpha=0.0004240874126924652, classifier__batch_size=77, cl

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'ANN_ADASYN'.
2025/11/12 02:15:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ANN_ADASYN, version 1
Created version '1' of model 'ANN_ADASYN'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_adasyn_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_adasyn_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: ANN_ADASYN_outliers_False
   Run ID: cf2ef40914fb4f579719b578875dfd25
   View at: http://mlflow.home.lan/#/experiments/10/runs/cf2ef40914fb4f579719b578875dfd25
üèÉ View run ANN_ADASYN_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/cf2ef40914fb4f579719b578875dfd25
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model ANN (adasyn_outliers_False)!
ANN {'estimator': MLPClassifier(early_stopping=True, max_iter=300, random_state=42), 'params': {'hidden_layer_sizes': [(64,), (128,), (128, 64)], 'activation': ['relu'], 'alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f64aa989390>, 'learning_rate_init': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f63f3b6ef50>, 'batch_size': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7f63f3b6e6d0>, 'beta_1': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f6409282890>, 'beta_2': <scipy.stats._distn_infra



[CV 1/3] END classifier__activation=relu, classifier__alpha=0.005173064246720068, classifier__batch_size=72, classifier__beta_1=0.9723243181897382, classifier__beta_2=0.9747551319052552, classifier__hidden_layer_sizes=(64,), classifier__learning_rate_init=0.0026167424469205583, classifier__validation_fraction=0.1; bal_acc: (train=0.962, test=0.898) f1_macro: (train=0.855, test=0.805) f1_weighted: (train=0.966, test=0.955) total time=19.5min
[CV 2/3] END classifier__activation=relu, classifier__alpha=0.00014414950961304262, classifier__batch_size=68, classifier__beta_1=0.9876111650978556, classifier__beta_2=0.966505184591194, classifier__hidden_layer_sizes=(64,), classifier__learning_rate_init=0.006592703038861593, classifier__validation_fraction=0.15; bal_acc: (train=0.967, test=0.900) f1_macro: (train=0.831, test=0.786) f1_weighted: (train=0.959, test=0.949) total time=19.9min
[CV 1/3] END classifier__activation=relu, classifier__alpha=0.00021108958766794024, classifier__batch_size=90

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'ANN_SMOTETomek'.
2025/11/12 06:43:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ANN_SMOTETomek, version 1
Created version '1' of model 'ANN_SMOTETomek'.
INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_smotetomek_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_smotetomek_outliers_False_metadata.pkl


‚úÖ Logged to MLflow: ANN_SMOTETomek_outliers_False
   Run ID: 8f2baeabcca840c09a8d9366b31b8a3e
   View at: http://mlflow.home.lan/#/experiments/10/runs/8f2baeabcca840c09a8d9366b31b8a3e
üèÉ View run ANN_SMOTETomek_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/8f2baeabcca840c09a8d9366b31b8a3e
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10
Saved model ANN (smotetomek_outliers_False)!
ANN {'estimator': MLPClassifier(early_stopping=True, max_iter=300, random_state=42), 'params': {'hidden_layer_sizes': [(64,), (128,), (128, 64)], 'activation': ['relu'], 'alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f64aa989390>, 'learning_rate_init': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f63f3b6ef50>, 'batch_size': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7f63f3b6e6d0>, 'beta_1': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f6409282890>, 'beta_2': <scipy.stats.



[CV 2/3] END classifier__activation=relu, classifier__alpha=0.004686144343466706, classifier__batch_size=99, classifier__beta_1=0.9321326409573011, classifier__beta_2=0.9953835093531572, classifier__hidden_layer_sizes=(128,), classifier__learning_rate_init=0.006379007635502177, classifier__validation_fraction=0.1; bal_acc: (train=0.970, test=0.919) f1_macro: (train=0.803, test=0.769) f1_weighted: (train=0.947, test=0.940) total time=20.4min
[CV 2/3] END classifier__activation=relu, classifier__alpha=0.002080786535526122, classifier__batch_size=104, classifier__beta_1=0.9177233134008086, classifier__beta_2=0.9966200607077298, classifier__hidden_layer_sizes=(128, 64), classifier__learning_rate_init=0.0095751091158059, classifier__validation_fraction=0.1; bal_acc: (train=0.974, test=0.917) f1_macro: (train=0.865, test=0.823) f1_weighted: (train=0.963, test=0.954) total time=21.3min
[CV 1/3] END classifier__activation=relu, classifier__alpha=0.002080786535526122, classifier__batch_size=104

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
Successfully registered model 'ANN_SMOTEENN'.
2025/11/12 11:15:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ANN_SMOTEENN, version 1
Created version '1' of model 'ANN_SMOTEENN'.


‚úÖ Logged to MLflow: ANN_SMOTEENN_outliers_False
   Run ID: 8199c4e520384ca5bc2a9534cd6a61f9
   View at: http://mlflow.home.lan/#/experiments/10/runs/8199c4e520384ca5bc2a9534cd6a61f9
üèÉ View run ANN_SMOTEENN_outliers_False at: http://mlflow.home.lan/#/experiments/10/runs/8199c4e520384ca5bc2a9534cd6a61f9
üß™ View experiment at: http://mlflow.home.lan/#/experiments/10


INFO:src.utils.model_saver:Model saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_smoteenn_outliers_False.joblib
INFO:src.utils.model_saver:Metadata saved: src/models/MIT_02_02_baseline_models_randomized_search_sampling/ANN_smoteenn_outliers_False_metadata.pkl


Saved model ANN (smoteenn_outliers_False)!
