In [None]:

import sys
import os
import pandas as pd
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))
from utils.utils import save_experiment, train_and_evaluate_logistic_regression, train_and_evaluate_linear_svm, train_and_evaluate_non_linear_svm, train_and_evaluate_decision_tree, train_and_evaluate_random_forest, train_and_evaluate_xgboost
from configs.config import DATASET_PATH, FEATURES_DIR, ITW_DATASET_PATH, MODELS_PATH

from utils.utils import grid_search_joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np


import matplotlib.pyplot as plt
import joblib


### Parquet paths

In [None]:
# Feature tables used by the grid-search cells below.
# NOTE: the variable names describe the ROLE a file is passed in, not the file
# itself: `val_data_path` points at the "testing_features" parquet and
# `test_data_path` at the ITW (loudness-normalized) parquet.
train_data_path = os.path.join(FEATURES_DIR, "training_features_mean_20_128_256_128.parquet")
val_data_path = os.path.join(FEATURES_DIR, "testing_features_mean_20_128_256_128.parquet")
test_data_path = os.path.join(ITW_DATASET_PATH, 'normalized_features', "itw_features_mean_20_128_256_128_trimmed_loudness_normalized.parquet")

# Same three roles for the feature set without mel features.
# NOTE(review): naming differs from the block above — here the "testing" file
# is called `test_data_path_no_mel` (it is passed as val_path by the no-mel
# XGBoost cell) and the ITW file is called `itw_data_path_no_mel`.
train_data_path_no_mel = os.path.join(FEATURES_DIR, "training_features_mean_20_128_256_128_no_mel.parquet")
test_data_path_no_mel = os.path.join(FEATURES_DIR, "testing_features_mean_20_128_256_128_no_mel.parquet")
itw_data_path_no_mel = os.path.join(ITW_DATASET_PATH, 'normalized_features', "itw_features_mean_20_128_256_128_no_mel_trimmed_loudness_normalized.parquet")

### Logistic Regression

In [None]:
# Logistic regression on standardized features; class 1 is weighted 5x relative
# to class 0. The estimator itself runs with n_jobs=1 while the search call
# below is given n_jobs=20.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        class_weight={0: 1, 1: 5},
        random_state=42,
        n_jobs=1,
    )),
])

# A single solver ("saga") is searched together with both l1 and l2 penalties.
param_grid = {
    "clf__solver": ["saga"],
    "clf__penalty": ["l1", "l2"],
    "clf__C": np.logspace(-3, 2, 10),  # 10 log-spaced values from 1e-3 to 1e2
    "clf__max_iter": [1000],
}

# The three paths are passed in the order train / validation / test.
(
    final_model,
    test_metrics,
    val_metrics,
    best_params,
    val_results,
    metadata,
    feature_names
) = grid_search_joblib(
    model,
    param_grid,
    train_data_path,
    val_data_path,
    test_data_path,
    n_jobs=20
)

# Fixed the misspelled "Resluts" label in the printed report.
print(f'Best parameters:{best_params}')
print(f'Results on validation data:{val_metrics}')
print(f'Results on test data:{test_metrics}')

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/logistic_reg.
save_experiment(
    model=final_model,
    metrics=test_metrics,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "logistic_reg"),
    model_params=best_params,
    feature_names=feature_names,
    metadata_extra=metadata,
    val_results=val_results,
)


### Linear SVM

In [None]:
# Linear SVM on standardized features; class 1 is weighted 5x relative to
# class 0 and the solver is allowed up to 20000 iterations.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", LinearSVC(max_iter=20000, class_weight={0: 1, 1: 5}, random_state=42)),
])

# Only the regularization strength is searched, one value per decade.
param_grid = {
    "svm__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

# The three paths are passed in the order train / validation / test.
(
    final_model,
    test_metrics,
    val_metrics,
    best_params,
    val_results,
    metadata,
    feature_names
) = grid_search_joblib(
    model,
    param_grid,
    train_data_path,
    val_data_path,
    test_data_path,
    n_jobs=20
)

# Fixed the misspelled "Resluts" label in the printed report.
print(f'Best parameters:{best_params}')
print(f'Results on validation data:{val_metrics}')
print(f'Results on test data:{test_metrics}')

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/linear_svm.
save_experiment(
    model=final_model,
    metrics=test_metrics,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "linear_svm"),
    model_params=best_params,
    feature_names=feature_names,
    metadata_extra=metadata,
    val_results=val_results,
)

### RBF SVM

In [None]:
# Kernel SVM on standardized features; the kernel itself is chosen through the
# grid below. Class 1 is weighted 5x relative to class 0 and the kernel cache
# is set to 2000 (cache_size).
model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(
        class_weight={0: 1, 1: 5},
        cache_size=2000,
        random_state=42
    )),
])

# RBF kernel: search C and gamma only.
param_grid_rbf = {
    "svm__kernel": ["rbf"],
    "svm__C": [0.1, 1, 10, 100],
    "svm__gamma": ["scale", 0.01, 0.1],
}

# The three paths are passed in the order train / validation / test.
(
    final_model_rbf,
    test_metrics_rbf,
    val_metrics_rbf,
    best_params_rbf,
    val_results_rbf,
    metadata_rbf,
    feature_names_rbf
) = grid_search_joblib(
    model,
    param_grid_rbf,
    train_data_path,
    val_data_path,
    test_data_path,
    n_jobs=20
)

# Fixed the misspelled "Resluts" label in the printed report.
print(f'Best parameters:{best_params_rbf}')
print(f'Results on validation data:{val_metrics_rbf}')
print(f'Results on test data:{test_metrics_rbf}')

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/rbf_svm.
save_experiment(
    model=final_model_rbf,
    metrics=test_metrics_rbf,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "rbf_svm"),
    model_params=best_params_rbf,
    feature_names=feature_names_rbf,
    metadata_extra=metadata_rbf,
    val_results=val_results_rbf,
)

### Poly SVM

In [None]:
# Kernel SVM on standardized features; the kernel itself is chosen through the
# grid below. Class 1 is weighted 5x relative to class 0 and the kernel cache
# is set to 2000 (cache_size).
model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(
        class_weight={0: 1, 1: 5},
        cache_size=2000,
        random_state=42
    )),
])

# Polynomial kernel: search degree, C, gamma and the independent term coef0.
param_grid_poly = {
    "svm__kernel": ["poly"],
    "svm__degree": [2, 3],
    "svm__C": [0.1, 1, 10],
    "svm__gamma": ["scale", 0.01],
    "svm__coef0": [0.0, 1.0],
}

# The three paths are passed in the order train / validation / test.
(
    final_model_poly,
    test_metrics_poly,
    val_metrics_poly,
    best_params_poly,
    val_results_poly,
    metadata_poly,
    feature_names_poly
) = grid_search_joblib(
    model,
    param_grid_poly,
    train_data_path,
    val_data_path,
    test_data_path,
    n_jobs=20
)

# Fixed the misspelled "Resluts" label in the printed report.
print(f'Best parameters:{best_params_poly}')
print(f'Results on validation data:{val_metrics_poly}')
print(f'Results on test data:{test_metrics_poly}')

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/poly_svm.
save_experiment(
    model=final_model_poly,
    metrics=test_metrics_poly,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "poly_svm"),
    model_params=best_params_poly,
    feature_names=feature_names_poly,
    metadata_extra=metadata_poly,
    val_results=val_results_poly,
)

### Sigmoid Kernel

In [None]:
# Kernel SVM on standardized features; the kernel itself is chosen through the
# grid below. Class 1 is weighted 5x relative to class 0 and the kernel cache
# is set to 2000 (cache_size).
model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(
        class_weight={0: 1, 1: 5},
        cache_size=2000,
        random_state=42
    )),
])

# Sigmoid kernel: search C, gamma and the independent term coef0.
param_grid_sigmoid = {
    "svm__kernel": ["sigmoid"],
    "svm__C": [0.01, 0.1, 1],
    "svm__gamma": ["scale", 0.01],
    "svm__coef0": [-1.0, 0.0, 1.0],
}

# The three paths are passed in the order train / validation / test.
(
    final_model_sigmoid,
    test_metrics_sigmoid,
    val_metrics_sigmoid,
    best_params_sigmoid,
    val_results_sigmoid,
    metadata_sigmoid,
    feature_names_sigmoid
) = grid_search_joblib(
    model,
    param_grid_sigmoid,
    train_data_path,
    val_data_path,
    test_data_path,
    n_jobs=20
)

# Fixed the misspelled "Resluts" label in the printed report.
print(f'Best parameters:{best_params_sigmoid}')
print(f'Results on validation data:{val_metrics_sigmoid}')
print(f'Results on test data:{test_metrics_sigmoid}')

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/sigmoid_svm.
save_experiment(
    model=final_model_sigmoid,
    metrics=test_metrics_sigmoid,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "sigmoid_svm"),
    model_params=best_params_sigmoid,
    feature_names=feature_names_sigmoid,
    metadata_extra=metadata_sigmoid,
    val_results=val_results_sigmoid,
)

### Decision Tree

In [None]:
# Plain decision tree (no scaler or imputer step), so the grid keys carry no
# pipeline prefix.
model = DecisionTreeClassifier(random_state=42)

# Depth, split/leaf sizes, class weighting, split criterion and cost-complexity
# pruning strength are all searched.
params = {
    "max_depth": list(range(5, 20)),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": [{0: 1, 1: 5}, None],
    "criterion": ["gini", "entropy"],
    "ccp_alpha": [0.0, 1e-4, 1e-3, 1e-2],
}


# This is the only search cell that passes an explicit scoring ("f1_macro").
(
    final_model,
    test_metrics,
    val_metrics,
    best_params,
    val_results,
    metadata,
    feature_names
) = grid_search_joblib(
    model=model,
    param_grid=params,
    train_path=train_data_path,
    val_path=val_data_path,
    test_path=test_data_path,
    scoring="f1_macro",
    verbose=1,
    n_jobs=-1
)

# Persist the SELECTED hyperparameters. The original passed `params` (the whole
# search grid) here, unlike every other model cell, so model_params.json did
# not describe the saved model.
save_experiment(
    model=final_model,
    metrics=test_metrics,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "Dtree"),
    model_params=best_params,
    feature_names=feature_names,
    metadata_extra=metadata,
    val_results=val_results,
)
print(metadata)
print(test_metrics)

### Random Forest

In [None]:

from sklearn.impute import SimpleImputer

# Random forest behind a median imputer.
base_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("rf", RandomForestClassifier(random_state=42)),
])

# Search space, written without the step prefix; "rf__" is prepended to every
# key so the grid addresses the forest step of the pipeline.
params = {
    f"rf__{name}": candidates
    for name, candidates in {
        "n_estimators": [300, 500, 700, 800],
        "max_depth": [2, 3, 4, 5],
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [5, 10, 20],
        "min_samples_leaf": [2, 5, 10],
        "max_samples": [0.5, 0.6, 0.7],
        "class_weight": [{0: 1, 1: 5}, None],
    }.items()
}

# Run the search on all cores with verbose progress output.
final_model, test_metrics, val_metrics, best_params, val_results, metadata, feature_names = grid_search_joblib(
    model=base_pipe,
    param_grid=params,
    train_path=train_data_path,
    val_path=val_data_path,
    test_path=test_data_path,
    verbose=2,
    n_jobs=-1,
)

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/RF.
save_experiment(
    model=final_model,
    metrics=test_metrics,
    model_params=best_params,
    feature_names=feature_names,
    metadata_extra=metadata,
    val_results=val_results,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "RF"),
)

print("Metadata:", metadata)
print("Metrics:", test_metrics)

### XGBoost

#### XGBoost with mel features


In [None]:
# SimpleImputer is otherwise only imported inside the Random Forest cell;
# importing it here lets this cell run on a fresh kernel without that cell.
from sklearn.impute import SimpleImputer

# XGBoost behind a median imputer, evaluated with the 'aucpr' metric.
base_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('xgb', XGBClassifier(eval_metric='aucpr', random_state=42))
])

params = {
    # CORE ANTI-OVERFITTING: Very shallow trees
    "xgb__max_depth": [3, 4, 2],  # Much shallower than DTree's 10

    # LEARNING: Slow and steady
    "xgb__learning_rate": [0.03, 0.05],
    "xgb__n_estimators": [700, 800],  # More trees, lower LR

    # REGULARIZATION: Aggressive to prevent dataset artifacts
    "xgb__min_child_weight": [10, 20],  # Higher than default
    "xgb__gamma": [0.3],  # Minimum loss reduction
    "xgb__reg_lambda": [2, 1],  # L2 regularization
    "xgb__reg_alpha": [0.5],  # L1 regularization

    # SAMPLING: Reduce correlation and overfitting
    "xgb__subsample": [0.6, 0.7, 0.5],  # Row sampling
    "xgb__colsample_bytree": [0.6, 0.7],  # Feature sampling

    # CLASS IMBALANCE: More moderate than 1:5
    "xgb__scale_pos_weight": [2],  # or calculate actual ratio

    # Fixed params
    "xgb__objective": ["binary:logistic"],
}

# Run the search on all cores with verbose progress output.
(
    final_model,
    test_metrics,
    val_metrics,
    best_params,
    val_results,
    metadata,
    feature_names
) = grid_search_joblib(
    model=base_pipe,
    param_grid=params,
    train_path=train_data_path,
    val_path=val_data_path,
    test_path=test_data_path,
    n_jobs=-1,
    verbose=2
)

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/XGB.
save_experiment(
    model=final_model,
    metrics=test_metrics,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "XGB"),
    model_params=best_params,
    feature_names=feature_names,
    metadata_extra=metadata,
    val_results=val_results,
)

print("Metadata:", metadata)
print("Metrics:", test_metrics)

#### XGBoost without mel features

In [None]:
# SimpleImputer is otherwise only imported inside the Random Forest cell;
# importing it here lets this cell run on a fresh kernel without that cell.
from sklearn.impute import SimpleImputer

# XGBoost behind a median imputer, evaluated with the 'aucpr' metric.
base_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('xgb', XGBClassifier(eval_metric='aucpr', random_state=42))
])

# Every key carries the "xgb__" step prefix so it addresses the classifier
# inside the pipeline.
# NOTE(review): 10 parameters with 3 candidates each is 3**10 = 59,049
# combinations — confirm that grid_search_joblib is expected to cover all of them.
param_grid = {
    # CORE ANTI-OVERFITTING: Very shallow trees
    "xgb__max_depth": [3, 4, 5],

    # LEARNING: Slow and steady
    "xgb__learning_rate": [0.01, 0.03, 0.05],
    "xgb__n_estimators": [300, 500, 700],

    # REGULARIZATION: Aggressive to prevent dataset artifacts
    "xgb__min_child_weight": [5, 10, 20],
    "xgb__gamma": [0.1, 0.3, 0.5],
    "xgb__reg_lambda": [2, 5, 10],  # L2 regularization
    "xgb__reg_alpha": [0, 0.5, 1],  # L1 regularization

    # SAMPLING: Reduce correlation and overfitting
    "xgb__subsample": [0.6, 0.7, 0.8],
    "xgb__colsample_bytree": [0.6, 0.7, 0.8],

    # CLASS IMBALANCE: More moderate than 1:5
    "xgb__scale_pos_weight": [1, 2, 3],

    # random_state is already fixed on the XGBClassifier above.
}


# The no-mel "testing" file is passed as validation data and the no-mel ITW
# file as test data.
(
    final_model,
    test_metrics,
    val_metrics,
    best_params,
    val_results,
    metadata,
    feature_names
) = grid_search_joblib(
    model=base_pipe,
    param_grid=param_grid,
    train_path=train_data_path_no_mel,
    val_path=test_data_path_no_mel,
    test_path=itw_data_path_no_mel,
    n_jobs=-1,
    verbose=2
)

# Store the fitted model, its test metrics and the per-candidate validation
# results under <sys.path[0]>/notebooks/experiments/final/XGB_NO_MEL.
save_experiment(
    model=final_model,
    metrics=test_metrics,
    experiment_dir=os.path.join(sys.path[0], "notebooks", "experiments", "final", "XGB_NO_MEL"),
    model_params=best_params,
    feature_names=feature_names,
    metadata_extra=metadata,
    val_results=val_results,
)

print("Metadata:", metadata)
print("Metrics:", test_metrics)

### Retrain Final Models on FoR (Training → Test)

For each classifier, we load the best hyperparameters from the final experiment's `model_params.json` and retrain the model using the FoR **training** set. The model is then evaluated on the FoR **test** set so that confusion matrices and ROC curves remain available.

In [None]:
import json

# FoR splits for the retraining pass.
# NOTE: this rebinds `val_data_path` and `test_data_path`, which the grid-search
# cells above define as the FoR "testing" file and the ITW file respectively.
# Re-running a grid-search cell after this cell (without re-running the
# "Parquet paths" cell first) will therefore use different datasets.
train_data_path = os.path.join(FEATURES_DIR, "training_features_mean_20_128_256_128.parquet")
val_data_path = os.path.join(FEATURES_DIR, "validation_features_mean_20_128_256_128.parquet")
test_data_path = os.path.join(FEATURES_DIR, "testing_features_mean_20_128_256_128.parquet")

train_data_path_no_mel = os.path.join(FEATURES_DIR, "training_features_mean_20_128_256_128_no_mel.parquet")
val_data_path_no_mel = os.path.join(FEATURES_DIR, "validation_features_mean_20_128_256_128_no_mel.parquet")
test_data_path_no_mel = os.path.join(FEATURES_DIR, "testing_features_mean_20_128_256_128_no_mel.parquet")

# The no-mel validation table is only written by the final cell of this
# notebook. Build it here when it is missing (same rule: drop every column
# whose name starts with "mel_spectrogram") so a top-to-bottom run does not
# fail on the no-mel XGBoost retraining.
if not os.path.exists(val_data_path_no_mel):
    _val_features = pd.read_parquet(val_data_path)
    _val_features.loc[
        :, ~_val_features.columns.str.startswith("mel_spectrogram")
    ].to_parquet(val_data_path_no_mel)
    del _val_features

# Both directories are relative to the current working directory.
FINAL_MODELS_PATH = os.path.join("experiments", "final")
FOR_MODELS_PATH = os.path.join("experiments", "FoR")

# ── Helper: strip pipeline prefixes from param keys ──
def strip_prefix(params_dict, prefixes=("clf__", "svm__", "rf__", "xgb__")):
    """Return a copy of ``params_dict`` with pipeline step prefixes removed from its keys.

    Args:
        params_dict: Mapping of parameter name to value, e.g. ``{"svm__C": 1}``.
        prefixes: Step prefixes to remove. The default covers the step names
            used by the pipelines in this notebook; pass another tuple to
            handle other step names.

    Returns:
        A new dict with the same values. For each key, the first entry of
        ``prefixes`` that the key starts with is removed (at most once); keys
        that match no prefix are copied unchanged. The input is not modified.
    """
    stripped = {}
    for key, value in params_dict.items():
        for prefix in prefixes:
            if key.startswith(prefix):
                key = key[len(prefix):]
                break  # strip a single prefix only
        stripped[key] = value
    return stripped

def fix_class_weight(params):
    """Turn string class labels in ``params["class_weight"]`` back into ints.

    JSON object keys are always strings, so a ``{0: 1, 1: 5}`` weight mapping
    is read back as ``{"0": 1, "1": 5}``. When ``class_weight`` is a dict its
    keys are converted with ``int``; any other value (or a missing key) is left
    untouched. ``params`` is updated in place and also returned.
    """
    weights = params.get("class_weight")
    if not isinstance(weights, dict):
        return params
    params["class_weight"] = dict((int(label), weight) for label, weight in weights.items())
    return params

# ── Registry of models to retrain ──
# Each entry is a 5-tuple:
#   model_name  – label used in printed output and as key of `trained_models`
#   subdir      – experiment folder below FINAL_MODELS_PATH to load params from
#   save_subdir – folder below FOR_MODELS_PATH to save the retrained model to
#   model_type  – selects the training branch in the loop below
#   uses_no_mel – True to train on the feature files without mel features
MODELS = [
    ("Logistic Regression", "logistic_reg/exp_20260207_192945", "logistic_reg", "lr", False),
    ("Linear SVM", "linear_svm/exp_20260207_193304", "linear_svm", "linear_svm", False),
    ("RBF SVM", "rbf_svm/exp_20260207_200752", "rbf_svm", "nl_svm", False),
    ("Poly SVM", "poly_svm/exp_20260207_201742", "poly_svm", "nl_svm", False),
    ("Sigmoid SVM", "sigmoid_svm/exp_20260207_204305", "sigmoid_svm", "nl_svm", False),
    ("Decision Tree", "Dtree/exp_20260207_210558", "Dtree", "dtree", False),
    ("Random Forest", "RF/exp_20260208_163746", "RF", "rf", False),
    ("XGBoost", "XGB/exp_20260208_160623", "XGB", "xgb", False),
    ("XGBoost (no mel)", "XGB_NO_MEL/exp_20260208_034900", "XGB_NO_MEL", "xgb", True),
]

# ── Train each model with best params and save ──
# For every registry entry: load the stored hyperparameters, retrain on the
# FoR training split, evaluate on the FoR test split and save the result.
# `trained_models` maps model_name -> {"pipeline": ..., "metrics": ...}.
trained_models = {}

for model_name, subdir, save_subdir, model_type, no_mel in MODELS:
    print(f"\n{'='*60}")
    print(f"  {model_name}")
    print(f"{'='*60}")

    exp_dir = os.path.join(FINAL_MODELS_PATH, subdir)

    # Select data paths
    if no_mel:
        t_train, t_test, t_val = train_data_path_no_mel, test_data_path_no_mel, val_data_path_no_mel
    else:
        t_train, t_test, t_val = train_data_path, test_data_path, val_data_path

    # Load model_params.json
    with open(os.path.join(exp_dir, "model_params.json")) as f:
        raw_params = json.load(f)

    # Decision Tree: model_params.json has the grid, not best params, so the
    # best candidate is taken from val_results.json (highest selection_score).
    if model_type == "dtree":
        with open(os.path.join(exp_dir, "val_results.json")) as f:
            # Kept in a loop-local name so the notebook-level `val_results`
            # produced by the grid-search cells is not overwritten.
            dtree_val_results = json.load(f)
        best_entry = max(dtree_val_results, key=lambda entry: entry["selection_score"])
        params = strip_prefix(best_entry["params"])
    else:
        params = strip_prefix(raw_params)

    params = fix_class_weight(params)
    print(f"Params: {params}")

    # ── Train & evaluate ──
    # setdefault only fills in values missing from the stored parameters.
    if model_type == "lr":
        params.setdefault("class_weight", {0:1, 1:5})
        params.setdefault("random_state", 42)
        pipeline, metrics, used_params, features, meta = train_and_evaluate_logistic_regression(
            t_train, t_test, lr_params=params
        )

    elif model_type == "linear_svm":
        params.setdefault("class_weight", {0:1, 1:5})
        params.setdefault("max_iter", 20000)
        params.setdefault("random_state", 42)
        pipeline, metrics, used_params, features, meta = train_and_evaluate_linear_svm(
            t_train, t_test, svc_params=params
        )

    elif model_type == "nl_svm":
        # NOTE(review): the SVC pipelines in the grid-search cells set
        # cache_size=2000 and no max_iter; here max_iter defaults to 20000 and
        # cache_size is not set — confirm the iteration cap is intended.
        params.setdefault("class_weight", {0:1, 1:5})
        params.setdefault("max_iter", 20000)
        params.setdefault("random_state", 42)
        pipeline, metrics, used_params, features, meta = train_and_evaluate_non_linear_svm(
            t_train, t_test, svc_params=params
        )

    elif model_type == "dtree":
        # criterion is passed as its own argument rather than inside dt_params.
        criterion = params.pop("criterion", "gini")
        params.setdefault("random_state", 42)
        pipeline, metrics, used_params, features, meta = train_and_evaluate_decision_tree(
            t_train, test_path=t_test, dt_params=params, criterion=criterion
        )

    elif model_type == "rf":
        params.setdefault("random_state", 42)
        params.setdefault("n_jobs", -1)
        # The random-forest helper returns a sixth value, which is not used here.
        pipeline, metrics, used_params, features, meta, _oob = train_and_evaluate_random_forest(
            t_train, val_path=t_val, test_path=t_test, rf_params=params
        )

    elif model_type == "xgb":
        # NOTE(review): unlike the other branches no random_state default is
        # set here — confirm train_and_evaluate_xgboost fixes the seed itself.
        pipeline, metrics, used_params, features, meta = train_and_evaluate_xgboost(
            t_train, val_path=t_val, test_path=t_test, xgb_params=params
        )

    else:
        # Without this branch an unknown type would silently reuse the previous
        # iteration's pipeline/metrics and save them under the wrong name.
        raise ValueError(f"Unknown model_type {model_type!r} for model {model_name!r}")

    trained_models[model_name] = {"pipeline": pipeline, "metrics": metrics}
    print(f"  Accuracy: {metrics['accuracy']:.4f}  |  F1: {metrics['f1']:.4f}  |  ROC AUC: {metrics['roc_auc']:.4f}")

    # ── Save experiment under FOR_MODELS_PATH/<save_subdir> ──
    save_experiment(
        model=pipeline,
        metrics=metrics,
        experiment_dir=os.path.join(FOR_MODELS_PATH, save_subdir),
        model_params=used_params,
        feature_names=features,
        metadata_extra=meta,
    )

# ── Summary table ──
# One fixed-width row per retrained model: the name is left-aligned in 25
# characters, each metric right-aligned in 10 characters with 4 decimals.
banner = "=" * 90
print("\n\n" + banner)
print("  SUMMARY: FoR Test Set Results (retrained with best params)")
print(banner)
print(f"{'Model':<25} " + " ".join(f"{title:>10}" for title in ("Accuracy", "Precision", "Recall", "F1", "ROC AUC")))
print("-" * 90)
for name, data in trained_models.items():
    m = data["metrics"]
    scores = " ".join(f"{m[key]:>10.4f}" for key in ("accuracy", "precision", "recall", "f1", "roc_auc"))
    print(f"{name:<25} {scores}")


In [None]:
# Derive the no-mel FoR validation table from the full validation table by
# dropping every column whose name starts with "mel_spectrogram".
# NOTE: the retraining cell above reads this output file for the no-mel
# XGBoost model, so the file has to exist before that cell is run.
val = os.path.join(FEATURES_DIR, "validation_features_mean_20_128_256_128.parquet")
save_no_mel_data_path = os.path.join(FEATURES_DIR, "validation_features_mean_20_128_256_128_no_mel.parquet")

for_val = pd.read_parquet(val)
is_mel_column = for_val.columns.str.startswith("mel_spectrogram")
for_no_mel = for_val.loc[:, ~is_mel_column]

for_no_mel.to_parquet(save_no_mel_data_path)

