# 4.3 Refit best model and transfer test on PTB dataset




## 1. Imports

In [None]:
import os
from pathlib import Path
import json
import warnings
from typing import Dict, List, Optional, Tuple, Union

# Print the working directory (the `src` package imported below must be importable from it)
print(os.getcwd())

import pandas as pd
import numpy as np

# ML libraries
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, confusion_matrix
)
from sklearn import set_config

# Models
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# Sampling
from imblearn.pipeline import Pipeline


# Custom utilities
from src.utils.preprocessing import (
    load_processed_dataset,
    DatasetSplit,
    build_full_suffix as pp_build_full_suffix,
    generate_all_processed_datasets,
    _normalize_sampling_method_name,
    _SAMPLING_REGISTRY
)
from src.utils.evaluation import eval_model
from src.utils.model_saver import create_model_saver

import numpy as np
from typing import Optional, Dict, Union, Tuple
from scipy.signal import butter, filtfilt, medfilt
import pywt
import os, json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from src.utils.model_saver import create_model_saver
from src.utils.preprocessing import (
    load_processed_dataset,
    build_full_suffix as pp_build_full_suffix,   # if not exported, import as in your notebook (pp_build_full_suffix alias)
    generate_all_processed_datasets,
)



# Alias for inputs that may be given either as a NumPy array or as a plain list.
ArrayLike = Union[np.ndarray, list]

# Silence every warning so the notebook output stays readable.
warnings.filterwarnings("ignore")

# Directory holding the processed MIT-BIH data (relative path).
DATA_DIR = "data/processed/mitbih"

## 2. Refit best Model from part 4

### XGBoost, SMOTE, no outlier removal, no feature engineering

Accuracy:   0.9834

F1-Macro:   0.9148

CV-Score:   0.9134


F1 Class1:  0.99

F1 Class2:  0.83

F1 Class3:  0.96

F1 Class4:  0.81

F1 Class5:  0.99

params: {"classifier__colsample_bytree": 0.9,"classifier__gamma": 0.0,"classifier__learning_rate": 0.2,"classifier__max_depth": 9,"classifier__min_child_weight": 5,"classifier__n_estimators": 250,"classifier__reg_alpha": 0.2,"classifier__reg_lambda": 0.05,"classifier__subsample": 0.7}

In [None]:
# Best configuration found in part 4 (see the result summary above).
# The keys already carry the ``classifier__`` prefix, i.e. they address the
# pipeline step named "classifier".
params = {
    "classifier__colsample_bytree": 0.9,
    "classifier__gamma": 0.0,
    "classifier__learning_rate": 0.2,
    "classifier__max_depth": 9,
    "classifier__min_child_weight": 5,
    "classifier__n_estimators": 250,
    "classifier__reg_alpha": 0.2,
    "classifier__reg_lambda": 0.05,
    "classifier__subsample": 0.7,
}
sampling_method = "SMOTE"
model_name = "XGBoost"
remove_outliers = False

# The pipeline needs an (unfitted) estimator instance as its classifier step,
# not the model's name.
# NOTE(review): constructor arguments other than the tuned hyper-parameters
# (e.g. random_state) are not visible from part 4 here -- confirm they match.
estimator = xgb.XGBClassifier(random_state=42)

In [None]:
# Notebook helper: render a pipeline as a diagram in the cell output.
def show_pipeline_diagram(pipe: Pipeline) -> None:
    """Show ``pipe`` as a scikit-learn diagram in the notebook output.

    Sets scikit-learn's display configuration to ``"diagram"`` and then passes
    the pipeline to the notebook's ``display`` function.
    """
    set_config(display="diagram")
    display(pipe)  # `display` is provided by the Jupyter/IPython session


def create_leak_free_pipeline(
    model_name: str,
    estimator,
    sampling_method: Optional[str] = "none",
    sampler_kwargs: Optional[Dict] = None,
    random_state: Optional[int] = 42,
) -> Pipeline:
    """
    Build a leak-free pipeline consisting of an optional sampler and a classifier.

    The sampler is a step of an ``imblearn`` ``Pipeline``, so resampling is
    applied as part of fitting the pipeline (i.e. inside each CV fold, on the
    training portion only) instead of being applied to the data beforehand.

    Args:
        model_name: Name of the model. Not used when building the pipeline;
            kept so existing call sites continue to work.
        estimator: Classifier instance used as the final ``"classifier"`` step.
        sampling_method: Name of the sampling method; normalised with
            ``_normalize_sampling_method_name`` and looked up in
            ``_SAMPLING_REGISTRY``.
        sampler_kwargs: Extra keyword arguments for the sampler constructor.
            The passed dict is copied, never mutated.
        random_state: Injected into the sampler arguments as ``random_state``
            unless ``sampler_kwargs`` already sets it; ``None`` disables the
            injection.

    Returns:
        The assembled (unfitted) pipeline with steps ``"sampler"`` (when a
        sampler class is registered for the method) and ``"classifier"``.

    Raises:
        KeyError: If the normalised sampling method is not in the registry.
    """
    internal_name = _normalize_sampling_method_name(sampling_method)
    sampler_cls = _SAMPLING_REGISTRY[internal_name]

    steps = []

    # NOTE(review): the registry contents are not visible here. If it maps the
    # "no sampling" method to None, skip the sampler step instead of failing
    # with "'NoneType' object is not callable" -- confirm against the registry.
    if sampler_cls is not None:
        sampler_args = dict(sampler_kwargs or {})
        # Provide a default random_state to samplers if not overridden
        if random_state is not None:
            sampler_args.setdefault("random_state", random_state)
        steps.append(("sampler", sampler_cls(**sampler_args)))

    steps.append(("classifier", estimator))
    display(steps)  # show the assembled steps in the notebook output
    return Pipeline(steps)


def prepare_dataset_with_sampling(
    data_dir: str = DATA_DIR,
    sampling_method: str = "No_Sampling",
    remove_outliers: bool = False
) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray, Optional[np.ndarray]]:
    """Load the processed train/validation data for one configuration.

    ``generate_all_processed_datasets`` is called first with
    ``only_once=True`` (expected to be a no-op when the datasets already
    exist). The dataset matching ``sampling_method`` and ``remove_outliers``
    is then loaded and its ``.values`` are returned.

    Returns:
        ``(X_train, X_val, y_train, y_val)``. The validation entries are
        ``None`` when the loaded split has no validation part.
    """
    generate_all_processed_datasets(data_dir=data_dir, only_once=True)

    suffix = pp_build_full_suffix(sampling_method, remove_outliers)
    dataset = load_processed_dataset(data_dir=data_dir, sampling_suffix=suffix)

    def _values_or_none(frame):
        # The validation part of a split may be absent.
        return None if frame is None else frame.values

    train_features = dataset.X_train.values
    train_labels = dataset.y_train.values
    val_features = _values_or_none(dataset.X_val)
    val_labels = _values_or_none(dataset.y_val)

    return train_features, val_features, train_labels, val_labels



# ---------------------------------------------------------------------------
# Refit the best configuration from part 4 on the training split and evaluate
# it on the validation split. No grid search is run here: the tuned
# hyper-parameters in `params` are applied to the pipeline directly.
# ---------------------------------------------------------------------------
print(f"\n{'='*80}")
print(f"Refitting {model_name} with {sampling_method}")
print(f"Outlier removal: {remove_outliers}")
print(f"{'='*80}")


# Prepare data: load the non-resampled split; resampling is applied inside the
# pipeline when it is fitted.
X_train, X_val, y_train, y_val = prepare_dataset_with_sampling(
    sampling_method="No_Sampling",
    remove_outliers=remove_outliers,
)
if X_val is None or y_val is None:
    raise ValueError("The processed dataset has no validation split to evaluate on.")

# The classifier step must be an estimator instance. If the configuration cell
# only holds the model's name, build the XGBoost classifier here.
classifier = xgb.XGBClassifier(random_state=42) if isinstance(estimator, str) else estimator

# Create leak-free pipeline
pipeline = create_leak_free_pipeline(model_name, classifier, sampling_method)

# Address the parameters to the "classifier" step. Keys that already carry the
# prefix are kept as they are so that they are not prefixed twice.
pipeline_params = {
    (name if name.startswith("classifier__") else f"classifier__{name}"): value
    for name, value in params.items()
}

# Create experiment name
experiment_name = f"{sampling_method.lower()}_outliers_{remove_outliers}"

# CV score reported for this configuration in part 4; it is not recomputed here.
best_cv_score = 0.9134

# Apply the tuned hyper-parameters and refit on the full training split.
model = pipeline.set_params(**pipeline_params)
model.fit(X_train, y_train)

# Get predictions
y_pred = model.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_val, y_pred, average='macro', zero_division=0
)

# Per-class metrics; the label set is taken from train and validation together
# so that every class gets an entry.
labels = np.unique(np.concatenate([y_train, y_val]))
precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(
    y_val, y_pred, average=None, labels=labels, zero_division=0
)

confusion_mat = confusion_matrix(y_val, y_pred, labels=labels)

results = {
    'model_name': model_name,
    'sampling_method': sampling_method,
    'remove_outliers': remove_outliers,
    'best_cv_score': best_cv_score,
    'best_params': pipeline_params,
    'validation_accuracy': accuracy,
    'validation_f1_macro': f1_macro,
    'validation_precision_macro': precision_macro,
    'validation_recall_macro': recall_macro,
    'validation_f1_per_class': f1_per_class,
    'validation_precision_per_class': precision_per_class,
    'validation_recall_per_class': recall_per_class,
    'validation_support_per_class': support_per_class,
    'confusion_matrix': confusion_mat,
    'labels': labels,
}

print(f"Validation F1-Macro: {f1_macro:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

# Row for the results CSV (the column names follow the existing results file;
# the metrics are computed on the validation split).
row = {
    'sampling_method': sampling_method,
    'outliers_removed': remove_outliers,
    'model': model_name,
    'test_accuracy': round(float(accuracy), 4),
    'test_f1_macro': round(float(f1_macro), 4),
    'best_cv_score': round(float(best_cv_score), 4),
    'best_parameters': json.dumps(pipeline_params),
}
# Add per-class F1 columns
for lbl, f1 in zip(labels, f1_per_class):
    row[f'test_f1_cls_{lbl}'] = round(float(f1), 2)

# Writing the row is disabled: `results_csv` is not defined in this notebook.
#os.makedirs(os.path.dirname(results_csv), exist_ok=True)
#header = not os.path.exists(results_csv)
#pd.DataFrame([row]).to_csv(results_csv, mode='a', index=False, header=header)

