# Hyperparameter Search Notebook

## Setup and configuration

In [None]:
import time
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from Parser import Parser
from IBL import IBL
from preprocessing_types import (
    NormalizationStrategy, EncodingStrategy,
    MissingValuesNumericStrategy, MissingValuesCategoricalStrategy
)

# Directory handed to Parser as `base_path` when loading a dataset.
BASE = "../datasetsCBR/datasetsCBR"
# Number of splits requested from Parser; run_suite also iterates folds 0..NUM_SPLITS-1.
NUM_SPLITS = 10

# Search grid: every combination of k, vote and retention is evaluated per fold and metric.
K_LIST = [3, 5, 7]
# Euclidean/cosine are kept in a separate list from HEOM because the two groups
# are run through run_suite with different categorical encodings.
METRICS_EUC_COS = ["euclidean", "cosine"]
METRIC_HEOM = ["heom"]
# Voting schemes passed to IBL.run as `vote`.
VOTES = ["modified_plurality"]
# Retention policies passed to IBL.run as `retention_policy`.
RETENTIONS = ["never_retain"]

# K_LIST = [5]
# METRICS_EUC_COS = ["euclidean"] # separated euclidean/cosine vs heom for compatibility
# METRIC_HEOM = ["heom"]
# VOTES = ["modified_plurality"]
# RETENTIONS = ["different_class_retention"]  

## Suite Runner

In [2]:
def split_xy(df: pd.DataFrame):
    """Separate a frame into features and target, using the final column as target.

    Returns a 3-tuple: the frame without the target column, the target
    column itself, and the target column's name.
    """
    label_name = df.columns[-1]
    features = df.drop([label_name], axis="columns")
    labels = df[label_name]
    return features, labels, label_name

def cm_to_json(cm: np.ndarray, labels: list | None = None) -> str:
    d = {"labels": labels if labels is not None else list(range(cm.shape[0])),
         "matrix": cm.astype(int).tolist()}
    return json.dumps(d)

def _score_predictions(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Return the quality metrics of one run, keyed by their CSV column names."""
    acc = accuracy_score(y_true, y_pred)
    prec_macro, rec_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    # Weighted averages (per-class scores weighted by support), reported
    # alongside the unweighted macro averages.
    prec_w, rec_w, f1_w, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    # Use every class seen in either the truth or the predictions so the
    # matrix has a row/column for each of them.
    labels = np.unique(np.concatenate([y_true, y_pred]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return {
        "accuracy": acc,
        "precision_macro": prec_macro,
        "recall_macro": rec_macro,
        "f1_macro": f1_macro,
        "precision_weighted": prec_w,
        "recall_weighted": rec_w,
        "f1_weighted": f1_w,
        "confusion_matrix_json": cm_to_json(cm, labels=labels.tolist()),
    }


def run_suite(
    dataset_name: str,
    metrics: list[str],
    encoding_for_metrics: dict[str, EncodingStrategy | None],
    csv_path: str,
    run_retentions: list[str] | None = None,
):
    """Run the hyperparameter grid on one dataset and save one row per run.

    For every metric a Parser is built with that metric's encoding; then, for
    every fold and every (k, vote, retention) combination, a new IBL is fitted
    on the training split and run on the test split. Timing and
    classification scores of each run are collected as one row.

    Args:
        dataset_name: Dataset name passed to Parser.
        metrics: Distance metric names to evaluate.
        encoding_for_metrics: Encoding strategy to use for each metric; must
            contain a key for every entry of ``metrics``.
        csv_path: Destination of the results CSV.
        run_retentions: Retention policies to evaluate. ``None`` (default)
            uses the module-level ``RETENTIONS`` as it is at call time, the
            same way ``K_LIST`` and ``VOTES`` are read.

    Returns:
        A DataFrame with one row per (metric, fold, k, vote, retention).

    Raises:
        KeyError: If a metric has no entry in ``encoding_for_metrics``. This
            is checked before any run starts.

    If the sweep is interrupted or fails part-way, the rows collected so far
    are written to ``csv_path`` before the exception is re-raised.
    """
    if run_retentions is None:
        run_retentions = RETENTIONS

    # Fail fast: a missing encoding would otherwise only surface after all
    # earlier metrics had already been run.
    missing = [m for m in metrics if m not in encoding_for_metrics]
    if missing:
        raise KeyError(f"encoding_for_metrics has no entry for metric(s): {missing}")

    rows = []
    try:
        for metric in metrics:
            # Build a fresh Parser for this metric so preprocessing matches (OHE vs LE)
            parser = Parser(
                base_path=BASE,
                dataset_name=dataset_name,
                normalization_strategy=NormalizationStrategy.MINMAX_SCALING,   # numeric -> [0,1]
                encoding_strategy=encoding_for_metrics[metric],         # OHE for eucl/cos, LE or None for HEOM
                missing_values_numeric_strategy=MissingValuesNumericStrategy.MEDIAN,
                missing_values_categorical_strategy=MissingValuesCategoricalStrategy.MODE,
                num_splits=NUM_SPLITS,
            )
            types = parser.get_types()

            for fold in range(NUM_SPLITS):
                train_matrix, test_matrix = parser.get_split(fold)
                X_train, _, _ = split_xy(train_matrix)
                X_test, y_test, _ = split_xy(test_matrix)
                n_train, n_test = len(X_train), len(X_test)

                for k in K_LIST:
                    for vote in VOTES:
                        for retention in run_retentions:
                            # New learner per configuration, as in the original sweep
                            # (presumably so retained cases never leak between
                            # configurations -- confirm against IBL).
                            ibl = IBL()

                            # Time fit and predict separately.
                            t0 = time.perf_counter()
                            ibl.fit(train_matrix)
                            t1 = time.perf_counter()
                            preds = ibl.run(
                                test_matrix,
                                k=k,
                                metric=metric,
                                vote=vote,
                                retention_policy=retention,
                                types=types,
                            )
                            t2 = time.perf_counter()

                            scores = _score_predictions(
                                np.asarray(y_test), np.asarray(preds)
                            )
                            rows.append({
                                "dataset": dataset_name,
                                "fold": fold,
                                "metric": metric,
                                "k": k,
                                "vote": vote,
                                "retention": retention,
                                "n_train": n_train,
                                "n_test": n_test,
                                "fit_time_s": t1 - t0,
                                "predict_time_s": t2 - t1,
                                "total_time_s": t2 - t0,
                                **scores,
                            })
    except BaseException:
        # Includes KeyboardInterrupt: keep the finished runs of a long sweep.
        # Skipped when nothing was collected so an existing results file is
        # not overwritten with an empty one.
        if rows:
            pd.DataFrame(rows).to_csv(csv_path, index=False)
        raise

    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False)
    return df

## Main (tests)

In [3]:
# Adult dataset, euclidean + cosine: both metrics use one-hot encoding.
df_11 = run_suite(
    "adult",
    METRICS_EUC_COS,
    encoding_for_metrics=dict.fromkeys(
        ("euclidean", "cosine"), EncodingStrategy.ONE_HOT_ENCODE
    ),
    csv_path="results_adult_eucl_cos.csv",
)

# Adult dataset, HEOM: label encoding (use None instead if the Parser already
# encodes categoricals by default for HEOM).
df_12 = run_suite(
    "adult",
    METRIC_HEOM,
    encoding_for_metrics={"heom": EncodingStrategy.LABEL_ENCODE},
    csv_path="results_adult_heom.csv",
)

# df_2 = run_suite(
#     dataset_name="pen-based",
#     metrics=METRICS_EUC_COS,
#     encoding_for_metrics={
#         "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
#         "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
#     },
#     csv_path="results_penbased_eucl_cos.csv",
# )

Total time for all instances: 121.04s
Total time for all instances: 456.29s
Total time for all instances: 180.90s
Total time for all instances: 223.64s
Total time for all instances: 120.12s
Total time for all instances: 453.19s
Total time for all instances: 188.32s
Total time for all instances: 223.54s
Total time for all instances: 120.73s
Total time for all instances: 453.80s
Total time for all instances: 179.16s
Total time for all instances: 184.15s
Total time for all instances: 119.84s
Total time for all instances: 453.90s
Total time for all instances: 181.80s
Total time for all instances: 183.78s
Total time for all instances: 120.48s
Total time for all instances: 454.60s
Total time for all instances: 177.56s
Total time for all instances: 220.38s
Total time for all instances: 120.38s
Total time for all instances: 455.83s
Total time for all instances: 179.10s
Total time for all instances: 221.07s
Total time for all instances: 119.90s
Total time for all instances: 456.02s
Total time f

KeyboardInterrupt: 