# Hyperparameter Search Notebook

## Set up and configuration

In [1]:
import time
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from Parser import Parser
from IBL import IBL
from preprocessing_types import (
    NormalizationStrategy, EncodingStrategy,
    MissingValuesNumericStrategy, MissingValuesCategoricalStrategy
)

BASE = "../datasetsCBR/datasetsCBR"
NUM_SPLITS = 1
K_LIST = [3, 5, 7]
METRICS = ["euclidean", "cosine", "heom"] 
votes = ["modified_plurality", "borda"]
retentions = ["always_retain", "never_retain", "different_class_retention", "DD_retention"]

## Suite Runner

In [3]:
def split_xy(df: pd.DataFrame):
    """Split last column as y; return X (DF), y (Series), name of target col."""
    target_col = df.columns[-1]
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y, target_col

def cm_to_json(cm: np.ndarray, labels: list | None = None) -> str:
    d = {"labels": labels if labels is not None else list(range(cm.shape[0])),
         "matrix": cm.astype(int).tolist()}
    return json.dumps(d)

def run_suite(
    dataset_name: str,
    metrics: list[str],
    encoding_for_metrics: dict[str, EncodingStrategy | None],
    csv_path: str,
    run_retentions: list[str],
    votes: list[str]
):
    rows = []

    for metric in metrics:
        enc_strategy = encoding_for_metrics[metric]
        # Fresh Parser per metric so preprocessing matches (OHE vs LE)
        parser = Parser(
            base_path=BASE,
            dataset_name=dataset_name,
            normalization_strategy=NormalizationStrategy.MEAN_NORMALIZE,
            encoding_strategy=enc_strategy,  # OHE for eucl/cos; LE (or None) for HEOM
            missing_values_numeric_strategy=MissingValuesNumericStrategy.MEDIAN,
            missing_values_categorical_strategy=MissingValuesCategoricalStrategy.MODE,
            num_splits=NUM_SPLITS,
        )
        types = parser.get_types()

        for k in K_LIST:
            for vote in votes:
                for retention in run_retentions:
                    # ---------------------------
                    # ACCUMULATORS FOR THIS BRANCH
                    # ---------------------------
                    fit_times = []
                    pred_times = []
                    total_times = []

                    accs = []
                    prec_macros = []
                    rec_macros = []
                    f1_macros = []
                    prec_weighteds = []
                    rec_weighteds = []
                    f1_weighteds = []

                    # We’ll build a consistent label set across folds for CM aggregation
                    labels_union = None
                    cm_aggregate = None

                    for fold in range(NUM_SPLITS):
                        train_matrix, test_matrix = parser.get_split(fold)
                        X_test, y_test, _ = split_xy(test_matrix)

                        print(f"metric={metric} | k={k} | vote={vote} | retention={retention} | fold={fold}")

                        ibl = IBL()

                        # Fit + predict
                        t0 = time.perf_counter()
                        ibl.fit(train_matrix)
                        t1 = time.perf_counter()
                        preds = ibl.run(
                            test_matrix,
                            k=k,
                            metric=metric,
                            vote=vote,
                            retention_policy=retention,
                            types=types
                        )
                        t2 = time.perf_counter()

                        # Times
                        fit_times.append(t1 - t0)
                        pred_times.append(t2 - t1)
                        total_times.append(t2 - t0)

                        # Metrics
                        y_true = np.asarray(y_test)
                        y_pred = np.asarray(preds)

                        accs.append(accuracy_score(y_true, y_pred))

                        pm, rm, fm, _ = precision_recall_fscore_support(
                            y_true, y_pred, average="macro", zero_division=0
                        )
                        prec_macros.append(pm)
                        rec_macros.append(rm)
                        f1_macros.append(fm)

                        pw, rw, fw, _ = precision_recall_fscore_support(
                            y_true, y_pred, average="weighted", zero_division=0
                        )
                        prec_weighteds.append(pw)
                        rec_weighteds.append(rw)
                        f1_weighteds.append(fw)

                        # Confusion matrix aggregation with a consistent label order
                        fold_labels = np.unique(np.concatenate([y_true, y_pred]))
                        if labels_union is None:
                            labels_union = fold_labels
                        else:
                            labels_union = np.unique(np.concatenate([labels_union, fold_labels]))

                    # After we know labels_union, recompute & sum CMs across folds in that order
                    # (We need a second pass or we can store y_true/y_pred per fold; here we do a second pass quickly)
                    # To avoid re-running the model, we’ll store predictions in a quick pass above if desired.
                    # Simpler: re-loop folds once more to aggregate CM (cheap vs training).
                    cm_aggregate = np.zeros((labels_union.size, labels_union.size), dtype=int)
                    for fold in range(NUM_SPLITS):
                        train_matrix, test_matrix = parser.get_split(fold)
                        X_test, y_test, _ = split_xy(test_matrix)

                        ibl = IBL()
                        ibl.fit(train_matrix)
                        preds = ibl.run(
                            test_matrix,
                            k=k,
                            metric=metric,
                            vote=vote,
                            retention_policy=retention,
                            types=types
                        )

                        y_true = np.asarray(y_test)
                        y_pred = np.asarray(preds)
                        cm = confusion_matrix(y_true, y_pred, labels=labels_union)
                        cm_aggregate += cm.astype(int)

                    # Compute means (and stds) across folds
                    def mean_std(a):
                        a = np.asarray(a, dtype=float)
                        return float(np.mean(a)), float(np.std(a, ddof=0))

                    fit_mean, fit_std = mean_std(fit_times)
                    pred_mean, pred_std = mean_std(pred_times)
                    total_mean, total_std = mean_std(total_times)

                    acc_mean, acc_std = mean_std(accs)
                    pM_mean, pM_std = mean_std(prec_macros)
                    rM_mean, rM_std = mean_std(rec_macros)
                    fM_mean, fM_std = mean_std(f1_macros)

                    pW_mean, pW_std = mean_std(prec_weighteds)
                    rW_mean, rW_std = mean_std(rec_weighteds)
                    fW_mean, fW_std = mean_std(f1_weighteds)

                    # Row for this (dataset, metric, k, vote, retention)
                    rows.append({
                        "dataset": dataset_name,
                        "metric": metric,
                        "k": k,
                        "vote": vote,
                        "retention": retention,
                        "num_folds": NUM_SPLITS,

                        # Train/test sizes vary per fold; reporting averages is reasonable
                        "n_train_mean": float(np.mean([len(parser.get_split(f)[0]) for f in range(NUM_SPLITS)])),
                        "n_test_mean":  float(np.mean([len(parser.get_split(f)[1]) for f in range(NUM_SPLITS)])),

                        # Times
                        "fit_time_s_mean": fit_mean,
                        "fit_time_s_std":  fit_std,
                        "predict_time_s_mean": pred_mean,
                        "predict_time_s_std":  pred_std,
                        "total_time_s_mean": total_mean,
                        "total_time_s_std":  total_std,

                        # Metrics (mean ± std over folds)
                        "accuracy_mean": acc_mean,
                        "accuracy_std":  acc_std,

                        "precision_macro_mean": pM_mean,
                        "precision_macro_std":  pM_std,
                        "recall_macro_mean":    rM_mean,
                        "recall_macro_std":     rM_std,
                        "f1_macro_mean":        fM_mean,
                        "f1_macro_std":         fM_std,

                        "precision_weighted_mean": pW_mean,
                        "precision_weighted_std":  pW_std,
                        "recall_weighted_mean":    rW_mean,
                        "recall_weighted_std":     rW_std,
                        "f1_weighted_mean":        fW_mean,
                        "f1_weighted_std":         fW_std,

                        # Aggregated confusion matrix across folds
                        "confusion_matrix_json": cm_to_json(cm_aggregate, labels=labels_union.tolist()),
                    })

    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False)
    return df

## Main (tests)

# refference

```
VOTES = ["modified_plurality", "borda"]
RETENTIONS = ["always_retain", "never_retain", "different_class_retention", "DD_retention"]


In [4]:
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    csv_path="test2.csv",
    run_retentions=["never_retain"], 
    votes=["modified_plurality"]
)

metric=euclidean | k=3 | vote=modified_plurality | retention=never_retain | fold=0
Total time for all instances: 533.38s
Total time for all instances: 531.93s
metric=euclidean | k=5 | vote=modified_plurality | retention=never_retain | fold=0
Total time for all instances: 545.78s


KeyboardInterrupt: 

In [None]:
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    csv_path="test.csv",
    run_retentions=["modified_plurality"], 
    votes=["never_retain"]
)

In [None]:
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    csv_path="test.csv",
    run_retentions=["modified_plurality"], 
    votes=["different_class_retention"]
)

In [None]:
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    csv_path="test.csv",
    run_retentions=["modified_plurality"], 
    votes=["always_retain"]
)

In [None]:
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    csv_path="test.csv",
    run_retentions=["modified_plurality"], 
    votes=["always_retain"]
)

In [None]:
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    csv_path="test.csv",
    run_retentions=["modified_plurality"], 
    votes=["always_retain"]
)