# Hyperparameter Search Notebook

## Setup and configuration

In [None]:
import time
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from Parser import Parser
from IBL import IBL
from preprocessing_types import (
    NormalizationStrategy, EncodingStrategy,
    MissingValuesNumericStrategy, MissingValuesCategoricalStrategy
)

# Root directory handed to Parser as `base_path`.
BASE = "../datasetsCBR/datasetsCBR"
# Passed to Parser as `num_splits`; run_suite evaluates folds 0..NUM_SPLITS-1.
NUM_SPLITS = 1
# Values of k that run_suite passes to IBL.run.
K_LIST = [3, 5, 7]
# Distance metric names passed to run_suite as `metrics` in the cells below.
METRICS = ["euclidean", "cosine", "heom"] 
# Vote and retention policy names. NOTE(review): these two lists are not read
# by the visible code (run_suite takes its own `votes` / `run_retentions`
# arguments) -- presumably kept as a reference of the valid names; confirm.
votes = ["modified_plurality", "borda"]
retentions = ["always_retain", "never_retain", "different_class_retention", "DD_retention"]

## Suite Runner

In [3]:
def split_xy(df: pd.DataFrame):
    """Separate a frame into features and target, taking the last column as target.

    Returns a tuple ``(X, y, target_col)``: the frame without its last column,
    the last column itself, and that column's name.
    """
    label_name = df.columns[-1]
    features = df.drop(columns=[label_name])
    return features, df[label_name], label_name

def cm_to_json(cm: np.ndarray, labels: list | None = None) -> str:
    d = {"labels": labels if labels is not None else list(range(cm.shape[0])),
         "matrix": cm.astype(int).tolist()}
    return json.dumps(d)

def _score_predictions(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Return accuracy, macro/weighted precision-recall-F1 and the confusion matrix (as JSON)."""
    acc = accuracy_score(y_true, y_pred)
    prec_macro, rec_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    # Weighted averages, reported alongside the macro averages above.
    prec_w, rec_w, f1_w, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    # Use the union of true and predicted labels so that every class seen in
    # either array gets a row/column in the confusion matrix.
    labels = np.unique(np.concatenate([y_true, y_pred]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return {
        "accuracy": acc,
        "precision_macro": prec_macro,
        "recall_macro": rec_macro,
        "f1_macro": f1_macro,
        "precision_weighted": prec_w,
        "recall_weighted": rec_w,
        "f1_weighted": f1_w,
        "confusion_matrix_json": cm_to_json(cm, labels=labels.tolist()),
    }


def run_suite(
    dataset_name: str,
    metrics: list[str],
    encoding_for_metrics: dict[str, EncodingStrategy | None],
    csv_path: str,
    run_retentions: list[str],
    votes: list[str],
    k_list: list[int] | None = None,
):
    """Evaluate IBL over every (metric, k, vote, retention, fold) combination.

    Args:
        dataset_name: Dataset name forwarded to ``Parser``.
        metrics: Distance metric names; each is passed to ``IBL.run`` as ``metric``.
        encoding_for_metrics: Encoding strategy to use for each metric. Must
            contain an entry for every name in ``metrics`` (``KeyError`` otherwise).
        csv_path: Destination of the results CSV (overwritten if it exists).
        run_retentions: Retention policy names, passed to ``IBL.run`` as
            ``retention_policy``.
        votes: Vote policy names, passed to ``IBL.run`` as ``vote``.
        k_list: Values of k to sweep. Defaults to the module-level ``K_LIST``.

    Returns:
        A DataFrame with one row per evaluated combination (timings, scores and
        the confusion matrix as JSON); the same frame is written to ``csv_path``.
    """
    k_values = K_LIST if k_list is None else k_list
    rows = []

    for metric in metrics:
        enc_strategy = encoding_for_metrics[metric]
        # Build a fresh Parser per metric so the preprocessing (in particular
        # the encoding) matches what that metric expects.
        # NOTE(review): the original comment claimed MEAN_NORMALIZE maps numeric
        # features to [0,1]; that cannot be confirmed from here -- check Parser.
        parser = Parser(
            base_path=BASE,
            dataset_name=dataset_name,
            normalization_strategy=NormalizationStrategy.MEAN_NORMALIZE,
            encoding_strategy=enc_strategy,
            missing_values_numeric_strategy=MissingValuesNumericStrategy.MEDIAN,
            missing_values_categorical_strategy=MissingValuesCategoricalStrategy.MODE,
            num_splits=NUM_SPLITS,
        )

        types = parser.get_types()

        for k in k_values:
            for vote in votes:
                for retention in run_retentions:
                    for fold in range(NUM_SPLITS):
                        # NOTE(review): the split is re-fetched for every
                        # configuration; it is not visible here whether IBL
                        # mutates the training data, so confirm before hoisting.
                        train_matrix, test_matrix = parser.get_split(fold)
                        X_test, y_test, _ = split_xy(test_matrix)

                        print(f"metric={metric}")
                        print(f"k={k}")
                        print(f"vote={vote}")
                        print(f"retention={retention}")
                        print(f"fold={fold}")

                        ibl = IBL()

                        # Time fit and predict separately.
                        t0 = time.perf_counter()
                        ibl.fit(train_matrix)
                        t1 = time.perf_counter()
                        preds = ibl.run(
                            test_matrix,
                            k=k,
                            metric=metric,
                            vote=vote,
                            retention_policy=retention,
                            types=types,
                        )
                        t2 = time.perf_counter()

                        scores = _score_predictions(np.asarray(y_test), np.asarray(preds))

                        rows.append({
                            "dataset": dataset_name,
                            "fold": fold,
                            "metric": metric,
                            "k": k,
                            "vote": vote,
                            "retention": retention,
                            "n_train": len(train_matrix),
                            "n_test": len(X_test),
                            "fit_time_s": t1 - t0,
                            "predict_time_s": t2 - t1,
                            "total_time_s": t2 - t0,
                            **scores,
                        })

    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False)
    return df

## Main (tests)

# Reference

```
VOTES = ["modified_plurality", "borda"]
RETENTIONS = ["always_retain", "never_retain", "different_class_retention", "DD_retention"]
```


In [None]:
# Modified-plurality vote with the always-retain policy. `run_retentions` takes
# retention policy names and `votes` takes vote names (they were swapped).
# Results are written to test.csv, replacing any previous contents.
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine": EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE,
    },
    csv_path="test.csv",
    run_retentions=["always_retain"],
    votes=["modified_plurality"],
)

metric=euclidean
k=3
vote=modified_plurality
retention=never_retain
fold=0
Instance 0/4884: dist=0.0529s, sort=0.0020s, vote=0.0013s, retention=0.0000s, total=0.0563s
Instance 1/4884: dist=0.0558s, sort=0.0020s, vote=0.0010s, retention=0.0000s, total=0.0588s
Instance 2/4884: dist=0.0549s, sort=0.0021s, vote=0.0000s, retention=0.0000s, total=0.0571s
Instance 3/4884: dist=0.0609s, sort=0.0020s, vote=0.0012s, retention=0.0000s, total=0.0641s
Instance 4/4884: dist=0.0551s, sort=0.0010s, vote=0.0010s, retention=0.0000s, total=0.0571s
Instance 5/4884: dist=0.0563s, sort=0.0015s, vote=0.0010s, retention=0.0000s, total=0.0589s
Instance 6/4884: dist=0.0527s, sort=0.0020s, vote=0.0000s, retention=0.0000s, total=0.0548s
Instance 7/4884: dist=0.0562s, sort=0.0036s, vote=0.0000s, retention=0.0000s, total=0.0599s
Instance 8/4884: dist=0.0564s, sort=0.0020s, vote=0.0000s, retention=0.0000s, total=0.0584s
Instance 9/4884: dist=0.0550s, sort=0.0020s, vote=0.0000s, retention=0.0000s, total=0.0570s
Insta

In [None]:
# Modified-plurality vote with the never-retain policy. `run_retentions` takes
# retention policy names and `votes` takes vote names (they were swapped).
# Results are written to test.csv, replacing any previous contents.
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine": EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE,
    },
    csv_path="test.csv",
    run_retentions=["never_retain"],
    votes=["modified_plurality"],
)

In [None]:
# Modified-plurality vote with the different-class retention policy.
# `run_retentions` takes retention policy names and `votes` takes vote names
# (they were swapped). Results are written to test.csv, replacing any previous
# contents.
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine": EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE,
    },
    csv_path="test.csv",
    run_retentions=["different_class_retention"],
    votes=["modified_plurality"],
)

In [None]:
# Modified-plurality vote with the always-retain policy. `run_retentions` takes
# retention policy names and `votes` takes vote names (they were swapped).
# Results are written to test.csv, replacing any previous contents.
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine": EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE,
    },
    csv_path="test.csv",
    run_retentions=["always_retain"],
    votes=["modified_plurality"],
)

In [None]:
# Modified-plurality vote with the always-retain policy. `run_retentions` takes
# retention policy names and `votes` takes vote names (they were swapped).
# Results are written to test.csv, replacing any previous contents.
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine": EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE,
    },
    csv_path="test.csv",
    run_retentions=["always_retain"],
    votes=["modified_plurality"],
)

In [None]:
# Modified-plurality vote with the always-retain policy. `run_retentions` takes
# retention policy names and `votes` takes vote names (they were swapped).
# Results are written to test.csv, replacing any previous contents.
df_1 = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine": EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE,
    },
    csv_path="test.csv",
    run_retentions=["always_retain"],
    votes=["modified_plurality"],
)