# Hyperparameter Search Notebook

Set up parser

In [19]:
import time
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from Parser import Parser
from IBL import IBL
from preprocessing_types import (
    NormalizationStrategy, EncodingStrategy,
    MissingValuesNumericStrategy, MissingValuesCategoricalStrategy
)

BASE = "../datasetsCBR/datasetsCBR"
NUM_SPLITS = 1  # or whatever your dataset folder has

K_LIST = [3, 5, 7]
METRICS_EUC_COS = ["euclidean", "cosine"]
METRIC_HEOM = ["heom"]
VOTES = ["modified_plurality", "borda"]
RETENTIONS = ["NR", "AR", "DC", "DD"]  # implement later; run NR now if needed


In [None]:
def split_xy(df: pd.DataFrame):
    """Split last column as y; return X (DF), y (Series), name of target col."""
    target_col = df.columns[-1]
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y, target_col

def cm_to_json(cm: np.ndarray, labels: list | None = None) -> str:
    d = {"labels": labels if labels is not None else list(range(cm.shape[0])),
         "matrix": cm.astype(int).tolist()}
    return json.dumps(d)

def run_suite(
    dataset_name: str,
    metrics: list[str],
    encoding_for_metrics: dict[str, EncodingStrategy | None],
    csv_path: str,
    run_retentions: list[str] = RETENTIONS,
):
    rows = []

    for metric in metrics:
        enc_strategy = encoding_for_metrics[metric]
        # Build a fresh Parser for this metric so preprocessing matches (OHE vs LE)
        parser = Parser(
            base_path=BASE,
            dataset_name=dataset_name,
            normalization_strategy=NormalizationStrategy.MINMAX_SCALING,   # numeric -> [0,1]
            encoding_strategy=enc_strategy,                         # OHE for eucl/cos, LE or None for HEOM
            missing_values_numeric_strategy=MissingValuesNumericStrategy.MEDIAN,
            missing_values_categorical_strategy=MissingValuesCategoricalStrategy.MODE,
            num_splits=NUM_SPLITS,
        )

        # If you rely on parser.get_types(), capture it once (HEOM path).
        types = parser.get_types()

        for fold in range(NUM_SPLITS):
            train_df, test_df = parser.get_split(fold)
            Xtr_df, ytr, target = split_xy(train_df)
            Xte_df, yte, _ = split_xy(test_df)

            for k in K_LIST:
                for vote in VOTES:
                    for retention in run_retentions:
                        # If retention not implemented yet, enforce NR now to avoid surprises
                        if retention != "NR":
                            pass  # TODO: wire your retention policy here

                        # Build IBL
                        ibl = IBL()

                        # Time fit + predict
                        t0 = time.perf_counter()
                        ibl.fit(Xtr_df)
                        t1 = time.perf_counter()
                        preds = ibl.run(Xte_df,  k=k, metric=metric, vote=vote, types=types)
                        t2 = time.perf_counter()

                        # Metrics
                        y_true = np.asarray(yte)
                        y_pred = np.asarray(preds)
                        print(y_true)
                        print(y_pred)
                        acc = accuracy_score(y_true, y_pred)
                        prec_macro, rec_macro, f1_macro, _ = precision_recall_fscore_support(
                            y_true, y_pred, average="macro", zero_division=0
                        )
                        # If you want per-class too:
                        prec_w, rec_w, f1_w, _ = precision_recall_fscore_support(
                            y_true, y_pred, average="weighted", zero_division=0
                        )
                        labels = np.unique(np.concatenate([y_true, y_pred]))
                        cm = confusion_matrix(y_true, y_pred, labels=labels)

                        rows.append({
                            "dataset": dataset_name,
                            "fold": fold,
                            "metric": metric,
                            "k": k,
                            "vote": vote,
                            "retention": retention,
                            "n_train": len(Xtr_df),
                            "n_test": len(Xte_df),
                            "fit_time_s": t1 - t0,
                            "predict_time_s": t2 - t1,
                            "total_time_s": t2 - t0,
                            "accuracy": acc,
                            "precision_macro": prec_macro,
                            "recall_macro": rec_macro,
                            "f1_macro": f1_macro,
                            "precision_weighted": prec_w,
                            "recall_weighted": rec_w,
                            "f1_weighted": f1_w,
                            "confusion_matrix_json": cm_to_json(cm, labels=labels.tolist()),
                        })

    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False)
    return df

In [22]:
# df_11 = run_suite(
#     dataset_name="adult",
#     metrics=METRICS_EUC_COS,
#     encoding_for_metrics={
#         "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
#         "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
#     },
#     csv_path="results_adult_eucl_cos.csv",
# )

# TEST 1.2: Adult — HEOM (LE)
# df_12 = run_suite(
#     dataset_name="adult",
#     metrics=METRIC_HEOM,
#     encoding_for_metrics={
#         "heom": EncodingStrategy.LABEL_ENCODE  # or None if your Parser encodes cats by default for HEOM
#     },
#     csv_path="results_adult_heom.csv",
# )

# TEST 2: Pen-based — Euclidean & Cosine (OHE is harmless here; no cats)
df_2 = run_suite(
    dataset_name="pen-based",
    metrics=METRICS_EUC_COS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
    },
    csv_path="results_penbased_eucl_cos.csv",
)

Instance 0: dist=0.0005s, sort=0.0005s, vote=0.0000s, retention=0.0005s, total=0.0015s
Instance 1: dist=0.0005s, sort=0.0000s, vote=0.0000s, retention=0.0010s, total=0.0015s
Instance 2: dist=0.0000s, sort=0.0010s, vote=0.0000s, retention=0.0000s, total=0.0010s
Instance 3: dist=0.0000s, sort=0.0005s, vote=0.0000s, retention=0.0005s, total=0.0010s
Instance 4: dist=0.0000s, sort=0.0005s, vote=0.0005s, retention=0.0000s, total=0.0010s
Instance 5: dist=0.0000s, sort=0.0005s, vote=0.0000s, retention=0.0000s, total=0.0005s
Instance 6: dist=0.0010s, sort=0.0005s, vote=0.0000s, retention=0.0005s, total=0.0020s
Instance 7: dist=0.0005s, sort=0.0000s, vote=0.0005s, retention=0.0005s, total=0.0015s
Instance 8: dist=0.0005s, sort=0.0005s, vote=0.0000s, retention=0.0000s, total=0.0010s
Instance 9: dist=0.0000s, sort=0.0010s, vote=0.0000s, retention=0.0005s, total=0.0015s
Instance 10: dist=0.0000s, sort=0.0005s, vote=0.0005s, retention=0.0000s, total=0.0010s
Instance 11: dist=0.0005s, sort=0.0005s, v

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets