# Hyperparameter Search Notebook

## Setup and configuration

In [57]:
import time
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from pathlib import Path

from Parser import Parser
from IBL import IBL
from processing_types import (
    NormalizationStrategy, EncodingStrategy,
    MissingValuesNumericStrategy, MissingValuesCategoricalStrategy, RetentionPolicy
)

# Root directory of the datasets; handed to Parser as base_path.
BASE = "../datasetsCBR/datasetsCBR"
# Number of splits requested from Parser (num_splits) and looped over as folds.
NUM_SPLITS = 10
# Values of k that run_suite iterates over.
K_LIST = [3, 5, 7]
# Distance metric names passed as `metrics` to run_suite by the cells below.
METRICS = ["euclidean", "cosine", "heom"] 

## Suite Runner

In [58]:
def split_xy(df: pd.DataFrame):
    """Separate features from the target, where the target is the last column.

    Returns a tuple ``(X, y, target_col)``: the frame without the target
    column, the target column itself, and the target column's name.
    """
    label_name = df.columns[-1]
    target = df[label_name]
    features = df.drop(columns=[label_name])
    return features, target, label_name

def cm_to_json(cm: np.ndarray, labels: list | None = None) -> str:
    d = {"labels": labels if labels is not None else list(range(cm.shape[0])),
         "matrix": cm.astype(int).tolist()}
    return json.dumps(d)

def mean_std(a):
    """Return ``(mean, population standard deviation)`` of *a* as Python floats."""
    values = np.asarray(a, dtype=float)
    average = float(values.mean())
    spread = float(values.std(ddof=0))
    return average, spread

# Per-fold quantities, in the order their `<name>_mean` / `<name>_std`
# columns appear in each result row.
_FOLD_STATS = (
    "fit_time_s",
    "predict_time_s",
    "total_time_s",
    "accuracy",
    "precision_macro",
    "recall_macro",
    "f1_macro",
    "precision_weighted",
    "recall_weighted",
    "f1_weighted",
)


def _evaluate_fold(train_matrix, test_matrix, *, k, metric, vote, retention, types):
    """Fit a fresh IBL on one fold, predict the test part, and score it.

    Returns ``(stats, y_true, y_pred)`` where ``stats`` maps every name in
    ``_FOLD_STATS`` to this fold's value. ``y_true`` is the last column of
    ``test_matrix``.
    """
    ibl = IBL()

    t0 = time.perf_counter()
    ibl.fit(train_matrix)
    t1 = time.perf_counter()
    preds = ibl.run(
        test_matrix,
        k=k,
        metric=metric,
        vote=vote,
        retention_policy=retention,
        types=types,
    )
    t2 = time.perf_counter()

    y_true = test_matrix.iloc[:, -1].to_numpy()
    y_pred = np.asarray(preds)

    stats = {
        "fit_time_s": t1 - t0,
        "predict_time_s": t2 - t1,
        "total_time_s": t2 - t0,
        "accuracy": accuracy_score(y_true, y_pred),
    }
    for average in ("macro", "weighted"):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average=average, zero_division=0
        )
        stats[f"precision_{average}"] = precision
        stats[f"recall_{average}"] = recall
        stats[f"f1_{average}"] = f1
    return stats, y_true, y_pred


def _aggregate_confusion(fold_outputs):
    """Sum per-fold confusion matrices over one shared, sorted label set.

    Returns ``(labels, matrix)``; the label set is the union of every true
    and predicted label seen in any fold, so the per-fold matrices have the
    same shape and can be added.
    """
    seen = set()
    for y_true, y_pred in fold_outputs:
        seen.update(np.unique(y_true))
        seen.update(np.unique(y_pred))
    labels = np.array(sorted(seen))

    total = np.zeros((labels.size, labels.size), dtype=int)
    for y_true, y_pred in fold_outputs:
        total += confusion_matrix(y_true, y_pred, labels=labels).astype(int)
    return labels, total


def run_suite(
    dataset_name: str,
    metrics: list[str],
    encoding_for_metrics: dict[str, EncodingStrategy | None],
    run_retentions: list[str],
    votes: list[str],
    k_list: list[int] | None = None,
) -> pd.DataFrame:
    """Evaluate IBL over a grid of metric x k x vote x retention settings.

    For each metric a fresh Parser is built with that metric's encoding
    strategy, and its ``NUM_SPLITS`` splits are reused for every other
    setting. Each setting is run on every split; the per-fold timings and
    scores are reduced to mean/std and the confusion matrices are summed.

    Args:
        dataset_name: Dataset name passed to Parser and stored in each row.
        metrics: Distance metric names to evaluate.
        encoding_for_metrics: Encoding strategy per metric; must contain a
            key for every entry of ``metrics`` (a missing key raises
            ``KeyError``).
        run_retentions: Retention policies, forwarded unchanged to
            ``IBL.run`` as ``retention_policy``.
        votes: Voting scheme names, forwarded to ``IBL.run`` as ``vote``.
        k_list: Values of k to try. Defaults to the module-level ``K_LIST``.

    Returns:
        A DataFrame with one row per (metric, k, vote, retention) setting.
    """
    k_values = K_LIST if k_list is None else k_list
    rows = []

    for metric in metrics:
        # Fresh Parser per metric so preprocessing matches (OHE vs LE)
        parser = Parser(
            base_path=BASE,
            dataset_name=dataset_name,
            normalization_strategy=NormalizationStrategy.MEAN_NORMALIZE,
            encoding_strategy=encoding_for_metrics[metric],  # OHE for eucl/cos; LE (or None) for HEOM
            missing_values_numeric_strategy=MissingValuesNumericStrategy.MEDIAN,
            missing_values_categorical_strategy=MissingValuesCategoricalStrategy.MODE,
            num_splits=NUM_SPLITS,
        )
        types = parser.get_types()

        splits = [parser.get_split(fold) for fold in range(NUM_SPLITS)]
        n_train_mean = float(np.mean([len(tr) for tr, _ in splits]))
        n_test_mean = float(np.mean([len(te) for _, te in splits]))

        for k in k_values:
            for vote in votes:
                for retention in run_retentions:
                    fold_stats = []
                    fold_outputs = []
                    for train_matrix, test_matrix in splits:
                        stats, y_true, y_pred = _evaluate_fold(
                            train_matrix,
                            test_matrix,
                            k=k,
                            metric=metric,
                            vote=vote,
                            retention=retention,
                            types=types,
                        )
                        fold_stats.append(stats)
                        fold_outputs.append((y_true, y_pred))

                    labels, cm_aggregate = _aggregate_confusion(fold_outputs)

                    row = {
                        "dataset": dataset_name,
                        "metric": metric,
                        "k": k,
                        "vote": vote,
                        "retention": retention,
                        "num_folds": NUM_SPLITS,
                        "n_train_mean": n_train_mean,
                        "n_test_mean": n_test_mean,
                    }
                    # Mean and std across folds, in the fixed column order.
                    for name in _FOLD_STATS:
                        mean, std = mean_std([stats[name] for stats in fold_stats])
                        row[f"{name}_mean"] = mean
                        row[f"{name}_std"] = std
                    row["confusion_matrix_json"] = cm_to_json(
                        cm_aggregate, labels=labels.tolist()
                    )
                    rows.append(row)

    return pd.DataFrame(rows)

## Main (tests)

### Results file

In [59]:
# All result rows from the cells below are appended to this one CSV.
csv_path = "adult_results.csv"
# Delete any file left over from a previous run; missing_ok avoids an error
# when there is nothing to delete.
Path(csv_path).unlink(missing_ok=True)

def append_df(df, path: str):
    """Append *df* to the CSV at *path*, writing the header only when needed.

    The header row is written when the file does not exist yet, or exists
    but is empty, so a pre-created zero-byte file still gets column names.
    The check is repeated on every call rather than cached.
    """
    p = Path(path)
    # A missing or zero-byte file has no header row yet.
    write_header = not p.exists() or p.stat().st_size == 0
    df.to_csv(p, mode="a", header=write_header, index=False)

In [60]:
# Adult dataset, all METRICS: ALWAYS_RETAIN retention with modified_plurality voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.ALWAYS_RETAIN], 
    votes=["modified_plurality"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 55.71s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 69.22s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 95.62s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 95.63s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 95.39s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 90.91s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 54.84s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 54.75s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 107)
Total time fo

In [61]:
# Adult dataset, all METRICS: NEVER_RETAIN retention with modified_plurality voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.NEVER_RETAIN], 
    votes=["modified_plurality"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.65s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.69s
Final training set size: (43959, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 87.06s
Final training set size: (43957, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.34s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.77s
Final training set size: (43957, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.64s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.50s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.76s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 107)
Total time fo

In [62]:
# Adult dataset, all METRICS: DIFFERENT_CLASS_RETENTION with modified_plurality voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.DIFFERENT_CLASS_RETENTION], 
    votes=["modified_plurality"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.38s
Final training set size: (45103, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.16s
Final training set size: (45105, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.46s
Final training set size: (45096, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.41s
Final training set size: (45111, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.34s
Final training set size: (45142, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.35s
Final training set size: (45122, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.56s
Final training set size: (45105, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.54s
Final training set size: (45071, 108)
Preallocating matrix of shape (48842, 107)
Total time fo

In [63]:
# Adult dataset, all METRICS: DD_RETENTION with modified_plurality voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.DD_RETENTION], 
    votes=["modified_plurality"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 90.30s
Final training set size: (46068, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 90.20s
Final training set size: (46029, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 90.02s
Final training set size: (46072, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 89.84s
Final training set size: (45979, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 90.24s
Final training set size: (46071, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 90.22s
Final training set size: (46042, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 89.93s
Final training set size: (46078, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 89.85s
Final training set size: (45975, 108)
Preallocating matrix of shape (48842, 107)
Total time fo

In [64]:
# Adult dataset, all METRICS: ALWAYS_RETAIN retention with borda voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.ALWAYS_RETAIN], 
    votes=["borda"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.20s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.51s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.36s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.39s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.66s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.52s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.57s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 93.81s
Final training set size: (48842, 108)
Preallocating matrix of shape (48842, 107)
Total time fo

In [65]:
# Adult dataset, all METRICS: NEVER_RETAIN retention with borda voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.NEVER_RETAIN], 
    votes=["borda"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.49s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.36s
Final training set size: (43959, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.43s
Final training set size: (43957, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.26s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.23s
Final training set size: (43957, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 85.98s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.22s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 86.07s
Final training set size: (43958, 108)
Preallocating matrix of shape (48842, 107)
Total time fo

In [66]:
# Adult dataset, all METRICS: DIFFERENT_CLASS_RETENTION with borda voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.DIFFERENT_CLASS_RETENTION], 
    votes=["borda"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.44s
Final training set size: (45179, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.45s
Final training set size: (45230, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.59s
Final training set size: (45236, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.36s
Final training set size: (45207, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.33s
Final training set size: (45195, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.59s
Final training set size: (45278, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.49s
Final training set size: (45220, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 88.34s
Final training set size: (45206, 108)
Preallocating matrix of shape (48842, 107)
Total time fo

In [67]:
# Adult dataset, all METRICS: DD_RETENTION with borda voting.
# The resulting rows are appended to the CSV at csv_path.
df = run_suite(
    dataset_name="adult",
    metrics=METRICS,
    encoding_for_metrics={
        "euclidean": EncodingStrategy.ONE_HOT_ENCODE,
        "cosine":    EncodingStrategy.ONE_HOT_ENCODE,
        "heom": EncodingStrategy.LABEL_ENCODE, 
    },
    run_retentions= [RetentionPolicy.DD_RETENTION], 
    votes=["borda"]
)
append_df(df, csv_path)

Preallocating matrix of shape (48842, 108)
Total time for all instances: 52.00s
Final training set size: (46068, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 51.87s
Final training set size: (46029, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 51.92s
Final training set size: (46072, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 51.90s
Final training set size: (45979, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 52.37s
Final training set size: (46071, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 52.09s
Final training set size: (46042, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 52.09s
Final training set size: (46078, 108)
Preallocating matrix of shape (48842, 108)
Total time for all instances: 51.91s
Final training set size: (45975, 108)
Preallocating matrix of shape (48842, 107)
Total time fo