# Hyperparameter Search Notebook

Set up parser

In [11]:
import itertools
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import f1_score, accuracy_score, recall_score
from Parser import Parser
from IBL import IBL 

base_path = "../datasetsCBR/datasetsCBR"
dataset_name = "adult"

# Hyper parameters
splits = range(2)
k_values = [1, 3, 5]
similarity_metrics = ['euclidean', 'cosine', 'heom']
voting_schemes = ["modified_plurality", "borda"]
csv_path = "results_log.csv"


Adjust Preprocessing function (Later will be thrown out)

In [None]:
parser = Parser(base_path, dataset_name, num_splits=10)


def get_split(i):
    # Trainsplit
    train_matrix, test_matrix = parser.get_split(i)
    
    # Handle missing values using model-based imputation
    train_filled, test_filled = parser.handle_missing_values(train_matrix, test_matrix, strategy_num="model", strategy_cat="mode")

    train_le, test_le, encoders = parser.label_encode(train_filled.copy(), test_filled.copy())
    
    return train_le, test_le

In [22]:
a, b = get_split(0)
a.head

<bound method NDFrame.head of         age  workclass    fnlwgt  education  education-num  marital-status  \
0      66.0          6  174788.0         15           10.0               4   
1      68.0          4  211162.0         11            9.0               2   
2      90.0          4  139660.0         15           10.0               0   
3      52.0          4  230657.0         11            9.0               2   
4      25.0          4   66622.0         15           10.0               4   
...     ...        ...       ...        ...            ...             ...   
43953  30.0          4  247328.0         11            9.0               5   
43954  38.0          4  213512.0         15           10.0               2   
43955  53.0          4  194995.0          4            3.0               2   
43956  37.0          2  117760.0          8           11.0               4   
43957  43.0          4  258049.0         12           14.0               2   

       occupation  relationship  

In [None]:

# --- CSV Setup (ensure header + at least one baseline row) ---
columns = [
    'dataset', 'k', 'metric', 'vote',
    'avg_f1', 'avg_accuracy', 'avg_recall',
    'std_f1', 'std_accuracy', 'std_recall',
    'n_folds', 'timestamp'
]

def write_baseline(path):
    baseline = {
        'dataset': dataset_name,
        'k': -1,
        'metric': 'baseline',
        'vote': 'baseline',
        'avg_f1': -1.0,         # metrics are >= 0 normally, so -1 is safe
        'avg_accuracy': -1.0,
        'avg_recall': -1.0,
        'std_f1': 0.0,
        'std_accuracy': 0.0,
        'std_recall': 0.0,
        'n_folds': 0,
        'timestamp': datetime.utcnow().isoformat()
    }
    pd.DataFrame([baseline]).to_csv(path, index=False, mode='a', header=not os.path.exists(path))

# create file + baseline if needed
if not os.path.exists(csv_path):
    pd.DataFrame(columns=columns).to_csv(csv_path, index=False)
    write_baseline(csv_path)
else:
    # if file exists but has no usable rows, append baseline
    try:
        prev = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        prev = pd.DataFrame(columns=columns)
    if prev.shape[0] == 0:
        write_baseline(csv_path)

# --- Helper to compute best-so-far from existing CSV (handles old format too) ---
def compute_best_so_far(path):
    try:
        prev = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        return -np.inf, -np.inf, -np.inf

    # Prefer aggregated averages if present
    if 'avg_f1' in prev.columns:
        best_f1 = prev['avg_f1'].dropna().max() if not prev.empty else -np.inf
        best_acc = prev['avg_accuracy'].dropna().max() if 'avg_accuracy' in prev.columns else -np.inf
        best_rec = prev['avg_recall'].dropna().max() if 'avg_recall' in prev.columns else -np.inf
    elif 'f1_score' in prev.columns:
        # older per-split format: compute mean per param combo and take the max
        grp = prev.groupby(['dataset', 'k', 'metric', 'vote'])
        mean_f1 = grp['f1_score'].mean() if 'f1_score' in prev.columns else pd.Series(dtype=float)
        mean_acc = grp['accuracy'].mean() if 'accuracy' in prev.columns else pd.Series(dtype=float)
        mean_rec = grp['recall'].mean() if 'recall' in prev.columns else pd.Series(dtype=float)
        best_f1 = mean_f1.max() if not mean_f1.empty else -np.inf
        best_acc = mean_acc.max() if not mean_acc.empty else -np.inf
        best_rec = mean_rec.max() if not mean_rec.empty else -np.inf
    else:
        best_f1 = best_acc = best_rec = -np.inf

    # if they're NaN convert to -inf
    best_f1 = -np.inf if pd.isna(best_f1) else best_f1
    best_acc = -np.inf if pd.isna(best_acc) else best_acc
    best_rec = -np.inf if pd.isna(best_rec) else best_rec

    return best_f1, best_acc, best_rec

# --- Main (parameter-first, folds-inner) ---
total_combinations = len(k_values) * len(similarity_metrics) * len(voting_schemes)
combo_idx = 0

for k, metric, vote in itertools.product(k_values, similarity_metrics, voting_schemes):
    combo_idx += 1
    print(f"\n=== Combo {combo_idx}/{total_combinations}: k={k}, metric={metric}, vote={vote} ===")

    fold_f1s = []
    fold_accs = []
    fold_recs = []

    for split in splits:
        # Todo adjust this when we have the proper parser
        train_df, test_df = get_split(split)
        test_df = test_df[:10]
       
        # TODO
        X_train, y_train = train_df.drop(columns=['class']), train_df['class']
        X_test, y_test = test_df.drop(columns=['class']), test_df['class']

       # TODO
        model = IBL()
        model.fit(X_train, y_train)
        print(f"Model is being run in split: {split}")
        preds = model.run(X_test, k=k, metric=metric, vote=vote, types=None)
        print(f"Model is being run in split: {split}")


        # compute metrics (robust to zero-division)
        f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
        acc = accuracy_score(y_test, preds)
        rec = recall_score(y_test, preds, average='weighted', zero_division=0)

        fold_f1s.append(f1)
        fold_accs.append(acc)
        fold_recs.append(rec)

        print(f"OK (F1={f1:.4f}, ACC={acc:.4f}, REC={rec:.4f})")

    # If no successful folds, skip logging this combo
    if len(fold_f1s) == 0:
        print("  → No successful folds for this combo, skipping.")
        continue

    # aggregate across folds
    avg_f1 = float(np.mean(fold_f1s))
    std_f1 = float(np.std(fold_f1s, ddof=1)) if len(fold_f1s) > 1 else 0.0
    avg_acc = float(np.mean(fold_accs))
    std_acc = float(np.std(fold_accs, ddof=1)) if len(fold_accs) > 1 else 0.0
    avg_rec = float(np.mean(fold_recs))
    std_rec = float(np.std(fold_recs, ddof=1)) if len(fold_recs) > 1 else 0.0

    # compute best so far from CSV (robust)
    best_f1_so_far, best_acc_so_far, best_rec_so_far = compute_best_so_far(csv_path)

    new_best = False
    if avg_f1 > best_f1_so_far:
        print(f"  🌟 New best AVG F1: {avg_f1:.4f} (prev best: {best_f1_so_far:.4f})")
        new_best = True
    elif avg_acc > best_acc_so_far:
        print(f"  🌟 New best AVG Accuracy: {avg_acc:.4f} (prev best: {best_acc_so_far:.4f})")
        new_best = True
    elif avg_rec > best_rec_so_far:
        print(f"  🌟 New best AVG Recall: {avg_rec:.4f} (prev best: {best_rec_so_far:.4f})")
        new_best = True
    else:
        print(f"  Avg (F1={avg_f1:.4f} ±{std_f1:.4f}, ACC={avg_acc:.4f} ±{std_acc:.4f}, REC={avg_rec:.4f} ±{std_rec:.4f})")

    # prepare and append averaged result row
    result = {
        'dataset': dataset_name,
        'k': k,
        'metric': metric,
        'vote': vote,
        'avg_f1': avg_f1,
        'avg_accuracy': avg_acc,
        'avg_recall': avg_rec,
        'std_f1': std_f1,
        'std_accuracy': std_acc,
        'std_recall': std_rec,
        'n_folds': len(fold_f1s),
        'timestamp': datetime.utcnow().isoformat()
    }

    pd.DataFrame([result]).to_csv(csv_path, mode='a', index=False, header=False)

    print(f"  → Logged average result (n_folds={len(fold_f1s)}) to {csv_path}")
    if new_best:
        print("  ✅ This is the new best-performing configuration!\n")

print("\n🏁 All parameter combos processed. Averages logged to:", csv_path)


  'timestamp': datetime.utcnow().isoformat()



=== Combo 1/18: k=1, metric=euclidean, vote=modified_plurality ===
Model is being run in split: 0
        age  workclass   fnlwgt  education  education-num  marital-status  \
25063  23.0          4  62339.0          9           13.0               4   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
25063          10             3     4    1           0.0           0.0   

       hours-per-week  native-country  
25063            40.0              39   25063    0
Name: class, dtype: int64
        age  workclass    fnlwgt  education  education-num  marital-status  \
23048  49.0          4  225456.0          9           13.0               2   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
23048           4             0     4    1           0.0           0.0   

       hours-per-week  native-country  
23048            40.0              39   23048    1
Name: class, dtype: int64
        age  workclass    fnlwgt  education  education-nu

  'timestamp': datetime.utcnow().isoformat()


Model is being run in split: 0
        age  workclass   fnlwgt  education  education-num  marital-status  \
25063  23.0          4  62339.0          9           13.0               4   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
25063          10             3     4    1           0.0           0.0   

       hours-per-week  native-country  
25063            40.0              39   25063    0
Name: class, dtype: int64
        age  workclass    fnlwgt  education  education-num  marital-status  \
23048  49.0          4  225456.0          9           13.0               2   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
23048           4             0     4    1           0.0           0.0   

       hours-per-week  native-country  
23048            40.0              39   23048    1
Name: class, dtype: int64
        age  workclass    fnlwgt  education  education-num  marital-status  \
20524  28.0          4  312372.0         11    

  'timestamp': datetime.utcnow().isoformat()


Model is being run in split: 0
        age  workclass   fnlwgt  education  education-num  marital-status  \
23615  22.0          4  65225.0          0            6.0               4   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
23615           8             3     4    0           0.0           0.0   

       hours-per-week  native-country  
23615            40.0              39   23615    0
Name: class, dtype: int64
       age  workclass    fnlwgt  education  education-num  marital-status  \
1565  50.0          5  235307.0          7           12.0               2   

      occupation  relationship  race  sex  capital-gain  capital-loss  \
1565          10             0     4    1           0.0           0.0   

      hours-per-week  native-country  
1565            48.0              39   1565    1
Name: class, dtype: int64
        age  workclass    fnlwgt  education  education-num  marital-status  \
20524  28.0          4  312372.0         11           

  'timestamp': datetime.utcnow().isoformat()


Model is being run in split: 0
        age  workclass   fnlwgt  education  education-num  marital-status  \
23615  22.0          4  65225.0          0            6.0               4   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
23615           8             3     4    0           0.0           0.0   

       hours-per-week  native-country  
23615            40.0              39   23615    0
Name: class, dtype: int64
       age  workclass    fnlwgt  education  education-num  marital-status  \
1565  50.0          5  235307.0          7           12.0               2   

      occupation  relationship  race  sex  capital-gain  capital-loss  \
1565          10             0     4    1           0.0           0.0   

      hours-per-week  native-country  
1565            48.0              39   1565    1
Name: class, dtype: int64
        age  workclass    fnlwgt  education  education-num  marital-status  \
20524  28.0          4  312372.0         11           

  'timestamp': datetime.utcnow().isoformat()


Model is being run in split: 0


ValueError: HEOM requires 'types' aligned to columns (pass at init).