# Example: Run Bayesian Optimization

This notebook shows how to run Bayesian Optimization.

For this example, the within-matrix task of the 49th period is used.

It assumes that matrices have been pre-computed. 

It also loads a pre-created set of train IDs (affair id and elanId combinations); in this case they cover 80% of the 49th-period data, and the rest is the hold-out set.

### Import Modules

In [2]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from data_loading import load_data
from create_weight_matrices import yea_nay_weight_matrices
from random_walk import rwhg
from helpers import evaluate_rwhg

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args
from skopt.plots import plot_convergence, plot_objective

### Run Bayesian Optimization (Within-Matrix) 49th period

In [None]:
# Tuning is run on a single period: the 49th
period = [49]

# Optimizer and random-walk settings
n_calls = 30      # number of Bayesian-optimization calls
n_rnd_starts = 6  # number of random starts
b = 1             # number of vote edges completed per iteration

# Main data for the period; votes gets a fresh positional index
votes, affairs, councillors = load_data(period)
votes = votes.reset_index()

# Restrict everything to the training part of the 80-20 hold-out split
train_ids = pd.read_csv(f'../data/tuning/train_votes_ids_{period[0]}.csv')
train_affair_ids = pd.unique(train_ids['id'])
train_councillor_ids = pd.unique(train_ids['elanId'])
# NOTE(review): votes are filtered by affair id only, not by (id, elanId)
# pair -- confirm this matches how the hold-out sample was drawn.
votes_train = votes.loc[votes['id'].isin(train_affair_ids)]
affairs_train = affairs.loc[affairs['id'].isin(train_affair_ids)]
councillors_train = councillors.loc[councillors['elanId'].isin(train_councillor_ids)]

# Map each councillor / affair id to its position in sorted order
ordered_c_ids = sorted(set(councillors_train['elanId']))
ordered_a_ids = sorted(set(affairs_train['id']))
c_id2idx = dict(zip(ordered_c_ids, range(len(ordered_c_ids))))
a_id2idx = dict(zip(ordered_a_ids, range(len(ordered_a_ids))))

# Pre-computed weight matrices (Approach II) plus the yea/nay vote matrices
W_x = pd.read_csv(f'../data/tuning/W_x2_{period[0]}.csv').to_numpy()
W_y = pd.read_csv(f'../data/tuning/W_y2_{period[0]}.csv').to_numpy()
W_yea_xy, W_nay_xy = yea_nay_weight_matrices(votes_train, c_id2idx, a_id2idx)

# Shuffled 5-fold splitter with a fixed seed, used on the training votes
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter search space: three reals in [0.01, 0.99] followed by two
# integers in [1, 30]; the order fixes the order of the optimizer's points
search_space = [
    *(Real(0.01, 0.99, name=name) for name in ('gamma', 'alpha_x', 'alpha_y')),
    *(Integer(1, 30, name=name) for name in ('K_x', 'K_y')),
]

# define objective function given search space
@use_named_args(search_space)
def objective(gamma, alpha_x, alpha_y, K_x, K_y):
    """Return the negative mean weighted-average F1 of RWHG over the CV folds.

    The optimizer calls this with one candidate point from ``search_space``.
    The score is negated because the optimizer minimizes.

    Args:
        gamma: Forwarded unchanged to ``rwhg``.
        alpha_x: Forwarded unchanged to ``rwhg``.
        alpha_y: Forwarded unchanged to ``rwhg``.
        K_x: Forwarded unchanged to ``rwhg``.
        K_y: Forwarded unchanged to ``rwhg``.

    Returns:
        The negated mean, over the folds of ``kf``, of the last value
        returned by ``evaluate_rwhg`` (used here as the weighted-average F1).
    """
    f1_scores = []

    print(f"Testing parameters: gamma={gamma:.3f}, alpha_x={alpha_x:.3f}, "
          f"alpha_y={alpha_y:.3f}, K_x={K_x}, K_y={K_y}")
    # cross validation over the folds of kf
    for _, test_idx in kf.split(votes_train):

        # kf.split yields *positional* indices. votes_train is a filtered
        # frame that keeps the index labels of the unfiltered votes, so the
        # rows must be selected by position (.iloc); label-based .loc would
        # pick the wrong rows or raise KeyError for missing labels.
        test_votes = votes_train.iloc[test_idx]

        # ids of affairs and councillors
        test_c_id = test_votes['elanId']
        test_a_id = test_votes['id']
        test_decision = test_votes['decision']

        # weight matrix indices
        test_c_idx = test_c_id.map(c_id2idx)
        test_a_idx = test_a_id.map(a_id2idx)

        # input of test indices to RWHG
        test = np.column_stack((test_c_idx, test_a_idx)).astype(int)

        # keep track of ground truth
        ground_truth = list(zip(test_c_idx.values, test_a_idx.values, test_decision.values))

        # run RWHG on copies so the shared matrices stay untouched across
        # folds and optimizer calls
        # NOTE(review): the yea/nay matrices were built from all of
        # votes_train, including this fold's test votes -- presumably rwhg
        # masks the `test` entries itself; confirm.
        results = rwhg(W_yea_xy.copy(), W_nay_xy.copy(), W_x.copy(), W_y.copy(), test,
                       b=b, gamma=gamma, alpha_x=alpha_x, alpha_y=alpha_y, K_x=K_x, K_y=K_y,
                       abstention=True)

        # evaluate it (only need f1 score here)
        _, _, _, _, f1_weighted_avg = evaluate_rwhg(results=results, ground_truth=ground_truth)
        f1_scores.append(f1_weighted_avg)

    return -np.mean(f1_scores)  # minimize negative mean F1

# Make sure the output directory exists *before* the (long) optimization, so
# the run cannot finish and then lose its results on a missing folder.
results_path = f'../results/all_params_{period[0]}.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# run Bayesian optimization
opt_result = gp_minimize(
    func=objective,
    dimensions=search_space,
    acq_func='EI',
    n_calls=n_calls,
    n_random_starts=n_rnd_starts,
    random_state=42,
    verbose=True
)

# Save all parameters and their corresponding (negative) scores.
# Column names are taken from the search space so they always match the
# order of the values in opt_result.x_iters.
param_names = [dim.name for dim in search_space]
results_df = pd.DataFrame(opt_result.x_iters, columns=param_names)
results_df['neg_f1_score'] = opt_result.func_vals

# Save to CSV
results_df.to_csv(results_path, index=False)

# Convergence plot (plot_convergence draws on the current axes)
plt.figure()
plot_convergence(opt_result)
plt.title(f"Convergence (Period {period[0]})")
plt.tight_layout()
plt.show()

# Objective plot (landscape of parameters).
# plot_objective creates its own figure, so no plt.figure() here: an extra
# call would only leave an empty figure behind.
plot_objective(opt_result)
plt.suptitle(f"Objective Landscape (Period {period[0]})", y=1.02)
plt.tight_layout()
plt.show()