In [None]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import optuna

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.cv_iterator import leave_last_k
from src.models.tuner import BayesianSearch
from src.models.retrieval import Retrieval
from src.models.evaluator import Evaluation
from src.models.tracker import launch_mlflow, log_run
from src.models.utils import set_global_seed

In [8]:
# Parse the notebook configuration (YAML) with the safe loader, which only
# builds plain Python objects.
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)
del config_file  # keep the closed handle out of the notebook namespace

# Seed the run with the value stored under general.seed so results are repeatable.
set_global_seed(seed=config["general"]["seed"])

# Start experiment tracking (MLflow) before any run is logged.
launch_mlflow()

# Name of the algorithm tuned and evaluated in the cells below.
ALGORITHM = "CoClustering"

**Data Preparation & Train/Test Split**

- Load and transform the 3 datasets
- Split the whole set into train, validation, and test sets by segmenting it temporally

In [9]:
# Load the raw datasets described by the `data_loader` config section and pass
# them straight through the preparation step. The result is indexed by name
# further down (e.g. dfs['data']).
dfs = prepare_data(
    dataframes=load_data(config=config['data_loader'])
)

In [10]:
# Two-stage split using the same `optimization` settings for both stages:
#   1) hold out the test set from the full interaction data;
#   2) hold out the validation set from what remains.
df_train, df_test = leave_last_k(
    df=dfs['data'],
    config=config['optimization'],
)
df_train, df_valid = leave_last_k(
    df=df_train,
    config=config['optimization'],
)

**Optimization & Evaluation**

- Hyper-parameters - search for the hyper-parameters that optimize the scoring metric for the given algorithm on the validation set
- Evaluation - retrieve the best hyper-parameters, refit on the full training set (train + validation), and evaluate the results on the test set

In [11]:
# Tuner that evaluates one hyper-parameter configuration per Optuna trial.
tuner = BayesianSearch(config["optimization"], method="retrieval", algorithm=ALGORITHM)


def objective(trial) -> float:
    """Score a single Optuna trial.

    The tuner is fitted on the training frame and scored against the
    validation frame; both are passed wrapped under the "X" key. The value
    returned by `tuner.fit` is what the study minimizes.
    """
    train_payload = {"X": df_train}
    valid_payload = {"X": df_valid}
    return tuner.fit(df_train=train_payload, df_valid=valid_payload, trial=trial)


# Seeded TPE study so the sequence of sampled configurations is repeatable.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    direction="minimize",
)
study.optimize(objective, n_trials=config["optimization"]["n_trials"])

# Record the finished study and the tuner under the "Retrieval" experiment.
log_run(experiment_name="Retrieval", study=study, tuner=tuner)

[I 2025-06-30 23:18:16,490] A new study created in memory with name: no-name-594e2120-8ae1-4f31-8bf9-7fb820830584
[I 2025-06-30 23:18:25,299] Trial 0 finished with value: 0.72772 and parameters: {'n_cltr_u': 5, 'n_cltr_i': 9, 'n_epochs': 23}. Best is trial 0 with value: 0.72772.
[I 2025-06-30 23:18:32,005] Trial 1 finished with value: 0.72085 and parameters: {'n_cltr_u': 7, 'n_cltr_i': 4, 'n_epochs': 16}. Best is trial 1 with value: 0.72085.
[I 2025-06-30 23:18:39,208] Trial 2 finished with value: 0.71839 and parameters: {'n_cltr_u': 3, 'n_cltr_i': 9, 'n_epochs': 21}. Best is trial 2 with value: 0.71839.
[I 2025-06-30 23:18:52,231] Trial 3 finished with value: 0.72408 and parameters: {'n_cltr_u': 7, 'n_cltr_i': 3, 'n_epochs': 25}. Best is trial 2 with value: 0.71839.
[I 2025-06-30 23:19:06,874] Trial 4 finished with value: 0.7214 and parameters: {'n_cltr_u': 8, 'n_cltr_i': 4, 'n_epochs': 17}. Best is trial 2 with value: 0.71839.
[I 2025-06-30 23:19:21,354] Trial 5 finished with value: 

🏃 View run 30JUN2025 at: http://127.0.0.1:5000/#/experiments/590389134111182764/runs/26be667e374b43879d6b1d638d178fa7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/590389134111182764
🏃 View run CoClustering at: http://127.0.0.1:5000/#/experiments/590389134111182764/runs/6918486cff5e42798caef9e458523acd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/590389134111182764


In [12]:
# Final hyper-parameters: the algorithm's fixed settings from the config,
# overridden/extended by the best trial found in the study.
hyperparams = (
    config["optimization"]["retrieval"][ALGORITHM]["fixed"]
    | study.best_trial.params
)

# Refit on train + validation. The merged frame gets its own name instead of
# overwriting `df_train`: rebinding `df_train` would make this cell
# non-idempotent (each re-run would append `df_valid` again) and would leak
# validation rows into the training set if the tuning cell were re-run.
df_train_full = pd.concat([df_train, df_valid]).reset_index(drop=True)
clf = Retrieval(algorithm=ALGORITHM, params=hyperparams)
clf.fit(trainset=df_train_full)

# Test-set evaluation; the last expression is left bare so the notebook
# renders whatever `scorer.fit` returns (the train/test metrics table).
scorer = Evaluation(clf=clf)
scorer.fit(train=df_train_full, test=df_test)

Unnamed: 0_level_0,rmse,mse,mae,fcp
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train,0.63957,0.40905,0.50967,0.75416
test,0.69051,0.47681,0.54682,0.61261
