In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import optuna

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.tuner import BayesianSearch
from src.models.retrieval import Retrieval
from src.models.evaluator import Evaluation
from src.models.tracker import launch_mlflow, log_run
from src.models.utils import leave_last_k, set_global_seed

In [2]:
# load the notebook configuration (parsed with PyYAML's safe loader)
with open("config.yml", mode="r") as config_stream:
    config = yaml.safe_load(config_stream)
del config_stream  # keep the closed handle out of the notebook namespace

# seed from the config so repeated runs are reproducible
set_global_seed(seed=config["general"]["seed"])

# initialise experiment tracking (MLflow)
launch_mlflow()

# algorithm tuned and evaluated in the cells below
ALGORITHM = "SVD"

**Data Preparation & Train/Test Split**

- Load and transform the 3 datasets
- Split the whole dataset into train, validation, and test sets by segmenting it temporally

In [3]:
# load the datasets and pass them straight through the preparation step
dfs = prepare_data(
    dataframes=load_data(config=config["data_loader"]),
    config=config["data_preparation"],
)

In [4]:
# two successive hold-outs with the same split settings:
# first the test set is taken from the full data, then the validation set
# is taken from what remains as training data
split_config = config["optimization"]
df_train, df_test = leave_last_k(df=dfs["data"], config=split_config)
df_train, df_valid = leave_last_k(df=df_train, config=split_config)

**Optimization & Evaluation**

- Hyper-parameters - search for the hyper-parameters that optimize the scoring metric for the given algorithm on the validation set
- Evaluation - retrieve the best hyper-parameters, refit on the full training set (train + validation), and evaluate the results on the test set

In [5]:
# hyper-parameter search for the selected retrieval algorithm
tuner = BayesianSearch(config["optimization"], method="retrieval", algorithm=ALGORITHM)


def objective(trial) -> float:
    """Return the value `tuner.fit` reports for one Optuna trial.

    The study below maximises this value. The train and validation frames
    are read from the notebook namespace and wrapped under the "X" key.
    """
    train_split = {"X": df_train}
    valid_split = {"X": df_valid}
    return tuner.fit(df_train=train_split, df_valid=valid_split, trial=trial)


# seeded TPE sampler so the sequence of sampled trials is repeatable
tpe_sampler = optuna.samplers.TPESampler(seed=config["general"]["seed"])
study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
study.optimize(objective, n_trials=config["optimization"]["n_trials"])

[I 2025-07-07 20:30:23,122] A new study created in memory with name: no-name-429bf8d1-2682-43a5-b654-bf1550ba915e
[I 2025-07-07 20:30:29,652] Trial 0 finished with value: 0.00021 and parameters: {'n_factors': 87, 'n_epochs': 25, 'lr_bu': 0.36626497696389115, 'lr_bi': 0.29973058361432126, 'lr_pu': 0.07885330158077582, 'lr_qi': 0.07884126564776513, 'reg_bu': 0.03846096996241773, 'reg_bi': 0.4344263114297182, 'reg_pu': 0.3045463557541723, 'reg_qi': 0.35695556312006227}. Best is trial 0 with value: 0.00021.
[I 2025-07-07 20:30:35,731] Trial 1 finished with value: 0.00021 and parameters: {'n_factors': 52, 'n_epochs': 25, 'lr_bu': 0.41638887775941047, 'lr_bi': 0.1069572162284598, 'lr_pu': 0.0917306586363432, 'lr_qi': 0.09251885041686347, 'reg_bu': 0.1590786990501735, 'reg_bi': 0.26713065149979653, 'reg_pu': 0.22165305913463673, 'reg_qi': 0.15270227869704053}. Best is trial 0 with value: 0.00021.
[I 2025-07-07 20:30:40,099] Trial 2 finished with value: 0.00191 and parameters: {'n_factors': 11

In [6]:
# best hyper-parameters: the fixed settings from the config merged with the
# values of the best Optuna trial (on a key clash the trial value wins).
# The `|` merge builds a new dict, so the config itself is not modified below.
hyperparams = (
    config["optimization"]["retrieval"][ALGORITHM]["fixed"]
    | study.best_trial.params
)

if ALGORITHM == "KNNWithMeans":
    # move the similarity settings out of the flat dict and nest them under
    # a single `sim_options` entry
    hyperparams["sim_options"] = {
        key: hyperparams.pop(key)
        for key in ("name", "user_based", "min_support")
    }

# fit model on whole training set (train + validation).
# A separate name is used on purpose: rebinding `df_train` here would make this
# cell non-idempotent (every re-run would append the validation rows again) and
# would change the data `objective` sees if the study were optimised afterwards.
df_train_full = pd.concat([df_train, df_valid]).reset_index(drop=True)
clf = Retrieval(algorithm=ALGORITHM, params=hyperparams)
clf.fit(trainset=df_train_full)
# overwrite the last stored model with the one refit on the full training set
tuner.artifacts["models"][-1] = clf

# test set evaluation
scorer = Evaluation(clf=clf)
tuner.artifacts["test_evaluation"] = scorer.fit(train=df_train_full, test=df_test)
tuner.artifacts["test_evaluation"]

Unnamed: 0_level_0,rmse,mse,mae,fcp,recall@10,precision@10,hit_rate@10
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
train,0.71255,0.50773,0.57106,0.66707,-1.0,-1.0,-1.0
test,0.74085,0.54886,0.59128,0.56279,0.00276,0.00138,0.01379


In [7]:
# hand the finished study and the tuner (with its stored artifacts) to the
# tracker under the "Retrieval" experiment name
log_run(
    experiment_name="Retrieval",
    study=study,
    tuner=tuner,
)

🏃 View run 07JUL2025 at: http://127.0.0.1:5000/#/experiments/992590219418744088/runs/21e560493f334d54838246aa9aa22ae5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/992590219418744088
🏃 View run SVD at: http://127.0.0.1:5000/#/experiments/992590219418744088/runs/1bcb11830a5c41caa38086267fd9ca24
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/992590219418744088
