In [1]:
import sys
import os

# add the parent of the working directory to the import path; presumably this is
# where the `src` package imported below lives (verify against the repo layout)
sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import optuna
import numpy as np

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.features.features import feature_engineering
from src.models.utils import leave_last_k, set_global_seed
from src.models.tuner import BayesianSearch
from src.models.retrieval import Retrieval
from src.models.evaluator import Evaluation

In [2]:
# parse the YAML configuration used throughout this notebook
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)
del config_file  # keep the closed handle out of the notebook namespace

# seed the run from the configured value for reproducibility
set_global_seed(seed=config["general"]["seed"])

**Data Preparation & Train/Test Split**

- Load and transform the 3 datasets
- Split the whole set into train, validation and test sets by segmenting it temporally

In [3]:
# load the datasets and run the configured preparation step on them;
# `dfs` only ever holds the prepared frames
dfs = prepare_data(
    dataframes=load_data(config=config['data_loader']),
    config=config["data_preparation"],
)

In [4]:
# hold-out split in two passes with the same config:
#   1) full data   -> train (+ validation) / test
#   2) that train  -> train / validation
df_train, df_test = leave_last_k(
    df=dfs['data'],
    config=config['optimization'],
)
df_train, df_valid = leave_last_k(
    df=df_train,
    config=config['optimization'],
)

**Feature Engineering & Optimization**

- Add negative samples, i.e., items likely to be disliked by user
- Feature Engineering - creates cross user-item features for ranking model
- Search for the hyper-parameters that optimize the scoring metric of the given algorithm on the validation set

In [5]:
# build the model features from the user and item tables plus the training
# interactions only (validation and test rows are not passed in)
features = feature_engineering(
    dataframes={
        'user': dfs['user'],
        'item': dfs['item'],
        'data': df_train,
    }
)

In [6]:
# user and item id mappings (raw id -> integer code)
mapping = {
    # user codes are read from the existing `user_id_encoded` column; zipping
    # the two columns avoids going through a mixed-dtype 2-D array
    "user": dict(zip(dfs["user"]["user_id"], dfs["user"]["user_id_encoded"])),
    # item codes are 0..n-1 in order of first appearance in the training set
    "item": {iid: i for i, iid in enumerate(df_train["item_id"].unique())},
}

# attach the codes to the feature tables and order the rows by code.
# `.assign` builds new frames, so the frames returned by `feature_engineering`
# (which may share data with `dfs`) are not modified in place.
# Items that never occur in training get a NaN code and sort last.
features["user"] = (
    features["user"]
    .assign(user_id_encoded=features["user"]["user_id"].map(mapping["user"]))
    .sort_values("user_id_encoded")
)
features["item"] = (
    features["item"]
    .assign(item_id_encoded=features["item"]["item_id"].map(mapping["item"]))
    .sort_values("item_id_encoded")
)

df_train = df_train.assign(item_id_encoded=df_train["item_id"].map(mapping["item"]))

# filter out validation rows whose item does not exist in training.
# The NaN check is restricted to the encoded column so that missing values in
# unrelated columns do not silently remove validation rows.
df_valid = (
    df_valid
    .assign(item_id_encoded=df_valid["item_id"].map(mapping["item"]))
    .dropna(subset=["item_id_encoded"])
    .astype({"item_id_encoded": int})
)

del mapping

In [7]:
# hyper-parameter search for the TwoTower retrieval model
tuner = BayesianSearch(config["optimization"], algorithm="TwoTower", method="retrieval")


def objective(trial) -> float:
    """Score a single Optuna trial by delegating to ``tuner.fit``.

    Reads the notebook-level ``df_train``, ``df_valid`` and ``features`` at
    call time, so those names must exist whenever the study is optimized.
    """
    return tuner.fit(
        {"X": df_train},
        {"X": df_valid},
        trial,
        features=features,
        epochs=5,
    )


# study with a seeded TPE sampler; optimisation direction comes from the config
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    direction=config["optimization"]["retrieval"]["direction"],
)
study.optimize(objective, n_trials=config["optimization"]["n_trials"])

# drop the tuning-only objects; note that `objective` can no longer run after this
del df_valid, features

[I 2025-08-02 20:00:59,033] A new study created in memory with name: no-name-bc3ef049-415a-45c9-8d4d-d6293e5b6ee7
[I 2025-08-02 20:02:15,749] Trial 0 finished with value: 0.021 and parameters: {'embedding_dim': 68, 'num_user_layers': 8, 'num_item_layers': 7, 'lr': 0.006026718993550663, 'user_layer_0': 47, 'user_layer_1': 47, 'user_layer_2': 37, 'user_layer_3': 116, 'user_layer_4': 90, 'user_layer_5': 100, 'user_layer_6': 33, 'user_layer_7': 126, 'user_dropout_0': 0.41622132040021087, 'user_dropout_1': 0.10616955533913808, 'user_dropout_2': 0.09091248360355031, 'user_dropout_3': 0.09170225492671691, 'user_dropout_4': 0.15212112147976886, 'user_dropout_5': 0.2623782158161189, 'user_dropout_6': 0.21597250932105788, 'user_dropout_7': 0.14561457009902096, 'item_layer_0': 91, 'item_layer_1': 45, 'item_layer_2': 60, 'item_layer_3': 67, 'item_layer_4': 76, 'item_layer_5': 108, 'item_layer_6': 51, 'item_dropout_0': 0.2571172192068058, 'item_dropout_1': 0.29620728443102123, 'item_dropout_2': 0.0

**Evaluation & Logging**

- Recover full training set to evaluate results on test set
- Perform feature engineering
- Log run's information through mlflow

In [8]:
# anti test-set: train and validation rows together, i.e. every row of the
# full data with no match in df_test (left join on all of df_test's columns,
# then keep the rows flagged as present on the left side only)
df_train = (
    dfs["data"]
    .merge(df_test, how="left", on=df_test.columns.to_list(), indicator=True)
    .loc[lambda merged: merged["_merge"] == "left_only"]
    .drop(columns=["_merge"])
)

In [9]:
# rebuild the model features, this time from the recovered full training set
# (train + validation interactions)
features = feature_engineering(
    dataframes={
        'user': dfs['user'],
        'item': dfs['item'],
        'data': df_train,
    }
)

In [10]:
# user and item id mappings (raw id -> integer code)
mapping = {
    # user codes are read from the existing `user_id_encoded` column; zipping
    # the two columns avoids going through a mixed-dtype 2-D array
    "user": dict(zip(dfs["user"]["user_id"], dfs["user"]["user_id_encoded"])),
    # item codes are 0..n-1 in order of first appearance in the training set
    "item": {iid: i for i, iid in enumerate(df_train["item_id"].unique())},
}

# attach the codes to the feature tables and order the rows by code.
# `.assign` builds new frames, so the frames returned by `feature_engineering`
# (which may share data with `dfs`) are not modified in place.
# Items that never occur in training get a NaN code and sort last.
features["user"] = (
    features["user"]
    .assign(user_id_encoded=features["user"]["user_id"].map(mapping["user"]))
    .sort_values("user_id_encoded")
)
features["item"] = (
    features["item"]
    .assign(item_id_encoded=features["item"]["item_id"].map(mapping["item"]))
    .sort_values("item_id_encoded")
)

df_train = df_train.assign(item_id_encoded=df_train["item_id"].map(mapping["item"]))

# filter out test rows whose item does not exist in training.
# The NaN check is restricted to the encoded column so that missing values in
# unrelated columns do not silently remove test rows.
df_test = (
    df_test
    .assign(item_id_encoded=df_test["item_id"].map(mapping["item"]))
    .dropna(subset=["item_id_encoded"])
    .astype({"item_id_encoded": int})
)

del mapping

In [11]:
# best hyper-parameters found by the study, looked up once
best_params = study.best_trial.params
n_user_layers = best_params["num_user_layers"]
n_item_layers = best_params["num_item_layers"]

# per-layer settings are stored as flat `<tower>_layer_<i>` / `<tower>_dropout_<i>`
# entries; gather them back into one list per tower
hyperparams = {
    "user_layers": [best_params[f"user_layer_{i}"] for i in range(n_user_layers)],
    "user_dropout": [best_params[f"user_dropout_{i}"] for i in range(n_user_layers)],
    "item_layers": [best_params[f"item_layer_{i}"] for i in range(n_item_layers)],
    "item_dropout": [best_params[f"item_dropout_{i}"] for i in range(n_item_layers)],
    "embedding_dim": best_params["embedding_dim"],
    "lr": best_params["lr"],
    # hard-coded, not tuned; presumably the user/item feature widths (TODO confirm)
    "user_dim": 64,
    "item_dim": 25,
}

# fit the model on the whole training set and store it in the tuner's artifacts
clf = Retrieval(algorithm="TwoTower", params=hyperparams)
clf.fit(trainset=df_train, features=features, epochs=20)
tuner.artifacts["models"][-1] = clf

# evaluate on the test set; the last expression displays the metrics table
scorer = Evaluation(clf=clf)
tuner.artifacts["metrics_test"] = scorer.fit(
    train=df_train,
    test=df_test,
    k=10,
    features=features,
)
tuner.artifacts["metrics_test"]

Unnamed: 0_level_0,recall@10,precision@10,hit_rate@10
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
train,0.01423,0.15101,0.61824
test,0.01166,0.00583,0.0562
