In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import optuna
from datetime import datetime
import mlflow

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.features.features import feature_engineering
from src.features.utils import build_rank_input
from src.models.ranker import Ranker
from src.models.retrieval import Retrieval
from src.models.co_visit import CoVisit
from src.models.baseline import popular_items
from src.models.tracker import launch_mlflow, Logging, load_params, get_run
from src.models.utils import set_global_seed

**Config & Data Preparation**

- Read the project configuration from `config.yml`, set the global seed, and launch MLflow experiment tracking
- Load and transform the 3 datasets

In [2]:
# Read the project configuration. The encoding is pinned to UTF-8 so the file
# is decoded the same way on every platform; a bare open() falls back to the
# locale's preferred encoding, which is not UTF-8 everywhere.
with open('config.yml', 'r', encoding='utf-8') as file:
    # safe_load is shorthand for yaml.load(file, Loader=yaml.SafeLoader)
    config = yaml.safe_load(file)
del file  # keep the closed handle out of the notebook namespace

# ensure reproducibility: the seed is taken from config["general"]["seed"]
set_global_seed(seed=config["general"]["seed"])

# set experiment tracking
launch_mlflow()

# set params
experiment_name = "Training"  # experiment name handed to Logging(...) when runs are logged
negative_sample_k = 15  # `k` argument passed to popular_items for negative sampling

In [3]:
# Load the dataframes and pass them straight through the preparation step;
# each stage reads its own section of the project config.
dfs = prepare_data(
    dataframes=load_data(config=config['data_loader']),
    config=config["data_preparation"],
)

**Train**

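- Train the candidate-generation (retrieval) models: SVD, CoClustering and KNNWithMeans
- Build negative samples and ranking features, then train the ranker (XGBRanker)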

In [4]:
# Candidate-generation algorithms, trained one after another in this order.
retrieval_algorithms = ("SVD", "CoClustering", "KNNWithMeans")

for algorithm in retrieval_algorithms:
    # Best hyper-parameters are looked up under the "Retrieval" experiment,
    # using the algorithm name as the parent run name.
    params = load_params(
        experiment_name="Retrieval",
        parent_run_name=algorithm,
    )

    # Fit the retrieval model on dfs["data"].
    clf = Retrieval(algorithm=algorithm, params=params)
    clf.fit(trainset=dfs["data"])

    # Log the fitted model as a run named after the algorithm.
    # NOTE: `logging` is the project's Logging tracker, not the stdlib module.
    logging = Logging(
        experiment_name=experiment_name,
        run_name=algorithm,
    )
    logging.log_run(clf=clf)

🏃 View run 08JUL2025_124005 at: http://127.0.0.1:5000/#/experiments/842522629086170842/runs/2857aef39c5b4fd2bc59da160d6b3e2a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/842522629086170842
🏃 View run SVD at: http://127.0.0.1:5000/#/experiments/842522629086170842/runs/23f1307505354e7e9fd2be9e80ac9048
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/842522629086170842
🏃 View run 08JUL2025_124014 at: http://127.0.0.1:5000/#/experiments/842522629086170842/runs/3cfc0f9e019946259a95c4bf6fa2a24e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/842522629086170842
🏃 View run CoClustering at: http://127.0.0.1:5000/#/experiments/842522629086170842/runs/70a0d60d3b0c4256b4156cb6bf0325f2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/842522629086170842
🏃 View run 08JUL2025_124018 at: http://127.0.0.1:5000/#/experiments/842522629086170842/runs/bd580665dcd74eea993a7d089f1fd648
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/842522629086170842
🏃 View 

In [5]:
# Negative samples come from two sources: popular_items (called with
# k=negative_sample_k) and the CoVisit model run with the "negative" method.
popular_negatives = popular_items(ui_matrix=dfs["data"], k=negative_sample_k)
covisit_negatives = CoVisit(methods=["negative"]).fit(ui_matrix=dfs["data"])

# Stack both sources, keep only the id columns, and give every negative pair
# the first key of the configured rating conversion as its rating.
negatives = pd.concat(
    [popular_negatives, covisit_negatives],
    ignore_index=True,
)[["user_id", "item_id"]]
negatives["rating"] = list(config["data_preparation"]["rating_conversion"])[0]

del popular_negatives, covisit_negatives

# build features for ranking model
rank_features = feature_engineering(dataframes=dfs)

# Append the negatives to dfs["data"], then merge the features onto the first
# three columns of the combined frame.
# NOTE(review): iloc[:, :3] is positional - presumably user_id, item_id and
# rating; confirm against the column order of dfs["data"].
ratings_with_negatives = pd.concat([dfs["data"], negatives], ignore_index=True)
df = build_rank_input(
    ratings=ratings_with_negatives.iloc[:, :3],
    features=rank_features,
)

# Release the intermediates; only `df` is kept for the ranker.
del negatives, rank_features, ratings_with_negatives

In [6]:
# Ranking algorithms to train; only XGBRanker is listed here.
ranker_algorithms = ("XGBRanker",)

for algorithm in ranker_algorithms:
    # Best hyper-parameters are looked up under the "Ranker" experiment,
    # using the algorithm name as the parent run name.
    params = load_params(
        experiment_name="Ranker",
        parent_run_name=algorithm,
    )

    # Fit the ranker on the "X", "y" and "group" entries of `df`.
    clf = Ranker(algorithm=algorithm, params=params)
    clf.fit(X=df["X"], y=df["y"], group=df["group"])

    # Log the model together with the head of its input as a sample.
    # NOTE: `logging` is the project's Logging tracker, not the stdlib module.
    logging = Logging(
        experiment_name=experiment_name,
        run_name=algorithm,
        input_sample=df["X"].head(),
    )
    logging.log_run(clf=clf)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run 08JUL2025_124102 at: http://127.0.0.1:5000/#/experiments/842522629086170842/runs/7e71231fdf524cd5a4c53a01541d6dd0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/842522629086170842
🏃 View run XGBRanker at: http://127.0.0.1:5000/#/experiments/842522629086170842/runs/d8edc4822c7b447d8e8c2389cf194424
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/842522629086170842
