In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import optuna
from datetime import datetime
import mlflow

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.features.features import feature_engineering
from src.features.utils import build_rank_input
from src.models.ranker import Ranker
from src.models.retrieval import Retrieval
from src.models.co_visit import CoVisit
from src.models.baseline import popular_items
from src.models.tracker import launch_mlflow, Logging, load_params, get_run
from src.models.utils import set_global_seed

In [2]:
# Parse the notebook configuration with PyYAML's safe loader.
with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)
del file  # drop the (already closed) handle from the notebook namespace

# Apply the configured seed so reruns are reproducible.
set_global_seed(seed=config["general"]["seed"])

# Start experiment tracking.
launch_mlflow()

# Notebook-level parameters:
#   experiment_name   -- experiment under which the trained models are logged
#   negative_sample_k -- `k` passed to popular_items when drawing negatives
experiment_name, negative_sample_k = "Training", 15

**Data Preparation**

- Load and transform the 3 datasets

In [3]:
# Load the datasets and pass them straight through the preparation step.
dfs = prepare_data(
    dataframes=load_data(config=config['data_loader']),
    config=config["data_preparation"],
)

**Train**

- Train models for candidate generation
- Train ranker

In [4]:
for algorithm in ("SVD", "CoClustering", "KNNWithMeans"):
    # Best hyper-parameters previously logged under the "Retrieval" experiment.
    params = load_params(experiment_name="Retrieval", parent_run_name=algorithm)

    if algorithm == "KNNWithMeans":
        # The similarity settings are stored flat; move them out of `params`
        # and nest them under the single "sim_options" entry.
        params["sim_options"] = {
            key: params.pop(key)
            for key in ("name", "user_based", "min_support")
        }

    # Fit the candidate-generation model on the prepared ratings.
    clf = Retrieval(algorithm=algorithm, params=params)
    clf.fit(trainset=dfs["data"])

    # Record the fitted model as a run named after the algorithm.
    logging = Logging(experiment_name=experiment_name, run_name=algorithm)
    logging.log_run(clf=clf)

🏃 View run 08JUL2025_005007 at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/d69f206c3bfe44d481ed2eff45408d09
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run SVD at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/4142e7e912bc434f8897ad5626a0316a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run 08JUL2025_005017 at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/5ec7d10f966546e9b9d32f0139c944a0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run CoClustering at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/d3685a37c9894eacb73c75a96c4ad66b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run 08JUL2025_005022 at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/b599e58e85aa4517abf5c0f61c1c7893
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View 

In [None]:
# Draw negative samples from two sources:
#   1. popular_items, with k = negative_sample_k
#   2. CoVisit fitted with the "negative" method
neg_sample_1 = popular_items(ui_matrix=dfs["data"], k=negative_sample_k)
neg_sample_2 = CoVisit(methods=["negative"]).fit(ui_matrix=dfs["data"])

# Stack both sources, keep only the (user, item) pair, and label every row with
# the first key of the configured rating_conversion mapping.
# NOTE(review): presumably that first key is the lowest rating -- confirm
# against config.yml.
# The frame is built in one chain with .assign() rather than by adding a column
# to a column-subset afterwards, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run.
neg_sample = (
    pd.concat([neg_sample_1, neg_sample_2], ignore_index=True)
    .loc[:, ["user_id", "item_id"]]
    .assign(rating=next(iter(config["data_preparation"]["rating_conversion"])))
)

del neg_sample_1, neg_sample_2

In [None]:
# Feature table for the ranking model.
user_item_features = feature_engineering(dataframes=dfs)

# Append the negative samples to the ratings, keep the first three columns,
# and hand them to build_rank_input together with the features. The result is
# indexed by "X", "y" and "group" when the ranker is trained.
df = build_rank_input(
    ratings=pd.concat([dfs["data"], neg_sample], ignore_index=True).iloc[:, :3],
    features=user_item_features,
)

del neg_sample, user_item_features

In [None]:
for algorithm in ("XGBRanker",):
    # Best hyper-parameters previously logged under the "Ranker" experiment.
    params = load_params(experiment_name="Ranker", parent_run_name=algorithm)

    # Fit the ranker on the assembled features, labels and groups.
    clf = Ranker(algorithm=algorithm, params=params)
    clf.fit(
        X=df["X"],
        y=df["y"],
        group=df["group"],
    )

    # Record the fitted model, passing the first feature rows as the input sample.
    logging = Logging(
        experiment_name=experiment_name,
        run_name=algorithm,
        input_sample=df["X"].head(),
    )
    logging.log_run(clf=clf)