In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import optuna
from datetime import datetime
import mlflow

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.features.features import feature_engineering
from src.features.utils import build_rank_input
from src.models.ranker import Ranker
from src.models.retrieval import Retrieval
from src.models.co_visit import CoVisit
from src.models.baseline import popular_items

from src.models.tracker import launch_mlflow, load_params, get_or_create_run, log_artifact, log_model
from src.models.utils import set_global_seed

In [2]:
# Parse the YAML configuration used by the following cells.
with open('config.yml', 'r') as config_stream:
    config = yaml.safe_load(config_stream)
del config_stream  # keep the closed handle out of the notebook namespace

# Seed everything from the configured value for reproducibility.
set_global_seed(seed=config["general"]["seed"])

# Set up experiment tracking (MLflow).
launch_mlflow()

# MLflow experiment/run names and the top-k used for popularity-based negatives.
experiment_name, run_name_1, run_name_2 = "Training", "CandidateGeneration", "Ranking"
top_k_negative = 15

**Data Preparation**

- Load the 3 datasets and apply the configured preparation step

In [3]:
# Load the datasets and pass them straight through the preparation step;
# both stages take their settings from the matching config section.
dfs = prepare_data(
    dataframes=load_data(config=config['data_loader']),
    config=config["data_preparation"],
)

**Train**

- Train the candidate-generation models (SVD, CoClustering, KNNWithMeans)
- Train the ranker (XGBRanker) on the ratings plus negative samples

In [4]:
for algorithm in ("SVD", "CoClustering", "KNNWithMeans"):
    # best hyper-parameters previously stored under the "Retrieval" experiment
    params = load_params(experiment_name="Retrieval", parent_run_name=algorithm)

    if algorithm == "KNNWithMeans":
        # move the similarity settings out of the flat params and nest them
        # under a single "sim_options" entry
        sim_options = {
            key: params.pop(key)
            for key in ("name", "user_based", "min_support")
        }
        params["sim_options"] = sim_options

    # fit the retrieval model on the prepared ratings
    clf = Retrieval(algorithm=algorithm, params=params)
    clf.fit(trainset=dfs["data"])

    # record params and the fitted model in a child run of the parent run
    with get_or_create_run(run_name=run_name_1, experiment_name=experiment_name) as parent, \
            mlflow.start_run(run_name=algorithm, nested=True):
        log_artifact(params, "params")
        log_model(
            artifact=clf,
            artifact_name=f"{algorithm}_instance",
            input_sample=dfs["data"].head(),
        )

🏃 View run SVD at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/36930036fc744053935c683c6de88d4c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run CandidateGeneration at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/d52c64ca885043a88510302b9592c3d8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run CoClustering at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/14e990cfb8c441679b579f5522e2e4da
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run CandidateGeneration at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/d52c64ca885043a88510302b9592c3d8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run KNNWithMeans at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/555dfa8062be42f99c928c6e4987b93b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 Vie

In [5]:
# Negative samples come from two sources: the top-k popular items and the
# CoVisit model run with methods=["negative"].
neg_sample_1 = popular_items(ui_matrix=dfs["data"], top_k=top_k_negative)
neg_sample_2 = CoVisit(methods=["negative"]).fit(ui_matrix=dfs["data"])

# Every negative sample is labelled with the first key of the configured
# rating conversion.
# NOTE(review): presumably that first key is the lowest rating - confirm in config.yml.
negative_rating = next(iter(config["data_preparation"]["rating_conversion"]))

# Build the frame in one chain: .assign() returns a new DataFrame, so the
# rating column is never written into the result of a column selection
# (the pattern that raises pandas' SettingWithCopyWarning).
neg_sample = (
    pd.concat([neg_sample_1, neg_sample_2], ignore_index=True)
    .loc[:, ["user_id", "item_id"]]
    .assign(rating=negative_rating)
)

# free the intermediates; only neg_sample is needed downstream
del neg_sample_1, neg_sample_2, negative_rating

In [6]:
# Features consumed by the ranking model.
user_item_features = feature_engineering(dataframes=dfs)

# Stack the ratings with the negative samples, then keep only the first
# three columns when building the ranker input.
ratings_with_negatives = pd.concat([dfs["data"], neg_sample], ignore_index=True)
df = build_rank_input(
    ratings=ratings_with_negatives.iloc[:, :3],
    features=user_item_features,
)

# drop the intermediates that are no longer needed
del neg_sample, user_item_features, ratings_with_negatives

In [7]:
for algorithm in ("XGBRanker",):
    # best hyper-parameters previously stored under the "Ranker" experiment
    params = load_params(experiment_name="Ranker", parent_run_name=algorithm)

    # fit the ranker on the X / y / group parts of the rank input
    clf = Ranker(algorithm=algorithm, params=params)
    clf.fit(X=df["X"], y=df["y"], group=df["group"])

    # record params and the fitted model in a child run of the parent run
    with get_or_create_run(run_name=run_name_2, experiment_name=experiment_name) as parent, \
            mlflow.start_run(run_name=algorithm, nested=True):
        log_artifact(params, "params")
        log_model(
            artifact=clf,
            artifact_name=f"{algorithm}_instance",
            input_sample=df["X"].head(),
        )

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run XGBRanker at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/30c73adf5a704b8d98ac82517408fef1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
🏃 View run Ranking at: http://127.0.0.1:5000/#/experiments/542286196249503610/runs/d705080ceb0c44a3b7b16b589992fd9b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/542286196249503610
