In [1]:
import sys
import os

# Put the parent of the current working directory on the import path; this must
# run before the `src.*` imports below so that they can be resolved.
sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import joblib

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.baseline import popular_items
from src.models.co_visit import CoVisit
from src.features.features import feature_engineering
from src.features.utils import build_rank_input
from src.models.ranker import Ranker
from src.models.utils import set_global_seed

**Config & Data Preparation**

- Read project configurations and settings
- Load and transform the 3 datasets

In [2]:
# Parse the project configuration with the safe YAML loader.
with open('../main/config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# The paths in the config are prefixed with "../" so that they resolve from
# the directory this notebook runs in.
for section in ("data_loader", "train"):
    config[section]["path"] = "../" + config[section]["path"]

# Keep the notebook namespace free of cell-local helpers.
del config_file, section

# Seed everything up front so repeated runs are reproducible.
set_global_seed(seed=config["general"]["seed"])

In [3]:
# Load the dataframes and pass them straight through the preparation step.
dfs = prepare_data(
    dataframes=load_data(config=config["data_loader"]),
    config=config["data_preparation"],
)

**Train**

- Build negative samples and ranking features, then train and save one ranker per algorithm listed in the config

In [4]:
# Negative samples come from two sources: popular items and the co-visitation
# model run with the "negative" method.
neg_sample_1 = popular_items(ui_matrix=dfs["data"], k=config["train"]["negative_sample"]["popular"])
neg_sample_2 = CoVisit(methods=["negative"]).fit(ui_matrix=dfs["data"])

# Every negative pair is labelled with the first key of the rating-conversion
# mapping (presumably the lowest rating -- TODO confirm against config.yml).
negative_rating = next(iter(config["data_preparation"]["rating_conversion"]))

# Use .assign instead of item assignment on the column-subset frame: it returns
# a new frame and therefore cannot trigger pandas' SettingWithCopyWarning.
neg_sample = (
    pd.concat([neg_sample_1, neg_sample_2], ignore_index=True)
    [["user_id", "item_id"]]
    .assign(rating=negative_rating)
)

del neg_sample_1, neg_sample_2, negative_rating

# build features for ranking model
user_item_features = feature_engineering(dataframes=dfs)

# Append the negative samples to the observed ratings, then merge in the
# features. The intermediate frame gets its own name so that `df` only ever
# refers to the ranker input (indexed below with "X", "y" and "group").
ratings_with_negatives = pd.concat([dfs["data"], neg_sample], ignore_index=True)
df = build_rank_input(ratings=ratings_with_negatives, features=user_item_features)

# Release the intermediates; only `df` is needed for training.
del neg_sample, user_item_features, ratings_with_negatives

In [5]:
# joblib.dump does not create missing directories, so make sure the output
# folder exists before spending time on training.
os.makedirs(config["train"]["path"], exist_ok=True)

# Train and persist one ranker per algorithm configured under train.ranker.
for algorithm, params in config["train"]["ranker"].items():

    # model training
    clf = Ranker(algorithm=algorithm, params=params)
    clf.fit(X=df["X"], y=df["y"], group=df["group"])

    # log model: one <algorithm>.joblib file per trained ranker
    joblib.dump(clf, os.path.join(config["train"]["path"], f"{algorithm}.joblib"))