In [1]:
import sys
import os

# Make the directory two levels above the working directory importable so the
# `src` package imports below resolve. The path is assembled with os.path.join so
# it works on every OS and contains no backslash escape sequences.
sys.path.append(os.path.abspath(os.path.join(os.pardir, os.pardir)))

import yaml
import pandas as pd
import optuna

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.cv_iterator import leave_last_k
from src.features.features import feature_engineering
from src.models.retrieval import candidate_generation
from src.features.utils import build_rank_input
from src.models.tuner import BayesianSearch
from src.models.ranker import Ranker
from src.models.evaluator import evaluation, recs_score

In [2]:
# read config
# The relative path is built with os.path.join so it is portable across operating
# systems and free of backslash escape sequences; the file is decoded as UTF-8
# explicitly instead of depending on the platform's default encoding.
with open(os.path.join(os.pardir, 'config.yml'), 'r', encoding='utf-8') as file:
    # safe_load is shorthand for yaml.load(..., Loader=yaml.SafeLoader)
    config = yaml.safe_load(file)
del file

In [3]:
# load the raw dataframes described by the `data_loader` config section and
# pass them straight through the preparation step
dfs = prepare_data(
    dataframes=load_data(config=config['data_loader'])
)

In [4]:
# train-test split: the first split carves the test set out of the full data,
# the second carves the validation set out of what remains for training
split_config = config['optimization']
df_train, df_test = leave_last_k(df=dfs['data'], config=split_config)
df_train, df_valid = leave_last_k(df=df_train, config=split_config)

**Model Building**
- Candidate Generation: in addition to the existing observations, retrieves items that a user might like or dislike (negative sampling)
- Feature Engineering: creates cross user-item features for the ranking model
- Hyper-parameter Tuning: searches for the hyper-parameters that maximize the evaluation metric

In [5]:
# generate candidates through negative sampling, then stack them under the
# first three columns of the observed training interactions
candidates = candidate_generation(df_train, config['optimization']['retrieval'])
df_train = pd.concat(
    [df_train.iloc[:, :3], *(candidates[kind] for kind in ('positive', 'negative'))],
    ignore_index=True,
)

# the candidate frames are no longer needed once merged into df_train
del candidates

In [6]:
# build features for ranking model from the user, item and training frames
user_item_features = feature_engineering(
    dataframes=dict(user=dfs['user'], item=dfs['item'], data=df_train)
)

# turn both splits into ranker inputs, using only their first three columns
# as ratings and the same feature set for each
df_train, df_valid = [
    build_rank_input(ratings=split.iloc[:, :3], features=user_item_features)
    for split in (df_train, df_valid)
]

# free the feature tables once both inputs are built
del user_item_features

In [7]:
# hyper-parameter tuning through bayesian search
N_TRIALS = 20      # number of Optuna trials to run
SAMPLER_SEED = 42  # fixes the sequence of hyper-parameters the sampler suggests

searcher = BayesianSearch(config['optimization']['hyper_params'], algorithm='XGBRanker')

def objective(trial) -> float:
    """Return the score `searcher.fit` reports for one Optuna trial.

    The searcher is fitted on `df_train` and scored against `df_valid`; the study
    below maximizes the returned value.
    """
    return searcher.fit(df_train, df_valid, trial)

# TPE is already Optuna's default sampler; passing it explicitly only adds a seed,
# so repeated runs propose the same sequence of hyper-parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

print("Best trial:\n", study.best_trial.params)
del searcher

[I 2025-05-12 21:43:31,941] A new study created in memory with name: no-name-46fb3476-52ec-4a58-abd9-651f48347d1c
[I 2025-05-12 21:43:46,903] Trial 0 finished with value: 0.9464911166587041 and parameters: {'learning_rate': 0.2836990601615435, 'gamma': 8.90023881537862, 'max_depth': 13, 'subsample': 0.584260145764142, 'n_estimators': 294}. Best is trial 0 with value: 0.9464911166587041.
[I 2025-05-12 21:43:58,585] Trial 1 finished with value: 0.9468276038883875 and parameters: {'learning_rate': 0.08314398871405618, 'gamma': 6.558256906498968, 'max_depth': 7, 'subsample': 0.6903977001584306, 'n_estimators': 228}. Best is trial 1 with value: 0.9468276038883875.
[I 2025-05-12 21:44:13,813] Trial 2 finished with value: 0.9459845749388813 and parameters: {'learning_rate': 0.4972996872071615, 'gamma': 8.888728848030302, 'max_depth': 14, 'subsample': 0.7614398051061037, 'n_estimators': 298}. Best is trial 1 with value: 0.9468276038883875.
[I 2025-05-12 21:44:25,217] Trial 3 finished with valu

Best trial:
 {'learning_rate': 0.1313766817581975, 'gamma': 2.527344368283234, 'max_depth': 3, 'subsample': 0.6530051385723286, 'n_estimators': 142}


**Evaluation**
- Use the best hyper-parameters to retrain on the combined train and validation sets
- Check the model's ranking quality (NDCG) on the test set

In [8]:
# get anti test-set, i.e., train & validation sets together
# Only the key columns of df_test take part in the anti-join: merging against the
# whole frame would give `_x`/`_y` suffixes to any non-key column the two frames
# share, so the result would no longer have the same columns as dfs['data'].
key_cols = ['user_id', 'item_id', 'rating']
df_train = dfs['data'].merge(
    df_test[key_cols].drop_duplicates()
    , on=key_cols, how='left'
    , indicator=True
    )
# rows flagged 'left_only' exist in the full data but not in the test set
df_train = df_train[df_train['_merge'] == 'left_only'].drop(columns=['_merge'])

In [9]:
# generate candidates for the combined train + validation data and stack them
# under the first three columns of the observed interactions
candidates = candidate_generation(df_train, config['optimization']['retrieval'])
df_train_ = pd.concat(
    [df_train.iloc[:, :3], *(candidates[kind] for kind in ('positive', 'negative'))],
    ignore_index=True,
)

# the candidate frames are no longer needed once merged into df_train_
del candidates

In [10]:
# create features for ranking model from the user, item and augmented training frames
user_item_features = feature_engineering(
    dataframes=dict(user=dfs['user'], item=dfs['item'], data=df_train_)
)

# turn the training and test splits into ranker inputs with the same feature set
df_train_, df_test_ = [
    build_rank_input(ratings=split.iloc[:, :3], features=user_item_features)
    for split in (df_train_, df_test)
]

# free the feature tables once both inputs are built
del user_item_features

In [11]:
# fit the final ranker with the best hyper-parameters found by the study
clf = Ranker(algorithm='XGBRanker', params=study.best_trial.params)
clf.fit(df_train_['X'], df_train_['y'].astype(int), group=df_train_['group'])

# score the held-out test inputs and report the metric
preds = clf.predict(df_test_['X'])
ndcg = evaluation(df_test_['y'], preds, df_test_['group'])

print(f"NDCG: {ndcg}")

# shouldn't be done with test set
# recs_score(df_test.iloc[:, :2], df_train.iloc[:, :3])

NDCG: 0.9527302471389516
