In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import optuna

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.cv_iterator import leave_last_k
from src.data.features import feature_engineering
from src.data.utils import build_rank_input
from src.models.tuner import BayesianSearch
from src.models.evaluator import evaluation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parse the project configuration file into `config`.
# safe_load only builds plain Python objects (it uses yaml.SafeLoader).
with open('../config.yml', 'r') as file:
    config = yaml.safe_load(file)
del file  # drop the already-closed handle from the notebook namespace

In [3]:
# Load the tables described by the `data_loader` config section and feed them
# straight into the preparation step; `dfs` is the prepared result
# (later cells read its 'data', 'user' and 'item' entries).
dfs = prepare_data(dataframes=load_data(config=config['data_loader']))

In [4]:
# Two successive splits driven by the same `optimization` config:
#   1) all interactions -> (df_train, df_test)
#   2) remaining train  -> (df_train, df_valid)
# NOTE(review): presumably leave_last_k holds out each user's last k
# interactions — confirm in src.models.cv_iterator.
df_train, df_test = leave_last_k(
    df=dfs['data'],
    config=config['optimization'],
)
df_train, df_valid = leave_last_k(
    df=df_train,
    config=config['optimization'],
)

In [5]:
# Build the user/item features. Only the training interactions (df_train) are
# passed as 'data'; the held-out valid/test rows are not handed to the
# feature step.
user_item_features = feature_engineering(
    dataframes=dict(user=dfs['user'], item=dfs['item'], data=df_train),
)

In [6]:
import pandas as pd
from src.models.candidate import candidate_generation

# Per-user negative-sampling budget (previously the inline `20+3*3+3*3`).
N_TRAIN_NEG = 20     # candidates left in the training pool after both carve-outs
N_EVAL_NEG = 3 * 3   # candidates moved to each of test and valid
                     # (presumably 3 per held-out positive — TODO confirm)
SAMPLING_SEED = 42   # fixed seed: without it the per-user draws below change on every run

# One candidate pool per user, large enough for train + valid + test.
# NOTE(review): presumably these are items the user has not interacted with in
# df_train; any randomness inside candidate_generation is not covered by
# SAMPLING_SEED — confirm in src.models.candidate.
df_train_neg = candidate_generation(
    df_train,
    n=N_TRAIN_NEG + 2 * N_EVAL_NEG,
    positive_sampling=False,
)

# Carve the test negatives out of the pool, then the valid negatives out of
# what is left, so the three sets do not share rows.
# NOTE(review): drop(<index>) assumes the pool has a unique index — confirm.
df_test_neg = df_train_neg.groupby(by=['user_id']).sample(
    n=N_EVAL_NEG, random_state=SAMPLING_SEED
)
df_train_neg = df_train_neg.drop(df_test_neg.index)
df_valid_neg = df_train_neg.groupby(by=['user_id']).sample(
    n=N_EVAL_NEG, random_state=SAMPLING_SEED
)
df_train_neg = df_train_neg.drop(df_valid_neg.index)

# Append the negatives to the first three columns of each positive frame
# (presumably user_id, item_id, rating — confirm the column order).
df_train = pd.concat([df_train.iloc[:, :3], df_train_neg], ignore_index=True)
df_valid = pd.concat([df_valid.iloc[:, :3], df_valid_neg], ignore_index=True)
df_test = pd.concat([df_test.iloc[:, :3], df_test_neg], ignore_index=True)
del df_train_neg, df_valid_neg, df_test_neg

# From here on df_train / df_valid are no longer DataFrames: they hold the
# output of build_rank_input (indexed with 'X', 'y' and 'group' in later cells).
# df_test stays a DataFrame and is consumed by the final-training cell.
df_train, df_valid = [
    build_rank_input(ratings=df, features=user_item_features)
    for df in (df_train, df_valid)
]

In [7]:
# perform bayesian search
N_TRIALS = 20      # number of hyper-parameter configurations to evaluate
SEARCH_SEED = 42   # seeds the sampler so the proposed configurations are repeatable

searcher = BayesianSearch(config['optimization']['hyper_params'], algorithm='XGBRanker')

def objective(trial) -> float:
    """Score one Optuna trial.

    Delegates to ``searcher.fit`` with the tuning train/valid inputs and
    returns its value, which the study below maximises.
    """
    return searcher.fit(df_train, df_valid, trial)

# TPESampler is what create_study uses by default for a single-objective
# study; passing it explicitly only adds the seed.
# NOTE(review): randomness inside searcher.fit (e.g. model training) is not
# controlled by this seed — confirm in src.models.tuner.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

[I 2025-04-22 15:41:04,354] A new study created in memory with name: no-name-9f8a788a-c0ff-43c0-b3df-a7e6f810d466
[I 2025-04-22 15:41:06,591] Trial 0 finished with value: 0.9764391725263232 and parameters: {'learning_rate': 0.051883976570897286, 'gamma': 3.6383090706163106, 'max_depth': 6, 'subsample': 0.7901315430884503, 'n_estimators': 150}. Best is trial 0 with value: 0.9764391725263232.
[I 2025-04-22 15:41:09,413] Trial 1 finished with value: 0.9767517045258994 and parameters: {'learning_rate': 0.18470238039771686, 'gamma': 0.10808579587180123, 'max_depth': 7, 'subsample': 0.7585708610689534, 'n_estimators': 172}. Best is trial 1 with value: 0.9767517045258994.
[I 2025-04-22 15:41:12,317] Trial 2 finished with value: 0.9768439898977765 and parameters: {'learning_rate': 0.2589001910184663, 'gamma': 4.050699547696411, 'max_depth': 8, 'subsample': 0.8523260747703804, 'n_estimators': 262}. Best is trial 2 with value: 0.9768439898977765.
[I 2025-04-22 15:41:14,154] Trial 3 finished with

In [8]:
# Show the hyper-parameters of the best trial found by the study
# (header and dict on separate lines).
print("Best trial:", study.best_trial.params, sep="\n")

Best trial:
{'learning_rate': 0.10133704221006602, 'gamma': 1.4777257537910544, 'max_depth': 3, 'subsample': 0.5018699629861345, 'n_estimators': 299}


In [9]:
import pandas as pd

# Per-user negative-sampling budget (previously the inline `20+3*3+3*3`).
N_TRAIN_NEG = 20     # base number of training candidates per user
N_EVAL_NEG = 3 * 3   # candidates moved to the test set
                     # (presumably 3 per held-out positive — TODO confirm)
SAMPLING_SEED = 42   # fixed seed: without it the per-user draw below changes on every run

# Anti-join: rebuild the training set as every interaction in dfs['data']
# that has no matching (user_id, item_id, rating) row in df_test.
# `indicator=True` adds a `_merge` column recording where each row came from.
df_train = dfs['data'].merge(
    df_test,
    on=['user_id', 'item_id', 'rating'],
    how='left',
    indicator=True,
)
# keep only rows found in dfs['data'] but not in df_test
df_train = df_train[df_train['_merge'] == 'left_only'].drop(columns=['_merge'])

# Recompute the features on the enlarged training set.
user_item_features = feature_engineering(
    dataframes={'user': dfs['user'], 'item': dfs['item'], 'data': df_train}
)

# add negative sampling
# NOTE(review): the pool size still reserves two evaluation carve-outs, but
# only the test one is taken here, so N_TRAIN_NEG + N_EVAL_NEG candidates per
# user stay in training (vs N_TRAIN_NEG during tuning) — confirm this is intended.
df_train_neg = candidate_generation(
    df_train,
    n=N_TRAIN_NEG + 2 * N_EVAL_NEG,
    positive_sampling=False,
)
df_test_neg = df_train_neg.groupby(by=['user_id']).sample(
    n=N_EVAL_NEG, random_state=SAMPLING_SEED
)
df_train_neg = df_train_neg.drop(df_test_neg.index)

df_train = pd.concat([df_train, df_train_neg], ignore_index=True)
# NOTE(review): when the cells are run in order, df_test already contains the
# negatives appended in cell In [6]; this adds a second batch on top of them.
# Confirm that evaluating on both batches is intended.
df_test = pd.concat([df_test, df_test_neg], ignore_index=True)

# Only the first three columns are passed on as ratings. After this line
# df_train / df_test hold the output of build_rank_input (indexed with
# 'X', 'y' and 'group' in the next cell), so this cell cannot be re-run
# without re-running the earlier ones.
df_train, df_test = [
    build_rank_input(ratings=df.iloc[:, :3], features=user_item_features)
    for df in (df_train, df_test)
]

  df_train = dfs['data'].merge(


In [16]:
from sklearn.metrics import ndcg_score
from xgboost import XGBRanker
import numpy as np

# Refit a ranker on the final training input using the best hyper-parameters
# found by the study. Labels are cast to int before fitting, and `group` is
# forwarded to fit() as the ranker's grouping information.
clf = XGBRanker(**study.best_trial.params)
clf.fit(
    df_train['X'],
    df_train['y'].astype(int),
    group=df_train['group'],
    verbose=False,
)

# Score the held-out test input and report the evaluation metric.
preds = clf.predict(df_test['X'])

print(evaluation(df_test['y'], preds, df_test['group']))

0.9802685136847599


In [11]:
from src.models.baseline import baseline_model

# Baseline for comparison: `result` carries the true 'rating' and the
# baseline's 'est_rating' for each row.
result = baseline_model(dataframes=dfs, n=10)

# Number of rows per user, in groupby (sorted user_id) order.
# NOTE(review): this lines up with the rows only if `result` is itself ordered
# by user_id — confirm against baseline_model / evaluation.
group = result.groupby('user_id').size().tolist()

print(
    evaluation(
        result['rating'],
        result['est_rating'],
        group,
    )
)
del group

0.9168812846621082


In [12]:
# Re-score the baseline's rows with the trained ranker: drop the baseline's
# own estimate, rebuild the ranking input, predict, and evaluate.
df_baseline = build_rank_input(
    ratings=result.drop(columns=['est_rating']),
    features=user_item_features,
)

preds = clf.predict(df_baseline['X'])
print(evaluation(df_baseline['y'], preds, df_baseline['group']))

0.9493807341800834
