In [1]:
import sys
import os

sys.path.append(os.path.abspath('..\..'))

import yaml
import pandas as pd
import optuna

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.baseline import baseline_model
from src.models.cv_iterator import leave_last_k
from src.features.features import feature_engineering
from src.models.retrieval import candidate_generation
from src.features.utils import build_rank_input
from src.models.tuner import BayesianSearch
from src.models.evaluator import evaluation

In [2]:
# read config
with open('..\config.yml', 'r') as file:
    config=yaml.load(file, Loader= yaml.SafeLoader)
del file

In [3]:
# load and prepare data
dfs = load_data(config=config['data_loader'])
dfs = prepare_data(dataframes=dfs)

In [4]:
# baseline model accuracy
df_baseline = baseline_model(dataframes=dfs, n=10)
group = df_baseline.groupby('user_id').size().to_list()

print(evaluation(df_baseline['rating'], df_baseline['est_rating'], group))
del group

0.9168812846621082


In [5]:
# train-test split
df_train, df_test = leave_last_k(df=dfs['data'], config=config['optimization'])
df_train, df_valid = leave_last_k(df=df_train, config=config['optimization'])

In [6]:
# generate candidates and create features
candidates = candidate_generation(df_train, config['optimization']['retrieval'])
df_train = pd.concat([df_train.iloc[:,:3], candidates['positive'], candidates['negative']], ignore_index=True)
user_item_features = feature_engineering(
    dataframes={'user': dfs['user'], 'item': dfs['item'], 'data': df_train}
    )

df_train, df_valid = [
    build_rank_input(ratings=df.iloc[:,:3], features=user_item_features)
    for df in (df_train, df_valid)
    ]

del candidates, user_item_features

In [7]:
# perform bayesian search
searcher = BayesianSearch(config['optimization']['hyper_params'], algorithm='XGBRanker')

def objective(trial) -> float:
    return searcher.fit(df_train, df_valid, trial)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print("Best trial:\n", study.best_trial.params)
del searcher

[I 2025-04-25 19:08:14,691] A new study created in memory with name: no-name-a7b09356-a96a-4b2e-83ec-a21294b8e37d
[I 2025-04-25 19:08:19,482] Trial 0 finished with value: 0.9448879323919251 and parameters: {'learning_rate': 0.16730167981398847, 'gamma': 2.0297079808192398, 'max_depth': 13, 'subsample': 0.7719818423957301, 'n_estimators': 68}. Best is trial 0 with value: 0.9448879323919251.
[I 2025-04-25 19:08:35,941] Trial 1 finished with value: 0.946035927296629 and parameters: {'learning_rate': 0.25142009280712985, 'gamma': 7.873590441806872, 'max_depth': 15, 'subsample': 0.6654435151508664, 'n_estimators': 339}. Best is trial 1 with value: 0.946035927296629.
[I 2025-04-25 19:08:39,470] Trial 2 finished with value: 0.9465518568658565 and parameters: {'learning_rate': 0.270512714099771, 'gamma': 9.52874441821011, 'max_depth': 11, 'subsample': 0.5533369388362792, 'n_estimators': 56}. Best is trial 2 with value: 0.9465518568658565.
[I 2025-04-25 19:08:52,040] Trial 3 finished with value

Best trial:
 {'learning_rate': 0.0738917974562371, 'gamma': 4.853881784619486, 'max_depth': 3, 'subsample': 0.8000361468797555, 'n_estimators': 251}


In [8]:
# use indicator to know the origin
df_train = dfs['data'].merge(
    df_test
    , on=['user_id', 'item_id', 'rating'], how='left'
    , indicator=True
    )
# keep only rows that are present in df1 but not in df2
df_train = df_train[df_train['_merge'] == 'left_only'].drop(columns=['_merge'])

# create features
user_item_features = feature_engineering(
    dataframes={'user': dfs['user'], 'item': dfs['item'], 'data': df_train}
    )

# generate candidates
candidates = candidate_generation(df_train, config['optimization']['retrieval'])
df_train_ = pd.concat([df_train.iloc[:,:3], candidates['positive'], candidates['negative']], ignore_index=True)

df_train_, df_test_ = [
    build_rank_input(ratings=df.iloc[:,:3], features=user_item_features) for df in (df_train_, df_test)
    ]

del candidates, user_item_features

In [9]:
from xgboost import XGBRanker

clf = XGBRanker(**study.best_trial.params)
clf.fit(
    df_train_['X'], df_train_['y'].astype(int)
    , group=df_train_['group']
    , verbose=False
    )

preds = clf.predict(df_test_['X'])

print(evaluation(df_test_['y'], preds, df_test_['group']))

0.9528064392223665


In [10]:
def calculate_metrics(top_items, data):
    # Coverage: proportion of unique items recommended
    recommended_items = top_items['item_id'].unique()
    total_items = data['item_id'].unique()
    coverage_score = len(recommended_items) / len(total_items)
    
    # Novelty: Based on item popularity (inverse)
    item_popularity = data.groupby('item_id').size()
    item_popularity = item_popularity[item_popularity > 0]
    item_popularity = 1 - item_popularity / item_popularity.sum()  # Normalize
    novelty_score = top_items['item_id'].map(item_popularity).mean()

    return coverage_score, novelty_score

# Calculate metrics for the recommendations
coverage_score, novelty_score = calculate_metrics(df_test.iloc[:, :2], df_train.iloc[:, :3])
# Print out the results
print(f'Coverage: {coverage_score}')
print(f'Novelty: {novelty_score}')

Coverage: 0.6463195691202872
Novelty: 0.9984584266355963


In [11]:
user_item_features = feature_engineering(dataframes={
    'user': dfs['user'], 'item': dfs['item'], 'data': dfs['data']
    })

df_baseline = build_rank_input(ratings=df_baseline.drop(columns=['est_rating']), features=user_item_features)

preds = clf.predict(df_baseline['X'])
print(evaluation(df_baseline['y'], preds, df_baseline['group']))

0.9474741199634648
