In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import optuna

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.cv_iterator import leave_last_k
from src.data.features import feature_engineering
from src.data.utils import build_rank_input
from src.models.tuner import BayesianSearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read config
with open('../config.yml', 'r') as file:
    config=yaml.load(file, Loader= yaml.SafeLoader)
del file

In [3]:
# load and prepare data
dfs = load_data(config=config['data_loader'])
dfs = prepare_data(dataframes=dfs)

In [4]:
# train-test split
df_train, df_test = leave_last_k(df=dfs['data'], config=config['optimization'])
df_train, df_valid = leave_last_k(df=df_train, config=config['optimization'])

In [5]:
user_item_features = feature_engineering(
    dataframes={'user': dfs['user'], 'item': dfs['item'], 'data': df_train}
    )

In [6]:
import pandas as pd
from src.models.candidate import candidate_generation

df_train_neg = candidate_generation(df_train, n=20+3*3+3*3, positive_sampling=False)
df_test_neg = df_train_neg.groupby(by=['user_id']).sample(n=3*3)
df_train_neg = df_train_neg.drop(df_test_neg.index)
df_valid_neg = df_train_neg.groupby(by=['user_id']).sample(n=3*3)
df_train_neg = df_train_neg.drop(df_valid_neg.index)

df_train = pd.concat([df_train.iloc[:,:3], df_train_neg], ignore_index=True)
df_valid = pd.concat([df_valid.iloc[:,:3], df_valid_neg], ignore_index=True)
df_test = pd.concat([df_test.iloc[:,:3], df_test_neg], ignore_index=True)
del df_train_neg, df_valid_neg, df_test_neg

df_train, df_valid = [
    build_rank_input(ratings=df, features=user_item_features) for df in (df_train, df_valid)
    ]

In [7]:
# perform bayesian search
searcher = BayesianSearch(config['optimization']['hyper_params'], algorithm='XGBRanker')

def objective(trial) -> float:
    return searcher.fit(df_train, df_valid, trial)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2025-04-22 00:51:33,810] A new study created in memory with name: no-name-9536dc90-9b16-4844-836b-0446b4e6ddfa
[I 2025-04-22 00:51:35,238] Trial 0 finished with value: 0.9767882812442432 and parameters: {'learning_rate': 0.23925075205124977, 'gamma': 1.298644718662838, 'max_depth': 6, 'subsample': 0.9783715044617085, 'n_estimators': 85}. Best is trial 0 with value: 0.9767882812442432.
[I 2025-04-22 00:51:37,485] Trial 1 finished with value: 0.9767389830641775 and parameters: {'learning_rate': 0.27165602202944894, 'gamma': 1.4477235320626987, 'max_depth': 5, 'subsample': 0.7825631300226293, 'n_estimators': 186}. Best is trial 0 with value: 0.9767882812442432.
[I 2025-04-22 00:51:39,161] Trial 2 finished with value: 0.9766767452584386 and parameters: {'learning_rate': 0.16940251532502107, 'gamma': 3.8602274868054427, 'max_depth': 5, 'subsample': 0.8838961601313569, 'n_estimators': 111}. Best is trial 0 with value: 0.9767882812442432.
[I 2025-04-22 00:51:43,058] Trial 3 finished with v

In [8]:
print("Best trial:")
print(study.best_trial.params)

Best trial:
{'learning_rate': 0.09166913088710321, 'gamma': 4.72646187820386, 'max_depth': 4, 'subsample': 0.5708465919484484, 'n_estimators': 299}


In [9]:
import pandas as pd

# use indicator to know the origin
df_train = dfs['data'].merge(
    df_test
    , on=['user_id', 'item_id', 'rating'], how='left'
    , indicator=True
    )
# keep only rows that are present in df1 but not in df2
df_train = df_train[df_train['_merge'] == 'left_only'].drop(columns=['_merge'])

user_item_features = feature_engineering(
    dataframes={'user': dfs['user'], 'item': dfs['item'], 'data': df_train}
    )

# add negative sampling
df_train_neg = candidate_generation(df_train, n=20+3*3+3*3, positive_sampling=False)
df_test_neg = df_train_neg.groupby(by=['user_id']).sample(n=3*3)
df_train_neg = df_train_neg.drop(df_test_neg.index)

df_train = pd.concat([df_train, df_train_neg], ignore_index=True)
df_test = pd.concat([df_test, df_test_neg], ignore_index=True)


df_train, df_test = [
    build_rank_input(ratings=df.iloc[:,:3], features=user_item_features) for df in (df_train, df_test)
    ]

  df_train = dfs['data'].merge(


In [10]:
from sklearn.metrics import ndcg_score
from xgboost import XGBRanker
import numpy as np

clf = XGBRanker(**study.best_trial.params)
clf.fit(
    df_train['X'], df_train['y'].astype(int)
    , group=df_train['group']
    , verbose=False
    )

preds = clf.predict(df_test['X'])
offset = 0
ndcgs = []
for group_size in df_test['group']:
    y_true_group = df_test['y'][offset:offset+group_size]
    preds_group = preds[offset:offset+group_size]
    ndcgs.append(ndcg_score([y_true_group], [preds_group]))
    offset += group_size
np.mean(ndcgs)

0.980886431619589

In [11]:
baseline = dfs['data'].merge(
    dfs['item'].drop(columns=['movie_title', 'release_date', 'imdb_url'])
    , how='left', on='item_id'
    )

top_items_by_genre = {}

# genre columns
genre_cols = baseline.columns[4:]

# loop over each genre
for genre in genre_cols:
    # filter ratings for movies that belong to this genre
    genre_ratings = baseline[baseline[genre] == 1]
    
    # Count the number of ratings per item
    top_items = (
        genre_ratings
        .groupby('item_id')
        .size()
        .sort_values(ascending=False)
        .head(5)
    )

    # Save results
    top_items_by_genre[genre] = list(top_items.index)

top_items_by_genre

{'action': [267, 1373],
 'adventure': [50, 181, 300, 121, 174],
 'animation': [50, 181, 174, 117, 172],
 'children': [1, 71, 95, 588, 432],
 'comedy': [1, 151, 423, 132, 71],
 'crime': [294, 1, 204, 151, 173],
 'documentary': [100, 127, 56, 302, 12],
 'drama': [48, 32, 813, 847, 1065],
 'fantasy': [258, 100, 286, 127, 56],
 'film_noir': [423, 411, 472, 72, 755],
 'horror': [302, 89, 654, 484, 657],
 'musical': [288, 183, 234, 185, 200],
 'mystery': [289, 186, 132, 143, 71],
 'romance': [405, 302, 328, 191, 135],
 'sci_fi': [50, 181, 286, 237, 172],
 'thriller': [50, 258, 181, 121, 7],
 'war': [100, 288, 300, 98, 117],
 'western': [50, 181, 286, 121, 172],
 'unknown': [97, 435, 203, 177, 73]}

In [12]:
user_genre_counts = baseline.groupby('user_id')[genre_cols].sum()

# Get the genre with the max count per user
user_genre = user_genre_counts.idxmax(axis=1).reset_index()
user_genre.columns = ['user_id', 'genre']

result = []

for genre in top_items_by_genre.keys():
    
    users_genre_i = user_genre[user_genre['genre']==genre]
    users_genre_i.loc[:, 'item_id'] = [top_items_by_genre[genre]]* users_genre_i.shape[0]
    result.append(users_genre_i)

result = (
    pd.concat(result, axis=0, ignore_index=True)
    .sort_values('user_id', ascending=True)
    .reset_index(drop=True)
    .drop(columns=['genre'])
    )

result.loc[:, 'rating'] = [list(range(5)[::-1])] * result.shape[0]
result = result.explode(column=['item_id', 'rating'])

df_baseline = build_rank_input(ratings=result.iloc[:,:3], features=user_item_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_genre_i.loc[:, 'item_id'] = [top_items_by_genre[genre]]* users_genre_i.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_genre_i.loc[:, 'item_id'] = [top_items_by_genre[genre]]* users_genre_i.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_genre_i.loc[:, 'item_id'] =

In [13]:
preds = clf.predict(df_baseline['X'])
offset = 0
ndcgs = []
for group_size in df_baseline['group']:
    y_true_group = np.array(df_baseline['y'][offset:offset+group_size], dtype=float)
    preds_group = preds[offset:offset+group_size]
    ndcgs.append(ndcg_score([y_true_group], [preds_group]))
    offset += group_size
np.mean(ndcgs)

0.7543890666748849