In [8]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.cv_iterator import leave_last_k
from src.features.features import feature_engineering
from src.models.retrieval import candidate_generation
from src.features.utils import build_rank_input

In [9]:
# read config
# The path is assembled with os.path.join instead of a hard-coded
# backslash literal: '\m' and '\c' are not valid string escapes, and a
# backslash-separated path only resolves on Windows.
with open(os.path.join('..', 'main', 'config.yml'), 'r') as file:
    # safe_load is shorthand for yaml.load(..., Loader=yaml.SafeLoader).
    config = yaml.safe_load(file)
# Drop the (already closed) file handle name from the notebook namespace.
del file

In [10]:
# Load the raw frames described by config['data_loader'] and pass them
# straight through prepare_data; only the prepared result is kept.
dataframes = prepare_data(
    dataframes=load_data(config=config['data_loader'])
)

In [11]:
# Feature tables derived from the prepared frames (indexed below by
# 'user' / 'item' when they are joined onto candidate pairs).
user_item_features = feature_engineering(dataframes=dataframes)

# Retrieval stage: returns a mapping with 'positive' and 'negative' entries.
# NOTE(review): presumably sampled positive/negative interactions - confirm
# against src.models.retrieval.
candidates = candidate_generation(dataframes['data'], config['model']['retrieval'])

# Stack the first three columns of the observed data with both candidate
# sets, then hand the combined ratings to build_rank_input together with
# the features.
df_train = build_rank_input(
    ratings=pd.concat(
        [
            dataframes['data'].iloc[:, :3],
            candidates['positive'],
            candidates['negative'],
        ],
        ignore_index=True,
    ),
    features=user_item_features,
)

# The candidate frames are no longer needed once the ranker input exists.
del candidates

In [12]:
from sklearn.metrics import ndcg_score
from xgboost import XGBRanker
import numpy as np

# Instantiate the ranker from the hyper-parameters stored in the config.
clf = XGBRanker(**config['model']['ranking']['hyper_params'])

# Fit on the ranker input built earlier; labels are cast to int and the
# 'group' entry is forwarded as the query-group specification.
clf.fit(
    X=df_train['X'],
    y=df_train['y'].astype(int),
    group=df_train['group'],
    verbose=False,
)

In [13]:
# Work on a private copy of the interaction table.
df = dataframes['data'].copy()
all_users, all_items = (df[key].unique() for key in ('user_id', 'item_id'))

# Every possible (user, item) combination, left-joined against the observed
# interactions; the '_merge' indicator marks which pairs were actually seen.
all_user_item_pairs = (
    pd.MultiIndex
    .from_product([all_users, all_items], names=['user_id', 'item_id'])
    .to_frame(index=False)
    .merge(
        df[['user_id', 'item_id']],
        on=['user_id', 'item_id'],
        how='left',
        indicator=True,
    )
)

# Keep only the pairs absent from the observed data ('left_only'), reduced
# to the two id columns, then attach the user and item feature tables.
missing_ratings = (
    all_user_item_pairs
    .loc[lambda pairs: pairs['_merge'] == 'left_only', ['user_id', 'item_id']]
    .merge(user_item_features['user'], how='left', on='user_id')
    .merge(user_item_features['item'], how='left', on='item_id')
)

missing_ratings.head()

Unnamed: 0,user_id,item_id,u_age,u_gender_is_male,u_job_is_administrator,u_job_is_artist,u_job_is_doctor,u_job_is_educator,u_job_is_engineer,u_job_is_entertainment,...,i_gender_is_thriller,i_gender_is_war,i_gender_is_western,i_gender_is_unknown,i_release_year,i_release_month,i_rating_count,i_rating_nunique,i_rating_mean,i_rating_std
0,196,302,49,1,0,0,0,0,0,0,...,0,1,0,0,1997.0,1997.0,297,3,2.208754,0.746743
1,196,377,49,1,0,0,0,0,0,0,...,0,0,0,0,1994.0,1994.0,13,2,1.076923,0.27735
2,196,51,49,1,0,0,0,0,0,0,...,0,0,1,1,1994.0,1994.0,81,3,1.740741,0.754615
3,196,346,49,1,0,0,0,0,0,0,...,0,0,0,0,1997.0,1997.0,126,3,1.833333,0.766812
4,196,474,49,1,0,0,0,0,0,0,...,1,0,1,0,1963.0,1963.0,194,3,2.283505,0.786569


In [14]:
# Score every unseen (user, item) pair with the fitted ranker.
# The id columns are not model inputs, so they are dropped first. 'pred' is
# dropped as well (errors='ignore' makes that a no-op on the first run) so
# that re-running this cell does not feed the previous predictions back to
# the model as an extra feature column.
# NOTE(review): assumes the remaining columns match the features in
# df_train['X'] in name and order - confirm against build_rank_input.
preds = clf.predict(
    missing_ratings.drop(columns=['user_id', 'item_id', 'pred'], errors='ignore')
)

missing_ratings['pred'] = preds

# Top 3 highest-scored unseen items per user: sort by user, then by
# descending score, and keep the first three rows of each user group.
(
    missing_ratings
    .sort_values(['user_id', 'pred'], ascending=[True, False])
    .groupby('user_id').head(3)
    [['user_id', 'item_id', 'pred']]
)

Unnamed: 0,user_id,item_id,pred
178185,1,408,0.978521
179050,1,1367,0.963823
178195,1,318,0.941785
355919,2,119,0.617140
355002,2,408,0.533017
...,...,...,...
1472955,942,1293,5.552734
1472670,942,814,5.534440
1476635,943,408,0.591964
1476710,943,169,0.591964
