In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
import joblib

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.cv_iterator import leave_last_k
from src.features.features import feature_engineering
from src.models.retrieval import candidate_generation
from src.features.utils import build_rank_input
from src.models.ranker import Ranker
from src.models.baseline import baseline_model
from src.models.evaluator import evaluation

In [2]:
# Read the project configuration.
# The path is assembled with os.path.join instead of a backslash literal:
# a hard-coded Windows separator cannot be opened on POSIX systems, and the
# sequences "\m" and "\c" are invalid string escapes that newer Python
# versions warn about. On Windows the joined path is the same string as before.
config_path = os.path.join('..', 'main', 'config.yml')
with open(config_path, 'r') as file:
    # safe_load is shorthand for yaml.load(..., Loader=yaml.SafeLoader)
    config = yaml.safe_load(file)
del file

In [3]:
# Load the dataframes described by the `data_loader` section of the config
# and hand them directly to the preparation step; `dfs` holds the prepared
# result that the remaining cells index by key ('data', 'user', 'item').
dfs = prepare_data(dataframes=load_data(config=config['data_loader']))

In [4]:
# Retrieval stage: candidate_generation (negative sampling) returns
# 'positive' and 'negative' frames, which are stacked underneath the first
# three columns of the interaction data to form the ranking training set.
candidates = candidate_generation(dfs['data'], config['model']['retrieval'])
training_frames = [
    dfs['data'].iloc[:, :3],
    candidates['positive'],
    candidates['negative'],
]
df_train = pd.concat(training_frames, ignore_index=True)
del candidates, training_frames

# Feature stage: derive user/item features from the user table, the item
# table and the freshly assembled training interactions.
feature_inputs = {'user': dfs['user'], 'item': dfs['item'], 'data': df_train}
user_item_features = feature_engineering(dataframes=feature_inputs)
del feature_inputs

# NOTE: `df_train` is rebound here, from the concatenated interaction frame
# to whatever build_rank_input returns (read later via 'X', 'y', 'group').
df_train = build_rank_input(ratings=df_train, features=user_item_features)

In [5]:
# Fit the ranking model: an 'XGBRanker' Ranker configured with the
# hyper-parameters found under model -> ranking -> hyper_params in the
# config, trained on the X / y / group parts of the ranking input.
ranker_params = config['model']['ranking']['hyper_params']
clf = Ranker(algorithm='XGBRanker', params=ranker_params)
clf.fit(df_train['X'], df_train['y'], group=df_train['group'])

In [6]:
# Persist the `model` attribute of the Ranker wrapper (not the wrapper
# itself) to the path configured under model -> path, resolved relative to
# the parent directory, then release the in-memory wrapper.
model_path = f'../{config["model"]["path"]}'
joblib.dump(clf.model, model_path)
del clf

**Inference**
- Get candidates through the anti-train set (all user-item pairs that have no observed interaction)
- Rank candidates and get top-k recommendations

In [7]:
# Reload the persisted model from the configured path. `clf` now refers to
# the object read back from disk (what was dumped from `clf.model`), not to
# a Ranker wrapper.
model_path = f'../{config["model"]["path"]}'
clf = joblib.load(model_path)

In [8]:
# Enumerate every (user, item) combination from the ids that occur in the
# interaction data.
known_users = dfs['data']['user_id'].unique()
known_items = dfs['data']['item_id'].unique()
all_user_item_pairs = pd.MultiIndex.from_product(
    [known_users, known_items], names=['user_id', 'item_id']
).to_frame(index=False)
del known_users, known_items

# Left-join the observed interactions; the `_merge` indicator column marks
# whether a pair exists only in the cartesian product or in both frames.
observed_pairs = dfs['data'][['user_id', 'item_id']]
all_user_item_pairs = all_user_item_pairs.merge(
    observed_pairs, how='left', on=['user_id', 'item_id'], indicator=True
)
del observed_pairs

# Anti-join: keep only the pairs with no observed interaction, and only the
# two key columns.
is_unseen = all_user_item_pairs['_merge'] == 'left_only'
candidates = all_user_item_pairs.loc[is_unseen, ['user_id', 'item_id']]
del is_unseen

In [9]:
# Attach user and item features to every candidate pair.
# Only the two key columns are read from `candidates`, which makes this cell
# safe to re-run: previously a second execution merged the features onto an
# already-enriched frame (yielding duplicated, suffixed columns) and passed
# the earlier `pred` column to the model as if it were a feature.
candidate_features = (
    candidates[['user_id', 'item_id']]
    .merge(user_item_features['user'], how='left', on='user_id')
    .merge(user_item_features['item'], how='left', on='item_id')
    )

# Score each pair with the loaded model; the id columns are keys, not
# features, so they are dropped before prediction. `candidates` ends up with
# the same layout as before: keys, feature columns, then `pred`.
candidates = candidate_features.assign(
    pred=clf.predict(candidate_features.drop(columns=['user_id', 'item_id']))
    )
del candidate_features

# Preview: the 3 highest-scored items per user, showing the first 7 rows.
(
    candidates
    .sort_values(['user_id', 'pred'], ascending=[True, False])
    .groupby('user_id').head(3)
    [['user_id', 'item_id', 'pred']]
    .head(7)
)

Unnamed: 0,user_id,item_id,pred
179426,1,1467,3.271719
179369,1,1599,3.263664
179243,1,1293,3.256404
355914,2,1500,2.667152
356402,2,1467,2.574783
356345,2,1599,2.566728
421129,3,851,-0.464762


**Baseline Model**
- Compare baseline vs ranking model accuracy

In [10]:
# Popularity baseline: request recommendations with n=10 and score the
# estimated ratings against the true ratings, grouped per user.
df_baseline = baseline_model(dataframes=dfs, n=10)
# Per-user row counts, in groupby (sorted user_id) order.
# NOTE(review): assumes df_baseline rows are already ordered by user_id so
# these sizes line up with the rating columns -- TODO confirm.
group = df_baseline.groupby('user_id').size().to_list()
baseline_score = evaluation(
    df_baseline['rating'], df_baseline['est_rating'], group
)
print(f"Baseline model: {baseline_score:.4f}")
del group

# Ranking model: rebuild the ranker input from the same rows (without the
# baseline's estimate column), predict, and score with the same metric.
# `df_baseline` is rebound here to the output of build_rank_input.
baseline_ratings = df_baseline.drop(columns=['est_rating'])
df_baseline = build_rank_input(
    ratings=baseline_ratings, features=user_item_features
)
del baseline_ratings

preds = clf.predict(df_baseline['X'])
ranker_score = evaluation(df_baseline['y'], preds, df_baseline['group'])
print(f"Ranking model: {ranker_score:.4f}")

Baseline model: 0.9169
Ranking model: 0.9531
