##### import libs

In [1]:
import gc
import numpy as np
import pandas as pd
from copy import deepcopy
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split

from utils import load_data, evaluate

In [3]:
# Create an empty ranker object; the next cell fills it via load_model.
model = CatBoostRanker()

In [4]:
# Restore a previously saved ranker from the file "ranker_01".
# NOTE(review): the inference section later reloads `model` from 'ranker'
# instead - confirm which saved model is the intended one.
model.load_model("ranker_01")

<catboost.core.CatBoostRanker at 0x7f6d4e138370>

In [6]:
# Display the feature names stored in the loaded model.
model.feature_names_

['num_of_likes_by_item_id',
 'num_of_shares_by_item_id',
 'std_viewers_age_by_item_id',
 'num_of_bookmarks_by_user_id',
 'median_viewers_gender_by_item_id',
 'ratio_of_dislikes_by_source_id',
 'user_like_counts_by_source_id_ratio_to_likes',
 'age',
 'user_view_counts_by_source_id_ratio_to_views',
 'num_of_likes_by_user_id',
 'std_likers_gender_by_item_id',
 'min_likers_age_by_item_id',
 'duration',
 'max_likers_gender_by_item_id',
 'user_id',
 'ratio_of_likes_by_user_id',
 'max_viewers_gender_by_item_id',
 'user_like_counts_by_source_id_ratio_to_views',
 'mean_viewers_age_by_item_id',
 'min_viewers_gender_by_item_id',
 'source_id',
 'num_of_views_by_item_id',
 'num_of_shares_by_source_id',
 'median_likers_age_by_item_id',
 'mean_viewers_gender_by_item_id',
 'ratio_of_likes_by_source_id',
 'median_likers_gender_by_item_id',
 'ratio_of_dislikes_by_item_id',
 'min_likers_gender_by_item_id',
 'ratio_of_bookmarks_by_item_id',
 'ratio_of_bookmarks_by_source_id',
 'min_viewers_age_by_item_id'

: 

##### load data

In [2]:
# Unpack the four objects returned by the project-local utils.load_data().
user_item_data, user_meta_data, item_meta_data, test_pairs_data = load_data()

In [3]:
# Load the precomputed arrays from dumps/. The paths are listed in the same
# order as the names they are bound to.
(
    user_embeddings,
    item_embeddings,
    user_biases,
    item_biases,
    lightfm_scores,
) = map(
    np.load,
    (
        "dumps/user_embeddings.npy",
        "dumps/item_embeddings.npy",
        "dumps/user_biases.npy",
        "dumps/item_biases.npy",
        "dumps/lightfm_scores.npy",
    ),
)

In [5]:
# Precomputed scores for the test pairs; attached to test_pairs_data in the
# inference section.
test_lightfm_scores = np.load("dumps/test_lightfm_scores.npy")

##### create features

In [6]:
# Attach the loaded embeddings and biases to the metadata frames as new columns.
# The arrays are converted to Python lists and assigned positionally.
# NOTE(review): assumes row i of each array corresponds to row i of the
# matching metadata frame - confirm against how the dumps were produced.
user_meta_data["user_lightfm_embeddings"] = user_embeddings.tolist()
item_meta_data["item_lightfm_embeddings"] = item_embeddings.tolist()
user_meta_data["user_lightfm_biases"] = user_biases.tolist()
item_meta_data["item_lightfm_biases"] = item_biases.tolist()

In [7]:
# Add the precomputed scores as a column, assigned positionally.
# NOTE(review): assumes lightfm_scores is in the same row order as
# user_item_data - confirm.
user_item_data["lightfm_scores"] = lightfm_scores.tolist()

In [8]:
# Give the item "embeddings" column a more specific name.
item_meta_data = item_meta_data.rename(columns={"embeddings": "video_embeddings"})

In [9]:
# Widen every uint8 column to int16 so the subtraction below can yield
# negative values.
uint8_columns = user_item_data.dtypes.loc[lambda dtypes: dtypes == np.uint8].index
user_item_data[uint8_columns] = user_item_data[uint8_columns].astype(np.int16)
# Single feedback column: like minus dislike.
user_item_data["explicit"] = user_item_data["like"] - user_item_data["dislike"]

In [10]:
# Left-join item metadata (without the video embedding column) onto every
# interaction row.
user_item_data = pd.merge(
    user_item_data,
    item_meta_data.drop("video_embeddings", axis=1),
    how="left",
    on="item_id",
)

In [11]:
# Left-join user metadata onto every interaction row.
user_item_data = pd.merge(user_item_data, user_meta_data, how="left", on="user_id")

In [12]:
# Time spent expressed as a fraction of the item's duration.
user_item_data["timespent_rel"] = user_item_data["timespent"].div(
    user_item_data["duration"]
)

In [13]:
# Preview the merged interaction frame with the derived columns.
user_item_data.head()

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,explicit,lightfm_scores,source_id,duration,item_lightfm_embeddings,item_lightfm_biases,gender,age,user_lightfm_embeddings,user_lightfm_biases,timespent_rel
0,3810,138979,6,0,0,0,0,0,-4.928252,4278,54,"[0.10935251414775848, -0.10262538492679596, 0....",-1.237144,1,36,"[-0.15640118718147278, 0.16826768219470978, -0...",-2.771661,0.111111
1,101874,331160,6,0,0,0,0,0,-2.806283,2049,6,"[0.05162833631038666, -0.0370173454284668, 0.0...",-0.452806,2,52,"[-0.13735204935073853, 0.13384205102920532, -0...",-2.13048,1.0
2,150332,73709,11,0,0,0,0,0,-4.921135,16375,16,"[0.22434011101722717, -0.3084537386894226, 0.3...",-3.10852,1,24,"[0.01132113952189684, 0.011939991265535355, -0...",-1.521888,0.6875
3,4982,189745,5,0,0,0,0,0,-6.170381,166,25,"[0.14558078348636627, -0.15912564098834991, 0....",-2.186147,1,40,"[-0.1815866231918335, 0.21181342005729675, -0....",-2.607319,0.2
4,149601,289643,1,0,0,1,0,0,-4.905728,1459,23,"[0.19378170371055603, -0.1637626737356186, 0.1...",-0.989608,1,34,"[-0.19518840312957764, 0.21150225400924683, -0...",-2.621225,0.043478


##### split

In [14]:
# Hold out 20% of the interaction rows for validation (row-level random split).
# The split is seeded so that Restart & Run All reproduces the same partition;
# the seed matches the `random_seed` used for CatBoost further down.
train_df, val_df = train_test_split(user_item_data, test_size=0.2, random_state=0)

##### catboost

In [15]:
# Preview the training part of the split.
train_df.head()

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,explicit,lightfm_scores,source_id,duration,item_lightfm_embeddings,item_lightfm_biases,gender,age,user_lightfm_embeddings,user_lightfm_biases,timespent_rel
100223270,150891,234502,3,0,0,0,0,0,-3.21142,2787,20,"[0.17732080817222595, -0.17245712876319885, 0....",-1.615301,1,33,"[-0.11184735596179962, -0.04671141132712364, 0...",-1.868088,0.15
96622535,72494,56338,2,0,0,0,0,0,-2.997175,6503,21,"[0.24458101391792297, -0.17852246761322021, 0....",-1.828355,2,29,"[0.0660756379365921, -0.05466609448194504, -0....",-1.469147,0.095238
142861749,15685,214345,55,0,0,0,0,0,-2.978568,4242,55,"[0.07179868966341019, -0.05680978670716286, 0....",-1.317698,2,37,"[-0.07110429555177689, 0.07772308588027954, -0...",-1.447046,1.0
117347975,47283,110193,2,0,0,0,0,0,-5.805327,11846,22,"[0.2287990152835846, -0.29124483466148376, 0.2...",-1.827512,1,26,"[-0.11565996706485748, 0.1368868052959442, -0....",-1.944769,0.090909
31563459,92441,166855,4,0,0,0,0,0,-7.639848,3344,22,"[0.20358052849769592, -0.24150842428207397, 0....",-3.370323,1,27,"[-0.15662424266338348, 0.16617316007614136, -0...",-2.54827,0.181818


In [8]:
# Label column used for ranking.
target = "explicit"
# Feature columns, in the order they are passed to the CatBoost Pool.
columns = ["user_id", "item_id", "source_id", "duration", "gender", "age", "lightfm_scores"]

In [16]:
# Order rows by user so that rows sharing a group id (user_id is used as
# group_id when the Pool is built) are adjacent.
train_df = train_df.sort_values("user_id")

In [17]:
# Same ordering for the validation rows: keep each user's rows adjacent.
val_df = val_df.sort_values("user_id")

In [16]:
# Training pool: features as a plain array in `columns` order, labels from the
# `target` column, one group per user_id.
train_pool = Pool(
    train_df[columns].to_numpy(),
    label=train_df[target].to_numpy(),
    group_id=train_df["user_id"].tolist(),
)

In [17]:
# Validation pool, built the same way as the training pool.
val_pool = Pool(
    val_df[columns].to_numpy(),
    label=val_df[target].to_numpy(),
    group_id=val_df["user_id"].tolist(),
)

In [19]:
# Free the large frames once the pools are built.
# val_df must stay defined because the evaluation cell further down reads
# val_df.user_id and val_df.explicit; only those two columns are kept so the
# rest of the frame can still be released.
val_df = val_df[["user_id", "explicit"]].copy()
del train_df, user_item_data
gc.collect()

0

In [18]:
# Baseline CatBoostRanker settings; fit_model deep-copies these for each run.
default_parameters = dict(
    iterations=2000,
    custom_metric=["NDCG", "PFound", "AverageGain:top=10"],
    verbose=True,
    random_seed=0,
)

# Module-level placeholder; fit_model builds its own local `parameters`.
parameters = dict()

In [19]:
def fit_model(
    loss_function, additional_params=None, train_pool=train_pool, test_pool=val_pool
):
    """Train a CatBoostRanker for the given loss and return the fitted model.

    The settings start from a deep copy of ``default_parameters``. Both
    ``loss_function`` and ``train_dir`` are set to ``loss_function``, and
    ``additional_params`` (if given) is applied last, so it can override
    any of them.

    The default ``train_pool`` / ``test_pool`` are the objects those names
    were bound to when this cell was executed.
    """
    overrides = {} if additional_params is None else additional_params
    params = {
        **deepcopy(default_parameters),
        "loss_function": loss_function,
        "train_dir": loss_function,
    }
    params.update(overrides)

    ranker = CatBoostRanker(**params)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)
    return ranker

##### catboost inference

In [11]:
# Add the precomputed test scores as a column, assigned positionally.
# NOTE(review): assumes test_lightfm_scores is in the same row order as
# test_pairs_data - confirm.
test_pairs_data["lightfm_scores"] = test_lightfm_scores.tolist()

In [12]:
# Same rename as in the feature section, so that the drop of "video_embeddings"
# below also works when only the inference cells are run.
item_meta_data = item_meta_data.rename(columns={"embeddings": "video_embeddings"})

In [None]:
# NOTE(review): this cell has no execution count and works on user_item_data,
# not test_pairs_data. It duplicates the cast from the feature section and
# looks like a copy-paste leftover. user_item_data is also deleted by the
# cleanup cell after the pools are built - confirm whether it can be removed.
# np.uint8 -> np.int16 cast to allow subtraction
user_item_data[user_item_data.dtypes[user_item_data.dtypes == np.uint8].index] = (
    user_item_data[
        user_item_data.dtypes[user_item_data.dtypes == np.uint8].index
    ].astype(np.int16)
)
# single column for likes and dislikes
user_item_data["explicit"] = user_item_data.like - user_item_data.dislike

In [13]:
# Left-join item metadata (without the video embedding column) onto the test
# pairs.
test_pairs_data = pd.merge(
    test_pairs_data,
    item_meta_data.drop("video_embeddings", axis=1),
    how="left",
    on="item_id",
)

In [14]:
# Left-join user metadata onto the test pairs.
test_pairs_data = pd.merge(test_pairs_data, user_meta_data, how="left", on="user_id")

In [None]:
# NOTE(review): this cell has no execution count and recomputes timespent_rel
# on user_item_data rather than on the test pairs. It looks like a copy-paste
# leftover from the feature section - confirm whether it can be removed.
user_item_data["timespent_rel"] = (
    user_item_data["timespent"] / user_item_data["duration"]
)

In [4]:
# Fresh ranker object populated from the saved model file 'ranker'.
# NOTE(review): the first section of the notebook loads "ranker_01" instead -
# confirm which file is the intended one.
model = CatBoostRanker()
model.load_model('ranker')

<catboost.core.CatBoostRanker at 0x7fe3a5bd9150>

In [18]:
# Unlabelled pool over the test pairs, grouped per user.
# NOTE(review): this rebinds `val_pool`, so from here on the name no longer
# refers to the validation pool built in the training section.
val_pool = Pool(
    test_pairs_data[columns].to_numpy(),
    group_id=test_pairs_data["user_id"].tolist(),
)

In [19]:
# Score the pool with the loaded ranker. At this point `val_pool` is the pool
# built from test_pairs_data in the previous cell.
preds = model.predict(val_pool)

In [20]:
# Check the shape of the prediction array.
preds.shape

(1655120,)

In [21]:
# Show the raw prediction scores.
preds

array([ 0.25881262,  0.36498226,  0.36260786, ..., -0.14306244,
       -0.16803205, -0.12773313])

In [25]:
# NOTE(review): the recorded output is array(None, dtype=object), so no
# importances are available on this loaded model - confirm whether that is
# expected.
model.feature_importances_

array(None, dtype=object)

In [24]:
# Score predictions against the `explicit` labels of the validation rows using
# the project-local utils.evaluate helper.
# NOTE(review): val_df is deleted by the cleanup cell after the pools are
# built, and `preds` as computed above comes from the pool built on
# test_pairs_data. In this cell order the recorded result cannot be
# reproduced - confirm the intended validation flow.
evaluate(user_id=val_df.user_id.values,
         target=val_df.explicit.values,
         score=preds)

100%|██████████| 183383/183383 [01:05<00:00, 2781.83it/s]


0.6792581303073152

In [None]:
# NOTE(review): this cell has no execution count and its result is not used
# afterwards. `model` is the CatBoostRanker loaded above, which was called with
# a Pool earlier; this (user ids, item ids) call looks like a leftover from a
# different model's API - confirm and remove if so.
test_lightfm_score = model.predict(test_pairs_data.user_id.values,
                                   test_pairs_data.item_id.values)

In [23]:
# Attach the ranker scores and write the whole frame to ./submission.csv
# without the index.
# NOTE(review): test_pairs_data also carries the item and user metadata columns
# merged in above, and all of them are written - confirm the expected
# submission schema.
test_pairs_data["predict"] = preds
test_pairs_data.to_csv("./submission.csv",index=False)