##### import libs

In [1]:
import gc
from copy import deepcopy

import lightgbm as lgbm
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split

from utils import load_data, evaluate

##### load data

In [2]:
# Unpack the four frames returned by the project's load_data() helper.
(
    user_item_data,
    user_meta_data,
    item_meta_data,
    test_pairs_data,
) = load_data()

In [3]:
# Precomputed arrays read from dumps/ (attached below as the *_lightfm_* columns).
(
    user_embeddings,
    item_embeddings,
    user_biases,
    item_biases,
    lightfm_scores,
) = map(
    np.load,
    (
        "dumps/user_embeddings.npy",
        "dumps/item_embeddings.npy",
        "dumps/user_biases.npy",
        "dumps/item_biases.npy",
        "dumps/lightfm_scores.npy",
    ),
)

In [4]:
# Scores for the test pairs, stored alongside the other dumps.
test_lightfm_scores = np.load(file="dumps/test_lightfm_scores.npy")

##### create features

In [5]:
# User side: one embedding list and one bias per row of user_meta_data
# (assumes the arrays are row-aligned with the table — TODO confirm).
user_meta_data["user_lightfm_embeddings"] = user_embeddings.tolist()
user_meta_data["user_lightfm_biases"] = user_biases.tolist()

# Item side: same two columns on item_meta_data.
item_meta_data["item_lightfm_embeddings"] = item_embeddings.tolist()
item_meta_data["item_lightfm_biases"] = item_biases.tolist()

In [6]:
# Attach the precomputed score to each interaction row (assumes lightfm_scores
# is a 1-D array row-aligned with user_item_data).
# Pass an ndarray instead of `.tolist()`: the list form builds one Python float
# object per row before pandas converts it straight back to a float64 column.
# Forcing float64 keeps the resulting column dtype the same as the list path.
user_item_data["lightfm_scores"] = np.asarray(lightfm_scores, dtype=np.float64)

In [7]:
# Give the item table's "embeddings" column a more specific name.
item_meta_data = item_meta_data.rename(columns={"embeddings": "video_embeddings"})

In [8]:
# single column for likes and dislikes: like - dislike
# Cast to a signed dtype before subtracting: if the flags are stored as
# np.uint8, `0 - 1` wraps around to 255 instead of giving -1.
user_item_data["explicit"] = user_item_data["like"].astype(np.int16) - user_item_data[
    "dislike"
].astype(np.int16)

In [9]:
# Left-join item metadata onto the interactions; the video embeddings column is
# dropped from the right-hand side before the join.
user_item_data = pd.merge(
    user_item_data,
    item_meta_data.drop(columns="video_embeddings"),
    how="left",
    on="item_id",
)

In [10]:
# Left-join user metadata onto the interactions.
user_item_data = pd.merge(user_item_data, user_meta_data, how="left", on="user_id")

In [11]:
# Time spent expressed as a fraction of the item's duration.
user_item_data["timespent_rel"] = user_item_data["timespent"].div(
    user_item_data["duration"]
)

In [12]:
# Preview the first five rows of the assembled feature frame.
user_item_data.head(n=5)

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,lightfm_scores,explicit,source_id,duration,item_lightfm_embeddings,item_lightfm_biases,gender,age,user_lightfm_embeddings,user_lightfm_biases,timespent_rel
0,3810,138979,6,0,0,0,0,-4.928252,0,4278,54,"[0.10935251414775848, -0.10262538492679596, 0....",-1.237144,1,36,"[-0.15640118718147278, 0.16826768219470978, -0...",-2.771661,0.111111
1,101874,331160,6,0,0,0,0,-2.806283,0,2049,6,"[0.05162833631038666, -0.0370173454284668, 0.0...",-0.452806,2,52,"[-0.13735204935073853, 0.13384205102920532, -0...",-2.13048,1.0
2,150332,73709,11,0,0,0,0,-4.921135,0,16375,16,"[0.22434011101722717, -0.3084537386894226, 0.3...",-3.10852,1,24,"[0.01132113952189684, 0.011939991265535355, -0...",-1.521888,0.6875
3,4982,189745,5,0,0,0,0,-6.170381,0,166,25,"[0.14558078348636627, -0.15912564098834991, 0....",-2.186147,1,40,"[-0.1815866231918335, 0.21181342005729675, -0....",-2.607319,0.2
4,149601,289643,1,0,0,1,0,-4.905728,0,1459,23,"[0.19378170371055603, -0.1637626737356186, 0.1...",-0.989608,1,34,"[-0.19518840312957764, 0.21150225400924683, -0...",-2.621225,0.043478


##### split

In [42]:
# Hold out 20% of (at most) the first 100M interaction rows for validation.
# The seed is fixed so the split, and every metric computed from it, can be
# reproduced after a kernel restart.
train_df, val_df = train_test_split(
    user_item_data.iloc[:100_000_000], test_size=0.2, random_state=42
)

##### lightgbm

In [43]:
# Label column fed to the ranker.
target = "explicit"

# Feature columns, in the order they are passed to the model.
columns = [
    # identifiers
    "user_id", "item_id", "source_id",
    # item / user metadata
    "duration", "gender", "age",
    # precomputed LightFM score
    "lightfm_scores",
]

In [44]:
# Order both splits by user so that each user's rows are contiguous; the group
# sizes computed afterwards rely on this ordering.
train_df = train_df.sort_values("user_id")
val_df = val_df.sort_values("user_id")

In [45]:
# Feature matrix and label vector for each split.
X_train, y_train = train_df[columns], train_df[target]
X_val, y_val = val_df[columns], val_df[target]

# Rows per user, listed in ascending user_id order.
group_train = X_train["user_id"].value_counts().sort_index().to_numpy()
group_val = X_val["user_id"].value_counts().sort_index().to_numpy()

In [46]:
# LightGBM ranker with every hyperparameter left at the library default.
model = lgbm.LGBMRanker()

In [47]:
# Labels are shifted by +1 (the validation labels take the values -1, 0, 1),
# presumably because the ranking objective expects non-negative relevance
# labels — confirm against the LightGBM docs.
model.fit(
    X_train,
    y_train + 1,
    group=group_train,
    eval_set=[(X_val, y_val + 1)],
    eval_group=[group_val],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.334768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1244
[LightGBM] [Info] Number of data points in the train set: 80000000, number of used features: 7


In [49]:
# Score every validation row with the fitted ranker.
y_preds = model.predict(X_val)

In [50]:
# Sanity check: one score per validation row.
np.shape(y_preds)

(20000000,)

In [None]:
# Score the validation predictions with the project's evaluate() helper;
# arguments are passed positionally as (user ids, true labels, predictions).
evaluate(X_val["user_id"], y_val.to_numpy(), y_preds)

In [54]:
# Distinct label values present in the validation split.
pd.unique(y_val)

array([ 0,  1, -1], dtype=int16)

##### catboost inference

In [11]:
# Attach the precomputed score to each test pair (assumes test_lightfm_scores
# is a 1-D array row-aligned with test_pairs_data).
# Pass an ndarray instead of `.tolist()`: this avoids building an intermediate
# list of Python floats, and forcing float64 keeps the column dtype unchanged.
test_pairs_data["lightfm_scores"] = np.asarray(test_lightfm_scores, dtype=np.float64)

In [12]:
# Same rename as in the feature-creation section; rename() ignores a missing
# key by default, so this is a no-op when the column was already renamed.
item_meta_data = item_meta_data.rename(columns={"embeddings": "video_embeddings"})

In [None]:
# np.uint8 -> np.int16 cast to allow subtraction
# (the uint8 column selector is computed once and reused on both sides)
uint8_columns = user_item_data.dtypes[user_item_data.dtypes == np.uint8].index
user_item_data[uint8_columns] = user_item_data[uint8_columns].astype(np.int16)

# single column for likes and dislikes
user_item_data["explicit"] = user_item_data["like"] - user_item_data["dislike"]

In [13]:
# Left-join item metadata (minus the video embeddings column) onto the test pairs.
test_pairs_data = pd.merge(
    test_pairs_data,
    item_meta_data.drop(columns="video_embeddings"),
    how="left",
    on="item_id",
)

In [14]:
# Left-join user metadata onto the test pairs.
test_pairs_data = pd.merge(test_pairs_data, user_meta_data, how="left", on="user_id")

In [None]:
# Time spent as a fraction of item duration (same computation as in the
# feature-creation section; it operates on user_item_data, not the test pairs).
user_item_data["timespent_rel"] = user_item_data["timespent"].div(
    user_item_data["duration"]
)

In [4]:
# Load a previously saved CatBoost ranker from the file "ranker".
# CatBoostRanker must be imported in the notebook's import cell for this to
# run on a fresh kernel.
# NOTE(review): this rebinds `model`, which held the LightGBM ranker in the
# cells above.
model = CatBoostRanker()
model.load_model("ranker")

<catboost.core.CatBoostRanker at 0x7fe3a5bd9150>

In [18]:
# Despite the name, this pool is built from the test pairs: feature values as a
# plain array, with user_id as the group key.
val_pool = Pool(
    data=test_pairs_data[columns].to_numpy(),
    group_id=test_pairs_data["user_id"].to_numpy().tolist(),
)

In [19]:
# Score every row of the pool built from the test pairs.
preds = model.predict(val_pool)

In [20]:
# Sanity check: number of scores produced for the test pairs.
np.shape(preds)

(1655120,)

In [21]:
# Eyeball the raw scores (numpy truncates the display of large arrays).
preds

array([ 0.25881262,  0.36498226,  0.36260786, ..., -0.14306244,
       -0.16803205, -0.12773313])

In [25]:
# NOTE(review): the recorded output is array(None, dtype=object), i.e. no
# importances were available on the loaded model. Presumably they have to be
# computed explicitly against a dataset — confirm in the CatBoost docs.
model.feature_importances_

array(None, dtype=object)

In [24]:
# NOTE(review): in notebook order `preds` holds the test-pair scores (recorded
# shape (1655120,)), while val_df is the validation split (20,000,000 recorded
# predictions). The recorded score presumably came from a different `preds`
# in an earlier session — confirm before trusting this cell.
evaluate(
    user_id=val_df["user_id"].to_numpy(),
    target=val_df["explicit"].to_numpy(),
    score=preds,
)

100%|██████████| 183383/183383 [01:05<00:00, 2781.83it/s]


0.6792581303073152

In [None]:
# NOTE(review): `model` was last bound to a CatBoostRanker in this section, and
# `test_lightfm_score` is not used by any visible cell. This (user ids, item
# ids) call looks like it was written for a different model API — confirm
# whether the cell is still needed.
test_lightfm_score = model.predict(
    test_pairs_data["user_id"].values,
    test_pairs_data["item_id"].values,
)

In [23]:
# Store the scores next to their test pairs and write the frame out.
# NOTE(review): every column of test_pairs_data is written, including those
# merged in from the metadata tables — confirm the expected submission schema.
test_pairs_data["predict"] = preds
test_pairs_data.to_csv(path_or_buf="./submission.csv", index=False)