##### import libs

In [1]:
import numpy as np
import pandas as pd
import implicit
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split

In [2]:
from utils import evaluate, load_data

##### read data

In [3]:
# load_data() (from the local utils module) returns four objects, bound here in order.
(
    user_item_data,
    user_meta_data,
    item_meta_data,
    test_pairs_data,
) = load_data()

In [18]:
# Attach item metadata (every column except "embeddings") to each interaction row.
# NOTE: this rebinds user_item_data, so re-running the cell merges the metadata again.
item_features = item_meta_data.drop(columns="embeddings")
user_item_data = user_item_data.merge(item_features, how="left", on="item_id")

In [None]:
def _distinct_share(values):
    """Return the fraction of entries in `values` that are distinct."""
    return len(np.unique(values)) / len(values)


# Per user: number of distinct source_id values divided by the number of rows.
source_uniqueness = (
    user_item_data[["user_id", "source_id"]]
    .groupby(by="user_id")
    .agg(_distinct_share)
)

In [33]:
# Per-user column sums. The string alias "sum" is used instead of the Python
# builtin `sum`: recent pandas emits a FutureWarning when the builtin is passed
# to agg, and the alias keeps the current (DataFrameGroupBy.sum) behaviour.
sums = user_item_data.groupby(by="user_id").agg("sum")

  sums = user_item_data.groupby(by="user_id").agg(sum)


In [57]:
# Number of distinct users in the test pairs.
test_pairs_data["user_id"].nunique()

82756

In [61]:
# Users whose like and dislike totals are both zero. `sums` has one row per
# user (it comes from a groupby on user_id), so counting mask hits counts users.
no_feedback_mask = (sums["like"] == 0) & (sums["dislike"] == 0)
int(no_feedback_mask.sum())

33131

In [60]:
# How many users with zero likes also appear in the test pairs.
no_like_users = sums.index[(sums["like"] == 0).to_numpy()]
int(no_like_users.isin(test_pairs_data["user_id"]).sum())

2863

In [37]:
# Distinct user ids present in the test pairs, in order of first appearance.
test_user_ids = pd.unique(test_pairs_data["user_id"])

In [46]:
# Distribution of (likes + dislikes) per test user: how many test users have
# each total amount of explicit feedback.
test_user_sums = sums.loc[test_user_ids]
feedback_total = test_user_sums["like"] + test_user_sums["dislike"]
feedback_total.value_counts()

2       2929
1       2843
3       2547
0       2531
4       2416
        ... 
552        1
987        1
933        1
1012       1
945        1
Name: count, Length: 927, dtype: int64

##### split data

In [4]:
# Hold out the last 15% of rows for validation. shuffle=False keeps the original
# row order, so the split is positional; random_state only controls shuffling
# and therefore has no effect here.
ui_train, ui_val = train_test_split(
    user_item_data,
    test_size=0.15,
    shuffle=False,
    random_state=42,
)

In [5]:
# Pull the id and feedback columns out of each split as separate Series.
u_train, i_train = ui_train["user_id"], ui_train["item_id"]
likes_train, dislikes_train = ui_train["like"], ui_train["dislike"]

u_val, i_val = ui_val["user_id"], ui_val["item_id"]
likes_val, dislikes_val = ui_val["like"], ui_val["dislike"]

In [6]:
# Build user x item matrices whose values are like - dislike.
# Both flags are cast to a signed dtype before subtracting, matching how the
# validation `target` column is computed later in this notebook. Without the
# cast, an unsigned like/dislike column would wrap a dislike (0 - 1) to a large
# positive value instead of -1.
# NOTE(review): assumes like/dislike fit in int8 (0/1 flags) - confirm.
feedback_train = likes_train.astype(np.int8) - dislikes_train.astype(np.int8)
feedback_val = likes_val.astype(np.int8) - dislikes_val.astype(np.int8)

# The matrix shape is inferred from the largest user/item id in each split.
sparse_train = coo_matrix((feedback_train, (u_train, i_train)))
sparse_val = coo_matrix((feedback_val, (u_val, i_val)))

##### bpr model

In [7]:
# Train a Bayesian Personalized Ranking model on the training interactions.
bpr_params = {"factors": 256, "random_state": 42}
model = implicit.bpr.BayesianPersonalizedRanking(**bpr_params)
model.fit(sparse_train)



  0%|          | 0/100 [00:00<?, ?it/s]

##### evaluation

In [None]:
# Move the factors to host memory so they can be indexed with NumPy below.
# Guarded so the cell is a no-op when the model object has no to_cpu()
# (presumably the case for a model trained on CPU - verify against the
# installed implicit version).
if hasattr(model, "to_cpu"):
    model = model.to_cpu()

In [15]:
# Score each validation (user, item) pair as the dot product of its factor rows.
val_user_vecs = model.user_factors[ui_val["user_id"]]
val_item_vecs = model.item_factors[ui_val["item_id"]]
als_score = np.sum(val_user_vecs * val_item_vecs, axis=1)

In [16]:
# Persist the validation scores. The target directory is created first so the
# cell also works on a fresh checkout where it does not exist yet.
val_score_path = Path("dumps/bpr_val_score.npy")
val_score_path.parent.mkdir(parents=True, exist_ok=True)
np.save(val_score_path, als_score)

In [17]:
# Store the validation scores as a new column on ui_val (in place).
# The column keeps the name "als_score" although the scores come from the BPR model.
ui_val['als_score'] = als_score

In [18]:
# Signed validation target: like minus dislike, both cast to int8 first.
like_flag = ui_val["like"].astype(np.int8)
dislike_flag = ui_val["dislike"].astype(np.int8)
ui_val["target"] = like_flag - dislike_flag

In [19]:
# Evaluate the validation scores with the project's evaluate() helper;
# the returned value is displayed as the cell output.
evaluate(
    user_id=ui_val["user_id"].values,
    target=ui_val["target"].values,
    score=ui_val["als_score"].values,
)

100%|██████████| 181721/181721 [00:51<00:00, 3508.06it/s]


0.5272686150733121

##### submission

In [None]:
# Score each test (user, item) pair as the dot product of its factor rows.
test_user_vecs = model.user_factors[test_pairs_data["user_id"]]
test_item_vecs = model.item_factors[test_pairs_data["item_id"]]
test_als_score = np.sum(test_user_vecs * test_item_vecs, axis=1)

In [None]:
# Add the model scores to the test pairs as the "predict" column (in place),
# then write the whole frame, without the index, to ./submission.csv.
test_pairs_data["predict"] = test_als_score
test_pairs_data.to_csv("./submission.csv",index=False)