##### import libs

In [1]:
import numpy as np
import lightfm
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split

In [2]:
from utils import evaluate, load_data

##### read data

In [3]:
user_item_data, user_meta_data, item_meta_data, test_pairs_data = load_data()

##### split data

In [4]:
ui_train, ui_val = train_test_split(user_item_data,
                                    test_size=0.15,
                                    random_state=42,
                                    shuffle=False)

In [5]:
u_train = ui_train.user_id
i_train = ui_train.item_id
likes_train = ui_train.like
dislikes_train = ui_train.dislike

u_val = ui_val.user_id
i_val = ui_val.item_id
likes_val = ui_val.like
dislikes_val = ui_val.dislike

In [6]:
sparse_train = coo_matrix((likes_train-dislikes_train, (u_train, i_train)))
sparse_val = coo_matrix((likes_val-dislikes_val, (u_val, i_val)))

In [7]:
sparse_train = coo_matrix((likes_train, (u_train, i_train)))
sparse_val = coo_matrix((likes_val, (u_val, i_val)))

In [8]:
sparse_train = coo_matrix((dislikes_train, (u_train, i_train)))
sparse_val = coo_matrix((dislikes_val, (u_val, i_val)))

##### lightfm model

In [18]:
for ratio in [1, 0.8, 0.6, 0.4, 0.2]:
    sparse_train = coo_matrix((likes_train.values[-int(len(likes_train)*ratio):],
                               (u_train.values[-int(len(likes_train)*ratio):],
                                i_train.values[-int(len(likes_train)*ratio):])))
    model = lightfm.LightFM(no_components=128,
                            loss='bpr',
                            random_state=42)
    model.fit(
        interactions=sparse_train,
        epochs=5,
        num_threads=4,
        verbose=True)
    
    val_lightfm_score = model.predict(u_val.values, i_val.values)

    roc_auc = evaluate(
        user_id=ui_val.user_id.values,
        target=ui_val.like.values,
        score=val_lightfm_score)
    
    print(f"{ratio=}, {roc_auc=}")

Epoch: 100%|██████████| 5/5 [01:44<00:00, 20.92s/it]
100%|██████████| 181721/181721 [00:53<00:00, 3411.23it/s]


ratio=1, roc_auc=0.5960142833386671


Epoch: 100%|██████████| 5/5 [01:18<00:00, 15.75s/it]
100%|██████████| 181721/181721 [00:52<00:00, 3457.65it/s]


ratio=0.8, roc_auc=0.5909390650523213


Epoch: 100%|██████████| 5/5 [00:58<00:00, 11.63s/it]
100%|██████████| 181721/181721 [00:51<00:00, 3500.49it/s]


ratio=0.6, roc_auc=0.5854972446478661


Epoch: 100%|██████████| 5/5 [00:37<00:00,  7.50s/it]
100%|██████████| 181721/181721 [00:52<00:00, 3436.69it/s]


ratio=0.4, roc_auc=0.5787914872362939


Epoch: 100%|██████████| 5/5 [00:18<00:00,  3.67s/it]
100%|██████████| 181721/181721 [00:53<00:00, 3419.58it/s]

ratio=0.2, roc_auc=0.5739941961337525





In [19]:
sparse_train = coo_matrix((np.ones_like(u_train.values), (u_train, i_train)))
sparse_val = coo_matrix((np.ones_like(u_val.values), (u_val, i_val)))

In [20]:
model = lightfm.LightFM(no_components=128,
                        loss="bpr",
                        random_state=42)

In [None]:
model.fit(interactions=sparse_train,
          epochs=20,
          num_threads=4,
          verbose=True)

Epoch: 100%|██████████| 20/20 [51:52<00:00, 155.63s/it]


<lightfm.lightfm.LightFM at 0x7f497a435060>

In [22]:
val_lightfm_score = model.predict(u_val.values, i_val.values)

In [23]:
ui_val["target"] = ui_val["like"]# - ui_val["dislike"]
ui_val["lightfm_score"] = val_lightfm_score

In [24]:
evaluate(user_id=ui_val.user_id.values,
         target=np.ones_like(u_val.values),
         score=ui_val.lightfm_score.values)

100%|██████████| 181721/181721 [00:01<00:00, 175669.59it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


nan

In [23]:
train_lightfm_score = model.predict(u_train.values, i_train.values)
ui_train["target"] = ui_train["like"]# - ui_val["dislike"]
ui_train["lightfm_score"] = train_lightfm_score

In [24]:
evaluate(user_id=ui_train.user_id.values,
         target=ui_train.target.values,
         score=ui_train.lightfm_score.values)

100%|██████████| 183404/183404 [07:28<00:00, 408.97it/s]


0.6966544215822357

In [14]:
evaluate(user_id=ui_val.user_id.values,
         target=ui_val.target.values,
         score=ui_val.lightfm_score.values)

100%|██████████| 181721/181721 [00:51<00:00, 3542.67it/s]


0.6213206286959302

In [12]:
evaluate(user_id=ui_val.user_id.values,
         target=ui_val.target.values,
         score=ui_val.lightfm_score.values)

100%|██████████| 181721/181721 [00:52<00:00, 3431.79it/s]


0.620683500908264

In [None]:
np.save("dumps/user_embeddings.npy", model.user_embeddings)
np.save("dumps/item_embeddings.npy", model.item_embeddings)
np.save("dumps/user_biases.npy", model.user_biases)
np.save("dumps/item_biases.npy", model.item_biases)

In [14]:
user_ids = user_item_data.user_id.values
item_ids = user_item_data.item_id.values

lightfm_scores = model.predict(user_ids=user_ids,
                               item_ids=item_ids)

In [None]:
np.save("dumps/lightfm_scores.npy", lightfm_scores)

##### submission

In [17]:
test_lightfm_score = model.predict(test_pairs_data.user_id.values,
                                   test_pairs_data.item_id.values)

In [None]:
np.save("dumps/test_lightfm_scores.npy", test_lightfm_score)

In [16]:
test_pairs_data["predict"] = test_lightfm_score
test_pairs_data.to_csv("./submission.csv",index=False)