In [95]:
import numpy as np
import pandas as pd
import lightfm
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index

read data

In [2]:
data_path = Path("./data")
user_item_data_path = data_path / "train_interactions.parquet"
users_meta_data_path = data_path / "users_meta.parquet"
items_meta_data_path = data_path / "items_meta.parquet"
test_pairs_data_path = data_path / "test_pairs.csv"

In [3]:
user_item_data = pd.read_parquet(user_item_data_path)
users_meta_data = pd.read_parquet(users_meta_data_path)
items_meta_data = pd.read_parquet(items_meta_data_path)
test_pairs_data = pd.read_csv(test_pairs_data_path)

In [4]:
user_features = users_meta_data[["gender", "age"]].values
item_features = items_meta_data[["source_id", "duration"]].values

In [83]:
ui_train, ui_val = train_test_split(user_item_data,
                                    test_size=0.15,
                                    random_state=42,
                                    shuffle=False)

In [84]:
u_train = user_item_data.user_id
i_train = user_item_data.item_id
likes_train = user_item_data.like
dislikes_train = user_item_data.dislike

In [85]:
u_train = ui_train.user_id
i_train = ui_train.item_id
likes_train = ui_train.like
dislikes_train = ui_train.dislike

In [86]:
u_val = ui_val.user_id
i_val = ui_val.item_id
likes_val = ui_val.like
dislikes_val = ui_val.dislike

In [87]:
sparse_train = coo_matrix((likes_train-dislikes_train, (u_train, i_train)))
user_features_train = csr_matrix(user_features)
item_features_train = csr_matrix(item_features)

lightfm model

In [88]:
model = lightfm.LightFM(no_components=32)

In [89]:
model.fit(sparse_train,
          epochs=10,
          num_threads=8,
          verbose=True)

Epoch: 100%|██████████| 10/10 [05:13<00:00, 31.32s/it]


<lightfm.lightfm.LightFM at 0x7f61bb1ef760>

In [90]:
val_lightfm_score = model.predict(u_val.values, i_val.values)

In [96]:
ui_val["target"] = ui_val["like"] - ui_val["dislike"]
ui_val["lightfm_score"] = val_lightfm_score

In [93]:
def evaluate(user_id: np.ndarray, target: np.ndarray, score: np.ndarray) -> np.float64:
    sorting_indices = user_id.argsort()

    user_id = user_id[sorting_indices]
    target_and_score = np.stack([target,score]).reshape(-1,2)
    target_and_score = target_and_score[sorting_indices]
    
    groups = np.split(target_and_score, np.unique(user_id, return_index=True)[1][1:])
    roc_aucs = []
    for group in tqdm(groups):
        target = group[:,0]
        score = group[:,1]
        if len(np.unique(target)) == 1:
            continue
        roc_auc = concordance_index(target, score)
        roc_aucs.append(roc_auc)
    return np.mean(roc_aucs)

In [101]:
type(ui_val.user_id.values)

numpy.ndarray

In [102]:
a = np.float64(0.7750076492555034)

In [110]:
a.item()

0.7750076492555034

In [97]:
evaluate(user_id=ui_val.user_id.values,
         target=ui_val.target.values,
         score=ui_val.lightfm_score.values)

100%|██████████| 181721/181721 [01:41<00:00, 1788.16it/s]


np.float64(0.7750076492555034)

In [98]:
test_lightfm_score = model.predict(test_pairs_data.user_id.values,
                                   test_pairs_data.item_id.values,
                                   None)

In [99]:
test_pairs_data.user_id.values

array([     1,      1,      1, ..., 183403, 183403, 183403])

In [100]:
test_pairs_data["predict"] = test_lightfm_score
test_pairs_data.to_csv("./submission.csv",index=False)