##### import libs

In [1]:
import numpy as np
import pandas as pd
import lightfm
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [2]:
from utils import evaluate, load_data

In [3]:
pd.set_option('display.max_rows', 100)

##### read data

In [4]:
user_item_data, user_meta_data, item_meta_data, test_pairs_data = load_data()

##### split data

In [5]:
ui_train, ui_val = train_test_split(user_item_data,
                                    test_size=0.15,
                                    random_state=42,
                                    shuffle=False)

In [6]:
last_week = ui_val

In [7]:
# Step 1: Filter users with less than 20 interactions
interaction_counts = last_week['user_id'].value_counts()
users_with_20_plus_interactions = interaction_counts[interaction_counts >= 20].index
filtered_last_week = last_week[last_week['user_id'].isin(users_with_20_plus_interactions)]

In [8]:
# Step 2: Keep only the first 20 interactions per user
filtered_last_week['interaction_rank'] = filtered_last_week.groupby('user_id').cumcount() + 1
final_last_week = filtered_last_week[filtered_last_week['interaction_rank'] <= 20].drop(columns='interaction_rank')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_last_week['interaction_rank'] = filtered_last_week.groupby('user_id').cumcount() + 1


In [9]:
# Step 3: Filter out users with zero total likes
user_likes = final_last_week.groupby('user_id')['like'].sum()
users_with_likes = user_likes[user_likes > 0].index
final_last_week = final_last_week[final_last_week['user_id'].isin(users_with_likes)]

In [10]:
ui_val = final_last_week

In [11]:
u_train = ui_train.user_id
i_train = ui_train.item_id
likes_train = ui_train.like
dislikes_train = ui_train.dislike

u_val = ui_val.user_id
i_val = ui_val.item_id
likes_val = ui_val.like
dislikes_val = ui_val.dislike

In [12]:
sparse_train = csr_matrix((likes_train-dislikes_train, (u_train, i_train)))
sparse_val = csr_matrix((likes_val-dislikes_val, (u_val, i_val)))

In [17]:
ui_val[ui_val.user_id==2]

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,explicit
124715691,2,106645,17,0,0,0,0,0
126317514,2,59203,36,1,0,0,0,1
126318776,2,275450,19,1,0,0,0,1
127522050,2,21053,23,0,0,0,0,0
127524631,2,259723,30,0,0,0,0,0
127527710,2,219656,71,0,0,0,0,0
127755959,2,304626,2,0,0,0,0,0
127756054,2,219967,7,0,0,0,0,0
127756435,2,331804,5,0,0,0,0,0
127757159,2,106799,19,0,0,0,0,0


##### lightfm model

In [11]:
model = lightfm.LightFM(no_components=128,
                        loss="bpr",
                        random_state=42)

In [None]:
# Hyperparameters
epochs = 20
train_scores = []
val_scores = []

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    model.fit_partial(
        interactions=sparse_train,
        epochs=1,
        num_threads=16,
        verbose=True)
    
    # Calculate training loss (average loss over all predictions)
    val_lightfm_pred = model.predict(u_val.values, i_val.values)
    train_lightfm_pred = model.predict(u_train.values, i_train.values)
    
    train_score = evaluate(
        user_id=ui_train.user_id.values,
        target=ui_train.explicit.values,
        score=train_lightfm_pred)
    
    val_score = evaluate(
        user_id=ui_val.user_id.values,
        target=ui_val.explicit.values,
        score=val_lightfm_pred)

    train_scores.append(train_score)
    val_scores.append(val_score)

    print(f"{train_score=:.4f}, {val_score=:.4f}")


In [None]:
# Plot training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs + 1), train_scores, label='Train ROC AUC')
plt.plot(range(1, epochs + 1), val_scores, label='Validation ROC AUC')
plt.xlabel('Epochs')
plt.ylabel('ROC AUC')
plt.title('LightFM Training and Validation ROC AUC')
plt.legend()
plt.grid()
plt.show()

In [12]:
model.fit(interactions=sparse_train,
          epochs=10,
          num_threads=16,
          verbose=True)

Epoch: 100%|██████████| 10/10 [01:48<00:00, 10.86s/it]


<lightfm.lightfm.LightFM at 0x7fd2e74a77c0>

In [38]:
val_lightfm_pred = model.predict(u_val.values, i_val.values, num_threads=16)
#train_lightfm_pred = model.predict(u_train.values, i_train.values, num_threads=16)

In [39]:
val_score = evaluate(
    user_id=ui_val.user_id.values,
    target=ui_val.explicit.values,
    score=val_lightfm_pred)
print(f"{val_score=}")

100%|██████████| 49842/49842 [00:05<00:00, 9850.63it/s] 

val_score=0.5848539790555083





In [20]:
ui_val["val_lightfm_pred"] = val_lightfm_pred

In [28]:
ui_val[ui_val.user_id==29421].sort_values(by="val_lightfm_pred")

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,explicit,val_lightfm_pred
123817634,29421,237757,12,0,0,0,0,0,-6.3602
125004909,29421,223439,21,1,0,0,0,1,-5.390552
124845209,29421,155761,77,1,0,0,0,1,-4.775793
125170787,29421,153745,56,0,0,0,0,0,-4.361218
145087729,29421,255622,63,0,0,0,0,0,-4.348516
124829168,29421,27010,31,0,0,0,0,0,-4.135693
125003800,29421,158960,46,0,0,0,0,0,-4.119882
125157763,29421,121400,26,0,0,0,0,0,-4.110981
131479472,29421,123735,46,0,0,0,0,0,-3.792749
125180711,29421,240847,91,1,0,0,0,1,-3.67257


In [15]:
train_score = evaluate(
    user_id=ui_train.user_id.values,
    target=ui_train.explicit.values,
    score=train_lightfm_pred)
print(f"{train_score=}")

 59%|█████▉    | 108918/183404 [04:36<03:08, 394.62it/s]


KeyboardInterrupt: 

In [12]:
np.save("dumps/lfm_user_embeddings.npy", model.user_embeddings)
np.save("dumps/lfm_item_embeddings.npy", model.item_embeddings)
np.save("dumps/lfm_user_biases.npy", model.user_biases)
np.save("dumps/lfm_item_biases.npy", model.item_biases)

In [13]:
user_ids = user_item_data.user_id.values
item_ids = user_item_data.item_id.values

lightfm_pred = model.predict(user_ids=user_ids,
                               item_ids=item_ids)

In [14]:
np.save("dumps/lfm_scores.npy", lightfm_pred)

##### submission

In [40]:
test_lightfm_pred = model.predict(
    test_pairs_data.user_id.values,
    test_pairs_data.item_id.values,
    num_threads=16)

In [16]:
np.save("dumps/test_lightfm_scores.npy", test_lightfm_pred)

In [41]:
test_pairs_data["predict"] = test_lightfm_pred
test_pairs_data.to_csv("./lfm_submission.csv",index=False)