##### import libs

In [20]:
import numpy as np
import pandas as pd
import lightfm
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.model_selection import train_test_split

In [2]:
from utils import evaluate, load_data

In [3]:
# Show up to 100 rows before pandas truncates a displayed frame.
pd.options.display.max_rows = 100

##### read data

In [4]:
# load_data() (from utils) returns four objects; they are unpacked in order below.
# Their exact contents are defined by that helper.
loaded_tables = load_data()
user_item_data, user_meta_data, item_meta_data, test_pairs_data = loaded_tables

In [6]:
# Attach every item metadata column except "embeddings" to the interaction rows.
# The left join keeps all interaction rows, even when an item_id has no metadata entry.
# NOTE: re-running this cell merges again and produces suffixed duplicate columns.
item_columns = item_meta_data.drop(columns="embeddings")
user_item_data = pd.merge(user_item_data, item_columns, on="item_id", how="left")

In [7]:
# Time spent expressed as a fraction of the "duration" column (element-wise division).
# Rows with a zero or missing duration come out as inf / NaN.
user_item_data["timespent_rel"] = user_item_data["timespent"].div(
    user_item_data["duration"]
)

In [8]:
# Multipliers for the share, bookmark and relative-time-spent signals.
share_weight, bookmarks_weight, timespent_rel_weight = 10, 1, 50

In [9]:
# Per-row multiplier: 1 plus the weighted share, bookmark and relative-time signals.
engagement_multiplier = (
    1
    + share_weight * user_item_data["share"]
    + bookmarks_weight * user_item_data["bookmarks"]
    + timespent_rel_weight * user_item_data["timespent_rel"]
)
# Multiplying by "like" zeroes the target wherever like == 0.
user_item_data["weighted_target"] = user_item_data["like"] * engagement_multiplier

##### split data

In [10]:
# Hold out the trailing 15% of rows for validation. With shuffle=False the row
# order is preserved, so random_state has no effect on this split.
ui_train, ui_val = train_test_split(
    user_item_data,
    test_size=0.15,
    shuffle=False,
    random_state=42,
)

In [6]:
# Alias for the hold-out split; the following cells filter it step by step.
# NOTE(review): the name suggests the hold-out covers the final week of data — confirm.
last_week = ui_val

In [7]:
# Step 1: Keep only users with at least 20 interactions in the hold-out split
interactions_per_user = last_week.groupby('user_id').size()
active_user_ids = interactions_per_user.loc[interactions_per_user >= 20].index
filtered_last_week = last_week.loc[last_week['user_id'].isin(active_user_ids)]

In [None]:
# Step 2: Keep only the first 20 interactions per user
# The 1-based per-user rank lives in its own Series (aligned on the row index)
# rather than being written into filtered_last_week: that frame is a boolean-
# filtered slice, and adding a column to it triggers SettingWithCopyWarning.
interaction_rank = filtered_last_week.groupby('user_id').cumcount() + 1
# .copy() yields an independent frame, as the original drop(columns=...) did.
final_last_week = filtered_last_week[interaction_rank <= 20].copy()

In [9]:
# Step 3: Drop users whose retained interactions contain no likes at all
likes_per_user = final_last_week.groupby('user_id')['like'].sum()
liked_user_ids = likes_per_user.loc[likes_per_user > 0].index
has_likes = final_last_week['user_id'].isin(liked_user_ids)
final_last_week = final_last_week.loc[has_likes]

In [10]:
# From here on the validation set is the filtered hold-out frame built above.
ui_val = final_last_week

In [11]:
# Column Series reused below to build the sparse matrices and to call model.predict.
u_train, i_train = ui_train["user_id"], ui_train["item_id"]
likes_train, dislikes_train = ui_train["like"], ui_train["dislike"]

u_val, i_val = ui_val["user_id"], ui_val["item_id"]
likes_val, dislikes_val = ui_val["like"], ui_val["dislike"]

In [21]:
# Per-interaction weights placed at the (user_id, item_id) coordinates of the
# training rows; the matrix shape is inferred from the largest ids present.
weight_values = ui_train["weighted_target"]
weights = coo_matrix((weight_values, (u_train, i_train)))

In [25]:
# Interaction matrices with entries like - dislike at (user_id, item_id).
# Both matrices get one explicit shape covering every id in user_item_data.
# Letting coo_matrix infer the shape from each split gives train and validation
# different shapes, and leaves ids that appear only in the hold-out rows without
# an embedding, which makes model.predict raise for them.
# When the training rows already contain the largest ids this equals the
# inferred shape. Any sample_weight matrix passed to fit must use the same shape.
interaction_shape = (
    int(user_item_data["user_id"].max()) + 1,
    int(user_item_data["item_id"].max()) + 1,
)
sparse_train = coo_matrix(
    (likes_train - dislikes_train, (u_train, i_train)),
    shape=interaction_shape,
)
sparse_val = coo_matrix(
    (likes_val - dislikes_val, (u_val, i_val)),
    shape=interaction_shape,
)

##### lightfm model

In [14]:
# LightFM model with 128 latent components trained with the BPR loss;
# random_state seeds the model's random number generator.
model_params = {
    "no_components": 128,
    "loss": "bpr",
    "random_state": 42,
}
model = lightfm.LightFM(**model_params)

In [None]:
# Hyperparameters
epochs = 20
train_scores = []
val_scores = []

# Train one epoch at a time and record the evaluation score after each epoch.
# fit_partial resumes from the current model state, so re-running this cell
# keeps training the same model rather than starting over.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    model.fit_partial(
        interactions=sparse_train,
        epochs=1,
        num_threads=16,
        verbose=True)

    # Score the validation and training pairs with the current model state.
    # num_threads matches the other predict calls in this notebook; it only
    # parallelises the computation and does not change the scores.
    val_lightfm_pred = model.predict(u_val.values, i_val.values, num_threads=16)
    train_lightfm_pred = model.predict(u_train.values, i_train.values, num_threads=16)

    train_score = evaluate(
        user_id=ui_train.user_id.values,
        target=ui_train.explicit.values,
        score=train_lightfm_pred)

    val_score = evaluate(
        user_id=ui_val.user_id.values,
        target=ui_val.explicit.values,
        score=val_lightfm_pred)

    train_scores.append(train_score)
    val_scores.append(val_score)

    print(f"{train_score=:.4f}, {val_score=:.4f}")


In [None]:
# Plot the per-epoch train and validation scores collected by the training loop
epoch_axis = range(1, epochs + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epoch_axis, train_scores, label='Train ROC AUC')
ax.plot(epoch_axis, val_scores, label='Validation ROC AUC')
ax.set_xlabel('Epochs')
ax.set_ylabel('ROC AUC')
ax.set_title('LightFM Training and Validation ROC AUC')
ax.legend()
ax.grid()
plt.show()

In [32]:
# Train for 20 epochs in a single call. Unlike fit_partial, fit() does not resume
# from the current model state, so epochs run by earlier fit_partial calls are discarded.
# The sample_weight argument is currently commented out.
model.fit(
    sparse_train,
    # sample_weight=weights,
    epochs=20,
    num_threads=16,
    verbose=True,
)

Epoch:  10%|█         | 2/20 [00:33<05:07, 17.06s/it]

: 

In [None]:
# Score the validation (user, item) pairs with the fitted model.
val_lightfm_pred = model.predict(
    user_ids=u_val.values,
    item_ids=i_val.values,
    num_threads=16,
)
# Scoring of the training pairs is disabled in this cell:
#train_lightfm_pred = model.predict(u_train.values, i_train.values, num_threads=16)

In [None]:
# Validation score as computed by the project's evaluate helper.
val_score = evaluate(
    score=val_lightfm_pred,
    target=ui_val["explicit"].values,
    user_id=ui_val["user_id"].values,
)
print(f"{val_score=}")

100%|██████████| 181721/181721 [00:51<00:00, 3521.37it/s]

val_score=0.6068528453727989





In [20]:
# Attach the validation scores as a new column (one score per row, by position).
# ui_val is a filtered slice of the hold-out split, so item assignment on it
# triggers SettingWithCopyWarning; assign() returns a new frame instead.
ui_val = ui_val.assign(val_lightfm_pred=val_lightfm_pred)

In [None]:
# Inspect one user's validation rows, ordered by ascending predicted score.
single_user_rows = ui_val.loc[ui_val["user_id"] == 29421]
single_user_rows.sort_values(by="val_lightfm_pred")

In [None]:
# Score the training pairs with the current model before evaluating them.
# Without this, train_lightfm_pred is whatever the per-epoch training loop left
# behind, i.e. scores produced before model.fit() retrained the model, so the
# reported train score would not describe the model evaluated on validation.
train_lightfm_pred = model.predict(u_train.values, i_train.values, num_threads=16)

train_score = evaluate(
    user_id=ui_train.user_id.values,
    target=ui_train.explicit.values,
    score=train_lightfm_pred)
print(f"{train_score=}")

In [12]:
# Persist the learned LightFM parameters as .npy files under dumps/
# (the directory must already exist).
for dump_path, parameter in (
    ("dumps/lfm_user_embeddings.npy", model.user_embeddings),
    ("dumps/lfm_item_embeddings.npy", model.item_embeddings),
    ("dumps/lfm_user_biases.npy", model.user_biases),
    ("dumps/lfm_item_biases.npy", model.item_biases),
):
    np.save(dump_path, parameter)

In [13]:
# Score every (user, item) row of the full interaction table.
user_ids = user_item_data.user_id.values
item_ids = user_item_data.item_id.values

# num_threads matches the other predict calls in this notebook; it only
# parallelises the computation and leaves the scores unchanged.
lightfm_pred = model.predict(user_ids=user_ids,
                             item_ids=item_ids,
                             num_threads=16)

In [14]:
# Persist the scores predicted for every row of user_item_data.
np.save("dumps/lfm_scores.npy", lightfm_pred)

##### submission

In [40]:
# Score the (user, item) pairs of the test set with the fitted model.
test_user_ids = test_pairs_data["user_id"].values
test_item_ids = test_pairs_data["item_id"].values
test_lightfm_pred = model.predict(test_user_ids, test_item_ids, num_threads=16)

In [16]:
# Persist the raw scores predicted for the test pairs.
np.save("dumps/test_lightfm_scores.npy", test_lightfm_pred)

In [41]:
# Store the scores in a "predict" column and write the whole test-pairs frame
# (without the index) as the submission file.
submission_path = "./lfm_submission.csv"
test_pairs_data["predict"] = test_lightfm_pred
test_pairs_data.to_csv(submission_path, index=False)