In [2]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
import datetime



In [None]:
# Load Data ---
# Load user interactions and meta data
interactions = pd.read_csv("dataset/User_interaction.csv")
meta_data = pd.read_csv("dataset/Metadata.csv")


In [4]:
# Convert date columns to datetime
interactions['updated_at'] = pd.to_datetime(interactions['updated_at'])
meta_data['updated_at'] = pd.to_datetime(meta_data['updated_at'])
meta_data['published_at'] = pd.to_datetime(meta_data['published_at'])


In [None]:
# Filter Positive Interactions ---
# Use only interactions with read_percentage >= 50
threshold = 50
interactions = interactions[interactions['read_percent'] >= threshold].copy()


In [6]:
# Sort interactions by time (for time-based splitting)
interactions = interactions.sort_values("updated_at").reset_index(drop=True)


In [7]:
# Split into 75% train and 25% test (by row order)
train_cutoff = int(0.75 * len(interactions))
train_interactions = interactions.iloc[:train_cutoff]
test_interactions = interactions.iloc[train_cutoff:]


In [None]:
# Prepare LightFM Dataset
dataset = Dataset()


In [9]:
# Get unique users and items from the interaction data
users = interactions['user_id'].unique()
# items = meta_data['pratilipi_id'].unique()


In [10]:
all_items = np.union1d(interactions['pratilipi_id'].unique(), meta_data['pratilipi_id'].unique())



In [11]:
dataset.fit(users=users, items=all_items)

In [12]:
# Build training interactions list and matrix
train_list = [(row['user_id'], row['pratilipi_id']) for _, row in train_interactions.iterrows()]
(train_matrix, _) = dataset.build_interactions(train_list)


In [13]:
# Build test interactions list and matrix
test_list = [(row['user_id'], row['pratilipi_id']) for _, row in test_interactions.iterrows()]
(test_matrix, _) = dataset.build_interactions(test_list)


In [None]:
# Build Item Features 
# We combine multiple columns: author_id, category_name, and reading_time (binned)
def create_item_features(row):
    features = []
    features.append(f"author:{row['author_id']}")
    features.append(f"category:{row['category_name']}")
    # Convert reading_time from seconds to minutes and bin it
    minutes = int(row['reading_time'] // 60)
    features.append(f"reading_time:{minutes}")
    return features

meta_data['features'] = meta_data.apply(create_item_features, axis=1)


In [15]:
# Create a list of (item_id, [features]) tuples
item_features_list = [(row['pratilipi_id'], row['features']) for _, row in meta_data.iterrows()]


In [16]:
# Let LightFM know all possible item features
all_item_features = set()
for _, feats in item_features_list:
    all_item_features.update(feats)
dataset.fit_partial(items=all_items, item_features=list(all_item_features))


In [17]:
# Build the item features matrix
item_features_matrix = dataset.build_item_features(item_features_list)


In [None]:

#  Train the LightFM Model ---
# Using fewer epochs and one thread to reduce load
model = LightFM(loss='logistic', random_state=42)
model.fit(train_matrix, item_features=item_features_matrix, epochs=5, num_threads=1)


<lightfm.lightfm.LightFM at 0x29b1f0ef8b0>

In [None]:
# Evaluate the Model
# train_precision = precision_at_k(model, train_matrix, item_features=item_features_matrix, k=5).mean()
# test_precision = precision_at_k(model, test_matrix, item_features=item_features_matrix, k=5).mean()
# print("Train Precision@5:", train_precision)
# print("Test Precision@5:", test_precision)


In [None]:
# Recommendation Function 
# Get mapping dictionaries from LightFM
user_id_mapping, item_id_mapping = dataset.mapping()[0], dataset.mapping()[1]
# Create a reverse mapping for items: internal id -> external pratilipi_id
rev_item_mapping = {v: k for k, v in item_id_mapping.items()}


In [21]:
def recommend(user_ext_id, model, dataset, item_features_matrix, num_rec=5):
    # Convert external user_id to internal index
    internal_user_id = user_id_mapping[user_ext_id]
    n_items = len(item_id_mapping)
    # Predict scores for all items for this user
    scores = model.predict(internal_user_id, np.arange(n_items), item_features=item_features_matrix)
    # Get top indices sorted by score
    top_indices = np.argsort(-scores)[:num_rec]
    recommended_items = [rev_item_mapping[i] for i in top_indices]
    return recommended_items


In [None]:
# Get Recommendations for a Sample User ---
sample_user = interactions['user_id'].iloc[568945]
recommendations = recommend(sample_user, model, dataset, item_features_matrix, num_rec=5)
print(f"Recommendations for user {sample_user}:", recommendations)

Recommendations for user 5506791974854999: [5506791979223815, 5506791970045925, 5506791968261668, 5506791991878999, 5506791973354582]


In [None]:
# --- Generate Recommendations for All Test Users ---

# Get the unique user IDs from the test interactions
test_users = test_interactions['user_id'].unique()

# Dictionary to store recommendations for each test user
all_recommendations = {}

# Loop through each user in the test set
for user in test_users:
    # Call your recommend function to get top 5 recommendations for the user
    recs = recommend(user, model, dataset, item_features_matrix, num_rec=5)
    if recs is not None:
        all_recommendations[user] = recs

# Print out recommendations for each test user
for user, recs in all_recommendations.items():
    print(f"Recommendations for user {user}: {recs}")


In [23]:
# import pickle

# # 1. Pickle the trained LightFM model
# with open("lightfm_model.pkl", "wb") as f:
#     pickle.dump(model, f)
# print("LightFM model saved as lightfm_model.pkl")

# # 2. Pickle the dataset mapping (user and item mapping)
# mapping = dataset.mapping()  # This returns a tuple: (user_mapping, item_mapping)
# with open("lightfm_mapping.pkl", "wb") as f:
#     pickle.dump(mapping, f)
# print("Dataset mapping saved as lightfm_mapping.pkl")

# # 3. Pickle the item features matrix (optional but recommended if expensive to compute)
# with open("lightfm_item_features.pkl", "wb") as f:
#     pickle.dump(item_features_matrix, f)
# print("Item features matrix saved as lightfm_item_features.pkl")