In [15]:
import pandas as pd
import numpy as np

from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, auc_score
from sklearn.preprocessing import MultiLabelBinarizer

ratings_df = pd.read_csv("ratings.csv").drop(columns=["timestamp"])
movies_df = pd.read_csv("movies.csv")

print(ratings_df)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100831     610   166534     4.0
100832     610   168248     5.0
100833     610   168250     5.0
100834     610   168252     5.0
100835     610   170875     3.0

[100836 rows x 3 columns]


In [16]:
unique_users = sorted(ratings_df["userId"].unique())
unique_movies = sorted(ratings_df["movieId"].unique())

user_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_users)}
movie_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_movies)}

ratings_df["user_idx"] = ratings_df["userId"].map(user_id_map)
ratings_df["movie_idx"] = ratings_df["movieId"].map(movie_id_map)

n_users = len(unique_users)
n_movies = len(unique_movies)

print(n_users, n_movies)

610 9724


In [17]:
interactions = coo_matrix(
    (ratings_df["rating"] >= 2, (ratings_df["user_idx"], ratings_df["movie_idx"])),
    shape=(n_users, n_movies),
)

train_interactions, test_interactions = random_train_test_split(interactions, 0.2, 42)

inv_movie_id_map = {v: k for k, v in movie_id_map.items()}

movies_indexed = movies_df.set_index("movieId").reindex(inv_movie_id_map.values())
movies_indexed = movies_indexed.reset_index()
movies_indexed["movie_idx"] = movies_indexed["movieId"].map(movie_id_map)

movies_indexed = movies_indexed.sort_values("movie_idx").reset_index(drop=True)

In [18]:
movies_indexed["genre_list"] = movies_indexed["genres"].fillna("").str.split("|")

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_indexed["genre_list"])

item_features = csr_matrix(genre_matrix)

In [19]:
model_collab = LightFM(loss="warp", no_components=30, random_state=42)
model_collab.fit(train_interactions, epochs=30, num_threads=4)

model_hybrid = LightFM(loss="warp", no_components=30, random_state=42)
model_hybrid.fit(
    train_interactions, item_features=item_features, epochs=50, num_threads=5
)

<lightfm.lightfm.LightFM at 0x15dd2b810>

In [20]:
auc_collab = auc_score(
    model_collab, test_interactions, train_interactions=train_interactions
).mean()
auc_hybrid = auc_score(
    model_hybrid,
    test_interactions,
    train_interactions=train_interactions,
    item_features=item_features,
).mean()

train_prec = precision_at_k(
    model_hybrid,
    test_interactions,
    train_interactions=train_interactions,
    k=10,
    item_features=item_features,
).mean()

print(f"Collab AUC: {auc_collab.round(4)}")
print(f"Hybrid AUC: {auc_hybrid.round(4)}")
print(f"Train Precision: {train_prec.round(4)}")

Collab AUC: 0.9283000230789185
Hybrid AUC: 0.6988999843597412
Train Precision: 0.01899999938905239


In [21]:
user_id = 7
user_idx = user_id_map[user_id]

n_recommendations = 10

scores = model_hybrid.predict(
    user_idx, np.arange(n_movies), item_features=item_features
)

top_movie_idx = np.argsort(-scores)[:n_recommendations]
recommended_movie_ids = [inv_movie_id_map[idx] for idx in top_movie_idx]
recommended_titles = (
    movies_df[movies_df["movieId"].isin(recommended_movie_ids)]
    .set_index("movieId")
    .loc[recommended_movie_ids]["title"]
    .values
)

print("Movies recommendations:\n")
for title in recommended_titles:
    print(f" - {title}")

Movies recommendations:

 - Aelita: The Queen of Mars (Aelita) (1924)
 - Captain America: The First Avenger (2011)
 - X-Men: First Class (2011)
 - King Kong (1976)
 - Interstate 60 (2002)
 - Transformers: Dark of the Moon (2011)
 - Blood Diamond (2006)
 - Were the World Mine (2008)
 - Maximum Ride (2016)
 - Black Mask (Hak hap) (1996)
