In [13]:
import numpy as np
import pandas as pd

# Read the pre-joined ratings + movie-metadata table and preview its first rows.
ratings_path = "../data/processed/ratings_joined.parquet"
df = pd.read_parquet(ratings_path)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,release_date,video_release_date,IMDB_URL,unknown,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,302,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,377,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,51,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,346,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,...,0,0,0,0,0,0,0,0,0,0


Build User-Item matrix and SVD

In [14]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Load the joined ratings table and order it chronologically.
# A stable sort keeps rows that share a timestamp in their original file order,
# so the "last rating per user" picked below is reproducible instead of
# depending on how an unstable sort happens to break ties.
df = pd.read_parquet("../data/processed/ratings_joined.parquet")
df = df.sort_values("timestamp", kind="mergesort")

# Leave-last-out split: each user's most recent rating is held out for testing,
# everything else is used for training.
test = df.groupby("user_id").tail(1)
train = df.drop(test.index)

# `drop` works on index labels; duplicated labels would silently remove extra
# training rows, so make sure the two parts still add up to the full table.
assert len(train) + len(test) == len(df), "train/test split lost or duplicated rows"

# Map raw ids to contiguous 0-based matrix indices (training data only, so
# users/items that appear only in `test` have no index).
unique_users = train["user_id"].unique()
unique_items = train["movie_id"].unique()

user_id_to_idx = {u: i for i, u in enumerate(unique_users)}
item_id_to_idx = {m: i for i, m in enumerate(unique_items)}
idx_to_item_id = {i: m for m, i in item_id_to_idx.items()}

n_users = len(unique_users)
n_items = len(unique_items)

# Build the sparse user x item matrix with ratings as values.
# NOTE: csr_matrix sums entries that share the same (row, col) pair, so a
# repeated (user, movie) rating in `train` would be added together.
row_idx = train["user_id"].map(user_id_to_idx)
col_idx = train["movie_id"].map(item_id_to_idx)
vals = train["rating"].astype(float)

user_item = csr_matrix(
    (vals, (row_idx, col_idx)),
    shape=(n_users, n_items)
)

user_item.shape


(943, 1680)

Truncated SVD as a matrix-factorization (MF) approximation

In [15]:
k = 50  # number of latent factors

# Fit a k-component truncated SVD on the sparse user-item rating matrix.
# random_state is pinned so re-running the cell uses the same seed.
svd = TruncatedSVD(n_components=k, random_state=42)
# One k-dimensional row per user (the transformed rating matrix).
user_factors = svd.fit_transform(user_item)        # shape: (n_users, k)
# components_ is transposed so that each item also gets one k-dimensional row;
# downstream, a user row dotted with an item row is used as the item's score.
item_factors = svd.components_.T                   # shape: (n_items, k)

# Display both shapes as a quick sanity check.
user_factors.shape, item_factors.shape


((943, 50), (1680, 50))

Recommend with dot product

In [16]:
# Item scores are computed on demand for one user at a time (nothing is cached).
def recommend_svd(user_id, k_rec=10):
    """Return up to ``k_rec`` not-yet-rated movie ids for a user, best first.

    Every item is scored with the dot product between the user's latent vector
    and the item's latent vector; items the user already rated in ``train`` are
    excluded before the top scores are selected.

    Parameters
    ----------
    user_id
        Raw user id as it appears in ``train["user_id"]``.
    k_rec : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Movie ids ordered by descending score. The list is empty when the user
        is unknown to the model, when ``k_rec`` is not positive, or when the
        user has already rated every item; it is shorter than ``k_rec`` when
        fewer unseen items remain.
    """
    # Unknown user or nothing requested. Without the k_rec guard,
    # `[-0:]` would select the whole array and return every item.
    if user_id not in user_id_to_idx or k_rec <= 0:
        return []

    u_idx = user_id_to_idx[user_id]
    u_vec = user_factors[u_idx]  # (k,)

    # Scores for all items; the matmul result is a fresh array, safe to modify.
    scores = item_factors @ u_vec  # shape: (n_items,)

    # Column indices of the items this user already rated in train.
    seen_idx = (
        train.loc[train["user_id"] == user_id, "movie_id"]
        .map(item_id_to_idx)
        .to_numpy(dtype=np.intp)
    )

    # Mask out seen items so they can never be selected.
    unseen = np.ones(scores.shape[0], dtype=bool)
    unseen[seen_idx] = False
    scores[~unseen] = -np.inf

    # Never ask for more items than there are unseen candidates: argpartition
    # raises for an out-of-range k, and masked (-inf) items must not leak
    # into the result.
    n_top = min(k_rec, int(unseen.sum()))
    if n_top <= 0:
        return []

    # Partial selection of the n_top best scores, then order them descending.
    top_idx = np.argpartition(scores, -n_top)[-n_top:]
    top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]

    return [idx_to_item_id[i] for i in top_idx]


Reuse Hit@K

In [17]:
def hit_at_k(recommender_fn, k=10):
    """Compute Hit@K of a recommender over the held-out ``test`` rows.

    For each test row, ``recommender_fn(user_id, k)`` is called and the row
    counts as a hit when its ``movie_id`` is among the returned items. Rows for
    which the recommender returns nothing are left out of the denominator.

    Parameters
    ----------
    recommender_fn : callable
        Function taking ``(user_id, k)`` and returning a collection of movie ids.
    k : int, default 10
        Number of recommendations requested per user.

    Returns
    -------
    float
        Hits divided by the number of scored rows, or 0.0 if none were scored.
    """
    n_hits = 0
    n_scored = 0
    for user, held_out_item in zip(test["user_id"], test["movie_id"]):
        recommended = recommender_fn(user, k)
        if recommended:
            n_scored += 1
            if held_out_item in recommended:
                n_hits += 1
    if n_scored > 0:
        return n_hits / n_scored
    return 0.0

# Evaluate the SVD recommender on top-10 lists and report the hit rate.
svd_hit_at_10 = hit_at_k(recommend_svd, 10)
print("SVD Hit@10:", svd_hit_at_10)


SVD Hit@10: 0.13043478260869565


Save the model to pickle files, but do not commit them to git

In [19]:
import pickle
from pathlib import Path

# Make sure the output directory exists (no error if it is already there).
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

# Everything needed to score items for a user later: both factor matrices
# plus the id <-> matrix-index lookups.
svd_artifacts = {
    "user_factors": user_factors,
    "item_factors": item_factors,
    "user_id_to_idx": user_id_to_idx,
    "item_id_to_idx": item_id_to_idx,
    "idx_to_item_id": idx_to_item_id,
}

with open("../models/svd_factors.pkl", "wb") as f:
    pickle.dump(svd_artifacts, f)
