In [9]:
import numpy as np
import pandas as pd

from recommender.data import load_processed_ratings
from recommender.svd_model import (
    user_id_to_idx,
    item_id_to_idx,
    user_factors,
    item_factors,
    get_train_test,
)


In [10]:
# Load the processed ratings table and preview its first rows as a sanity check.
df = load_processed_ratings()
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,release_date,video_release_date,IMDB_URL,unknown,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,302,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,377,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,51,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,346,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Fetch the train/test split, then report split sizes and how many distinct
# users and movies the training portion covers.
train, test = get_train_test()

print(len(train), len(test))
print("Users:", train["user_id"].nunique())
print("Movies:", train["movie_id"].nunique())

99057 943
Users: 943
Movies: 1680


- Define Positives and Negatives
- Positive = rating ≥ 4
- Negative = a movie the user never rated in the training set (we’ll sample a few per positive)

In [12]:
# Ratings at or above this value are treated as positive interactions.
POSITIVE_RATING_THRESHOLD = 4

# Positive training interactions; .copy() detaches the result from `train`
# so later edits to pos_train cannot touch the original frame.
pos_train = train[train["rating"] >= POSITIVE_RATING_THRESHOLD].copy()
pos_train.head()


Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,release_date,video_release_date,IMDB_URL,unknown,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
214,259,255,4,874724710,255,My Best Friend's Wedding (1997),20-Jun-1997,,http://us.imdb.com/M/title-exact?My+Best+Frien...,0,...,0,0,0,0,0,1,0,0,0,0
83965,259,286,4,874724727,286,"English Patient, The (1996)",15-Nov-1996,,http://us.imdb.com/M/title-exact?English%20Pat...,0,...,0,0,0,0,0,1,0,0,1,0
43027,259,298,4,874724754,298,Face/Off (1997),27-Jun-1997,,http://us.imdb.com/M/title-exact?Face/Off+(1997),0,...,0,0,0,0,0,0,1,1,0,0
21396,259,185,4,874724781,185,Psycho (1960),01-Jan-1960,,http://us.imdb.com/M/title-exact?Psycho%20(1960),0,...,0,0,1,0,0,1,0,1,0,0
82655,259,173,4,874724843,173,"Princess Bride, The (1987)",01-Jan-1987,,http://us.imdb.com/M/title-exact?Princess%20Br...,0,...,0,0,0,0,0,1,0,0,0,0


We’ll need a list of all movies:

In [13]:
# Every distinct movie ID that appears in the training data, as an array...
all_movie_ids = train.movie_id.unique()
# ...and as a set, so set arithmetic can be applied to it.
all_movie_ids_set = {*all_movie_ids}

- Compute Basic User & Item Features
- These will power both LightGBM and (later) a neural model.

User-level aggregates

In [14]:
# Per-user rating aggregates computed on the training data: number of ratings,
# mean rating, and standard deviation of the ratings.
user_stats = (
    train.groupby("user_id")
    .agg(
        user_total_ratings=("rating", "count"),
        user_avg_rating=("rating", "mean"),
        user_rating_std=("rating", "std"),
    )
    .reset_index()
)

user_stats.head()


Unnamed: 0,user_id,user_total_ratings,user_avg_rating,user_rating_std
0,1,271,3.619926,1.255879
1,2,61,3.721311,1.03491
2,3,53,2.811321,1.225633
3,4,23,4.347826,0.934622
4,5,174,2.87931,1.36527


Item-level aggregates

In [15]:
# Per-movie rating aggregates computed on the training data: how many ratings
# the movie received (popularity) and its mean rating.
item_stats = (
    train.groupby("movie_id")
    .agg(
        movie_popularity=("rating", "count"),
        movie_avg_rating=("rating", "mean"),
    )
    .reset_index()
)

item_stats.head()


Unnamed: 0,movie_id,movie_popularity,movie_avg_rating
0,1,449,3.879733
1,2,129,3.20155
2,3,89,3.022472
3,4,209,3.550239
4,5,85,3.294118


Helper: SVD Score Feature

In [16]:
def svd_score(user_id: int, movie_id: int) -> float:
    """
    Matrix-factorisation (SVD) score for a single (user, movie) pair.

    The score is the product (``@``) of the user's row in ``user_factors``
    and the movie's row in ``item_factors``. When either ID is missing from
    its ID-to-index lookup table, 0.0 is returned instead.
    """
    try:
        user_row = user_id_to_idx[user_id]
        movie_row = item_id_to_idx[movie_id]
    except KeyError:
        # Unknown user or movie: fall back to a neutral score.
        return 0.0
    else:
        score = user_factors[user_row] @ item_factors[movie_row]
        return float(score)


- Build the Ranking Samples (Loop per User)
- For each user:
    - Find all positive movies (rating ≥ 4) in train
    - For each positive:
        - Add 1 positive row (label = 1)
        - Sample N_NEG negative movies (not rated by user) → N_NEG negative rows (label = 0)

In [17]:
N_NEG = 3  # negatives per positive
rows = []
rng = np.random.default_rng(seed=42)

# Lookup table: user_id -> set of every movie that user rated in train
user_to_rated = train.groupby("user_id")["movie_id"].apply(set).to_dict()

for user_id, user_pos in pos_train.groupby("user_id"):
    rated_movies = user_to_rated[user_id]
    pos_movie_ids = user_pos["movie_id"].unique()

    # Defensive guard: a user with no positives contributes nothing
    if not len(pos_movie_ids):
        continue

    # Negatives are drawn only from movies this user has not rated
    negative_pool = np.array(list(all_movie_ids_set - rated_movies))
    if not len(negative_pool):
        continue

    for movie_id in pos_movie_ids:
        # Draw N_NEG distinct negatives; if the pool is smaller than N_NEG,
        # use the whole pool as-is
        if len(negative_pool) < N_NEG:
            sampled_negs = negative_pool
        else:
            sampled_negs = rng.choice(negative_pool, size=N_NEG, replace=False)

        # One positive row (label 1) followed by its negative rows (label 0)
        rows.append((user_id, movie_id, 1))
        rows.extend((user_id, int(neg_id), 0) for neg_id in sampled_negs)


Turn into a DataFrame:

In [18]:
# Turn the collected (user_id, movie_id, label) tuples into a DataFrame
rank_df = pd.DataFrame(
    data=rows,
    columns=["user_id", "movie_id", "label"],
)
# Preview the first rows together with the total row count
rank_df.head(), len(rank_df)


(   user_id  movie_id  label
 0        1       172      1
 1        1      1194      0
 2        1       397      0
 3        1      1361      0
 4        1       168      1,
 219580)

Add User and Item Features
Merge in the user-level and item-level aggregates computed above:

In [19]:
# Attach the per-user and per-movie aggregates to every sample. Left joins
# keep every sampled row even if a key has no matching stats row.
rank_df = (
    rank_df
    .merge(user_stats, how="left", on="user_id")
    .merge(item_stats, how="left", on="movie_id")
)
rank_df.head()

Unnamed: 0,user_id,movie_id,label,user_total_ratings,user_avg_rating,user_rating_std,movie_popularity,movie_avg_rating
0,1,172,1,271,3.619926,1.255879,366,4.204918
1,1,1194,0,271,3.619926,1.255879,31,4.064516
2,1,397,0,271,3.619926,1.255879,12,2.583333
3,1,1361,0,271,3.619926,1.255879,3,2.666667
4,1,168,1,271,3.619926,1.255879,314,4.066879


Add SVD Score Feature

In [20]:
# Score every (user, movie) pair with the svd_score helper.
# The two ID columns are iterated directly instead of using
# DataFrame.apply(axis=1): row-wise apply builds a Series per row and, since
# the frame mixes int and float columns, passes the IDs to svd_score as floats.
rank_df["svd_score"] = [
    svd_score(user_id, movie_id)
    for user_id, movie_id in zip(rank_df["user_id"], rank_df["movie_id"])
]

rank_df.head()


Unnamed: 0,user_id,movie_id,label,user_total_ratings,user_avg_rating,user_rating_std,movie_popularity,movie_avg_rating,svd_score
0,1,172,1,271,3.619926,1.255879,366,4.204918,4.411927
1,1,1194,0,271,3.619926,1.255879,31,4.064516,1.331503
2,1,397,0,271,3.619926,1.255879,12,2.583333,0.029563
3,1,1361,0,271,3.619926,1.255879,3,2.666667,0.003841
4,1,168,1,271,3.619926,1.255879,314,4.066879,3.956445


At this point, each row of `rank_df` contains:
- user_id
- movie_id
- label (1/0)
- user_total_ratings
- user_avg_rating
- user_rating_std
- movie_popularity
- movie_avg_rating
- svd_score

In [21]:
# Class balance check: fraction of negative (0) and positive (1) rows
rank_df["label"].value_counts(normalize=True)


label
0    0.75
1    0.25
Name: proportion, dtype: float64

In [22]:
# Persist the assembled ranking dataset as Parquet and echo where it was written
output_path = "../data/processed/ranking_dataset.parquet"
rank_df.to_parquet(path=output_path)
output_path


'../data/processed/ranking_dataset.parquet'