In [1]:
import numpy as np
import pandas as pd

from recommender.data import load_processed_ratings
from recommender.svd_model import (
    user_id_to_idx,
    item_id_to_idx,
    user_factors,
    item_factors,
    get_train_test,
)


In [2]:
# Load the processed ratings table and preview the first rows.
# NOTE(review): `df` is not referenced again in the cells visible here;
# the rest of the notebook works from `train` / `test`.
df = load_processed_ratings()
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,release_date,video_release_date,IMDB_URL,unknown,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,302,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,377,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,51,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,346,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Fetch the train/test split exposed by recommender.svd_model.
train, test = get_train_test()

# Report split sizes, then the number of distinct users and movies in train.
print(f"{len(train)} {len(test)}")
print(f"Users: {train['user_id'].nunique()}")
print(f"Movies: {train['movie_id'].nunique()}")

99057 943
Users: 943
Movies: 1680


- Define Positives and Negatives
- Positive = rating ≥ 4
- Negative = a movie the user never rated in train (we sample `N_NEG` of these per positive)

In [None]:
# Positives are the training interactions rated 4 or higher.
# .copy() detaches the result so later edits do not touch `train`.
pos_train = train.loc[train["rating"].ge(4)].copy()
pos_train.head()


We’ll need a list of all movies:

In [4]:
# Distinct movie IDs that appear in the training split.
all_movie_ids = train["movie_id"].unique()
# Set form of the same IDs; the sampling loop subtracts each user's rated
# movies from this set to obtain that user's negative candidate pool.
all_movie_ids_set = set(all_movie_ids)

- Compute Basic User & Item Features
- These will power both LightGBM and (later) a neural model.

User-level aggregates

In [5]:
# One row per user: rating count, mean rating and rating standard deviation,
# all computed on the training split only.
user_stats = (
    train.groupby("user_id")
    .agg(
        user_total_ratings=("rating", "count"),
        user_avg_rating=("rating", "mean"),
        user_rating_std=("rating", "std"),
    )
    .reset_index()
)

user_stats.head()


Unnamed: 0,user_id,user_total_ratings,user_avg_rating,user_rating_std
0,1,271,3.619926,1.255879
1,2,61,3.721311,1.03491
2,3,53,2.811321,1.225633
3,4,23,4.347826,0.934622
4,5,174,2.87931,1.36527


Item-level aggregates

In [None]:
# One row per movie: how many training ratings it received (popularity)
# and its mean training rating.
item_stats = (
    train.groupby("movie_id")
    .agg(
        movie_popularity=("rating", "count"),
        movie_avg_rating=("rating", "mean"),
    )
    .reset_index()
)

item_stats.head()


Helper: SVD Score Feature

In [7]:
def svd_score(user_id: int, movie_id: int) -> float:
    """
    Matrix-factorization (SVD) affinity for one (user, movie) pair.

    Both IDs are translated to factor-matrix row indices through
    ``user_id_to_idx`` / ``item_id_to_idx``; the score is the dot product of
    the corresponding user and item factor rows.

    Returns 0.0 when either ID is missing from its index mapping.
    """
    try:
        user_row = user_id_to_idx[user_id]
        item_row = item_id_to_idx[movie_id]
    except KeyError:
        # Unknown user or movie: fall back to a neutral score.
        score = 0.0
    else:
        score = float(user_factors[user_row] @ item_factors[item_row])
    return score


- Build the Ranking Samples (Loop per User)
- For each user:
    - Find all positive movies (rating ≥ 4) in train
    - For each positive:
        - Add 1 positive row (label = 1)
        - Sample N_NEG negative movies (not rated by user) → N_NEG negative rows (label = 0)

In [None]:
N_NEG = 3  # negatives per positive
rows = []  # (user_id, movie_id, label) tuples; rebuilt from scratch on every run
rng = np.random.default_rng(seed=42)

# Precompute user -> set of movies they rated (any rating, not only positives),
# so a "negative" is always a movie the user has not interacted with in train.
user_to_rated = (
    train.groupby("user_id")["movie_id"]
    .apply(set)
    .to_dict()
)

# groupby only yields non-empty groups, so every user visited here has at
# least one positive; no separate "no positives" guard is needed.
for user_id, user_pos in pos_train.groupby("user_id"):
    rated_movies = user_to_rated[user_id]
    pos_movie_ids = user_pos["movie_id"].unique()

    # Candidate pool for negatives = all movies not rated by this user.
    # Sorted so that, with the fixed seed, the sampled negatives do not depend
    # on set iteration order.
    negative_pool = np.array(sorted(all_movie_ids_set - rated_movies))
    if negative_pool.size == 0:
        continue

    # Loop-invariant: draw N_NEG negatives, or the whole pool if it is smaller.
    n_draw = min(N_NEG, negative_pool.size)

    for movie_id in pos_movie_ids:
        # Positive row (cast to a plain int, matching the negative rows below)
        rows.append((user_id, int(movie_id), 1))

        # Each positive gets its own independent draw without replacement, so
        # the same negative movie may recur across positives of one user.
        sampled_negs = rng.choice(negative_pool, size=n_draw, replace=False)
        rows.extend((user_id, int(neg_id), 0) for neg_id in sampled_negs)


Turn into a DataFrame:

In [None]:
# Materialise the sampled (user, movie, label) tuples in one shot.
rank_df = pd.DataFrame(rows, columns=["user_id", "movie_id", "label"])

# Print the size separately and leave the frame as the last expression so it
# renders with the rich DataFrame display instead of a plain-text tuple repr.
print(f"{len(rank_df):,} rows")
rank_df.head()


Add User and Item Features
Merge in the user-level and item-level aggregates computed earlier:

In [None]:
# Feature columns contributed by the two stats tables (everything but the keys).
merged_feature_cols = [
    col
    for col in (*user_stats.columns, *item_stats.columns)
    if col not in ("user_id", "movie_id")
]

# Drop any copies left by a previous run of this cell first; otherwise a re-run
# would produce `_x` / `_y` suffixed duplicates. `validate` makes the merge fail
# loudly if a stats table ever has duplicate keys (which would multiply rows).
rank_df = (
    rank_df.drop(columns=merged_feature_cols, errors="ignore")
    .merge(user_stats, on="user_id", how="left", validate="many_to_one")
    .merge(item_stats, on="movie_id", how="left", validate="many_to_one")
)
rank_df.head()

Add SVD Score Feature

In [8]:
# Score every (user, movie) pair with the SVD model.
# Iterating the two ID columns directly avoids `DataFrame.apply(axis=1)`, which
# builds a Series per row and upcasts the integer IDs to float (the frame also
# holds float feature columns) before they reach the ID -> index lookup.
rank_df["svd_score"] = [
    svd_score(uid, mid)
    for uid, mid in zip(rank_df["user_id"], rank_df["movie_id"])
]

rank_df.head()


NameError: name 'rank_df' is not defined

At this point, `rank_df` has the following columns:
- user_id
- movie_id
- label (1/0)
- user_total_ratings
- user_avg_rating
- user_rating_std
- movie_popularity
- movie_avg_rating
- svd_score

In [None]:
# Class balance of the ranking dataset: share of positive (1) vs. negative (0) rows.
rank_df["label"].value_counts(normalize=True)


In [None]:
# Persist the ranking dataset as parquet.
# The path is relative, so it resolves against the notebook's working directory.
output_path = "../data/processed/ranking_dataset.parquet"
rank_df.to_parquet(output_path)
# Echo the destination as the cell output.
output_path
