In [10]:
# Use this cell when you modify the recsys package and need to reload it
import importlib
import sys

# Collect the already-imported recsys package and its submodules.
# - list(sys.modules) takes a snapshot of the names, so an import happening
#   elsewhere cannot resize the dict while we iterate over it.
# - Matching 'recsys' exactly or the 'recsys.' prefix keeps unrelated packages
#   whose names merely begin with "recsys" (e.g. "recsys_tools") out of the list.
modules_to_reload = [
    module_name
    for module_name in list(sys.modules)
    if module_name == 'recsys' or module_name.startswith('recsys.')
]

if modules_to_reload:
    # Reverse alphabetical order reloads submodules before the packages
    # that contain them (a package name is a prefix of its submodules' names).
    for module_name in sorted(modules_to_reload, reverse=True):
        importlib.reload(sys.modules[module_name])
    print(f"Reloaded modules: {modules_to_reload}")
else:
    print("No recsys modules found to reload")

Reloaded modules: ['recsys', 'recsys.recommendation.MFRecommender', 'recsys.recommendation.recommend', 'recsys.recommendation']


## Matrix Factorization Recommendation Engine
This notebook documents the implementation of the Matrix Factorization method for solving the item recommendation problem.

## Dataset that we are using
The dataset that we will be using for this will be obtained from MovieLens (https://grouplens.org/datasets/movielens/).

This dataset holds approximately 33,000,000 ratings and 2,000,000 tag applications applied to 86,000 movies by 330,975 users.

## Initial downloading of the dataset

In [11]:
import sys
from pathlib import Path
import numpy as np

In [12]:

# Resolve the directory one level above the current working directory and
# make its src/ folder importable (added to sys.path only once).
project_root = Path("..").resolve()
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [13]:
from recsys.utils import unzip_file

# Source archive and extraction target, both located under <project_root>/data.
zip_path = project_root.joinpath("data", "raw", "ml-latest.zip")
destination = project_root.joinpath("data", "processed")

# overwrite=True is passed, presumably so an existing extraction is replaced
# (confirm against recsys.utils.unzip_file). The call is left as the last
# expression so its return value is displayed as the cell output.
unzip_file(zip_path, destination, overwrite=True)

PosixPath('/Users/tonyli/Documents/Projects/craftyverse/craftyverse-recsys/data/processed')

In [14]:

from recsys.utils import read_csv

# Load the extracted ratings table and preview its first rows.
csv_path = project_root.joinpath("data", "processed", "ml-latest", "ratings.csv")
ratings_df = read_csv(csv_path)

print(ratings_df.head())

   userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1       1      110     4.0  1225865086
2       1      158     4.0  1225733503
3       1      260     4.5  1225735204
4       1      356     5.0  1225735119


## Encoding the data
The first step is to give users and movies contiguous ids, i.e. to re-index every user and every movie with consecutive integers (0, 1, 2, ...) so that they can be used directly as row and column indices. Before doing that, we use the `timestamp` column to split the ratings chronologically.

We'll split the ratings dataset into three subsets:
  - Training Dataset (75% of the data)
  - Testing Dataset (15% of the data)
  - Validation Dataset (10% of the data)

In [15]:
# Chronological split driven by timestamp quantiles:
# oldest 75% -> training, next 15% -> testing, most recent 10% -> validation
timestamps = ratings_df.timestamp
train_threshold = np.quantile(timestamps, 0.75)  # 75th percentile
test_threshold = np.quantile(timestamps, 0.90)   # 90th percentile (75% + 15%)

print(f"Training threshold (75th percentile): {train_threshold}")
print(f"Testing threshold (90th percentile): {test_threshold}")
print("Validation will be the remaining 10% (90th-100th percentile)")

# One boolean mask per subset; the thresholds themselves belong to the earlier subset
is_train = timestamps <= train_threshold
is_test = (timestamps > train_threshold) & (timestamps <= test_threshold)
is_val = timestamps > test_threshold

# .copy() gives each subset its own data so later column assignments
# do not operate on a view of ratings_df
train_df = ratings_df[is_train].copy()
test_df = ratings_df[is_test].copy()
val_df = ratings_df[is_val].copy()

print(val_df.head())

# Print split statistics
total_ratings = len(ratings_df)
print("\nDataset split statistics:")
print(f"Total ratings: {total_ratings:,}")
for label, subset in (("Training", train_df), ("Testing", test_df), ("Validation", val_df)):
    print(f"{label} set: {len(subset):,} ({len(subset)/total_ratings*100:.1f}%)")

Training threshold (75th percentile): 1496919379.25
Testing threshold (90th percentile): 1598691985.6000001
Validation will be the remaining 10% (90th-100th percentile)
      userId  movieId  rating   timestamp
1538      22       16     3.5  1685231200
1545      22      165     3.5  1685231212
1547      22      260     4.0  1685231007
1548      22      288     4.5  1685230899
1554      22     1036     4.0  1685230896

Dataset split statistics:
Total ratings: 33,832,162
Training set: 25,374,121 (75.0%)
Testing set: 5,074,824 (15.0%)
Validation set: 3,383,217 (10.0%)
      userId  movieId  rating   timestamp
1538      22       16     3.5  1685231200
1545      22      165     3.5  1685231212
1547      22      260     4.0  1685231007
1548      22      288     4.5  1685230899
1554      22     1036     4.0  1685230896

Dataset split statistics:
Total ratings: 33,832,162
Training set: 25,374,121 (75.0%)
Testing set: 5,074,824 (15.0%)
Validation set: 3,383,217 (10.0%)


## Dataset Cleaning
This section cleans the partitioned datasets created in the previous step to prepare them for matrix factorization.

### Cleaning Steps Overview:

1. **Create Contiguous ID Mappings**
   - Map original user IDs → contiguous indices (0, 1, 2, ...)
   - Map original movie IDs → contiguous indices (0, 1, 2, ...)
   - This ensures memory efficiency and algorithm compatibility

2. **Handle Cold Start Problems**
   - Filter out users/movies that appear only in test/validation sets
   - Ensure all test/validation users and movies exist in training data
   - Remove ratings for unknown users/movies from test/validation sets

3. **Create Rating Matrices**
   - Build sparse user-item rating matrix for training
   - Convert test/validation data to use the same index mappings
   - Handle missing ratings (typically filled with 0 or mean ratings)

4. **Data Quality Checks**
   - Verify no data leakage between train/test/validation
   - Check matrix dimensions and sparsity
   - Validate that all indices are within expected ranges

First we want to compute the unique userIds in the training set

In [16]:
# Compute number of unique users in the training dataset
# - The raw ids are read from ratings_df (selected by train_df's row labels,
#   which the split preserved) because a later cell overwrites train_df.userId
#   with matrix indices; reading train_df here would give wrong ids on a re-run.
#   NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
# - np.unique already returns the unique values sorted in ascending order,
#   so no additional np.sort pass is needed.
training_user_ids = np.unique(ratings_df.loc[train_df.index, 'userId'].values)
n_training_users = len(training_user_ids)
print(training_user_ids[:15])
print(f"Number of unique users in training set: {n_training_users}")



[ 1  2  4  5  6  7  8  9 10 11 12 14 15 16 17]
Number of unique users in training set: 268300


### Why do we need userId_to_index mapping?

In matrix factorization, we need to create matrices where:
- **Rows represent users**
- **Columns represent movies** 
- **Values are ratings**

**The Problem**: User IDs in MovieLens are not contiguous integers starting from 0. For example, you might have user IDs like [1, 5, 7, 15, 23, ...] instead of [0, 1, 2, 3, 4, ...].

**Why this matters for Matrix Factorization**:
1. **Memory efficiency**: If your largest user ID is 330,975 but you only have 100,000 users, creating a matrix with 330,975 rows would waste ~70% of memory on empty rows
2. **Algorithm requirements**: Most matrix factorization libraries (like scikit-learn's NMF, or custom implementations) expect matrix indices to be contiguous starting from 0
3. **Performance**: Sparse matrices and mathematical operations are more efficient with contiguous indices

**The Solution**: Create a mapping from original user IDs to contiguous indices (0, 1, 2, ..., n_users-1)

In [17]:
# Pair every original user id with its position in training_user_ids,
# producing the contiguous matrix row indices 0 .. n-1
user_count = len(training_user_ids)
userId_to_index = dict(zip(training_user_ids, range(user_count)))

lowest_id = training_user_ids.min()
highest_id = training_user_ids.max()

print(f"\nOriginal user ID range: {lowest_id} to {highest_id}")
print(f"Matrix index range:     0 to {user_count-1}")
# A matrix addressed by raw ids would need highest_id + 1 rows
print(f"Memory saved: {((highest_id + 1) - user_count):,} empty rows avoided")


Original user ID range: 1 to 330975
Matrix index range:     0 to 268299
Memory saved: 62,676 empty rows avoided


In [18]:
# Similarly, we need to create a mapping for movie IDs
# - Raw ids are read from ratings_df (selected by train_df's preserved row
#   labels) because a later cell overwrites train_df.movieId with matrix
#   indices; this keeps the cell correct when it is re-run.
#   NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
# - np.unique already returns sorted values, so np.sort is not needed.
training_movie_ids = np.unique(ratings_df.loc[train_df.index, 'movieId'].values)
movieId_to_index = {movie_id: index for index, movie_id in enumerate(training_movie_ids)}

# Calculate memory savings
# A matrix addressed by raw ids needs (max id + 1) rows and columns.
# int(...) converts the NumPy scalars to Python ints so the product cannot
# overflow a fixed-width integer type.
original_size = (int(training_user_ids.max()) + 1) * (int(training_movie_ids.max()) + 1)
optimized_size = len(training_user_ids) * len(training_movie_ids)
savings_percentage = (1 - optimized_size/original_size) * 100

print("\nMemory efficiency:")
print(f"Original matrix size: {original_size:,} elements")
print(f"Optimized matrix size: {optimized_size:,} elements")
print(f"Memory saved: {savings_percentage:.1f}%")


Memory efficiency:
Original matrix size: 57,548,782,976 elements
Optimized matrix size: 11,064,423,700 elements
Memory saved: 80.8%


### Step 1: Apply ID Mappings to Training Data

Now that we have our mappings created, we need to **transform the actual data** to use the new contiguous indices instead of the original IDs.

**What this code does:**
- Replaces each original `userId` in the training dataframe with its corresponding matrix index
- For example: original userId `123` becomes matrix index `5` (if it's the 6th unique user)
- This creates a "clean" dataset where all user IDs are contiguous integers starting from 0

**Why this step is crucial:**
- The training data now has user IDs that can directly index into our rating matrix
- No more gaps or missing indices - every user ID from 0 to n_users-1 exists
- This transformed data is what we'll use to build our user-item rating matrix for matrix factorization

#### For UserId

In [19]:
# Replace the original user ids in the training set with contiguous matrix indices.
# The raw ids are looked up in ratings_df (aligned on train_df's preserved row
# labels) instead of in train_df itself, so re-running this cell does not try
# to map indices that were already mapped.
# NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
# Series.map with a dict performs the lookup in bulk instead of calling a
# Python lambda per row; astype('int64') raises if any id had no mapping,
# keeping the cell fail-loud for unknown ids.
train_df['userId'] = (
    ratings_df.loc[train_df.index, 'userId']
    .map(userId_to_index)
    .astype('int64')
)
print(train_df.head())

   userId  movieId  rating   timestamp
0       0        1     4.0  1225734739
1       0      110     4.0  1225865086
2       0      158     4.0  1225733503
3       0      260     4.5  1225735204
4       0      356     5.0  1225735119


In [21]:
# Now we need to transform the test set
# Raw user ids are looked up in ratings_df (aligned on test_df's preserved row
# labels) rather than in test_df itself: mapping the already-transformed
# column again on a re-run would silently produce wrong indices.
# NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
test_user_index = ratings_df.loc[test_df.index, 'userId'].map(userId_to_index)
test_user_known = test_user_index.notna()  # ids missing from the training mapping become NaN

test_df = test_df[test_user_known].copy()  # Remove unknown users
test_df['userId'] = test_user_index[test_user_known].astype('int64')
print("Transformed test set (with unknown users removed):")
print(test_df.head())

Transformed test set (with unknown users removed):
      userId  movieId  rating   timestamp
8138      51     3822     3.5  1514820246
8142      51     4979     4.0  1514916004
8143      51     6043     4.0  1514820568
8144      51     6711     3.5  1514821550
8147      51     7068     5.0  1527964516


In [22]:
# Transform the validation set
# Raw user ids are looked up in ratings_df (aligned on val_df's preserved row
# labels) rather than in val_df itself: mapping the already-transformed
# column again on a re-run would silently produce wrong indices.
# NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
val_user_index = ratings_df.loc[val_df.index, 'userId'].map(userId_to_index)
val_user_known = val_user_index.notna()  # ids missing from the training mapping become NaN

val_df = val_df[val_user_known].copy()  # Remove unknown users
val_df['userId'] = val_user_index[val_user_known].astype('int64')
print("Transformed validation set (with unknown users deleted):")
print(val_df.head())

Transformed validation set (with unknown users deleted):
       userId  movieId  rating   timestamp
11832     109     1394     4.0  1668055512
11879     109     3362     4.0  1609548629
11901     109     4370     4.5  1598921146
11924     109     5772     5.0  1615761089
11938     109     6787     4.0  1599104418


#### For MovieID

In [23]:
# Replace the original movie ids in the training set with contiguous matrix indices.
# The raw ids are looked up in ratings_df (aligned on train_df's preserved row
# labels) instead of in train_df itself, so re-running this cell does not try
# to map indices that were already mapped.
# NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
# astype('int64') raises if any id had no mapping, keeping the cell fail-loud.
train_df['movieId'] = (
    ratings_df.loc[train_df.index, 'movieId']
    .map(movieId_to_index)
    .astype('int64')
)
print(train_df.head())

   userId  movieId  rating   timestamp
0       0        0     4.0  1225734739
1       0      108     4.0  1225865086
2       0      156     4.0  1225733503
3       0      257     4.5  1225735204
4       0      351     5.0  1225735119


In [24]:
# Map the test set's movie ids to matrix indices and drop movies unseen in training.
# Raw movie ids are looked up in ratings_df (aligned on test_df's preserved row
# labels) rather than in test_df itself: mapping the already-transformed
# column again on a re-run would silently produce wrong indices.
# NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
test_movie_index = ratings_df.loc[test_df.index, 'movieId'].map(movieId_to_index)
test_movie_known = test_movie_index.notna()  # ids missing from the training mapping become NaN

test_df = test_df[test_movie_known].copy()  # Remove unknown movies
test_df['movieId'] = test_movie_index[test_movie_known].astype('int64')
print("Transformed test set (with unknown movies deleted):")
print(test_df.head())

Transformed test set (with unknown movies deleted):
      userId  movieId  rating   timestamp
8138      51     3721     3.5  1514820246
8142      51     4874     4.0  1514916004
8143      51     5932     4.0  1514820568
8144      51     6589     3.5  1514821550
8147      51     6944     5.0  1527964516


In [25]:
# Map the validation set's movie ids to matrix indices and drop movies unseen in training.
# Raw movie ids are looked up in ratings_df (aligned on val_df's preserved row
# labels) rather than in val_df itself: mapping the already-transformed
# column again on a re-run would silently produce wrong indices.
# NOTE(review): assumes ratings_df has unique row labels (default RangeIndex).
val_movie_index = ratings_df.loc[val_df.index, 'movieId'].map(movieId_to_index)
val_movie_known = val_movie_index.notna()  # ids missing from the training mapping become NaN

val_df = val_df[val_movie_known].copy()  # Remove unknown movies
val_df['movieId'] = val_movie_index[val_movie_known].astype('int64')
print("Transformed validation set (with unknown movies deleted):")
print(val_df.head())

Transformed validation set (with unknown movies deleted):
       userId  movieId  rating   timestamp
11832     109     1358     4.0  1668055512
11879     109     3269     4.0  1609548629
11901     109     4266     4.5  1598921146
11924     109     5661     5.0  1615761089
11938     109     6665     4.0  1599104418


## Embedding the Data

Now that we have clean, indexed data, we need to create the **user-item rating matrix** that will be decomposed through matrix factorization.

### What are Embeddings in Matrix Factorization?

**Matrix Factorization** decomposes our large, sparse user-item rating matrix into two smaller dense matrices:
- **user_embeddings** (User latent factors): Each row represents a user as a vector of preferences
- **movie_embeddings** (Movie latent factors): Each row represents a movie as a vector of characteristics

**Mathematical Representation:**
```
rating_matrix ≈ user_embeddings × movie_embeddings^T
```
Where:
- **rating_matrix** is our `num_users × num_movies` rating matrix (mostly zeros/missing values)
- **user_embeddings** is `num_users × num_latent_factors` matrix (typically 50-200 factors)
- **movie_embeddings** is `num_movies × num_latent_factors` matrix
- **num_latent_factors** is much smaller than both num_users and num_movies

### Why Embeddings Work

The key insight is that user preferences and movie characteristics can be represented in a **lower-dimensional latent space**. For example:
- **Latent Factor 1**: "Action vs Romance preference"
- **Latent Factor 2**: "Mainstream vs Indie preference"  
- **Latent Factor 3**: "New vs Classic movie preference"

Each user and movie gets a vector in this latent space, and their **dot product** predicts the rating.

### Next Steps
1. Create the sparse user-item rating matrix from our cleaned data
2. Initialize user_embeddings and movie_embeddings matrices
3. Train the model to minimize prediction error

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

### Understanding PyTorch Embeddings with a Simple Example

Before we build our full matrix factorization model, let's understand how **PyTorch embeddings** work with a simple example.

**What `nn.Embedding` does:**
- Creates a lookup table that maps integer indices to dense vectors
- Each row in the embedding matrix represents one entity (user or movie)
- The vectors are **learnable parameters** that get optimized during training

**In our example below:**
- `n_users = 5` means we have 5 users (with indices 0, 1, 2, 3, 4)
- `embed_size = 5` means each user is represented by a 5-dimensional vector
- `embed.weight` is a 5×5 matrix where each row is a user's embedding vector

**How it works:**
```python
# If we want the embedding for user 0:
user_0_embedding = embed(torch.tensor([0]))  # Returns the first row
user_3_embedding = embed(torch.tensor([3]))  # Returns the fourth row
```

**Why this is powerful:**
- Initially, the embeddings are **random numbers** (as you'll see below)
- During training, these numbers get adjusted to capture user preferences
- Similar users will end up with similar embedding vectors
- The model learns meaningful representations automatically!

In [27]:
# Create a simple embedding example
embed_size = 5  # Each user/movie will be represented by 5 numbers
n_users = 5     # We have 5 users (indices 0, 1, 2, 3, 4)

# Lookup table with one learnable row of embed_size values per user
user_embeddings = nn.Embedding(num_embeddings=n_users, embedding_dim=embed_size)

print("Initial random embedding matrix:")
print("Shape:", user_embeddings.weight.shape)
print("Values:")
print(user_embeddings.weight.data)

print("\n" + "=" * 50)

# Example: Get embeddings for specific users
print("Getting embeddings for individual users:")

# Look up users 0 and 3; each lookup returns the matching row as a 1 x embed_size tensor
user_0_vector, user_3_vector = (
    user_embeddings(torch.tensor([user_index])) for user_index in (0, 3)
)
print(f"User 0 embedding: {user_0_vector.data}")
print(f"User 3 embedding: {user_3_vector.data}")

# A single call can look up several users at once
batch_indices = torch.tensor([0, 2, 4])
multiple_users = user_embeddings(batch_indices)
print("\nEmbeddings for users [0, 2, 4]:")
print(multiple_users.data)

Initial random embedding matrix:
Shape: torch.Size([5, 5])
Values:
tensor([[ 0.7646,  1.0155,  0.6477,  1.4862, -0.9355],
        [-0.1059, -1.0354, -0.0275,  0.9706,  0.5976],
        [ 0.4763,  0.1567,  0.9565,  1.0822, -0.1728],
        [-0.9543, -0.5378, -0.7715,  0.5377, -0.6616],
        [-0.8066,  1.0206, -0.6346, -0.5660, -0.6606]])

Getting embeddings for individual users:
User 0 embedding: tensor([[ 0.7646,  1.0155,  0.6477,  1.4862, -0.9355]])
User 3 embedding: tensor([[-0.9543, -0.5378, -0.7715,  0.5377, -0.6616]])

Embeddings for users [0, 2, 4]:
tensor([[ 0.7646,  1.0155,  0.6477,  1.4862, -0.9355],
        [ 0.4763,  0.1567,  0.9565,  1.0822, -0.1728],
        [-0.8066,  1.0206, -0.6346, -0.5660, -0.6606]])


### Building the Complete Matrix Factorization Model

Now let's build our **complete recommendation system** using PyTorch. The `MFRecommender` class implements the core matrix factorization algorithm.

**Class Architecture:**
```
MFRecommender
├── user_embedding: Maps user IDs → user preference vectors
├── item_embedding: Maps movie IDs → movie characteristic vectors  
└── forward(): Computes predicted ratings via dot products
```

**How it works step-by-step:**

1. **Initialization (`__init__`)**:
   - Creates two embedding lookup tables (user and movie)
   - Each embedding has `embed_size` dimensions (typically 50-200)
   - Embeddings start with random weights that get learned during training

2. **Forward Pass (`forward`)**:
   - Takes user IDs and movie IDs as input
   - Looks up their corresponding embedding vectors
   - Computes **dot product** between user and movie vectors
   - Returns predicted ratings

**Mathematical Formula:**
```
predicted_rating = user_vector · movie_vector
                 = Σ(user_embedding[i] × movie_embedding[i])
```

**Key Insight**: The dot product measures **similarity** between user preferences and movie characteristics in the latent space. High dot product = high predicted rating!

In [28]:
# Use the packaged implementation for the matrix factorization model
from recsys.recommendation import MFRecommender

In [30]:
# Example: Create and test our recommendation model
print("Creating MFRecommender with our actual dataset dimensions:")

# Use the dimensions from our cleaned data
num_users = len(training_user_ids)
num_movies = len(training_movie_ids)
embedding_dim = 50  # Start with 50 latent factors

print(f"Number of users: {num_users:,}")
print(f"Number of movies: {num_movies:,}")
print(f"Embedding dimensions: {embedding_dim}")

# Create the model
model = MFRecommender(num_users, num_movies, embedding_dim)

print("\nModel created successfully!")
print(f"User embedding shape: {model.user_embedding.weight.shape}")
print(f"Movie embedding shape: {model.item_embedding.weight.shape}")

# Test prediction for a few user-movie pairs
test_users = torch.tensor([0, 1, 2])      # First 3 users
test_movies = torch.tensor([0, 5, 10])    # Some movies

# The predictions are only displayed, never back-propagated through, so
# gradient tracking is switched off for this forward pass.
with torch.no_grad():
    predicted_ratings = model(test_users, test_movies)

print("\nExample predictions:")
for user, movie, rating in zip(test_users, test_movies, predicted_ratings):
    print(f"User {user.item()}, Movie {movie.item()} → Predicted rating: {rating.item():.3f}")

print("\nNote: These are random predictions since the model hasn't been trained yet!")

Creating MFRecommender with our actual dataset dimensions:
Number of users: 268,300
Number of movies: 41,239
Embedding dimensions: 50

Model created successfully!
User embedding shape: torch.Size([268300, 50])
Movie embedding shape: torch.Size([41239, 50])

Example predictions:
User 0, Movie 0 → Predicted rating: 0.038
User 1, Movie 5 → Predicted rating: 0.017
User 2, Movie 10 → Predicted rating: -0.005

Note: These are random predictions since the model hasn't been trained yet!
