# Collaborative Filtering

This notebook builds a matrix factorization-based collaborative filtering model with TensorFlow using the Netflix ratings data stored in the `../data` directory. We will prepare the data, train an embedding model with L2 regularization on the user and item embeddings, and surface sample movie recommendations for an arbitrary user.


In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Single seed shared by NumPy, TensorFlow, the train/validation split and the
# tf.data shuffle further down, so every source of randomness starts the same way.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
# Only let ERROR-level (and above) records through the TensorFlow logger.
tf.get_logger().setLevel("ERROR")

# Record the library version next to the results for provenance.
print(f"TensorFlow version: {tf.__version__}")


TensorFlow version: 2.20.0


In [4]:
# `..` is resolved against the current working directory, so the notebook
# expects to be run from a folder that sits next to `data/`.
DATA_DIR = Path("..").resolve() / "data"
assert DATA_DIR.exists(), f"Data directory not found at {DATA_DIR}"

# rating.csv: one row per (userId, movieId, rating, timestamp) interaction.
# movie.csv: per-movie metadata; only movieId and title are used later on.
ratings = pd.read_csv(DATA_DIR / "rating.csv")
movies = pd.read_csv(DATA_DIR / "movie.csv")

# Quick size check: row count plus the number of distinct users and movies.
print(
    f"Ratings: {ratings.shape[0]:,} rows, {ratings['userId'].nunique():,} users, {ratings['movieId'].nunique():,} movies"
)
print(f"Movies metadata: {movies.shape[0]:,} titles")
# Last expression of the cell: rendered as a table by the notebook.
ratings.head()


Ratings: 20,000,263 rows, 138,493 users, 26,744 movies
Movies metadata: 27,278 titles


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Distribution of the rating values themselves (count, mean, quartiles, ...).
rating_stats = ratings["rating"].describe()
print(rating_stats)

# How many ratings each user contributed; summarised across all users.
ratings_per_user = ratings.groupby("userId").size().describe()
print("\nRatings per user:")
print(ratings_per_user)

# How many ratings each movie received; summarised across all movies.
ratings_per_item = ratings.groupby("movieId").size().describe()
print("\nRatings per movie:")
print(ratings_per_item)


count    2.000026e+07
mean     3.525529e+00
std      1.051989e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

Ratings per user:
count    138493.000000
mean        144.413530
std         230.267257
min          20.000000
25%          35.000000
50%          68.000000
75%         155.000000
max        9254.000000
dtype: float64

Ratings per movie:
count    26744.000000
mean       747.841123
std       3085.818268
min          1.000000
25%          3.000000
50%         18.000000
75%        205.000000
max      67310.000000
dtype: float64


In [6]:
def build_id_mappings(values: pd.Series):
    """Assign a dense, zero-based index to every distinct value of a column.

    The distinct values are sorted first, so the smallest value gets index 0,
    the next one index 1, and so on.

    Args:
        values: Column of raw identifiers (duplicates allowed).

    Returns:
        A ``(value_to_idx, idx_to_value)`` tuple of dictionaries: the first
        maps each raw value to its index, the second is the inverse lookup.
    """
    ordered = np.sort(values.unique())
    value_to_idx = dict(zip(ordered, range(len(ordered))))
    # Invert the forward table rather than re-enumerating, so both
    # dictionaries are guaranteed to describe the same pairs.
    idx_to_value = dict(zip(value_to_idx.values(), value_to_idx.keys()))
    return value_to_idx, idx_to_value


# Raw ids are not contiguous, so translate them to dense 0..N-1 indices that
# can be used directly as embedding-table rows.
user_to_idx, idx_to_user = build_id_mappings(ratings["userId"])
movie_to_idx, idx_to_movie = build_id_mappings(ratings["movieId"])

# Adds two columns to `ratings` in place; re-running the cell simply
# overwrites them with the same values.
ratings["user_idx"] = ratings["userId"].map(user_to_idx).astype(np.int32)
ratings["movie_idx"] = ratings["movieId"].map(movie_to_idx).astype(np.int32)

# Table sizes for the embedding layers: one row per distinct user / movie.
num_users = len(user_to_idx)
num_items = len(movie_to_idx)
print(f"Model will train on {num_users} users and {num_items} items")


Model will train on 138493 users and 26744 items


In [7]:
# Random 90/10 split over individual rating rows (not per user).
# NOTE(review): because the split is row-wise, a user or movie can in principle
# appear only in the validation part - confirm this is acceptable.
train_df, val_df = train_test_split(
    ratings, test_size=0.1, random_state=SEED, shuffle=True
)

# Number of (user, item, rating) examples per training step.
BATCH_SIZE = 4096
# Upper bound on the tf.data shuffle buffer; smaller than the training set, so
# the per-epoch shuffle is windowed (the rows were already shuffled by the split).
SHUFFLE_BUFFER = 1_000_000


def df_to_dataset(
    df: pd.DataFrame,
    training: bool = True,
    batch_size=None,
    shuffle_buffer=None,
) -> tf.data.Dataset:
    """Turn a ratings frame into a batched, prefetched ``tf.data.Dataset``.

    Each element is a ``(features, label)`` pair where ``features`` is a dict
    with int32 ``"user_id"`` / ``"item_id"`` tensors (taken from the
    ``user_idx`` / ``movie_idx`` columns) and ``label`` is the float32 rating.

    Args:
        df: Frame with ``user_idx``, ``movie_idx`` and ``rating`` columns.
        training: When true, examples are shuffled (seeded with ``SEED`` and
            reshuffled every epoch) before batching.
        batch_size: Examples per batch. ``None`` (default) uses the
            module-level ``BATCH_SIZE``.
        shuffle_buffer: Maximum shuffle buffer size. ``None`` (default) uses
            the module-level ``SHUFFLE_BUFFER``.

    Returns:
        The batched dataset with ``tf.data.AUTOTUNE`` prefetching.
    """
    # Resolve defaults at call time so later changes to the notebook constants
    # are picked up without redefining the function.
    if batch_size is None:
        batch_size = BATCH_SIZE
    if shuffle_buffer is None:
        shuffle_buffer = SHUFFLE_BUFFER

    features = {
        "user_id": df["user_idx"].to_numpy(dtype=np.int32),
        "item_id": df["movie_idx"].to_numpy(dtype=np.int32),
    }
    labels = df["rating"].to_numpy(dtype=np.float32)
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if training:
        # The buffer never needs to exceed the number of rows, but tf.data
        # rejects a buffer size of 0, so keep it at >= 1 for an empty frame.
        buffer_size = max(1, min(len(df), shuffle_buffer))
        ds = ds.shuffle(buffer_size, seed=SEED, reshuffle_each_iteration=True)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Training data is shuffled each epoch; validation data keeps a fixed order.
train_ds = df_to_dataset(train_df, training=True)
val_ds = df_to_dataset(val_df, training=False)

# Displayed as the cell output: number of rows in each split.
len(train_df), len(val_df)


(18000236, 2000027)

In [8]:
class MatrixFactorization(tf.keras.Model):
    """Dot-product collaborative filtering model with user and item biases.

    The predicted score for a (user, item) pair is::

        dot(user_embedding[user], item_embedding[item]) + user_bias[user] + item_bias[item]

    The two embedding tables carry an L2 penalty of strength ``reg``; the bias
    tables are not regularized.

    Args:
        num_users: Number of rows in the user tables (valid ids are
            ``0 .. num_users - 1``).
        num_items: Number of rows in the item tables (valid ids are
            ``0 .. num_items - 1``).
        embedding_dim: Size of each user / item embedding vector.
        reg: L2 regularization factor applied to both embedding tables.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(
        self,
        num_users: int,
        num_items: int,
        embedding_dim: int = 64,
        reg: float = 1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce them.
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.reg = reg

        regularizer = tf.keras.regularizers.l2(reg)
        self.user_embedding = tf.keras.layers.Embedding(
            input_dim=num_users,
            output_dim=embedding_dim,
            embeddings_regularizer=regularizer,
            name="user_embedding",
        )
        self.item_embedding = tf.keras.layers.Embedding(
            input_dim=num_items,
            output_dim=embedding_dim,
            embeddings_regularizer=regularizer,
            name="item_embedding",
        )
        # Biases are stored as width-1 embedding tables: one scalar per id.
        self.user_bias = tf.keras.layers.Embedding(num_users, 1, name="user_bias")
        self.item_bias = tf.keras.layers.Embedding(num_items, 1, name="item_bias")

    def call(self, inputs):
        """Score every (user, item) pair in the batch.

        Args:
            inputs: Mapping with integer tensors under ``"user_id"`` and
                ``"item_id"``; both must have the same shape.

        Returns:
            A tensor of predicted scores with the same shape as the id tensors.
        """
        user_ids = inputs["user_id"]
        item_ids = inputs["item_id"]

        user_vec = self.user_embedding(user_ids)
        item_vec = self.item_embedding(item_ids)
        # Reduce over the embedding axis (always the last one). For the usual
        # (batch,) id tensors this is the same as axis=1, but it also stays
        # correct when the ids carry extra leading dimensions.
        dot_product = tf.reduce_sum(user_vec * item_vec, axis=-1)
        # Bias lookups come back with a trailing axis of size 1; drop it.
        user_b = tf.squeeze(self.user_bias(user_ids), axis=-1)
        item_b = tf.squeeze(self.item_bias(item_ids), axis=-1)
        return dot_product + user_b + item_b

    def get_config(self):
        """Return the base Keras config extended with the constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "num_users": self.num_users,
                "num_items": self.num_items,
                "embedding_dim": self.embedding_dim,
                "reg": self.reg,
            }
        )
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild a model from a dict produced by ``get_config``."""
        return cls(**config)


In [9]:
# Build the factorization model sized to the dense user / item index ranges.
model = MatrixFactorization(
    num_users=num_users, num_items=num_items, embedding_dim=64, reg=1e-6
)
# Train on mean squared error; RMSE is tracked because it is in rating units.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")],
)

# Stop once validation RMSE has not improved for 3 epochs and roll the model
# back to the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_rmse", mode="min", patience=3, restore_best_weights=True
)

# Long-running cell: each epoch iterates over the full training dataset.
history = model.fit(
    train_ds, validation_data=val_ds, epochs=20, callbacks=[early_stopping], verbose=2
)


Epoch 1/20
4395/4395 - 215s - 49ms/step - loss: 2.3111 - rmse: 1.4723 - val_loss: 0.9608 - val_rmse: 0.8758
Epoch 2/20
4395/4395 - 234s - 53ms/step - loss: 0.9156 - rmse: 0.8508 - val_loss: 0.8924 - val_rmse: 0.8399
Epoch 3/20
4395/4395 - 227s - 52ms/step - loss: 0.8500 - rmse: 0.8177 - val_loss: 0.8487 - val_rmse: 0.8200
Epoch 4/20
4395/4395 - 250s - 57ms/step - loss: 0.7970 - rmse: 0.7905 - val_loss: 0.8171 - val_rmse: 0.8055
Epoch 5/20
4395/4395 - 250s - 57ms/step - loss: 0.7509 - rmse: 0.7652 - val_loss: 0.7962 - val_rmse: 0.7960
Epoch 6/20
4395/4395 - 244s - 56ms/step - loss: 0.7103 - rmse: 0.7415 - val_loss: 0.7845 - val_rmse: 0.7913
Epoch 7/20
4395/4395 - 249s - 57ms/step - loss: 0.6762 - rmse: 0.7211 - val_loss: 0.7784 - val_rmse: 0.7904
Epoch 8/20
4395/4395 - 243s - 55ms/step - loss: 0.6492 - rmse: 0.7058 - val_loss: 0.7740 - val_rmse: 0.7912
Epoch 9/20
4395/4395 - 243s - 55ms/step - loss: 0.6281 - rmse: 0.6951 - val_loss: 0.7690 - val_rmse: 0.7921
Epoch 10/20
4395/4395 - 246s

In [10]:
# Score the validation set with the weights left in the model after training
# and show each metric rounded to four decimals.
eval_metrics = model.evaluate(val_ds, return_dict=True, verbose=0)
print({k: round(v, 4) for k, v in eval_metrics.items()})

{'loss': 0.7784, 'rmse': 0.7904}


In [11]:
# Raw movieId -> title lookup used to label recommendations.
movie_titles = movies.set_index("movieId")["title"].to_dict()
# Dense user index -> set of dense movie indices the user has rated. Built from
# the full `ratings` frame, i.e. training and validation rows together.
user_consumed = ratings.groupby("user_idx")["movie_idx"].apply(set).to_dict()


def recommend_movies(raw_user_id: int, top_k: int = 10) -> pd.DataFrame:
    """Return the highest-scoring movies a user has not rated yet.

    The model scores every movie for the user, movies already present in
    ``user_consumed`` are excluded, and the best remaining ones are returned
    in descending score order.

    Relies on the notebook-level ``model``, ``user_to_idx``, ``idx_to_movie``,
    ``movie_titles``, ``user_consumed`` and ``num_items``.

    Args:
        raw_user_id: User id as it appears in the ``userId`` column.
        top_k: Maximum number of recommendations. Fewer rows are returned when
            the user has fewer unrated movies; ``top_k <= 0`` yields an empty
            frame.

    Returns:
        DataFrame with ``movieId``, ``title`` and ``score`` columns, best first.

    Raises:
        ValueError: If ``raw_user_id`` is not a known user.
    """
    if raw_user_id not in user_to_idx:
        raise ValueError(f"User {raw_user_id} not found in the dataset")

    columns = ["movieId", "title", "score"]

    user_idx = user_to_idx[raw_user_id]
    all_items = np.arange(num_items, dtype=np.int32)
    # Pair the same user index with every item so one forward pass scores all.
    user_vector = np.full_like(all_items, fill_value=user_idx)

    preds = model(
        {
            "user_id": tf.convert_to_tensor(user_vector, dtype=tf.int32),
            "item_id": tf.convert_to_tensor(all_items, dtype=tf.int32),
        }
    ).numpy()

    # Push already-rated movies to -inf so they can never outrank unseen ones.
    seen_items = user_consumed.get(user_idx, set())
    mask = np.ones_like(preds, dtype=bool)
    mask[list(seen_items)] = False
    filtered_scores = np.where(mask, preds, -np.inf)

    # Clamp k to the number of unseen movies. Without this, top_k == 0 would
    # select every item (`[-0:]` is the whole array), an oversized top_k would
    # make argpartition fail, and a top_k above the unseen count would leak
    # already-rated movies with a -inf score into the result.
    k = min(int(top_k), int(mask.sum()))
    if k <= 0:
        return pd.DataFrame(columns=columns)

    # argpartition finds the k best in linear time; only those k are sorted.
    top_indices = np.argpartition(filtered_scores, -k)[-k:]
    top_indices = top_indices[np.argsort(filtered_scores[top_indices])[::-1]]

    recommendations = []
    for item_idx in top_indices:
        movie_id = idx_to_movie[item_idx]
        recommendations.append(
            {
                "movieId": movie_id,
                "title": movie_titles.get(movie_id, "Unknown"),
                "score": float(filtered_scores[item_idx]),
            }
        )

    return pd.DataFrame(recommendations, columns=columns)


# Demo: take the userId found on the second row of `ratings` (position 1) and
# display that user's top recommendations as the cell output.
sample_user = ratings["userId"].iloc[1]
recommend_movies(sample_user)


Unnamed: 0,movieId,title,score
0,1210,Star Wars: Episode VI - Return of the Jedi (1983),4.234172
1,34405,Serenity (2005),4.157743
2,1197,"Princess Bride, The (1987)",4.155848
3,40815,Harry Potter and the Goblet of Fire (2005),4.154024
4,1527,"Fifth Element, The (1997)",4.140618
5,98809,"Hobbit: An Unexpected Journey, The (2012)",4.127653
6,3578,Gladiator (2000),4.118014
7,480,Jurassic Park (1993),4.112618
8,54259,Stardust (2007),4.110387
9,88125,Harry Potter and the Deathly Hallows: Part 2 (...,4.103004


In [14]:
# Save model weights and configuration (works with already-trained model)
# The imports stay in this cell so it can be run on its own against a live
# kernel that already holds the trained `model`.
import pickle
import json

# Relative path: artifacts land in `saved_model_cf/` under the current
# working directory.
MODEL_DIR = Path("saved_model_cf")
MODEL_DIR.mkdir(exist_ok=True)

# Save model weights
model.save_weights(MODEL_DIR / "model_weights.weights.h5")
print(f"✓ Saved model weights to {MODEL_DIR / 'model_weights.weights.h5'}")

# Save model architecture config
# Read the hyperparameters back from the trained model instead of repeating
# literals, so the saved config can never disagree with the saved weights.
model_config = {
    "num_users": int(model.num_users),
    "num_items": int(model.num_items),
    "embedding_dim": int(model.embedding_dim),
    "reg": float(model.reg),
}
with open(MODEL_DIR / "model_config.json", "w", encoding="utf-8") as f:
    json.dump(model_config, f)
print(f"✓ Saved model config to {MODEL_DIR / 'model_config.json'}")

# Save mappings for inference
# Everything recommend_movies() needs besides the model itself: id <-> index
# tables, the title lookup and each user's already-rated items.
mappings = {
    "user_to_idx": user_to_idx,
    "idx_to_user": idx_to_user,
    "movie_to_idx": movie_to_idx,
    "idx_to_movie": idx_to_movie,
    "movie_titles": movie_titles,
    "user_consumed": user_consumed,
    "num_users": num_users,
    "num_items": num_items,
}

# NOTE: pickle files can execute arbitrary code when loaded - only unpickle
# this file from a location you trust.
with open(MODEL_DIR / "mappings.pkl", "wb") as f:
    pickle.dump(mappings, f)
print(f"✓ Saved mappings to {MODEL_DIR / 'mappings.pkl'}")


✓ Saved model weights to saved_model\model_weights.weights.h5
✓ Saved model config to saved_model\model_config.json
✓ Saved mappings to saved_model\mappings.pkl


In [15]:
# Export embeddings only (for vector similarity search / ANN systems)
# Extract learned embeddings for deployment to vector databases (Pinecone, Weaviate, etc.)
#
# Caveat: the model's score is dot(user, item) + user_bias + item_bias, so a
# plain inner-product search over the embeddings alone leaves the bias terms
# out; they are exported separately below for that reason.

# Each layer holds a single weight matrix, hence the [0].
user_embeddings = model.user_embedding.get_weights()[0]  # Shape: (num_users, embedding_dim)
item_embeddings = model.item_embedding.get_weights()[0]  # Shape: (num_items, embedding_dim)

# Bias tables have one value per row; collapse them to 1-D vectors.
user_biases = model.user_bias.get_weights()[0].reshape(-1)
item_biases = model.item_bias.get_weights()[0].reshape(-1)

# One .npy file per array, written next to the other saved artifacts.
np.save(MODEL_DIR / "user_embeddings.npy", user_embeddings)
np.save(MODEL_DIR / "item_embeddings.npy", item_embeddings)
np.save(MODEL_DIR / "user_biases.npy", user_biases)
np.save(MODEL_DIR / "item_biases.npy", item_biases)

print("✓ Saved embeddings:")
print(f"  - User embeddings: {user_embeddings.shape}")
print(f"  - Item embeddings: {item_embeddings.shape}")
print("  → Use for approximate nearest neighbor search (FAISS, Annoy, etc.)")


✓ Saved embeddings:
  - User embeddings: (138493, 64)
  - Item embeddings: (26744, 64)
  → Use for approximate nearest neighbor search (FAISS, Annoy, etc.)
