# Collaborative filtering project

In this project, the task is to create a paper recommendation system. The system consists of 10,000 scientists and 1,000 papers. Scientists give ratings from 1 to 5 to the papers that they read. Since not all scientists have read every paper, we only have a limited number of observations of these ratings. Additionally, each scientist has a wishlist of papers that they would like to read in the future. Your task is to fill in the missing observations using the provided rating and wishlist data, such that we can recommend papers to scientists that we expect them to rate highly.

More specifically, there are three data sources:
 - `train_tbr.csv` containing wishlist data.
 - `train_ratings.csv` containing observed rating data.
 - `sample_submission.csv` containing (scientist, paper) pairs that have to be rated for the evaluation of your method.

The data is available at `/cluster/courses/cil/collaborative_filtering/data` and an environment has been prepared for you at `/cluster/courses/cil/envs/collaborative_filtering`. You can activate the environment in your shell by running:
```bash
conda activate /cluster/courses/cil/envs/collaborative_filtering
```
If you wish to use notebooks on the cluster, you need to set the Environment path to `/cluster/courses/cil/envs/collaborative_filtering/bin` and load the `cuda/12.6` module.

**Evaluation**: Your models are evaluated using the root mean-squared error (RMSE) metric. Your grade is determined by a linear interpolation between the easy (grade 4) and hard (grade 6) baselines.

**Rules**: You are only allowed to use the data provided in `train_tbr.csv` and `train_ratings.csv` to make your predictions of `sample_submission.csv`. You are not allowed to use external data sources. However, you are allowed to use pre-trained models, as long as they are publicly available. Furthermore, no external API calls are allowed, except for downloading the weights of pre-trained models.

**We will check your code for plagiarism and for the use of solutions from previous years.**

[Link to Kaggle competition](https://www.kaggle.com/competitions/ethz-cil-collaborative-filtering-2025)


# Neural Collaborative Filtering

In [1]:
from typing import Tuple, Callable

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import os

Make sure that results are reproducible by using a seed.

In [2]:
SEED = 42

# Seed PyTorch first and NumPy second so both libraries start from the same
# fixed state on every run.
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(SEED)

## Helper functions

In [3]:
DATA_DIR = "/cluster/courses/cil/collaborative_filtering/data"


def read_data_df(test_size: float = 0.01, random_state=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the observed ratings and split them into training and validation sets.

    Inputs:
        test_size: Share of the rows held out for validation, forwarded to
            ``train_test_split``. The default of 0.01 gives a 99/1 split.
        random_state: Seed forwarded to ``train_test_split``. With the default
            ``None`` the split is drawn from NumPy's global random state, so
            it is only reproducible if that state has been seeded beforehand.

    Outputs: (train_df, valid_df), each with integer "sid" and "pid" columns
        in place of the original "sid_pid" column.
    """

    df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # "sid_pid" encodes both ids as "<sid>_<pid>"; split it into two int columns
    df[["sid", "pid"]] = df["sid_pid"].str.split("_", expand=True)
    df = df.drop("sid_pid", axis=1)
    df["sid"] = df["sid"].astype(int)
    df["pid"] = df["pid"].astype(int)

    # Split into train and validation dataset
    train_df, valid_df = train_test_split(df, test_size=test_size, random_state=random_state)
    return train_df, valid_df


def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """Compute the validation RMSE of a prediction function.

    Inputs:
        valid_df: Validation data with "sid", "pid" and "rating" columns,
            e.g. the second frame returned by read_data_df.
        pred_fn: Function mapping arrays of sid and pid to predicted ratings.

    Outputs: Root mean-squared error between the true and predicted ratings.
    """

    scientist_ids = valid_df["sid"].values
    paper_ids = valid_df["pid"].values
    predictions = pred_fn(scientist_ids, paper_ids)

    targets = valid_df["rating"].values
    return root_mean_squared_error(targets, predictions)


def make_submission(pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray], filename: os.PathLike):
    """Write a Kaggle submission CSV for the pairs in sample_submission.csv.

    Inputs:
        pred_fn: Function mapping arrays of sid and pid to predicted scores.
        filename: Destination path of the submission file.
    """

    submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

    # Column 0 of the split holds the scientist id, column 1 the paper id
    id_parts = submission["sid_pid"].str.split("_", expand=True)
    sids, pids = (id_parts[col].astype(int).values for col in (0, 1))

    submission["rating"] = pred_fn(sids, pids)
    submission.to_csv(filename, index=False)

Try to use a GPU

In [4]:
# Prefer CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using: {device}")

Using: cuda


In [5]:
class NeuralCollaborativeFilteringModel(nn.Module):
    """Two-branch rating predictor over (scientist, paper) id pairs.

    One branch multiplies a scientist embedding and a paper embedding
    element-wise (GMF-style; the attributes spell it ``gmd``). The other
    branch concatenates a second pair of embeddings and feeds them through an
    MLP. Both branch outputs are concatenated and mapped to a single score by
    a linear layer followed by a ReLU.

    Inputs:
        num_scientists: Number of rows in the scientist embedding tables.
        num_papers: Number of rows in the paper embedding tables.
        dim: Embedding dimension used by both branches.
        hidden_dims: Widths of the MLP hidden layers. May be empty, in which
            case the concatenated MLP embeddings are passed on unchanged.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, num_scientists: int, num_papers: int, dim: int, hidden_dims=(32, 16),
                 dropout: float = 0.2):
        super().__init__()

        # Embeddings of the element-wise-product branch
        self.scientist_vec_gmd = nn.Embedding(num_scientists, dim)
        self.paper_vec_gmd = nn.Embedding(num_papers, dim)

        # Separate embeddings of the MLP branch
        self.scientist_emb_mlp = nn.Embedding(num_scientists, dim)
        self.paper_emb_mlp = nn.Embedding(num_papers, dim)

        # MLP layers: Linear -> ReLU -> Dropout per hidden width
        mlp_layers = []
        mlp_width = dim * 2  # because we concatenate two embeddings
        for hdim in hidden_dims:
            mlp_layers.extend([nn.Linear(mlp_width, hdim), nn.ReLU(), nn.Dropout(dropout)])
            mlp_width = hdim
        self.mlp1 = nn.Sequential(*mlp_layers)

        # Not used by forward(); kept so the attribute stays available.
        self.sigmoid = nn.Sigmoid()

        # Final prediction layer. Its input width is taken from the tracked MLP
        # output width (not hidden_dims[-1]) so an empty hidden_dims also works.
        # NOTE(review): the trailing ReLU forces non-negative outputs and passes
        # no gradient for samples whose pre-activation is negative; consider
        # dropping it for this regression target.
        self.output_layer = nn.Sequential(nn.Linear(dim + mlp_width, 1), nn.ReLU())

    def forward(self, sid: torch.Tensor, pid: torch.Tensor) -> torch.Tensor:
        """
        Inputs:
            sid: [B,], int
            pid: [B,], int

        Outputs: [B,], float (non-negative because of the final ReLU)
        """

        # Element-wise-product branch
        gmf = self.scientist_vec_gmd(sid) * self.paper_vec_gmd(pid)  # [B, dim]

        # MLP branch: concatenate the two embeddings and feed them through the MLP
        mlp_in = torch.cat([self.scientist_emb_mlp(sid), self.paper_emb_mlp(pid)], dim=-1)  # [B, 2*dim]
        mlp_out = self.mlp1(mlp_in)  # [B, hidden_dims[-1]] (or [B, 2*dim] without hidden layers)

        # Fuse both branches and predict
        fused = torch.cat([mlp_out, gmf], dim=-1)
        score = self.output_layer(fused)  # [B, 1]

        return score.squeeze(-1)  # [B]
print("Done")

Done


In [6]:
# Instantiate the model (10k scientists, 1k papers, 32-dimensional embeddings)
# and move its parameters to the selected device
model = NeuralCollaborativeFilteringModel(num_scientists=10_000, num_papers=1_000, dim=32)
model = model.to(device)

In [7]:
def get_dataset(df: pd.DataFrame) -> torch.utils.data.Dataset:
    """Wrap the "sid", "pid" and "rating" columns of a frame in a TensorDataset.

    The id columns keep their NumPy dtype; the ratings are cast to float32.
    """

    id_tensors = [torch.from_numpy(df[column].to_numpy()) for column in ("sid", "pid")]
    targets = torch.from_numpy(df["rating"].to_numpy()).float()
    return torch.utils.data.TensorDataset(*id_tensors, targets)

In [8]:
# Load the ratings, then wrap both splits as tensor datasets
train_df, valid_df = read_data_df()
train_dataset, valid_dataset = get_dataset(train_df), get_dataset(valid_df)

# Only the training batches are reshuffled; validation keeps the frame order
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=64)
valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, shuffle=False, batch_size=64)
print("Done")

Done


Training loop

In [9]:
NUM_EPOCHS = 1


def train_model(model, train_loader, val_loader=None, num_epochs=None):
    """Train `model` with Adam on an MSE loss and report validation RMSE per epoch.

    Inputs:
        model: Module called as model(sid, pid) that returns rating predictions.
        train_loader: Iterable of (sid, pid, ratings) training batches.
        val_loader: Iterable of (sid, pid, ratings) validation batches.
            Defaults to the module-level `valid_loader`.
        num_epochs: Number of passes over train_loader. Defaults to NUM_EPOCHS.

    Outputs: The same model instance, trained in place and left in eval mode
        (when at least one epoch ran).
    """
    if val_loader is None:
        val_loader = valid_loader
    if num_epochs is None:
        num_epochs = NUM_EPOCHS

    optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(num_epochs):
        print("Epoch", epoch)

        # Train model for an epoch
        total_loss = 0.0
        total_data = 0
        model.train()
        for sid, pid, ratings in train_loader:
            # Move data to the compute device
            sid = sid.to(device)
            pid = pid.to(device)
            ratings = ratings.to(device)

            # Make prediction and compute loss
            pred = model(sid, pid)
            loss = F.mse_loss(pred, ratings)

            # Compute gradients w.r.t. loss and take a step in that direction
            optim.zero_grad()
            loss.backward()
            optim.step()

            # Keep track of running loss (weighted by batch size, since the
            # last batch may be smaller)
            total_data += len(sid)
            total_loss += len(sid) * loss.item()

        # Evaluate model on validation data. No gradients are needed here, so
        # skip building the autograd graph.
        total_val_mse = 0.0
        total_val_data = 0
        model.eval()
        with torch.no_grad():
            for sid, pid, ratings in val_loader:
                # Move data to the compute device
                sid = sid.to(device)
                pid = pid.to(device)
                ratings = ratings.to(device)

                # Clamp predictions to [1, 5], since all ground-truth ratings
                # lie in that range
                pred = model(sid, pid).clamp(1, 5)
                mse = F.mse_loss(pred, ratings)

                # Keep track of running metrics
                total_val_data += len(sid)
                total_val_mse += len(sid) * mse.item()

        # Report NaN instead of dividing by zero when a loader is empty
        train_loss = total_loss / total_data if total_data else float("nan")
        valid_rmse = (total_val_mse / total_val_data) ** 0.5 if total_val_data else float("nan")
        print(f"[Epoch {epoch+1}/{num_epochs}] Train loss={train_loss:.3f}, Valid RMSE={valid_rmse:.3f}")
    return model


In [10]:
# Train in place; train_model returns the same model instance
model = train_model(model=model, train_loader=train_loader)

Epoch 0
[Epoch 1/1] Train loss=1.087, Valid RMSE=0.911


In [11]:
def pred_fn(sids, pids):
    """Predict ratings, clamped to [1, 5], for NumPy arrays of sid and pid.

    The function switches the module-level `model` to eval mode and disables
    gradient tracking itself, so it can be called from any context. The
    model's previous train/eval mode is restored afterwards.

    Inputs:
        sids: NumPy array of scientist ids.
        pids: NumPy array of paper ids, same length as sids.

    Outputs: NumPy array with one predicted rating per (sid, pid) pair.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            sid_t = torch.from_numpy(sids).to(device)
            pid_t = torch.from_numpy(pids).to(device)
            return model(sid_t, pid_t).clamp(1, 5).cpu().numpy()
    finally:
        model.train(was_training)


# Evaluate on validation data (pred_fn disables gradient tracking internally)
val_score = evaluate(valid_df, pred_fn)

print(f"Validation RMSE: {val_score:.3f}")

Validation RMSE: 0.911


### Test data and submission

In [12]:
# Predict every (scientist, paper) pair of the sample submission without
# tracking gradients and write the result to a CSV file
with torch.no_grad():
    make_submission(pred_fn=pred_fn, filename="collab-filtering-NCF.csv")