In [1]:
from typing import Tuple, Callable

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import os
import time

Make sure that results are reproducible by using a seed.

In [2]:
# Single seed shared by every random number generator seeded in this notebook.
SEED = 42
torch.manual_seed(SEED)  # seeds PyTorch's default generator
np.random.seed(SEED)  # seeds NumPy's global RNG, which the np.random.* calls below draw from

## Helper functions

In [3]:
# Directory holding the CSV inputs; every reader builds its path with os.path.join(DATA_DIR, ...).
# NOTE(review): with an empty string os.path.join returns the bare file name, so the
# files are presumably looked up in the current working directory -- confirm.
DATA_DIR = ""


def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Reads the ratings and splits them into training and validation sets (75/25).

    The "sid_pid" key column of train_ratings.csv is split on "_" into integer
    "sid" and "pid" columns. The split is seeded with SEED so that every call
    (and every notebook re-run) yields the same partition, which keeps
    validation scores comparable between experiments.

    Outputs: (train_df, valid_df)
    """

    df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # Split sid_pid into sid and pid columns
    df[["sid", "pid"]] = df["sid_pid"].str.split("_", expand=True)
    df = df.drop("sid_pid", axis=1)
    df["sid"] = df["sid"].astype(int)
    df["pid"] = df["pid"].astype(int)

    # Split into train and validation dataset. random_state pins the shuffle;
    # without it the partition depends on how much of NumPy's global random
    # stream was consumed before this call, so repeated calls disagree.
    train_df, valid_df = train_test_split(df, test_size=0.25, random_state=SEED)
    return train_df, valid_df


def read_data_matrix(df: pd.DataFrame) -> np.ndarray:
    """Pivots long-format ratings into a dense 2-D array.

    Rows correspond to scientists (sid) and columns to papers (pid); each cell
    holds the "rating" of that pair. (sid, pid) pairs absent from df come out
    of the pivot as NaN.
    """

    rating_table = df.pivot(index="sid", columns="pid", values="rating")
    return rating_table.values


def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """
    Scores a prediction function against held-out ratings.

    Inputs:
        valid_df: Validation data with "sid", "pid" and "rating" columns, e.g. the
            second frame returned by read_data_df.
        pred_fn: Function that maps arrays of sid and pid to predicted ratings.

    Outputs: Validation RMSE
    """
    scientist_ids = valid_df["sid"].values
    paper_ids = valid_df["pid"].values
    predicted = pred_fn(scientist_ids, paper_ids)
    observed = valid_df["rating"].values
    return root_mean_squared_error(observed, predicted)


def make_submission(pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray], filename: os.PathLike):
    """Writes a submission CSV that can be uploaded to kaggle.

    The (sid, pid) pairs to score are taken from sample_submission.csv in
    DATA_DIR; its "rating" column is overwritten with the predictions.

    Inputs:
        pred_fn: Function that maps arrays of sid and pid to a score.
        filename: Destination path of the submission file.
    """

    submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

    # "sid_pid" keys look like "<sid>_<pid>": column 0 is the sid, column 1 the pid
    id_parts = submission["sid_pid"].str.split("_", expand=True)
    sids, pids = (id_parts[col].astype(int).values for col in (0, 1))

    submission["rating"] = pred_fn(sids, pids)
    submission.to_csv(filename, index=False)

In [4]:
# Load the ratings and split them into training and validation frames.
# NOTE(review): the same call is repeated in the "Main routine" cell further down,
# which rebinds train_df / valid_df -- one of the two calls is probably redundant.
train_df, valid_df = read_data_df()

### Improve the speed of the SVD++ training by
1) Avoiding `iterrows` and Python lists
2) Vectorizing the implicit-feedback update and removing the inner Python loop

In [5]:
def train_svdpp_fast(train_df, num_factors=20, lr=0.005, reg=0.02, n_epochs=20):
    """
    Train a NumPy-only SVD++ on train_df (with columns 'sid','pid','rating').

    A rating of item i by user u is modelled as

        mu + b_u[u] + b_i[i] + q[i] . (p[u] + |N(u)|^-1/2 * sum_{j in N(u)} y[j])

    where N(u) is the list of items user u rated in train_df. Parameters are
    fitted by per-rating SGD on the L2-regularised squared error.

    Inputs:
        train_df: DataFrame with 'sid', 'pid' and 'rating' columns.
        num_factors: Dimension of the latent factor vectors p, q and y.
        lr: SGD learning rate.
        reg: L2 regularisation strength, applied to every bias and factor.
        n_epochs: Number of passes over the ratings; each pass visits them in a
            freshly drawn random order.

    Outputs: dict with
        'mu': global mean rating (float32),
        'b_u', 'b_i': user / item bias vectors,
        'p', 'q', 'y': user, item and implicit-feedback factor matrices,
        'user2ind', 'item2ind': raw id -> row index mappings,
        'implicit': user index -> list of rated item indices,
        'num_factors': the factor dimension.

    Randomness (initialisation and visiting order) comes from NumPy's global
    RNG; seed np.random beforehand for reproducible results.
    """
    # 1) remap IDs to 0...N-1 (order of first appearance)
    sids = train_df['sid'].unique()
    pids = train_df['pid'].unique()
    user2ind = {sid: i for i, sid in enumerate(sids)}
    item2ind = {pid: i for i, pid in enumerate(pids)}
    n_users, n_items = len(sids), len(pids)

    # 2) build index arrays once
    user_arr   = train_df['sid'].map(user2ind).to_numpy(dtype=np.int32)
    item_arr   = train_df['pid'].map(item2ind).to_numpy(dtype=np.int32)
    rating_arr = train_df['rating'].to_numpy(dtype=np.float32)

    # 3) global mean as float32
    mu = np.float32(rating_arr.mean())

    # 4) init parameters (float32)
    b_u = np.zeros(n_users, np.float32)
    b_i = np.zeros(n_items, np.float32)
    p   = np.random.normal(0, 0.1, (n_users, num_factors)).astype(np.float32)
    q   = np.random.normal(0, 0.1, (n_items, num_factors)).astype(np.float32)
    y   = np.random.normal(0, 0.1, (n_items, num_factors)).astype(np.float32)

    # 5) build implicit lists: N(u) = items rated by u, as index arrays for
    #    fancy indexing into y. The plain-list form is returned for svdpp_pred.
    implicit = {u: [] for u in range(n_users)}
    for u, i in zip(user_arr, item_arr):
        implicit[u].append(i)
    Nu_list = [np.array(implicit[u], dtype=np.int32) for u in range(n_users)]
    # max(.., 1) only guards the division; every user here has >= 1 rating.
    sqrt_Nu = np.sqrt(np.maximum([len(a) for a in Nu_list], 1)).astype(np.float32)

    # 6) SGD loop
    n_ratings = rating_arr.shape[0]
    for epoch in range(n_epochs):
        start = time.time()
        perm = np.random.permutation(n_ratings)
        for idx in perm:
            u = user_arr[idx]; i = item_arr[idx]; r = rating_arr[idx]

            # The implicit term is recomputed from y on every step. Rows of y
            # are shared between all users who rated the same item, so a
            # per-user cached sum goes stale as soon as another user updates
            # one of those rows, and training would then optimise a different
            # model than the one svdpp_pred evaluates.
            idxs   = Nu_list[u]                 # shape (|Nu|,)
            y_rows = y[idxs]                    # (|Nu|, f) copy
            norm   = sqrt_Nu[u]
            imp    = y_rows.sum(axis=0) / norm  # (f,)

            # Snapshot the factors so that every gradient below is evaluated
            # at the pre-step parameter values.
            q_old = q[i].copy()
            p_old = p[u].copy()

            pred = mu + b_u[u] + b_i[i] + q_old.dot(p_old + imp)
            err  = r - pred

            # biases & factors
            b_u[u] += lr * (err - reg * b_u[u])
            b_i[i] += lr * (err - reg * b_i[i])
            p[u]   += lr * (err * q_old - reg * p_old)
            q[i]   += lr * (err * (p_old + imp) - reg * q_old)

            # implicit factors: one vectorised update for all j in N(u)
            y[idxs] = y_rows + lr * ((err / norm) * q_old - reg * y_rows)
        end = time.time()
        print(f"Epoch {epoch+1}/{n_epochs} done. {end-start:.2f}s elapsed.")

    return {
        'mu':mu, 'b_u':b_u, 'b_i':b_i,
        'p':p,   'q':q,   'y':y,
        'user2ind':user2ind, 'item2ind':item2ind,
        'implicit':implicit, 'num_factors':num_factors
    }

# Prediction function based on the trained SVD++ model.
def svdpp_pred(model, sids, pids):
    """Predicts ratings for (sid, pid) pairs with a trained SVD++ model.

    Inputs:
        model: dict as returned by train_svdpp_fast (keys 'mu', 'b_u', 'b_i',
            'p', 'q', 'y', 'user2ind', 'item2ind', 'implicit', 'num_factors').
        sids: iterable of raw scientist ids.
        pids: iterable of raw paper ids, paired element-wise with sids.

    Outputs: 1-D array with one prediction per pair. Pairs whose sid or pid
    was not seen during training get the global mean model['mu'].
    """
    mu = model['mu']
    user2ind = model['user2ind']
    item2ind = model['item2ind']
    b_u = model['b_u']
    b_i = model['b_i']
    p = model['p']
    q = model['q']
    y = model['y']
    num_factors = model['num_factors']
    implicit = model['implicit']

    # The normalised implicit-feedback vector depends only on the user, so it
    # is computed once per user and reused for all of that user's pairs
    # instead of being re-summed for every single prediction.
    imp_cache = {}

    preds = []
    for sid, pid in zip(sids, pids):
        if (sid in user2ind) and (pid in item2ind):
            u = user2ind[sid]
            i = item2ind[pid]
            imp_sum = imp_cache.get(u)
            if imp_sum is None:
                Nu = implicit[u]
                if Nu:
                    imp_sum = np.sum(y[Nu, :], axis=0) / np.sqrt(len(Nu))
                else:
                    # no implicit feedback recorded for this user
                    imp_sum = np.zeros(num_factors)
                imp_cache[u] = imp_sum
            pred = mu + b_u[u] + b_i[i] + np.dot(q[i], p[u] + imp_sum)
        else:
            pred = mu  # default to global mean if unknown user/item
        preds.append(pred)
    return np.array(preds)


### Adjust the prediction function to add a fixed boost to the score if the scientist wants to read the paper

In [6]:
# Read the "to be read" (TBR) list and turn it into a set of (sid, pid)
# tuples so that membership checks are constant-time.
tbr_df = pd.read_csv(os.path.join(DATA_DIR, "train_tbr.csv"))  # columns: sid, pid
tbr_pairs = {(sid, pid) for sid, pid in zip(tbr_df['sid'], tbr_df['pid'])}

# Wrapper around svdpp_pred that boosts wish-listed pairs and caps the result.
def svdpp_pred_with_tbr_and_cap(model, sids, pids, boost_pairs, boost=0.5, cap=5.0):
    """SVD++ predictions with a fixed bonus for boosted pairs and an upper cap.

    Inputs:
        model: trained model dict, passed straight through to svdpp_pred.
        sids, pids: paired arrays of scientist and paper ids.
        boost_pairs: container of (sid, pid) tuples that receive the bonus.
        boost: amount added to the prediction of every pair in boost_pairs.
        cap: upper bound applied to all predictions; there is no lower bound.

    Outputs: array of adjusted predictions, one per (sid, pid) pair.
    """
    preds = svdpp_pred(model, sids, pids)

    # positions of the pairs that appear in the boost set
    boosted = np.array(
        [pos for pos, pair in enumerate(zip(sids, pids)) if pair in boost_pairs],
        dtype=np.intp,
    )
    preds[boosted] += boost

    # enforce the rating ceiling in place
    np.clip(preds, None, cap, out=preds)

    return preds

### Main routine for training and evaluation 

In [7]:
# Read training and validation data using the provided helper function.
# NOTE(review): this rebinds the train_df / valid_df already created by the
# identical call in the earlier cell.
train_df, valid_df = read_data_df()

In [8]:
# Train the SVD++ model (adjust hyperparameters as needed).
# num_factors=20, lr=0.005, reg=0.02, n_epochs=5 --> RMSE: 0.892
# num_factors=20, lr=0.005, reg=0.02, n_epochs=10 --> RMSE: 0.880
# num_factors=20, lr=0.005, reg=0.02, n_epochs=20 --> RMSE: 0.870
# num_factors=50, lr=0.01, reg=0.05, n_epochs=5 --> RMSE: 0.885
# model = train_svdpp_fast(train_df, num_factors=50, lr=0.01, reg=0.05, n_epochs=5) 

In [9]:
# Define the prediction function for evaluation/submission.
# svdpp_fn = lambda sids, pids: svdpp_pred(model, sids, pids)

# Evaluate on validation data.
# val_rmse = evaluate(valid_df, svdpp_fn)
# print(f"Validation RMSE: {val_rmse:.3f}")

### Prediction with an added boost for papers on the read-list

For actual submissions, we should train on the entire data available.

In [10]:
def read_data_for_training() -> pd.DataFrame:
    """Loads all of train_ratings.csv for training, without a validation hold-out.

    The "sid_pid" key column is replaced by integer "sid" and "pid" columns.
    """

    ratings = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # "sid_pid" looks like "<sid>_<pid>": split it, convert both parts to int,
    # then discard the combined key
    id_parts = ratings["sid_pid"].str.split("_", expand=True)
    ratings[["sid", "pid"]] = id_parts
    ratings = ratings.drop(columns="sid_pid")
    for id_col in ("sid", "pid"):
        ratings[id_col] = ratings[id_col].astype(int)

    return ratings

In [None]:
# Load every available rating (no validation hold-out) for the submission model.
full_train_df = read_data_for_training()

# Fit the SVD++ model that the prediction functions in the following cells use.
model = train_svdpp_fast(full_train_df, num_factors=50, lr=0.005, reg=0.05, n_epochs=15)

In [12]:
# Prediction function for evaluation/submission.
def svdpp_fn(sids, pids):
    """Plain SVD++ predictions from the notebook-level `model` (looked up at call time)."""
    return svdpp_pred(model, sids, pids)

In [13]:
# Eval-able prediction function that applies the TBR boost and the rating cap.
def svdpp_tbr_cap_fn(sids, pids):
    """SVD++ predictions from the notebook-level `model`, capped at 5.0.

    With boost=0.0 the to-be-read lookup adds nothing to any prediction, so
    the only difference from the plain predictions is the upper cap.
    """
    return svdpp_pred_with_tbr_and_cap(
        model, sids, pids, tbr_pairs, boost=0.0, cap=5.0
    )

In [14]:
# Create the submission file from the plain (unboosted, uncapped) SVD++ predictions.
# NOTE(review): the file name says "30e", but the visible training cell uses
# n_epochs=15 -- confirm which is intended before uploading.
make_submission(svdpp_fn, "svdpp_submission_30e_noboost.csv")

In [15]:
# Create the submission file from the TBR-boosted, capped predictions.
# NOTE(review): the file name appears to encode num_factors=50, lr=0.005, reg=0.05,
# 30 epochs and boost=0.5, but the visible cells use n_epochs=15 and boost=0.0 --
# confirm the name (or the parameters) before uploading.
make_submission(svdpp_tbr_cap_fn, "svdpp_50_0.005_0.05_30_0.5.csv")