In [2]:
from typing import Tuple, Callable

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import os

## Helper functions

In [3]:
# Directory that holds the CSV files read by the helpers below
# (train_ratings.csv and sample_submission.csv).
# Alternative data location kept for reference (looks like a cluster path — confirm before use):
# DATA_DIR = "/cluster/courses/cil/collaborative_filtering/data"
DATA_DIR = "../data"
# Default random_state used by read_data_df for the train/validation split.
SEED = 42


def read_data_df(seed: int = SEED) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Loads ``train_ratings.csv`` and splits it 75/25 into training and validation frames.

    The combined ``sid_pid`` key column is replaced by two integer columns,
    ``sid`` and ``pid``.

    Inputs:
        seed: random_state handed to ``train_test_split``.

    Outputs: ``(train_df, valid_df)``.
    """

    ratings = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # "<sid>_<pid>" -> two separate columns, then cast both to int
    id_parts = ratings["sid_pid"].str.split("_", expand=True)
    ratings[["sid", "pid"]] = id_parts
    ratings = ratings.drop(columns="sid_pid").astype({"sid": int, "pid": int})

    # 25% of the rows are held out for validation
    train_part, valid_part = train_test_split(ratings, test_size=0.25, random_state=seed)
    return train_part, valid_part


def read_data_matrix(df: pd.DataFrame) -> np.ndarray:
    """Returns a dense matrix view of the ratings in ``df``.

    Rows are scientists (sid) and columns are papers (pid); the cell at
    (sid, pid) holds the corresponding ``rating``. Pairs that do not occur in
    ``df`` are NaN.

    Inputs:
        df: Ratings with columns ``sid``, ``pid`` and ``rating``.

    Outputs: 2-D array of shape (number of distinct sids, number of distinct pids).

    Raises:
        ValueError: if a (sid, pid) pair occurs more than once (raised by ``DataFrame.pivot``).
    """

    rating_table = df.pivot(index="sid", columns="pid", values="rating")
    return rating_table.to_numpy()

def read_full_data_matrix(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Returns a dense matrix view of the ratings in ``df`` together with its axis labels.

    Rows are scientists (sid) and columns are papers (pid); pairs that do not
    occur in ``df`` are NaN.

    Inputs:
        df: Ratings with columns ``sid``, ``pid`` and ``rating``.

    Outputs:
        matrix: 2-D rating array, one row per sid and one column per pid.
        row_ids: sid belonging to each matrix row.
        col_ids: pid belonging to each matrix column.

    Raises:
        ValueError: if a (sid, pid) pair occurs more than once (raised by ``DataFrame.pivot``).
    """

    pivot_df = df.pivot(index="sid", columns="pid", values="rating")
    row_ids = pivot_df.index.to_numpy()
    col_ids = pivot_df.columns.to_numpy()
    return pivot_df.to_numpy(), row_ids, col_ids


def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """Scores a prediction function on held-out ratings.

    Inputs:
        valid_df: Validation data with ``sid``, ``pid`` and ``rating`` columns,
            e.g. the second frame returned by read_data_df.
        pred_fn: Function that maps arrays of sid and pid to predicted ratings.

    Outputs: Validation RMSE
    """

    # predictions are requested for every (sid, pid) row of the validation frame
    predicted = pred_fn(valid_df["sid"].values, valid_df["pid"].values)
    actual = valid_df["rating"].values
    return root_mean_squared_error(actual, predicted)


def make_submission(pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray], filename: os.PathLike):
    """Makes a submission CSV file that can be submitted to kaggle.

    Reads ``sample_submission.csv`` from DATA_DIR, fills its ``rating`` column
    with predictions for every listed ``sid_pid`` and writes the result to
    ``filename``. The parent directory of ``filename`` is created if missing.

    Inputs:
        pred_fn: Function that takes in arrays of sid and pid and outputs a score.
        filename: File to save the submission to.
    """

    submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

    # Get sids and pids from the combined "<sid>_<pid>" key
    id_parts = submission["sid_pid"].str.split("_", expand=True)
    sids = id_parts[0].astype(int).values
    pids = id_parts[1].astype(int).values

    submission["rating"] = pred_fn(sids, pids)

    # to_csv does not create missing directories, so make sure the target
    # folder exists (only for real paths, not for file-like objects).
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    submission.to_csv(filename, index=False)

## Baseline ALS implementation 

In [4]:
class ALSRecommender:
    """Baseline matrix-factorization recommender trained with alternating least squares (ALS).

    The training ratings are pivoted into a matrix R (one row per sid, one
    column per pid) that is approximated by U @ V.T, fitting only the entries
    that are actually observed.
    """

    def __init__(self,
                 train_df: pd.DataFrame,
                 valid_df: pd.DataFrame,
                 num_factors: int = 10,
                 num_iters: int = 10,
                 reg: float = 0.1):
        """
        train_df: training ratings (columns sid, pid, rating)
        valid_df: validation ratings, only used to report validation RMSE in fit
        num_factors: latent dimensionality (k)
        num_iters: number of ALS rounds
        reg: regularization λ
        """
        # build rating matrix R and mask M (1.0 where a rating is observed)
        R, user_ids, item_ids = read_full_data_matrix(train_df)
        self.M = (~np.isnan(R)).astype(np.float32)
        self.R_filled = np.nan_to_num(R, nan=0.0)
        self.n_users, self.n_items = R.shape

        # lookup tables to map sid and pid to matrix indices
        self.user_ids = user_ids
        self.item_ids = item_ids
        self.user_index = {uid: idx for idx, uid in enumerate(user_ids)}
        self.item_index = {iid: idx for idx, iid in enumerate(item_ids)}

        # keep both frames so fit() can report train/validation RMSE without
        # depending on notebook-level globals
        self.train_df = train_df
        self.valid_df = valid_df

        # ALS hyperparams
        self.k = num_factors
        self.num_iters = num_iters
        self.reg = reg

        # initialize latent factors (UV^{T} will approximate R); drawn from
        # NumPy's global RNG, so seed it beforehand for reproducible runs
        self.U = np.random.normal(scale=0.1, size=(self.n_users, self.k))
        self.V = np.random.normal(scale=0.1, size=(self.n_items, self.k))

    def fit(self, verbose: bool = True) -> None:
        """Runs num_iters ALS rounds, updating self.U and self.V in place.

        verbose: when True (default), prints train and validation RMSE after
            every round; pass False to skip that evaluation.
        """
        I_k = np.eye(self.k)
        observed = self.M.astype(bool)  # boolean view of the mask, built once

        for it in range(self.num_iters):
            # Fix V, optimize U
            # Solving:
            #    min_{u_i} \sum (R_{ij} - u_i^{T} v_j)^2 + \lambda ||u_i||^{2},
            # which leads to setting:
            #    u_i = (V_i^{T} V_i + \lambda I)^{-1} V_{i}^{T} r_i
            # where r_i is the vector of ratings given by scientist i
            for i in range(self.n_users):
                rated = observed[i]               # papers rated by scientist i
                V_i = self.V[rated]               # factors of those papers
                r_i = self.R_filled[i, rated]     # ratings given by scientist i
                A = V_i.T @ V_i + self.reg * I_k  # A = V_i^{T} V_i + \lambda I
                b = V_i.T @ r_i                   # V_{i}^{T} r_i
                self.U[i] = np.linalg.solve(A, b)

            # Fix U, optimize V (same normal equations with the roles swapped)
            for j in range(self.n_items):
                raters = observed[:, j]           # scientists who rated paper j
                U_j = self.U[raters]
                r_j = self.R_filled[raters, j]
                A = U_j.T @ U_j + self.reg * I_k
                b = U_j.T @ r_j
                self.V[j] = np.linalg.solve(A, b)

            if not verbose:
                continue

            # train/validation RMSE on the frames given to the constructor
            preds_train = self.predict(self.train_df["sid"].values,
                                       self.train_df["pid"].values)
            train_rmse = root_mean_squared_error(self.train_df["rating"],
                                                 preds_train)
            val_rmse = evaluate(self.valid_df, self.get_pred_fn())

            print(f"Iter {it+1:2d}/{self.num_iters} — "
                  f"Train RMSE: {train_rmse:.4f}  |  "
                  f"Validate RMSE: {val_rmse:.4f}")

    def predict(self, sids: np.ndarray, pids: np.ndarray) -> np.ndarray:
        """Predicts a rating for every (sid, pid) pair.

        Pairs whose sid or pid was not part of the training matrix fall back
        to the mean of all predicted ratings (mean of U @ V.T).
        """
        preds = []
        # mean predicted rating; computed lazily because it needs the full
        # n_users x n_items product and is only used for unknown ids
        fallback = None
        for sid, pid in zip(sids, pids):
            row = self.user_index.get(sid)
            col = self.item_index.get(pid)
            if row is not None and col is not None:
                # mathematically, return \hat{R}_{ij} = u_i^{T} v_j
                preds.append(self.U[row].dot(self.V[col]))
            # otherwise, give mean to avoid error
            else:
                if fallback is None:
                    fallback = np.nanmean(self.U @ self.V.T)
                print("user or paper not found!")
                preds.append(fallback)

        return np.array(preds)

    def get_pred_fn(self):
        """Returns a callable (sids, pids) -> predictions bound to this model."""
        return self.predict


In [21]:
# Repeat the experiment on several train/validation splits to estimate how
# much the RMSE varies with the split.
k = 15
num_iterations = 30
regParam = 20.0

val_rmses = []
train_rmses = []
for s in [10, 15, 20, 42, 50]:
    print(s)
    # ALSRecommender draws its initial factors from NumPy's global RNG;
    # seed it per split so every run of this cell gives the same numbers.
    np.random.seed(s)
    train_df, valid_df = read_data_df(s)
    als = ALSRecommender(train_df, valid_df,
                         num_factors=k,
                         num_iters=num_iterations,
                         reg=regParam)
    als.fit()
    # final RMSE of the fitted model on both parts of the split
    val_rmse = evaluate(valid_df, als.get_pred_fn())
    train_rmse = evaluate(train_df, als.get_pred_fn())
    val_rmses.append(val_rmse)
    train_rmses.append(train_rmse)
    print('\n\n')

10
Iter  1/30 — Train RMSE: 2.6210  |  Validate RMSE: 2.8265
Iter  2/30 — Train RMSE: 0.9100  |  Validate RMSE: 0.9786
Iter  3/30 — Train RMSE: 0.8375  |  Validate RMSE: 0.9214
Iter  4/30 — Train RMSE: 0.8058  |  Validate RMSE: 0.8978
Iter  5/30 — Train RMSE: 0.7903  |  Validate RMSE: 0.8863
Iter  6/30 — Train RMSE: 0.7816  |  Validate RMSE: 0.8798
Iter  7/30 — Train RMSE: 0.7762  |  Validate RMSE: 0.8755
Iter  8/30 — Train RMSE: 0.7725  |  Validate RMSE: 0.8725
Iter  9/30 — Train RMSE: 0.7698  |  Validate RMSE: 0.8702
Iter 10/30 — Train RMSE: 0.7677  |  Validate RMSE: 0.8685
Iter 11/30 — Train RMSE: 0.7661  |  Validate RMSE: 0.8672
Iter 12/30 — Train RMSE: 0.7648  |  Validate RMSE: 0.8661
Iter 13/30 — Train RMSE: 0.7638  |  Validate RMSE: 0.8652
Iter 14/30 — Train RMSE: 0.7629  |  Validate RMSE: 0.8644
Iter 15/30 — Train RMSE: 0.7622  |  Validate RMSE: 0.8638
Iter 16/30 — Train RMSE: 0.7616  |  Validate RMSE: 0.8632
Iter 17/30 — Train RMSE: 0.7610  |  Validate RMSE: 0.8628
Iter 18/30 

In [20]:
# Mean and standard deviation of the final RMSEs collected over all splits.
train_mean_rmse, train_std_rmse = np.mean(train_rmses), np.std(train_rmses)
val_mean_rmse, val_std_rmse = np.mean(val_rmses), np.std(val_rmses)

print(f'''Mean train RMSE: {train_mean_rmse:.4f},
          Std train RMSE: {train_std_rmse:.4f}''')
print(f'''Mean validation RMSE: {val_mean_rmse:.4f},
          Std validation RMSE: {val_std_rmse:.4f}''')

Mean train RMSE: 0.7577,
          Std train RMSE: 0.0002
Mean validation RMSE: 0.8594,
          Std validation RMSE: 0.0007


In [6]:
# Train on the default split and write the Kaggle submission file.
# ALSRecommender draws its initial factors from NumPy's global RNG; seed it
# so the submission can be regenerated exactly.
np.random.seed(SEED)
train_df, valid_df = read_data_df()

k = 15
num_iterations = 30
regParam = 20.0
als = ALSRecommender(train_df, valid_df,
                     num_factors=k,
                     num_iters=num_iterations,
                     reg=regParam)
als.fit()
pred_fn = als.get_pred_fn()
# hyperparameters are encoded in the file name: <k>_<reg>_<iterations>
make_submission(pred_fn,
                filename=f"./submissions/als_simple_tuned_{k}_{regParam}_{num_iterations}.csv")

Iter  1/30 — Train RMSE: 2.0959  |  Validate RMSE: 2.2343
Iter  2/30 — Train RMSE: 0.8854  |  Validate RMSE: 0.9595
Iter  3/30 — Train RMSE: 0.8327  |  Validate RMSE: 0.9202
Iter  4/30 — Train RMSE: 0.8057  |  Validate RMSE: 0.8992
Iter  5/30 — Train RMSE: 0.7908  |  Validate RMSE: 0.8872
Iter  6/30 — Train RMSE: 0.7822  |  Validate RMSE: 0.8801
Iter  7/30 — Train RMSE: 0.7766  |  Validate RMSE: 0.8755
Iter  8/30 — Train RMSE: 0.7727  |  Validate RMSE: 0.8723
Iter  9/30 — Train RMSE: 0.7699  |  Validate RMSE: 0.8699
Iter 10/30 — Train RMSE: 0.7677  |  Validate RMSE: 0.8681
Iter 11/30 — Train RMSE: 0.7660  |  Validate RMSE: 0.8667
Iter 12/30 — Train RMSE: 0.7647  |  Validate RMSE: 0.8655
Iter 13/30 — Train RMSE: 0.7636  |  Validate RMSE: 0.8646
Iter 14/30 — Train RMSE: 0.7626  |  Validate RMSE: 0.8638
Iter 15/30 — Train RMSE: 0.7619  |  Validate RMSE: 0.8632
Iter 16/30 — Train RMSE: 0.7612  |  Validate RMSE: 0.8626
Iter 17/30 — Train RMSE: 0.7606  |  Validate RMSE: 0.8621
Iter 18/30 — T