In [5]:
import sys
# Install the packages listed in requirements.txt using the interpreter that
# backs this kernel (`sys.executable`), so they land in the environment the
# notebook actually runs in. `-q -q` reduces pip's console output.
!{sys.executable} -m pip install -q -q -r requirements.txt


In [12]:
import datetime, os, random, shutil, urllib.request, zipfile, time, warnings
warnings.filterwarnings('ignore')
from functools import wraps
from math import trunc
import numpy as np
import pandas as pd
from pathlib import Path
from urllib.request import urlopen
from zipfile import ZipFile
from scipy.sparse.linalg import norm
import scipy.sparse as ss
from scipy.sparse.linalg import svds
from tqdm import tqdm
from sklearn.metrics import mean_squared_error as mse
import plotly.express as px

from numba import njit

# Default seed shared by every call to seed_everything() in this notebook.
SEED = 123


def seed_everything(seed=SEED):
    """Seed the random sources this notebook touches from Python code.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both NumPy's
    global generator and the standard-library ``random`` module.

    Parameters
    ----------
    seed : int, default ``SEED``
        Value applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


# Seed once at import time so the cells below start from a known state.
seed_everything()

# Preprocessing

In [7]:
def get_dataset():
    """Fetch (if needed), unpack (if needed) and load the MovieLens-1M ratings.

    The archive is stored as ``m1.zip`` and extracted to ``ml-1m/`` in the
    current working directory; both steps are skipped when their output
    already exists.

    Returns
    -------
    pandas.DataFrame
        One row per line of ``ml-1m/ratings.dat`` with numeric columns
        ``UserID``, ``MovieID``, ``Rating`` and ``Timestamp``.
    """
    archive = Path("m1.zip")
    if not archive.exists():
        print("Downloading dataset...")
        # Read the whole payload before creating the file: a failed download
        # must not leave an empty archive behind, because later runs would
        # see it, skip the download and then fail while unzipping.
        with urlopen("http://files.grouplens.org/datasets/movielens/ml-1m.zip") as response:
            payload = response.read()
        archive.write_bytes(payload)
    if not Path("ml-1m").is_dir():
        print("unzipping...")
        with ZipFile(archive) as zf:
            zf.extractall()

    # Fields are separated by "::"; the context manager closes the handle.
    with open('ml-1m/ratings.dat', 'r') as ratings_file:
        ratings_list = [line.strip().split("::") for line in ratings_file]

    # Convert explicitly instead of passing dtype=int to the constructor: how
    # the constructor treats string fields with dtype=int differs between
    # pandas versions, whereas pd.to_numeric is deterministic.
    ratings_df = pd.DataFrame(
        ratings_list, columns=['UserID', 'MovieID', 'Rating', 'Timestamp']
    ).apply(pd.to_numeric)
    return ratings_df

In [8]:
def split(df, n_test=2500):
    """Hold out a random sample of ratings as a test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with ``UserID``, ``MovieID`` and ``Rating`` columns (a
        ``Timestamp`` column, if present, is dropped). Each (user, movie)
        pair must occur at most once, otherwise the pivot below raises.
        The frame is not modified.
    n_test : int, default 2500
        Number of ratings moved to the test set.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Both keep the row labels of ``df``; ``UserID``/``MovieID`` are
        int32 and ``Rating`` is float64. ``test_df`` rows are in sampling
        order.

    Raises
    ------
    ValueError
        From ``random.sample`` when fewer than ``n_test`` positive ratings
        exist.
    """
    # Pivot the frame that was passed in rather than a notebook global, so
    # the function does not depend on hidden kernel state.
    R_df = df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)
    seed_everything()

    # Sample row numbers of the (user, movie) position table. random.sample
    # only depends on the population length, so this selects the same
    # entries as sampling the rows themselves, without building a Python
    # list holding one small array per rating.
    rated_positions = np.argwhere(R_df.values > 0)
    chosen = random.sample(range(len(rated_positions)), n_test)
    test_indices = rated_positions[chosen]

    u_ids = R_df.index[test_indices[:, 0]].astype(np.int32)
    i_ids = R_df.columns[test_indices[:, 1]].astype(np.int32)

    # drop()/astype() return new frames, leaving the caller's data untouched.
    df = df.drop(columns=['Timestamp'], errors='ignore').astype(
        {'UserID': np.int32, 'MovieID': np.int32, 'Rating': np.float64}
    )

    # Locate every sampled pair with a single indexed lookup instead of one
    # full boolean scan of the frame per pair. Pairs are unique (the pivot
    # above would have raised otherwise) and all come from df itself, so
    # every lookup resolves to exactly one row.
    all_pairs = pd.MultiIndex.from_frame(df[['UserID', 'MovieID']])
    wanted_pairs = pd.MultiIndex.from_arrays([u_ids, i_ids])
    test_df = df.iloc[all_pairs.get_indexer(wanted_pairs)]
    train_df = df.drop(index=test_df.index)

    return train_df, test_df

In [9]:
# Load the ratings table, then separate it into training and held-out test rows.
ratings_df = get_dataset()
train_df, test_df = split(ratings_df)

In [10]:
# Preview the first held-out rows (row labels are those of the original ratings frame).
test_df.head()

Unnamed: 0,UserID,MovieID,Rating
206405,1265,1732,4.0
409183,2456,2105,3.0
240650,1449,2959,4.0
886005,5350,2640,2.0
540360,3323,3468,4.0


# Funk-SVD

## Computational Methods

In [13]:

@njit
def _shuffle(X):
    """Shuffle ``X`` in place with ``np.random.shuffle`` and return it.

    The returned object is the array that was passed in, not a copy.
    """
    # NOTE(review): inside an @njit function this presumably draws from
    # Numba's own generator state rather than NumPy's global one, in which
    # case np.random.seed() called from plain Python would not make the
    # order reproducible -- confirm against the Numba documentation.
    np.random.shuffle(X)
    return X

@njit
def _initialization(n_users, n_items, n_factors):
    bu = np.zeros(n_users)
    bi = np.zeros(n_items)
    pu = np.random.normal(0, .1, (n_users, n_factors))
    qi = np.random.normal(0, .1, (n_items, n_factors))
    return bu, bi, pu, qi

@njit
def _run_epoch(X, bu, bi, pu, qi, global_mean, n_factors, lr, reg):
    """Perform one SGD pass over every rating in ``X``, in row order.

    Each row of ``X`` is read as (user index, item index, rating). The
    bias vectors and factor matrices are updated in place and also
    returned.

    Parameters
    ----------
    X : 2-D array
        Columns 0 and 1 are cast to int and used as indices into the
        parameter arrays; column 2 is the observed rating.
    bu, bi : 1-D arrays
        User and item biases.
    pu, qi : 2-D arrays
        User and item latent factors, ``n_factors`` columns each.
    global_mean : float
        Offset added to every prediction.
    n_factors : int
        Number of latent factors to iterate over.
    lr, reg : float
        Learning rate and L2 regularisation strength.

    Returns
    -------
    tuple
        ``(bu, bi, pu, qi)`` after the pass.
    """
    n_ratings = X.shape[0]
    for row in range(n_ratings):
        u = int(X[row, 0])
        it = int(X[row, 1])
        observed = X[row, 2]

        # Current estimate: baseline plus the factor inner product.
        estimate = global_mean + bu[u] + bi[it]
        for k in range(n_factors):
            estimate += pu[u, k] * qi[it, k]
        error = observed - estimate

        # Bias step.
        bu[u] += lr * (error - reg * bu[u])
        bi[it] += lr * (error - reg * bi[it])

        # Factor step: both gradients use the values from before the update,
        # hence the snapshots.
        for k in range(n_factors):
            p_old = pu[u, k]
            q_old = qi[it, k]
            pu[u, k] = p_old + lr * (error * q_old - reg * p_old)
            qi[it, k] = q_old + lr * (error * p_old - reg * q_old)
    return bu, bi, pu, qi

@njit
def _compute_val_metrics(X_val, bu, bi, pu, qi, global_mean, n_factors):
    """Score the current parameters on ``X_val``.

    Each row of ``X_val`` is read as (user index, item index, rating); the
    prediction is ``global_mean + bu[user] + bi[item]`` plus the inner
    product of the user's and item's factor rows.

    Returns
    -------
    tuple
        ``(loss, rmse, mae)``: mean squared residual, its square root, and
        the mean absolute residual.
    """
    n_rows = X_val.shape[0]
    errors = np.empty(n_rows)
    for row in range(n_rows):
        u = int(X_val[row, 0])
        it = int(X_val[row, 1])

        estimate = global_mean + bu[u] + bi[it]
        for k in range(n_factors):
            estimate += pu[u, k] * qi[it, k]
        errors[row] = X_val[row, 2] - estimate

    loss = np.square(errors).mean()
    mae = np.absolute(errors).mean()
    rmse = np.sqrt(loss)
    return loss, rmse, mae

## Funk SVD Class

In [14]:
class SVD:
    """Biased matrix-factorisation recommender trained with SGD.

    A rating is modelled as
    ``global_mean + bu[user] + bi[item] + dot(pu[user], qi[item])``.

    Parameters
    ----------
    lr : float
        SGD learning rate.
    reg : float
        L2 regularisation strength.
    n_epochs : int
        Number of passes over the training data.
    n_factors : int
        Latent dimension.
    min_delta : float
        Stored but not used by any method of this class.
    min_rating, max_rating : number
        Bounds applied to clipped predictions.

    Attributes set by ``fit``
    -------------------------
    user_mapping_, item_mapping_ : dict
        Raw id -> contiguous index, built from the training data.
    global_mean_ : float
        Mean training rating.
    bu_, bi_, pu_, qi_ : ndarray
        Learned biases and factors.
    metrics_ : pandas.DataFrame
        Per-epoch ``Loss``, ``RMSE`` and ``MAE`` on the data passed as
        ``X_test``.
    """

    def __init__(self, lr=.005, reg=.02, n_epochs=20, n_factors=100,
                 min_delta=.001,
                 min_rating=1, max_rating=5):

        self.lr = lr
        self.reg = reg
        self.n_epochs = n_epochs
        self.n_factors = n_factors
        # Kept so the argument is not silently discarded; nothing reads it.
        self.min_delta = min_delta
        self.min_rating = min_rating
        self.max_rating = max_rating

    def fit(self, X, X_test):
        """Train on ``X`` and track per-epoch metrics on ``X_test``.

        Both arguments are DataFrames with ``UserID``, ``MovieID`` and
        ``Rating`` columns. Returns a dict mapping ``test_Loss``,
        ``test_RMSE`` and ``test_MAE`` to their per-epoch series.

        Raises ``ValueError`` when ``X_test`` contains a user or movie
        that does not occur in ``X``.
        """
        X = self._preprocess_data(X)
        X_test = self._preprocess_data(X_test, train=False)
        self._init_metrics()
        self.global_mean_ = np.mean(X[:, 2])
        return self._run_sgd(X, X_test)

    def _preprocess_data(self, X, train=True):
        """Translate raw ids to contiguous indices.

        With ``train=True`` the id -> index mappings are (re)built from
        ``X``; otherwise the existing mappings are applied. The input
        frame is not modified. Returns a 2-D array with columns
        (user index, item index, rating).

        Raises ``ValueError`` when an id in ``X`` is not in the mappings.
        """
        if train:  # Mappings have to be created
            user_ids = X['UserID'].unique().tolist()
            item_ids = X['MovieID'].unique().tolist()
            self.user_mapping_ = {u_id: ix for ix, u_id in enumerate(user_ids)}
            self.item_mapping_ = {i_id: ix for ix, i_id in enumerate(item_ids)}

        user_idx = X['UserID'].map(self.user_mapping_)
        item_idx = X['MovieID'].map(self.item_mapping_)

        # Unmapped ids come back as NaN; report them clearly instead of
        # letting the integer cast below fail with an unrelated message.
        unseen = user_idx.isna() | item_idx.isna()
        if unseen.any():
            raise ValueError(
                f"{int(unseen.sum())} row(s) reference a user or movie not in train data"
            )

        encoded = X.assign(UserID=user_idx.astype(np.int32),
                           MovieID=item_idx.astype(np.int32))
        return encoded[['UserID', 'MovieID', 'Rating']].values

    def _init_metrics(self):
        """Allocate the zero-filled per-epoch metrics table."""
        # Builtin float replaces the np.float alias, which no longer exists
        # in current NumPy releases.
        metrics = np.zeros((self.n_epochs, 3), dtype=float)
        self.metrics_ = pd.DataFrame(metrics, columns=['Loss', 'RMSE', 'MAE'])

    def _run_sgd(self, X, X_test):
        """Run ``n_epochs`` of SGD, recording test metrics after each one."""
        # Training indices are exactly the mapping values, so the mapping
        # sizes give the parameter dimensions.
        n_users = len(self.user_mapping_)
        n_items = len(self.item_mapping_)

        bu, bi, pu, qi = _initialization(n_users, n_items, self.n_factors)

        # Run SGD
        pbar = tqdm(range(self.n_epochs), desc='Epoch',
                             ncols=110)
        for epoch_ix in pbar:
            bu, bi, pu, qi = _run_epoch(X, bu, bi, pu, qi, self.global_mean_,
                                        self.n_factors, self.lr, self.reg)

            self.metrics_.loc[epoch_ix, :] = _compute_val_metrics(
                                                 X_test, bu, bi, pu, qi,
                                                 self.global_mean_,
                                                 self.n_factors)
            pbar.set_postfix({
                f'test_{m}': f"{self.metrics_.loc[epoch_ix, m]:.3f}"
                for m in self.metrics_.columns
            })

        self.bu_, self.bi_, self.pu_, self.qi_ = bu, bi, pu, qi
        return {f"test_{m}": np.trim_zeros(self.metrics_[m]) for m in self.metrics_.columns}

    def predict(self, X, clip=True):
        """Predict a rating for every (UserID, MovieID) row of ``X``.

        ``clip`` is forwarded to ``predict_pair``. Returns a list with one
        prediction per row, in row order.
        """
        return [
            self.predict_pair(u_id, i_id, clip=clip)
            for u_id, i_id in zip(X['UserID'], X['MovieID'])
        ]

    def predict_pair(self, u_id, i_id, clip=True):
        """Predict the rating of raw user ``u_id`` for raw movie ``i_id``.

        With ``clip=True`` (the default) the result is limited to
        ``[min_rating, max_rating]``.

        Raises ``AssertionError`` when either id was not seen in training.
        """
        # Raised explicitly (same type and message as the former assert
        # statement) so the check is still enforced under `python -O`.
        if u_id not in self.user_mapping_ or i_id not in self.item_mapping_:
            raise AssertionError(f"user {u_id} or movie {i_id} not in train data")

        i_ix = self.item_mapping_[i_id]
        u_ix = self.user_mapping_[u_id]
        pred = self.global_mean_ + self.bi_[i_ix] + self.bu_[u_ix] + np.dot(self.pu_[u_ix], self.qi_[i_ix])
        if clip:
            pred = np.clip(pred, self.min_rating, self.max_rating)
        return pred


## Experimental Setup

In [16]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The square root is taken here rather than via scikit-learn's
    ``squared=False`` keyword, which newer scikit-learn releases no longer
    accept; this form works regardless of the installed version.
    """
    return np.sqrt(mse(y_true, y_pred))

def funk(train, test, **kw):
    """Train an ``SVD`` model and report its test RMSE.

    Re-seeds the random generators, builds ``SVD(**kw)``, fits it on
    ``train`` while tracking metrics on ``test``, prints the RMSE of the
    model's predictions on ``test`` followed by the hyperparameters used.

    Returns
    -------
    tuple
        ``(hist, svd)``: the value returned by ``SVD.fit`` and the fitted
        model.
    """
    seed_everything()
    model = SVD(**kw)
    history = model.fit(X=train, X_test=test)

    targets = test['Rating'].values.ravel()
    predictions = np.array(model.predict(test)).ravel()
    score = rmse(targets, predictions)

    print(f"Test RMSE: {score:.4f}\n\n")
    print_hparams(kw)
    return history, model

def print_hparams(h):
    """Print the hyperparameters in ``h`` as a copy-pasteable ``funk`` call.

    ``h`` maps parameter names to values; entries are rendered as
    ``name=value`` in iteration order.
    """
    rendered = ", ".join(f"{name}={value}" for name, value in h.items())
    print("Hyperparameters:\n")
    print(f"hist = funk(train_df, test_df,\n\t{rendered})")
    
def plot_experiment(hist):
    """Draw the per-epoch metric curves in ``hist`` as a Plotly line chart.

    ``hist`` is turned into a DataFrame (one column per metric) whose
    index is labelled ``Epoch``; the figure is shown with the
    ``"notebook"`` renderer.
    """
    history_df = pd.DataFrame(hist).rename_axis('Epoch')
    fig = px.line(history_df)
    fig = fig.update_layout(
        hoverlabel={'font_size': 12, 'font_family': "Rockwell"},
        font={'family': "Courier New, monospace", 'size': 18},
    )
    fig = fig.update_xaxes(showspikes=True)
    fig.show("notebook")

## Run Experiment

In [17]:
# Train a 15-factor model for 3 epochs; keep the per-epoch test metrics
# (`hist`) and the fitted model (`svd`) for the cells below.
hist,svd = funk(train=train_df, test=test_df,
		lr=0.01,
		reg=0.005,
		n_epochs=3,
		n_factors=15)

Epoch: 100%|██████████████████| 3/3 [00:02<00:00,  1.22it/s, test_Loss=0.825, test_RMSE=0.908, test_MAE=0.717]


Test RMSE: 0.9083


Hyperparameters:

hist = funk(train_df, test_df,
	lr=0.01, reg=0.005, n_epochs=3, n_factors=15)


In [None]:
# Plot the per-epoch test metrics returned by the run above.
plot_experiment(hist)

In [None]:
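# Recorded result of a 10-epoch run. NOTE: the hyperparameter line below lists
# `early_stopping`, which the SVD class in this notebook does not accept --
# drop it when re-running.
# Test RMSE: 0.8643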


# Hyperparameters:

# hist = funk(train_df, test_df,
# 	early_stopping=False, min_delta=1e-05, lr=0.01, reg=0.005, n_epochs=10, n_factors=15)