In [1]:
import sys
# Install/upgrade the Python packages with the interpreter that runs this
# kernel (sys.executable), so they land in the kernel's own environment.
!{sys.executable} -m pip install -q -q -U tqdm scikit-learn pandas sparsesvd plotly

# install plotting dependencies
# The orca AppImage is saved as /usr/local/bin/orca and made executable, and
# system libraries are installed through apt-get.
# NOTE(review): presumably required for plotly's static image export used by
# `fig.show("svg")` further down — confirm.
!wget -q https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get -qq install xvfb libgtk2.0-0 libgconf-2-4

[K     |████████████████████████████████| 81kB 8.6MB/s 
[K     |████████████████████████████████| 22.2MB 1.2MB/s 
[K     |████████████████████████████████| 13.2MB 254kB/s 
[?25h  Building wheel for sparsesvd (setup.py) ... [?25l[?25hdone
Selecting previously unselected package libdbus-glib-1-2:amd64.
(Reading database ... 146442 files and directories currently installed.)
Preparing to unpack .../00-libdbus-glib-1-2_0.110-2_amd64.deb ...
Unpacking libdbus-glib-1-2:amd64 (0.110-2) ...
Selecting previously unselected package gconf2-common.
Preparing to unpack .../01-gconf2-common_3.2.6-4ubuntu1_all.deb ...
Unpacking gconf2-common (3.2.6-4ubuntu1) ...
Selecting previously unselected package libgconf-2-4:amd64.
Preparing to unpack .../02-libgconf-2-4_3.2.6-4ubuntu1_amd64.deb ...
Unpacking libgconf-2-4:amd64 (3.2.6-4ubuntu1) ...
Selecting previously unselected package gconf-service-backend.
Preparing to unpack .../03-gconf-service-backend_3.2.6-4ubuntu1_amd64.deb ...
Unpacking gconf-se

In [2]:
import datetime, os, random, shutil, urllib.request, zipfile, time, warnings
warnings.filterwarnings('ignore')
from functools import wraps
from math import trunc
import numpy as np
import pandas as pd
from pathlib import Path
from urllib.request import urlopen
from zipfile import ZipFile
from scipy.sparse.linalg import norm
import scipy.sparse as ss
from scipy.sparse.linalg import svds
from sparsesvd import sparsesvd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error as mse
import plotly.express as px


# Default seed shared by every experiment in this notebook.
SEED = 123


def seed_everything(seed=SEED):
    """Reset all random number sources used here to a known state.

    Seeds Python's ``random`` module and NumPy's global generator, and exports
    the seed as ``PYTHONHASHSEED``.

    Args:
        seed: Seed value to apply (defaults to ``SEED``).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once at import time so the cells below start from a fixed state.
seed_everything()

# Preprocessing

In [3]:
def get_dataset():
    """Download (if necessary) and load the MovieLens-1M ratings.

    The archive is cached as ``m1.zip`` in the working directory and extracted
    next to it; the download is skipped when ``m1.zip`` exists and the
    extraction is skipped when the ``ml-1m`` directory exists.

    Returns:
        pandas.DataFrame: one row per rating with the integer columns
        ``UserID``, ``MovieID``, ``Rating`` and ``Timestamp``.
    """
    archive = Path("m1.zip")
    if not archive.exists():
        print("Downloading dataset...")
        # Stream into a side file and rename only after the transfer finished:
        # an interrupted download must not leave a truncated "m1.zip" behind,
        # because later runs would treat it as a complete archive.
        partial = Path("m1.zip.part")
        try:
            # Fetched over HTTPS so the archive cannot be tampered with in transit.
            with urlopen("https://files.grouplens.org/datasets/movielens/ml-1m.zip") as response, \
                    partial.open("wb") as f:
                shutil.copyfileobj(response, f)
            partial.replace(archive)
        finally:
            if partial.exists():
                partial.unlink()
    if not Path("ml-1m").is_dir():
        print("unzipping...")
        with ZipFile("m1.zip") as zf:
            zf.extractall()
    # read_csv opens and closes the file itself and parses every field as an
    # integer; "::" is a multi-character separator, which needs the python engine.
    ratings_df = pd.read_csv(
        'ml-1m/ratings.dat',
        sep='::',
        engine='python',
        header=None,
        names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    )
    return ratings_df

In [4]:
def split(df, n_test=2500):
    """Hold out a random sample of ratings as a test set.

    Args:
        df: Long-format ratings with ``UserID``, ``MovieID`` and ``Rating``
            columns (a ``Timestamp`` column, if present, is dropped). Each
            (UserID, MovieID) pair must be unique. The frame is not modified.
        n_test: Number of rated cells to hold out (default 2500).

    Returns:
        tuple ``(train_df, test_df, M_train_df, M_test_df)``:
        the long-format train/test frames (int32 ids, float64 ratings, original
        row labels kept) and the user x movie matrices in which the other
        partition's cells are set to 0.
    """
    # Pivot the frame that was passed in; relying on a global `ratings_df`
    # here would silently ignore the argument.
    R_df = df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)
    ratings_matrix = R_df.values

    # Re-seed so the same cells are sampled on every call.
    seed_everything()
    rated_cells = list(np.argwhere(ratings_matrix > 0))
    # The explicit dtype/reshape keep the (k, 2) integer layout even for k == 0.
    test_indices = np.array(random.sample(rated_cells, n_test), dtype=np.intp).reshape(-1, 2)

    x_indices = test_indices[:, 0]
    y_indices = test_indices[:, 1]
    u_ids = R_df.index[x_indices].astype(np.int32)
    i_ids = R_df.columns[y_indices].astype(np.int32)

    # Work on a converted copy so the caller's frame keeps its columns/dtypes.
    long_df = (
        df.drop(columns=['Timestamp'], errors='ignore')
          .astype({'UserID': np.int32, 'MovieID': np.int32, 'Rating': np.float64})
    )

    # One pass builds (user, movie) -> row position; the pivot above already
    # guarantees the pairs are unique. This replaces one full-frame boolean
    # scan per sampled pair.
    row_of_pair = {
        pair: position
        for position, pair in enumerate(zip(long_df['UserID'].tolist(),
                                            long_df['MovieID'].tolist()))
    }
    test_rows = [row_of_pair[pair] for pair in zip(u_ids.tolist(), i_ids.tolist())]
    test_df = long_df.iloc[test_rows]
    train_df = long_df.drop(test_df.index.tolist())

    # Matrix holding only the held-out ratings ...
    matrix_only_with_test = np.zeros(R_df.shape, dtype=np.float64)
    matrix_only_with_test[x_indices, y_indices] = ratings_matrix[x_indices, y_indices]
    M_test_df = pd.DataFrame(matrix_only_with_test, index=R_df.index, columns=R_df.columns)

    # ... and its complement with the held-out cells blanked out.
    matrix_without_test = ratings_matrix.copy()
    matrix_without_test[x_indices, y_indices] = 0.0
    M_train_df = pd.DataFrame(matrix_without_test, index=R_df.index, columns=R_df.columns)

    return train_df, test_df, M_train_df, M_test_df
    

In [None]:
# Load the ratings, then hold out a test sample. `split` returns the
# long-format train/test frames and the matching user x movie matrices.
ratings_df = get_dataset()
train_df, test_df, M_train_df, M_test_df = split(ratings_df)

Downloading dataset...
unzipping...


In [None]:
# Preview the held-out ratings (long format).
test_df.head()

In [None]:
# Preview the training rating matrix (held-out cells are set to 0 by `split`).
M_train_df.head()



# Funk-SVD

In [None]:
class FunkSVD:
    """Biased matrix factorisation trained with per-rating SGD.

    A rating is modelled as
    ``global_mean + bu[user] + bi[item] + dot(pu[user], qi[item])``.

    Args:
        lr: SGD learning rate.
        reg: L2 regularisation strength applied to biases and factors.
        n_epochs: Number of passes over the training ratings.
        n_factors: Number of latent factors per user/item.
        min_delta: Stored on the instance but not used by training, which
            always runs all ``n_epochs``.
        min_rating: Lower clipping bound for predictions.
        max_rating: Upper clipping bound for predictions.
    """

    def __init__(self, lr=.005, reg=.02, n_epochs=20, n_factors=100,
                 min_delta=.001,
                 min_rating=1, max_rating=5):

        self.lr = lr
        self.reg = reg
        self.n_epochs = n_epochs
        self.n_factors = n_factors
        # Kept so the constructor argument is not silently discarded.
        self.min_delta = min_delta
        self.min_rating = min_rating
        self.max_rating = max_rating

    def fit(self, X, X_test):
        """Train on `X` and track test metrics on `X_test` after every epoch.

        Args:
            X: Training frame with ``UserID``, ``MovieID``, ``Rating`` columns.
            X_test: Frame with the same columns; every user and movie in it
                must also occur in `X`.

        Returns:
            dict mapping ``test_Loss`` / ``test_RMSE`` / ``test_MAE`` to the
            per-epoch metric series (leading/trailing zeros trimmed).

        Raises:
            ValueError: if `X_test` references a user or movie absent from `X`.
        """
        X = self._preprocess_data(X)
        X_test = self._preprocess_data(X_test, train=False)
        self._init_metrics()
        self.global_mean_ = np.mean(X[:, 2])
        return self._run_sgd(X, X_test)

    def _preprocess_data(self, X, train=True):
        """Translate raw ids to contiguous indices and return an ndarray.

        With ``train=True`` the id -> index mappings are (re)built from `X`;
        otherwise the mappings from the last training call are reused.

        Returns:
            2-D array with columns (user index, item index, rating).

        Raises:
            ValueError: if an id in `X` is missing from the mappings.
        """
        if train:  # Mappings have to be created
            # assumed that train data includes all possible users and movies
            # (not necessarily ratings)
            user_ids = X['UserID'].unique().tolist()
            item_ids = X['MovieID'].unique().tolist()
            self.user_mapping_ = {u_id: ix for ix, u_id in enumerate(user_ids)}
            self.item_mapping_ = {i_id: ix for ix, i_id in enumerate(item_ids)}

        users = X['UserID'].map(self.user_mapping_)
        items = X['MovieID'].map(self.item_mapping_)

        # Unmapped ids become NaN; report them explicitly instead of failing
        # later with an opaque NaN-to-integer cast error.
        unseen = users.isna() | items.isna()
        if unseen.any():
            raise ValueError(
                f"{int(unseen.sum())} rating(s) reference a user or movie not in train data"
            )

        encoded = X[['UserID', 'MovieID', 'Rating']].assign(
            UserID=users.astype(np.int32),
            MovieID=items.astype(np.int32),
        )
        return encoded.values

    def _init_metrics(self):
        """Allocate the per-epoch metrics table (one row per epoch, all zeros)."""
        # `np.float` was removed in NumPy 1.24; use the explicit 64-bit type.
        metrics = np.zeros((self.n_epochs, 3), dtype=np.float64)
        self.metrics_ = pd.DataFrame(metrics, columns=['Loss', 'RMSE', 'MAE'])

    def _run_sgd(self, X, X_test):
        """Run SGD over the encoded ratings and record test metrics per epoch.

        Args:
            X: Encoded training array from `_preprocess_data`.
            X_test: Encoded test array from `_preprocess_data`.

        Returns:
            dict of trimmed per-epoch metric series, see `fit`.
        """
        reg, lr, global_mean, n_factors = self.reg, self.lr, self.global_mean_, self.n_factors

        n_users = len(np.unique(X[:, 0]))
        n_items = len(np.unique(X[:, 1]))
        bu = np.zeros(n_users)
        bi = np.zeros(n_items)
        pu = np.random.normal(0, .1, (n_users, n_factors))
        qi = np.random.normal(0, .1, (n_items, n_factors))

        # Split the columns once instead of re-casting them for every rating.
        train_users = X[:, 0].astype(int)
        train_items = X[:, 1].astype(int)
        train_ratings = X[:, 2]

        test_users = X_test[:, 0].astype(int)
        test_items = X_test[:, 1].astype(int)
        test_true = X_test[:, 2]

        # Run SGD
        pbar = tqdm(range(self.n_epochs), desc='Epoch',
                    ncols=110)

        for epoch_ix in pbar:

            # One update per rating (batch size 1), in the order given.
            for user, item, rating in zip(train_users, train_items, train_ratings):
                pred = np.dot(pu[user], qi[item]) + global_mean + bu[user] + bi[item]
                err = rating - pred

                # Update biases
                bu[user] += lr * (err - reg * bu[user])
                bi[item] += lr * (err - reg * bi[item])

                # Update latent factors; both steps are computed from the
                # pre-update vectors before either one is applied.
                pu_update = lr * (err * qi[item] - reg * pu[user])
                qi_update = lr * (err * pu[user] - reg * qi[item])
                pu[user] += pu_update
                qi[item] += qi_update

            # Test error: score only the held-out pairs rather than building
            # the full users x items prediction matrix each epoch.
            test_pred = (global_mean
                         + np.einsum('ij,ij->i', pu[test_users], qi[test_items])
                         + bu[test_users] + bi[test_items])
            residual = test_pred - test_true
            test_mse = np.square(residual).mean()
            test_rmse = np.sqrt(test_mse)
            test_mae = np.absolute(residual).mean()

            # save results in df
            self.metrics_.loc[epoch_ix, :] = (test_mse, test_rmse, test_mae)
            pbar.set_postfix({
                f'test_{m}': f"{self.metrics_.loc[epoch_ix, m]:.3f}"
                for m in self.metrics_.columns
            })

        self.bu_, self.bi_, self.pu_, self.qi_ = bu, bi, pu, qi
        return {f"test_{m}": np.trim_zeros(self.metrics_[m]) for m in self.metrics_.columns}

    def predict(self, X):
        """Predict a clipped rating for every (UserID, MovieID) row of `X`."""
        return [
            self.predict_pair(u_id, i_id)
            for u_id, i_id in zip(X['UserID'], X['MovieID'])
        ]

    def predict_pair(self, u_id, i_id):
        """Predict the rating of raw user id `u_id` for raw movie id `i_id`.

        The result is clipped to ``[min_rating, max_rating]``. Both ids must
        have been present in the training data (checked with an assertion).
        """
        assert u_id in self.user_mapping_ and i_id in self.item_mapping_, f"user {u_id} or movie {i_id} not in train data"

        i_ix = self.item_mapping_[i_id]
        u_ix = self.user_mapping_[u_id]
        pred = self.global_mean_ + self.bi_[i_ix] + self.bu_[u_ix] + np.dot(self.pu_[u_ix], self.qi_[i_ix])
        pred = np.clip(pred, self.min_rating, self.max_rating)
        return pred


In [None]:
class SVT:
    """Matrix completion by iterative singular value thresholding.

    Works on a dense user x movie rating matrix in which 0 marks an
    unobserved cell.

    Args:
        tau: Threshold subtracted from the singular values.
        delta: Step size of the update applied to the auxiliary matrix Y.
        n_epochs: Maximum number of iterations.
        min_rating: Lower clipping bound for predictions.
        max_rating: Upper clipping bound for predictions.
    """

    def __init__(self, tau, delta, n_epochs, min_rating=1, max_rating=5):
        self.tau = tau
        self.delta = delta
        self.n_epochs = n_epochs
        self.min_rating = min_rating
        self.max_rating = max_rating
        # Relative error on the observed cells below which `fit` stops early.
        self.tol = 0.001
        # Number of additional singular values requested per retry.
        self.increment = 5

    def _init_metrics(self):
        """Allocate the per-epoch metrics table (one row per epoch, all zeros)."""
        # `np.float` was removed in NumPy 1.24; use the explicit 64-bit type.
        metrics = np.zeros((self.n_epochs, 3), dtype=np.float64)
        self.metrics_ = pd.DataFrame(metrics, columns=['Loss', 'RMSE', 'MAE'])

    def fit(self, M, M_test):
        """Complete `M` and track the error on the cells given by `M_test`.

        Args:
            M: DataFrame (users x movies) of training ratings, 0 = unobserved.
            M_test: DataFrame of the same shape holding only held-out ratings.

        Returns:
            dict mapping ``test_Loss`` / ``test_RMSE`` / ``test_MAE`` to the
            per-epoch metric series (leading/trailing zeros trimmed). The
            completed matrix is stored in ``self.x``.
        """
        ### preprocessing
        self._init_metrics()

        self.user_mapping_ = {u_id: ix for ix, u_id in enumerate(M.index)}
        self.item_mapping_ = {i_id: ix for ix, i_id in enumerate(M.columns)}

        # Neither input is written to below, so no defensive copies are needed.
        M = M.values
        M_test = M_test.values

        # NOTE(review): the mean runs over every column, unobserved zeros
        # included — confirm that this diluted per-user mean is intended.
        self.users_mean = np.mean(M, axis=1).reshape(-1, 1)

        # Observed cells are the non-zero ones. Testing the values directly
        # (rather than after an int32 cast) keeps fractional ratings such as 0.5.
        Omega = M.nonzero()
        test_indices = M_test.nonzero()

        M = M - self.users_mean
        M_test = M_test - self.users_mean
        test_true = M_test[test_indices].ravel()

        tol = self.tol
        incre = self.increment
        tau = self.tau
        delta = self.delta
        max_rank = min(M.shape)

        # SVT
        P_Omega_M = ss.csr_matrix((np.ravel(M[Omega]), Omega), shape=M.shape)
        normProjM = norm(P_Omega_M)
        k0 = np.ceil(tau / (delta * normProjM))
        Y = P_Omega_M * (k0 * delta)

        r = 0
        # Defined up front so `self.x` exists even when n_epochs == 0.
        x = np.zeros(M.shape)

        pbar = tqdm(range(self.n_epochs))
        for epoch_ix in pbar:
            sparse_Y = ss.csc_matrix(Y)

            # Request r+1 singular values, then keep asking for more until the
            # smallest one returned is <= tau, the SVD runs out of values, or
            # the full rank is reached. (The previous `s >= min(shape)` test
            # was inverted, so this loop never executed.)
            s = min(r + 1, max_rank)
            u1, s1, v1 = sparsesvd(sparse_Y, s)
            while len(s1) == s and s < max_rank and np.min(s1) > tau:
                s = min(s + incre, max_rank)
                u1, s1, v1 = sparsesvd(sparse_Y, s)

            # Rebuild x from the singular triplets above the threshold,
            # with every kept singular value shrunk by tau.
            r = int(np.sum(s1 > tau))
            U = u1.T[:, :r]
            V = v1[:r, :]
            S = s1[:r] - tau
            x = (U * S).dot(V)

            x_omega = ss.csr_matrix((x[Omega], Omega), shape=M.shape)

            # Stop once the observed cells are reproduced closely enough.
            reconstruction_loss = norm(x_omega - P_Omega_M) / normProjM
            if reconstruction_loss < tol:
                break

            # Move Y along the residual on the observed cells.
            Y = Y + delta * (P_Omega_M - x_omega)

            # compute test error
            residual = x[test_indices].ravel() - test_true
            test_mse = np.square(residual).mean()
            test_rmse = np.sqrt(test_mse)
            test_mae = np.absolute(residual).mean()

            # save results in df
            self.metrics_.loc[epoch_ix, :] = (test_mse, test_rmse, test_mae)
            pbar.set_postfix({
                f'test_{m}': f"{self.metrics_.loc[epoch_ix, m]:.3f}"
                for m in self.metrics_.columns
            })

        # Undo the per-user centring.
        self.x = x + self.users_mean
        return {f"test_{m}": np.trim_zeros(self.metrics_[m]) for m in self.metrics_.columns}

    def predict(self, X):
        """Predict a clipped rating for every (UserID, MovieID) row of `X`."""
        return [self.predict_pair(u_id, i_id) for u_id, i_id in zip(X['UserID'], X['MovieID'])]

    def predict_pair(self, u_id, i_id):
        """Look up the completed rating for raw ids `u_id` / `i_id`.

        The result is clipped to ``[min_rating, max_rating]``. Both ids must
        be labels of the matrix passed to `fit` (checked with an assertion).
        """
        assert u_id in self.user_mapping_ and i_id in self.item_mapping_, f"user {u_id} or movie {i_id} not in train data"

        i_ix = self.item_mapping_[i_id]
        u_ix = self.user_mapping_[u_id]

        pred = self.x[u_ix, i_ix]
        pred = np.clip(pred, self.min_rating, self.max_rating)
        return pred

## Experimental Setup

In [None]:
def rmse(y_true, y_pred):
    """Root mean squared error between true and predicted ratings.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same length as `y_true`.

    Returns:
        The square root of the mean squared error.
    """
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the root here works on every version.
    return np.sqrt(mse(y_true, y_pred))

def to_sparse(df):
    """Convert a rating matrix into long (UserID, MovieID, Rating) records.

    Args:
        df: DataFrame whose index holds user ids, whose columns hold movie
            ids and whose non-zero cells are ratings.

    Returns:
        pandas.DataFrame with one row per non-zero cell, in row-major order.
    """
    values = df.values
    # Test the values themselves: casting to int32 first would drop any
    # fractional rating below 1 (e.g. 0.5).
    rows, cols = values.nonzero()
    # Fancy indexing gathers all labels and ratings at once instead of one
    # `.iloc` lookup per cell.
    return pd.DataFrame({
        'UserID': df.index.to_numpy()[rows],
        'MovieID': df.columns.to_numpy()[cols],
        'Rating': values[rows, cols],
    })

def run_funk(train, test, **kw):
    """Train a FunkSVD model and report its RMSE on the test ratings.

    Args:
        train: Long-format training frame passed to ``FunkSVD.fit`` as ``X``.
        test: Long-format test frame passed as ``X_test`` and used for scoring.
        **kw: Hyperparameters forwarded to the ``FunkSVD`` constructor.

    Returns:
        tuple ``(hist, model)``: the metric history returned by ``fit`` and
        the fitted model.
    """
    seed_everything()
    model = FunkSVD(**kw)
    history = model.fit(X=train, X_test=test)

    targets = test['Rating'].values.ravel()
    predictions = np.array(model.predict(test)).ravel()
    score = rmse(targets, predictions)

    print(f"\n\nTest RMSE: {score:.3f}\n\n")
    print_hparams(kw, 'funk')
    return history, model
    

def run_svt(train, test, **kw):
    """Train an SVT model and report its RMSE on the held-out cells.

    Args:
        train: User x movie training matrix passed to ``SVT.fit`` as ``M``.
        test: User x movie matrix of held-out ratings passed as ``M_test``.
        **kw: Hyperparameters forwarded to the ``SVT`` constructor.

    Returns:
        tuple ``(hist, model)``: the metric history returned by ``fit`` and
        the fitted model.
    """
    seed_everything()
    svt_model = SVT(**kw)
    history = svt_model.fit(M=train, M_test=test)

    # Scoring goes through `predict`, which expects long-format records.
    held_out = to_sparse(test)
    targets = held_out['Rating'].values.ravel()
    predictions = np.array(svt_model.predict(held_out)).ravel()
    score = rmse(targets, predictions)

    print(f"\n\nTest RMSE: {score:.3f}\n\n")
    print_hparams(kw, 'SVT')
    return history, svt_model

def print_hparams(h, name):
    """Print the hyperparameters as a call snippet.

    Args:
        h: Mapping of hyperparameter names to values.
        name: Name placed in front of the argument list.
    """
    rendered = ", ".join(f"{key}={value}" for key, value in h.items())
    print("Hyperparameters:\n")
    print(f"hist = {name}(train_df, test_df,\n\t{rendered})")
    
def plot_experiment(hist):
    """Draw the per-epoch metrics as a line chart.

    Args:
        hist: Mapping of metric name to per-epoch values, as returned by the
            ``run_*`` helpers.

    The figure is shown twice: once with the default renderer and once with
    the "svg" renderer.
    """
    history_df = pd.DataFrame(hist).rename_axis('Epoch')
    fig = px.line(history_df)
    fig.update_layout(
        hoverlabel=dict(font_size=12, font_family="Rockwell"),
        font=dict(family="Courier New, monospace", size=18),
    )
    fig.update_xaxes(showspikes=True)
    for renderer_args in ((), ("svg",)):
        fig.show(*renderer_args)

## Run Experiment

In [None]:
# Fit SVT on the training matrix and score it on the held-out matrix.
hist, model = run_svt(train=M_train_df, test=M_test_df,
		tau=20000,
		delta=2,
		n_epochs=250)

In [None]:
# Plot the per-epoch test metrics of the SVT run.
plot_experiment(hist)

In [None]:
# Fit Funk-SVD on the long-format training ratings and score it on the test
# ratings; this rebinds `hist` and `model` from the SVT run.
hist, model = run_funk(train=train_df, test=test_df,
		lr=0.01,
		reg=0.005,
		n_epochs=28,
		n_factors=10)

In [None]:
# Plot the per-epoch test metrics of the Funk-SVD run.
plot_experiment(hist)