In [2]:
# import sys
# !{sys.executable} -m pip install -U matplotlib
!pip install numba

Collecting numba
  Downloading numba-0.52.0-cp37-cp37m-win_amd64.whl (2.3 MB)
Collecting llvmlite<0.36,>=0.35.0
  Downloading llvmlite-0.35.0-cp37-cp37m-win_amd64.whl (16.0 MB)
Installing collected packages: llvmlite, numba
Successfully installed llvmlite-0.35.0 numba-0.52.0


In [3]:
import datetime, os, random, shutil, urllib.request, zipfile, time, warnings
warnings.filterwarnings('ignore')
from functools import wraps
from math import trunc
import numpy as np
import pandas as pd
from pathlib import Path
from urllib.request import urlopen
from zipfile import ZipFile
from scipy.sparse.linalg import norm
import scipy.sparse as ss
from scipy.sparse.linalg import svds
from tqdm import tqdm
from numba import njit
from sklearn.metrics import mean_squared_error as mse

SEED = 123
_numba_seeder = None  # jitted seeding helper, compiled lazily on first use


def seed_everything(seed=SEED):
    """Seed every random number generator this notebook draws from.

    Seeds Python's ``random`` module, sets ``PYTHONHASHSEED`` and seeds
    NumPy's legacy global generator. It also seeds Numba's generator, which
    keeps a state separate from NumPy's and can only be seeded from inside
    compiled code; without that step the ``np.random`` calls made in
    ``@njit`` functions are not reproducible.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators (defaults to ``SEED``).
    """
    global _numba_seeder
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    if _numba_seeder is None:
        try:
            from numba import njit as _njit
        except ImportError:
            # Without Numba there is no compiled-code generator to seed.
            return

        def _seed_inside_jit(value):
            np.random.seed(value)

        _numba_seeder = _njit(_seed_inside_jit)
    _numba_seeder(seed)


seed_everything()

# Preprocessing

In [4]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword is deprecated and has been removed
    from recent scikit-learn releases. For the 1-D rating vectors used in this
    notebook the result is identical.
    """
    return np.sqrt(mse(y_true, y_pred))

In [5]:
def get_dataset():
    """Download (if needed), unpack (if needed) and load the MovieLens-1M ratings.

    The archive is stored as ``m1.zip`` in the working directory and extracted
    next to it; both steps are skipped when their result already exists.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``UserID``, ``MovieID``, ``Rating`` and ``Timestamp``,
        one row per non-blank line of ``ml-1m/ratings.dat``.
    """
    archive = Path("m1.zip")
    if not archive.exists():
        print("Downloading dataset...")
        # Fetch the full payload before creating the file: a failed download
        # must not leave an empty archive that later runs mistake for a
        # finished one.
        with urlopen("http://files.grouplens.org/datasets/movielens/ml-1m.zip") as response:
            payload = response.read()
        archive.write_bytes(payload)
    if not Path("ml-1m").is_dir():
        print("unzipping...")
        with ZipFile(archive) as zf:
            zf.extractall()

    columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
    # Context manager closes the file handle; blank lines are skipped so a
    # trailing newline cannot produce a ragged row.
    with open('ml-1m/ratings.dat', 'r') as fh:
        ratings_list = [line.strip().split("::") for line in fh if line.strip()]
    ratings_df = pd.DataFrame(ratings_list, columns=columns).astype(int)
    return ratings_df


Downloading dataset...


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Pivot `ratings_df` into a user × movie rating matrix (one row per user, one column per movie).

# Funk-SVD

# Numba Methods

In [21]:
@njit
def _shuffle(X):
    """Shuffle ``X`` in place with ``np.random.shuffle`` and return the same array.

    Runs in compiled code, so the permutation is drawn from Numba's random
    generator rather than from NumPy's Python-level global state.
    """
    np.random.shuffle(X)
    return X

@njit
def _initialization(n_users, n_items, n_factors):
    """Create the starting parameters of the factorisation model.

    Returns
    -------
    tuple of ndarray
        ``(bu, bi, pu, qi)``: zero user biases of length ``n_users``, zero
        item biases of length ``n_items``, and user / item latent factor
        matrices of shape ``(n_users, n_factors)`` / ``(n_items, n_factors)``
        drawn from a normal distribution with mean 0 and std 0.1.
    """
    user_bias = np.zeros(n_users)
    item_bias = np.zeros(n_items)
    # User factors are drawn before item factors; the order determines which
    # random numbers each matrix receives.
    user_factors = np.random.normal(0, .1, (n_users, n_factors))
    item_factors = np.random.normal(0, .1, (n_items, n_factors))
    return user_bias, item_bias, user_factors, item_factors

@njit
def _run_epoch(X, bu, bi, pu, qi, global_mean, n_factors, lr, reg):
    """Run one SGD pass over the rows of ``X``, updating the parameters in place.

    Each row of ``X`` is read as ``(user index, item index, rating)``. For
    every row the current prediction error is computed first, then the two
    biases are updated, then each pair of latent factors is updated using the
    factor values from before the update.

    Returns the (mutated) ``bu, bi, pu, qi`` arrays.
    """
    n_samples = X.shape[0]
    for row in range(n_samples):
        u = int(X[row, 0])
        m = int(X[row, 1])
        target = X[row, 2]

        # Current estimate: global mean + biases + factor dot product
        estimate = global_mean + bu[u] + bi[m]
        for k in range(n_factors):
            estimate += pu[u, k] * qi[m, k]
        err = target - estimate

        # Regularised gradient step on the biases
        bu[u] += lr * (err - reg * bu[u])
        bi[m] += lr * (err - reg * bi[m])

        # Regularised gradient step on the factors; both updates use the
        # pre-update values of the paired factor.
        for k in range(n_factors):
            p_old = pu[u, k]
            q_old = qi[m, k]
            pu[u, k] += lr * (err * q_old - reg * p_old)
            qi[m, k] += lr * (err * p_old - reg * q_old)
    return bu, bi, pu, qi

@njit
def _compute_val_metrics(X_val, bu, bi, pu, qi, global_mean, n_factors):
    """Score the current parameters on ``X_val``.

    Rows are read as ``(user index, item index, rating)``. An index of -1 (or
    lower) marks a user/item without learned parameters: its bias is skipped,
    and the factor term is only added when both indices are known.

    Returns
    -------
    tuple
        ``(loss, rmse, mae)`` where ``loss`` is the mean squared residual.
    """
    n_samples = X_val.shape[0]
    errors = np.empty(n_samples)
    for row in range(n_samples):
        u = int(X_val[row, 0])
        m = int(X_val[row, 1])
        known_user = u > -1
        known_item = m > -1

        estimate = global_mean
        if known_user:
            estimate += bu[u]
        if known_item:
            estimate += bi[m]
        if known_user and known_item:
            for k in range(n_factors):
                estimate += pu[u, k] * qi[m, k]
        errors[row] = X_val[row, 2] - estimate

    mean_sq = np.square(errors).mean()
    root_mean_sq = np.sqrt(mean_sq)
    mean_abs = np.absolute(errors).mean()
    return mean_sq, root_mean_sq, mean_abs

# Funk SVD Class

In [33]:
class SVD:
    """Funk-SVD style rating predictor (biases + latent factors) trained by SGD.

    Parameters
    ----------
    lr : float
        SGD learning rate.
    reg : float
        L2 regularisation strength applied to biases and factors.
    n_epochs : int
        Maximum number of passes over the training data.
    n_factors : int
        Number of latent factors per user / item.
    early_stopping : bool
        Stop when the validation RMSE improves by less than ``min_delta``
        between two consecutive epochs (requires ``X_val`` in ``fit``).
    shuffle : bool
        Shuffle the training rows before every epoch.
    min_delta : float
        Minimal RMSE improvement that keeps training going.
    min_rating, max_rating : number
        Bounds used when clipping predictions.

    Attributes set by ``fit``
    -------------------------
    user_mapping_, item_mapping_ : dict
        Raw id -> contiguous index, built from the training data.
    global_mean_ : float
        Mean training rating.
    bu_, bi_, pu_, qi_ : ndarray
        Learned user/item biases and user/item factor matrices.
    metrics_ : DataFrame
        Per-epoch validation ``Loss``, ``RMSE`` and ``MAE`` (only when a
        validation set was supplied).
    """

    def __init__(self, lr=.005, reg=.02, n_epochs=20, n_factors=100,
                 early_stopping=False, shuffle=False, min_delta=.001,
                 min_rating=1, max_rating=5):

        self.lr = lr
        self.reg = reg
        self.n_epochs = n_epochs
        self.n_factors = n_factors
        self.early_stopping = early_stopping
        self.shuffle = shuffle
        self.min_delta = min_delta
        self.min_rating = min_rating
        self.max_rating = max_rating

    def fit(self, X, X_val=None):
        """Learn the model from ``X`` and optionally track metrics on ``X_val``.

        Both frames need ``UserID``, ``MovieID`` and ``Rating`` columns.
        Returns ``self``.
        """
        X = self._preprocess_data(X)

        if X_val is not None:
            X_val = self._preprocess_data(X_val, train=False, verbose=False)
            self._init_metrics()
        self.global_mean_ = np.mean(X[:, 2])
        self._run_sgd(X, X_val)
        return self

    def _preprocess_data(self, X, train=True, verbose=True):
        """Map raw ids to contiguous indices and return an ``(n, 3)`` array.

        With ``train=True`` the id -> index mappings are (re)built from ``X``;
        otherwise the mappings from the last training call are reused and ids
        they do not contain are tagged with -1 so that
        ``_compute_val_metrics`` can detect them. Only the two id columns are
        filled with -1; ratings are passed through untouched. ``verbose`` is
        accepted for interface compatibility and is not used.
        """
        X = X.copy()

        if train:  # Mappings have to be created
            user_ids = X['UserID'].unique().tolist()
            item_ids = X['MovieID'].unique().tolist()
            self.user_mapping_ = {u_id: ix for ix, u_id in enumerate(user_ids)}
            self.item_mapping_ = {i_id: ix for ix, i_id in enumerate(item_ids)}

        X['UserID'] = (X['UserID'].map(self.user_mapping_)
                                  .fillna(-1).astype(np.int32))
        X['MovieID'] = (X['MovieID'].map(self.item_mapping_)
                                    .fillna(-1).astype(np.int32))
        return X[['UserID', 'MovieID', 'Rating']].values

    def _init_metrics(self):
        """Allocate the per-epoch validation metrics table (all zeros)."""
        # `np.float` was removed from NumPy; the builtin float is the same dtype.
        metrics = np.zeros((self.n_epochs, 3), dtype=float)
        self.metrics_ = pd.DataFrame(metrics, columns=['Loss', 'RMSE', 'MAE'])

    def _run_sgd(self, X, X_val):
        """Run up to ``n_epochs`` SGD epochs and store the learned parameters."""
        # The training mappings cover exactly the contiguous indices in X.
        n_users = len(self.user_mapping_)
        n_items = len(self.item_mapping_)
        bu, bi, pu, qi = _initialization(n_users, n_items, self.n_factors)

        # The context manager closes the bar even when early stopping breaks
        # out of the loop.
        with tqdm(range(self.n_epochs), desc='Epoch', ncols=110) as pbar:
            for epoch_ix in pbar:
                start = self._on_epoch_begin(epoch_ix)

                if self.shuffle:
                    X = _shuffle(X)

                bu, bi, pu, qi = _run_epoch(X, bu, bi, pu, qi,
                                            self.global_mean_,
                                            self.n_factors, self.lr, self.reg)

                if X_val is not None:
                    self.metrics_.loc[epoch_ix, :] = _compute_val_metrics(
                        X_val, bu, bi, pu, qi,
                        self.global_mean_, self.n_factors)
                    val_loss, val_rmse = self._on_epoch_end(
                        start,
                        self.metrics_.loc[epoch_ix, 'Loss'],
                        self.metrics_.loc[epoch_ix, 'RMSE'],
                        self.metrics_.loc[epoch_ix, 'MAE'])
                else:
                    val_loss, val_rmse = self._on_epoch_end(start)

                # Report the metrics of the epoch that just finished.
                pbar.set_postfix({'val_loss': val_loss, 'val_rmse': val_rmse})

                if X_val is not None and self.early_stopping:
                    # Separate name: the display strings above must not be
                    # replaced by the full RMSE history.
                    rmse_history = self.metrics_['RMSE'].tolist()
                    if self._early_stopping(rmse_history, epoch_ix,
                                            self.min_delta):
                        break

        self.bu_ = bu
        self.bi_ = bi
        self.pu_ = pu
        self.qi_ = qi

    def predict(self, X, clip=True):
        """Return a list with one prediction per ``(UserID, MovieID)`` row of ``X``."""
        return [
            self.predict_pair(u_id, i_id, clip)
            for u_id, i_id in zip(X['UserID'], X['MovieID'])
        ]

    def predict_pair(self, u_id, i_id, clip=True):
        """Predict the rating of raw user id ``u_id`` for raw item id ``i_id``.

        Starts from the global mean, adds the bias of each id that was seen
        during training, and adds the factor dot product only when both ids
        are known. With ``clip=True`` the result is limited to
        ``[min_rating, max_rating]``.
        """
        user_known, item_known = False, False
        pred = self.global_mean_

        if u_id in self.user_mapping_:
            user_known = True
            u_ix = self.user_mapping_[u_id]
            pred += self.bu_[u_ix]

        if i_id in self.item_mapping_:
            item_known = True
            i_ix = self.item_mapping_[i_id]
            pred += self.bi_[i_ix]

        if user_known and item_known:
            pred += np.dot(self.pu_[u_ix], self.qi_[i_ix])

        if clip:
            pred = self.max_rating if pred > self.max_rating else pred
            pred = self.min_rating if pred < self.min_rating else pred

        return pred

    def _early_stopping(self, val_rmse, epoch_idx, min_delta):
        """Return True when RMSE improved by less than ``min_delta`` this epoch.

        On a stop, ``metrics_`` is trimmed to the epochs that actually ran.
        """
        if epoch_idx > 0:
            if val_rmse[epoch_idx] + min_delta > val_rmse[epoch_idx-1]:
                # `.loc` slices include the end label, so stopping at
                # `epoch_idx` keeps rows 0..epoch_idx and nothing untrained.
                self.metrics_ = self.metrics_.loc[:epoch_idx, :]
                return True
        return False

    def _on_epoch_begin(self, epoch_ix):
        """Return the wall-clock start time of the epoch."""
        return time.time()

    def _on_epoch_end(self, start, val_loss=None, val_rmse=None, val_mae=None):
        """Format the validation loss / RMSE for the progress bar.

        Returns a ``(loss, rmse)`` pair of strings with three decimals; a
        metric that is ``None`` (no validation set) becomes an empty string.
        ``start`` and ``val_mae`` are accepted for interface compatibility.
        """
        loss_txt = '' if val_loss is None else f'{val_loss:.3f}'
        rmse_txt = '' if val_rmse is None else f'{val_rmse:.3f}'
        return loss_txt, rmse_txt

In [34]:
# Load the long-format ratings, then reshape them into a dense
# UserID x MovieID matrix in which unrated pairs are 0.
ratings_df = get_dataset()
R_df = (
    ratings_df
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)

In [35]:
# Re-seed, then draw 2500 distinct (row, column) positions of R_df that hold
# an actual rating (value > 0) to serve as the held-out test cells.
seed_everything()
test_indices = np.array(
    random.sample(
        list(np.argwhere(R_df.values > 0)),
        2500,
    )
)

In [50]:

def split(df, R_df, test_indices):
    """Split the long-format ratings into train / test frames.

    Parameters
    ----------
    df : DataFrame
        Ratings with ``UserID``, ``MovieID`` and ``Rating`` columns (a
        ``Timestamp`` column, if present, is dropped). It is not modified.
    R_df : DataFrame
        User x movie matrix whose index / columns hold the raw ids.
    test_indices : array-like of shape (n, 2)
        Positional ``(row, column)`` pairs into ``R_df`` selecting the test
        ratings.

    Returns
    -------
    (train_df, test_df) : tuple of DataFrame
        Both carry ``UserID``/``MovieID`` as int32 and ``Rating`` as float64
        and keep the original index labels of ``df``. ``test_df`` rows follow
        the order of ``test_indices``; ``train_df`` is every remaining row.
    """
    test_indices = np.asarray(test_indices, dtype=np.int64).reshape(-1, 2)
    u_ids = np.asarray(R_df.index[test_indices[:, 0]], dtype=np.int32)
    i_ids = np.asarray(R_df.columns[test_indices[:, 1]], dtype=np.int32)

    # Work on a converted copy so the caller's frame keeps its dtypes/columns.
    ratings = (
        df.drop(columns=['Timestamp'], errors='ignore')
          .astype({'UserID': np.int32, 'MovieID': np.int32,
                   'Rating': np.float64})
    )

    # One inner merge replaces a full-frame boolean scan per test pair. It
    # keeps the order of the left keys, and an empty selection simply yields
    # an empty test frame.
    wanted = pd.DataFrame({'UserID': u_ids, 'MovieID': i_ids})
    keyed = ratings[['UserID', 'MovieID']].assign(_pos=np.arange(len(ratings)))
    hits = wanted.merge(keyed, on=['UserID', 'MovieID'],
                        how='inner')['_pos'].to_numpy()

    is_test = np.zeros(len(ratings), dtype=bool)
    is_test[hits] = True
    test_df = ratings.iloc[hits]
    train_df = ratings[~is_test]

    return train_df, test_df

# Hold out the rows matching the sampled (user, movie) cells as test_df; all other rows form train_df.
train_df, test_df = split(ratings_df, R_df, test_indices)


        UserID  MovieID  Rating
974967    1265     1732     4.0
540199    2456     2105     3.0
843159    1449     2959     4.0
98506     5350     2640     2.0
440052    3323     3468     4.0
        UserID  MovieID  Rating
206405    1265     1732     4.0
409183    2456     2105     3.0
240650    1449     2959     4.0
886005    5350     2640     2.0
540360    3323     3468     4.0
True
        UserID  MovieID  Rating
910076       1        1     5.0
910083       1       48     5.0
904601       1      150     5.0
904587       1      260     4.0
910072       1      527     5.0
    UserID  MovieID  Rating
40       1        1     5.0
25       1       48     5.0
39       1      150     5.0
44       1      260     4.0
23       1      527     5.0
True


# NOTE(review): `train_df2` is not defined in any cell visible here; unless it
# is created elsewhere this cell raises NameError on a fresh kernel — TODO confirm.
print(train_df.shape)
print(train_df2.shape)

In [30]:
def funk(train, test, **kw):
    """Fit an ``SVD`` model with hyper-parameters ``kw`` and print its test RMSE.

    ``test`` is used both as the validation set during fitting and as the
    evaluation set afterwards. Generators are re-seeded before the model is
    built. Nothing is returned; the score and the hyper-parameters are printed.
    """
    seed_everything()
    model = SVD(**kw)
    model.fit(X=train, X_val=test)

    predictions = model.predict(test)
    score = rmse(test['Rating'], predictions)

    hparams = '\n'.join(str(pair) for pair in kw.items())
    print(f"Test RMSE: {score:.4f}\n\nHyperparams: \n{hparams}")

# NOTE(review): the output recorded below this cell reports n_epochs=100, but
# that argument is commented out here, so a re-run falls back to the SVD
# default for n_epochs and will not reproduce the recorded numbers.
funk(train=train_df, test=test_df,
                shuffle=False, min_rating=1, max_rating=5, early_stopping=False,
                min_delta=.00001,
                lr=.001,
                reg=.002,
#                 n_epochs=100,
                n_factors=20)

Epoch: 100%|████████████████████████████████| 100/100 [00:07<00:00, 12.51it/s, val_loss=0.747, val_rmse=0.864]

Test RMSE: 0.8627

Hyperparams: 
('shuffle', False)
('min_rating', 1)
('max_rating', 5)
('early_stopping', False)
('min_delta', 1e-05)
('lr', 0.001)
('reg', 0.002)
('n_epochs', 100)
('n_factors', 20)





In [31]:
# Training took 16 sec
# Test RMSE: 0.8416

# Hyperparams: 
# ('shuffle', False)
# ('min_rating', 1)
# ('max_rating', 5)
# ('early_stopping', False)
# ('min_delta', 1e-05)
# ('lr', 0.0007)
# ('reg', 0.02)
# ('n_epochs', 300)
# ('n_factors', 20)

# Training took 20 sec
# Test RMSE: 0.8412

# Hyperparams: 
# ('shuffle', False)
# ('min_rating', 1)
# ('max_rating', 5)
# ('early_stopping', True)
# ('min_delta', 1e-05)
# ('lr', 0.0006)
# ('reg', 0.015)
# ('n_epochs', 1000)
# ('n_factors', 25)