In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class MF:
    def __init__(self, Y_data, K, lam = 0.1, Xinit = None, Winit = None,
                learning_rate = 0.5, max_iter = 1000, print_every = 100,
                user_based = 1):
        self.Y_raw_data = Y_data
        self.K = K
        # regularization parameter
        self.lam = lam
        # learning rate for gradient descent
        self.learning_rate = learning_rate
        # maximum number of iteractions
        self.max_iter = max_iter
        # print results after every print_every iteractions
        self.print_every = print_every
        # user-based or item-based
        self.user_based = user_based
        # number of users, items and ratings
        self.n_users = int(np.max(Y_data[:, 0])) + 1
        self.n_items = int(np.max(Y_data[:, 1])) + 1
        self.n_ratings = Y_data.shape[0]
        
        if Xinit is None:
            self.X = np.random.randn(self.n_items, K)
        else:
            self.X = Xinit
        
        if Winit is None:
            self.W = np.random.randn(K, self.n_users)
        else: 
            self.W = Winit
        
        # normalized data
        self.Y_data_n = self.Y_raw_data.copy()
        
    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users
        
        else:
            user_col = 1
            item_col = 0
            n_objects = self.n_items
        
        users = self.Y_raw_data[:, user_col]
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            ids  = np.where(users == n)[0].astype(np.int32)
            item_ids = self.Y_data_n[ids, item_col]
            ratings = self.Y_data_n[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[n] = m
            self.Y_data_n[ids, 2] = ratings - self.mu[n]
            
    def loss(self):
        L = 0
        for i in range(self.n_ratings):
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), int(self.Y_data_n[i, 2])
            L += 0.5*(rate-self.X[m, :].dot(self.W[:, n]))**2
        
        # take average
        L /= self.n_ratings
        #regularization
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + 
                          np.linalg.norm(self.W, 'fro'))
        return L
    
    def get_items_rated_by_user(self, user_id):
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        item_ids = self.Y_data_n[ids, 1].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)
    
    def get_users_who_rate_item(self, item_id):
        ids = np.where(self.Y_data_n[:, 1] == item_id)[0]
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)
    
    def updateX(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            Wm = self.W[:, user_ids]
            grad_xm = - (ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + self.lam*self.X[m, :]
            self.X[m, :] -= self.learning_rate*grad_xm.reshape((self.K,))
            
    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.K,))
    
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print('iter = ', it+1, ', loss = ', self.loss(), ', RMSE train = ', rmse_train)
    
    def pred(self, u, i):
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else:
            bias = self.mu[i]
        
        pred = self.X[i, :].dot(self.W[:, u]) + bias
        
        if pred < 0:
            return 0
        if pred > 5:
            return 5
        return pred
    
    def pred_for_user(self, user_id):
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()
        
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        
        return predicted_ratings
    
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2
        
        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [17]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [18]:
rs = MF(rate_train, K = 10, lam = .1, print_every = 10, 
    learning_rate = 0.75, max_iter = 100, user_based = 1)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nUser-based MF, RMSE =', RMSE)

iter =  10 , loss =  5.638061880361899 , RMSE train =  1.2098462603099835
iter =  20 , loss =  2.6303004500253806 , RMSE train =  1.0378086135128022
iter =  30 , loss =  1.3391397807650463 , RMSE train =  1.0294640135138562
iter =  40 , loss =  0.750921851466934 , RMSE train =  1.0291961071487103
iter =  50 , loss =  0.4813399938312195 , RMSE train =  1.029207027926056
iter =  60 , loss =  0.35771879251420546 , RMSE train =  1.0292125695065755
iter =  70 , loss =  0.30102751106382736 , RMSE train =  1.0292139353932424
iter =  80 , loss =  0.2750295311558517 , RMSE train =  1.0292142434934006
iter =  90 , loss =  0.2631071796806936 , RMSE train =  1.029214311730571
iter =  100 , loss =  0.2576397435387018 , RMSE train =  1.0292143267635172

User-based MF, RMSE = 1.060379887918903


In [19]:
rs = MF(rate_train, K = 10, lam = .1, print_every = 10, learning_rate = 0.75, max_iter = 100, user_based = 0)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nItem-based MF, RMSE =', RMSE)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


iter =  10 , loss =  5.611125703202822 , RMSE train =  1.174383057785558
iter =  20 , loss =  2.6136125727255144 , RMSE train =  1.005247202290158
iter =  30 , loss =  1.3224618559586827 , RMSE train =  0.9965690306853113
iter =  40 , loss =  0.7340831778693544 , RMSE train =  0.996193560525038
iter =  50 , loss =  0.46442965678636094 , RMSE train =  0.9961801884586986
iter =  60 , loss =  0.3407779520866742 , RMSE train =  0.9961805352248034
iter =  70 , loss =  0.2840733830637253 , RMSE train =  0.9961808022063813
iter =  80 , loss =  0.2580695185863676 , RMSE train =  0.996180878366193
iter =  90 , loss =  0.24614453912939832 , RMSE train =  0.9961808976716874
iter =  100 , loss =  0.2406759248909192 , RMSE train =  0.9961809023826615

Item-based MF, RMSE = 1.0498047433773399
