In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 


class MF(object):
    """docstring for CF"""
    def __init__(self, Y_data, K, lam = 0.1, Xinit = None, Winit = None, 
                 learning_rate = 0.5, max_iter = 1000, print_every = 100, user_based = 1):
        self.Y_raw_data = Y_data
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.user_based = user_based
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(Y_data[:, 0])) + 1 
        self.n_items = int(np.max(Y_data[:, 1])) + 1
        if Xinit is None: 
            self.X = np.random.randn(self.n_items, K)
        else:
            self.X = Xinit 
        
        if Winit is None: 
            self.W = np.random.randn(K, self.n_users)
        else: 
            self.W = Winit
            
        #self.all_users = self.Y_data[:,0] # all users (may be duplicated)
        self.n_ratings = Y_data.shape[0]
        # normalized data
        self.Y_data_n = self.Y_raw_data.copy()

    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users
        else:
            user_col = 1
            item_col = 0 
            n_objects = self.n_items

        users = self.Y_raw_data[:, user_col] 
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data_n[ids, item_col] 
            # and the corresponding ratings 
            ratings = self.Y_data_n[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Y_data_n[ids, 2] = ratings - self.mu[n]
            
    
    def loss(self):
        L = 0 
        for i in range(self.Y_data_n.shape[0]):
            # user, item, rating
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), self.Y_data_n[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2
            
        # regularization, don't ever forget this 
        L /= self.n_ratings
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L 

    
    def get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user n, and the corresponding ratings
        """
        # y = self.Y_data_n[:,0] # all users (may be duplicated)
        # item indices rated by user_id
        # we need to +1 to user_id since in the rate_matrix, id starts from 1 
        # while index in python starts from 0
        ids = np.where(self.Y_data_n[:,0] == user_id)[0] 
        item_ids = self.Y_data_n[ids, 1].astype(np.int32) # index starts from 0 
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)
        
    def get_users_who_rate_item(self, item_id):
        """
        get all users who rated item m and get the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:,1] == item_id)[0] 
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)
        
    def updateX(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            #cac user ids da rate item m
            Wm = self.W[:, user_ids]
            grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + \
                                               self.lam*self.X[m, :]
            self.X[m, :] -= self.learning_rate*grad_xm
    
    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            #cac item ids da rate boi user n
            Xn = self.X[item_ids, :]
            grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + \
                        self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn
    
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print ('iter =', it + 1, ', loss =', self.loss(), ', RMSE train =', rmse_train)
        

    
    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        if you need the un
        """
        u = int(u)
        i = int(i)
        
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias 
        if pred < 1:
            return 1 
        if pred > 5: 
            return 5 
        return pred 
        
    
    def pred_for_user(self, user_id):
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()         
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings= []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        
        return predicted_ratings
    
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE
        
# r_cols = ['user_id', 'item_id', 'rating']
# ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
# Y_data = ratings.values


# rs = MF(Y_data, K = 2, max_iter = 1000, print_every = 1000)

# rs.fit()
# rs.pred_for_user(0)

In [2]:
# print (rs.X.dot(rs.W) + rs.mu)

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('data/ratings.csv', sep=',')
ratings = ratings_base.values.astype(int)

# indices in Python start from 0
print("ratings",ratings)
ratings[:, :2] -= 1



ratings [[        1      1193         5 978300760]
 [        1       661         3 978302109]
 [        1       914         3 978301968]
 ...
 [     6040       562         5 956704746]
 [     6040      1096         4 956715648]
 [     6040      1097         4 956715569]]


In [4]:
from sklearn.model_selection import train_test_split
rate_train, rate_test = train_test_split(ratings, test_size=0.33, random_state=42)
print("rate_train",rate_test)
mylist = list(dict.fromkeys(rate_train[:,1]))
print(len(mylist))


rate_train [[     5411      2682         2 960243649]
 [     5439       903         5 959995181]
 [      367      3716         4 976311423]
 ...
 [      145      2084         3 977348890]
 [      500      3917         2 976211927]
 [     5204      2093         2 961640661]]
3658


In [5]:
rs = MF(rate_train, K = 100, lam = 0.1, print_every = 2, learning_rate = 2, max_iter = 10, user_based = 0)
rs.fit()
print("haizzz",rs.X)
matX = np.matrix(rs.X)
dfX = pd.DataFrame(data=matX.astype(float))
dfX.to_csv('Matrix_X.csv', sep=',', header=False, float_format='%.2f', index=False)

matW = np.matrix(rs.W)
dfW= pd.DataFrame(data=matW.astype(float))
dfW.to_csv('Matrix_W.csv', sep=',', header=False, float_format='%.2f', index=False)

matMu = np.matrix(rs.mu)
dfMu= pd.DataFrame(data=matMu.astype(float))
dfMu.to_csv('Matrix_Mu.csv', sep=',', header=False, float_format='%.2f', index=False)

matResult = np.matrix(rs.X.dot(rs.W))
dfMuResult= pd.DataFrame(data=matResult.astype(float))
dfMuResult.to_csv('Matrix_Result.csv', sep=',', header=False, float_format='%.2f', index=False)

YResult = np.matrix(rs.Y_data_n)
dfYResult= pd.DataFrame(data=YResult.astype(int))
dfYResult.to_csv('Matrix_Y_rating.csv', sep=',', header=False, index=False)

predicted_ratings = rs.pred_for_user(1)
print("pred",predicted_ratings)
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print("RMSE",RMSE)



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


iter = 2 , loss = 53.485232322615914 , RMSE train = 2.024661208835899
iter = 4 , loss = 30.372920446931253 , RMSE train = 1.6061688890909622
iter = 6 , loss = 18.860456651399367 , RMSE train = 1.1722268364542212
iter = 8 , loss = 12.039865861032764 , RMSE train = 1.0121927809164555
iter = 10 , loss = 7.76639601883195 , RMSE train = 0.9798597469740215
haizzz [[-0.08222424  0.00490272  0.12923114 ...  0.18101291  0.175068
  -0.00990147]
 [-0.07618471  0.04597993  0.0525343  ...  0.18351511  0.09243151
   0.10538618]
 [-0.13426238  0.04131346  0.04634345 ... -0.09377589 -0.15996192
   0.14327043]
 ...
 [-0.16935574  0.19528085 -0.09061807 ... -0.16383688 -0.10387785
  -0.0620109 ]
 [-0.16637676 -0.07764113  0.15509089 ...  0.06303354  0.03627917
  -0.04857732]
 [-0.09086486 -0.10016915  0.04314304 ... -0.01706416  0.07354866
   0.03075717]]
pred [(0, 3.2022476104732536), (1, 3.124688745618968), (2, 3.1150994785854773), (3, 3.249970054719275), (4, 3.3577912586072687), (5, 3.140884332912178