# Baseline Model 
## compute the average ratings for all items from all users
## add the user rating bias and item bias to the average rating
## compute all users and all items biases 
## convert the problem into loss minimization: use gradient descent and least squares 

In [8]:
import pandas as pd
import numpy as np

class BaselineCFBySGD(object):
    
    def __init__(self, number_epochs, alpha, reg, columns=["uid", "iid", "rating"]):
        self.number_epochs = number_epochs
        self.alpha = alpha 
        self.reg = reg
        self.columns = columns 
        
    def fit(self, dataset):
        self.dataset = dataset 
        # user ratings data 
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # item rating
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # compute average
        self.global_mean = self.dataset[self.columns[2]].mean()
        # use sgd 
        self.bu, self.bi = self.sgd()
        
    def sgd(self):
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))
        
        for i in range(self.number_epochs):
            print("iter%d" % i)
            # stochastic gradient descent algorithm 
            for uid, iid, real_rating in self.dataset.itertuples(index=False):
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                bu[uid] += self.alpha * (error - self.reg*bu[uid])
                bi[iid] += self.alpha * (error - self.reg*bi[iid])
        return bu, bi
         
        
        
    def predict(self, uid, iid):
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating 
        
    
if __name__ == '__main__':
    dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    dataset = pd.read_csv("Downloads/movies/ratings.csv", usecols=range(3), dtype=dict(dtype))
    
    bcf = BaselineCFBySGD(20, 0.1, 0.1, ["userId", "movieId", "rating"])
    bcf.fit(dataset)
    
    while True:
        uid = int(input("uid: "))
        iid = int(input("iid: "))
        print(bcf.predict(uid, iid))
        

iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
uid: 4
iid: 22
2.9720473869372492
uid: 0
iid: 0


KeyError: 0

In [38]:
def data_split(datapath, x=0.8, random=False):
    
    print("start to divide dataset")
    dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    ratings = pd.read_csv(datapath, usecols=range(3), dtype=dict(dtype))
    
    testset_index = []
    
    for uid in ratings.groupby("userId").any().index:
        user_rating_data = ratings.where(ratings["userId"]==uid).dropna()
        
        if random:
            index = list(user_rating_data.index)
            np.random.shuffle(index)
            _index = round(len(user_rating_data)*x)
            testset_index += list(index[_index:])
        else:
            index = round(len(user_rating_data)*x)
            testset_index += list(user_rating_data.index.values[index:])
            
    testset = ratings.loc[testset_index]
    trainset = ratings.drop(testset_index)
    print("complete dividing")
    return trainset, testset

def accuracy(predict_results, method="all"):
    def rmse_and_mae(predict_results):
        length = 0
        _rmse_sum = 0
        _mae_sum = 0
        for uid, iid, real_rating, pred_rating in predict_results:
            length += 1
            _rmse_sum += (pred_rating - real_rating) ** 2
            _mae_sum += abs(pred_rating - real_rating) 
        rmse_value = round(np.sqrt(_rmse_sum / length), 4)
        mae_value = round(_mae_sum / length, 4)
        return rmse_value, mae_value

    if method.lower() == 'rmse':
        return rmse_and_mae(predict_results)[0]
    elif method.lower() == 'mae':
        return rmse_and_mae(predict_results)[1]
    else:
        return rmse_and_mae(predict_results)


In [40]:
class BaselineCFBySGD(object):
    
    def __init__(self, number_epochs, alpha, reg, columns=["uid", "iid", "rating"]):
        self.number_epochs = number_epochs
        self.alpha = alpha 
        self.reg = reg
        self.columns = columns 
        
    def fit(self, dataset):
        self.dataset = dataset 
        # user ratings data 
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # item rating
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # compute average
        self.global_mean = self.dataset[self.columns[2]].mean()
        # use sgd 
        self.bu, self.bi = self.sgd()
        
    def sgd(self):
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))
        
        for i in range(self.number_epochs):
            print("iter%d" % i)
            # stochastic gradient descent algorithm 
            for uid, iid, real_rating in self.dataset.itertuples(index=False):
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                bu[uid] += self.alpha * (error - self.reg*bu[uid])
                bi[iid] += self.alpha * (error - self.reg*bi[iid])
        return bu, bi
         
        
        
    def predict(self, uid, iid):
        # prediction 
        if iid not in self.items_ratings.index:
            raise Exception("cannot predict ")
            
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating 
    
    def test(self, testset):
        for uid, iid, real_rating in testset.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                print(e)
                continue 
            else:
                yield uid, iid, real_rating, pred_rating 

if __name__ == '__main__':
    trainset, testset = data_split("Downloads/movies/ratings.csv", random=True)
    bcf = BaselineCFBySGD(20, 0.1, 0.1, ["userId", "movieId", "rating"])
    bcf.fit(trainset)
    pred_results = bcf.test(testset)
    rmse, mae = accuracy(pred_results)
    print("rmse: ", rmse, "mae ", mae)
    

start to divide dataset
complete dividing
iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predi

## alternating least squares 

In [46]:
class BaselineCFBySGD_als(object):
    
    def __init__(self, number_epochs, reg_bu, reg_bi, columns=["uid", "iid", "rating"]):
        self.number_epochs = number_epochs
        self.reg_bu = reg_bu
        self.reg_bi = reg_bi
        self.columns = columns 
        
    def fit(self, dataset):
        self.dataset = dataset 
        # user ratings data 
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # item rating
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # compute average
        self.global_mean = self.dataset[self.columns[2]].mean()
        # use sgd 
        self.bu, self.bi = self.als()
        
    def als(self):
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))
        
        for i in range(self.number_epochs):
            print("iter%d" % i)
            # stochastic gradient descent algorithm 
            for iid, uids, ratings in self.items_ratings.itertuples(index=True):
                _sum = 0
                for uid, rating in zip(uids, ratings):
                    _sum += rating - self.global_mean - bu[uid]
                bi[iid] = _sum / (self.reg_bi + len(uids))
                
            for uid, iids, ratings in self.users_ratings.itertuples(index=True):
                _sum = 0
                for iid, rating in zip(iids, ratings):
                    _sum += rating - self.global_mean - bi[iid]
                bu[uid] = _sum / (self.reg_bu + len(iids))
        return bu, bi
         
        
        
    def predict(self, uid, iid):
        # prediction 
        if iid not in self.items_ratings.index:
            raise Exception("cannot predict ")
            
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating 
    
    def test(self, testset):
        for uid, iid, real_rating in testset.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                print(e)
                continue 
            else:
                yield uid, iid, real_rating, pred_rating 

if __name__ == '__main__':
    #dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    #dataset = pd.read_csv("Downloads/movies/ratings.csv", usecols=range(3), dtype=dict(dtype))
    trainset, testset = data_split("Downloads/movies/ratings.csv", random=True)
    bcf = BaselineCFBySGD_als(20, 25, 15, ["userId", "movieId", "rating"])
    bcf.fit(trainset)
    pred_results = bcf.test(testset)
    rmse, mae = accuracy(pred_results)
    print("rmse: ", rmse, "mae: ", mae)

start to divide dataset
complete dividing
iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predi

## FunkSVD: Latent Factor Model (LFM)

In [50]:
class LFM(object):
    def __init__(self, alpha, reg_p, reg_q, number_LatentFactors = 10, number_epochs=10, columns=["uid", "iid", 
                                                                                                 "rating"]):
        self.alpha = alpha
        self.reg_p = reg_p
        self.reg_q = reg_q
        self.number_LatentFactors = number_LatentFactors
        self.number_epochs = number_epochs
        self.columns = columns
        
    def fit(self, dataset):
        self.dataset = pd.DataFrame(dataset)
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # item rating
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # compute average
        self.global_mean = self.dataset[self.columns[2]].mean()
        # use sgd 
        self.P, self.Q = self.sgd()
        
    def _init_matrix(self):
        P = dict(zip(
            self.users_ratings.index,
            np.random.rand(len(self.users_ratings),
                          self.number_LatentFactors).astype(np.float32)))
        Q = dict(zip(
            self.items_ratings.index,
            np.random.rand(len(self.items_ratings),
                          self.number_LatentFactors).astype(np.float32)))
        return P, Q
    
    def sgd(self):
        P, Q = self._init_matrix()
        
        for i in range(self.number_epochs):
            print("iter%d"% i)
            error_list = []
            for uid, iid, r_ui in self.dataset.itertuples(index=False):
                v_pu = P[uid]
                v_qi = Q[iid]
                # the matrix multiplication gives the prediction
                err = np.float32(r_ui - np.dot(v_pu, v_qi))
                # the gradient descent for the vectors
                v_pu += self.alpha * (err * v_qi - self.reg_p * v_pu)
                v_qi += self.alpha * (err * v_pu - self.reg_q * v_qi)
                
                P[uid] = v_pu
                Q[iid] = v_qi
                
                error_list.append(err ** 2)
                
            print(np.sqrt(np.mean(error_list)))
        return P, Q
    
    def predict(self, uid, iid):
        if uid not in self.users_ratings.index or iid not in self.items_ratings.index:
            return self.global_mean
        p_u = self.P[uid]
        p_v = self.Q[iid]
        return np.dot(p_u, p_v)
    
    def test(self, testset):
        for uid, iid, real_rating in testset.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                print(e)
                continue 
            else:
                yield uid, iid, real_rating, pred_rating 
        
    
if __name__ == '__main__':
    #dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    #dataset = pd.read_csv("Downloads/movies/ratings.csv", usecols=range(3), dtype=dict(dtype))
    trainset, testset = data_split("Downloads/movies/ratings.csv", random=True)
    lfm = LFM(0.1, 15, 15,10, 20,["userId", "movieId", "rating"])
    lfm.fit(trainset)
    pred_results = lfm.test(testset)
    rmse, mae = accuracy(pred_results)
    print("rmse: ", rmse, "mae: ", mae)
        
        

start to divide dataset
complete dividing
iter0
3.6441284610184073
iter1
3.653092501421141
iter2
3.653780221800992
iter3
3.653922038586782
iter4
3.6539446375517293
iter5
3.6539484940266465
iter6
3.653949163171286
iter7
3.6539492839321146
iter8
3.6539493065030406
iter9
3.6539493107733327
iter10
3.6539493115689137
iter11
3.653949311658238
iter12
3.6539493116617265
iter13
3.653949311661777
iter14
3.653949311661777
iter15
3.653949311661777
iter16
3.653949311661777
iter17
3.653949311661777
iter18
3.653949311661777
iter19
3.653949311661777
rmse:  3.5947 mae:  3.4055
