# Baseline Model 
## compute the average ratings for all items from all users
## add the user rating bias and item bias to the average rating
## compute all users and all items biases 
## convert the problem into loss minimization: use gradient descent and least squares 

In [8]:
import pandas as pd
import numpy as np

class BaselineCFBySGD(object):
    
    def __init__(self, number_epochs, alpha, reg, columns=["uid", "iid", "rating"]):
        self.number_epochs = number_epochs
        self.alpha = alpha 
        self.reg = reg
        self.columns = columns 
        
    def fit(self, dataset):
        self.dataset = dataset 
        # user ratings data 
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # item rating
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # compute average
        self.global_mean = self.dataset[self.columns[2]].mean()
        # use sgd 
        self.bu, self.bi = self.sgd()
        
    def sgd(self):
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))
        
        for i in range(self.number_epochs):
            print("iter%d" % i)
            # stochastic gradient descent algorithm 
            for uid, iid, real_rating in self.dataset.itertuples(index=False):
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                bu[uid] += self.alpha * (error - self.reg*bu[uid])
                bi[iid] += self.alpha * (error - self.reg*bi[iid])
        return bu, bi
         
        
        
    def predict(self, uid, iid):
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating 
        
    
if __name__ == '__main__':
    dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    dataset = pd.read_csv("Downloads/movies/ratings.csv", usecols=range(3), dtype=dict(dtype))
    
    bcf = BaselineCFBySGD(20, 0.1, 0.1, ["userId", "movieId", "rating"])
    bcf.fit(dataset)
    
    while True:
        uid = int(input("uid: "))
        iid = int(input("iid: "))
        print(bcf.predict(uid, iid))
        

iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
uid: 4
iid: 22
2.9720473869372492
uid: 0
iid: 0


KeyError: 0

In [38]:
def data_split(datapath, x=0.8, random=False):
    
    print("start to divide dataset")
    dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)]
    ratings = pd.read_csv(datapath, usecols=range(3), dtype=dict(dtype))
    
    testset_index = []
    
    for uid in ratings.groupby("userId").any().index:
        user_rating_data = ratings.where(ratings["userId"]==uid).dropna()
        
        if random:
            index = list(user_rating_data.index)
            np.random.shuffle(index)
            _index = round(len(user_rating_data)*x)
            testset_index += list(index[_index:])
        else:
            index = round(len(user_rating_data)*x)
            testset_index += list(user_rating_data.index.values[index:])
            
    testset = ratings.loc[testset_index]
    trainset = ratings.drop(testset_index)
    print("complete dividing")
    return trainset, testset

def accuracy(predict_results, method="all"):
    def rmse_and_mae(predict_results):
        length = 0
        _rmse_sum = 0
        _mae_sum = 0
        for uid, iid, real_rating, pred_rating in predict_results:
            length += 1
            _rmse_sum += (pred_rating - real_rating) ** 2
            _mae_sum += abs(pred_rating - real_rating) 
        rmse_value = round(np.sqrt(_rmse_sum / length), 4)
        mae_value = round(_mae_sum / length, 4)
        return rmse_value, mae_value

    if method.lower() == 'rmse':
        return rmse_and_mae(predict_results)[0]
    elif method.lower() == 'mae':
        return rmse_and_mae(predict_results)[1]
    else:
        return rmse_and_mae(predict_results)


In [40]:
class BaselineCFBySGD(object):
    
    def __init__(self, number_epochs, alpha, reg, columns=["uid", "iid", "rating"]):
        self.number_epochs = number_epochs
        self.alpha = alpha 
        self.reg = reg
        self.columns = columns 
        
    def fit(self, dataset):
        self.dataset = dataset 
        # user ratings data 
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # item rating
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # compute average
        self.global_mean = self.dataset[self.columns[2]].mean()
        # use sgd 
        self.bu, self.bi = self.sgd()
        
    def sgd(self):
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))
        
        for i in range(self.number_epochs):
            print("iter%d" % i)
            # stochastic gradient descent algorithm 
            for uid, iid, real_rating in self.dataset.itertuples(index=False):
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                bu[uid] += self.alpha * (error - self.reg*bu[uid])
                bi[iid] += self.alpha * (error - self.reg*bi[iid])
        return bu, bi
         
        
        
    def predict(self, uid, iid):
        # prediction 
        if iid not in self.items_ratings.index:
            raise Exception("cannot predict ")
            
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating 
    
    def test(self, testset):
        for uid, iid, real_rating in testset.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                print(e)
                continue 
            else:
                yield uid, iid, real_rating, pred_rating 

if __name__ == '__main__':
    trainset, testset = data_split("Downloads/movies/ratings.csv", random=True)
    bcf = BaselineCFBySGD(20, 0.1, 0.1, ["userId", "movieId", "rating"])
    bcf.fit(trainset)
    pred_results = bcf.test(testset)
    rmse, mae = accuracy(pred_results)
    print("rmse: ", rmse, "mae ", mae)
    

start to divide dataset
complete dividing
iter0
iter1
iter2
iter3
iter4
iter5
iter6
iter7
iter8
iter9
iter10
iter11
iter12
iter13
iter14
iter15
iter16
iter17
iter18
iter19
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predict 
cannot predi