In [41]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import surprise
from surprise import AlgoBase, SVD, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [42]:
class LoadData:
    # Load Datasets, trainset, testset, and additional info
    
    def __init__(self, trainset_name, testset_name, addition_info):    
        self.train = trainset_name  # file name of train dataset
        self.test = testset_name  # file name of test dataset
        self.info = addition_info  # file name of addition information
    
    def load_files(self):
        train_dat = pd.read_csv(self.train, encoding = "ISO-8859-1")

        file_1 = open(self.test, "r", encoding = "ISO-8859-1")

        tempTest = {}
        
        # Set keys
        tempTest['movie-id'] = []
        tempTest['customer-id'] = []
        tempTest['rating'] = []
        tempTest['date'] = []

        i = 0
        # Read all lines in the text file
        for line in file_1:
            tokens = line.strip('\n').split(",", 3)
            tempTest['movie-id'].append(tokens[0])
            tempTest['customer-id'].append(tokens[1])
            tempTest['rating'].append(tokens[2])
            tempTest['date'].append(tokens[3])

            i += 1   

        file_1.close()

        test_dat = pd.DataFrame(tempTest)
        test_dat['movie-id'] = test_dat['movie-id'].astype(int)
        test_dat['customer-id'] = test_dat['customer-id'].astype(int)

        file_2 = open(self.info, "r", encoding = "ISO-8859-1")

        tempDict = {}

        j = 0
        # Read all lines in the text file
        for line in file_2:
            # Set keys
            if j == 0:
                #tokens = line.strip('\n').split(", ", 2)
                tempDict['movie-id'] = []
                tempDict['year-produced'] = []
                tempDict['title'] = []

            else:
                tokens = line.strip('\n').split(",", 2)
                tempDict['movie-id'].append(tokens[0])
                tempDict['year-produced'].append(tokens[1])
                tempDict['title'].append(tokens[2])

            j += 1   

        file_2.close()

        movie_info = pd.DataFrame(tempDict)
        movie_info['movie-id'] = movie_info['movie-id'].astype(int)
        
        #train_dat = pd.concat([train_dat, test_dat], ignore_index=True)
        
        train_dat = pd.merge(train_dat, movie_info, on='movie-id')

        return train_dat, test_dat

In [43]:
class MF(AlgoBase):
    '''A matrix factorization based movie rating algorithm'''
    
    def __init__(self, regular_para, learning_rate, n_episodes, n_factors):   
        self.re_p = regular_para  # regularization parameter
        self.lr = learning_rate  # learning rate
        self.n_episodes = n_episodes  # number of iterations
        self.n_factors = n_factors  # number of factors
        
    def fit(self, trainset):
        '''Learn the vectors b_u, b_i, p_u, and q_i by using stochastic gradient descent (SGD)'''
        self.trainset = trainset
        
        # Initialize local biases b_u, b_i
        self.b_u = np.zeros(self.trainset.n_users)
        self.b_i = np.zeros(self.trainset.n_items)
        
        # Initialize a global bias b_avg
        self.b_avg = self.trainset.global_mean
        
        
        # Randomly initialize the customer and movie factors.
        self.p = np.random.normal(0, .1, (self.trainset.n_users, self.n_factors))
        self.q = np.random.normal(0, .1, (self.trainset.n_items, self.n_factors))
        
        # The main procedure for minimizing error and update variables by using SGD
        for _ in range(self.n_episodes):
            for u, i, r_ui in self.trainset.all_ratings():
                err = r_ui - self.get_pred(u, i)
                
                # Update biases, b_u, b_i
                self.b_u[u] += self.lr * (err - self.re_p * self.b_u[u])
                self.b_i[i] += self.lr * (err - self.re_p * self.b_i[i])
                
                # Update vectors p_u and q_i
                self.p[u] += self.lr * (err * self.q[i] - self.re_p * self.p[u])
                self.q[i] += self.lr * (err * self.p[u] - self.re_p * self.q[i])
                
    def get_pred(self, u, i):
        ''' Estimate the rating of customer u and movie i.'''
        return np.dot(self.p[u], self.q[i]) + self.b_u[u] + self.b_i[i] + self.b_avg
    
    def estimate(self, u, i):
        '''Obtain the estmimated rating of customer u for movie i.'''
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return np.dot(self.p[u], self.q[i]) + self.b_u[u] + self.b_i[i] + self.b_avg
        #elif self.trainset.knows_user(u):
        #    return self.trainset.global_mean + self.b_u[u] + self.b_avg
        #elif self.trainset.knows_item(i):
        #    return self.trainset.global_mean + self.b_i[i] + self.b_avg
        else:
            return self.trainset.global_mean

In [44]:
td = LoadData("train.csv", "test.csv", "movie_titles.txt")
train_dataset, test_dataset = td.load_files()

# Reader class using only the rating parameter
reader = Reader(rating_scale=(0.0, 5.0))

# Convert dataframe to dataset
data_train = Dataset.load_from_df(train_dataset[['customer-id', 'movie-id', 'rating']], reader)
data_test = test_dataset #Dataset.load_from_df(test_dataset[['customer-id', 'movie-id', 'rating']], reader)

#train_dataset.head(100)
#data_test.head(10)

In [45]:
'''
# Apply MF-based algorithm to train the dataset and learn how to do rating
algo_mf = MF(regular_para=.01, learning_rate=.01, n_episodes=10, n_factors=10)
# Run 2-fold cross-validation and print results
cross_validate(algo_mf, data_train, measures=['RMSE', 'MAE'], cv=2, verbose=True)

# Second, use SVD algorithm as a comparison
algo_svd = SVD()
cross_validate(algo_svd, data_train, measures=['RMSE', 'MAE'], cv=2, verbose=True)
'''

# Retrieve the trainset.
full_data_train = data_train.build_full_trainset()

# Build an algorithm, and train it.
algo_mf = MF(regular_para=.01, learning_rate=.01, n_episodes=10, n_factors=10)
algo_mf.fit(full_data_train)

# Use SVD as a comparison
algo_svd = SVD()
algo_svd.fit(full_data_train)


customer_list = []
movie_list = []
est_rating_list = []
date_list = []

num_rating = 0
perf_rmse = 0
perf_mae = 0
for index, row in data_test.iterrows():
    predictions_svd = algo_svd.predict(row['customer-id'], row['movie-id'], verbose=False)
    predictions_mf = algo_mf.predict(row['customer-id'], row['movie-id'], verbose=False)
    
    customer_list.append(row['customer-id'])
    movie_list.append(row['movie-id'])
    est_rating_list.append(predictions_mf.est)
    date_list.append(row['date'])
    
    perf_rmse += (predictions_mf.est - predictions_svd.est)**2
    perf_mae += np.absolute(predictions_mf.est - predictions_svd.est)
    
    num_rating += 1

output_dict = {'movie-id': movie_list, 'customer-id': customer_list, 'rating': est_rating_list, 'date': date_list}
output_pd = pd.DataFrame(output_dict)
output_pd.to_csv('est_ratings.csv', index=False) 
    
perf_rmse = (perf_rmse / num_rating)**(0.5)
perf_mae = perf_mae / num_rating

print("---------------Prediction Performance----------------")
print("RMSE: ", perf_rmse)
print("MAE: ", perf_mae)

---------------Prediction Performance----------------
RMSE:  0.2931306942165147
MAE:  0.2167377782019026
