In [1]:
from __future__ import division
import pandas as pd
import numpy  as np
import re
import copy
import sys

def update_progress(progress):
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "\nHalt...\r\n"
    if progress >= 1:
        progress = 1
        status = "\nDone...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

In [2]:
movie_titles = open("movie_titles.txt").read()
netflix_training = open("TrainingRatings.txt").read()
netflix_testing = open("TestingRatings.txt").read()

In [3]:
# Tranform text files into DataFrames
movie_titles_df     = pd.DataFrame([i.split(',',2) for i in re.sub('[\r]', '', movie_titles).split('\n')][:-1])
netflix_training_df = pd.DataFrame([i.split(',',2) for i in re.sub('[\r]', '', netflix_training).split('\n')[:-1]])
netflix_testing_df  = pd.DataFrame([i.split(',',2) for i in re.sub('[\r]', '', netflix_testing).split('\n')[:-1]])
movie_titles_df.columns = ['MovieID', 'YearOfRelease', 'Title']
netflix_testing_df.columns = netflix_training_df.columns = ['MovieID', 'UserID', 'Rating']

In [4]:
# Transform all relevant string values to int
movie_titles_df['MovieID'] = movie_titles_df['MovieID'].astype(int)

netflix_testing_df['MovieID'] = netflix_testing_df['MovieID'].astype(int)
netflix_testing_df['UserID'] = netflix_testing_df['UserID'].astype(int)
netflix_testing_df['Rating'] = netflix_testing_df['Rating'].astype(np.float64)

netflix_training_df['MovieID'] = netflix_training_df['MovieID'].astype(int)
netflix_training_df['UserID'] = netflix_training_df['UserID'].astype(int)
netflix_training_df['Rating'] = netflix_training_df['Rating'].astype(np.float64)

In [125]:
import time

class Collaborative_Model:
    user_ids  = None
    movie_ids = None
    ratings   = None
    ui        = None
    all_users_avg  = None
    all_users_vote = None
    
    def __init__(self, train):
        self.Prepare_Data(train)
    
    def Prepare_Data(self, train):
        self.user_ids  = train['UserID'].values
        self.movie_ids = train['MovieID'].values
        self.ratings   = train['Rating'].values
        unique_users = np.unique(self.user_ids)
        self.ui = {user_id : np.argwhere(user_id == self.user_ids).flatten()
                   for user_id in unique_users}
        self.all_users_avg = {user_id : self.Average_Vote(self.ratings[np.argwhere(user_id == self.user_ids)])
                              for user_id in unique_users}
        #Got issueshere
        self.all_users_vote = {user_id :
                               {movie_id : self.ratings[self.ui[user_id]][np.argwhere(self.movie_ids[self.ui[user_id]] == movie_id)][0][0] 
                                for movie_id in self.movie_ids[self.ui[user_id]]}
                               for user_id in unique_users}
        
    def Predict_Data(self, test):
        test_ids       = test['UserID'].values
        test_movie_ids = test['MovieID'].values
        arr = []
        for i,target_user_id in enumerate(test_ids):
            update_progress(round( (i + 1) / len(test_ids), 3))
            arr.append(self.Predict_User_Vote(target_user_id, test_movie_ids[i]))
        return arr

    def Predict_User_Vote(self, target_user_id, target_movie_id):
        user_avg   = self.all_users_avg[target_user_id]
        user_votes = self.all_users_vote[target_user_id]
        other_users_votes = self.all_users_vote
        other_users_avg   = self.all_users_avg
        tui = self.ui[target_user_id]
        rui = self.ui
        relevant_users = np.unique(np.array([self.user_ids[i] 
                                             for i in np.argwhere(self.movie_ids == target_movie_id).flatten()]))
        
        correlated_weights = np.array([self.Pearson_Coefficient(user_avg, other_users_avg[user_id],
                                                               user_votes,other_users_votes[user_id], 
                                                               self.Get_Common_Movies(tui, rui[user_id])) 
                                       for user_id in relevant_users])
        
        voting_avg_diff = np.array([other_users_votes[user_id][target_movie_id] - other_users_avg[user_id]
                                    for user_id in relevant_users])
        
        absolute_sum = np.sum(np.absolute(correlated_weights))
        k = 1
        if (absolute_sum != 0):
            k = 1 / absolute_sum
        weight_sum = np.sum(correlated_weights * voting_avg_diff)
        return user_avg + k * weight_sum

    def Average_Vote(self, votes):
        return np.sum(votes) / len(votes)

    def Get_Common_Movies(self, tui, rui):
        return set(self.movie_ids[tui]).intersection(set(self.movie_ids[rui]))

    def Pearson_Coefficient(self, user_avg, other_user_avg, user_votes, other_user_votes, common_movies):
        if (len(common_movies) == 0):
            return 0
        user_calc_diff = np.array([user_votes[common_movie] - user_avg for common_movie in common_movies])
        other_user_calc_diff = np.array([other_user_votes[common_movie] - other_user_avg for common_movie in common_movies])
        numerator   = np.sum(user_calc_diff * other_user_calc_diff)
        denominator = np.sqrt(np.sum(user_calc_diff**2) * np.sum(other_user_calc_diff**2))
        if (denominator == 0):
            return 0
        return numerator / denominator

In [126]:
def Mean_Absolute_Error(predictions, target):
    return np.sum(np.absolute(predictions - target)) / len(predictions)

def Root_Mean_Square_Error(predictions, target):
    return np.sqrt( np.sum((predictions - target) ** 2) / len(predictions) )

In [130]:
start_time = time.time()
a = Collaborative_Model(netflix_training_df)
print (time.time() - start_time)

159.561586142


In [131]:
predictions = a.Predict_Data(netflix_testing_df[0:100])

Percent: [##########] 100%  
Done...


In [134]:
Mean_Absolute_Error(predictions, netflix_training_df['Rating'][0:100])
Root_Mean_Square_Error(predictions, netflix_training_df['Rating'][0:100])

1.5317402050852222

In [114]:
tuple(zip(predictions, netflix_training_df['Rating'][0:1000]))

((3.0884107119279536, 1.0),
 (3.1643941525774877, 2.0),
 (2.5490448258358653, 4.0),
 (2.813262670596346, 4.0),
 (2.795938956104415, 1.0),
 (2.8860634700671945, 4.0),
 (2.869671416951704, 4.0),
 (3.7591256331370486, 3.0),
 (3.522190553005384, 4.0),
 (3.13121140924537, 4.0),
 (2.7754908405676515, 4.0),
 (0.8365747504523375, 3.0),
 (2.7041806268403126, 2.0),
 (1.0, 5.0),
 (3.673616665664483, 3.0),
 (3.1470258063597836, 4.0),
 (3.119053963448188, 4.0),
 (2.926664262137881, 4.0),
 (3.082561000551381, 3.0),
 (2.8994872641678713, 5.0),
 (3.8039207985147634, 4.0),
 (3.1501653717318523, 4.0),
 (3.854699606242469, 1.0),
 (3.0882933426897976, 5.0),
 (3.9160036161885077, 2.0),
 (2.795623665746612, 4.0),
 (3.828525327059768, 2.0),
 (3.108890521342397, 4.0),
 (3.3923573262668762, 3.0),
 (2.4003079648187766, 4.0),
 (2.5709180103668667, 4.0),
 (3.3059583448123675, 2.0),
 (2.7821346370738507, 2.0),
 (2.6237048532405707, 1.0),
 (3.568314623405911, 4.0),
 (4.2566024339756785, 1.0),
 (4.02163220097662, 3.