In [227]:
from __future__ import division
import pandas as pd
import numpy  as np
import re
import copy
import sys

def update_progress(progress):
    
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "\nHalt...\r\n"
    if progress >= 1:
        progress = 1
        status = "\nDone...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

In [221]:
movie_titles = open("movie_titles.txt").read()
netflix_training = open("TrainingRatings.txt").read()
netflix_testing = open("TestingRatings.txt").read()

In [222]:
# Tranform text files into DataFrames
movie_titles_df     = pd.DataFrame([i.split(',',2) for i in re.sub('[\r]', '', movie_titles).split('\n')][:-1])
netflix_training_df = pd.DataFrame([i.split(',',2) for i in re.sub('[\r]', '', netflix_training).split('\n')[:-1]])
netflix_testing_df  = pd.DataFrame([i.split(',',2) for i in re.sub('[\r]', '', netflix_testing).split('\n')[:-1]])
movie_titles_df.columns = ['MovieID', 'YearOfRelease', 'Title']
netflix_testing_df.columns = netflix_training_df.columns = ['MovieID', 'UserID', 'Rating']

In [223]:
# Transform all relevant string values to int
movie_titles_df['MovieID'] = movie_titles_df['MovieID'].astype(int)

netflix_testing_df['MovieID'] = netflix_testing_df['MovieID'].astype(int)
netflix_testing_df['UserID'] = netflix_testing_df['UserID'].astype(int)
netflix_testing_df['Rating'] = netflix_testing_df['Rating'].astype(np.float64)

netflix_training_df['MovieID'] = netflix_training_df['MovieID'].astype(int)
netflix_training_df['UserID'] = netflix_training_df['UserID'].astype(int)
netflix_training_df['Rating'] = netflix_training_df['Rating'].astype(np.float64)

In [231]:
import time

class Collaborative_Model:
    user_ids  = None
    movie_ids = None
    ratings   = None
    ui        = None
    all_users_avg  = None
    all_users_vote = None
    
    def __init__(self, train):
        self.Prepare_Data(train)
    
    def Prepare_Data(self, train):
        self.user_ids  = train['UserID'].values
        self.movie_ids = train['MovieID'].values
        self.ratings   = train['Rating'].values
        unique_users = np.unique(self.user_ids)
        self.ui = {user_id : np.argwhere(user_id == self.user_ids).flatten()
                   for user_id in unique_users}
        self.all_users_avg = {user_id : Average_Vote(self.ratings[np.argwhere(user_id == self.user_ids)])
                              for user_id in unique_users}
        #Got issueshere
        self.all_users_vote = {user_id :
                               {movie_id : self.ratings[self.ui[user_id]][np.argwhere(self.movie_ids[self.ui[user_id]] == movie_id)][0][0] 
                                for movie_id in self.movie_ids[self.ui[user_id]]}
                               for user_id in unique_users}
        
    def Predict_Data(self, test):
        test_ids       = test['UserID'].values
        test_movie_ids = test['MovieID'].values
        arr = []
        for i,target_user_id in enumerate(test_ids):
            update_progress(i / (len(test_ids) - 1))
            arr.append(self.Predict_User_Vote(target_user_id, test_movie_ids[i]))
        return arr

    def Predict_User_Vote(self, target_user_id, target_movie_id):
        user_avg   = self.all_users_avg[target_user_id]
        user_votes = self.all_users_vote[target_user_id]
        other_users_votes = self.all_users_vote
        other_users_avg   = self.all_users_avg
        tui = self.ui[target_user_id]
        rui = self.ui
        relevant_users = np.unique(np.array([self.user_ids[i] for i in 
                                             np.argwhere(self.movie_ids == target_movie_id).flatten()]))
        correlated_weights = np.array(
            [self.Correlation_Weight(user_avg, other_users_avg[user_id], user_votes,other_users_votes[user_id], 
                                     self.Get_Common_Movies(tui, rui[user_id])) 
             for user_id in relevant_users]
        )
        voting_avg_diff = np.array(
            [other_users_votes[user_id][target_movie_id] - other_users_avg[user_id]
             for user_id in relevant_users]
        )
        k = 1 / np.sum(np.absolute(correlated_weights))
        weight_sum = np.sum(correlated_weights * voting_avg_diff)
        return user_avg + k * weight_sum

    def Average_Vote(self, votes):
        return np.sum(votes) / len(votes)

    def Get_Common_Movies(self, tui, rui):
        return np.intersect1d(self.movie_ids[tui], self.movie_ids[rui])

    def Correlation_Weight(self, user_avg, other_user_avg, user_votes, other_user_votes, common_movies):
        if (len(common_movies) == 0):
            return 0
        numerator   = np.sum( [ (user_votes[common_movie] - user_avg) 
                               * (other_user_votes[common_movie] - other_user_avg) 
                               for common_movie in common_movies])
        if (numerator == 0):
            return 0
        denominator = np.sqrt(np.sum( [(user_votes[common_movie] - user_avg)**2 for common_movie in common_movies]) *
                              np.sum( [(other_user_votes[common_movie] - other_user_avg)**2 for common_movie in common_movies]
                                    ))
        return numerator / denominator

In [232]:
start_time = time.time()
a = Collaborative_Model(netflix_training_df)
print (time.time() - start_time)

118.888456106


In [234]:
a = a.Predict_Data(netflix_testing_df[0:100])

Percent: [#---------] 14.1414141414% 



Percent: [##########] 100% 98989899% 
Done...


In [239]:
np.round(a)

array([ 3.,  3.,  3.,  3.,  3.,  3.,  3.,  4.,  4.,  3.,  3.,  1.,  3.,
       nan,  4.,  3.,  3.,  3.,  3.,  3.,  4.,  3.,  4.,  3.,  4.,  3.,
        4.,  3.,  3.,  2.,  3.,  3.,  3.,  3.,  4.,  4.,  4.,  3.,  3.,
        3.,  3.,  4.,  3.,  3.,  3.,  4.,  3.,  2.,  3.,  3.,  3.,  3.,
        4.,  4.,  4.,  3.,  3.,  4.,  3.,  3.,  3.,  4.,  3.,  3.,  3.,
        3.,  3.,  2.,  4.,  3.,  4.,  3.,  2.,  3.,  4.,  3.,  3.,  4.,
        3.,  4.,  3.,  3.,  3.,  3.,  3.,  4.,  4.,  4.,  3.,  4.,  3.,
        2.,  2.,  3.,  3.,  4.,  4.,  4.,  3.,  4.])

In [244]:
sum(netflix_testing_df['Rating'][0:100] == np.round(a)) / 100

0.39