In [190]:
from __future__ import division

import copy
import datetime
import re
import sys
import time

import numpy as np
import pandas as pd

def update_progress(progress, seconds=None):
    """Redraw a single-line text progress bar on stdout.

    Args:
        progress: Completed fraction. Ints are converted to float; any other
            non-float value is reported as an error and shown as 0. Values
            below 0 are clamped to 0 ("Halt") and values of 1 or more are
            clamped to 1 ("Done").
        seconds: Optional elapsed time in seconds. When given it is appended
            as "Time Taken: H:MM:SS"; when omitted no time is shown, so
            callers that only track the fraction keep working.
    """
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "\nHalt...\r\n"
    if progress >= 1:
        progress = 1
        status = "\nDone...\r\n"
    block = int(round(barLength*progress))
    bar = "#"*block + "-"*(barLength-block)
    # The leading "\r" returns to the start of the line so each call
    # overwrites the previously drawn bar.
    text = "\rPercent: [{0}] {1}% {2}".format(bar, progress*100, status)
    if seconds is not None:
        # timedelta's first positional argument is *days*; the keyword is
        # required for the value to be interpreted as seconds.
        elapsed = str(datetime.timedelta(seconds=seconds))
        text += ", Time Taken: {0}".format(elapsed)
    sys.stdout.write(text)
    sys.stdout.flush()

In [2]:
# Read each raw data file fully into memory. The `with` blocks close the
# file handles as soon as the text has been read.
with open("movie_titles.txt") as movie_titles_file:
    movie_titles = movie_titles_file.read()
with open("TrainingRatings.txt") as netflix_training_file:
    netflix_training = netflix_training_file.read()
with open("TestingRatings.txt") as netflix_testing_file:
    netflix_testing = netflix_testing_file.read()

In [4]:
# Transform text files into DataFrames
def _parse_records(raw_text):
    """Split raw comma-separated text into one list of fields per line.

    Carriage returns are stripped, blank lines are skipped, and each line is
    split on at most the first two commas so that a third field containing
    commas stays intact. Skipping blank lines (instead of unconditionally
    dropping the final line) keeps the last record when the text does not
    end with a newline.
    """
    lines = re.sub('[\r]', '', raw_text).split('\n')
    return [line.split(',', 2) for line in lines if line.strip()]

movie_titles_df     = pd.DataFrame(_parse_records(movie_titles))
netflix_training_df = pd.DataFrame(_parse_records(netflix_training))
netflix_testing_df  = pd.DataFrame(_parse_records(netflix_testing))
movie_titles_df.columns = ['MovieID', 'YearOfRelease', 'Title']
netflix_testing_df.columns = netflix_training_df.columns = ['MovieID', 'UserID', 'Rating']

In [5]:
# Cast the identifier columns to int and the rating columns to float64
movie_titles_df['MovieID'] = movie_titles_df['MovieID'].astype(int)

# Both rating frames share the same three columns, so cast them in one loop
# (testing frame first, then training, each in column order).
rating_column_types = [('MovieID', int), ('UserID', int), ('Rating', np.float64)]
for ratings_frame in (netflix_testing_df, netflix_training_df):
    for column_name, column_type in rating_column_types:
        ratings_frame[column_name] = ratings_frame[column_name].astype(column_type)

In [39]:
train = netflix_training_df
# Preview: one {MovieID: Rating} dict per user. Columns are selected with a
# list (double brackets); the bare tuple form is rejected by newer pandas.
train.groupby('UserID')[['MovieID', 'Rating']].apply(lambda x: dict(x.values.tolist()))

UserID
7          {9728.0: 4.0, 12293.0: 5.0, 8.0: 5.0, 10255.0:...
79         {9728.0: 3.0, 12293.0: 4.0, 1202.0: 3.0, 6408....
199        {9728.0: 3.0, 15712.0: 3.0, 12293.0: 4.0, 6408...
481        {13298.0: 5.0, 14209.0: 4.0, 16082.0: 5.0, 108...
769        {9728.0: 2.0, 3904.0: 4.0, 12293.0: 5.0, 13326...
906        {5760.0: 5.0, 8384.0: 3.0, 12292.0: 4.0, 12293...
1310       {9728.0: 2.0, 14144.0: 3.0, 12293.0: 4.0, 1480...
1333       {9728.0: 4.0, 13823.0: 2.0, 12292.0: 3.0, 4613...
1427       {9728.0: 4.0, 14144.0: 5.0, 10774.0: 4.0, 518....
1442       {9728.0: 4.0, 12293.0: 5.0, 10774.0: 4.0, 5656...
1457       {9728.0: 2.0, 8384.0: 4.0, 6917.0: 2.0, 6408.0...
1500       {9728.0: 4.0, 1046.0: 4.0, 5656.0: 1.0, 28.0: ...
1527       {9728.0: 3.0, 14144.0: 5.0, 16770.0: 5.0, 1581...
1918       {9728.0: 5.0, 16150.0: 3.0, 17324.0: 2.0, 1608...
2000       {9728.0: 5.0, 8107.0: 4.0, 12293.0: 5.0, 7186....
2128       {9728.0: 3.0, 8512.0: 4.0, 12293.0: 5.0, 16286...
2213       {9728.

In [183]:
import time

class Collaborative_Model:
    """Memory-based collaborative filter weighted by Pearson correlation.

    A rating for (user, movie) is predicted as the user's mean training
    rating plus a normalised, correlation-weighted sum of how far every other
    user who rated that movie deviated from their own mean rating.
    """
    # Parallel arrays with one entry per training row (set by Prepare_Data).
    user_ids  = None
    movie_ids = None
    ratings   = None
    # Maps each user id to the row positions of that user's training rows.
    ui        = None
    # Maps each user id to the mean of that user's training ratings.
    all_users_avg  = None
    # Maps each user id to a {movie id: rating} dict of that user's ratings.
    all_users_vote = None

    def __init__(self, train):
        """Build all lookup structures from the `train` DataFrame."""
        self.Prepare_Data(train)

    def Prepare_Data(self, train):
        """Index the training ratings for fast per-user lookups.

        Args:
            train: DataFrame with 'UserID', 'MovieID' and 'Rating' columns.
        """
        self.user_ids  = train['UserID'].values
        self.movie_ids = train['MovieID'].values
        self.ratings   = train['Rating'].values

        # Group row positions by user with a single stable sort instead of
        # scanning the whole array once per user. The stable sort keeps the
        # positions of each user in ascending order.
        order = np.argsort(self.user_ids, kind='mergesort')
        unique_users, starts = np.unique(self.user_ids[order], return_index=True)
        self.ui = dict(zip(unique_users, np.split(order, starts[1:])))

        self.all_users_avg = train.groupby('UserID')['Rating'].mean().to_dict()

        # Build {user: {movie: rating}} directly from the parallel arrays.
        # A later duplicate (user, movie) row overwrites an earlier one.
        votes = {}
        rows = zip(self.user_ids.tolist(), self.movie_ids.tolist(), self.ratings.tolist())
        for user_id, movie_id, rating in rows:
            votes.setdefault(user_id, {})[movie_id] = rating
        self.all_users_vote = votes

    def Predict_Data(self, test):
        """Predict a rating for every row of `test`.

        Args:
            test: DataFrame with 'UserID' and 'MovieID' columns.

        Returns:
            List of predicted ratings, in the row order of `test`.

        Raises:
            KeyError: If a test user id does not occur in the training data.
        """
        test_ids       = test['UserID'].values
        test_movie_ids = test['MovieID'].values
        total = len(test_ids)
        started = time.time()
        arr = []
        for i, uid in enumerate(test_ids):
            arr.append(self.Predict_User_Vote(uid, test_movie_ids[i]))
            # update_progress is declared as (progress, seconds); report the
            # completed fraction together with the elapsed wall-clock time.
            update_progress(round((i + 1) / float(total), 3), time.time() - started)
        return arr

    def Predict_User_Vote(self, uid, tmi):
        """Predict the rating user `uid` would give movie `tmi`.

        Returns the user's mean rating when no training user rated `tmi` or
        when every correlation is zero.

        Raises:
            KeyError: If `uid` does not occur in the training data.
        """
        user_avg   = self.all_users_avg[uid]
        user_votes = self.all_users_vote[uid]
        tui = self.ui[uid]
        # Distinct training users who rated the target movie.
        users = np.unique(self.user_ids[self.movie_ids == tmi])

        correlation = np.array([
            self.Pearson_Coefficient(user_avg, self.all_users_avg[user_id],
                                     user_votes, self.all_users_vote[user_id],
                                     self.Get_Common_Movies(tui, self.ui[user_id]))
            for user_id in users])

        # How far each of those users' rating of `tmi` is from their own mean.
        voting_avg_diff = np.array([self.all_users_vote[user_id][tmi] - self.all_users_avg[user_id]
                                    for user_id in users])

        absolute_sum = np.sum(np.absolute(correlation))
        # Normalising factor; left at 1 when all weights are zero, in which
        # case the weighted sum below is zero as well.
        k = 1
        if absolute_sum != 0:
            k = 1 / absolute_sum
        weight_sum = np.sum(correlation * voting_avg_diff)
        return user_avg + k * weight_sum

    def Average_Vote(self, votes):
        """Return the arithmetic mean of `votes`."""
        return np.sum(votes) / len(votes)

    def Get_Common_Movies(self, tui, rui):
        """Return the set of movie ids present at both sets of row positions."""
        return set(self.movie_ids[tui]) & set(self.movie_ids[rui])

    def Pearson_Coefficient(self, user_avg, other_user_avg, user_votes, other_user_votes, common_movies):
        """Correlation between two users over the movies both have rated.

        Deviations are taken from the supplied per-user averages. Returns 0
        when there are no common movies or when either user has zero
        variance over them.
        """
        if len(common_movies) == 0:
            return 0
        A = np.array([user_votes[common_movie] for common_movie in common_movies]) - user_avg
        B = np.array([other_user_votes[common_movie] for common_movie in common_movies]) - other_user_avg
        numerator   = np.sum(A * B)
        denominator = np.sqrt(np.sum(A**2) * np.sum(B**2))
        if denominator == 0:
            return 0
        return numerator / denominator

In [184]:
def Mean_Absolute_Error(predictions, target):
    """Return the mean absolute difference between predictions and target.

    Both arguments may be any equal-length sequence (list, ndarray, Series).
    They are converted to float arrays first, so plain Python lists work and
    elements are compared by position rather than by pandas index label.
    """
    predictions = np.asarray(predictions, dtype=np.float64)
    target = np.asarray(target, dtype=np.float64)
    return np.sum(np.absolute(predictions - target)) / len(predictions)

def Root_Mean_Square_Error(predictions, target):
    """Return the root of the mean squared difference between predictions and target.

    Both arguments may be any equal-length sequence (list, ndarray, Series).
    They are converted to float arrays first, so plain Python lists work and
    elements are compared by position rather than by pandas index label.
    """
    predictions = np.asarray(predictions, dtype=np.float64)
    target = np.asarray(target, dtype=np.float64)
    return np.sqrt( np.sum((predictions - target) ** 2) / len(predictions) )

In [185]:
# Parenthesised print emits the same text under Python 2 and Python 3.
print("Preparing data into optimal data structures...")
predictor = Collaborative_Model(netflix_training_df)

Preparing data into optimal data structures...


In [186]:
# Time a prediction pass over the first 100 rows of the test set
start_time = time.time()
first_test_rows = netflix_testing_df[0:100]
predictions = predictor.Predict_Data(first_test_rows)
elapsed_seconds = time.time() - start_time
print (elapsed_seconds)

Percent: [##########] 100%  
Done...
19.9057178497


In [172]:
# Score only the test rows that were actually predicted; `predictions` may
# cover just a leading slice of the test set.
Root_Mean_Square_Error(predictions, netflix_testing_df['Rating'].values[:len(predictions)])

1.2381345214865886

In [173]:
# Predictions come from the test set, so they are scored against the test
# ratings (not the training ratings), limited to the rows that were predicted.
Mean_Absolute_Error(predictions, netflix_testing_df['Rating'].values[:len(predictions)])

0.7705319973258226