In [1]:
from __future__ import division
import pandas as pd
import numpy  as np
import re
import copy

In [2]:
def _read_text(path):
    """Return the entire contents of the text file at `path` as one string.

    The file is opened inside a `with` block so the handle is closed as soon
    as the contents have been read, instead of being left open until the
    interpreter happens to collect it.
    """
    with open(path) as handle:
        return handle.read()


# Raw contents of the three input files, each held as a single string.
movie_titles = _read_text("movie_titles.txt")
netflix_training = _read_text("TrainingRatings.txt")
netflix_testing = _read_text("TestingRatings.txt")

In [3]:
# Transform text files into DataFrames
def _parse_records(raw_text):
    """Split raw file text into a list of rows with at most three fields each.

    Carriage returns are stripped, the text is split on newlines, and every
    non-blank line is split on its first two commas only, so any further
    commas stay inside the third field. Blank lines are skipped explicitly
    rather than assuming the text ends with exactly one trailing newline, so
    the final record is kept when that newline is missing and no empty row
    is produced when there are several.
    """
    lines = raw_text.replace('\r', '').split('\n')
    return [line.split(',', 2) for line in lines if line.strip()]


# Column names are given to the constructor so an empty input still yields a
# correctly labelled (empty) frame.
movie_titles_df     = pd.DataFrame(_parse_records(movie_titles),
                                   columns=['MovieID', 'YearOfRelease', 'Title'])
netflix_training_df = pd.DataFrame(_parse_records(netflix_training),
                                   columns=['MovieID', 'UserID', 'Rating'])
netflix_testing_df  = pd.DataFrame(_parse_records(netflix_testing),
                                   columns=['MovieID', 'UserID', 'Rating'])

In [4]:
# Cast the id columns from strings to int and the ratings to float64.
movie_titles_df['MovieID'] = movie_titles_df['MovieID'].astype(int)

# Both rating frames share the same three columns, so they are cast with the
# same (column, dtype) plan; the testing frame is converted first.
for frame in (netflix_testing_df, netflix_training_df):
    for column, dtype in (('MovieID', int), ('UserID', int), ('Rating', np.float64)):
        frame[column] = frame[column].astype(dtype)

In [27]:
import time

def Collaborative_Predict_Votes(train, test):
    """Build the per-user lookup tables used for collaborative filtering.

    Parameters
    ----------
    train : DataFrame
        Must provide 'UserID', 'MovieID' and 'Rating' columns.
    test : DataFrame
        Must provide 'UserID' and 'MovieID' columns.

    Returns
    -------
    dict
        Maps every user id found in `train` to the mean of that user's
        ratings. Per-row prediction through Predict_User_Vote is currently
        disabled (see the commented-out return at the bottom), so the rows
        of `test` are read but not scored.
    """
    user_ids  = train['UserID'].values
    movie_ids = train['MovieID'].values
    ratings   = train['Rating'].values

    # Group the row positions of every user with a single stable sort instead
    # of scanning the whole training set once per user. Stability keeps the
    # positions inside each group in ascending order.
    order = np.argsort(user_ids, kind='mergesort')
    unique_users, group_starts = np.unique(user_ids[order], return_index=True)
    rows_per_user = np.split(order, group_starts[1:])

    # user id -> positions of that user's rows in the training arrays
    ui = dict(zip(unique_users, rows_per_user))
    # user id -> mean rating given by that user
    all_users_avg = {user_id : ratings[rows].mean()
                     for user_id, rows in zip(unique_users, rows_per_user)}
    # user id -> {movie id : rating given by *that* user}. Looking the rating
    # up by movie id alone would return whichever user rated the movie first.
    all_users_vote = {user_id : dict(zip(movie_ids[rows], ratings[rows]))
                      for user_id, rows in zip(unique_users, rows_per_user)}
    test_ids       = test['UserID'].values
    test_movie_ids = test['MovieID'].values
    #return [Predict_User_Vote(all_users_vote, all_users_avg, movie_ids, user_ids, ui, target_user_id, test_movie_ids[i])
            #for i, target_user_id in enumerate(test_ids)]
    return all_users_avg
        

def Predict_User_Vote(all_users_vote, all_users_avg, movie_ids, user_ids, ui, target_user_id, target_movie_id):
    """Predict the rating `target_user_id` would give `target_movie_id`.

    The prediction is the target user's average rating plus a normalised,
    correlation-weighted sum of how far every other user who rated the movie
    deviated from their own average:

        user_avg + k * sum_i( w_i * (vote_i - avg_i) ),   k = 1 / sum_i |w_i|

    where w_i comes from Correlation_Weight.

    Parameters
    ----------
    all_users_vote : dict
        user id -> {movie id : rating}.
    all_users_avg : dict
        user id -> average rating of that user.
    movie_ids, user_ids : numpy arrays
        Parallel arrays holding the movie id and user id of each training row.
    ui : dict
        user id -> positions of that user's rows in `movie_ids`.
    target_user_id, target_movie_id
        The (user, movie) pair to predict.

    Returns
    -------
    The predicted rating. Falls back to the target user's average when no
    other user rated the movie or every correlation weight is zero.

    Raises
    ------
    KeyError
        If `target_user_id` is missing from the lookup tables.
    """
    user_avg   = all_users_avg[target_user_id]
    user_votes = all_users_vote[target_user_id]
    tui = ui[target_user_id]

    # Everyone who rated the target movie, except the target user: their own
    # rating must not vote for itself. The shared lookup tables are indexed
    # directly and never copied or mutated.
    relevant_users = np.unique(np.asarray(user_ids)[np.asarray(movie_ids) == target_movie_id])
    relevant_users = relevant_users[relevant_users != target_user_id]

    correlated_weights = np.array([Correlation_Weight(user_avg, all_users_avg[user_id], user_votes,
                                                      all_users_vote[user_id],
                                                      Get_Common_Movies(tui, ui[user_id], movie_ids))
                                   for user_id in relevant_users], dtype=float)
    voting_avg_diff = np.array([all_users_vote[user_id][target_movie_id] - all_users_avg[user_id]
                                for user_id in relevant_users], dtype=float)

    weight_norm = np.sum(np.absolute(correlated_weights))
    if weight_norm == 0:
        # No neighbours, or none that correlate: normalising would divide by zero.
        return user_avg
    k = 1.0 / weight_norm
    weight_sum = np.sum(correlated_weights * voting_avg_diff)
    return user_avg + k * weight_sum
    
def Average_Vote(votes):
    """Return the sum of all entries in `votes` divided by `len(votes)`."""
    vote_total = np.sum(votes)
    vote_count = len(votes)
    return vote_total / vote_count
    
def Get_Common_Movies(tui, rui, movie_ids):
    """Return the sorted, unique movie ids present at both sets of positions.

    `tui` and `rui` are index collections into `movie_ids`; the ids selected
    by each are intersected with `np.intersect1d`.
    """
    first_selection  = movie_ids[tui]
    second_selection = movie_ids[rui]
    return np.intersect1d(first_selection, second_selection)

def Correlation_Weight(user_avg, other_user_avg, user_votes, other_user_votes, common_movies):
    """Return the correlation weight between two users over `common_movies`.

    Each vote is taken as a deviation from the average passed in for its
    user. The weight is the sum of the products of paired deviations divided
    by the square root of the product of the two sums of squared deviations.
    Returns 0 when there are no common movies or when the sum of products is
    exactly zero.
    """
    if len(common_movies) == 0:
        return 0

    own_deviations   = [user_votes[movie] - user_avg for movie in common_movies]
    other_deviations = [other_user_votes[movie] - other_user_avg for movie in common_movies]

    cross_sum = np.sum([own * other for own, other in zip(own_deviations, other_deviations)])
    if cross_sum == 0:
        return 0

    own_squares   = np.sum([own ** 2 for own in own_deviations])
    other_squares = np.sum([other ** 2 for other in other_deviations])
    return cross_sum / np.sqrt(own_squares * other_squares)

In [None]:
# Run the pipeline against the first three test rows only; the bare name on
# the last line makes the notebook display the result.
a = Collaborative_Predict_Votes(train=netflix_training_df,
                                test=netflix_testing_df[:3])
a

In [34]:
# Distinct user ids in the training data, in order of first appearance.
pd.unique(netflix_training_df['UserID'])

array([1744889, 1395430, 1205593, ...,  571547, 2080843, 2383861])