In [1]:
from __future__ import division
import pandas as pd
import numpy  as np
import re

In [2]:
# Read each raw data file fully into memory. The context managers close the
# file handles as soon as the text has been read instead of leaving them open
# for the lifetime of the kernel.
with open("movie_titles.txt") as titles_file:
    movie_titles = titles_file.read()
with open("TrainingRatings.txt") as training_file:
    netflix_training = training_file.read()
with open("TestingRatings.txt") as testing_file:
    netflix_testing = testing_file.read()

In [3]:
# Transform text files into DataFrames
def _Parse_Rows(raw_text):
    """Split raw file text into one list of up to three fields per non-empty line.

    Carriage returns are stripped, the text is split on newlines, and every
    non-empty line is split on its first two commas only, so any further
    commas stay inside the third field.  Empty lines are skipped rather than
    blindly dropping the last element, so the final record survives even when
    the file does not end with a newline.
    """
    lines = re.sub('[\r]', '', raw_text).split('\n')
    return [line.split(',', 2) for line in lines if line]

movie_titles_df     = pd.DataFrame(_Parse_Rows(movie_titles))
netflix_training_df = pd.DataFrame(_Parse_Rows(netflix_training))
netflix_testing_df  = pd.DataFrame(_Parse_Rows(netflix_testing))
movie_titles_df.columns = ['MovieID', 'YearOfRelease', 'Title']
netflix_training_df.columns = ['MovieID', 'UserID', 'Rating']
netflix_testing_df.columns = ['MovieID', 'UserID', 'Rating']

In [4]:
# Cast the parsed string columns to numeric dtypes: IDs become int and
# ratings become float64.  The frames are updated in place, column by column.
movie_titles_df['MovieID'] = movie_titles_df['MovieID'].astype(int)

for ratings_df in (netflix_testing_df, netflix_training_df):
    for column, dtype in (('MovieID', int), ('UserID', int), ('Rating', np.float64)):
        ratings_df[column] = ratings_df[column].astype(dtype)

In [230]:
def Average_Vote(votes):
    """Return the arithmetic mean of ``votes``: its NumPy sum divided by ``len(votes)``."""
    total = np.sum(votes)
    count = len(votes)
    return total / count

def Predict_User_Vote(train_df, user_id, movie_id):
    """Predict the rating ``user_id`` would give ``movie_id`` using ``train_df``.

    The prediction is the user's mean rating plus a weighted sum, over every
    other user who rated ``movie_id``, of that user's deviation from their own
    mean on ``movie_id``.  Each weight is the correlation returned by
    ``Correlation_Weight_Between_User``.  The weighted sum is scaled by
    ``1 / sum(|weight|)``, the same normalisation ``Predict_User_Vote_2``
    applies, instead of the previous placeholder factor of 1.

    Parameters:
        train_df: DataFrame with 'MovieID', 'UserID' and 'Rating' columns.
        user_id:  ID of the user to predict for.
        movie_id: ID of the movie to predict.

    Returns:
        The predicted rating.  When no other user rated the movie, or every
        correlation weight is zero, the user's mean rating is returned
        unchanged.
    """
    user = train_df[train_df['UserID'] == user_id]
    user_avg = Average_Vote(user['Rating'])
    other_users = train_df[train_df['UserID'] != user_id]
    relevant_users = other_users[other_users['MovieID'] == movie_id]['UserID'].unique()
    # Deviation of each of the user's own votes from their mean, keyed by movie.
    user_votes = {rated_movie_id: Vote_Minus_Mean(user, user_avg, rated_movie_id)
                  for rated_movie_id in user['MovieID']}

    weights = []
    deviations = []
    for other_user_id in relevant_users:
        other_user = other_users[other_users['UserID'] == other_user_id]
        other_user_avg = Average_Vote(other_user['Rating'])
        target_vote = other_user[other_user['MovieID'] == movie_id]['Rating'].iloc[0]
        weights.append(Correlation_Weight_Between_User(user, user_avg, user_votes,
                                                       other_user, other_user_avg))
        deviations.append(target_vote - other_user_avg)

    weights = np.asarray(weights, dtype=np.float64)
    deviations = np.asarray(deviations, dtype=np.float64)
    total_weight = np.sum(np.absolute(weights))
    if total_weight == 0:
        # No usable neighbours: fall back to the user's own mean rather than
        # dividing by zero.
        return user_avg
    k = 1 / total_weight  # Normalising factor so the absolute weights sum to 1
    return user_avg + k * np.sum(weights * deviations)

def Get_Correlation_Prediction(other_users, user, user_avg, user_votes, other_user_id, movie_id):
    """Return one neighbour's contribution to the prediction for ``movie_id``.

    The contribution is the correlation weight between ``user`` and the
    neighbour identified by ``other_user_id``, multiplied by how far the
    neighbour's first recorded rating of ``movie_id`` lies from the
    neighbour's own mean rating.
    """
    neighbour = other_users[other_users['UserID'] == other_user_id]
    neighbour_avg = Average_Vote(neighbour['Rating'])
    neighbour_vote = neighbour[neighbour['MovieID'] == movie_id]['Rating'].iloc[0]
    deviation = neighbour_vote - neighbour_avg
    weight = Correlation_Weight_Between_User(user, user_avg, user_votes, neighbour, neighbour_avg)
    return weight * deviation

def Correlation_Weight_Between_User(user, user_avg, user_votes, other_user, other_user_avg):
    """Return the correlation weight between two users over their common movies.

    Parameters:
        user:           DataFrame rows of the first user (needs 'MovieID').
        user_avg:       mean rating of the first user (unused here; the
                        deviations in ``user_votes`` already account for it).
        user_votes:     dict mapping movie ID to the first user's rating minus
                        their mean.
        other_user:     DataFrame rows of the second user ('MovieID', 'Rating').
        other_user_avg: mean rating of the second user.  It is used as passed;
                        it is no longer silently recomputed and overwritten.

    Returns:
        0 when the users share no movie or when the summed product of their
        deviations is 0; otherwise that sum divided by the square root of the
        product of the two sums of squared deviations.
    """
    common_movies = set(user['MovieID']).intersection(set(other_user['MovieID']))
    if not common_movies:
        return 0
    other_user_votes = {movie_id: Vote_Minus_Mean(other_user, other_user_avg, movie_id)
                        for movie_id in common_movies}
    numerator = np.sum([user_votes[movie_id] * other_user_votes[movie_id]
                        for movie_id in common_movies])
    if numerator == 0:
        # Also covers the case where the denominator would be 0: a zero sum of
        # squares forces every product in the numerator to be 0.
        return 0
    denominator = np.sqrt(np.sum([user_votes[movie_id]**2 for movie_id in common_movies]) *
                          np.sum([other_user_votes[movie_id]**2 for movie_id in common_movies]))
    return numerator / denominator
    
def Vote_Minus_Mean(user, user_avg, movie_id):
    """Return the first 'Rating' in ``user`` for ``movie_id``, minus ``user_avg``."""
    matching_ratings = user.loc[user['MovieID'] == movie_id, 'Rating']
    first_rating = matching_ratings.iloc[0]
    return first_rating - user_avg

def Predict_User_Vote_2(ratings, movie_ids, user_ids, target_user_id, target_movie_id):
    """Predict the target user's rating of the target movie from parallel arrays.

    ``ratings``, ``movie_ids`` and ``user_ids`` are indexed together: position
    ``i`` of each describes one rating record.  They are expected to be NumPy
    arrays of equal length, because they are indexed with index arrays.

    The prediction is the target user's mean rating plus the normalised,
    correlation-weighted sum of the other raters' deviations from their own
    means on the target movie.

    Parameters:
        ratings:         rating values.
        movie_ids:       movie ID of each rating.
        user_ids:        user ID of each rating.
        target_user_id:  user to predict for.
        target_movie_id: movie to predict.

    Returns:
        The predicted rating.  When nobody else rated the movie, or every
        correlation weight is 0, the target user's mean rating is returned
        instead of the NaN a division by zero would produce.
    """
    # Record positions belonging to the target user.
    tui = np.argwhere(user_ids == target_user_id).flatten()
    # Users other than the target user who rated the target movie.
    movie_rows = np.argwhere(target_movie_id == movie_ids).flatten()
    relevant_users = np.unique(user_ids[movie_rows])
    relevant_users = relevant_users[relevant_users != target_user_id]
    # Record positions of each relevant user.
    rui = {user_id: np.argwhere(user_id == user_ids).flatten() for user_id in relevant_users}

    user_avg = Average_Vote(ratings[tui])
    other_users_avg = {user_id: Average_Vote(ratings[rui[user_id]]) for user_id in relevant_users}
    user_vote = _First_Vote_Per_Movie(ratings[tui], movie_ids[tui])
    other_users_vote = {user_id: _First_Vote_Per_Movie(ratings[rui[user_id]], movie_ids[rui[user_id]])
                        for user_id in relevant_users}

    correlated_weights = np.array([Correlation_Weight(user_avg, other_users_avg[user_id], user_vote,
                                                      other_users_vote[user_id],
                                                      Get_Common_Movies(tui, rui[user_id], movie_ids))
                                   for user_id in relevant_users], dtype=np.float64)
    voting_avg_diff = np.array([(other_users_vote[user_id][target_movie_id] - other_users_avg[user_id])
                                for user_id in relevant_users], dtype=np.float64)

    total_weight = np.sum(np.absolute(correlated_weights))
    if total_weight == 0:
        # No neighbour carries any weight; avoid 1 / 0 and fall back to the mean.
        return user_avg
    k = 1 / total_weight
    weight_sum = np.sum(correlated_weights * voting_avg_diff)
    return user_avg + k * weight_sum


def _First_Vote_Per_Movie(user_ratings, user_movie_ids):
    """Map each movie ID to the first rating recorded for it, in a single pass."""
    votes = {}
    for movie_id, rating in zip(user_movie_ids, user_ratings):
        # setdefault keeps the earliest rating when a movie appears twice.
        votes.setdefault(movie_id, rating)
    return votes

def Get_Common_Movies(tui, rui, movie_ids):
    """Return the sorted, unique movie IDs found at both index sets ``tui`` and ``rui``."""
    target_movies = movie_ids[tui]
    other_movies = movie_ids[rui]
    return np.intersect1d(target_movies, other_movies)

def Correlation_Weight(user_avg, other_user_avg, user_vote, other_user_vote, common_movies):
    """Return the correlation between two users' votes over ``common_movies``.

    ``user_vote`` and ``other_user_vote`` map movie ID to rating; each rating
    is centred on the matching average before use.  The result is 0 when
    ``common_movies`` is empty or when the summed product of the centred
    votes is 0; otherwise that sum divided by the square root of the product
    of the two sums of squared centred votes.
    """
    if not len(common_movies):
        return 0
    # One (own deviation, other deviation) pair per shared movie.
    centred_pairs = [(user_vote[movie] - user_avg, other_user_vote[movie] - other_user_avg)
                     for movie in common_movies]
    numerator = np.sum([own * other for own, other in centred_pairs])
    if numerator == 0:
        return 0
    own_squares = np.sum([own**2 for own, _ in centred_pairs])
    other_squares = np.sum([other**2 for _, other in centred_pairs])
    return numerator / np.sqrt(own_squares * other_squares)

In [232]:
# Build the parallel arrays from the training frame defined above; the bare
# name `training_df` is never assigned in this notebook, so it fails on a
# fresh kernel.  Selecting the columns explicitly fixes the row order of M:
# M[0] = MovieID, M[1] = UserID, M[2] = Rating.
M = netflix_training_df[['MovieID', 'UserID', 'Rating']].values.T

Predict_User_Vote_2(M[2], M[0], M[1], 573364, 8)

3.0884107119279536

In [223]:
# Show only the first rows instead of rendering the whole testing frame.
netflix_testing_df.head(10)

Unnamed: 0,MovieID,UserID,Rating
0,8,573364,1.0
1,8,2149668,3.0
2,8,1089184,3.0
3,8,2465894,3.0
4,8,534508,1.0
5,8,992921,4.0
6,8,595054,4.0
7,8,1298304,4.0
8,8,1661600,4.0
9,8,553787,2.0
