In [1]:
from __future__ import division
import pandas as pd
import numpy  as np
import re

In [2]:
# Read each raw data file fully into memory. The context managers close the
# file handles as soon as the text has been read instead of leaving that to
# the garbage collector.
with open("movie_titles.txt") as titles_file:
    movie_titles = titles_file.read()
with open("TrainingRatings.txt") as training_file:
    netflix_training = training_file.read()
with open("TestingRatings.txt") as testing_file:
    netflix_testing = testing_file.read()

In [3]:
# Transform text files into DataFrames
def Split_Records(raw_text):
    """Split raw comma-separated text into a list of records.

    Carriage returns are removed, the text is split on newlines, and every
    non-blank line is split on its first two commas only, so a third field
    that itself contains commas stays in one piece.

    Blank lines are skipped rather than dropping the last line
    unconditionally, so the final record is kept even when the file does not
    end with a newline.
    """
    lines = re.sub('[\r]', '', raw_text).split('\n')
    return [line.split(',', 2) for line in lines if line.strip()]

movie_titles_df     = pd.DataFrame(Split_Records(movie_titles))
netflix_training_df = pd.DataFrame(Split_Records(netflix_training))
netflix_testing_df  = pd.DataFrame(Split_Records(netflix_testing))

movie_titles_df.columns     = ['MovieID', 'YearOfRelease', 'Title']
netflix_training_df.columns = ['MovieID', 'UserID', 'Rating']
netflix_testing_df.columns  = ['MovieID', 'UserID', 'Rating']

In [4]:
# Cast the ID columns to int and the ratings to float64
movie_titles_df['MovieID'] = movie_titles_df['MovieID'].astype(int)

# Both rating tables share the same three columns, so convert them in one pass
# (testing first, then training; MovieID, UserID, Rating within each).
for ratings_df in (netflix_testing_df, netflix_training_df):
    for column, dtype in (('MovieID', int), ('UserID', int), ('Rating', np.float64)):
        ratings_df[column] = ratings_df[column].astype(dtype)

In [153]:
def Average_Vote(votes):
    """Return the sum of `votes` divided by the number of entries in `votes`."""
    total = np.sum(votes)
    count = len(votes)
    return total / count

def Predict_User_Vote(train_df, user_id, movie_id):
    """Predict the rating `user_id` would give `movie_id` from `train_df`.

    The prediction is the user's own average rating plus a weighted sum of
    how far every other user who rated `movie_id` deviated from their own
    average, where each weight is that user's correlation with `user_id`:

        prediction = user_avg + k * sum(weight_i * (rating_i - avg_i))

    `k` is the normalising factor 1 / sum(|weight_i|), so the adjustment is a
    weighted average of the neighbours' deviations instead of an unbounded
    sum that grows with the number of neighbours.

    Parameters
    ----------
    train_df : DataFrame with 'MovieID', 'UserID' and 'Rating' columns.
    user_id  : ID of the user to predict for.
    movie_id : ID of the movie to predict.

    Returns
    -------
    The predicted rating. This is just the user's average rating when no
    other user rated the movie or every correlation weight is zero.
    """
    user = train_df[train_df['UserID'] == user_id]
    user_avg = Average_Vote(user['Rating'])
    other_users = train_df[train_df['UserID'] != user_id]
    relevant_users = other_users[other_users['MovieID'] == movie_id]['UserID'].unique()

    # Mean-centred ratings of the target user, keyed by the movie they rated.
    user_votes = {rated_id: Vote_Minus_Mean(user, user_avg, rated_id) for rated_id in user['MovieID']}

    weighted_deviations = 0.0
    total_abs_weight = 0.0
    for other_user_id in relevant_users:
        other_user = other_users[other_users['UserID'] == other_user_id]
        other_user_avg = Average_Vote(other_user['Rating'])
        weight = Correlation_Weight_Between_User(user, user_avg, user_votes, other_user, other_user_avg)
        deviation = other_user[other_user['MovieID'] == movie_id]['Rating'].iloc[0] - other_user_avg
        weighted_deviations += weight * deviation
        total_abs_weight += abs(weight)

    # No informative neighbour: fall back to the user's own average.
    if total_abs_weight == 0:
        return user_avg

    k = 1.0 / total_abs_weight  # normalising factor
    return user_avg + k * weighted_deviations

def Get_Correlation_Prediction(other_users, user, user_avg, user_votes, other_user_id, movie_id):
    """Return one neighbour's weighted contribution to a prediction.

    The contribution is the correlation weight between `user` and the
    neighbour `other_user_id`, multiplied by how far the neighbour's rating
    of `movie_id` lies from the neighbour's own average rating.

    The neighbour must have a row for `movie_id` in `other_users`; otherwise
    the `.iloc[0]` lookup raises IndexError.
    """
    is_neighbour = other_users['UserID'] == other_user_id
    neighbour = other_users[is_neighbour]
    neighbour_avg = Average_Vote(neighbour['Rating'])

    # First recorded rating of the target movie by this neighbour.
    rated_target = neighbour['MovieID'] == movie_id
    neighbour_rating = neighbour[rated_target]['Rating'].iloc[0]
    deviation = neighbour_rating - neighbour_avg

    similarity = Correlation_Weight_Between_User(user, user_avg, user_votes, neighbour, neighbour_avg)
    return similarity * deviation

def Correlation_Weight_Between_User(user, user_avg, user_votes, other_user, other_user_avg):
    """Return the correlation weight between two users over their common movies.

    Parameters
    ----------
    user           : DataFrame of the first user's rows ('MovieID' column is read).
    user_avg       : unused here; kept so the signature stays unchanged.
    user_votes     : dict mapping movie ID -> the first user's rating minus
                     their average, for every movie the first user rated.
    other_user     : DataFrame of the second user's rows ('MovieID' and
                     'Rating' columns are read).
    other_user_avg : the second user's average rating; used as given rather
                     than being recomputed.

    Returns
    -------
    sum(a_j * b_j) / sqrt(sum(a_j ** 2) * sum(b_j ** 2)) over the movies both
    users rated, where a_j and b_j are the two users' mean-centred ratings.
    Returns 0 when the users share no movies or when the numerator is 0.
    """
    common_movies = set(user['MovieID']).intersection(other_user['MovieID'])
    if not common_movies:
        return 0

    # Build the movie -> rating lookup in one pass instead of filtering the
    # DataFrame once per common movie. setdefault keeps the first rating seen
    # for a movie, matching the first-row semantics of Vote_Minus_Mean.
    first_rating = {}
    for movie_id, rating in zip(other_user['MovieID'], other_user['Rating']):
        first_rating.setdefault(movie_id, rating)
    other_user_votes = {movie_id: first_rating[movie_id] - other_user_avg for movie_id in common_movies}

    numerator = np.sum([user_votes[movie_id] * other_user_votes[movie_id] for movie_id in common_movies])
    # A zero numerator also covers the case where either sum of squares is 0,
    # so the division below never sees a zero denominator.
    if numerator == 0:
        return 0

    denominator = np.sqrt(np.sum([user_votes[movie_id] ** 2 for movie_id in common_movies]) *
                          np.sum([other_user_votes[movie_id] ** 2 for movie_id in common_movies]))
    return numerator / denominator
    
def Vote_Minus_Mean(user, user_avg, movie_id):
    """Return the user's first rating of `movie_id` minus `user_avg`.

    `user` is a DataFrame with 'MovieID' and 'Rating' columns. An IndexError
    is raised by `.iloc[0]` when it has no row for `movie_id`.
    """
    is_target_movie = user['MovieID'] == movie_id
    first_rating = user[is_target_movie]['Rating'].iloc[0]
    return first_rating - user_avg

def _First_Vote_Per_Movie(ratings, movie_ids, rows):
    """Map each movie ID found at positions `rows` to the first rating recorded for it."""
    votes = {}
    for movie_id, rating in zip(movie_ids[rows], ratings[rows]):
        votes.setdefault(movie_id, rating)
    return votes

def Predict_User_Vote_2(ratings, movie_ids, user_ids, target_user_id, target_movie_id):
    """Array-based groundwork for predicting one user's rating of one movie.

    `ratings`, `movie_ids` and `user_ids` are indexed with the same integer
    positions below, so they are expected to be parallel 1-D NumPy arrays
    with one entry per rating.

    NOTE: the prediction itself is not implemented yet. The function gathers
    the target user's average and votes and every relevant user's average and
    votes, but only returns the latter.

    Returns
    -------
    dict mapping each user who rated `target_movie_id` to a dict of
    {movie ID: that user's first rating of the movie}.
    """
    target_movie_rows = np.argwhere(target_movie_id == movie_ids).flatten()
    relevant_users = np.unique(user_ids[target_movie_rows])
    target_user_rows = np.argwhere(user_ids == target_user_id).flatten()
    rows_by_user = {user_id: np.argwhere(user_id == user_ids).flatten() for user_id in relevant_users}

    # Inputs for the prediction that has yet to be written.
    user_avg = Average_Vote(ratings[target_user_rows])
    other_users_avg = {user_id: Average_Vote(ratings[rows_by_user[user_id]]) for user_id in relevant_users}
    user_vote = _First_Vote_Per_Movie(ratings, movie_ids, target_user_rows)

    # Kept separate from user_vote: a chained assignment here used to
    # overwrite the target user's votes with this nested dict.
    other_users_vote = {user_id: _First_Vote_Per_Movie(ratings, movie_ids, rows_by_user[user_id])
                        for user_id in relevant_users}

    return other_users_vote

In [154]:
# Transposed training ratings: M[0] = MovieID, M[1] = UserID, M[2] = Rating.
# Uses netflix_training_df, the frame built above; `training_df` is not
# defined anywhere in this notebook.
M = netflix_training_df.values.T

Predict_User_Vote_2(M[2], M[0], M[1], 1744889, 17742)

{6629.0: {111.0: 2.0,
  305.0: 3.0,
  361.0: 5.0,
  851.0: 3.0,
  1046.0: 4.0,
  1100.0: 2.0,
  1140.0: 3.0,
  1202.0: 3.0,
  1256.0: 5.0,
  1289.0: 2.0,
  1305.0: 4.0,
  1367.0: 3.0,
  1406.0: 3.0,
  1432.0: 2.0,
  1482.0: 4.0,
  1615.0: 3.0,
  1744.0: 3.0,
  1807.0: 3.0,
  1832.0: 3.0,
  1884.0: 4.0,
  2235.0: 2.0,
  2290.0: 4.0,
  2342.0: 5.0,
  2660.0: 4.0,
  2675.0: 2.0,
  2866.0: 4.0,
  2913.0: 5.0,
  2988.0: 2.0,
  3094.0: 1.0,
  3135.0: 3.0,
  3151.0: 4.0,
  3274.0: 3.0,
  3290.0: 4.0,
  3355.0: 4.0,
  3538.0: 4.0,
  3638.0: 5.0,
  3743.0: 3.0,
  3890.0: 3.0,
  3893.0: 4.0,
  3928.0: 5.0,
  4144.0: 3.0,
  4432.0: 5.0,
  4546.0: 4.0,
  4627.0: 4.0,
  4640.0: 5.0,
  4847.0: 2.0,
  5069.0: 4.0,
  5342.0: 3.0,
  5425.0: 3.0,
  5607.0: 3.0,
  5716.0: 2.0,
  5760.0: 4.0,
  5814.0: 3.0,
  5897.0: 3.0,
  6014.0: 4.0,
  6190.0: 4.0,
  6281.0: 4.0,
  6287.0: 5.0,
  6308.0: 2.0,
  6334.0: 3.0,
  6347.0: 3.0,
  6408.0: 4.0,
  6556.0: 3.0,
  6911.0: 2.0,
  6971.0: 3.0,
  7067.0: 5.0,
  7238

In [86]:
# Positions in the first column of the training data whose value equals 8
row = netflix_training_df.values.T[0]
np.argwhere(row == 8).flatten()

array([   0,    1,    2, ..., 2828, 2829, 2830])

In [132]:
# Toy array: positions reported by np.argwhere are relative to the slice,
# not to the full array.
a = np.array([1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4])
np.argwhere(a[3:12] == 1)

array([[4]])

In [37]:
# NOTE(review): the recorded output is a plain list of floats (including a nan),
# so `a` here is presumably a value left over from an earlier kernel session and
# not a short NumPy array - confirm where it is supposed to come from.
a[355:365]

[-0.3169179687646356,
 -0.6502267357818345,
 -0.067106131683814,
 -0.269473046844978,
 -0.9968364694366683,
 nan,
 -0.15901420560704732,
 -0.10155528095854963,
 -0.003391751364927699,
 -0.17025208537379052]

In [153]:
# Preview the first rows only instead of rendering the whole training frame
netflix_training_df.head(10)

Unnamed: 0,MovieID,UserID,Rating
0,8,1744889,1.0
1,8,1395430,2.0
2,8,1205593,4.0
3,8,1488844,4.0
4,8,1447354,1.0
5,8,306466,4.0
6,8,1331154,4.0
7,8,1818178,3.0
8,8,991725,4.0
9,8,1987434,4.0
