In [37]:
import math
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [38]:
def read_movie_data(filename):
    data = []
    with open(filename, encoding="latin-1") as f:
        lines = f.readlines()
        for line in lines:
            temp = line.strip().split(",")
            data.append([int(temp[0]), temp[1], ', '.join(text for text in temp[2:])])
    return data

In [39]:
movie_titles = pd.DataFrame(read_movie_data("netflix/movie_titles.txt"), columns=['MovieID', 'YearOfRelease', 'Title'])
movie_titles

Unnamed: 0,MovieID,YearOfRelease,Title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [40]:
train_ratings = pd.read_csv("netflix/TrainingRatings.txt", encoding='latin-1', header=None)
train_ratings.columns = ['MovieID', 'UserID', 'Rating']
train_ratings

Unnamed: 0,MovieID,UserID,Rating
0,8,1744889,1.0
1,8,1395430,2.0
2,8,1205593,4.0
3,8,1488844,4.0
4,8,1447354,1.0
...,...,...,...
3255347,17742,46222,3.0
3255348,17742,2534701,1.0
3255349,17742,208724,3.0
3255350,17742,483107,2.0


In [41]:
test_ratings = pd.read_csv("netflix/TestingRatings.txt", encoding='latin-1', header=None)
test_ratings.columns = ['MovieID', 'UserID', 'Rating']
test_ratings

Unnamed: 0,MovieID,UserID,Rating
0,8,573364,1.0
1,8,2149668,3.0
2,8,1089184,3.0
3,8,2465894,3.0
4,8,534508,1.0
...,...,...,...
100473,17742,1898310,2.0
100474,17742,716096,4.0
100475,17742,38115,3.0
100476,17742,2646347,5.0


In [42]:
user_id_grouping = train_ratings.groupby(['UserID']).agg(['sum', 'count'])['Rating'].reset_index()
user_id_grouping.columns = ['UserID', 'vij', 'count']
v = (user_id_grouping['vij'] / user_id_grouping['count']).to_frame()
v.columns = ['vi_bar']
v = pd.concat([user_id_grouping, v], axis=1)
v

Unnamed: 0,UserID,vij,count,vi_bar
0,7,406.0,104,3.903846
1,79,305.0,84,3.630952
2,199,280.0,71,3.943662
3,481,322.0,74,4.351351
4,769,313.0,98,3.193878
...,...,...,...,...
28973,2648869,406.0,112,3.625000
28974,2648885,537.0,136,3.948529
28975,2649120,298.0,79,3.772152
28976,2649267,312.0,81,3.851852


In [43]:
relevant_data = user_id_grouping#[user_id_grouping['count'] >= (0.005 * len(movie_titles))]
relevant_data = pd.merge(relevant_data, train_ratings, how='inner', on=['UserID', 'UserID'])
relevant_data
v = (relevant_data['vij'] / relevant_data['count']).to_frame()
v.columns = ['vi_bar']
v = pd.concat([relevant_data, v], axis=1)
v

Unnamed: 0,UserID,vij,count,MovieID,Rating,vi_bar
0,7,406.0,104,8,5.0,3.903846
1,7,406.0,104,28,4.0,3.903846
2,7,406.0,104,185,4.0,3.903846
3,7,406.0,104,636,4.0,3.903846
4,7,406.0,104,1046,3.0,3.903846
...,...,...,...,...,...,...
3255347,2649285,310.0,104,16948,4.0,2.980769
3255348,2649285,310.0,104,17324,4.0,2.980769
3255349,2649285,310.0,104,17334,2.0,2.980769
3255350,2649285,310.0,104,17338,3.0,2.980769


In [45]:
user_movie_table = train_ratings.pivot_table(index = ["UserID"],columns = ["MovieID"],values = "Rating").fillna(0)
user_movie_table.loc[user_movie_table[8]!= 0]

MovieID,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1333,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0
3321,1.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,...,3.0,1.0,0.0,4.0,0.0,1.0,0.0,0.0,2.0,0.0
3363,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3604,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2644289,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2645431,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2645671,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2645828,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0


In [46]:
def get_nearest_users(currUser, movie_j):
    user_movie_table_matrix = user_movie_table.loc[user_movie_table[movie_j]!= 0].to_numpy()
    model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
    model_knn.fit(user_movie_table_matrix)
    curr_user_values = user_movie_table.loc[currUser,:].values.reshape(1,-1)
    n = 15
    if user_movie_table_matrix.shape[0] < n:
        n = user_movie_table_matrix.shape[0]
    distances, indices = model_knn.kneighbors(curr_user_values, n_neighbors = n)
    ids = v['UserID'].iloc[indices.tolist()[0]].to_numpy()
    return ids

In [48]:
def get_correlation(user_a_id, user_i_id):
    user_a = v[v.UserID == user_a_id]
    user_i = v[v.UserID == user_i_id]
    merged = pd.merge(user_a, user_i, how='inner', on=['MovieID', 'MovieID'])
    num = 0
    den = 0
    j = merged.MovieID.values
    diff1 = merged['Rating_x'][merged['MovieID']==j].to_numpy() - merged['vi_bar_x'][merged['MovieID']==j].to_numpy()
    diff2 = merged['Rating_y'][merged['MovieID']==j].to_numpy() - merged['vi_bar_y'][merged['MovieID']==j].to_numpy()
    num = diff1 * diff2
    den = (diff1 ** 2) * (diff2 ** 2)
    sum_num = num.sum()
    sum_den = den.sum()
    if sum_den ==0:
        return 0
    return sum_num/math.sqrt(sum_den)

In [50]:
def get_kappa(correlation):
    if correlation == 0:
        return 0
    return 1/correlation

In [51]:
def get_prediction(user_a_id, movie_j):
    va_bar = v['vi_bar'][v.UserID == user_a_id].to_numpy()[0]
    sum1 = 0
    kappa_den = 0
    neighbors = get_nearest_users(user_a_id, movie_j)
    correlations = []
    for neighbor in neighbors:
        correlation = get_correlation(user_a_id, neighbor)
        kappa_den += abs(correlation)
        correlations.append(correlation)
    user_i = v.loc[(v.UserID == neighbor) & (v.MovieID == movie_j)]
    if user_i.empty:
        diff1 = 0
        sum1 = 0
    else:
        diff1 = user_i['Rating'][user_i.MovieID == movie_j].to_numpy() - user_i['vi_bar'][user_i.MovieID == movie_j].to_numpy()
        sum1 = (pd.DataFrame(correlations) * diff1).sum()
        sum1 = sum1.sum()
    return round(va_bar + get_kappa(kappa_den) * sum1)

In [53]:
predicted = []
for index in test_ratings.index:
    prediction = get_prediction(test_ratings['UserID'][index], test_ratings['MovieID'][index])
    predicted.append(prediction)
    #print(f"Done predicting item at {index} and left with {test_ratings.index.stop - index} user predictions...")

Done predicting item at 0 and left with 100478 user predictions...
Done predicting item at 1 and left with 100477 user predictions...
Done predicting item at 2 and left with 100476 user predictions...
Done predicting item at 3 and left with 100475 user predictions...
Done predicting item at 4 and left with 100474 user predictions...
Done predicting item at 5 and left with 100473 user predictions...
Done predicting item at 6 and left with 100472 user predictions...
Done predicting item at 7 and left with 100471 user predictions...
Done predicting item at 8 and left with 100470 user predictions...
Done predicting item at 9 and left with 100469 user predictions...
Done predicting item at 10 and left with 100468 user predictions...
Done predicting item at 11 and left with 100467 user predictions...
Done predicting item at 12 and left with 100466 user predictions...
Done predicting item at 13 and left with 100465 user predictions...
Done predicting item at 14 and left with 100464 user predi

In [54]:
# Mean Absolute Error
print("Mean absolute error = " + str(mean_absolute_error(test_ratings['Rating'].to_numpy(), predicted)))

Mean absolute error = 0.8015884074125679


In [55]:
# Root Mean Squared Error
print("Root mean square error = " + str(mean_squared_error(test_ratings['Rating'].to_numpy(), predicted,squared=False)))

Root mean square error = 1.0945198425711442
