In [1]:
import pandas as pd
import numpy as np
import math

# Data Manipulation
Main Goal: Obtain sparse matrix of users vs movies

In [2]:
df = pd.read_csv('./dataset/combined_data_1.txt', header=None, names=['Cust_ID', 'Rating', 'Date'], usecols=[0,1,2])
df.head()

Unnamed: 0,Cust_ID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


In [3]:
movie_count = df.isnull().sum()[1]
cust_count = df['Cust_ID'].nunique() - movie_count
print('Customer count: ', cust_count)

Customer count:  470758


In [4]:
df_nan = df[pd.isnull(df.Rating)]
df_nan = df_nan.reset_index()
df_nan.head()
print('movies: ')
print(df_nan)

movies: 
         index Cust_ID  Rating Date
0            0      1:     NaN  NaN
1          548      2:     NaN  NaN
2          694      3:     NaN  NaN
3         2707      4:     NaN  NaN
4         2850      5:     NaN  NaN
...        ...     ...     ...  ...
4494  24046714   4495:     NaN  NaN
4495  24047329   4496:     NaN  NaN
4496  24056849   4497:     NaN  NaN
4497  24057564   4498:     NaN  NaN
4498  24057834   4499:     NaN  NaN

[4499 rows x 4 columns]


In [5]:
# add movies as an additional column
movie_np = []
movie_id = 1
for i,j in zip(df_nan['index'][1:], df_nan['index'][:-1]):
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1
    # print('curr movie: ', movie_id)

last_record = np.full((1,len(df) - df_nan.iloc[-1, 0] - 1), movie_id)
movie_np = np.append(movie_np, last_record)

print('Movie numpy: {}'.format(movie_np))
print('Length: {}'.format(len(movie_np)))

Movie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]
Length: 24053764


In [6]:
df = df[pd.notnull(df.Rating)]
df['Movie_ID'] = movie_np.astype(int)
df['Cust_ID'] = df['Cust_ID'].astype(int)
df.head()

Unnamed: 0,Cust_ID,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


### Ratings sparse matrix

In [7]:
temp_df = df[['Cust_ID', 'Rating', 'Movie_ID']]
pivot_df = pd.pivot_table(temp_df, values='Rating', index='Cust_ID', columns='Movie_ID')

In [8]:
pivot_df.iloc[0:]

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,4490,4491,4492,4493,4494,4495,4496,4497,4498,4499
Cust_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,5.0,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
25,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649404,,,,,,,,,,,...,,,,,,,,,,
2649409,,,,,,,,,,,...,,,,,,,,,,
2649421,,,,,,,,,,,...,,,,,,,,,,
2649426,,,,,,,,,,,...,,,,,,,,,,


### Dates sparse matrix

In [19]:
# temp_df = df[['Cust_ID', 'Date', 'Movie_ID']]
# dates_df = pd.pivot_table(temp_df, values='Date', index='Cust_ID', columns='Movie_ID')

In [20]:
# pivot_df.iloc[0:]

# KNN algorithm
Main Goal: to obtaing K nearest neighbors to predict the rating

In [9]:
def filter_movies_rated_by_user(cust_ID):
    filtered_matrix = pd.DataFrame(pivot_df)
    filtered_matrix = filtered_matrix.transpose()
    filtered_matrix = filtered_matrix[pd.notnull(filtered_matrix[cust_ID])]
    return filtered_matrix.transpose()

def calc_similiarity(movieID, filtered_matrix):
    target_movie = pivot_df[movieID]
    return filtered_matrix.corrwith(target_movie)

def sigmoid(val):
    return 1.0/(1+math.exp(-val))

def calc_weights(gamma, delta, k_neighbors):
    k_neighbors = gamma + delta*k_neighbors
    return k_neighbors.apply(sigmoid)

def weighted_sum(weights, custID):
    ratings = np.array([pivot_df.loc[custID][movieID] for movieID in weights.index])
    return np.dot(weights, ratings)/sum(weights)

In [10]:
def predict_rating(custID, movieID, K=10, delta=-3, gamma=0):
    filtered_matrix = filter_movies_rated_by_user(custID)
    similarities = calc_similiarity(movieID, filtered_matrix)
    similarities.sort_values(ascending=False, inplace=True)
    k_neighbors = similarities[0:K]
    weights = calc_weights(k_neighbors, delta, gamma)
    return weighted_sum(weights, custID)

# Evaluation
Main Goal: obtain RMSE on a validation/test dataset

In [15]:
# eval_set: DataFrame of [custID,movieID,rating]
def evaluate(eval_set):
    predictions = np.vectorize(predict_rating)(eval_set['custID'], eval_set['movieID'])
    SSE = sum((predictions-eval_set['rating'])**2)
    return math.sqrt(SSE/len(eval_set))
# testing:
# fake_val_set = pd.DataFrame({'custID': [6, 7, 8], 'movieID': [2, 3, 4], 'rating': [2, 2, 5]})
# print(evaluate(fake_val_set))

predictions: 
[3.50173399 4.30455085]

subs: 
0    1.501734
1   -0.695449
Name: rating, dtype: float64

pow: 
0    2.255205
1    0.483650
Name: rating, dtype: float64
1.1702253004454404
