In [17]:
import pandas as pd
import numpy as np
import time
import random
from sklearn.decomposition import PCA, FastICA

In [2]:
ratings = pd.read_csv('ratings.csv')

In [3]:
#returns top k neighbours given correlation matrix
def top_k(corr_matrix, k):
    top_indexes = np.argsort(-corr) +1 # (-) sign in corr would sort this in desending order 
                                        # and +1 this would give the userID rather than index 
    return top_indexes.iloc[:,1:k+1] #sliced this way because the item is correlated to itself the most

In [33]:
k_folds = 5 #folds
k = 5 #neighbours
fold_errors = []

for i in range(k_folds):
    errors = [] #errors withing the fold
    #randomize the ratings matrix and choose random points for different user
    #just like the sampling technique specified in the slides
    shuffled = ratings.sample(frac=1) 
    cut = int((1- 1/k_folds)* len(ratings))
    
    #split training and test
    training = shuffled.iloc[:cut]
    test = shuffled.iloc[cut:]
    
    #creating of utility matrix, default value being NaNs, and rows being userID and columns are movieId
    utility_matrix = pd.DataFrame(index= sorted(pd.unique(ratings.userId)), \
                              columns = sorted(pd.unique(ratings.movieId)))
    
    
    print(f"####### K fold part-{i+1} ########")
    start= time.time() #just book-keeping
    
    #filling up row matrix
    for _, row in training.iterrows():
        utility_matrix.loc[int(row.userId), int(row.movieId)] = row.rating
        
    #need to fill in mean ratings for the final formula
    mean_ratings = utility_matrix.sum(axis=1)/(utility_matrix>0).sum(axis=1)
    
    utility_matrix.fillna(utility_matrix.mean(), inplace= True) #filling up the NaNs with movie averages
    utility_matrix.fillna(0, inplace= True) #we would have the problem of cold start, thus filling w/ 0
    
    corr = utility_matrix.T.corr() #correlation matrix, default is pearson
    
    top_neighbours = top_k(corr, k) #see up
    
    #prediction part
    #selecting 1 user at a time and predicting values for all the movies for that user
    #calculating mse for every movie encountered in test set
    for target_user in test.userId.unique():

        temp = test[test.userId==target_user]
        similarities = np.sort(corr)[target_user-1,-k-1:-1][::-1]
        neighbour_ratings = utility_matrix.loc[top_neighbours.loc[target_user]]

        suggested_ratings = mean_ratings.loc[target_user] + similarities * neighbour_ratings.T/sum(similarities)
        suggested_ratings = suggested_ratings.mean(axis=1)
        
        for _, row in temp.iterrows():
            errors.append((row.rating - suggested_ratings.loc[row.movieId])**2)
    
    fold_errors.append(np.mean(errors)) 
        
    print(f"Done with this fold in {time.time()- start} s")
    print(f"MSE for this part is {fold_errors[-1]}")
print(f"MSE is {np.mean(fold_errors)}")

####### K fold part-1 ########
Done with this fold in 204.4609353542328 s
MSE for this part is 1.185667482556367
####### K fold part-2 ########
Done with this fold in 204.37644243240356 s
MSE for this part is 1.1777630480232075
####### K fold part-3 ########
Done with this fold in 204.9205994606018 s
MSE for this part is 1.2135292501423276
####### K fold part-4 ########
Done with this fold in 204.34527969360352 s
MSE for this part is 1.181296350171233
####### K fold part-5 ########
Done with this fold in 205.66029167175293 s
MSE for this part is 1.2042443881845655
MSE is 1.19250010381554


In [34]:
k_folds = 5
k = 5
fold_errors = []

for i in range(k_folds):
    errors = []
    shuffled = ratings.sample(frac=1)
    cut = int((1- 1/k_folds)* len(ratings))
    
    training = shuffled.iloc[:cut]
    test = shuffled.iloc[cut:]
    
    utility_matrix = pd.DataFrame(index= sorted(pd.unique(ratings.userId)), \
                              columns = sorted(pd.unique(ratings.movieId)))
    
    
    
    
    print(f"####### K fold part-{i+1} ########")
    start= time.time()
    
    for _, row in training.iterrows():
        utility_matrix.loc[int(row.userId), int(row.movieId)] = row.rating
    
    utility_matrix.fillna(utility_matrix.mean(), inplace= True)
    utility_matrix.fillna(0, inplace= True)
    
    reduced_matrix = pd.DataFrame(PCA(n_components=500).fit_transform(utility_matrix), index= np.arange(1,611))
    corr = reduced_matrix.T.corr()
    
    top_neighbours = top_k(corr, k)
    
    mean_ratings = utility_matrix.sum(axis=1)/(utility_matrix>0).sum(axis=1)
    
    
    for target_user in test.userId.unique():

        temp = test[test.userId==target_user]
        similarities = np.sort(corr)[target_user-1,-k-1:-1][::-1]
        neighbour_ratings = utility_matrix.loc[top_neighbours.loc[target_user]]

        suggested_ratings = mean_ratings.loc[target_user] + similarities * neighbour_ratings.T/sum(similarities)
        suggested_ratings = suggested_ratings.mean(axis=1)
        
        for _, row in temp.iterrows():
            errors.append((row.rating - suggested_ratings.loc[row.movieId])**2)
    
    fold_errors.append(np.mean(errors)) 
        
    print(f"Done with this fold in {time.time()- start} s")
    
    print(f"MSE for this part is {fold_errors[-1]}")

print(f"MSE is {np.mean(fold_errors)}")

####### K fold part-1 ########
Done with this fold in 202.1288182735443 s
MSE for this part is 1.177967054445447
####### K fold part-2 ########
Done with this fold in 201.70803999900818 s
MSE for this part is 1.1917852949382002
####### K fold part-3 ########
Done with this fold in 200.63479113578796 s
MSE for this part is 1.1811478788084306
####### K fold part-4 ########
Done with this fold in 202.054208278656 s
MSE for this part is 1.1612560513228376
####### K fold part-5 ########
Done with this fold in 22126.19813156128 s
MSE for this part is 1.1801287665326108
MSE is 1.1784570092095052
