In [137]:
import numpy as np 
import pandas as pd 
import sys
from math import sqrt
from sklearn.metrics import mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
# Load the ratings table (columns: userId, movieId, rating, timestamp).
# The separator is passed by keyword: recent pandas releases accept only the
# path positionally, so the old `pd.read_csv(path, ',')` form raises TypeError.
df = pd.read_csv('ratings.csv', sep=',')
print(df.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [138]:
# Count the distinct users and movies that appear in the ratings table.
n_users = len(df['userId'].unique())
n_items = len(df['movieId'].unique())
print('Total Users : ', n_users, sep='')
print('Total Movie : ', n_items, sep='')

Total Users : 610
Total Movie : 9724


In [139]:
# Reshape the long ratings table into a dense user x movie utility matrix;
# (user, movie) pairs without a rating are filled with 0.
df_utility = (
    df.pivot(index='userId', columns='movieId', values='rating')
      .fillna(0)
)
print(df_utility.head())

# Plain ndarray copy of the utility matrix, used by the functions below.
ratings = np.array(df_utility)

# Displayed as the cell result: square roots of the main diagonal of the
# (non-square) utility matrix, wrapped as a single-row 2-D array.
np.array([np.sqrt(np.diagonal(ratings))])

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

array([[2.        , 0.        , 0.        , 0.        , 0.        ,
        2.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.41421356, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 2.        , 1.73205081, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [140]:
def train_test_split(ratings, fractionTest, seed=None):
    """Split a user x item rating matrix into disjoint train and test matrices.

    For every user (row), a fraction of that user's non-zero ratings is moved
    from the training matrix into the test matrix. Both outputs have the same
    shape as ``ratings``; a cell is non-zero in at most one of them.

    Parameters
    ----------
    ratings : np.ndarray
        2-D array where 0 means "not rated".
    fractionTest : float
        Fraction (0..1) of each user's ratings to hold out. The per-user count
        is truncated with ``int()``, so users with few ratings may contribute
        no test ratings.
    seed : int or None, optional
        When given, a dedicated ``np.random.default_rng(seed)`` generator is
        used so the split is reproducible. When ``None`` (default) the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    (train, test) : tuple of np.ndarray

    Raises
    ------
    ValueError
        If ``fractionTest`` is outside the interval [0, 1].
    """
    if not 0.0 <= fractionTest <= 1.0:
        raise ValueError(
            "fractionTest must be between 0 and 1, got %r" % (fractionTest,)
        )

    # Both the legacy module and a Generator expose a compatible `.choice`.
    rng = np.random if seed is None else np.random.default_rng(seed)

    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        rated_items = ratings[user, :].nonzero()[0]
        n_held_out = int(len(rated_items) * fractionTest)
        held_out = rng.choice(rated_items, size=n_held_out, replace=False)
        train[user, held_out] = 0
        test[user, held_out] = ratings[user, held_out]

    # Sanity check: no rating may appear in both matrices.
    assert np.all((train * test) == 0)
    return train, test

In [141]:
def predict(ratings, similarity):
    """Predict ratings as a similarity-weighted average of other rows.

    Computes, for each row u and item i,

        sum_u' sim(u, u') * r(u', i)  /  sum_u' |sim(u, u')|

    Parameters
    ----------
    ratings : np.ndarray
        2-D rating matrix (rows aligned with the rows of ``similarity``).
    similarity : np.ndarray
        Square similarity matrix between rows of ``ratings``.

    Returns
    -------
    np.ndarray
        Predicted ratings, same shape as ``ratings``. A row whose similarities
        are all zero gets an all-zero prediction instead of NaN.
    """
    # Per-row normaliser as a column vector so it broadcasts across items.
    den = np.asarray(np.abs(similarity).sum(axis=1)).reshape(-1, 1)
    # A zero denominator means every similarity in that row is zero, so the
    # numerator is zero too; divide by 1 to yield 0 rather than 0/0 = NaN.
    den = np.where(den == 0, 1.0, den)
    return similarity.dot(ratings) / den


In [142]:
def pearson_sim(mat, epsilon=1e-9):
    """Pearson-style similarity between the rows of a rating matrix.

    Each row is centred on the mean of its own non-zero entries (zeros are
    treated as "not rated" and stay zero), then the cosine of the centred rows
    is returned.

    Parameters
    ----------
    mat : np.ndarray
        2-D rating matrix where 0 means "not rated".
    epsilon : float, optional
        Small offset added to every centred rating so that a row whose ratings
        are all identical does not collapse to a zero vector.

    Returns
    -------
    np.ndarray
        Square (rows x rows) similarity matrix. Rows with no ratings at all
        have similarity 0 to everything instead of NaN.
    """
    rated = mat != 0
    n_rated = rated.sum(axis=1)
    # Mean over rated entries only; max(.., 1) avoids 0/0 for empty rows
    # (their mean is never used because the mask below excludes every cell).
    row_mean = mat.sum(axis=1) / np.maximum(n_rated, 1)
    centred = np.where(rated, mat - row_mean[:, np.newaxis] + epsilon, 0.0)

    gram = centred.dot(centred.T)
    # The diagonal of the Gram matrix holds each row's squared magnitude.
    norms = np.sqrt(np.diagonal(gram))[np.newaxis, :]
    # Zero-norm rows have an all-zero Gram row/column; divide by 1 to keep 0.
    norms = np.where(norms == 0, 1.0, norms)
    return gram / norms / norms.T

In [143]:
def cosine_sim(ratings, epsilon=1e-9):
    """Cosine similarity between the rows of ``ratings``.

    ``epsilon`` is added to every entry of the row-by-row dot-product matrix
    so that all-zero rows do not cause a division by zero.
    Returns a square (rows x rows) matrix.
    """
    gram = np.dot(ratings, ratings.T) + epsilon
    # Row magnitudes are the square roots of the Gram diagonal (shape 1 x n).
    magnitudes = np.sqrt(np.diagonal(gram))[np.newaxis, :]
    row_scaled = gram / magnitudes
    return row_scaled / magnitudes.T

In [144]:
def get_rmse(pred, actual):
    """Root-mean-squared error over the cells where ``actual`` is non-zero.

    Cells with ``actual == 0`` are treated as missing and ignored.
    """
    observed = actual.nonzero()
    pred_observed = pred[observed].flatten()
    actual_observed = actual[observed].flatten()
    mse = mean_squared_error(pred_observed, actual_observed)
    return sqrt(mse)


In [145]:
def get_mae(pred, actual):
    """Mean absolute error over the cells where ``actual`` is non-zero.

    Cells with ``actual == 0`` are treated as missing and ignored.
    """
    observed = actual.nonzero()
    pred_observed = pred[observed].flatten()
    actual_observed = actual[observed].flatten()
    return mean_absolute_error(actual_observed, pred_observed)

In [147]:
# Fractions of each user's ratings held out for testing; the whole
# split -> similarity -> predict -> score pipeline is run once per fraction.
split_ratio = [0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40]

for split in split_ratio:
    print("\n" * 4)
    print("split = ", split, sep="\n")
    print("\n" * 3)

    train, test = train_test_split(ratings, split)
    print('Data splitted into Train,Test')

    # User-user similarities are computed from the training ratings only.
    sim_cos = cosine_sim(train)
    sim_matrix = pearson_sim(train)
    print('Similarity Matrix Calulated')

    # Cosine-similarity predictions, scored on the held-out ratings.
    user_prediction_cos_bias = predict(train, sim_cos)
    print('User-based CF RMSE:(Cosine) ', get_rmse(user_prediction_cos_bias, test), sep='')
    print('User-based CF MAE:(Cosine) ', get_mae(user_prediction_cos_bias, test), sep='')

    # Pearson-similarity predictions, scored the same way.
    user_prediction_pearson_bias = predict(train, sim_matrix)
    print('User-based CF RMSE:(Pearson)', get_rmse(user_prediction_pearson_bias, test), sep='')
    print('User-based CF MAE:(Pearson)', get_mae(user_prediction_pearson_bias, test), sep='')






split = 
0.1




Data splitted into Train,Test
Similarity Matrix Calulated
User-based CF RMSE:(Cosine) 3.170126387330986
User-based CF MAE:(Cosine) 3.0004434809175873
User-based CF RMSE:(Pearson)3.331023867422208
User-based CF MAE:(Pearson)3.1734570653436087





split = 
0.15




Data splitted into Train,Test
Similarity Matrix Calulated
User-based CF RMSE:(Cosine) 3.2008690876526233
User-based CF MAE:(Cosine) 3.0349515192506367
User-based CF RMSE:(Pearson)3.3597431528394113
User-based CF MAE:(Pearson)3.2044892369771425





split = 
0.2




Data splitted into Train,Test
Similarity Matrix Calulated
User-based CF RMSE:(Cosine) 3.2402486620858073
User-based CF MAE:(Cosine) 3.0780516469537083
User-based CF RMSE:(Pearson)3.3963145652995603
User-based CF MAE:(Pearson)3.242839908564023





split = 
0.25




Data splitted into Train,Test
Similarity Matrix Calulated
User-based CF RMSE:(Cosine) 3.2541446488835897
User-based CF MAE:(Cosine) 3.0912656872049182
User-based CF RMSE:(Pearson)3.