In [1]:
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import csv
import pandas as pd
import numpy as np
from surprise.model_selection import KFold
from surprise import accuracy
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [2]:
df=pd.read_csv('data/ratings.csv')
reader = Reader(rating_scale=(1,5))
rating_data=Dataset.load_from_df(df[['userId', 'movieId', 'rating']],reader)
rating_matrix=df.as_matrix()
#print rating_matrix

In [3]:
def predict(dataset):
    users=np.unique(np.array(dataset)[:,0])
    predictionsPerUser={}
    
    for user in users:
        condition=df['userId'] == user
        newdata=df[condition]
        userRatings=np.array(newdata)[:,2]
        meanRating=np.mean(userRatings)
        predictionsPerUser[user]=meanRating

    return predictionsPerUser

In [4]:
predictionsPerUser=predict(rating_matrix)

In [5]:
def NBFilter(testset):
    predictions=[]
    dataFrame=pd.DataFrame(testset, columns=['userId', 'movieId', 'rating'])
    for row in dataFrame.itertuples(2):
        if row.userId in predictionsPerUser:
            predictions.append(predictionsPerUser[row.userId])
    return predictions

In [6]:
#this kfold is from surprise
kf = KFold(n_splits=10)

In [7]:
#Question 30
RMSE_list=[]
MAE_list=[]
for trainset, testset in kf.split(rating_data):
    #predicted=NBPredictionFilter(testset)
    predicted=NBFilter(testset)
    #print "Length of predicted=",len(predicted)
    actual=np.array(testset)[:,2]
    #print "Length of actual=",len(actual)
    RMSE=mse(actual, predicted)
    MAE=mae(actual,predicted)
    print "RMSE=",RMSE,"MAE=",MAE
    RMSE_list.append(RMSE)
    MAE_list.append(MAE)
print "Mean RMSE Q30=",np.mean(RMSE_list)
print "Mean MAE Q30=",np.mean(MAE_list)

RMSE= 0.897971030952 MAE= 0.74019918369
RMSE= 0.90517258414 MAE= 0.743942426678
RMSE= 0.921170524258 MAE= 0.751494782337
RMSE= 0.91039792085 MAE= 0.74440706886
RMSE= 0.897529913111 MAE= 0.736655634677
RMSE= 0.908575805163 MAE= 0.739431384052
RMSE= 0.935956657454 MAE= 0.752611397236
RMSE= 0.911167510635 MAE= 0.742976101115
RMSE= 0.931900579553 MAE= 0.752510813209
RMSE= 0.908727350519 MAE= 0.742040772463
Mean RMSE Q30= 0.912856987664
Mean MAE Q30= 0.744626956432


In [7]:
def getPopular(testset,n):
    movie,freq=np.unique(np.array(testset)[:,1],return_counts=True)
    a=zip(movie,freq)
    pop=map(lambda y:y[0],filter(lambda x:x[1]>n,a))
    t=filter(lambda x:x[1] in pop,testset)
    return t

def getUnpopular(testset,n):
    movie,freq=np.unique(np.array(testset)[:,1],return_counts=True)
    a=zip(movie,freq)
    pop=map(lambda y:y[0],filter(lambda x:x[1]<=n,a))
    t=filter(lambda x:x[1] in pop,testset)
    return t

def getHighVariance(testset):
    testset=getPopular(testset,4)
    ans=(pd.DataFrame.from_records(testset).groupby(1)[2].var(ddof=False))
    af=pd.DataFrame.from_records(np.transpose([ans.index,ans.values]))
    high_var=af[af[1]>2]
    high_var=high_var[0]
    high_var=list(high_var)
    t=filter(lambda x:x[1] in high_var,testset)
    return t

In [9]:
#Question 31
popular=getPopular(rating_matrix,2)
print "Fetched popular Dataset from the original dataset"
reader = Reader(rating_scale=(1,5))
dfP=pd.DataFrame(popular, columns=['userId', 'movieId', 'rating', 'timestamp'])

popularDataset=Dataset.load_from_df(dfP[['userId', 'movieId', 'rating']],reader)

RMSE_popular=[]
MAE_popular=[]

kf = KFold(n_splits=10)

for trainset, testset in kf.split(popularDataset):
    predicted=NBFilter(testset)
    actual=np.array(testset)[:,2]
    RMSE=mse(actual, predicted)
    MAE=mae(actual,predicted)
    print "RMSE=",RMSE,"MAE=",MAE
    RMSE_popular.append(RMSE)
    MAE_popular.append(MAE)    
print "Mean RMSE for popular Dataset=",np.mean(RMSE_popular)
print "Mean MAE for popular Dataset=",np.mean(MAE_popular)

Fetched popular Dataset from the original dataset
RMSE= 0.906335783052 MAE= 0.741095540082
RMSE= 0.931283047115 MAE= 0.753227666293
RMSE= 0.918884048334 MAE= 0.750351012852
RMSE= 0.899137237883 MAE= 0.736417332296
RMSE= 0.914044597375 MAE= 0.748596399499
RMSE= 0.910568613188 MAE= 0.744573259485
RMSE= 0.894492535798 MAE= 0.735631809964
RMSE= 0.890437315093 MAE= 0.738602086193
RMSE= 0.894655992458 MAE= 0.740259244108
RMSE= 0.905969311988 MAE= 0.741437977601
Mean RMSE for popular Dataset= 0.906580848228
Mean MAE for popular Dataset= 0.743019232837


In [10]:
#Question 32
unpopular=getUnpopular(rating_matrix,2)
print "Fetched unpopular Dataset from the original dataset"
reader = Reader(rating_scale=(1,5))
dfP=pd.DataFrame(unpopular, columns=['userId', 'movieId', 'rating', 'timestamp'])

unpopularDataset=Dataset.load_from_df(dfP[['userId', 'movieId', 'rating']],reader)

RMSE_unpopular=[]
MAE_unpopular=[]

for trainset, testset in kf.split(unpopularDataset):
    predicted=NBFilter(testset)
    actual=np.array(testset)[:,2]
    RMSE=mse(actual, predicted)
    MAE=mae(actual,predicted)
    print "RMSE=",RMSE,"MAE=",MAE
    RMSE_unpopular.append(RMSE)
    MAE_unpopular.append(MAE)    
print "Mean RMSE for unpopular Dataset=",np.mean(RMSE_unpopular)
print "Mean MAE for unpopular Dataset=",np.mean(MAE_unpopular)

Fetched unpopular Dataset from the original dataset
RMSE= 1.04064008457 MAE= 0.78154418659
RMSE= 0.979621673227 MAE= 0.762054797936
RMSE= 1.11219825527 MAE= 0.804579639484
RMSE= 1.00520188683 MAE= 0.743004453334
RMSE= 1.04490457017 MAE= 0.774793061152
RMSE= 0.938876085654 MAE= 0.747718147747
RMSE= 1.0739897982 MAE= 0.783620826521
RMSE= 1.01422361489 MAE= 0.783248504868
RMSE= 1.02247415644 MAE= 0.765808945644
RMSE= 0.981561664428 MAE= 0.777913880668
Mean RMSE for unpopular Dataset= 1.02136917897
Mean MAE for unpopular Dataset= 0.772428644394


In [8]:
#Question 33
hvd=getHighVariance(rating_matrix)
print "Fetched high variance Dataset from the original dataset"
reader = Reader(rating_scale=(1,5))
dfP=pd.DataFrame(hvd, columns=['userId', 'movieId', 'rating', 'timestamp'])

hvdDataset=Dataset.load_from_df(dfP[['userId', 'movieId', 'rating']],reader)

RMSE_hvd=[]
MAE_hvd=[]

for trainset, testset in kf.split(hvdDataset):
    predicted=NBFilter(testset)
    actual=np.array(testset)[:,2]
    RMSE=mse(actual, predicted)
    MAE=mae(actual,predicted)
    print "RMSE=",RMSE,"MAE=",MAE
    RMSE_hvd.append(RMSE)
    MAE_hvd.append(MAE)    
print "Mean RMSE for high variance Dataset=",np.mean(RMSE_hvd)
print "Mean MAE for high variance Dataset=",np.mean(MAE_hvd)

Fetched high variance Dataset from the original dataset
RMSE= 2.70643415533 MAE= 1.36991099242
RMSE= 2.46184348662 MAE= 1.34490006231
RMSE= 3.01217532071 MAE= 1.43760077665
RMSE= 1.93243676235 MAE= 1.17544389159
RMSE= 1.85780279764 MAE= 1.15159695919
RMSE= 2.91311786092 MAE= 1.36387372975
RMSE= 2.6684724921 MAE= 1.30271581604
RMSE= 2.07083713935 MAE= 1.21345623523
RMSE= 1.8307411001 MAE= 1.07442960091
RMSE= 2.09404022376 MAE= 1.18505576007
Mean RMSE for high variance Dataset= 2.35479013389
Mean MAE for high variance Dataset= 1.26189838242
