In [None]:
#Q24-25
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate, KFold, train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score 

# Define the format: comma-separated "user,item,rating,timestamp" rows, with the
# first (header) line of the CSV skipped.
reader = Reader(line_format="user item rating timestamp", sep=',', skip_lines=1)
# Load the data from the file using the reader format
data = Dataset.load_from_file('ml-latest-small/ratings.csv', reader=reader)

# Single definition of the latent-factor grid. The sweep below and every plot
# x-axis (including the ones in later cells) use this same array, so the x and
# y series can never end up with different lengths.
kvals = np.arange(2, 51, 2)

meanRMSE = []
meanMAE = []

for k in kvals:
    # One pre-formatted string prints identically whether print is a statement
    # (Python 2) or a function (Python 3).
    print("------------------For k= %s --------------------" % k)
    algo = SVD(n_factors=k, random_state=37)
    # 10-fold cross-validation; only the per-fold test errors are kept.
    ans = cross_validate(algo, data, cv=10, verbose=True)
    meanRMSE.append(np.mean(ans['test_rmse']))
    meanMAE.append(np.mean(ans['test_mae']))

# Average RMSE on its own.
plt.xlabel("Number of latent factors")
plt.ylabel("Average RMSE")
plt.title("Average RMSE vs Number of latent factors")
plt.plot(kvals, meanRMSE)
plt.show()

# Average MAE on its own.
plt.xlabel("Number of latent factors")
plt.ylabel("Average MAE")
plt.title("Average MAE vs Number of latent factors")
plt.plot(kvals, meanMAE)
plt.show()

# Both error measures on shared axes for direct comparison.
plt.xlabel("Number of latent factors")
plt.ylabel("Average Error")
plt.title("Average Error vs Number of latent factors")
plt.plot(kvals, meanRMSE)
plt.plot(kvals, meanMAE)
plt.legend(["Avg RMSE", "Avg MAE"])
plt.show()

In [None]:
#Q26 (Popular Movies)

# define a cross-validation iterator
# An integer random_state makes every kf.split(data) call produce the same
# folds, so this cell and the later cells that reuse `kf` are all evaluated on
# identical train/test splits and their results are directly comparable.
kf = KFold(n_splits=10, random_state=37)

meanRMSE1 = []

# Iterate over the same grid that is used for the x-axis below (`kvals` is
# defined in the Q24-25 cell), so both series always have the same length.
for k in kvals:
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print("------------------For k= %s --------------------" % k)
    algo = SVD(n_factors=k, random_state=37)
    rmse = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        # NOTE(review): getPopular is defined outside this section; presumably
        # it trims the fold's test set to popular movies and 2 is its
        # rating-count threshold -- confirm against its definition.
        newtest = getPopular(testset, 2)
        predictions = algo.test(newtest)
        rmse.append(accuracy.rmse(predictions))
    # Mean over the folds, rounded to 4 decimals for plotting.
    meanRMSE1.append(round(np.mean(rmse), 4))

plt.xlabel("Number of latent factors")
plt.ylabel("Average RMSE")
plt.title("Popular Movies \nAverage RMSE vs Number of latent factors")
plt.plot(kvals, meanRMSE1)
plt.ylim([0.8,0.9])
plt.show()

In [None]:
#Q27 (Unpopular Movies)
meanRMSE2 = []

# Iterate over the same grid that is used for the x-axis below (`kvals` is
# defined in the Q24-25 cell), so both series always have the same length.
# `kf` is the cross-validation iterator created in the Q26 cell.
for k in kvals:
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print("------------------For k= %s --------------------" % k)
    algo = SVD(n_factors=k, random_state=37)
    rmse = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        # NOTE(review): getUnpopular is defined outside this section;
        # presumably it trims the fold's test set to unpopular movies and 2 is
        # its rating-count threshold -- confirm against its definition.
        newtest = getUnpopular(testset, 2)
        predictions = algo.test(newtest)
        rmse.append(accuracy.rmse(predictions))
    # Mean over the folds, rounded to 4 decimals for plotting.
    meanRMSE2.append(round(np.mean(rmse), 4))

plt.xlabel("Number of latent factors")
plt.ylabel("Average RMSE")
plt.title("Unpopular Movies \n Average RMSE vs Number of latent factors")
plt.plot(kvals, meanRMSE2)
plt.ylim([0.8,1.0])
plt.show()

In [None]:
#Q28 (High Variance Movies)
meanRMSE3 = []

# Iterate over the same grid that is used for the x-axis below (`kvals` is
# defined in the Q24-25 cell), so both series always have the same length.
# `kf` is the cross-validation iterator created in the Q26 cell.
for k in kvals:
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print("------------------For k= %s --------------------" % k)
    algo = SVD(n_factors=k, random_state=37)
    rmse = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        # NOTE(review): getHighVariance is defined outside this section;
        # presumably it trims the fold's test set to movies whose ratings have
        # high variance -- confirm against its definition.
        newtest = getHighVariance(testset)
        predictions = algo.test(newtest)
        rmse.append(accuracy.rmse(predictions))
    # Mean over the folds, rounded to 4 decimals for plotting.
    meanRMSE3.append(round(np.mean(rmse), 4))

plt.xlabel("Number of latent factors")
plt.ylabel("Average RMSE")
plt.title("High Variance \n Average RMSE vs Number of latent factors")
plt.plot(kvals, meanRMSE3)
plt.ylim([1.1,1.5])
plt.show()

In [None]:
#Q29
def classifyData(testarray, k):
    """Binarise a sequence of ratings against the cut-off ``k``.

    Each entry of ``testarray`` is converted with ``float()`` (so numeric
    strings are accepted) and mapped to 0 when it is strictly below ``k``
    and to 1 otherwise.

    Returns a new list of 0/1 labels with the same length as ``testarray``.
    """
    return [
        0 if float(testarray[idx]) < k else 1
        for idx in range(len(testarray))
    ]

def plot_roc(actual, predicted, classifier_name):
    """Plot and show the ROC curve for one set of labels and scores.

    actual          -- ground-truth labels, passed to roc_curve as y_true
    predicted       -- scores, passed to roc_curve as y_score
    classifier_name -- text used as the figure title
    """
    fpr, tpr, _thresholds = roc_curve(actual, predicted)

    # The ROC curve itself, then the reference diagonal from (0, 0) to (1, 1).
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.plot([0, 1], [0, 1])

    plt.title(classifier_name)
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')

    # The y-range extends to 1.2, leaving empty space above the curve.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.2])

    plt.legend(loc="best")
    plt.show()
    
# Rating cut-offs used to binarise the ground truth: classifyData labels a
# rating below the cut-off as 0 and everything else as 1.
threshold = [2.5, 3, 3.5, 4]

# Single 90/10 split; fit SVD with 14 latent factors on the training part.
trainset, testset = train_test_split(data, test_size=0.1, random_state=42)
algo = SVD(n_factors = 14, random_state = 37)
algo.fit(trainset)
predictions = algo.test(testset)

# Read the true and the estimated rating from the same Prediction tuples so
# the two arrays are aligned by construction. Building them as float arrays
# avoids going through a mixed-type np.array, which stringifies the ratings
# and gives an object-dtype score column. Neither array depends on the
# threshold, so both are computed once, outside the loop.
testarray = np.array([p.r_ui for p in predictions], dtype=float)
pred = np.array([p.est for p in predictions], dtype=float)

for k in threshold:
    # Pre-formatted strings print identically on Python 2 and Python 3.
    print('--------------------Threshold= %s -----------------------' % k)
    test = classifyData(testarray, k)
    plot_roc(test, pred, "MF with bias")

    auc = roc_auc_score(test, pred)
    print('Area under ROC =  %s' % auc)

In [None]:
#Q38
def getPR(predictions, k, threshold=3):
    """Average precision@k and recall@k over users.

    predictions -- iterable of 5-tuples ``(uid, iid, true_r, est, details)``
    k           -- length of the recommendation list; must be a positive int
    threshold   -- an item counts as liked by a user when its true rating is
                   strictly greater than this value (default 3)

    For every user, the items in ``predictions`` are ranked by estimated
    rating (highest first) and the first ``k`` form the recommendation list.
    A user contributes to the averages only if they have at least one liked
    item and at least ``k`` predicted items.

    Returns ``(precision, recall)`` averaged over the contributing users, or
    ``(0.0, 0.0)`` when no user qualifies (instead of dividing by zero).
    """
    # Plain dicts with setdefault keep the function free of extra imports.
    scored_by_user = {}   # uid -> [(iid, est), ...]
    liked_by_user = {}    # uid -> set of iids with true_r > threshold
    for uid, iid, true_r, est, _ in predictions:
        scored_by_user.setdefault(uid, []).append((iid, est))
        if true_r > threshold:
            liked_by_user.setdefault(uid, set()).add(iid)

    precision_sum = 0.0
    recall_sum = 0.0
    n_users = 0
    # Only users with at least one liked item appear in liked_by_user.
    for uid, liked in liked_by_user.items():
        scored = scored_by_user[uid]
        if len(scored) < k:
            # Not enough predictions to build a list of k recommendations.
            continue
        # sorted() is stable, so ties keep their order of appearance.
        top_k = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]
        hits = len(liked.intersection(item for item, _ in top_k))
        precision_sum += hits / float(k)
        recall_sum += hits / float(len(liked))
        n_users += 1

    if n_users == 0:
        return 0.0, 0.0
    return precision_sum / n_users, recall_sum / n_users



# Recommendation-list sizes to evaluate. The sweep and every plot below use
# this one definition, so the x-axis always matches the computed values.
tvals = list(range(1, 26))

# `kf` (Q26 cell) and `algo` (last bound in the Q29 cell, SVD with 14 latent
# factors) are reused from earlier cells. The predictions of a fold do not
# depend on the list size, so each fold is fitted and predicted once here
# rather than once per list size.
fold_predictions = []
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    fold_predictions.append(algo.test(testset))

recall=[]
precision=[]
for i in tvals:
    # Pre-formatted string prints identically on Python 2 and Python 3.
    print("Computing for k= %s" % i)
    prec=0
    rec=0

    for predictions in fold_predictions:
        p,r=getPR(predictions,i)
        prec=prec+p
        rec=rec+r
    # Average over the folds actually produced, not a hard-coded 10.
    precision.append(prec/float(len(fold_predictions)))
    recall.append(rec/float(len(fold_predictions)))
    print(precision)
    print(recall)

plt.xlabel("Number of recommendations")
plt.ylabel("Average Precision")
plt.title("Average Precision vs Number of recommendations \n for MF with bias")
plt.plot(tvals, precision)
plt.show()

plt.xlabel("Number of recommendations")
plt.ylabel("Average Recall")
plt.title("Average Recall vs Number of recommendations \n for MF with bias")
plt.plot(tvals, recall)
plt.show()

# Both measures on shared axes; title and y label name both series.
plt.xlabel("Number of recommendations")
plt.ylabel("Average Precision / Recall")
plt.title("Average Precision and Recall vs Number of recommendations \n for MF with bias")
plt.plot(tvals, precision)
plt.plot(tvals, recall)
plt.legend(["Precision", "Recall"])
plt.show()

plt.xlabel("Average Recall")
plt.ylabel("Average Precision")
plt.title("Precision - Recall curve for MF with bias")
plt.plot(recall, precision)
plt.show()