# This project is contributed by Yanfeng, Garvit and Hyosang

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import surprise
from surprise.prediction_algorithms.knns import KNNWithMeans
from surprise.model_selection import cross_validate, KFold, train_test_split
from tqdm import tqdm
from surprise.accuracy import rmse
from sklearn.metrics import roc_curve, roc_auc_score
from surprise.prediction_algorithms.matrix_factorization import NMF, SVD

import seaborn as sns
sns.set()

## Question 1

In [None]:
#################### Question 1A ########################
# Load the ratings table, build the user x movie rating matrix R and report how much of it is filled.
rating_file_name = "ratings.csv"
dataset = pd.read_csv(rating_file_name)
# DataFrame.info is a method: it has to be called to print the column/dtype summary
# (printing the attribute itself only shows the bound-method repr).
dataset.info()
num_available_rating = len(dataset)  # one row per available rating
print('num_available_rating:', num_available_rating)
# construct the Rating matrix R through the pivot table (users as rows, movies as columns);
# user/movie pairs without a rating are filled with 0
R = pd.pivot_table(data=dataset, index='userId', columns='movieId', values='rating', fill_value=0)
print(R)
num_possible_rating = R.size  # number of cells of R = users x movies
print(num_possible_rating)
# "sparsity" as computed here is the fraction of cells of R that hold a rating
sparsity = num_available_rating / num_possible_rating
print('sparsity=', sparsity)

In [None]:
#################### Question 1B ########################
# Histogram of the rating values. The bin edges are shifted by 0.25 so that each
# multiple of 0.5 sits in the middle of its own bin.
rating_bin_edges = np.arange(0, 6, 0.5) - 0.25
rating_ticks = np.arange(0, 5.5, 0.5)
plt.hist(dataset['rating'], bins=rating_bin_edges, rwidth=0.5)
plt.xticks(rating_ticks)
plt.xlabel('Rating Values')
plt.ylabel('Frequency')
plt.title('Frequency of the Rating Values')
plt.savefig('Q1b.png', dpi=300)
plt.show()

In [None]:
#################### Question 1C ########################
# Number of ratings each movie received, sorted from the most to the least rated movie.
R_mat = R.values
# Unrated cells of R hold the pivot's fill value 0, so the non-zero entries of a column are
# the ratings that movie received. A single vectorized count replaces the Python double
# loop over every user/movie cell.
num_rating_per_movie = np.count_nonzero(R_mat, axis=0).tolist()
movieID = R.columns.values
# new_mat: first row: movie ID; second row: number of ratings of each movie
new_mat = np.vstack((movieID, np.array(num_rating_per_movie)))
index = new_mat[1, :].argsort()[::-1]  # column order by decreasing number of ratings
new_mat = new_mat[:, index]
print(new_mat)
# the IDs are plotted as strings so the x-axis keeps the sorted order instead of the numeric ID order
movieID_new = [str(x) for x in new_mat[0, :]]
plt.plot(movieID_new[:20], new_mat[1, :20])  # only the 20 most-rated movies
plt.xlabel('Movie ID')
plt.xticks(fontsize=6)
plt.ylabel('Frequency of Ratings')
plt.title('Distribution of the number of ratings received among movies (Portion)')
plt.savefig('Q1c1.png', dpi=300)
plt.show()

# to observe the rating frequency of every movie, but do not print the movieId on X-axis.
plt.plot(np.arange(R.shape[1]), new_mat[1, :])
plt.ylabel('Frequency of Ratings')
plt.savefig('Q1c2.png', dpi=300)
plt.show()

In [None]:
#################### Question 1D ########################
# Number of ratings each user gave, sorted from the most to the least active user.
R_mat = R.values
# Unrated cells of R hold the pivot's fill value 0, so the non-zero entries of a row are
# the ratings that user gave. A single vectorized count replaces the Python double loop
# over every user/movie cell.
num_rating_per_user = np.count_nonzero(R_mat, axis=1).tolist()
userID = R.index.values.reshape(1, -1)
# new_mat: first row: user ID; second row: number of ratings of each user
new_mat = np.vstack((userID, np.array(num_rating_per_user)))
index = new_mat[1, :].argsort()[::-1]  # column order by decreasing number of ratings
new_mat = new_mat[:, index]
print(new_mat)
# the IDs are plotted as strings so the x-axis keeps the sorted order instead of the numeric ID order
userID_new = [str(x) for x in new_mat[0, :]]
plt.plot(userID_new[:20], new_mat[1, :20])  # only the 20 most active users
plt.xlabel('User ID')
plt.xticks(fontsize=6)
plt.ylabel('Frequency of Ratings')
plt.title('Distribution of the number of ratings among users (Portion)')
plt.savefig('Q1d1.png', dpi=300)
plt.show()

# to observe the rating frequency of every user, but do not print the userId on X-axis.
plt.plot(np.arange(R.shape[0]), new_mat[1, :])
plt.ylabel('Frequency of Ratings')
plt.savefig('Q1d2.png', dpi=300)
plt.show()

In [None]:
#################### Question 1F ########################
# Histogram of the variance of the ratings each movie received.
R_mat = R.values  # re-derived from R so the cell does not rely on R_mat left over from another cell
# Each row of R_mat.T is one movie's column of R; entries <= 0 (the fill value 0 for
# "no rating") are dropped before the variance is taken.
variance = [np.var(movie_ratings[movie_ratings > 0]) for movie_ratings in R_mat.T]

plt.hist(variance, bins=np.arange(0, max(variance)+1, 0.5)-0.25, rwidth=0.5)
plt.title('Frequency of the Rating Variance of movies')
plt.xlabel('Rating Variance')
plt.ylabel('Frequency')
plt.xticks(np.arange(0, 5.5, 0.5))
plt.savefig('Q1f.png', dpi=300)
plt.show()

## Question 4

In [None]:
#################### Question 4 ########################
# 10-fold cross-validation of KNNWithMeans (Pearson similarity) for k = 2, 4, ..., 100;
# the RMSE and MAE averaged over the folds are collected per k and plotted against k.
rating_reader = surprise.Reader(rating_scale=(0.5, 5), skip_lines=0)
dataset_sup = surprise.Dataset.load_from_df(dataset[['userId', 'movieId', 'rating']],
                                            reader=rating_reader)
print(dataset_sup)
k_neighbors = np.arange(2, 102, 2)
rmse_ave, mae_ave = [], []
for num_neighbors in tqdm(range(2, 102, 2)):
    knn_model = KNNWithMeans(k=num_neighbors, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
    cv_result = cross_validate(algo=knn_model, data=dataset_sup, measures=['rmse', 'mae'],
                               cv=10, n_jobs=-1, verbose=False)
    rmse_ave.append(np.mean(cv_result['test_rmse']))
    mae_ave.append(np.mean(cv_result['test_mae']))

# one saved figure per error measure
for error_curve, y_label, out_file in ((rmse_ave, 'Average RMSE', 'Q4_rmse.png'),
                                       (mae_ave, 'Average MAE', 'Q4_mae.png')):
    plt.plot(k_neighbors, error_curve)
    plt.ylabel(y_label)
    plt.xlabel('k')
    plt.savefig(out_file, dpi=300)
    plt.show()

## Question 5

In [None]:
#################### Question 5 ########################
# Successive differences of the averaged error curves (entry n is error[n+1] - error[n]),
# plotted to make it easier to eyeball where the curves flatten out.
rmse_diff = np.diff(rmse_ave)
mae_diff = np.diff(mae_ave)
k_neighbors_diff = np.arange(2, 100, 2)  # x-axis values 2, 4, ..., 98

for diff_curve, y_label, out_file in ((rmse_diff, 'RMSE Diffence', 'Q5_rmse_diffence.png'),
                                      (mae_diff, 'MAE Diffence', 'Q5_mae_diffence.png')):
    plt.plot(k_neighbors_diff, diff_curve)
    plt.ylabel(y_label)
    plt.xlabel('k')
    plt.savefig(out_file, dpi=300)
    plt.show()

## Question 6

In [None]:
#################### Question 6 ########################
# Per-movie statistics (number of ratings, rating variance) used by the trimming step.
dataset_original = dataset.copy()  # the original dataset without trimming

R_mat = R.values
# Ratings received per movie = non-zero entries per column of R (0 is the fill value for
# "no rating"); counted in one vectorized call instead of a Python loop over every cell.
num_rating_per_movie = np.count_nonzero(R_mat, axis=0).tolist()
movieID = R.columns.values
new_mat = np.vstack((movieID, np.array(num_rating_per_movie)))
# print(new_mat)
# variance of the ratings given to each movie; entries <= 0 (unrated cells) are excluded
variance = [np.var(movie_ratings[movie_ratings > 0]) for movie_ratings in R_mat.T]
new_var_mat = np.vstack((new_mat, np.array(variance)))
# new_var_mat: first row: movie ID; second row: number of ratings of each movie; third row: variance of rating of each movie
print(new_var_mat)

def trimming(var_mat, dataset, option='popular'):  # trim the dataset (pd.dataframe format)
    """Return a copy of ``dataset`` restricted to the movies selected by ``option``.

    Parameters
    ----------
    var_mat : 2-D array with three rows
        Row 0 holds the movie IDs, row 1 the number of ratings of each movie and
        row 2 the variance of the ratings of each movie.
    dataset : pandas.DataFrame
        Ratings table with a 'movieId' column. It is not modified.
    option : str
        'popular'       -> movies with more than 2 ratings;
        'unpopular'     -> movies with at most 2 ratings;
        'high_variance' -> movies with variance >= 2 and at least 5 ratings.
        Any other value leaves the data untrimmed.

    Returns
    -------
    pandas.DataFrame
        The kept rows in their original order, re-indexed 0..n-1.
    """
    movie_ids = var_mat[0, :]
    rating_counts = var_mat[1, :]
    if option == 'popular':
        keep_movie = rating_counts > 2
    elif option == 'unpopular':
        keep_movie = rating_counts <= 2
    elif option == 'high_variance':
        keep_movie = (var_mat[2, :] >= 2) & (rating_counts >= 5)
    else:
        # unknown option: nothing is trimmed, only the index is reset
        return dataset.copy().reset_index(drop=True)

    # A single vectorized membership test on the column values. This does not depend on the
    # frame's index labels (row i need not carry label i) and avoids scanning the ID array
    # once per rating row.
    row_is_kept = np.isin(dataset['movieId'].to_numpy(), movie_ids[keep_movie])
    return dataset.loc[row_is_kept].reset_index(drop=True)


# Trim the ratings three ways, then print each trimmed table followed by the untrimmed one.
dataset_popular = trimming(new_var_mat, dataset_original, option='popular')
dataset_unpopular = trimming(new_var_mat, dataset_original, option='unpopular')
dataset_high_variance = trimming(new_var_mat, dataset_original, option='high_variance')
for ratings_frame in (dataset_popular, dataset_unpopular, dataset_high_variance, dataset_original):
    print(ratings_frame)


def _to_surprise_dataset(ratings_frame):
    # wrap a ratings DataFrame as a Surprise dataset with the rating scale 0.5-5
    return surprise.Dataset.load_from_df(ratings_frame[['userId', 'movieId', 'rating']],
                                         reader=surprise.Reader(rating_scale=(0.5, 5), skip_lines=0))


# Note: Use the following Superise-format dataset when you call the models in Surprise
dataset_sup_original = _to_surprise_dataset(dataset_original)            # without trimming
dataset_sup_popular = _to_surprise_dataset(dataset_popular)              # Popular movie trimming
dataset_sup_unpopular = _to_surprise_dataset(dataset_unpopular)          # UnPopular movie trimming
dataset_sup_high_variance = _to_surprise_dataset(dataset_high_variance)  # High variance movie trimming


In [None]:
# Popular movie trimming, KNN
# Note: It will cost about 10 minutes to run the codes in this cell
# For every k, fit/test KNNWithMeans on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 102, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_neighbors in tqdm(range(2, 102, 2)):
    knn = KNNWithMeans(k=num_neighbors, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_popular):  # do the 10-fold split
        fold_predictions = knn.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('k')
plt.ylabel('Average RMSE within Popular Movies')
plt.savefig('Q6_popular_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the k at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
print(min_rmse)
print(min_rmse_index)
print(k_neighbors[min_rmse_index])  # best k

In [None]:
# Unpopular movie trimming, KNN
# For every k, fit/test KNNWithMeans on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 102, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_neighbors in tqdm(range(2, 102, 2)):
    knn = KNNWithMeans(k=num_neighbors, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_unpopular):
        fold_predictions = knn.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('k')
plt.ylabel('Average RMSE within Unpopular Movies')
plt.savefig('Q6_unpopular_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the k at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
print(min_rmse)
print(min_rmse_index)
print(k_neighbors[min_rmse_index])  # best k

In [None]:
# High_variance movie trimming, KNN
# For every k, fit/test KNNWithMeans on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 102, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_neighbors in tqdm(range(2, 102, 2)):
    knn = KNNWithMeans(k=num_neighbors, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_high_variance):
        fold_predictions = knn.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('k')
plt.ylabel('Average RMSE within High Variance Movies')
plt.savefig('Q6_high_variance_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the k at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
print(min_rmse)
print(min_rmse_index)
print(k_neighbors[min_rmse_index])  # best k

In [None]:
def get_rating(userID, movieID):
    """Look up the rating stored in R_mat for ``userID`` / ``movieID`` and return it as a float.

    The row is located through R's index values and the column through R's column
    values. (Helper is currently not used.)
    """
    user_rows = np.where(R.index.values == userID)
    movie_cols = np.where(R.columns.values == movieID)
    matched_cell = R_mat[user_rows, movie_cols]
    return float(np.squeeze(matched_cell))

def draw_roc_curve(fpr, tpr, model_name):
    """Draw ``tpr`` against ``fpr`` in a new figure titled ``model_name`` and show it.

    (Helper is currently not used.)
    """
    plt.figure()
    plt.plot(fpr, tpr)
    plt.title(model_name)
    plt.xlabel("False Postive Rate (FPR)")
    plt.ylabel("True Postive Rate (TPR)")
    plt.show()

In [None]:
# ROC and AUC of popular movies, KNN
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_popular, test_size=0.1, random_state=42)
# Note: k should be the best k in terms of RMSE
knn = KNNWithMeans(k=46, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
predict = knn.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds of popular trimming (KNN)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q6_popular_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of unpopular movies, KNN
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_unpopular, test_size=0.1, random_state=42)
# Note: k should be the best k in terms of RMSE
knn = KNNWithMeans(k=2, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
predict = knn.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds of unpopular trimming (KNN)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q6_unpopular_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of high variance movies, KNN
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_high_variance, test_size=0.1, random_state=42)
# Note: k should be the best k in terms of RMSE
knn = KNNWithMeans(k=2, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
predict = knn.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds of high variance trimming (KNN)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q6_high_variance_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of original movies, KNN
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_original, test_size=0.1, random_state=42)
# k is the eyeball minimum k in Q5
knn = KNNWithMeans(k=30, min_k=1, sim_options={'name': 'pearson'}, verbose=False)
predict = knn.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds without trimming (KNN)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q6_original_ROC.png', dpi=300)
plt.show()

# save threshold = 3.0 values for ROC comparison (position 1 in rating_thresholds)
knn_fpr, knn_tpr, knn_auc = fpr_4[1], tpr_4[1], auc_4[1]

## Question 8

In [None]:
#################### Question 8A ########################
# It is similar to Question 4: 10-fold cross-validation of NMF for 2, 4, ..., 50 latent
# factors, collecting the RMSE and MAE averaged over the folds.
print(dataset_sup_original)
k_neighbors = np.arange(2, 52, 2)
rmse_ave, mae_ave = [], []
for num_factors in tqdm(range(2, 52, 2)):
    nmf = NMF(n_factors=num_factors, n_epochs=50, random_state=42)
    cv_result = cross_validate(algo=nmf, data=dataset_sup_original, measures=['rmse', 'mae'],
                               cv=10, n_jobs=-1, verbose=False)
    rmse_ave.append(np.mean(cv_result['test_rmse']))
    mae_ave.append(np.mean(cv_result['test_mae']))

# one saved figure per error measure
for error_curve, y_label, out_file in ((rmse_ave, 'Average RMSE', 'Q8A_rmse.png'),
                                       (mae_ave, 'Average MAE', 'Q8A_mae.png')):
    plt.plot(k_neighbors, error_curve)
    plt.ylabel(y_label)
    plt.xlabel('Number of latent factors')
    plt.savefig(out_file, dpi=300)
    plt.show()

In [None]:
#################### Question 8B ########################
# Note: the minimum RMSE/MAE index may vary when re-run the cell above, but the results are similar (around 16-22)
# Report the smallest average RMSE / MAE (first one on ties) and the number of latent factors it belongs to.
k_neighbors = np.arange(2, 52, 2)

min_rmse_ave_index, min_rmse_ave = min(enumerate(rmse_ave), key=lambda pair: pair[1])
print('Min average RMSE:', min_rmse_ave)
print('Min average RMSE index:', min_rmse_ave_index)
print('Num of latent factors of Min average RMSE:', k_neighbors[min_rmse_ave_index])

min_mae_ave_index, min_mae_ave = min(enumerate(mae_ave), key=lambda pair: pair[1])
print('Min average MAE:', min_mae_ave)
print('Min average MAE index:', min_mae_ave_index)
print('Num of latent factors of Min average MAE:', k_neighbors[min_mae_ave_index])

In [None]:
#################### Question 8C ########################
# Popular movie trimming, NMF
# Note: It will cost about 10 minutes to run the codes in this cell
# For every number of latent factors, fit/test NMF on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 52, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_factors in tqdm(range(2, 52, 2)):
    nmf = NMF(n_factors=num_factors, n_epochs=50, random_state=42)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_popular):
        fold_predictions = nmf.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('Number of latent factors')
plt.ylabel('Average RMSE within Popular Movies')
plt.savefig('Q8C_popular_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the factor count at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
print(min_rmse)
print(min_rmse_index)
print(k_neighbors[min_rmse_index])

In [None]:
# Unpopular movie trimming, NMF
# For every number of latent factors, fit/test NMF on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 52, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_factors in tqdm(range(2, 52, 2)):
    nmf = NMF(n_factors=num_factors, n_epochs=50, random_state=42)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_unpopular):
        fold_predictions = nmf.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('Number of latent factors')
plt.ylabel('Average RMSE within Unpopular Movies')
plt.savefig('Q8C_unpopular_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the factor count at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
print(min_rmse)
print(min_rmse_index)
print(k_neighbors[min_rmse_index])

In [None]:
# High_variance movie trimming, NMF
# For every number of latent factors, fit/test NMF on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 52, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_factors in tqdm(range(2, 52, 2)):
    nmf = NMF(n_factors=num_factors, n_epochs=50, random_state=42)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_high_variance):
        fold_predictions = nmf.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('Number of latent factors')
plt.ylabel('Average RMSE within High Variance Movies')
plt.savefig('Q8C_high_variance_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the factor count at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
print(min_rmse)
print(min_rmse_index)
print(k_neighbors[min_rmse_index])

In [None]:
# ROC and AUC of popular movies, NMF
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_popular, test_size=0.1, random_state=42)
# Note: n_factors should be the best k in terms of RMSE
nmf = NMF(n_factors=16, n_epochs=50, random_state=42)
predict = nmf.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds of popular trimming (NMF)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q8_popular_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of unpopular movies, NMF
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_unpopular, test_size=0.1, random_state=42)
# Note: n_factors should be the best k in terms of RMSE
nmf = NMF(n_factors=50, n_epochs=50, random_state=42)
predict = nmf.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds of unpopular trimming (NMF)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q8_unpopular_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of high variance movies, NMF
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_high_variance, test_size=0.1, random_state=42)
# Note: n_factors should be the best k in terms of RMSE
nmf = NMF(n_factors=50, n_epochs=50, random_state=42)
predict = nmf.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds of high variance trimming (NMF)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q8_high_variance_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of original movies, NMF
# One 90/10 split; the estimated ratings serve as scores and the true ratings are
# binarised at each threshold to obtain one ROC curve / AUC per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4, tpr_4, hold_4, auc_4 = [], [], [], []

trainset, validset = train_test_split(dataset_sup_original, test_size=0.1, random_state=42)
# Note: n_factors should be the best k in terms of RMSE/MAE
nmf = NMF(n_factors=18, n_epochs=50, random_state=42)
predict = nmf.fit(trainset).test(validset)
# the scores do not depend on the threshold, so they are collected once
predict_rating = [prediction.est for prediction in predict]
for threshold in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset
    real_rating = [0.0 if sample[2] < threshold else 1.0 for sample in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    auc = roc_auc_score(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
for curve_fpr, curve_tpr, curve_threshold in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(curve_fpr, curve_tpr, label='threshold='+str(curve_threshold))
plt.title('ROC curve for four thresholds without trimming (NMF)')
plt.xlabel("False Postive Rate (FPR)")
plt.ylabel("True Postive Rate (TPR)")
plt.legend(loc="lower right")
plt.savefig('Q8_original_ROC.png', dpi=300)
plt.show()

# save threshold = 3.0 values for ROC comparison (position 1 in rating_thresholds)
nmf_fpr, nmf_tpr, nmf_auc = fpr_4[1], tpr_4[1], auc_4[1]

## Question 9

In [None]:
#################### Question 9 ########################
# retrain nmf here by setting n_factors=20, then inspect the fitted factor matrices
trainset, validset = train_test_split(dataset_sup_original, test_size=0.1, random_state=42)
nmf = NMF(n_factors=20, n_epochs=50, random_state=42)
predict = nmf.fit(trainset).test(validset)
U, V = nmf.pu, nmf.qi
# why # of rows of V is not identical to # of movies? because some unpopular movies are not in the train set
for factor_label, factor_matrix in (('U:', U), ('V:', V)):
    print(factor_label, factor_matrix)
    print(factor_matrix.shape)

In [None]:
# For each latent factor (column of V), print the genres of the 10 movies with the largest value in that column.
sorted_index = []
for i in range(V.shape[1]):
    idx = np.argsort(-V[:, i])  # descending order
    sorted_index.append(idx[0:10])
print(sorted_index)
genres_file_name = "movies.csv"
generes_dataset = pd.read_csv(genres_file_name)
# DataFrame.info is a method: call it to print the summary instead of the bound-method repr
generes_dataset.info()
# Rows of V are numbered by Surprise's *inner* item ids, which are neither the raw movieId
# values nor row positions in movies.csv. Each inner id is translated back to the raw
# movieId through the trainset the model was fitted on, and the genres are looked up by
# movieId (assumes movies.csv has a unique 'movieId' column, as in MovieLens - confirm).
genres_by_movie_id = generes_dataset.set_index('movieId')['genres']
for i, inner_ids in enumerate(sorted_index):
    print('Column=', i)
    raw_movie_ids = [nmf.trainset.to_raw_iid(int(inner_id)) for inner_id in inner_ids]
    # reindex keeps the ranking order and yields NaN for an id missing from movies.csv
    print(genres_by_movie_id.reindex(raw_movie_ids))

## Question 10

In [None]:
#################### Question 10A ########################
# It is similar to Question 4: 10-fold cross-validation of SVD for 2, 4, ..., 50 latent
# factors, collecting the RMSE and MAE averaged over the folds.
print(dataset_sup_original)
k_neighbors = np.arange(2, 52, 2)
rmse_ave, mae_ave = [], []
for num_factors in tqdm(range(2, 52, 2)):
    svd = SVD(n_factors=num_factors, random_state=42, verbose=False)
    cv_result = cross_validate(algo=svd, data=dataset_sup_original, measures=['rmse', 'mae'],
                               cv=10, n_jobs=-1, verbose=False)
    rmse_ave.append(np.mean(cv_result['test_rmse']))
    mae_ave.append(np.mean(cv_result['test_mae']))

# one saved figure per error measure
for error_curve, y_label, out_file in ((rmse_ave, 'Average RMSE', 'Q10A_rmse.png'),
                                       (mae_ave, 'Average MAE', 'Q10A_mae.png')):
    plt.plot(k_neighbors, error_curve)
    plt.ylabel(y_label)
    plt.xlabel('Number of latent factors')
    plt.savefig(out_file, dpi=300)
    plt.show()

In [None]:
#################### Question 10B ########################
# Note: the minimum RMSE/MAE index may vary when re-run the cell above, but the results are similar (around 16-22)
# Report the smallest average RMSE / MAE (first one on ties) and the number of latent factors it belongs to.
k_neighbors = np.arange(2, 52, 2)

min_rmse_ave_index, min_rmse_ave = min(enumerate(rmse_ave), key=lambda pair: pair[1])
k_original = k_neighbors[min_rmse_ave_index]  # factor count with the smallest average RMSE
print('Min average RMSE:', min_rmse_ave)
print('Min average RMSE index:', min_rmse_ave_index)
print('Num of latent factors of Min average RMSE:', k_original)

min_mae_ave_index, min_mae_ave = min(enumerate(mae_ave), key=lambda pair: pair[1])
print('Min average MAE:', min_mae_ave)
print('Min average MAE index:', min_mae_ave_index)
print('Num of latent factors of Min average MAE:', k_neighbors[min_mae_ave_index])

In [None]:
#################### Question 10C ########################
# Popular movie trimming, SVD
# Note: It will cost about 10 minutes to run the codes in this cell
# For every number of latent factors, fit/test SVD on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 52, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_factors in tqdm(range(2, 52, 2)):
    svd = SVD(n_factors=num_factors, random_state=42)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_popular):
        fold_predictions = svd.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('Number of latent factors')
plt.ylabel('Average RMSE within Popular Movies')
plt.savefig('Q10C_popular_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the factor count at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
k_popular = k_neighbors[min_rmse_index]
print(min_rmse)
print(min_rmse_index)
print(k_popular)

In [None]:
# Unpopular movie trimming, SVD
# For every number of latent factors, fit/test SVD on each of the 10 folds and keep the mean RMSE.
k_neighbors = np.arange(2, 52, 2)
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
for num_factors in tqdm(range(2, 52, 2)):
    svd = SVD(n_factors=num_factors, random_state=42)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_unpopular):
        fold_predictions = svd.fit(trainset).test(testset)
        rmse_fold.append(rmse(predictions=fold_predictions, verbose=False))
    rmse_scores.append(np.mean(rmse_fold))

plt.plot(k_neighbors, rmse_scores)
plt.xlabel('Number of latent factors')
plt.ylabel('Average RMSE within Unpopular Movies')
plt.savefig('Q10C_unpopular_rmse.png', dpi=300)
plt.show()

# smallest mean RMSE (first one on ties), its position and the factor count at that position
min_rmse_index, min_rmse = min(enumerate(rmse_scores), key=lambda pair: pair[1])
k_unpopular = k_neighbors[min_rmse_index]
print(min_rmse)
print(min_rmse_index)
print(k_unpopular)

In [None]:
# High_variance movie trimming, SVD
# Sweep the number of latent factors and record the 10-fold average RMSE of
# SVD on the high-variance-movie dataset, then keep the best value in k_high_var.
k_neighbors = np.arange(2, 52, 2)  # candidate numbers of latent factors
kf = KFold(n_splits=10, random_state=42)
rmse_scores = []
# Iterate over k_neighbors itself (as plain Python ints) so the swept values
# and the plotted x-axis come from a single definition.
for i in tqdm(k_neighbors.tolist()):
    svd = SVD(n_factors=i, random_state=42)
    rmse_fold = []
    for trainset, testset in kf.split(dataset_sup_high_variance):
        high_variance_rating_predict = svd.fit(trainset).test(testset)
        score = rmse(predictions=high_variance_rating_predict, verbose=False)
        rmse_fold.append(score)
    rmse_scores.append(np.mean(rmse_fold))
plt.plot(k_neighbors, rmse_scores)
plt.ylabel('Average RMSE within High Variance Movies')
plt.xlabel('Number of latent factors')
plt.savefig('Q10C_high_variance_rmse.png', dpi=300)
plt.show()
min_rmse = min(rmse_scores)
min_rmse_index = rmse_scores.index(min_rmse)  # position of the first minimum
# Label the output the same way as the untrimmed-dataset report.
print('Min average RMSE:', min_rmse)
print('Min average RMSE index:', min_rmse_index)
print('Num of latent factors of Min average RMSE:', k_neighbors[min_rmse_index])
k_high_var = k_neighbors[min_rmse_index]

In [None]:
# ROC and AUC of popular movies, SVD
# Fit SVD once on a 90/10 split of the popular-movie dataset, then binarize
# the true ratings at each threshold and draw one ROC curve per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4 = []
tpr_4 = []
hold_4 = []
auc_4 = []

trainset, validset = train_test_split(dataset_sup_popular, test_size=0.1, random_state=42)
svd = SVD(n_factors=k_popular, random_state=42)  # Note: n_factors should be the best k in terms of RMSE
predict = svd.fit(trainset).test(validset)

# The estimated ratings are the ROC scores. They do not depend on the
# threshold, so collect them once instead of once per threshold.
predict_rating = [prediction.est for prediction in predict]
for thres in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset:
    # ratings below the threshold become 0, all others become 1
    real_rating = [0.0 if entry[2] < thres else 1.0 for entry in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc = roc_auc_score(real_rating, predict_rating)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
# Pair each curve with its threshold so the plot follows rating_thresholds
# rather than a hard-coded count.
for fpr, tpr, thres in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(fpr, tpr, label='threshold='+str(thres))
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title('ROC curve for four thresholds of popular trimming (SVD)')
plt.legend(loc="lower right")
plt.savefig('Q10_popular_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of unpopular movies, SVD
# Fit SVD once on a 90/10 split of the unpopular-movie dataset, then binarize
# the true ratings at each threshold and draw one ROC curve per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4 = []
tpr_4 = []
hold_4 = []
auc_4 = []

trainset, validset = train_test_split(dataset_sup_unpopular, test_size=0.1, random_state=42)
svd = SVD(n_factors=k_unpopular, random_state=42)  # Note: n_factors should be the best k in terms of RMSE
predict = svd.fit(trainset).test(validset)

# The estimated ratings are the ROC scores. They do not depend on the
# threshold, so collect them once instead of once per threshold.
predict_rating = [prediction.est for prediction in predict]
for thres in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset:
    # ratings below the threshold become 0, all others become 1
    real_rating = [0.0 if entry[2] < thres else 1.0 for entry in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc = roc_auc_score(real_rating, predict_rating)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
# Pair each curve with its threshold so the plot follows rating_thresholds
# rather than a hard-coded count.
for fpr, tpr, thres in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(fpr, tpr, label='threshold='+str(thres))
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title('ROC curve for four thresholds of unpopular trimming (SVD)')
plt.legend(loc="lower right")
plt.savefig('Q10_unpopular_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of high variance movies, SVD
# Fit SVD once on a 90/10 split of the high-variance-movie dataset, then
# binarize the true ratings at each threshold and draw one ROC curve per
# threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4 = []
tpr_4 = []
hold_4 = []
auc_4 = []

trainset, validset = train_test_split(dataset_sup_high_variance, test_size=0.1, random_state=42)
svd = SVD(n_factors=k_high_var, random_state=42)  # Note: n_factors should be the best k in terms of RMSE
predict = svd.fit(trainset).test(validset)

# The estimated ratings are the ROC scores. They do not depend on the
# threshold, so collect them once instead of once per threshold.
predict_rating = [prediction.est for prediction in predict]
for thres in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset:
    # ratings below the threshold become 0, all others become 1
    real_rating = [0.0 if entry[2] < thres else 1.0 for entry in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc = roc_auc_score(real_rating, predict_rating)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
# Pair each curve with its threshold so the plot follows rating_thresholds
# rather than a hard-coded count.
for fpr, tpr, thres in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(fpr, tpr, label='threshold='+str(thres))
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title('ROC curve for four thresholds of high variance trimming (SVD)')
plt.legend(loc="lower right")
plt.savefig('Q10_high_variance_ROC.png', dpi=300)
plt.show()

In [None]:
# ROC and AUC of original movies, SVD
# Fit SVD once on a 90/10 split of the untrimmed dataset, then binarize the
# true ratings at each threshold and draw one ROC curve per threshold.
rating_thresholds = [2.5, 3, 3.5, 4]
fpr_4 = []
tpr_4 = []
hold_4 = []
auc_4 = []

trainset, validset = train_test_split(dataset_sup_original, test_size=0.1, random_state=42)
svd = SVD(n_factors=k_original, random_state=42)  # Note: n_factors should be the best k in terms of RMSE/MAE
predict = svd.fit(trainset).test(validset)

# The estimated ratings are the ROC scores. They do not depend on the
# threshold, so collect them once instead of once per threshold.
predict_rating = [prediction.est for prediction in predict]
for thres in rating_thresholds:
    # apply the rating threshold (such as 2.5, 3.0) on y_true in validset:
    # ratings below the threshold become 0, all others become 1
    real_rating = [0.0 if entry[2] < thres else 1.0 for entry in validset]
    fpr, tpr, hold = roc_curve(real_rating, predict_rating)
    fpr_4.append(fpr)
    tpr_4.append(tpr)
    hold_4.append(hold)
    auc = roc_auc_score(real_rating, predict_rating)
    auc_4.append(auc)

print('AUC for four thresholds:', auc_4)
# Pair each curve with its threshold so the plot follows rating_thresholds
# rather than a hard-coded count.
for fpr, tpr, thres in zip(fpr_4, tpr_4, rating_thresholds):
    plt.plot(fpr, tpr, label='threshold='+str(thres))
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title('ROC curve for four thresholds without trimming (SVD)')
plt.legend(loc="lower right")
plt.savefig('Q10_original_ROC.png', dpi=300)
plt.show()

# save threshold = 3.0 values for ROC comparison
# Look the position up instead of hard-coding it, so reordering
# rating_thresholds cannot silently select a different curve.
mf_index = rating_thresholds.index(3)
mf_fpr = fpr_4[mf_index]
mf_tpr = tpr_4[mf_index]
mf_auc = auc_4[mf_index]

## Question 11

In [None]:
from surprise import AlgoBase, PredictionImpossible


class Naive(AlgoBase):
    """Naive collaborative filter: predict each user's mean training rating.

    The item id is ignored; every item gets the same estimate for a given
    user. No fit() override is needed because the estimate is computed
    directly from the trainset stored by AlgoBase.fit().
    """

    def __init__(self):
        AlgoBase.__init__(self)

    def estimate(self, uid, iid):
        """Return the mean of the ratings user `uid` gave in the trainset.

        Args:
            uid: inner user id supplied by Surprise.
            iid: inner item id (unused).

        Raises:
            PredictionImpossible: if the user has no rating in the trainset.
                Surprise then substitutes its default prediction.
        """
        # Without this guard, looking up a user that is absent from the
        # trainset gives an empty rating list, and np.mean([]) is NaN (with a
        # RuntimeWarning) instead of a usable estimate.
        if not self.trainset.knows_user(uid):
            raise PredictionImpossible('User is unknown.')
        return np.mean([r for (_, r) in self.trainset.ur[uid]])

In [None]:
# Original dataset
# 10-fold cross-validated RMSE of the naive filter on the untrimmed dataset.
naive = Naive()
# Pass a seeded KFold rather than cv=10 so the folds are reproducible and
# match the KFold(n_splits=10, random_state=42) used by the other cells.
score = cross_validate(algo=naive, data=dataset_sup_original, measures=['rmse'],
                       cv=KFold(n_splits=10, random_state=42), verbose=False)
print("Average RMSE for naive collaborative filter:", np.mean(score['test_rmse']))

In [None]:
# Popular movie trimming, Naive
# Average the naive filter's RMSE over 10 seeded folds of the popular-movie
# dataset.
naive = Naive()
kf = KFold(n_splits=10, random_state=42)

rmse_fold = []
for trainset, testset in kf.split(dataset_sup_popular):
    # fit on this fold's training part, then score its held-out part
    naive.fit(trainset)
    popular_rating_predict = naive.test(testset)
    score = rmse(popular_rating_predict, verbose=False)
    rmse_fold.append(score)
print("Average RMSE for Popular Movies:", np.mean(rmse_fold))

In [None]:
# Unpopular movie trimming, Naive
# Average the naive filter's RMSE over 10 seeded folds of the unpopular-movie
# dataset.
naive = Naive()
kf = KFold(n_splits=10, random_state=42)

rmse_fold = []
for trainset, testset in kf.split(dataset_sup_unpopular):
    # fit on this fold's training part, then score its held-out part
    naive.fit(trainset)
    unpopular_rating_predict = naive.test(testset)
    score = rmse(unpopular_rating_predict, verbose=False)
    rmse_fold.append(score)
print("Average RMSE for Unpopular Movies:", np.mean(rmse_fold))

In [None]:
# High variance movie trimming, Naive
# Average the naive filter's RMSE over 10 seeded folds of the
# high-variance-movie dataset.
naive = Naive()
kf = KFold(n_splits=10, random_state=42)

rmse_fold = []
for trainset, testset in kf.split(dataset_sup_high_variance):
    # fit on this fold's training part, then score its held-out part
    naive.fit(trainset)
    high_variance_rating_predict = naive.test(testset)
    score = rmse(high_variance_rating_predict, verbose=False)
    rmse_fold.append(score)
print("Average RMSE for High Variance Movies:", np.mean(rmse_fold))

## Question 12

In [None]:
# plot comparison of ROC curves for three architectures
# Each curve uses the fpr/tpr arrays saved by the corresponding model's ROC
# cell; a distinct line style per model keeps overlapping curves readable.
plt.figure(figsize=(8,6))
plt.plot(knn_fpr, knn_tpr, '-', label='k-NN')
plt.plot(nmf_fpr, nmf_tpr, '--', label='NMF')
plt.plot(mf_fpr, mf_tpr, '-.', label='MF')

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title('ROC curve for three models without trimming')
plt.legend(loc="lower right")
plt.show()

## Question 14

In [None]:
def clean_and_fetch_validation_set(validset, t, like_threshold=3.0):
    """Drop users that cannot be ranked at list size t and collect per-user info.

    Args:
        validset: iterable of (uid, iid, rating) entries.
        t: size of the recommended list; a user needs at least t entries in
            validset so that S(t) can hold t items.
        like_threshold: minimum rating for an item to count as liked.
            Defaults to 3.0, the value previously hard-coded here.

    Returns:
        A pair (cleaned_validset, user_information).
        cleaned_validset is a list with the entries of every user that has at
        least t entries and at least one liked item, in their original order.
        user_information maps each uid seen in the input (including users
        that were dropped) to [number_of_entries, set_of_liked_item_ids]; the
        set is the ground truth G used by the precision/recall computation.
    """
    # first pass: count entries and gather liked items per user
    user_information = {}
    for uid, iid, rating in validset:
        info = user_information.setdefault(uid, [0, set()])
        info[0] += 1
        if rating >= like_threshold:
            info[1].add(iid)

    def keep(uid):
        # condition 1: enough entries for S(t) to have t elements
        # condition 2: the user liked at least one item, so G is non-empty
        count, liked = user_information[uid]
        return count >= t and len(liked) > 0

    # second pass: keep only the entries of users meeting both conditions
    validset = [entry for entry in validset if keep(entry[0])]
    return validset, user_information

In [None]:
from operator import itemgetter


def calculate_precision_and_recall(predictions, t, user_information):
    """Return the mean per-user precision and recall of top-t recommendations.

    Args:
        predictions: iterable of 5-tuples whose first, second and fourth
            fields are the user id, item id and estimated rating.
        t: size of the recommended list S(t).
        user_information: mapping uid -> [count, liked_item_ids]; the second
            entry is the ground-truth set G of items the user liked.

    Returns:
        (mean_precision, mean_recall) over all users that have predictions
        and a non-empty G. Both are NaN when no such user exists.
    """
    # Group (item id, estimate) pairs per user. A list (not a set) keeps the
    # prediction order, so the stable sort below breaks ties between equal
    # estimates deterministically instead of by hash order.
    estimated_ratings_for_user = {}
    for uid, iid, _, estimate, _ in predictions:
        estimated_ratings_for_user.setdefault(uid, []).append((iid, estimate))

    precision_per_user = []
    recall_per_user = []
    for uid, movie_and_estimates in estimated_ratings_for_user.items():
        liked = user_information[uid][1]
        if not liked:
            # recall is undefined for an empty G; skip the user rather than
            # divide by zero
            continue
        # top t movie ids based on estimates (descending order)
        top_t = sorted(movie_and_estimates, key=itemgetter(1), reverse=True)[:t]
        St = {iid for iid, _ in top_t}
        hits = len(St & liked)
        precision_per_user.append(hits / len(St))
        recall_per_user.append(hits / len(liked))

    # return average precision and recall for all users
    return np.mean(precision_per_user), np.mean(recall_per_user)

In [None]:
def compute_pr_metrics(model, dataset):
    """Compute mean 10-fold precision and recall for list sizes t = 1..25.

    Args:
        model: algorithm exposing fit(trainset) and test(testset).
        dataset: dataset accepted by KFold.split.

    Returns:
        (precision_per_t, recall_per_t): two lists of 25 values; entry i is
        the average over the 10 folds for t = i + 1.
    """
    n_splits = 10
    # The seeded KFold yields the same folds on every split() call, so the
    # model is fitted once per fold and reused for every t, rather than being
    # refitted on identical data for each of the 25 list sizes.
    kf = KFold(n_splits=n_splits, random_state=42)
    # sweep t from 1 to 25 (inclusive) in step sizes of 1
    t_values = range(1, 26)
    precision_per_fold = {t: [] for t in t_values}
    recall_per_fold = {t: [] for t in t_values}

    for trainset, validset in tqdm(kf.split(dataset), total=n_splits):
        model.fit(trainset)
        for t in t_values:
            # get cleaned validation set and user information for this t
            cleaned_validset, user_information = clean_and_fetch_validation_set(validset, t)
            # get predictions
            predictions = model.test(cleaned_validset)
            # get precision and recall
            precision, recall = calculate_precision_and_recall(predictions, t, user_information)
            precision_per_fold[t].append(precision)
            recall_per_fold[t].append(recall)

    # average over folds, one value per t
    precision_per_t = [np.mean(precision_per_fold[t]) for t in t_values]
    recall_per_t = [np.mean(recall_per_fold[t]) for t in t_values]
    return precision_per_t, recall_per_t

In [None]:
# 3 best models
# k is the eyeball minimum k in Q5
knn = KNNWithMeans(
    k=30,
    min_k=1,
    sim_options=dict(name='pearson'),
    verbose=False,
)
# Note: n_factors should be the best k in terms of RMSE/MAE
nmf = NMF(n_factors=18, n_epochs=50, random_state=42)
svd = SVD(n_factors=k_original, random_state=42)

# compute precision and recall per t for each model
knn_precision, knn_recall = compute_pr_metrics(model=knn, dataset=dataset_sup_original)
nmf_precision, nmf_recall = compute_pr_metrics(model=nmf, dataset=dataset_sup_original)
mf_precision, mf_recall = compute_pr_metrics(model=svd, dataset=dataset_sup_original)

In [None]:
def plot_precision_recall(precision, recall, color, title):
    """Show one precision-versus-recall curve in its own 8x6 figure.

    Args:
        precision: y values (mean 10-fold precision per list size).
        recall: x values (mean 10-fold recall per list size).
        color: line colour passed to seaborn.
        title: figure title.
    """
    plt.figure(figsize=(8, 6))
    # lineplot draws on the figure just created and hands back its axes
    axes = sns.lineplot(x=recall, y=precision, color=color, marker='o')
    axes.set_xlabel('Mean 10-fold recall')
    axes.set_ylabel('Mean 10-fold precision')
    axes.set_title(title)
    plt.show()

def plot_against_t(data, ylabel, color, title):
    """Show a metric against the recommended-list size t in an 8x6 figure.

    Args:
        data: one value per list size, starting at t = 1.
        ylabel: label for the y axis.
        color: line colour passed to seaborn.
        title: figure title.
    """
    plt.figure(figsize=(8,6))
    # Derive the x axis from the data so any sweep length plots correctly;
    # for the 25-value sweeps used in this notebook this is range(1, 26).
    sns.lineplot(x=range(1, len(data) + 1), y=data, color=color, marker='o')
    plt.xlabel('Size of recommended list, t')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

In [None]:
# k-NN: precision as a function of the recommended-list size t
plot_against_t(
    data=knn_precision,
    ylabel="Mean 10-fold precision",
    color="tomato",
    title="Precision of k-NN for varying size of recommended list t",
)

In [None]:
# k-NN: recall as a function of the recommended-list size t
plot_against_t(
    data=knn_recall,
    ylabel="Mean 10-fold recall",
    color="lightsalmon",
    title="Recall of k-NN for varying size of recommended list t",
)

In [None]:
# k-NN: precision against recall
plot_precision_recall(
    precision=knn_precision,
    recall=knn_recall,
    color="firebrick",
    title="Precision-recall for k-NN with k = 30",
)

In [None]:
# NMF: precision as a function of the recommended-list size t
plot_against_t(
    data=nmf_precision,
    ylabel="Mean 10-fold precision",
    color="darkturquoise",
    title="Precision of NMF for varying size of recommended list t",
)

In [None]:
# NMF: recall as a function of the recommended-list size t
plot_against_t(
    data=nmf_recall,
    ylabel="Mean 10-fold recall",
    color="lightskyblue",
    title="Recall of NMF for varying size of recommended list t",
)

In [None]:
# NMF: precision against recall
plot_precision_recall(
    precision=nmf_precision,
    recall=nmf_recall,
    color="steelblue",
    title="Precision-recall for NMF with factors = 18",
)

In [None]:
# MF (SVD): precision as a function of the recommended-list size t
plot_against_t(
    data=mf_precision,
    ylabel="Mean 10-fold precision",
    color="khaki",
    title="Precision of MF for varying size of recommended list t",
)

In [None]:
# MF (SVD): recall as a function of the recommended-list size t
plot_against_t(
    data=mf_recall,
    ylabel="Mean 10-fold recall",
    color="wheat",
    title="Recall of MF for varying size of recommended list t",
)

In [None]:
# MF (SVD): precision against recall, titled with the selected factor count
plot_precision_recall(
    precision=mf_precision,
    recall=mf_recall,
    color="gold",
    title=f"Precision-recall for MF with factors = {k_original}",
)

In [None]:
# Plot three precision-recall curves for three best architectures

plt.figure(figsize=(8,6))
# Draw all three curves with the same seaborn call that the single-model
# precision-recall plots use, so every curve is rendered by the same rules
# (previously only the k-NN curve went through seaborn).
sns.lineplot(x=knn_recall, y=knn_precision, color='firebrick', marker='o', label='k-NN')
sns.lineplot(x=nmf_recall, y=nmf_precision, color='steelblue', marker='o', label='NMF')
sns.lineplot(x=mf_recall, y=mf_precision, color='gold', marker='o', label='MF')
plt.legend()
plt.xlabel('Mean 10-fold recall')
plt.ylabel('Mean 10-fold precision')
plt.title('Precision-recall curves for best architectures')
plt.show()