# Evaluation of Recommender Systems

Using the same dataset as in previous weeks, let us evaluate the Collaborative Filtering (CF) models implemented last week.

## Exercise 1

1. Load the test set and the predictions made with both Collaborative Filtering models in the previous session. 
2. Identify the users who appear in the training set but not in the test set. Remove their predictions before evaluating the systems.
3. Report the Root Mean Square Error (RMSE) for both CF models defined in the previous session.

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise import KNNWithMeans
import surprise
from collections import defaultdict

In [2]:
import gzip
import os
import json
import pandas as pd
import numpy as np
def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file whose lines are each a JSON document.

    Yields:
        The object produced by ``json.loads`` for each line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed early); the bare ``gzip.open`` call used before never did.
    with gzip.open(path, 'rb') as g:
        for line in g:
            yield json.loads(line)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Args:
        path: File location forwarded unchanged to ``parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per record; the row label is the
        record's zero-based position in the stream.
    """
    # Number the records 0, 1, 2, ... so each one becomes a row keyed by position.
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the raw reviews into a DataFrame (one row per review).
df = getDF('All_Beauty_5.json.gz')

# Sort so that, within each (reviewerID, asin) pair, the most recent review comes last.
df = df.sort_values(by=['reviewerID', 'asin', 'unixReviewTime'])
# Drop reviews without a rating and keep only the latest review of each item by each user.
cleaned_dataset = df.dropna(subset=['overall']).drop_duplicates(subset=['reviewerID', 'asin'], keep = 'last').reset_index(drop=True)
# print(len(cleaned_dataset))
# cleaned_dataset.head()
# Re-sort chronologically within each user so that keep='last' below picks the newest row,
# and reset the index so the row labels used by drop() further down are unique.
cleaned_dataset = cleaned_dataset.sort_values(by=['reviewerID', 'unixReviewTime']).reset_index(drop=True)
# Test candidates: the latest (in time) positively rated item (rating >= 4) of each user.
test_data_pre = cleaned_dataset[cleaned_dataset.overall >= 4.0].drop_duplicates(subset=['reviewerID'], keep='last')
# Training data: every remaining row (the test candidates are removed by row label).
training_data = cleaned_dataset.drop(test_data_pre.index)

# Keep only test rows whose user also appears in the training set.
user_in_training = test_data_pre['reviewerID'].isin(training_data['reviewerID'])
test_data = test_data_pre[user_in_training]

# Wrap both splits as Surprise datasets on a 1-5 rating scale.
reader = Reader(rating_scale=(1, 5))
training = Dataset.load_from_df(training_data[['reviewerID', 'asin', 'overall']], reader=reader)
# NOTE(review): `testing` is not referenced again in the cells visible here.
testing = Dataset.load_from_df(test_data[['reviewerID', 'asin', 'overall']], reader=reader)

sim_options = {'name': 'cosine',
               'user_based': True  # True: similarities are computed between users, not items
               }

# Two CF models: SVD and a k-nearest-neighbours model with mean-centred ratings.
algo_svd = SVD(n_epochs=500, n_factors=30, random_state=0)
algo_knn = KNNWithMeans(k = 10 , sim_options = sim_options)

trainset = training.build_full_trainset()
# NOTE(review): per the Surprise docs, build_anti_testset() yields the (user, item) pairs
# that are absent from the trainset, with a fill value (the global mean by default)
# standing in for the true rating. An RMSE computed on these pairs therefore compares
# estimates against that fill value, not against observed ratings -- confirm this is intended.
testset = trainset.build_anti_testset()

In [3]:
import os
import sys
sys.path.append('../')
import pickle
import pandas as pd

# TEST
### YOUR CODE HERE ###

# PREDICTIONS
# "nb" = KNNWithMeans model, "lf" = SVD model; both are fitted on the same
# trainset and asked to score the same candidate (user, item) pairs.
pred_nb_list = algo_knn.fit(trainset).test(testset)
pred_lf_list = algo_svd.fit(trainset).test(testset)
### YOUR CODE HERE ###

# Detect users from training set that are not in test
test_users = set(test_data['reviewerID'])
nb_users = {pred.uid for pred in pred_nb_list}
lf_users = {pred.uid for pred in pred_lf_list}
nb_missing_users = nb_users - test_users
lf_missing_users = lf_users - test_users
# Compare the sets themselves: lists built from sets carry no guaranteed order,
# so an element-wise list comparison could fail even when the users are identical.
assert nb_missing_users == lf_missing_users
nb_users_in_pred_but_not_in_test = list(nb_missing_users)
lf_users_in_pred_but_not_in_test = list(lf_missing_users)
print(f"There are {len(lf_users_in_pred_but_not_in_test)} users in the training set that are not in the test set.")
print(len(pred_lf_list))
print(len(pred_nb_list))
# rmse() prints "RMSE: ..." on its own and print() then shows the returned value,
# so each score appears twice in the cell output.
print(surprise.accuracy.rmse(pred_lf_list))
print(surprise.accuracy.rmse(pred_nb_list))
# Remove these users' predictions for evaluation
### YOUR CODE HERE ###

Computing the cosine similarity matrix...
Done computing similarity matrix.
There are 32 users in the training set that are not in the test set.
54746
54746
RMSE: 0.7136
0.7135826611787258
RMSE: 0.8990
0.8989955134770856


In [4]:
# Show the type of a single element of the prediction list returned by .test().
print(type(pred_lf_list[0]))

<class 'surprise.prediction_algorithms.predictions.Prediction'>


In [36]:
# Keep only the predictions of users that also have a test item.
# The excluded users are put in a set once so each membership test is O(1)
# instead of scanning a list for every prediction.
excluded_lf_users = set(lf_users_in_pred_but_not_in_test)
rear_lf_list = [pred for pred in pred_lf_list if pred.uid not in excluded_lf_users]

excluded_nb_users = set(nb_users_in_pred_but_not_in_test)
rear_nb_list = [pred for pred in pred_nb_list if pred.uid not in excluded_nb_users]
    

In [37]:
# Number of predictions left per model after dropping users without a test item.
print(len(rear_lf_list))
print(len(rear_nb_list))
# rmse() prints "RMSE: ..." itself and print() then shows the value it returns,
# which is why every score shows up twice in the output below.
print(surprise.accuracy.rmse(rear_lf_list))
print(surprise.accuracy.rmse(rear_nb_list))

52988
52988
RMSE: 0.5486
0.5485685650260251
RMSE: 0.6856
0.6855684531262348


## Exercise 2
Define a general method to get the top-k recommendations for each user. Print the top-k recommendations (k = 5 and k = 10) for the user with ID 'ARARUVZ8RUF5T', together with their estimated ratings.

In [38]:
def top_prediction(rank, pred_list, uid):
    """Return the ``rank`` best-scored items predicted for one user.

    Args:
        rank: Maximum number of entries to return.
        pred_list: Iterable of prediction objects exposing ``uid``, ``iid``
            and ``est`` attributes.
        uid: Identifier of the user whose predictions are selected.

    Returns:
        A list of ``(iid, est)`` tuples ordered by decreasing ``est``.
        Predictions with equal estimates keep their relative order from
        ``pred_list``. The list is empty when the user has no predictions.
    """
    user_preds = [pred for pred in pred_list if pred.uid == uid]
    ranked = sorted(user_preds, key=lambda pred: pred.est, reverse=True)
    return [(pred.iid, pred.est) for pred in ranked[:rank]]

# Top-10 (item ID, estimated rating) pairs for user 'ARARUVZ8RUF5T':
# first from the SVD predictions, then from the KNNWithMeans predictions.
# The first five entries of each list are that model's top-5.
print(top_prediction(10, rear_lf_list, 'ARARUVZ8RUF5T'))
print(top_prediction(10, rear_nb_list, 'ARARUVZ8RUF5T'))

[('B006IB5T4W', 5), ('B001F51RAG', 5), ('B00NT0AR7E', 5), ('B006WYJM8Y', 5), ('B00155Z6V2', 5), ('B00021DJ32', 4.993923352784167), ('B001QY8QXM', 4.948791090341968), ('B000X7ST9Y', 4.840934395462433), ('B00126LYJM', 4.839149265457805), ('B002RZZXYE', 4.834041803397214)]
[('B000WR2HB6', 5), ('B000FOI48G', 4.675), ('B000VV1YOY', 4.666666666666667), ('B001ET7FZE', 4.6), ('B000PKKAGO', 4.5), ('B00EF1QRMU', 4.470205150915517), ('B016V8YWBC', 4.458333333333333), ('B00W259T7G', 4.42), ('B00CZH3K1C', 4.333333333333334), ('B000GLRREU', 4.233333333333333)]


## Exercise 3
Report Precision@k (P@k), MAP@k and the MRR@k with k={5, 10, 20} averaged across users for both CF systems. When computing precision, we consider as relevant items those with an observed rating >= 4.0 (i.e., those items from the test set with a rating >= 4.0). Reflect on the differences obtained. 

In [56]:
# Binary user x item relevance matrix built from the test set:
# rows are test users, columns are test items, 1 marks an observed rating >= 4.
relevant_matrix = pd.DataFrame(columns = test_data['asin'].drop_duplicates(), index = test_data['reviewerID'].drop_duplicates())
# Walk the three columns in lockstep; the earlier version rebuilt three full
# Python lists on every iteration just to read one element from each.
for reviewerID, asin, rate in zip(test_data.reviewerID, test_data.asin, test_data.overall):
    relevant_matrix.loc[reviewerID, asin] = 1 if rate >= 4 else 0
# (user, item) pairs that never occur in the test set count as not relevant.
relevant_matrix = relevant_matrix.fillna(0)

In [73]:

def p_k_user(filted_pred_list, user_id, cut_off, relevant_matrix:pd.DataFrame):
    """Return Precision@k for a single user.

    Args:
        filted_pred_list: The user's ranked predictions, best first, as a
            sequence whose elements carry the item ID at index 0.
        user_id: Row label of the user in ``relevant_matrix``.
        cut_off: The k in Precision@k; also the denominator.
        relevant_matrix: User x item table holding 1 for relevant pairs and
            0 otherwise.

    Returns:
        The number of relevant items among the first ``cut_off`` predictions
        divided by ``cut_off``. Users or items missing from the matrix count
        as not relevant.

    Raises:
        ZeroDivisionError: If ``cut_off`` is 0.
    """
    hits = 0
    # Never look past the end of the list: a user may have fewer than
    # cut_off predictions, which previously raised IndexError.
    for position in range(min(cut_off, len(filted_pred_list))):
        item_id = filted_pred_list[position][0]
        try:
            hits += relevant_matrix.loc[user_id, item_id]
        except KeyError:
            # Unknown user or item: contributes nothing to the hit count.
            continue
    return hits / float(cut_off)

def ap_k_user(filted_pred_list, user_id, cut_off, relevant_matrix:pd.DataFrame):
    """Return Average Precision@k for a single user.

    Args:
        filted_pred_list: The user's ranked predictions, best first, as a
            sequence whose elements carry the item ID at index 0.
        user_id: Row label of the user in ``relevant_matrix``.
        cut_off: The k in AP@k; only the first ``cut_off`` predictions are
            inspected.
        relevant_matrix: User x item table assumed to hold only 0/1 values,
            with 1 marking relevant pairs.

    Returns:
        The sum of Precision@r over every rank r <= ``cut_off`` that holds a
        relevant item, divided by the user's total number of relevant items.
        Returns 0.0 when the user is unknown or has no relevant item.
    """
    try:
        num_relevance = sum(relevant_matrix.loc[user_id, :])
    except KeyError:
        # Unknown user: scored 0 like in the sibling per-user metrics.
        return 0.0
    if num_relevance == 0:
        # Nothing to retrieve; avoids the division by zero below.
        return 0.0

    hits = 0
    precision_sum = 0.0
    # Bounded by the list length so short prediction lists do not raise IndexError.
    for position in range(min(cut_off, len(filted_pred_list))):
        item_id = filted_pred_list[position][0]
        try:
            is_relevant = relevant_matrix.loc[user_id, item_id] == 1
        except KeyError:
            # Item absent from the matrix: not relevant.
            continue
        if is_relevant:
            # A running hit count gives Precision@(position+1) directly,
            # without re-scanning the prefix of the list for every hit.
            hits += 1
            precision_sum += hits / float(position + 1)
    return precision_sum / float(num_relevance)

def rr_k_user(filted_pred_list, user_id, cut_off, relevant_matrix:pd.DataFrame):
    """Return the Reciprocal Rank@k for a single user.

    Args:
        filted_pred_list: The user's ranked predictions, best first, as a
            sequence whose elements carry the item ID at index 0.
        user_id: Row label of the user in ``relevant_matrix``.
        cut_off: Number of top predictions to inspect.
        relevant_matrix: User x item table holding 1 for relevant pairs.

    Returns:
        ``1 / rank`` of the first relevant item within the first ``cut_off``
        predictions (rank counted from 1), or 0 when there is none.
    """
    # Bounded by the list length so short prediction lists do not raise IndexError.
    for rank in range(1, min(cut_off, len(filted_pred_list)) + 1):
        item_id = filted_pred_list[rank - 1][0]
        try:
            if relevant_matrix.loc[user_id, item_id] == 1:
                return 1 / float(rank)
        except KeyError:
            # Unknown user or item: not relevant, keep scanning.
            continue
    return 0

def hr_k_user(filted_pred_list, user_id, cut_off, relevant_matrix:pd.DataFrame):
    """Return the hit indicator@k for a single user.

    Args:
        filted_pred_list: The user's ranked predictions, best first, as a
            sequence whose elements carry the item ID at index 0.
        user_id: Row label of the user in ``relevant_matrix``.
        cut_off: Number of top predictions to inspect.
        relevant_matrix: User x item table holding 1 for relevant pairs.

    Returns:
        1 if at least one of the first ``cut_off`` predictions is relevant
        for the user, otherwise 0.
    """
    # Bounded by the list length so short prediction lists do not raise IndexError.
    for position in range(min(cut_off, len(filted_pred_list))):
        item_id = filted_pred_list[position][0]
        try:
            if relevant_matrix.loc[user_id, item_id] == 1:
                return 1
        except KeyError:
            # Unknown user or item: not relevant, keep scanning.
            continue
    return 0

def mean_k(pred_list, cut_off, function, relevant_matrix:pd.DataFrame):
    """Average a per-user ranking metric over the users of ``relevant_matrix``.

    Args:
        pred_list: Iterable of prediction objects exposing ``uid``, ``iid``
            and ``est`` attributes.
        cut_off: The k passed on to ``function``.
        function: Per-user metric called as
            ``function(ranked, user_id, cut_off, relevant_matrix)``, where
            ``ranked`` is that user's ``(iid, est)`` tuples sorted by
            decreasing ``est``.
        relevant_matrix: User x item relevance table; its number of rows is
            the denominator of the average.

    Returns:
        The sum of the metric over every user present in ``pred_list``
        divided by the number of rows of ``relevant_matrix``. Users of the
        matrix that have no predictions therefore contribute 0.

    Raises:
        ZeroDivisionError: If ``relevant_matrix`` has no rows.
    """
    num_users = len(relevant_matrix.index)

    # Group the predictions per user in a single pass over pred_list.
    user_item_rating = defaultdict(list)
    for pred in pred_list:
        user_item_rating[pred.uid].append((pred.iid, pred.est))

    summation = []
    for user_id, filted_pred_list in user_item_rating.items():
        # Rank this user's items from highest to lowest estimated rating.
        filted_pred_list.sort(key=lambda x: x[1], reverse=True)
        summation.append(function(filted_pred_list, user_id, cut_off, relevant_matrix))
    return sum(summation)/float(num_users)


In [74]:
# P@5, MAP@5 and MRR@5 (in that order) for the KNNWithMeans predictions only.
# The test set holds exactly one item per user, rated >= 4, so average
# precision and reciprocal rank coincide for every user (see the output below).
print(mean_k(rear_nb_list, 5, p_k_user, relevant_matrix))
print(mean_k(rear_nb_list, 5, ap_k_user, relevant_matrix))
print(mean_k(rear_nb_list, 5, rr_k_user, relevant_matrix))


0.14794520547945267
0.1674569722514925
0.1674569722514925


## Exercise 4

Based on the top-5, top-10 and top-20 predictions from Exercise 2, compute the systems’ hit rate averaged over the total number of users in the test set.

In [75]:
# Hit rate@5 for the KNNWithMeans predictions, averaged over the users of relevant_matrix.
print(mean_k(rear_nb_list, 5, hr_k_user, relevant_matrix))

0.7397260273972602
