In [1]:
import json
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
# evaluation functions for recommender PERFORMANCE

In [3]:
def filter_test_set(test_df, min_user_test_entries):
  """
  filters test set to only include users for whom we have enough data to evaluate them on
  """
  counts = test_df['user_id'].value_counts()
  from collections import Counter
  counter = Counter(counts)
  print(counter)
  keep_users = counts[counts >= min_user_test_entries].index
  return test_df[test_df['user_id'].isin(keep_users)]

In [4]:
def highly_rated_recall_at_k(user_recommendations, train_df, test_df, user_id, k=20):
  """
  user_recommendations: list of book recommendations for the user
  train_df: dataset to make recommendations
  test_df: validation dataset
  user_id: the user's id
  k: number of books to recommend

  computes recall @ k for highly rated books
  """
  # get set of "ground truth books" (books that appear for the user in the test set, which they have rated >= 7)
  user_ratings_test = test_df[test_df.user_id == user_id]
  if len(user_ratings_test) == 0:
    return 0
  highly_rated = user_ratings_test[user_ratings_test['rating'] >= 7]
  ground_truth = set(highly_rated['join_title'])

  num_matches = len([book for book in user_recommendations if book in ground_truth])
  user_test_books = len(ground_truth)

  return num_matches / user_test_books if user_test_books else 0


In [5]:
def recall_at_k(user_recommendations, train_df, test_df, user_id, k=20):
  """
  user_recommendations: list of book recommendations for the user
  train_df: dataset to make recommendations
  test_df: validation dataset
  user_id: the user's id
  k: number of books to recommend

  computes recall @ k
  """
  # get set of "ground truth books" (books that appear for the user in the test set)
  user_ratings_test = test_df[test_df.user_id == user_id].copy()
  if len(user_ratings_test) == 0:
    return 0

  ground_truth = set(user_ratings_test['join_title'])

  num_matches = len([book for book in user_recommendations if book in ground_truth])
  user_test_books = len(ground_truth)

  return num_matches / user_test_books if user_test_books else 0

In [6]:
def precision_at_k(user_recommendations, train_df, test_df, user_id, k=20):
  """
  user_recommendations: list of book recommendations for the user
  train_df: dataset to make recommendations
  test_df: validation dataset
  user_id: the user's id
  k: number of books to recommend

  computes precision @ k
  """
  # get set of "ground truth books" (books that appear for the user in the test set)
  user_test = test_df[test_df.user_id == user_id]
  if len(user_test) == 0:
      return 0
  ground_truth = set(user_test['join_title'])

  num_matches = len([book for book in user_recommendations if book in ground_truth])

  return num_matches / k

In [7]:
def highly_rated_precision_at_k(user_recommendations, train_df, test_df, user_id, k=20):
  """
  user_recommendations: list of book recommendations for the user
  train_df: dataset to make recommendations
  test_df: validation dataset
  user_id: the user's id
  k: number of books to recommend

  computes precision @ k for highly rated books
  """
  # get set of "ground truth books" (books that appear for the user in the test set, which they have rated >= 7)
  user_test = test_df[test_df.user_id == user_id]
  if len(user_test) == 0:
      return 0
  ground_truth = set(user_test[user_test.rating >= 7]['join_title'])

  num_matches = len([book for book in user_recommendations if book in ground_truth])

  return num_matches / k

In [8]:
def f1_score(precision, recall):
  """
  computes f1 score from precision and recall
  """
  return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

In [9]:
def evaluate(recommendations, train_df, test_df, k=20):
  """
  recommendations: a dictionary mapping user_ids (strings) to lists of book recommendations for each user
  train_df: dataset to make recommendations
  test_df: validation dataset
  user_id: the user's id
  k: number of books to recommend
  """
  # get data for only evaluation users
  valid_user_ids = list(set(train_df.user_id.unique()) & set(test_df.user_id.unique()))

  eval_user_ids = [user_id for user_id in test_df.user_id.unique()]

  highly_rated_recalls = []
  recalls = []
  precisions = []
  highly_rated_precisions = []
  for user_id in tqdm(eval_user_ids):
    user_id_str = str(user_id)
    user_recommendations = recommendations[user_id_str]
    highly_rated_recall = highly_rated_recall_at_k(user_recommendations, train_df, test_df, user_id, k=20)
    recall = recall_at_k(user_recommendations, train_df, test_df, user_id, k=20)
    precision = precision_at_k(user_recommendations, train_df, test_df, user_id, k=20)
    highly_rated_precision = highly_rated_precision_at_k(user_recommendations, train_df, test_df, user_id, k=20)
    highly_rated_recalls.append(highly_rated_recall)
    recalls.append(recall)
    precisions.append(precision)
    highly_rated_precisions.append(highly_rated_precision)

  f1_scores = [f1_score(p, r) for p, r in zip(precisions, recalls)]
  avg_f1_score = np.mean(f1_scores)
  highly_rated_f1_scores = [f1_score(p, r) for p, r in zip(highly_rated_precisions, highly_rated_recalls)]
  avg_highly_rated_f1_score = np.mean(highly_rated_f1_scores)

  avg_highly_rated_recall = np.mean(highly_rated_recalls)
  avg_recall = np.mean(recalls)
  avg_precision = np.mean(precisions)
  avg_highly_rated_precision = np.mean(highly_rated_precisions)

  print(f"\nAverage recall@k={k}: {avg_recall} for k = {k}")
  print(f"\nAverage highly rated recall@k={k}: {avg_highly_rated_recall}")
  print(f"\nAverage precision@k={k}: {avg_precision}")
  print(f"\nAverage highly rated precision@k={k}: {avg_highly_rated_precision}")
  print(f"\nAverage f1@k={k}: {avg_f1_score}")
  print(f"\nAverage highly rated f1@k={k}: {avg_highly_rated_f1_score}")
  print(f"\nNumber of users evaluated: {len(eval_user_ids)}")


In [10]:
# load datasets
train_df = pd.read_csv('train_edges.csv')
test_df = pd.read_csv('test_edges.csv')

In [11]:
train_df = train_df.rename(columns={'User-ID':'user_id','Book-Rating': 'rating'})
test_df = test_df.rename(columns={'User-ID':'user_id','Book-Rating': 'rating'})

In [12]:
print("Before filtering test data:")
valid_user_ids = list(set(train_df.user_id.unique()) & set(test_df.user_id.unique()))
print(f"Evaluation users: {len(valid_user_ids)}")

Before filtering test data:
Evaluation users: 9055


In [13]:
min_user_test_entries = 5
filtered_test_df = filter_test_set(test_df, min_user_test_entries)

Counter({1: 4986, 2: 1458, 3: 681, 4: 416, 5: 269, 6: 199, 7: 130, 8: 126, 9: 96, 10: 72, 11: 64, 13: 59, 12: 54, 14: 46, 15: 35, 16: 34, 17: 32, 20: 25, 18: 23, 21: 20, 22: 19, 19: 17, 30: 15, 23: 14, 26: 13, 25: 13, 35: 10, 37: 8, 32: 8, 27: 8, 34: 7, 33: 7, 24: 7, 31: 6, 48: 5, 46: 5, 28: 5, 51: 4, 42: 4, 40: 4, 29: 4, 76: 3, 43: 3, 84: 2, 80: 2, 62: 2, 53: 2, 50: 2, 49: 2, 45: 2, 44: 2, 39: 2, 38: 2, 36: 2, 326: 1, 117: 1, 104: 1, 93: 1, 92: 1, 86: 1, 82: 1, 79: 1, 71: 1, 70: 1, 66: 1, 59: 1, 58: 1, 57: 1, 56: 1, 54: 1, 52: 1, 47: 1, 41: 1})


In [14]:
print("After filtering test data:")
valid_user_ids = list(set(train_df.user_id.unique()) & set(filtered_test_df.user_id.unique()))
print(f"Evaluation users: {len(valid_user_ids)}")

After filtering test data:
Evaluation users: 1514


In [15]:
# read in recommendations from json file
# file_path = "collaborative_filtering_recommendations_k_50_users_100.json"
# file_path = "item_based_collaborative_filtering_recommendations_k_50_items_50.json"
file_path = "neural_collaborative_filtering_recommendations_k_50.json"
with open(file_path) as json_file:
  recommendations = json.load(json_file)
  print("data type:", type(recommendations))

data type: <class 'dict'>


In [16]:
# run evaluation: recall, precision, f1
k = 50
# evaluate on original test data
evaluate(recommendations, train_df, test_df, k)

100%|██████████| 9055/9055 [00:20<00:00, 446.54it/s]


Average recall@k=50: 0.02705120687688505 for k = 50

Average highly rated recall@k=50: 0.01697551546953054

Average precision@k=50: 0.00462175593594699

Average highly rated precision@k=50: 0.0017172832689122033

Average f1@k=50: 0.0062398114942204186

Average highly rated f1@k=50: 0.002874432244399456

Number of users evaluated: 9055





In [17]:
# evaluate on filtered test data
evaluate(recommendations, train_df, filtered_test_df, k)

100%|██████████| 1514/1514 [00:04<00:00, 354.70it/s]


Average recall@k=50: 0.027652143287226393 for k = 50

Average highly rated recall@k=50: 0.02689121042047492

Average precision@k=50: 0.017569352708058126

Average highly rated precision@k=50: 0.005746367239101717

Average f1@k=50: 0.018924653884147154

Average highly rated f1@k=50: 0.00874001866869081

Number of users evaluated: 1514





In [18]:
# evaluation functions for recommendation BIAS? tbd