In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.sparse import csr_matrix
import numpy as np
import json

In [2]:
# Load the train and test splits from CSV files in the working directory.
train_df, test_df = (
  pd.read_csv(csv_path) for csv_path in ('train_edges.csv', 'test_edges.csv')
)


In [3]:
# Give both splits the same snake_case column names that the later cells use.
column_aliases = {'User-ID': 'user_id', 'Book-Rating': 'rating'}
train_df = train_df.rename(columns=column_aliases)
test_df = test_df.rename(columns=column_aliases)


In [4]:
# Build the user-item matrix for collaborative filtering:
# one row per user, one column per book title, cells hold the rating (NaN = unrated).
user_item_matrix = pd.pivot_table(
  train_df,
  values='rating',
  index='user_id',
  columns='join_title',
)

# Mean-centered collaborative filtering:
# https://web.stanford.edu/class/cs124/lec/collaborativefiltering21.pdf
# Subtract every user's own average rating from their row, then fill the unrated
# cells with 0 so that they sit exactly at that user's mean.
user_means = user_item_matrix.mean(axis=1, skipna=True)
user_item_matrix_mean_centered = user_item_matrix.sub(user_means, axis=0).fillna(0)


In [5]:
# Report the matrix dimensions: rows are train users, columns are books.
n_train_users, n_books = user_item_matrix.shape
print(f"Train users: {n_train_users}")
print(f"Books: {n_books}")

# Only users that appear in both splits can be evaluated.
train_user_set = set(train_df.user_id.unique())
test_user_set = set(test_df.user_id.unique())
valid_user_ids = list(train_user_set & test_user_set)
print(f"Evaluation users: {len(valid_user_ids)}")


Train users: 23780
Books: 1496
Evaluation users: 9055


In [6]:
# Work on a sparse copy of the mean-centered matrix and compute similarities only
# for the evaluation users (rows) against every training user (columns).
user_item_sparse = csr_matrix(user_item_matrix_mean_centered.to_numpy())

# Keep the evaluation users that actually have a row in the user-item matrix,
# then look up the row position of each of them.
train_user_index = user_item_matrix.index
eval_user_ids = [uid for uid in valid_user_ids if uid in train_user_index]
eval_user_indices = [train_user_index.get_loc(uid) for uid in eval_user_ids]
eval_user_data = user_item_sparse[eval_user_indices]

# Cosine similarity of each evaluation user against all training users.
user_user_similarity = cosine_similarity(eval_user_data, user_item_sparse)
print(f"User-user similarity matrix shape: {user_user_similarity.shape}")
user_user_similarity_df = pd.DataFrame(
  user_user_similarity,
  index=eval_user_ids,
  columns=train_user_index,
)


User-user similarity matrix shape: (9055, 23780)


In [7]:
def get_similar_users(user_user_similarity_df, user_id, num_similar_users):
  """
  Return the ids of the users most similar to `user_id`.

  user_user_similarity_df: eval_users x all_users dataframe of similarities
  user_id: the user id (must be a row label of user_user_similarity_df)
  num_similar_users: number of most similar users to return

  Returns a pandas Index holding at most num_similar_users user ids, ordered
  from most to least similar. Raises KeyError when user_id is not a row label.
  """
  user_similarity = user_user_similarity_df.loc[user_id]
  # The user's own column must not count as a neighbour. errors='ignore' keeps
  # this working when user_id has a row but no column in the similarity frame
  # (previously that case raised KeyError).
  user_similarity = user_similarity.drop(user_id, errors='ignore')
  most_similar = user_similarity.nlargest(num_similar_users)
  return most_similar.index


In [8]:
def recommend_books(train_df, user_user_similarity_df, user_id, num_similar_users, num_books, min_rating=7):
  """
  Recommend books to `user_id` from the books their most similar users rated highly.

  train_df: the original train dataset, to use for recommendations
            (the user_id, join_title and rating columns are read)
  user_user_similarity_df: eval_users x all_users dataframe of similarities
  user_id: the user id
  num_similar_users: number of most similar users to return
  num_books: number of books to recommend
  min_rating: lowest rating a similar user must have given a book for it to
              become a candidate (default 7, the value that used to be hard-coded)

  Returns a list of at most num_books join_title values, best ranked first.
  The list is empty when there are no similar users or no candidate books.
  """
  # get the highly rated books by similar users
  most_similar_users = get_similar_users(user_user_similarity_df, user_id, num_similar_users)

  if len(most_similar_users) == 0:
    return []

  similar_users_ratings = train_df[train_df.user_id.isin(most_similar_users)]
  similar_users_rated_highly = similar_users_ratings[similar_users_ratings.rating >= min_rating]

  # get candidate books to recommend (not rated by user in train set)
  user_rated_books = set(train_df[train_df.user_id == user_id].join_title)
  candidate_books = similar_users_rated_highly[~similar_users_rated_highly.join_title.isin(user_rated_books)]

  # use average ratings of most similar users to rank the candidates
  # https://realpython.com/build-recommendation-engine-collaborative-filtering/
  # NOTE: the average only covers ratings >= min_rating, because lower ratings
  # were filtered out above.
  ranked_candidate_books = candidate_books.groupby('join_title')['rating'].mean().sort_values(ascending=False)

  recommendations = ranked_candidate_books.head(num_books).index.tolist()
  return recommendations


In [11]:
# Run settings: neighbourhood size and length of every recommendation list.
num_similar_users = 100
k = 50

# Build {stringified user id: recommended titles} for every evaluation user.
recommendations_dict = {}
for user_id in tqdm(eval_user_ids):
  recommendations = recommend_books(
    train_df=train_df,
    user_user_similarity_df=user_user_similarity_df,
    user_id=user_id,
    num_similar_users=num_similar_users,
    num_books=k,
  )
  recommendations_dict[str(user_id)] = recommendations



100%|██████████| 9055/9055 [01:12<00:00, 124.78it/s]


In [12]:
# Derive the file name from the current k and num_similar_users instead of
# repeating their values by hand, so the name cannot go stale when they change.
file_path = f"collaborative_filtering_recommendations_k_{k}_users_{num_similar_users}.json"
with open(file_path, "w") as json_file:
  json.dump(recommendations_dict, json_file, indent=4)

In [17]:
# check similarities
# def top_k_similar_users(user_user_similarity_df, user_id, k=10):
#   # similarities for this user
#   sims = user_user_similarity_df.loc[user_id]

#   # get top-k
#   top_k = sims.nlargest(k)
#   return top_k

# print(top_k_similar_users(user_user_similarity_df, user_id=1733, k=10))

user_id
1733      1.000000
61028     0.666667
110112    0.666667
39077     0.577350
209684    0.577350
69933     0.333333
129338    0.333333
253821    0.325721
210485    0.291922
118627    0.288675
Name: 1733, dtype: float64


In [None]:
# def highly_rated_recall_at_k(train_df, test_df, user_user_similarity_df, user_id, num_similar_users, k=20):
#   """
#   train_df: dataset to make recommendations
#   test_df: validation dataset
#   user_user_similarity_df: eval_users x all_users dataframe of similarities
#   user_id: the user's id
#   num_similar_users: number of most similar users to return
#   k: number of books to recommend
#   """
#   recommendations = recommend_books(train_df, user_user_similarity_df, user_id, num_similar_users, k)

#   user_ratings_test = test_df[test_df.user_id == user_id].copy()
#   if len(user_ratings_test) == 0:
#     return 0

#   highly_rated = user_ratings_test[user_ratings_test['rating'] >= 7]
#   ground_truth = set(highly_rated['join_title'])

#   num_matches = len([book for book in recommendations if book in ground_truth])
#   user_test_books = len(ground_truth)

#   return num_matches / user_test_books if user_test_books else 0


In [None]:
# def recall_at_k(train_df, test_df, user_user_similarity_df, user_id, num_similar_users, k=20):
#   """
#   train_df: dataset to make recommendations
#   test_df: validation dataset
#   user_user_similarity_df: eval_users x all_users dataframe of similarities
#   user_id: the user's id
#   num_similar_users: number of most similar users to return
#   k: number of books to recommend
#   """
#   recommendations = recommend_books(train_df, user_user_similarity_df, user_id, num_similar_users, k)

#   user_ratings_test = test_df[test_df.user_id == user_id].copy()
#   if len(user_ratings_test) == 0:
#     return 0

#   ground_truth = set(user_ratings_test['join_title'])

#   num_matches = len([book for book in recommendations if book in ground_truth])
#   user_test_books = len(ground_truth)

#   return num_matches / user_test_books if user_test_books else 0

In [None]:
# def precision_at_k(train_df, test_df, user_user_similarity_df, user_id, num_similar_users, k=20):
#     """
#     Compute Precision@k for one user.
#     """
#     recommendations = recommend_books(train_df, user_user_similarity_df, user_id, num_similar_users, k)

#     user_test = test_df[test_df.user_id == user_id]
#     if len(user_test) == 0:
#         return 0

#     # ground_truth = set(user_test[user_test.rating >= 7]['join_title'])
#     ground_truth = set(user_test['join_title'])

#     num_matches = len([book for book in recommendations if book in ground_truth])

#     return num_matches / k


In [None]:
# # run evaluation
# num_similar_users = 100
# k = 50

# highly_rated_recalls = []
# recalls = []
# precisions = []
# for user_id in tqdm(eval_user_ids):
#   highly_rated_recall = highly_rated_recall_at_k(train_df, test_df, user_user_similarity_df, user_id, num_similar_users, k)
#   recall = recall_at_k(train_df, test_df, user_user_similarity_df, user_id, num_similar_users, k)
#   precision = precision_at_k(train_df, test_df, user_user_similarity_df, user_id, num_similar_users, k)
#   highly_rated_recalls.append(highly_rated_recall)
#   recalls.append(recall)
#   precisions.append(precision)

# avg_highly_rated_recall = sum(highly_rated_recalls) / len(highly_rated_recalls)
# avg_recall = sum(recalls) / len(recalls)
# avg_precision = sum(precisions) / len(precisions)

# print(f"\nAverage highly rated recall: {avg_highly_rated_recall} for k = {k} looking at num_similar_users = {num_similar_users}")
# print(f"\nAverage recall: {avg_recall} for k = {k} looking at num_similar_users = {num_similar_users}")
# print(f"\nAverage precision: {avg_precision} for k = {k} looking at num_similar_users = {num_similar_users}")


100%|██████████| 9055/9055 [03:00<00:00, 50.09it/s]


Average highly rated recall: 0.0812983549138053 for k = 50 looking at num_similar_users = 100

Average recall: 0.13341388908697333 for k = 50 looking at num_similar_users = 100

Average precision: 0.009033683048039757 for k = 50 looking at num_similar_users = 100



