# Recommenders

## Initialisation

In [1]:
import pandas as pd

In [2]:
# Read the raw reviews table from disk.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index that was saved into the CSV - confirm, and consider index_col=0 if so.
reviews = pd.read_csv(filepath_or_buffer="data_sets/reviews_for_recommenders.csv")

# Preview the first four rows.
reviews.head(n=4)

Unnamed: 0,projected_title,User_id,score
0,0001,A20J0X937MBVEX,5.0
1,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,A3H9FJL67HJA3D,5.0
2,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,AVDU7UUIB1DM9,5.0
3,01443DEVELOPINGSKILLSINALGEBRAONEBOOKC,A125AU4F6Z3569,5.0


In [3]:
# Review counts per projected_title, titles with the most reviews first
titles = (
    reviews
    .groupby(by="projected_title")
    .size()
    .reset_index(name="num_reviews")
    .sort_values(by="num_reviews", ascending=False)
)

# Review counts per User_id, users with the most reviews first
users = (
    reviews
    .groupby(by="User_id")
    .size()
    .reset_index(name="num_reviews")
    .sort_values(by="num_reviews", ascending=False)
)

## Trim the data set

Keep only the titles that have at least a minimum number of reviews, and only the users who have written at least a minimum number of reviews. Both thresholds are set in the next cell.

In [39]:
# Minimum number of reviews a title / a user needs in order to be kept
min_reviews_per_title = 100
min_reviews_per_user = 50

# Titles that reach the per-title threshold
filtered_titles = titles.loc[titles["num_reviews"].ge(min_reviews_per_title)]

# Users that reach the per-user threshold
filtered_users = users.loc[users["num_reviews"].ge(min_reviews_per_user)]

In [40]:
# Keep only the reviews whose title AND whose user both passed the trimming step.
# Each mask is a boolean Series aligned with the rows of `reviews`.
title_is_kept = reviews["projected_title"].isin(filtered_titles["projected_title"])
user_is_kept = reviews["User_id"].isin(filtered_users["User_id"])

# NOTE: the review counts were computed on the full data set, so inside this subset
# a user or title can end up with fewer reviews than its threshold.
filtered_reviews = reviews[title_is_kept & user_is_kept]

In [41]:
# Share of users, titles and reviews that survived the trimming, in percent (2 decimals)
pct_users_kept = round(len(filtered_users) / len(users) * 100, 2)
pct_titles_kept = round(len(filtered_titles) / len(titles) * 100, 2)
pct_reviews_kept = round(len(filtered_reviews) / len(reviews) * 100, 2)

print("Percentage of users kept after trimming:", pct_users_kept, "%")
print("Percentage of titles kept after trimming:", pct_titles_kept, "%")
print("Percentage of reviews kept after trimming:", pct_reviews_kept, "%")

Percentage of users kept after trimming: 0.19 %
Percentage of titles kept after trimming: 1.31 %
Percentage of reviews kept after trimming: 3.79 %


In [47]:
# Trimming should make the user-title matrix much less sparse.
# Density = number of reviews / (number of titles * number of users), in percent.
# NOTE(review): this counts every review row as one distinct (user, title) cell and
# includes every filtered user/title even if none of its reviews remain - confirm.
cells_before = len(titles) * len(users)
cells_after = len(filtered_titles) * len(filtered_users)

density_before = round(len(reviews) / cells_before * 100, 5)
density_after = round(len(filtered_reviews) / cells_after * 100, 5)

print("Old density of the user-title matrix:", density_before, "%")
print("New density of the user-title matrix:", density_after, "%")

Old density of the user-title matrix: 0.00098 %
New density of the user-title matrix: 1.49553 %
