# Recommenders

## Initialisation

In [58]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from itertools import product

In [2]:
reviews = pd.read_csv("data_sets/reviews_for_recommenders.csv")
reviews.head(4)

Unnamed: 0,projected_title,User_id,score
0,0001,A20J0X937MBVEX,5.0
1,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,A3H9FJL67HJA3D,5.0
2,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,AVDU7UUIB1DM9,5.0
3,01443DEVELOPINGSKILLSINALGEBRAONEBOOKC,A125AU4F6Z3569,5.0


In [3]:
# DataFrame with number of reviews per projected_title
titles = reviews.groupby('projected_title').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

# DataFrame with number of reviews per User_id
users = reviews.groupby('User_id').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

## Trim the data set

I only want books with a minimum number of reviews and users with a minimum number of reviews

In [4]:
min_reviews_per_title = 100
initial_filtered_titles = titles[titles['num_reviews'] >= min_reviews_per_title]

min_reviews_per_user = 50
initial_filtered_users = users[users['num_reviews'] >= min_reviews_per_user]

In [5]:
# now select the reviews that only apply to these users and titles
filtered_reviews = filtered_reviews = reviews[reviews['projected_title'].isin(initial_filtered_titles["projected_title"]) & reviews['User_id'].isin(initial_filtered_users["User_id"])]

In [6]:
# DataFrame with number of reviews per projected_title
final_filtered_titles = filtered_reviews.groupby('projected_title').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

# DataFrame with number of reviews per User_id
final_filtered_users = filtered_reviews.groupby('User_id').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

The final number of titles and users is less than the initial filter. 
For example, some of the "good titles" (i.e. those that have more than 50 reviews in the initial filtering) have only reviews from "bad users" (i.e. those that gave less than 50 reviews in the initial set). The reverse is also true of course. In others words, the titles are also filtered when user are filtered.

In [7]:
print(len(final_filtered_titles), len(initial_filtered_titles))
print(len(final_filtered_users), len(initial_filtered_users))

2582 2638
1888 1916


In [8]:
print("Percentage of users kept after trimming:", round(len(final_filtered_users) / len(users) * 100, 2), "%")
print("Percentage of titles kept after trimming:", round(len(final_filtered_titles) / len(titles) * 100, 2), "%")
print("Percentage of reviews kept after trimming:", round(len(filtered_reviews) / len(reviews) * 100, 2), "%")

Percentage of users kept after trimming: 0.19 %
Percentage of titles kept after trimming: 1.28 %
Percentage of reviews kept after trimming: 3.79 %


In [9]:
# PRE FILTERING
print("average reviews per title:", round(len(reviews) / len(titles), 2))
print("average reviews per user:", round(len(reviews) / len(users), 2))

average reviews per title: 9.91
average reviews per user: 1.97


In [10]:
# POST FILTERING
print("average reviews per title:", round(len(filtered_reviews) / len(final_filtered_titles), 2))
print("average reviews per user:", round(len(filtered_reviews) / len(final_filtered_users), 2))

average reviews per title: 29.28
average reviews per user: 40.04


In [11]:
print("Old size of the matrix:", (len(titles) * len(users)))
print("New size of the matrix:", (len(final_filtered_titles) * len(final_filtered_users)))

Old size of the matrix: 202899030217
New size of the matrix: 4874816


In [12]:
# The new user-title matrix should be a lot less sparse now though:
print("Old density of the user-title matrix:", round(len(reviews) / (len(titles) * len(users)) * 100, 5), "%")
print("New density of the user-title matrix:", round(len(filtered_reviews) / (len(final_filtered_titles) * len(final_filtered_users)) * 100, 5), "%")

Old density of the user-title matrix: 0.00098 %
New density of the user-title matrix: 1.55062 %


## Get a pivot table

In [13]:
mat = filtered_reviews.pivot_table(values="score", index="projected_title", columns="User_id")
mat.head(5)

User_id,A106016KSI0YQ,A106E1N0ZQ4D9W,A10EH95EUDO9W3,A10HWJXUG67E7I,A10K3DLOEVMKW3,A10LWBOIZCF2QT,A10Q8NIFOVOHFV,A10T0OW97SFBB,A10VOEBL5S337W,A10Y3OZWENAQ6W,...,AZ7QNRPFJ2IW1,AZ85B5Q1UEH5U,AZ8N5BS6476S8,AZF69UA25SAWH,AZFX7FLEHCWXM,AZI0O32W4ZYGH,AZJHIRNFI04WK,AZM09BO1FD418,AZSNQ9FYC5SQO,AZSV99SDJC242
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000PLACESTOSEEBEFOREYOUDIEATRAVELERSLIFELIST,,,,,,,,,,,...,,,,,,,,,,
1001WAYSTOMARKETYOURBOOKSFORAUTHORSANDPUBLISHERSBOOKMARKETINGSERIES,,,,,,,,,,,...,,,,,,,,,,
10SECRETSFORSUCCESSANDINNERPEACE,,,,,,,,,,,...,,,,,,,,,,
13LITTLEBLUEENVELOPES,,,,,,,,,,,...,,,,,,,,,,
1491NEWREVELATIONSOFTHEAMERICASBEFORECOLUMBUS,,,,,,,,,,,...,,,,,,,,,,


### Generally users have few title they both rathed

In [14]:
user1 = mat["A106016KSI0YQ"]
user2 = mat["A106E1N0ZQ4D9W"]
user3 = mat["A10EH95EUDO9W3"]
user4 = mat["A10HWJXUG67E7I"]
user5 = mat["A10K3DLOEVMKW3"]
user6 = mat["A10Q8NIFOVOHFV"]
user7 = mat["A10T0OW97SFBB"]
user8 = mat["A10LWBOIZCF2QT"]

users = [user1, user2, user3, user4, user5, user6, user7, user8]

In [15]:
[user.notna().sum() for user in users]

[31, 26, 17, 16, 7, 8, 92, 36]

In [16]:
def intersect(user_a, user_b):
    bools = (user_a.notna() & user_b.notna())
    return bools.sum()

m = len(users)
intersects = np.zeros((m,m))
for i in range(m):
    for j in range(m):
        intersects[i,j] = intersect(users[i], users[j])
intersects

array([[31.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0., 26.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0., 17.,  0.,  0.,  0.,  6.,  0.],
       [ 0.,  0.,  0., 16.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  7.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  8.,  0.,  1.],
       [ 0.,  0.,  6.,  0.,  0.,  0., 92.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  1.,  0., 36.]])

## Collaborative filtering

In [17]:
# Fill NaN values with 0
mat_filled = mat.fillna(0)

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(mat_filled)

# Create a DataFrame from the similarity matrix
similarity_df = pd.DataFrame(similarity_matrix, index=mat_filled.index, columns=mat_filled.index)

In [18]:
# Function to get similar items
def get_similar_items(title, similarity_df, n=5):
    if title not in similarity_df.index:
        return f"{title} not found in the dataset."
    similar_items = similarity_df[title].sort_values(ascending=False).head(n+1)
    return similar_items.iloc[1:]  # Exclude the first item (self-similarity)

### Examples

In [19]:
get_similar_items("THEGREATGATSBY", similarity_df)

projected_title
THEGREATGATSBYLEADINGENGLISHLITERATURELIBRARY     1.000000
GREATGATSBYEVERYMAN                               0.996863
MANHATTANSTORIESFROMTHEHEARTOFAGREATCITY          0.993676
THECATCHERINTHERYEAUDIOBOOKCDUNABRIDGEDAUDIOCD    0.339520
THECATCHERINTHERYE                                0.338265
Name: THEGREATGATSBY, dtype: float64

In [20]:
get_similar_items("HARRYPOTTERANDTHESORCERERSSTONE", similarity_df)

projected_title
HARRYPOTTERANDTHECHAMBEROFSECRETS    0.689704
HARRYPOTTERTHEPRISONEROFAZKABAN      0.670006
THEHOBBITTHEREANDBACKAGAIN           0.259699
THEHOBBIT                            0.259679
THEHOBBITORTHEREANDBACKAGAIN         0.259635
Name: HARRYPOTTERANDTHESORCERERSSTONE, dtype: float64

In [21]:
get_similar_items("THEHOBBIT", similarity_df)

projected_title
THEHOBBITTHEREANDBACKAGAIN                             1.000000
THEHOBBITORTHEREANDBACKAGAIN                           0.999986
THEHOBBITTORTHEREANDBACKAGAINILLUSTRATEDBYTHEAUTHOR    0.990913
THELORDOFTHERINGSBOXSET                                0.377798
THELORDOFTHERINGSBOXEDSET                              0.372706
Name: THEHOBBIT, dtype: float64

In [22]:
get_similar_items("THESANTASHOP", similarity_df)

projected_title
THEOUTLANDISHCOMPANION                0.457496
ACOLDDAYFORMURDERKATESHUGAKMYSTERY    0.424094
RIPTIDEFBITHRILLER                    0.410391
THECOVEGKHALLLARGEPRINTBOOKSERIES     0.386695
THECOVEFBITHRILLER                    0.386695
Name: THESANTASHOP, dtype: float64

In [23]:
get_similar_items("ANIMALFARM", similarity_df)

projected_title
ANIMALFARMSIGNETCLASSICSCT304                            1.000000
ANIMALFARMAFAIRYSTORYTIMEREADINGPROGRAMSPECIALEDITION    1.000000
ANIMALFARM50THANNIVERSARYEDITION                         0.995825
GEORGEORWELL1984                                         0.351126
1984                                                     0.348597
Name: ANIMALFARM, dtype: float64

In [24]:
get_similar_items("CATCHTWENTYTWO", similarity_df)

projected_title
CATCH22CATCH22                             1.000000
CATCHTWENTYTWO                             1.000000
SLAUGHTERHOUSEFIVE                         0.309208
SLAUGHTERHOUSEFIVEFIRSTPAPERBACKEDITION    0.309208
SLAUGHTERHOUSEFIVEORTHECHILDRENSCRUSADE    0.309208
Name: CATCHTWENTYTWO, dtype: float64

# Understanding `cosine_similarity`

In [60]:
temp = {
    'User 1': [5, 3, None],
    'User 2': [4, None, 2],
    'User 3': [1, 1, 1],
    'User 4': [None, 2, 4],
    'User 5': [3, 5, None]
}

temp = pd.DataFrame(temp, index=['Book A', 'Book B', 'Book C'])
temp

Unnamed: 0,User 1,User 2,User 3,User 4,User 5
Book A,5.0,4.0,1,,3.0
Book B,3.0,,1,2.0,5.0
Book C,,2.0,1,4.0,


In [63]:
temp.index

Index(['Book A', 'Book B', 'Book C'], dtype='object')

In [75]:
# Fill NaN values with 0 (if any) and transpose the DataFrame
temp_filled = temp.fillna(0)

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(temp_filled)

# Create a DataFrame from the similarity matrix
similarity_df = pd.DataFrame(similarity_matrix, index=ratings.index, columns=ratings.index)
print(similarity_df)

          Book A    Book B    Book C
Book A  1.000000  0.695095  0.275010
Book B  0.695095  1.000000  0.314485
Book C  0.275010  0.314485  1.000000


In [76]:
def my_cosine_sim(row_1, row_2):
    inner_product = row_1.dot(row_2)
    norm_1 = np.linalg.norm(row_1)
    norm_2 = np.linalg.norm(row_2)
    cosine = inner_product / (norm_1 * norm_2)
    return cosine

In [77]:
my_cosine_sim(temp_filled.loc["Book A"], temp_filled.loc["Book B"], )

0.6950952204628561

In [101]:
def my_cosine_sim_mat(df):
    num_rows = len(df)

    # Create an empty DataFrame with the desired indices and columns
    indices = df.index
    square_df = pd.DataFrame(index=indices, columns=indices)
    
    # Loop through the rows
    for index_i, row_i in df.iterrows():
        for index_j, row_j in df.iterrows():
            square_df.loc[index_i, index_j] = my_cosine_sim(row_i, row_j)

    return square_df
            
my_sim_mat = my_cosine_sim_mat(temp_filled)

[[0.9999999999999999 0.6950952204628561 0.27500954910846337]
 [0.6950952204628561 1.0 0.3144854510165755]
 [0.27500954910846337 0.3144854510165755 1.0]]


In [105]:
is_zero = my_sim_mat.values - similarity_df.values
np.product(np.round(is_zero.astype(float), 5) == 0) ### IT'S THE SAME. ME SMART.

1

## Experimenting with different fillings for missing values

In [107]:
# Scenario 1: Filling missing values with zeros
dummy_reviews = {
    'User 1': [5, 3, None],
    'User 2': [4, None, 2],
    'User 3': [1, 1, 1],
    'User 4': [None, 2, 4]
}

dummy_reviews = pd.DataFrame(dummy_reviews, index=['Book 1', 'Book 2', 'Book 3'])

In [109]:
df_filled_scenario1 = dummy_reviews.fillna(0)
similarity_matrix_scenario1 = cosine_similarity(df_filled_scenario1)

df_filled_scenario2 = dummy_reviews.fillna(dummy_reviews.mean())
similarity_matrix_scenario2 = cosine_similarity(df_filled_scenario2)

# Displaying the similarity matrices
print("Similarity Matrix - Scenario 1:")
print(pd.DataFrame(similarity_matrix_scenario1, index=df_filled_scenario1.index, columns=df_filled_scenario1.index))
print("\nSimilarity Matrix - Scenario 2:")
print(pd.DataFrame(similarity_matrix_scenario2, index=df_filled_scenario2.index, columns=df_filled_scenario2.index))


Similarity Matrix - Scenario 1:
          Book 1    Book 2    Book 3
Book 1  1.000000  0.659829  0.303046
Book 2  0.659829  1.000000  0.524891
Book 3  0.303046  0.524891  1.000000

Similarity Matrix - Scenario 2:
          Book 1    Book 2    Book 3
Book 1  1.000000  0.992727  0.943839
Book 2  0.992727  1.000000  0.925548
Book 3  0.943839  0.925548  1.000000


In [111]:
df_filled_scenario1

Unnamed: 0,User 1,User 2,User 3,User 4
Book 1,5.0,4.0,1,0.0
Book 2,3.0,0.0,1,2.0
Book 3,0.0,2.0,1,4.0


In [112]:
df_filled_scenario2

Unnamed: 0,User 1,User 2,User 3,User 4
Book 1,5.0,4.0,1,3.0
Book 2,3.0,3.0,1,2.0
Book 3,4.0,2.0,1,4.0
