# Recommenders

## Initialisation

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from itertools import product

In [2]:
reviews = pd.read_csv("data_sets/reviews_for_recommenders.csv")
reviews.head(4)

Unnamed: 0,projected_title,User_id,score
0,0001,A20J0X937MBVEX,5.0
1,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,A3H9FJL67HJA3D,5.0
2,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,AVDU7UUIB1DM9,5.0
3,01443DEVELOPINGSKILLSINALGEBRAONEBOOKC,A125AU4F6Z3569,5.0


In [3]:
# DataFrame with number of reviews per projected_title
titles = reviews.groupby('projected_title').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

# DataFrame with number of reviews per User_id
users = reviews.groupby('User_id').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

#### Secret code

In [4]:
def get_var_name(var):
    """
    Get the variable name of a given object.
    """
    for name, value in globals().items():
        if value is var:
            return name
    return None

x = 10
var_name = get_var_name(x)
print(var_name, type(var_name))

x <class 'str'>


## Trim the data set

I only want books with a minimum number of reviews and users with a minimum number of reviews

In [5]:
min_reviews_per_title = 100
initial_filtered_titles = titles[titles['num_reviews'] >= min_reviews_per_title]

min_reviews_per_user = 50
initial_filtered_users = users[users['num_reviews'] >= min_reviews_per_user]

In [6]:
# now select the reviews that only apply to these users and titles
filtered_reviews = filtered_reviews = reviews[reviews['projected_title'].isin(initial_filtered_titles["projected_title"]) & reviews['User_id'].isin(initial_filtered_users["User_id"])]

In [7]:
# DataFrame with number of reviews per projected_title
final_filtered_titles = filtered_reviews.groupby('projected_title').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

# DataFrame with number of reviews per User_id
final_filtered_users = filtered_reviews.groupby('User_id').size().reset_index(name='num_reviews').sort_values("num_reviews", ascending=False)

The final number of titles and users is less than the initial filter. 
For example, some of the "good titles" (i.e. those that have more than 50 reviews in the initial filtering) have only reviews from "bad users" (i.e. those that gave less than 50 reviews in the initial set). The reverse is also true of course. In others words, the titles are also filtered when user are filtered.

In [8]:
print(len(final_filtered_titles), len(initial_filtered_titles))
print(len(final_filtered_users), len(initial_filtered_users))

2582 2638
1888 1916


In [9]:
print("Percentage of users kept after trimming:", round(len(final_filtered_users) / len(users) * 100, 2), "%")
print("Percentage of titles kept after trimming:", round(len(final_filtered_titles) / len(titles) * 100, 2), "%")
print("Percentage of reviews kept after trimming:", round(len(filtered_reviews) / len(reviews) * 100, 2), "%")

Percentage of users kept after trimming: 0.19 %
Percentage of titles kept after trimming: 1.28 %
Percentage of reviews kept after trimming: 3.79 %


In [10]:
# PRE FILTERING
print("average reviews per title:", round(len(reviews) / len(titles), 2))
print("average reviews per user:", round(len(reviews) / len(users), 2))

average reviews per title: 9.91
average reviews per user: 1.97


In [11]:
# POST FILTERING
print("average reviews per title:", round(len(filtered_reviews) / len(final_filtered_titles), 2))
print("average reviews per user:", round(len(filtered_reviews) / len(final_filtered_users), 2))

average reviews per title: 29.28
average reviews per user: 40.04


In [12]:
print("Old size of the matrix:", (len(titles) * len(users)))
print("New size of the matrix:", (len(final_filtered_titles) * len(final_filtered_users)))

Old size of the matrix: 202899030217
New size of the matrix: 4874816


In [13]:
# The new user-title matrix should be a lot less sparse now though:
print("Old density of the user-title matrix:", round(len(reviews) / (len(titles) * len(users)) * 100, 5), "%")
print("New density of the user-title matrix:", round(len(filtered_reviews) / (len(final_filtered_titles) * len(final_filtered_users)) * 100, 5), "%")

Old density of the user-title matrix: 0.00098 %
New density of the user-title matrix: 1.55062 %


## Get a pivot table

In [14]:
mat = filtered_reviews.pivot_table(values="score", index="projected_title", columns="User_id")
mat.head(5)

User_id,A106016KSI0YQ,A106E1N0ZQ4D9W,A10EH95EUDO9W3,A10HWJXUG67E7I,A10K3DLOEVMKW3,A10LWBOIZCF2QT,A10Q8NIFOVOHFV,A10T0OW97SFBB,A10VOEBL5S337W,A10Y3OZWENAQ6W,...,AZ7QNRPFJ2IW1,AZ85B5Q1UEH5U,AZ8N5BS6476S8,AZF69UA25SAWH,AZFX7FLEHCWXM,AZI0O32W4ZYGH,AZJHIRNFI04WK,AZM09BO1FD418,AZSNQ9FYC5SQO,AZSV99SDJC242
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000PLACESTOSEEBEFOREYOUDIEATRAVELERSLIFELIST,,,,,,,,,,,...,,,,,,,,,,
1001WAYSTOMARKETYOURBOOKSFORAUTHORSANDPUBLISHERSBOOKMARKETINGSERIES,,,,,,,,,,,...,,,,,,,,,,
10SECRETSFORSUCCESSANDINNERPEACE,,,,,,,,,,,...,,,,,,,,,,
13LITTLEBLUEENVELOPES,,,,,,,,,,,...,,,,,,,,,,
1491NEWREVELATIONSOFTHEAMERICASBEFORECOLUMBUS,,,,,,,,,,,...,,,,,,,,,,


### Generally users have few title they both rathed

In [15]:
user1 = mat["A106016KSI0YQ"]
user2 = mat["A106E1N0ZQ4D9W"]
user3 = mat["A10EH95EUDO9W3"]
user4 = mat["A10HWJXUG67E7I"]
user5 = mat["A10K3DLOEVMKW3"]
user6 = mat["A10Q8NIFOVOHFV"]
user7 = mat["A10T0OW97SFBB"]
user8 = mat["A10LWBOIZCF2QT"]

users = [user1, user2, user3, user4, user5, user6, user7, user8]

In [16]:
[user.notna().sum() for user in users]

[31, 26, 17, 16, 7, 8, 92, 36]

In [17]:
def intersect(user_a, user_b):
    bools = (user_a.notna() & user_b.notna())
    return bools.sum()

m = len(users)
intersects = np.zeros((m,m))
for i in range(m):
    for j in range(m):
        intersects[i,j] = intersect(users[i], users[j])
intersects

array([[31.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0., 26.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0., 17.,  0.,  0.,  0.,  6.,  0.],
       [ 0.,  0.,  0., 16.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  7.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  8.,  0.,  1.],
       [ 0.,  0.,  6.,  0.,  0.,  0., 92.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  1.,  0., 36.]])

## Collaborative filtering

### Make a similarity matrix filling missing values with 0

In [18]:
# Fill NaN values with 0
mat_filled_0 = mat.fillna(0)

# Compute the cosine similarity matrix
similarity_matrix_0 = cosine_similarity(mat_filled_0)

# Create a DataFrame from the similarity matrix
similarity_df_0 = pd.DataFrame(similarity_matrix_0, index=mat_filled_0.index, columns=mat_filled_0.index)

### make a similarity matrix filling missing values with column averages

In [19]:
# Fill NaN values with 0
mat_filled_mean = mat.fillna(mat.mean())

# Compute the cosine similarity matrix
similarity_matrix_mean = cosine_similarity(mat_filled_mean)

# Create a DataFrame from the similarity matrix
similarity_df_mean = pd.DataFrame(similarity_matrix_mean, index=mat_filled_mean.index, columns=mat_filled_mean.index)

In [20]:
df_normalized = mat.sub(mat.mean(), axis=1)

# Fill remaining missing values with zeros
df_normalized.fillna(0, inplace=True)

# Compute the cosine similarity matrix
similarity_matrix_normalised_mean = cosine_similarity(df_normalized)

# Create a DataFrame from the similarity matrix
similarity_df_normalised_mean = pd.DataFrame(similarity_matrix_normalised_mean, index=df_normalized.index, columns=df_normalized.index)

### Make a similarity matrix based on correlation coefficients

In [21]:
cor_mat_initial = mat.T.corr()

print("Percentage of missing values:", cor_mat_initial.isnull().sum().sum() / len(cor_mat_initial) ** 2 * 100, "%") 
print("Number of book that do not have a correlation coefficient with any other book:", (cor_mat_initial.notnull().sum() == 0).sum())

Percentage of missing values: 90.47344692835641 %
Number of book that do not have a correlation coefficient with any other book: 165


There are lots of missing values: for a pair of books that do not have more than one user that has rated both books, a bona-fide correlation coefficient cannot be calculated. I have chosen to fill in these correlations as zero's

In [22]:
cor_mat = cor_mat_initial.fillna(0) 

### Searching for most similar

In [23]:
# Function to get similar items
def get_similar_items(title, similarity_df, n=5):
    if title not in similarity_df.index:
        return f"{title} not found in the dataset."
    similar_items = similarity_df[title].sort_values(ascending=False).head(n+1)
    return similar_items.iloc[1:]  # Exclude the first item (self-similarity)

### Compare item from different matrices

In [24]:
def compare(title, similarity_matrices, n=5):
    """
    similarity_matrices is a list of similarity matrices
    """
    results = []
    for similarity_matrix in similarity_matrices:
        result = get_similar_items(title, similarity_matrix, n)
        result.name = get_var_name(similarity_matrix)
        results.append(result)
    return pd.concat(results, axis=1)

my_sim_mats = [similarity_df_0, similarity_df_mean, similarity_df_normalised_mean, cor_mat]

### Examples

In [36]:
compare("THEGREATGATSBY", my_sim_mats, n=5)

Unnamed: 0_level_0,similarity_df_0,similarity_df_mean,similarity_df_normalised_mean,cor_mat
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
THEGREATGATSBYLEADINGENGLISHLITERATURELIBRARY,1.0,1.0,1.0,
GREATGATSBYEVERYMAN,0.996863,1.0,0.999966,
MANHATTANSTORIESFROMTHEHEARTOFAGREATCITY,0.993676,0.999991,0.997115,
THECATCHERINTHERYEAUDIOBOOKCDUNABRIDGEDAUDIOCD,0.33952,,,
THECATCHERINTHERYE,0.338265,,,
THECALLINGOFEMILYEVANS,,0.998468,0.18307,
THECALLINGOFEMILYEVANSWOMENOFTHEWEST1JANETTEOKECLASSICSFORGIRLS,,0.998468,0.18307,
THEABOLITIONOFMANORREFLECTIONSONEDUCATIONWITHSPECIALREFERENCETOTHETEACHINGOFENGLISHINTHEUPPERFORMSOFSCHOOLSUNIVERSITYOFDURHAMRIDDELLMEMORIALLECTURES,,,,1.0
ALLCREATURESGREATANDSMALL,,,,1.0
ONTHEBRIGHTSIDEIMNOWTHEGIRLFRIENDOFASEXGOD,,,,1.0


In [26]:
compare("HARRYPOTTERANDTHESORCERERSSTONE", my_sim_mats)

Unnamed: 0_level_0,similarity_df_0,similarity_df_mean,similarity_df_normalised_mean,cor_mat
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HARRYPOTTERANDTHECHAMBEROFSECRETS,0.689704,0.998792,0.485875,
HARRYPOTTERTHEPRISONEROFAZKABAN,0.670006,0.998719,0.449304,
THEHOBBITTHEREANDBACKAGAIN,0.259699,,,
THEHOBBIT,0.259679,,,
THEHOBBITORTHEREANDBACKAGAIN,0.259635,,,
ROSEDAUGHTER,,0.998622,0.199597,
AWOMANAFTERGODSOWNHEARTWALKERLARGEPRINTBOOKS,,0.998601,,
AMATTEROFHONORGKHALLLARGEPRINTBOOKSERIES,,0.998594,,
SHATTEREDMIRRORDENOFSHADOWS,,,0.186477,
THELAUGHINGCORPSESIGNED,,,0.184403,


In [27]:
compare("THEHOBBIT", my_sim_mats)

Unnamed: 0_level_0,similarity_df_0,similarity_df_mean,similarity_df_normalised_mean,cor_mat
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
THEHOBBITTHEREANDBACKAGAIN,1.0,1.0,0.999995,
THEHOBBITORTHEREANDBACKAGAIN,0.999986,0.999999,0.999367,
THEHOBBITTORTHEREANDBACKAGAINILLUSTRATEDBYTHEAUTHOR,0.990913,0.999987,0.99356,
THELORDOFTHERINGSBOXSET,0.377798,,,
THELORDOFTHERINGSBOXEDSET,0.372706,,,
SIBLINGSWITHOUTRIVALRYHOWTOHELPYOURCHILDRENLIVETOGETHERSOYOUCANLIVETOO,,0.999083,0.260431,
FREAKTHEMIGHTY,,0.999039,,
THEFELLOWSHIPOFTHERING,,,0.215323,
THESURVIVORSCLUB,,,,1.0
FALSEIMPRESSION,,,,1.0


In [28]:
filtered_reviews[filtered_reviews["projected_title"] == "THEFELLOWSHIPOFTHERING"]

Unnamed: 0,projected_title,User_id,score
1470439,THEFELLOWSHIPOFTHERING,A11B61QBGHLQDN,5.0
1470443,THEFELLOWSHIPOFTHERING,A11PTCZ2FM2547,5.0
1470446,THEFELLOWSHIPOFTHERING,A1288XI5LO0J76,4.0
1470455,THEFELLOWSHIPOFTHERING,A13G1TKIKHGV3F,5.0
1470469,THEFELLOWSHIPOFTHERING,A172VRBP4BZXK6,4.0
...,...,...,...
1471072,THEFELLOWSHIPOFTHERING,AWLFVCT9128JV,5.0
1471078,THEFELLOWSHIPOFTHERING,AXSH6O4OP5L4M,5.0
1471079,THEFELLOWSHIPOFTHERING,AXVXIWJ6IU6KA,5.0
1471086,THEFELLOWSHIPOFTHERING,AYT4FJYVCHYLE,5.0


In [29]:
compare("THESANTASHOP", my_sim_mats)

Unnamed: 0_level_0,similarity_df_0,similarity_df_mean,similarity_df_normalised_mean,cor_mat
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
THEOUTLANDISHCOMPANION,0.457496,,0.586538,
ACOLDDAYFORMURDERKATESHUGAKMYSTERY,0.424094,,,
RIPTIDEFBITHRILLER,0.410391,,,
THECOVEGKHALLLARGEPRINTBOOKSERIES,0.386695,,,
THECOVEFBITHRILLER,0.386695,,,
SCRAWNYTOBRAWNYTHECOMPLETEGUIDETOBUILDINGMUSCLETHENATURALWAY,,1.0,,
THEADVENTURESOFSUPERDIAPERBABY,,1.0,,
MIDNIGHTWARRIORSTHENEWPROPHECYBOOK1,,1.0,,
THESECRETHISTORYOFTHEWORLDANDHOWTOGETOUTALIVE,,1.0,,
THEWOMANSBIBLE,,1.0,,


In [30]:
compare("ANIMALFARM", my_sim_mats)

Unnamed: 0_level_0,similarity_df_0,similarity_df_mean,similarity_df_normalised_mean,cor_mat
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANIMALFARMSIGNETCLASSICSCT304,1.0,,1.0,
ANIMALFARMAFAIRYSTORYTIMEREADINGPROGRAMSPECIALEDITION,1.0,1.0,1.0,
ANIMALFARM50THANNIVERSARYEDITION,0.995825,0.999999,0.999649,
GEORGEORWELL1984,0.351126,,,
1984,0.348597,,,
ANIMALFARM,,1.0,,
NAKEDECONOMICSUNDRESSINGTHEDISMALSCIENCE,,0.999153,0.226638,
THEARTOFDECEPTIONCONTROLLINGTHEHUMANELEMENTOFSECURITY,,0.999128,,
ECONOMICSINONELESSON,,,0.204798,
WHATTOEXPECTWHENYOUREEXPECTING3RDEDITION,,,,1.0


In [31]:
compare("CATCHTWENTYTWO", my_sim_mats)

Unnamed: 0_level_0,similarity_df_0,similarity_df_mean,similarity_df_normalised_mean,cor_mat
projected_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CATCH22CATCH22,1.0,1.0,1.0,
CATCHTWENTYTWO,1.0,,1.0,
SLAUGHTERHOUSEFIVE,0.309208,,,
SLAUGHTERHOUSEFIVEFIRSTPAPERBACKEDITION,0.309208,,,
SLAUGHTERHOUSEFIVEORTHECHILDRENSCRUSADE,0.309208,,,
CATCH22,,1.0,,
THEPROPERCAREANDFEEDINGOFMARRIAGE,,0.999099,0.139659,
THEINNOVATORSDILEMMATHEREVOLUTIONARYNATIONALBESTSELLERTHATCHANGEDTHEWAYWEDOBUSINESS,,0.99909,,
THESEARCHHOWGOOGLEANDITSRIVALSREWROTETHERULESOFBUSINESSANDTRANSFORMEDOURCULTURE,,0.99909,,
CATSCRADLE,,,0.150774,


# Understanding `cosine_similarity`

In [32]:
temp = {
    'User 1': [5, 3, None],
    'User 2': [4, None, 2],
    'User 3': [1, 1, 1],
    'User 4': [None, 2, 4],
    'User 5': [3, 5, None]
}

temp = pd.DataFrame(temp, index=['Book A', 'Book B', 'Book C'])
temp

Unnamed: 0,User 1,User 2,User 3,User 4,User 5
Book A,5.0,4.0,1,,3.0
Book B,3.0,,1,2.0,5.0
Book C,,2.0,1,4.0,


In [33]:
temp.index

Index(['Book A', 'Book B', 'Book C'], dtype='object')

In [34]:
# Fill NaN values with 0 (if any) and transpose the DataFrame
temp_filled = temp.fillna(0)

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(temp_filled)

# Create a DataFrame from the similarity matrix
similarity_df = pd.DataFrame(similarity_matrix, index=ratings.index, columns=ratings.index)
print(similarity_df)

NameError: name 'ratings' is not defined

In [None]:
def my_cosine_sim(row_1, row_2):
    inner_product = row_1.dot(row_2)
    norm_1 = np.linalg.norm(row_1)
    norm_2 = np.linalg.norm(row_2)
    cosine = inner_product / (norm_1 * norm_2)
    return cosine

In [None]:
my_cosine_sim(temp_filled.loc["Book A"], temp_filled.loc["Book B"], )

In [None]:
def my_cosine_sim_mat(df):
    num_rows = len(df)

    # Create an empty DataFrame with the desired indices and columns
    indices = df.index
    square_df = pd.DataFrame(index=indices, columns=indices)
    
    # Loop through the rows
    for index_i, row_i in df.iterrows():
        for index_j, row_j in df.iterrows():
            square_df.loc[index_i, index_j] = my_cosine_sim(row_i, row_j)

    return square_df
            
my_sim_mat = my_cosine_sim_mat(temp_filled)

In [None]:
is_zero = my_sim_mat.values - similarity_df.values
np.product(np.round(is_zero.astype(float), 5) == 0) ### IT'S THE SAME. ME SMART.

## Experimenting with different fillings for missing values

In [None]:
# Scenario 1: Filling missing values with zeros
dummy_reviews = {
    'User 1': [5, 3, None],
    'User 2': [4, None, 2],
    'User 3': [1, 1, 1],
    'User 4': [None, 2, 4]
}

dummy_reviews = pd.DataFrame(dummy_reviews, index=['Book 1', 'Book 2', 'Book 3'])

In [None]:
df_filled_scenario1 = dummy_reviews.fillna(0)
similarity_matrix_scenario1 = cosine_similarity(df_filled_scenario1)

df_filled_scenario2 = dummy_reviews.fillna(dummy_reviews.mean())
similarity_matrix_scenario2 = cosine_similarity(df_filled_scenario2)

# Displaying the similarity matrices
print("Similarity Matrix - Scenario 1:")
print(pd.DataFrame(similarity_matrix_scenario1, index=df_filled_scenario1.index, columns=df_filled_scenario1.index))
print("\nSimilarity Matrix - Scenario 2:")
print(pd.DataFrame(similarity_matrix_scenario2, index=df_filled_scenario2.index, columns=df_filled_scenario2.index))


In [None]:
df_filled_scenario1

In [None]:
df_filled_scenario2

# Correlation matrix with missing values

In [None]:
# Example DataFrame with ratings for books by users
data = {
    'User 1': [5,    None, None, 4],
    'User 2': [4,    None, 2,    5],
    'User 3': [None, 1,    1,    2],
    'User 4': [0,    2,    4,    3],
    'User 5': [3,    5,    None, 1]
}

df = pd.DataFrame(data, index=['Book 1', 'Book 2', 'Book 3', 'Book 4'])

# Calculate correlation coefficients
corr_matrix = df.T.corr()
corr_matrix

In [None]:
# Get similarity to a specific book
specific_book = 'Book 1'
similar_books = corr_matrix[specific_book].sort_values(ascending=False)

# Exclude the book itself from the list of similar books
similar_books = similar_books.drop(specific_book)

# Print the similar books
print("Books similar to", specific_book, ":")
print(similar_books)