# Book recomendation collaborative filtering

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

### Reading the data

Dataset avaible [here](https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset)

When reading the data, I chose to exclude any row or column that had any 'null' entry.

In [2]:
# Reading data
books = pd.read_csv('books.csv', encoding='latin-1', delimiter=';').dropna()
ratings = pd.read_csv('ratings.csv', encoding='latin-1', delimiter=';').dropna()

In [3]:
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271376 entries, 0 to 271378
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271376 non-null  object
 1   Book-Title           271376 non-null  object
 2   Book-Author          271376 non-null  object
 3   Year-Of-Publication  271376 non-null  int64 
 4   Publisher            271376 non-null  object
 5   Image-URL-S          271376 non-null  object
 6   Image-URL-M          271376 non-null  object
 7   Image-URL-L          271376 non-null  object
dtypes: int64(1), object(7)
memory usage: 18.6+ MB


In [5]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149790 entries, 0 to 1149789
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149790 non-null  int64 
 1   ISBN         1149790 non-null  object
 2   Book-Rating  1149790 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


### Book codes

It was necessary to create unique codes for books since the same book can have different ISBN codes for different publications.

In [7]:
# Creating a unique code for books with the same title + author even if they have different ISBN
label_encoder = LabelEncoder()
books['Book-Code'] = books['Book-Title'] + ' - ' + books['Book-Author']
books['Book-Code'] = label_encoder.fit_transform(books['Book-Code'])
books['Book-Code'] = books['Book-Code'].astype(int)

In [8]:
# Merging books information inside ratings dataframe, filtering columns and ratings with '0' value
ratings = ratings.merge(books[['ISBN', 'Book-Code']], on='ISBN', how='inner').dropna()
ratings['Book-Code'] = ratings['Book-Code'].astype(int)
ratings = ratings[['User-ID', 'Book-Code', 'Book-Rating']]
ratings = ratings[ratings['Book-Rating'] != 0]

In [9]:
ratings.head()

Unnamed: 0,User-ID,Book-Code,Book-Rating
1,2313,70720,5
3,8680,70720,5
4,10314,70720,9
9,50403,70720,9
13,63970,70720,8


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383859 entries, 1 to 1031181
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   User-ID      383859 non-null  int64
 1   Book-Code    383859 non-null  int32
 2   Book-Rating  383859 non-null  int64
dtypes: int32(1), int64(2)
memory usage: 10.3 MB


### Aggregating data

To make the calculation manageable for computer memory, I filtered only books that have more than 50 ratings.

In [11]:
# Creating an aggregation by book-code and filter books with over than 50 ratings
agg_ratings = ratings.groupby(['Book-Code']).agg(mean = ('Book-Rating', 'mean'), count = ('Book-Rating', 'count')).reset_index()
agg_ratings = agg_ratings[agg_ratings['count'] > 50]

In [12]:
# These are the most rated books
agg_ratings.sort_values(by='count', ascending=False).head()

Unnamed: 0,Book-Code,mean,count
113602,204705,8.18529,707
136239,244844,4.390706,581
105814,191383,8.438525,488
119754,215289,8.477833,406
115356,207703,7.437659,393


In [13]:
# Merging all data
df = pd.merge(ratings, agg_ratings[['Book-Code']], on='Book-Code', how='inner')

In [14]:
df.head()

Unnamed: 0,User-ID,Book-Code,Book-Rating
0,7158,208753,10
1,8253,208753,10
2,11676,208753,10
3,12589,208753,9
4,13279,208753,10


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61445 entries, 0 to 61444
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   User-ID      61445 non-null  int64
 1   Book-Code    61445 non-null  int32
 2   Book-Rating  61445 non-null  int64
dtypes: int32(1), int64(2)
memory usage: 1.6 MB


In [16]:
# Chosing a user ID
user_id = 7158

In [17]:
# Ratings of picked user
user_ratings = df.loc[df['User-ID'] == user_id]
user_ratings = pd.merge(user_ratings, books[['Book-Code', 'Book-Title', 'Book-Author']].drop_duplicates(), on='Book-Code', how='left')
user_ratings

Unnamed: 0,User-ID,Book-Code,Book-Rating,Book-Title,Book-Author
0,7158,208753,10,The Notebook,Nicholas Sparks
1,7158,208753,10,The Notebook,Nicholas Sparks
2,7158,158069,8,Roses Are Red (Alex Cross Novels),James Patterson
3,7158,237393,5,Violets Are Blue,James Patterson
4,7158,57704,1,Dreamcatcher,Stephen King
5,7158,159263,9,SHIPPING NEWS,Annie Proulx
6,7158,7574,9,A Walk to Remember,Nicholas Sparks
7,7158,30786,7,By the Light of the Moon,DEAN KOONTZ
8,7158,243199,8,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts
9,7158,96938,10,Intensity,DEAN KOONTZ


### Calculating

I input the data into a matrix and calculated it using Pearson's correlation. The recommended books are the top x books with the highest average rating among y users who have the highest degree of similarity with the target user, excluding books already read by the target user.

In [18]:
# Creating the base matrix
matrix = df.pivot_table(index='User-ID', columns='Book-Code', values='Book-Rating').astype('float32')

In [19]:
# Normalizing the matrix with the average from each user
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis = 'rows').astype('float32')

In [20]:
# Creating the user similarity matrix using Pearson correlation
user_similarity = matrix_norm.T.corr(method='pearson').astype('float32')

In [21]:
# Drop user data from the matrix
user_similarity.drop(index=user_id, inplace=True)

In [22]:
# The number of results I want to be 10
n = 10

# I want to filter the similarity for at least 0.3 threshold
user_similarity_threshold = 0.3

In [23]:
# Get top n similar users
similar_users = user_similarity[user_similarity[user_id]>user_similarity_threshold][user_id].sort_values(ascending=False)[:n]

# Print out top n similar users
print(f'The similar users for user {user_id} are', similar_users)

The similar users for user 7158 are User-ID
6575      1.0
174072    1.0
197364    1.0
196886    1.0
196466    1.0
192245    1.0
192095    1.0
181176    1.0
170634    1.0
138844    1.0
Name: 7158, dtype: float32


In [24]:
# Books that the target user has watched
picked_userid_readed = matrix_norm[matrix_norm.index == user_id].dropna(axis=1, how='all')

In [25]:
# Books that similar users readed. Remove books that none of the similar users have readed
similar_user_books = matrix_norm[matrix_norm.index.isin(similar_users.index)].dropna(axis=1, how='all')

In [26]:
# Remove the readed book from the movie list
similar_user_books.drop(picked_userid_readed.columns,axis=1, inplace=True, errors='ignore')

In [27]:
# A dictionary to store item scores
item_score = {}

# Loop through items
for i in similar_user_books.columns:
  # Get the ratings for book i
  book_rating = similar_user_books[i]
  # Create a variable to store the score
  total = 0
  # Create a variable to store the number of scores
  count = 0
  # Loop through similar users
  for u in similar_users.index:
    # If the book has rating
    if pd.isna(book_rating[u]) == False:
      # Score is the sum of user similarity score multiply by the book rating
      score = similar_users[u] * book_rating[u]
      # Add the score to the total score for the book so far
      total += score
      # Add 1 to the count
      count +=1
  # Get the average score for the item
  item_score[i] = total / count

# Convert dictionary to pandas dataframe
item_score = pd.DataFrame(item_score.items(), columns=['Book-Code', 'Score'])
    
# Sort the book by score
ranked_item_score = item_score.sort_values(by='Score', ascending=False)

# Merge scores with 'books' dataframe to get additional information
ranked_item_score = pd.merge(ranked_item_score, books[['Book-Code', 'Book-Title', 'Book-Author']].drop_duplicates(), on='Book-Code', how='left')

# Select top m books
m = 10
ranked_item_score.head(m)

Unnamed: 0,Book-Code,Score,Book-Title,Book-Author
0,107712,2.052631,"Lamb : The Gospel According to Biff, Christ's ...",Christopher Moore
1,66536,2.052631,Fahrenheit 451,RAY BRADBURY
2,140003,1.957627,One for the Money (A Stephanie Plum Novel),Janet Evanovich
3,20513,1.957627,Balzac and the Little Chinese Seamstress : A N...,DAI SIJIE
4,77934,1.957627,"Girl, Interrupted",SUSANNA KAYSEN
5,72908,1.957627,Four To Score (A Stephanie Plum Novel),Janet Evanovich
6,41363,1.957627,Coraline,Neil Gaiman
7,229349,1.957627,To the Nines: A Stephanie Plum Novel,Janet Evanovich
8,233321,1.957627,Two for the Dough,Janet Evanovich
9,201669,1.957627,The Joy Luck Club,Amy Tan
