In [24]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and
# copy-related warnings are hidden as well — confirm that is intended.
warnings.filterwarnings('ignore')

# DataFrame display options: show all columns. The row-count and
# column-width overrides are kept here, disabled, for ad-hoc inspection.
# pd.options.display.max_rows = None
pd.options.display.max_columns = None
# pd.options.display.max_colwidth = None

# 1. Data Exploration

In [25]:
# Load the raw user/ISBN/rating triples (semicolon-separated, latin-1 encoded)
origin_rating = pd.read_csv(
    'dataset/Book reviews/Book reviews/BX-Book-Ratings.csv',
    sep=';',
    encoding="latin-1",
)
origin_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [26]:
# Load the pre-joined book/user table (comma-separated, latin-1 encoded)
origin_preprocessed = pd.read_csv(
    'dataset/Books Data with Category Language and Summary/Preprocessed_data.csv',
    sep=',',
    encoding="latin-1",
)
origin_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031175 entries, 0 to 1031174
Data columns (total 19 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   Unnamed: 0           1031175 non-null  int64  
 1   user_id              1031175 non-null  int64  
 2   location             1031175 non-null  object 
 3   age                  1031175 non-null  float64
 4   isbn                 1031175 non-null  object 
 5   rating               1031175 non-null  int64  
 6   book_title           1031175 non-null  object 
 7   book_author          1031175 non-null  object 
 8   year_of_publication  1031175 non-null  float64
 9   publisher            1031175 non-null  object 
 10  img_s                1031175 non-null  object 
 11  img_m                1031175 non-null  object 
 12  img_l                1031175 non-null  object 
 13  Summary              1031175 non-null  object 
 14  Language             1031175 non-null  object 
 15

# 2. Data Preprocessing

In [27]:
# Keep only the book-level columns, align their names with the ratings
# table ('Book-Title', 'ISBN'), and retain the first row seen per ISBN.
origin_preprocessed = (
    origin_preprocessed[['book_title', 'isbn', 'Category']]
    .rename(columns={'book_title': 'Book-Title', 'isbn': 'ISBN'})
    .drop_duplicates(subset=['ISBN'])
)
origin_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270170 entries, 0 to 1031174
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Book-Title  270170 non-null  object
 1   ISBN        270170 non-null  object
 2   Category    270170 non-null  object
dtypes: object(3)
memory usage: 8.2+ MB


In [28]:
# Inner-join the ratings with the per-ISBN book metadata; ratings whose
# ISBN has no metadata row are dropped by the join.
user_book_rating = origin_rating.merge(origin_preprocessed, how='inner', on="ISBN")
user_book_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031175 entries, 0 to 1031174
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1031175 non-null  int64 
 1   ISBN         1031175 non-null  object
 2   Book-Rating  1031175 non-null  int64 
 3   Book-Title   1031175 non-null  object
 4   Category     1031175 non-null  object
dtypes: int64(2), object(3)
memory usage: 47.2+ MB


In [29]:
# Null count per column. This is not the cell's last expression, so the
# result is computed but never displayed.
user_book_rating.isnull().sum()

# Discard every row that contains a missing value
user_book_rating = user_book_rating.dropna()

# Store ratings as floats
user_book_rating = user_book_rating.astype({'Book-Rating': float})

# Mean rating per ISBN, exposed as the 'Average-Rating' column.
# NOTE(review): in the Book-Crossing data a rating of 0 presumably marks
# implicit feedback rather than a score; if so, the zeros pull these means
# down — confirm whether they should be excluded.
avg_ratings = (
    user_book_rating
    .groupby('ISBN', as_index=False)['Book-Rating']
    .mean()
    .rename(columns={'Book-Rating': 'Average-Rating'})
)

In [30]:
# Minimum number of ratings an ISBN needs in order to be kept
ratingThreshold = 50

# Number of ratings recorded for each ISBN
book_ratingCount = (
    user_book_rating
    .groupby('ISBN')['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'TotalRatingCount'})
)

# Attach the count to every rating row, keep the ISBNs that reach the
# threshold, then remove the helper column again.
user_book_rating = (
    pd.merge(user_book_rating, book_ratingCount, on="ISBN")
    .query('TotalRatingCount >= @ratingThreshold')
    .drop(columns='TotalRatingCount')
)
user_book_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234876 entries, 0 to 710818
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   User-ID      234876 non-null  int64  
 1   ISBN         234876 non-null  object 
 2   Book-Rating  234876 non-null  float64
 3   Book-Title   234876 non-null  object 
 4   Category     234876 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 10.8+ MB


In [31]:
# Preprocessing 'Category' column
# Drop the per-user columns and attach the per-ISBN mean rating.
pre_dataframe = (
    user_book_rating
    .drop(columns=['User-ID', 'Book-Rating'])
    .merge(avg_ratings, on='ISBN')
)

# Replace every non-word character with a space, then lower-case the text
pre_dataframe["Category"] = (
    pre_dataframe["Category"]
    .str.replace(pat=r'[^\w]', repl=r' ', regex=True)
    .str.lower()
)

# Remove rows whose category is the stray value '9'
pre_dataframe = pre_dataframe[pre_dataframe['Category'] != '9']

# Keep one row per ISBN and renumber the rows 0..n-1, so that the index
# of a book equals its row position.
pre_dataframe = pre_dataframe.drop_duplicates(['ISBN']).reset_index(drop=True)

# Extract Category data (one entry per book, in row order)
category = pre_dataframe['Category'].to_list()

# Extract ISBN data. The ISBNs are already unique at this point, so they are
# taken in row order; a round-trip through set() would only scramble them
# into an arbitrary order that no longer lines up with `category`.
isbn = pre_dataframe['ISBN'].to_list()

# 3. Data Modeling

In [32]:
# Turn the cleaned category strings into TF-IDF vectors, one row per book.
# English stop words are removed and the min_df=3 document-frequency
# threshold is applied.
vect = TfidfVectorizer(stop_words='english', min_df=3)
transformed_weights = vect.fit_transform(pre_dataframe['Category'])

# Learned term -> column-index mapping
vect.vocabulary_

{'fiction': 15,
 'juvenile': 24,
 'business': 6,
 'economics': 12,
 'humor': 21,
 'history': 20,
 'biography': 4,
 'autobiography': 3,
 'american': 1,
 'family': 14,
 'relationships': 28,
 'fictitious': 16,
 'character': 8,
 'religion': 29,
 'intelligence': 23,
 'california': 7,
 'domestic': 10,
 'body': 5,
 'mind': 26,
 'spirit': 31,
 'self': 30,
 'help': 19,
 'literary': 25,
 'true': 33,
 'crime': 9,
 'african': 0,
 'england': 13,
 'health': 18,
 'fitness': 17,
 'americans': 2,
 'dune': 11,
 'imaginary': 22,
 'place': 27,
 'travel': 32}

In [33]:
# Dense view of the TF-IDF matrix: rows labelled by ISBN, columns by term
attribute_df = pd.DataFrame(
    data=transformed_weights.toarray(),
    index=pre_dataframe['ISBN'].tolist(),
    columns=vect.get_feature_names_out(),
)
attribute_df

Unnamed: 0,african,american,americans,autobiography,biography,body,business,california,character,crime,domestic,dune,economics,england,family,fiction,fictitious,fitness,health,help,history,humor,imaginary,intelligence,juvenile,literary,mind,place,relationships,religion,self,spirit,travel,true
034545104X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0449006522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0553561618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
055356451X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060517794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0449002411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
044023512X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0345450728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0385720114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Pairwise cosine similarity between the TF-IDF rows of all books
similarity = cosine_similarity(transformed_weights, transformed_weights)

# Construct a reverse map from book title to row index.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# unique), so it never removed anything and a title shared by several ISBNs
# made indices_code[title] return a Series instead of one integer.
# De-duplicate on the title index instead, keeping the first row per title.
indices_code = pd.Series(pre_dataframe.index, index=pre_dataframe['Book-Title'])
indices_code = indices_code[~indices_code.index.duplicated(keep='first')]

In [35]:
def get_recommendations(book_title, cosine_sim, data_info, top_n=5, n_candidates=10):
    """Recommend books whose category vector is similar to ``book_title``.

    The ``n_candidates`` rows most similar to the query book (the query
    itself included) are selected, ordered by 'Average-Rating' descending,
    stripped of every row carrying the query title, and the first ``top_n``
    titles are returned.

    Parameters
    ----------
    book_title : str
        Title to look up in ``data_info['Book-Title']``. If several rows
        share the title, the first one is used as the query.
    cosine_sim : 2-D array-like
        Square similarity matrix; row/column ``i`` corresponds to row
        position ``i`` of ``data_info``.
    data_info : pandas.DataFrame
        Must contain the columns 'Book-Title' and 'Average-Rating'.
    top_n : int, default 5
        Maximum number of titles to return.
    n_candidates : int, default 10
        Size of the similarity-ranked pool that is re-ordered by rating.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` book titles, indexed by their ``data_info`` labels.

    Raises
    ------
    KeyError
        If ``book_title`` does not occur in ``data_info``.
    """
    # Resolve the query row from data_info itself (not from a module-level
    # lookup table) so the function works for any frame passed in and always
    # yields a single integer position, even for duplicated titles.
    titles = data_info['Book-Title'].tolist()
    try:
        idx = titles.index(book_title)
    except ValueError:
        raise KeyError(f"book title not found: {book_title!r}") from None

    # Rank all row positions by similarity to the query, highest first.
    # sorted() is stable, so ties keep their original row order.
    scores = cosine_sim[idx]
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    # Candidate pool: the n_candidates most similar rows, best-rated first
    candidates = data_info.iloc[ranked[:n_candidates]].sort_values(
        by='Average-Rating', ascending=False)

    # Never recommend the query book (or another edition with the same title)
    candidates = candidates[candidates['Book-Title'] != book_title]

    return candidates['Book-Title'].iloc[:top_n]

# 4. Evaluation

In [36]:
# Print a heading, then display the titles available for lookup
print('Book list')
pre_dataframe.loc[:, 'Book-Title']

Book list


0           Flesh Tones: A Novel
1            Manhattan Hunt Club
2                  Dark Paradise
3                     Night Sins
4       Little Altars Everywhere
                  ...           
1447                The Presence
1448               City of Light
1449              Distant Shores
1450             The Map of Love
1451         Interest of Justice
Name: Book-Title, Length: 1452, dtype: object

In [37]:
# Example query: recommendations for 'Wild Animus'
get_recommendations(book_title='Wild Animus', cosine_sim=similarity, data_info=pre_dataframe)

10                      Bridget Jones's Diary
4                    Little Altars Everywhere
6     The Girl Who Loved Tom Gordon : A Novel
1                         Manhattan Hunt Club
8                               The Dark Half
Name: Book-Title, dtype: object

In [39]:
# Example query: recommendations for "Walk Two Moons"
get_recommendations(book_title="Walk Two Moons", cosine_sim=similarity, data_info=pre_dataframe)

32     Harry Potter and the Chamber of Secrets (Book 2)
9     Harry Potter and the Order of the Phoenix (Boo...
33    Harry Potter and the Sorcerer's Stone (Harry P...
13                                    A Wrinkle In Time
31                                             Coraline
Name: Book-Title, dtype: object