In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
books = pd.read_csv('books.csv', low_memory=False)
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [3]:
ratings = pd.read_csv('ratings.csv', low_memory=False)
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [4]:
tags = pd.read_csv('tags.csv', low_memory=False)
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [5]:
book_tags = pd.read_csv('book_tags.csv', low_memory=False)
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [6]:
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

# Content Based Filtering Recommender System

Filter the book recommendation through similar author

In [7]:
books_authors = books[['book_id', 'title','authors']]
books_authors.head()

Unnamed: 0,book_id,title,authors
0,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
2,41865,"Twilight (Twilight, #1)",Stephenie Meyer
3,2657,To Kill a Mockingbird,Harper Lee
4,4671,The Great Gatsby,F. Scott Fitzgerald


In [8]:
# Remove words such as is, an, the, etc
tf = TfidfVectorizer(analyzer='word', stop_words='english')
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tf.fit_transform(books_authors['authors'])
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [9]:
# Build  1-dimensional array for titles
titles = books_authors['title']
indices = pd.Series(books_authors.index, index=books_authors['title'])

In [10]:
# Function to get book recommendations based on the cosine similarity score of authors
def authors_recommendations(title):
    # Get the index of the matched book title
    idx = indices[title]
    # Get the pairwise similarity scores of all books with the selected
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar books
    sim_scores = sim_scores[1:11]
    # Get the book indices
    book_indices = [i[0] for i in sim_scores]
    # Return the top 10 similar books
    return titles.iloc[book_indices]

In [11]:
# Getting input of book title from user
book_name = input('Enter the book title: ')

Enter the book title: Gone Girl


In [12]:
authors_recommendations(book_name)

225                              Dark Places
243                            Sharp Objects
2168                             The Grownup
9758     Another Bullshit Night in Suck City
2058      American Assassin (Mitch Rapp, #1)
2514         Act of Treason (Mitch Rapp, #9)
2621           Memorial Day (Mitch Rapp, #7)
2829        Consent to Kill (Mitch Rapp, #8)
2834    Separation of Power (Mitch Rapp, #5)
2919      Transfer of Power (Mitch Rapp, #3)
Name: title, dtype: object

In [13]:
# Combine book tags with tags.csv
join_tags = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
join_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [14]:
# Combine file
books_merge_tags = pd.merge(books_authors, join_tags, left_on='book_id', right_on='goodreads_book_id', how='inner')
books_merge_tags.head()

Unnamed: 0,book_id,title,authors,goodreads_book_id,tag_id,count,tag_name
0,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2767052,30574,11314,to-read
1,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2767052,11305,10836,fantasy
2,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2767052,11557,50755,favorites
3,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2767052,8717,35418,currently-reading
4,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2767052,33114,25968,young-adult


In [15]:
# Define a TF-IDF Vectorizer Object.
# Remove words such as 'the', 'a'
# Convert text to feature vectors that can be used as input to estimator
tf1 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix1 = tf1.fit_transform(books_merge_tags['tag_name'].head(10000))
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)
cosine_sim1

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:
# Build  1-dimensional array with book titles
book_titles = books_authors['title']
book_indices = pd.Series(books.index, index=books_authors['title'])

# Function that get book recommendations based on the cosine similarity score of book tags
def tags_recommendations(title):
    # Get the index of the matched book title
    idx = book_indices[title]
    # Get the pairwise similarity scores of all books with the selected
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar books
    sim_scores = sim_scores[1:11]
    # Get the book indices
    book_indices1 = [i[0] for i in sim_scores]
    # Return the top 10 similar books
    return titles.iloc[book_indices1]

In [17]:
book_name1 = input('Enter the book title:  ')

Enter the book title:  Gone Girl


In [18]:
tags_recommendations(book_name1).head(10)

225                              Dark Places
243                            Sharp Objects
2168                             The Grownup
9758     Another Bullshit Night in Suck City
2058      American Assassin (Mitch Rapp, #1)
2514         Act of Treason (Mitch Rapp, #9)
2621           Memorial Day (Mitch Rapp, #7)
2829        Consent to Kill (Mitch Rapp, #8)
2834    Separation of Power (Mitch Rapp, #5)
2919      Transfer of Power (Mitch Rapp, #3)
Name: title, dtype: object

## Recommendation of books using combination of attributes(author and tags) for better results.


In [19]:
temp_df = books_merge_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [20]:
books_authors = pd.merge(books_authors, temp_df, left_on='book_id', right_on='book_id', how='inner')
books_authors.head()

Unnamed: 0,book_id,title,authors,tag_name
0,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,to-read fantasy favorites currently-reading yo...
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",to-read fantasy favorites currently-reading yo...
2,41865,"Twilight (Twilight, #1)",Stephenie Meyer,to-read fantasy favorites currently-reading yo...
3,2657,To Kill a Mockingbird,Harper Lee,to-read favorites currently-reading young-adul...
4,4671,The Great Gatsby,F. Scott Fitzgerald,to-read favorites currently-reading young-adul...


In [21]:
books_authors['collection'] = (pd.Series(books_authors[['authors', 'tag_name']]
                      .fillna('')
                      .values.tolist()
                      ).str.join(' '))

In [22]:
tf_coll = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix_coll = tf_coll.fit_transform(books_authors['collection'])
cosine_sim_coll = linear_kernel(tfidf_matrix_coll, tfidf_matrix_coll)

# Build a 1-dimensional array with book titles
titles = books_authors['title']
indices = pd.Series(books_authors.index, index=books_authors['title'])

# Get book recommendations based on the cosine similarity score of books tags
def collection_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_coll[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [23]:
book_title = input('Enter the book title: ')

Enter the book title: Gone Girl


In [24]:
collection_recommendations(book_title).head(10)

243              Sharp Objects
225                Dark Places
2168               The Grownup
2616    The Kind Worth Killing
3311               Pretty Baby
1344              Pretty Girls
810              The Good Girl
7870               Second Life
60       The Girl on the Train
444       Before I Go to Sleep
Name: title, dtype: object

## Use CountVectorizer to perform recommendation based on authors

In [25]:
book_authors = books[['book_id', 'title','authors']]
book_authors.head()

Unnamed: 0,book_id,title,authors
0,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
2,41865,"Twilight (Twilight, #1)",Stephenie Meyer
3,2657,To Kill a Mockingbird,Harper Lee
4,4671,The Great Gatsby,F. Scott Fitzgerald


In [26]:
book_titles = input('Enter the book title: ')

Enter the book title: Gone Girl


In [27]:
# Initialize vectorizer
# Min_df = rare words, max_df = most used words
vect1 = CountVectorizer(analyzer = 'word', ngram_range = (1,2), stop_words = 'english', min_df = 0.002)

In [28]:
# Fit into the authors
vect1.fit(book_authors['authors'])
title_matrix = vect1.transform(book_authors['authors'])

In [29]:
#features= vect1.get_feature_names() ver problem
features = vect1.get_feature_names_out()
features

array(['abbi', 'abbi glines', 'agatha', 'agatha christie', 'alan', 'alex',
       'alexander', 'alice', 'amy', 'anderson', 'andrew', 'andrews',
       'anita', 'ann', 'anna', 'anne', 'anne rice', 'anthony', 'archer',
       'armentrout', 'armstrong', 'arthur', 'ashley', 'asimov',
       'baldacci', 'banks', 'barbara', 'ben', 'bernard', 'blake',
       'brandon', 'brandon sanderson', 'brett', 'brian', 'brian vaughan',
       'briggs', 'brooks', 'brown', 'bryan', 'butcher', 'cabot', 'card',
       'carl', 'charlaine', 'charlaine harris', 'charles', 'charlie',
       'child', 'chris', 'christie', 'christopher', 'chuck', 'clare',
       'clark', 'clarke', 'clive', 'coben', 'cole', 'colleen', 'connelly',
       'cornwell', 'dan', 'daniel', 'dave', 'david', 'david baldacci',
       'david eddings', 'dean', 'dean koontz', 'diana', 'don', 'donald',
       'douglas', 'douglas preston', 'doyle', 'dr', 'dr seuss', 'eddings',
       'edward', 'elizabeth', 'ellen', 'ellis', 'emily', 'eric', 'erin',

In [30]:
cosine_sim_titles = cosine_similarity(title_matrix, title_matrix)
cosine_sim_titles

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [31]:
indices1 = pd.Series(book_authors.index,index=book_authors['title'])
title_id = indices1[book_titles]

In [32]:
# Find out what features have been considered  by the vectorizer for a given title 
# Squeeze activity matrix into array
feature_array = np.squeeze(title_matrix[title_id].toarray())
idx = np.where(feature_array > 0)

In [33]:
# Cosine similarity with other similar titles and determine the total number of books to recommend
n = 11 
top_n_idx = np.flip(np.argsort(cosine_sim_titles[title_id,]), axis = 0)[0:n]
top_n_sim_values = cosine_sim_titles[title_id, top_n_idx]

In [34]:
# Find top n with values > 0
top_n_idx = top_n_idx[top_n_sim_values > 0]
scores = top_n_sim_values[top_n_sim_values > 0]

In [35]:
book_authors['title'].iloc[title_id]
pd.DataFrame({"books": book_authors['title'].iloc[top_n_idx].values,
           "scores":scores}, columns = ["books","scores"])

Unnamed: 0,books,scores
0,"Pursuit of Honor (Mitch Rapp, #12)",1.0
1,"Act of Treason (Mitch Rapp, #9)",1.0
2,Gone Girl,1.0
3,"The Survivor (Mitch Rapp, #14)",1.0
4,"Extreme Measures (Mitch Rapp, #11)",1.0
5,"American Assassin (Mitch Rapp, #1)",1.0
6,"Transfer of Power (Mitch Rapp, #3)",1.0
7,Dark Places,1.0
8,"Protect and Defend (Mitch Rapp, #10)",1.0
9,"Executive Power (Mitch Rapp, #6)",1.0


## Count vector for tag and author recommendation

In [36]:
book_nama = input('Enter the book title: ')

Enter the book title: Gone Girl


In [37]:
vect1 = CountVectorizer(analyzer = 'word', ngram_range = (1,2), stop_words = 'english', min_df = 0.002)

In [38]:
vect1.fit(books_authors['collection'])
title_matrix = vect1.transform(books_authors['collection'])

In [39]:
#features1= vect1.get_feature_names() ver problem
features1 = vect1.get_feature_names_out()
features

array(['abbi', 'abbi glines', 'agatha', 'agatha christie', 'alan', 'alex',
       'alexander', 'alice', 'amy', 'anderson', 'andrew', 'andrews',
       'anita', 'ann', 'anna', 'anne', 'anne rice', 'anthony', 'archer',
       'armentrout', 'armstrong', 'arthur', 'ashley', 'asimov',
       'baldacci', 'banks', 'barbara', 'ben', 'bernard', 'blake',
       'brandon', 'brandon sanderson', 'brett', 'brian', 'brian vaughan',
       'briggs', 'brooks', 'brown', 'bryan', 'butcher', 'cabot', 'card',
       'carl', 'charlaine', 'charlaine harris', 'charles', 'charlie',
       'child', 'chris', 'christie', 'christopher', 'chuck', 'clare',
       'clark', 'clarke', 'clive', 'coben', 'cole', 'colleen', 'connelly',
       'cornwell', 'dan', 'daniel', 'dave', 'david', 'david baldacci',
       'david eddings', 'dean', 'dean koontz', 'diana', 'don', 'donald',
       'douglas', 'douglas preston', 'doyle', 'dr', 'dr seuss', 'eddings',
       'edward', 'elizabeth', 'ellen', 'ellis', 'emily', 'eric', 'erin',

In [40]:
cosine_sim_titles1 = cosine_similarity(title_matrix, title_matrix)

In [41]:
indices2 = pd.Series(books_authors.index,index=books_authors['title'])
title_id1 = indices2[book_nama]

In [42]:
feature_array = np.squeeze(title_matrix[title_id1].toarray()) #squeeze activity matrix into array
idx = np.where(feature_array > 0)

In [43]:
# Number of books to show
n = 11
top_n_idx1 = np.flip(np.argsort(cosine_sim_titles1[title_id1,]), axis = 0)[0:n]
top_n_sim_values1 = cosine_sim_titles1[title_id1, top_n_idx1]

In [44]:
# Find top n with values > 0
top_n_idx1 = top_n_idx1[top_n_sim_values1 > 0]
scores1 = top_n_sim_values1[top_n_sim_values1 > 0]

In [45]:
books_authors['title'].iloc[title_id1]
pd.DataFrame({"books": books_authors['title'].iloc[top_n_idx1].values,
           "scores":scores1}, columns = ["books","scores"])

Unnamed: 0,books,scores
0,Gone Girl,1.0
1,Where They Found Her,0.800593
2,Pretty Baby,0.799111
3,Sharp Objects,0.795383
4,The Good Girl,0.794806
5,Disclaimer,0.794438
6,The Girl on the Train,0.792013
7,Dark Places,0.790939
8,Every Fifteen Minutes,0.790613
9,Descent,0.78735
