In [1]:
# Import necessary libraries

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load and preview cleaned_df 

cleaned_df = pd.read_csv('cleaned_df.csv')
cleaned_df.head()

Unnamed: 0,user_id,book_id,rating,authors,title,language_code,average_rating,tag
0,1,258,5,"Carlos Ruiz Zafón, Lucia Graves",The Shadow of the Wind (The Cemetery of Forgot...,eng,4.24,wish-list
1,2,4081,4,Tom Wolfe,I am Charlotte Simmons,en-US,3.4,young-adult
2,2,260,5,Dale Carnegie,How to Win Friends and Influence People,eng,4.13,تنمية-بشرية
3,2,9296,5,"Alice Miller, Ruth Ward",The Drama of the Gifted Child: The Search for ...,en-GB,4.09,wish-list
4,2,2318,3,"Thomas J. Stanley, William D. Danko",The Millionaire Next Door: The Surprising Secr...,eng,4.0,wish-list


In [4]:
# Create dataframe with features to be used to check for similarity, 'tag', 'authors'

data = cleaned_df[['title','tag','authors']]
data.head()

Unnamed: 0,title,tag,authors
0,The Shadow of the Wind (The Cemetery of Forgot...,wish-list,"Carlos Ruiz Zafón, Lucia Graves"
1,I am Charlotte Simmons,young-adult,Tom Wolfe
2,How to Win Friends and Influence People,تنمية-بشرية,Dale Carnegie
3,The Drama of the Gifted Child: The Search for ...,wish-list,"Alice Miller, Ruth Ward"
4,The Millionaire Next Door: The Surprising Secr...,wish-list,"Thomas J. Stanley, William D. Danko"


In [5]:
# View number of titles and review counts of each

data.groupby(['title']).count() #9964 titles

Unnamed: 0_level_0,tag,authors
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Angels (Walsh Family, #3)",263,263
"""حكايات فرغلي المستكاوي ""حكايتى مع كفر السحلاوية",113,113
#GIRLBOSS,195,195
'Salem's Lot,4433,4433
"'Tis (Frank McCourt, #2)",703,703
"1,000 Places to See Before You Die",398,398
1/4 جرام,196,196
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",297,297
"100 Bullets, Vol. 1: First Shot, Last Call",170,170
100 Love Sonnets,138,138


In [6]:
# We only need one account of each book
# Drop duplicates, duplicate tags and authors are all the same

data.drop_duplicates(subset ="title", keep="first",inplace=True)
data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(9964, 3)

In [7]:
# Prepare dataframe so tags and authors can be read as one single string

data['title'] = data['title'].map(lambda x: x if type(x)!=str else x.lower())
data['tag'] = data['tag'].map(lambda x: x if type(x)!=str else x.lower())
data['authors'] = data['authors'].map(lambda x: x if type(x)!=str else x.lower())
data['authors'] = data['authors'].map(lambda x: x.replace(" ",""))
data['authors'] = data['authors'].map(lambda x: x.replace(","," "))
data['words_of_interest'] = data.apply(lambda row: row.tag + " " + row.authors, axis=1)
data = data.drop(columns=['tag','authors'])
data = data.set_index('title') 
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

Unnamed: 0_level_0,words_of_interest
title,Unnamed: 1_level_1
"the shadow of the wind (the cemetery of forgotten books, #1)",wish-list carlosruizzafón luciagraves
i am charlotte simmons,young-adult tomwolfe
how to win friends and influence people,تنمية-بشرية dalecarnegie
the drama of the gifted child: the search for the true self,wish-list alicemiller ruthward
the millionaire next door: the surprising secrets of americas wealthy,wish-list thomasj.stanley williamd.danko


In [13]:
# Instantiate CountVectorizer() and compute cosine similarity matrix

cv = CountVectorizer()
cv_matrix = cv.fit_transform(data['words_of_interest'])
cosine_sim = cosine_similarity(cv_matrix, cv_matrix)
indices = pd.Series(data.index)

In [16]:
# Create function to provide top ten books similar to the input

def content_based_recommendations(title):
    recommended_books = []
    idx = indices[indices == title.lower()].index[0]
    sim_scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10 = list(sim_scores.iloc[1:11].index)
    for i in top_10:
        recommended_books.append(list(data.index)[i])    
    return recommended_books

In [17]:
# Recommend books similar to 'The Hunger Games (The Hunger Games, #1)'

content_based_recommendations('The Hunger Games (The Hunger Games, #1)')

['catching fire (the hunger games, #2)',
 'the hunger games (the hunger games, #1)',
 'mockingjay (the hunger games, #3)',
 'the rithmatist (the rithmatist, #1)',
 'if there be thorns (dollanganger, #3)',
 'angelfire (angelfire, #1)',
 'mosquitoland',
 'mitosis (the reckoners, #1.5)',
 'the power of six (lorien legacies, #2)',
 'frostbite (vampire academy, #2)']

In [20]:
# Recommend books similar to 'The Hobbit'

content_based_recommendations('The Hobbit')

['the hobbit',
 'the two towers (the lord of the rings, #2)',
 'the return of the king (the lord of the rings, #3)',
 'the lord of the rings (the lord of the rings, #1-3)',
 'the complete guide to middle-earth',
 'the children of húrin',
 'the silmarillion (middle-earth universe)',
 'vicious (sinners of saint, #1)',
 'harry potter page to screen: the complete filmmaking journey',
 'remember me?']

In [21]:
# Recommend books similar to 'Pride and Prejudice

content_based_recommendations('Pride and Prejudice')

['vicious (sinners of saint, #1)',
 'love & misadventure',
 'the cellist of sarajevo',
 'john dies at the end (john dies at the end, #1)',
 'in the afterlight (the darkest minds, #3)',
 'the man who walked between the towers',
 'adam',
 'succubus blues (georgina kincaid, #1)',
 'breath',
 "william shakespeare's star wars: verily, a new hope (william shakespeare's star wars, #4)"]