In [147]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Reading dataset

In [148]:
book_df = pd.read_csv('books.csv')
book_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,


### check the shape of the dataset

In [149]:
book_df.shape

(11127, 13)

### check dataype

In [150]:
book_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   bookID              11127 non-null  int64 
 1   title               11127 non-null  object
 2   authors             11127 non-null  object
 3   average_rating      11127 non-null  object
 4   isbn                11127 non-null  object
 5   isbn13              11127 non-null  object
 6   language_code       11127 non-null  object
 7   num_pages           11127 non-null  object
 8   ratings_count       11127 non-null  int64 
 9   text_reviews_count  11127 non-null  int64 
 10  publication_date    11127 non-null  object
 11  publisher           11127 non-null  object
 12  Unnamed: 12         3 non-null      object
dtypes: int64(3), object(10)
memory usage: 1.1+ MB


##### ==>> 'average_rating' need to be converted to float 

### check 'average_rating' unique values

In [151]:
book_df['average_rating'].unique()

array(['4.57', '4.49', '4.42', '4.56', '4.78', '3.74', '4.73', '4.38',
       '4.22', '4.21', '3.44', '3.87', '4.07', '3.9', '3.83', '3.86',
       '3.91', '3.93', '4.59', '4.5', '4.36', '4.53', '3.84', '3.72', '4',
       '3.88', '4.33', '3.45', '3.76', '3.6', '4.06', '4.17', '3.55',
       '3.98', '4.13', '3.95', '4.24', '4.34', '3.92', '4.23', '3.75',
       '3.94', '3.77', '4.46', '3.68', '3.99', '3.25', '3.89', '3.82',
       '3.78', '4.01', '4.35', '4.28', '3.34', '3.67', '3.48', '3.5',
       '3.62', '4.27', '3.39', '4.12', '4.08', '4.05', '3.85', '2.99',
       '4.19', '3.71', '4.2', '3.65', '3.4', '4.11', '3.42', '4.02',
       '3.69', '4.1', '3.66', '4.61', '4.04', '3.97', '4.14', '3.57',
       '3.43', '3.22', '3.64', '3.96', '3.8', '3.81', '3.79', '3.23',
       '4.15', '4.39', '4.03', '3.73', '3.51', '3.63', '3.09', '3.59',
       '4.29', '0', '4.25', '4.26', '3.61', '4.16', '4.55', '4.52',
       '3.41', '4.7', '4.18', '3.58', '4.51', '3.15', '3.31', '4.09',
       '3.49'

### Correct inconsistent values

In [152]:
book_df[book_df['average_rating'].isin(['Rawles', 'Son & Ferguson', 'one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)'])]

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net,one of the founding members of this Tolkien we...,3.58,1593600119,9781593600112,eng,400,26,4,4/6/2004,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley,Rawles,3.63,156384155X,9781563841552,eng,342,38,4,1/15/1999,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown,Son & Ferguson,0.0,851742718,9780851742717,eng,49,0,0,5/1/1977,Brown Son & Ferguson Ltd.


##### ===> values from 'isbn' need to be shifted left

In [153]:
columns = book_df[book_df.index.isin([4702, 8979, 5877])].iloc[:,3:].columns
columns

Index(['average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages',
       'ratings_count', 'text_reviews_count', 'publication_date', 'publisher',
       'Unnamed: 12'],
      dtype='object')

In [154]:
for index in [4702, 8979, 5877]:
    for i in range(len(columns)-1):
        book_df.iloc[index, i] = book_df.iloc[index, i + 1] 

book_df[book_df.index.isin([4702, 8979, 5877])]

  book_df.iloc[index, i] = book_df.iloc[index, i + 1]


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
4702,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net,one of the founding members of this Tolkien we...,3.58,1593600119,9781593600112,eng,400,26,26,4,4/6/2004,Cold Spring Press
5877,Patriots (The Coming Collapse),James Wesley,Rawles,3.63,156384155X,9781563841552,eng,342,38,38,4,1/15/1999,Huntington House Publishers
8979,Brown's Star Atlas: Showing All The Bright Sta...,Brown,Son & Ferguson,0.0,851742718,9780851742717,eng,49,0,0,0,5/1/1977,Brown Son & Ferguson Ltd.


### convert 'average_rating' type from categorical to float

In [155]:
book_df['average_rating'] = book_df['average_rating'].astype(float)

### Popularity-based Recommender

In [156]:
def imdb(df, ratings_count, average_rating):
    #m : minimum vote count to be considered
    minimum_vote_count = 0.75*df[ratings_count].max() 
     #C : mean of all ratings
    mean_rating = df[average_rating].mean()

    #create weighted rating 
    df['weighted_rating'] = (df[ratings_count] / (df[ratings_count] + minimum_vote_count)) * df[average_rating] + (minimum_vote_count / (df[ratings_count] + minimum_vote_count)) * mean_rating
    return df.sort_values(by='weighted_rating', ascending=False).head()

recommendation =  imdb(book_df, 'ratings_count', 'average_rating')
recommendation[['title', 'weighted_rating']]

Unnamed: 0,title,weighted_rating
3,Harry Potter and the Prisoner of Azkaban (Harr...,4.186825
0,Harry Potter and the Half-Blood Prince (Harry ...,4.174188
1,Harry Potter and the Order of the Phoenix (Har...,4.147497
4416,Harry Potter and the Chamber of Secrets (Harry...,4.127931
23,The Fellowship of the Ring (The Lord of the Ri...,4.096386


### Content-based Recommender

In [157]:
def vectorize(df, col):

    #instatiate a new vectorizer object
    tfidf = TfidfVectorizer(stop_words='english')

    #Replace NaN with empty string
    df[col] = df[col].fillna('')

    #transforms the description of the movies (col) to tfidf matrix
    tfidf_matrix = tfidf.fit_transform(df[col])

    #calculate distance matrix between items
    distance_matrix = cosine_similarity(tfidf_matrix)
    return distance_matrix

def content_based_recommender(df, title, recomm_title, distance_matrix):

    #create series of indices that are indexed by their titles
    indices = pd.Series(df.index, index = df[title]).drop_duplicates()
    
    
    #fetch index of the movie itself
    id = indices[recomm_title]
    
    #create a list of tuple: each tuple has an id and a distance to the original movie
    if np.ndim(id) > 0:
        distances = np.max(distance_matrix[id], axis=0)

    distances = list(enumerate(distances))

    #sort movies based on distances
    distances = sorted(distances, key = lambda x: x[1], reverse=True)

    #Get the scores of the 5 most similar movies
    #skip the first most similar because it is the movie itself
    distances = distances[1:6]

    #Get the movie indices
    recommendations = [distance[0] for distance in distances]

    #return recommendation titles
    return df[title].iloc[recommendations]


distance_matrix = vectorize(book_df, 'authors')

recommendations = content_based_recommender(book_df, 'title', "A Short History of Nearly Everything", distance_matrix)
recommendations





13                          Bill Bryson's African Diary
14    Bryson's Dictionary of Troublesome Words: A Wr...
15                               In a Sunburned Country
16    I'm a Stranger Here Myself: Notes on Returning...
17    The Lost Continent: Travels in Small Town America
Name: title, dtype: object