In [22]:
import pandas as pd

In [23]:
# Stored the dataset csv books.csv in a pandas DataFrame books

books = pd.read_csv('books.csv');
books.head()

Unnamed: 0,Name,Genre,Description
0,dog heaven,children,in newbery medalist cynthia rylant's classic b...
1,the saturdays,children,meet the melendys! the four melendy children l...
2,the legend of rock paper scissors,children,you've played the game. now read the legend of...
3,leo the littlest seahorse,children,"in the warm waters of the coral reef, one hund..."
4,miss mary mack,children,everyone knows some version of this popular ch...


In [24]:
books.isnull().sum()                                              # checks for total number of null values
books.dropna(inplace=True)                                        # drops null values
books.duplicated().sum()                                          # checks for duplicate values 
books = books.drop_duplicates(subset=['Name', 'Description'])     # drops duplicate values

In [25]:
# Converts the string to a list for better processing

books['Genre'] = books['Genre'].apply(lambda x:x.split())
books['Description'] = books['Description'].apply(lambda X:X.split())
books.head()

Unnamed: 0,Name,Genre,Description
0,dog heaven,[children],"[in, newbery, medalist, cynthia, rylant's, cla..."
1,the saturdays,[children],"[meet, the, melendys!, the, four, melendy, chi..."
2,the legend of rock paper scissors,[children],"[you've, played, the, game., now, read, the, l..."
3,leo the littlest seahorse,[children],"[in, the, warm, waters, of, the, coral, reef,,..."
4,miss mary mack,[children],"[everyone, knows, some, version, of, this, pop..."


In [26]:
# Creates a new column Tags which is a combination of two existing columns - Description, Genre

books['Tags'] = books['Description'] + books['Genre']
books['Tags'].head()

0    [in, newbery, medalist, cynthia, rylant's, cla...
1    [meet, the, melendys!, the, four, melendy, chi...
2    [you've, played, the, game., now, read, the, l...
3    [in, the, warm, waters, of, the, coral, reef,,...
4    [everyone, knows, some, version, of, this, pop...
Name: Tags, dtype: object

In [27]:
# Creates a new DataFrame called modifies_books_data by selecting the columns Name, Genre, Description from existing DataFrame Books

books['Name'] = books['Name'].apply(lambda x: x.title())                    # Converts font of Name to Title Case
books['Tags'] = books['Tags'].apply(lambda x:" ".join(x))                   # Converts list to a single string
books['Description'] = books['Description'].apply(lambda x:" ".join(x))     # Converts list to a single string
books['Description'] = books['Description'].apply(lambda x:x.capitalize())     # Converts list to a single string
books['Genre'] = books['Genre'].apply(lambda x:" ".join(x))                 # Converts list to a single string
books['Tags'] = books['Tags'].apply(lambda x:x.lower())                     # Converts font style of Tags to be lower
books.head()

Unnamed: 0,Name,Genre,Description,Tags
0,Dog Heaven,children,In newbery medalist cynthia rylant's classic b...,in newbery medalist cynthia rylant's classic b...
1,The Saturdays,children,Meet the melendys! the four melendy children l...,meet the melendys! the four melendy children l...
2,The Legend Of Rock Paper Scissors,children,You've played the game. now read the legend of...,you've played the game. now read the legend of...
3,Leo The Littlest Seahorse,children,"In the warm waters of the coral reef, one hund...","in the warm waters of the coral reef, one hund..."
4,Miss Mary Mack,children,Everyone knows some version of this popular ch...,everyone knows some version of this popular ch...


In [28]:
# A UDF to stem the data (Stemming - Text normalization Technique)
# Eg: running - run

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y=[]

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

books['Tags'] = books['Tags'].apply(stem)

In [29]:
# Converts collection of text documents into matrices

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

# Creates vector relation between the column Tags

vectors = cv.fit_transform(books['Tags']).toarray()
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'zombi', 'zone', 'zoo'], dtype=object)

In [30]:
# Finds the distance between the vectors with each other

from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [31]:
# A UDF which takes name of book as an argument / parameter and returns names of books which are closest to the provided book based on similarity matrix

def recommend(book):
    book_index = books[books['Name'] == book].index[0]
    book_genre = books.iloc[book_index]['Genre']

    same_genre_books = books[books['Genre'] == book_genre]

    distances = similarity[book_index][same_genre_books.index]
    books_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    for i in books_list:
        print(same_genre_books.iloc[i[0]].Name)

In [32]:
recommend('Verity')

Carson'S Conspiracy (Sir John Appleby, #35)
A Reason To Stay (Heroes #1)
The Elements
The Wife Between Us
Gone Girl


In [33]:
# Importing modified_books_data DataFrame (in the form of dictionary for ease) and similarity matrix

import pickle
pickle.dump(books.to_dict(), open('book_dict.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))