In [31]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [22]:
# Read the book data CSV into a DataFrame (path is relative to the notebook's
# working directory).
corpus_data = pd.read_csv(
    filepath_or_buffer="cleaned_data/books_withRatings.csv",
)

# Preview the first ten rows as the cell's output.
corpus_data.head(n=10)

Unnamed: 0,Title,description,authors,publisher,publishedDate,categories,ratingsCount,review/score
0,Night World: Daughters Of Darkness,"""There’s something strange about the new girls...",['L.J. Smith'],Simon and Schuster,2016-12-06,['Juvenile Fiction'],11.0,4.768657
1,The Rabbi's Cat,Gaining the ability to speak after swallowing ...,['Joann Sfar'],Pantheon,2005,['Comics & Graphic Novels'],25.0,4.425
2,From Potter's Field,"The sixth book in the Kay Scarpetta series, fr...",['Patricia Cornwell'],Hachette UK,2008-09-04,['Fiction'],19.0,3.783439
3,Economics in one lesson,"With over a million copies sold, Economics in ...",['Henry Hazlitt'],Currency,2010-08-11,['Business & Economics'],18.0,4.363086
4,Plain And Simple - A Woman's Journey To The Amish,"""I had an obsession with the Amish. Plan and s...",['Sue Bender'],Harper Collins,2009-03-17,['Religion'],11.0,3.527273
5,The Castle in the Attic,The classic children's story about a young boy...,['Elizabeth Winthrop'],Holiday House,2012-05-15,['Juvenile Fiction'],21.0,4.474576
6,Dumb witness,An elderly spinster has been poisoned in her c...,['AGATHA. CHRISTIE'],HarperCollins,2018-03-22,,21.0,4.307692
7,Jean Paul Sartres No Exit and the Flies,English translations of four plays which drama...,['Jean-Paul Sartre'],Vintage,1976,['Drama'],22.0,5.0
8,Mothman Prophecies,This true account of the aliens who invaded th...,['John A. Keel'],Hachette UK,2013-03-28,"['Body, Mind & Spirit']",13.0,3.439759
9,The Gods of Mars,The Barsoom series continues: John Carter retu...,['Edgar Rice Burroughs'],Open Road Media,2020-03-17,['Fiction'],26.0,4.214286


In [32]:
# Text preprocessing for the 'description' column. Adds four list-valued columns
# to corpus_data: tokenized_text, filtered_text, lemmatized_text, stemmed_text.

# Tokenize each description into lowercase tokens.
# pandas loads empty CSV cells as NaN (a float), which has no .lower(); a missing
# description therefore becomes an empty token list instead of raising
# AttributeError and aborting the whole cell.
corpus_data['tokenized_text'] = corpus_data['description'].apply(
    lambda x: word_tokenize(str(x).lower()) if pd.notna(x) else []
)

# Remove stop words
# A set gives O(1) membership tests inside the per-token filter below.
# Note: punctuation tokens are not in the stop-word list, so they pass through.
stop_words = set(stopwords.words('english'))
corpus_data['filtered_text'] = corpus_data['tokenized_text'].apply(
    lambda x: [word for word in x if word not in stop_words]
)

# Lemmatization using WordNet lemmatizer
# lemmatize() is called without a part-of-speech argument, so the library's
# default POS is used for every token.
lemmatizer = WordNetLemmatizer()
corpus_data['lemmatized_text'] = corpus_data['filtered_text'].apply(
    lambda x: [lemmatizer.lemmatize(word) for word in x]
)

# Stemming using Porter stemmer
# Applied to filtered_text (not to the lemmatized tokens), so the stemmed and
# lemmatized columns are two independent normalizations of the same input.
stemmer = PorterStemmer()
corpus_data['stemmed_text'] = corpus_data['filtered_text'].apply(
    lambda x: [stemmer.stem(word) for word in x]
)

# Display the DataFrame
corpus_data.head(10)

Unnamed: 0,Title,description,authors,publisher,publishedDate,categories,ratingsCount,review/score,tokenized_text,filtered_text,lemmatized_text,stemmed_text
0,Night World: Daughters Of Darkness,"""There’s something strange about the new girls...",['L.J. Smith'],Simon and Schuster,2016-12-06,['Juvenile Fiction'],11.0,4.768657,"[``, there, ’, s, something, strange, about, t...","[``, ’, something, strange, new, girls, town, ...","[``, ’, something, strange, new, girl, town, ....","[``, ’, someth, strang, new, girl, town, ., br..."
1,The Rabbi's Cat,Gaining the ability to speak after swallowing ...,['Joann Sfar'],Pantheon,2005,['Comics & Graphic Novels'],25.0,4.425,"[gaining, the, ability, to, speak, after, swal...","[gaining, ability, speak, swallowing, parakeet...","[gaining, ability, speak, swallowing, parakeet...","[gain, abil, speak, swallow, parakeet, ,, rabb..."
2,From Potter's Field,"The sixth book in the Kay Scarpetta series, fr...",['Patricia Cornwell'],Hachette UK,2008-09-04,['Fiction'],19.0,3.783439,"[the, sixth, book, in, the, kay, scarpetta, se...","[sixth, book, kay, scarpetta, series, ,, ., 1,...","[sixth, book, kay, scarpetta, series, ,, ., 1,...","[sixth, book, kay, scarpetta, seri, ,, ., 1, b..."
3,Economics in one lesson,"With over a million copies sold, Economics in ...",['Henry Hazlitt'],Currency,2010-08-11,['Business & Economics'],18.0,4.363086,"[with, over, a, million, copies, sold, ,, econ...","[million, copies, sold, ,, economics, one, les...","[million, copy, sold, ,, economics, one, lesso...","[million, copi, sold, ,, econom, one, lesson, ..."
4,Plain And Simple - A Woman's Journey To The Amish,"""I had an obsession with the Amish. Plan and s...",['Sue Bender'],Harper Collins,2009-03-17,['Religion'],11.0,3.527273,"[``, i, had, an, obsession, with, the, amish, ...","[``, obsession, amish, ., plan, simple, ., obj...","[``, obsession, amish, ., plan, simple, ., obj...","[``, obsess, amish, ., plan, simpl, ., object,..."
5,The Castle in the Attic,The classic children's story about a young boy...,['Elizabeth Winthrop'],Holiday House,2012-05-15,['Juvenile Fiction'],21.0,4.474576,"[the, classic, children, 's, story, about, a, ...","[classic, children, 's, story, young, boy, ,, ...","[classic, child, 's, story, young, boy, ,, toy...","[classic, children, 's, stori, young, boy, ,, ..."
6,Dumb witness,An elderly spinster has been poisoned in her c...,['AGATHA. CHRISTIE'],HarperCollins,2018-03-22,,21.0,4.307692,"[an, elderly, spinster, has, been, poisoned, i...","[elderly, spinster, poisoned, country, home, ....","[elderly, spinster, poisoned, country, home, ....","[elderli, spinster, poison, countri, home, ......"
7,Jean Paul Sartres No Exit and the Flies,English translations of four plays which drama...,['Jean-Paul Sartre'],Vintage,1976,['Drama'],22.0,5.0,"[english, translations, of, four, plays, which...","[english, translations, four, plays, dramatize...","[english, translation, four, play, dramatize, ...","[english, translat, four, play, dramat, theme,..."
8,Mothman Prophecies,This true account of the aliens who invaded th...,['John A. Keel'],Hachette UK,2013-03-28,"['Body, Mind & Spirit']",13.0,3.439759,"[this, true, account, of, the, aliens, who, in...","[true, account, aliens, invaded, town, point, ...","[true, account, alien, invaded, town, point, p...","[true, account, alien, invad, town, point, ple..."
9,The Gods of Mars,The Barsoom series continues: John Carter retu...,['Edgar Rice Burroughs'],Open Road Media,2020-03-17,['Fiction'],26.0,4.214286,"[the, barsoom, series, continues, :, john, car...","[barsoom, series, continues, :, john, carter, ...","[barsoom, series, continues, :, john, carter, ...","[barsoom, seri, continu, :, john, carter, retu..."
