In [43]:
from pathlib import Path

import langid
import numpy as np
import pandas as pd

## Cleaning the Data

In [29]:
# Read the raw book metadata and report how many rows it contains.
books_data1 = pd.read_csv(filepath_or_buffer='data/books_data.csv')
print(f"Total Books: {books_data1.shape[0]}")

Total Books: 212404


In [30]:
# Dropping uninformative columns
books_data = books_data1.drop(columns=["image","previewLink","infoLink"])

# Removing books that are missing a description or a ratings count
# (a row is dropped as soon as either of the two values is NaN).
books_data = books_data.dropna(subset=["description", "ratingsCount"])

print(f'Books Remaining: {books_data.shape[0]}')


Books Remaining: 45127


In [31]:

# Keep only the books whose ratingsCount is strictly greater than 10,
# then report the remaining row count and preview the first rows.
books_data = books_data.loc[books_data["ratingsCount"].gt(10)]
print(f'Books Remaining: {books_data.shape[0]}')
books_data.head()

Books Remaining: 6424


Unnamed: 0,Title,description,authors,publisher,publishedDate,categories,ratingsCount
73,Night World: Daughters Of Darkness,"""There’s something strange about the new girls...",['L.J. Smith'],Simon and Schuster,2016-12-06,['Juvenile Fiction'],11.0
111,The Rabbi's Cat,Gaining the ability to speak after swallowing ...,['Joann Sfar'],Pantheon,2005,['Comics & Graphic Novels'],25.0
115,From Potter's Field,"The sixth book in the Kay Scarpetta series, fr...",['Patricia Cornwell'],Hachette UK,2008-09-04,['Fiction'],19.0
122,Economics in one lesson,"With over a million copies sold, Economics in ...",['Henry Hazlitt'],Currency,2010-08-11,['Business & Economics'],18.0
225,Plain And Simple - A Woman's Journey To The Amish,"""I had an obsession with the Amish. Plan and s...",['Sue Bender'],Harper Collins,2009-03-17,['Religion'],11.0


In [44]:
def is_english(text):
    """Return True when langid classifies ``text`` as English.

    Parameters
    ----------
    text : object
        Value to classify. Anything that is not a ``str`` (for example the
        float NaN pandas uses for a missing cell, or ``None``) is reported
        as not English instead of being passed to langid.

    Returns
    -------
    bool
        True if the language code returned by ``langid.classify`` is "en",
        False otherwise (including for non-string input).
    """
    # Missing cells reach Series.apply() as NaN/None rather than text;
    # treat them as "not English" so one empty cell cannot abort the filter.
    if not isinstance(text, str):
        return False

    # classify() returns a (language code, score) pair; only the code matters.
    lang, _ = langid.classify(text)
    return lang == "en"

# Keep only the books whose description is classified as English.
books_data = books_data.loc[books_data['description'].apply(is_english)]

In [49]:
# Row count after the language filter, followed by a ten-row preview.
print(f'Books Remaining: {books_data.shape[0]}')
books_data.head(n=10)

Books Remaining: 6399


Unnamed: 0,Title,description,authors,publisher,publishedDate,categories,ratingsCount
73,Night World: Daughters Of Darkness,"""There’s something strange about the new girls...",['L.J. Smith'],Simon and Schuster,2016-12-06,['Juvenile Fiction'],11.0
111,The Rabbi's Cat,Gaining the ability to speak after swallowing ...,['Joann Sfar'],Pantheon,2005,['Comics & Graphic Novels'],25.0
115,From Potter's Field,"The sixth book in the Kay Scarpetta series, fr...",['Patricia Cornwell'],Hachette UK,2008-09-04,['Fiction'],19.0
122,Economics in one lesson,"With over a million copies sold, Economics in ...",['Henry Hazlitt'],Currency,2010-08-11,['Business & Economics'],18.0
225,Plain And Simple - A Woman's Journey To The Amish,"""I had an obsession with the Amish. Plan and s...",['Sue Bender'],Harper Collins,2009-03-17,['Religion'],11.0
269,The Castle in the Attic,The classic children's story about a young boy...,['Elizabeth Winthrop'],Holiday House,2012-05-15,['Juvenile Fiction'],21.0
359,Dumb witness,An elderly spinster has been poisoned in her c...,['AGATHA. CHRISTIE'],HarperCollins,2018-03-22,,21.0
368,Jean Paul Sartres No Exit and the Flies,English translations of four plays which drama...,['Jean-Paul Sartre'],Vintage,1976,['Drama'],22.0
466,Mothman Prophecies,This true account of the aliens who invaded th...,['John A. Keel'],Hachette UK,2013-03-28,"['Body, Mind & Spirit']",13.0
475,The Gods of Mars,The Barsoom series continues: John Carter retu...,['Edgar Rice Burroughs'],Open Road Media,2020-03-17,['Fiction'],26.0


In [50]:
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
Path("cleaned_data").mkdir(parents=True, exist_ok=True)
books_data.to_csv("cleaned_data/cleaned_book_data.csv", index=False)

## Ratings

In [54]:
# Read the raw review table.
ratings_data = pd.read_csv(filepath_or_buffer='data/Books_rating.csv')

In [56]:
# Report the number of rows (one per review) in the raw review table.
print(f"Total Reviews: {ratings_data.shape[0]}")

Total Reviews: 3000000


In [57]:
# Discard the review columns that are not used further, then preview.
ratings = ratings_data.drop(
    columns=["Id","User_id","profileName","review/time","Price"]
)
ratings.head()

Unnamed: 0,Title,review/helpfulness,review/score,review/summary,review/text
0,Its Only Art If Its Well Hung!,7/7,4.0,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,Dr. Seuss: American Icon,10/10,5.0,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,Dr. Seuss: American Icon,10/11,5.0,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,Dr. Seuss: American Icon,7/7,4.0,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Dr. Seuss: American Icon,3/3,4.0,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


Keeping only reviews that more than 10 people have voted on (the second number in `review/helpfulness`):

In [58]:
# "review/helpfulness" is split on "/" into separate columns; column 1 (the
# part after the slash) is the value compared against the threshold below.
# expand=True keeps the index of `ratings`, so the boolean mask lines up
# with `ratings` by label instead of relying on a rebuilt 0..n-1 index.
helpfulness = ratings["review/helpfulness"].str.split("/", expand=True)

# Anything that cannot be parsed as a number becomes NaN, and NaN > 10 is
# False, so such rows are simply excluded.
reviews = pd.to_numeric(helpfulness[1], errors="coerce")

filtered_ratings = ratings[reviews > 10]
print(f'Ratings Count: {filtered_ratings.shape[0]}')
filtered_ratings.head(10)

Ratings Count: 488257


Unnamed: 0,Title,review/helpfulness,review/score,review/summary,review/text
2,Dr. Seuss: American Icon,10/11,5.0,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
14,Whispers of the Wicked Saints,7/11,1.0,not good,I bought this book because I read some glowing...
47,The Church of Christ: A Biblical Ecclesiology ...,74/81,5.0,Ecclesiological Milestone,With the publication of Everett Ferguson's boo...
81,The Ultimate Guide to Law School Admission: In...,27/29,3.0,No &quot;Insider&quot; Secrets,If you are someone who is fairly new to the la...
84,The Ultimate Guide to Law School Admission: In...,9/12,5.0,Exactly what I needed.,This book answered all my questions about gett...
86,The Ultimate Guide to Law School Admission: In...,7/12,5.0,"THIS BOOK SAVED ME OVER $100,000.00!!!",Boy am I lucky to have found The Ultimate Guid...
88,The Repeal of Reticence: A History of America'...,29/29,5.0,Great treatment of the defeat of reticence by ...,"Using a quiet, restrained writing style that i..."
91,Alaska Sourdough,36/37,5.0,Real Alaskan Sourdough,Ruth Allman has written an excellent book abou...
92,Alaska Sourdough,29/30,5.0,True Alaskan cooking,"I have been using this book since 1988, the ei..."
93,Alaska Sourdough,25/28,5.0,Cheechako to Sourdough in 190 Pages,"My poor dogeared, stained copy of this book ca..."


Keeping only the ratings whose book title is still present in the cleaned book data:

In [59]:
# Distinct titles present in the cleaned book table.
selected_books = books_data["Title"].unique()
# Keep only the reviews whose Title is one of those titles.
filtered_ratings = filtered_ratings.loc[
    filtered_ratings["Title"].isin(selected_books)
]

In [60]:
# Row count after restricting reviews to the selected titles, plus a preview.
# Label spelled "Ratings Count" to match the earlier count printout.
print(f'Ratings Count: {filtered_ratings.shape[0]}')
filtered_ratings.head(10)

Raings Count: 94573


Unnamed: 0,Title,review/helpfulness,review/score,review/summary,review/text
979,The Rabbi's Cat,25/27,5.0,"A wonderful, funny-sad book",The Rabbi's Cat is a wonderful book. I heard t...
980,The Rabbi's Cat,14/14,5.0,My Rabbi loved this book too,This book is great fun to read. The cat is a w...
981,The Rabbi's Cat,14/15,5.0,"This is the most wonderful, charming Jewish bo...",What can I say? The cat is one of the most cha...
982,The Rabbi's Cat,22/26,4.0,Blessed are those who transgress,This book works on many different levels. I fe...
994,The Rabbi's Cat,8/13,3.0,Too Aimless for My Taste,"I've long been a fan of the graphic novel, and..."
995,The Rabbi's Cat,5/21,1.0,Unpleasant and disappointing,"This is a nasty, mean-spirited book that exhib..."
996,The Rabbi's Cat,1/15,3.0,Grafic Novels,Our book club tried the Rabbi's Cat to see wha...
997,The Rabbi's Cat,15/51,1.0,Self-Serving Message,"This is the tale of a kindly Rabbi, his sweet ..."
998,The Rabbi's Cat,6/33,1.0,Not what it looks like on the cover.,As someone who is interested in different cult...
1036,From Potter's Field,4/14,2.0,More of a &quot;yawn&quot; than a &quot;scream...,"Usually I read &quot;general fiction&quot;, bu..."


In [61]:
# Keep only the reviews whose text is classified as English.
eng_filtered_ratings = filtered_ratings[filtered_ratings['review/text'].apply(is_english)]
filtered_ratings = eng_filtered_ratings

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
Path("cleaned_data").mkdir(parents=True, exist_ok=True)
filtered_ratings.to_csv("cleaned_data/cleaned_ratings.csv", index=False)