In [39]:
import ast
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [40]:
# Load the book metadata; cells containing the literal text "None" are read as missing.
raw = pd.read_csv(
    "data_sets/Books_Raw.csv",
    na_values=["None"],
)

In [41]:
# Load the reviews file and report how long the read took (wall-clock seconds).
start_time = time.time()
reviews = pd.read_csv(
    "data_sets/Reviews_Raw.csv",
    na_values=["None"],
)
end_time = time.time()
print(f"Import time: {end_time - start_time} seconds")

Import time: 18.409472465515137 seconds


In [42]:
# Normalise missing values: read_csv stores absent cells as NaN, while the
# cleaning helpers in this notebook test for `None`, so swap NaN -> None in
# both frames.
# The NaN sentinel is taken from numpy directly instead of being looked up in
# one specific row of the data, which only works while that row happens to be
# missing and the file keeps its current length.
missing_val = np.nan
reviews = reviews.replace(missing_val, None)
raw = raw.replace(missing_val, None)

## Clean the book file

In [19]:
# Turn a stringified list such as "['A', 'B']" into the plain text "A, B":
# the outer square brackets and the quotes around each entry are removed, and
# multiple entries (e.g. several authors) stay separated by ", ".
def clean_author(str):
    """Flatten the text form of a list into a comma-separated string.

    Parameters
    ----------
    str : str, None or NaN
        The cell value to clean. The parameter name shadows the builtin; it is
        kept unchanged so existing callers are unaffected.

    Returns
    -------
    str or None
        ``None`` for a missing value (``None`` or NaN). A value that is not
        wrapped in square brackets is returned unchanged, so applying the
        function twice does not eat characters. Otherwise the entries of the
        list joined with ``", "``.
    """
    text = str  # local alias; avoids leaning on the shadowed builtin name below
    if text is None or pd.isna(text):
        return None

    # Already-clean values (no surrounding brackets) are left alone so that
    # re-running the cell is harmless.
    if not (text.startswith("[") and text.endswith("]")):
        return text

    try:
        # Parse the list literal properly: entries containing an apostrophe are
        # written with double quotes, which plain quote-stripping mangles.
        items = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        items = None

    if not isinstance(items, list):
        # Not a parseable list literal: fall back to stripping the brackets
        # and single quotes textually.
        return text[1:-1].replace("'", "")

    return ", ".join(f"{item}" for item in items)

# Clean every entry of the authors column element by element.
raw["authors"] = raw["authors"].map(clean_author)

In [20]:
# Remove the columns that are not needed, mutating `raw` in place.
raw.drop(
    columns=[
        "description",
        "image",
        "previewLink",
        "infoLink",
        "ratingsCount",
    ],
    inplace=True,
)

In [21]:
def my_count(str):
    """Count the single-quote characters in a cell value.

    Parameters
    ----------
    str : str, None or NaN
        The cell value to inspect. The parameter name shadows the builtin; it
        is kept unchanged so existing callers are unaffected.

    Returns
    -------
    int or None
        ``None`` when the value is missing (``None`` or NaN), otherwise the
        number of ``'`` characters it contains.
    """
    # Identity test for None plus a NaN check, so a missing value never
    # reaches `.count` regardless of how it is represented.
    if str is None or pd.isna(str):
        return None
    return str.count("'")
        
# Store the per-row quote count of the categories text in a helper column.
raw["categories_count"] = raw["categories"].map(my_count)

In [22]:
# Rows whose categories text holds more than two quote marks.
raw.loc[raw["categories_count"].gt(2.0)]

# The result is empty: no book has more than one category, so the quotes and
# brackets can safely be removed the same way as for authors.

Unnamed: 0,Title,authors,publisher,publishedDate,categories,categories_count


In [23]:
# Clean the categories text with the same helper used for authors, then
# discard the helper column that was only needed for the check above.
raw["categories"] = raw["categories"].map(clean_author)
raw.drop(columns="categories_count", inplace=True)

In [24]:
# Show the cleaned book table (the rendered output elides the middle rows).
raw

Unnamed: 0,Title,authors,publisher,publishedDate,categories
0,Its Only Art If Its Well Hung!,Julie Strain,,1996,Comics & Graphic Novels
1,Dr. Seuss: American Icon,Philip Nel,A&C Black,2005-01-01,Biography & Autobiography
2,Wonderful Worship in Smaller Churches,David R. Ray,,2000,Religion
3,Whispers of the Wicked Saints,Veronica Haddon,iUniverse,2005-02,Fiction
4,"Nation Dance: Religion, Identity and Cultural ...",Edward Long,,2003-03-01,
...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,Elvira Woodruff,Scholastic Paperbacks,2000-06-01,Juvenile Fiction
212400,Red Boots for Christmas,,,1995,Juvenile Fiction
212401,Mamaw,Wild Wild Cabbage,,2018-01-17,
212402,The Autograph Man,Zadie Smith,Vintage,2003-08-12,Fiction


In [36]:
# One record has no title at all; select the rows where Title is missing.
raw.loc[raw["Title"].isna()]

Unnamed: 0,Title,authors,publisher,publishedDate,categories
1066,,Maharshi Ramana,,2015-12-15,


## Clean the reviews file

In [43]:
# Remove the free-text review columns, mutating `reviews` in place.
reviews.drop(columns=["review/text", "review/summary"], inplace=True)

In [44]:
# Reviews without a title cannot be matched to a book, so discard them,
# then show the non-null count of every remaining column.
reviews.dropna(axis="index", subset=["Title"], inplace=True)
reviews.count()

Id                    2999792
Title                 2999792
Price                  481164
User_id               2438018
profileName           2437900
review/helpfulness    2999792
review/score          2999792
review/time           2999792
dtype: int64

## Add review data to the raw data

In [70]:
# Mean review score for each distinct title.
average_score_per_title = (
    reviews["review/score"]
    .groupby(reviews["Title"])
    .mean()
)
average_score_per_title

Title
" Film technique, " and, " Film acting "                                                                                                   4.500000
" We'll Always Have Paris": The Definitive Guide to Great Lines from the Movies                                                            5.000000
"... And Poetry is Born ..." Russian Classical Poetry                                                                                      4.000000
"A Titanic hero" Thomas Andrews, shipbuilder                                                                                               4.875000
"A Truthful Impression of the Country": British and American Travel Writing in China, 1880-1949                                            4.000000
                                                                                                                                             ...   
with an everlasting love                                                                                  

In [69]:
# Notice some of these titles are WILD
# Confirm the odd-looking titles really occur in the data. The comparison is
# vectorised over the whole column instead of iterating row by row in Python,
# which matters with millions of reviews.
print(bool(reviews["Title"].eq('" Film technique, " and, " Film acting "').any()))
print(bool(reviews["Title"].eq('www.whitbread.org/book').any()))

True
True


In [72]:
# How many reviews each title received, most-reviewed first.
number_of_reviews_per_title = reviews["Title"].value_counts(
    sort=True,
    ascending=False,
)
number_of_reviews_per_title

Title
The Hobbit                                                                                                         22023
Pride and Prejudice                                                                                                20371
Atlas Shrugged                                                                                                     12513
Wuthering Heights                                                                                                  10780
The Giver                                                                                                           7644
                                                                                                                   ...  
Illuminatus! Part I: The Eye in the Pyramid (The eye in the Pyramid, The Golden Apple,Leviathan, Parts 1, 2, 3)        1
Modern Chess Openings                                                                                                  1
Child Abuse and Neglect: C