## 17_Clean_Scraped_Data_Random_Sample

Author: Daniel Hui

License: MIT

This notebook cleans the scraped data from the Seattle Public Library's online catalog

In [1]:
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

### Load Scraped Book Data

In [4]:
scrape_df = pd.read_csv('../01_Data/04_Scraped/book_data_random.csv',index_col=0)

In [5]:
scrape_df = scrape_df.reset_index(drop=True)
scrape_df.head()

Unnamed: 0,isbn,url,page,dim,avg_rating,tot_ratings,tot_reviews,type,callno,subjects,desc,image
0,1620401371,https://seattle.bibliocommons.com/item/show/31...,185.0,21,3.7,4.0,1.0,Hardcover,809.9332 B3407H 2016,['Travel in literature'],&quot;Written in the irreverent style that mad...,https://secure.syndetics.com/index.aspx?isbn=9...
1,985673486,https://seattle.bibliocommons.com/item/show/32...,269.0,22,,0.0,0.0,Hardcover,FIC REALITY 2012,['Gangs Fiction'],Welcome to Far Rock where opportunity never kn...,https://secure.syndetics.com/index.aspx?isbn=9...
2,1618101110,https://seattle.bibliocommons.com/item/show/28...,24.0,26,,0.0,0.0,Hardcover,J507.8 H5295i 2013,['Observation (Scientific method) Juvenile lit...,This Title Is All About The Scientific Method ...,https://secure.syndetics.com/index.aspx?isbn=9...
3,375864326,https://seattle.bibliocommons.com/item/show/26...,,27,7.8,16.0,,Hardcover,E MULDROW,"['Trees Fiction', 'Ecology Juvenile fiction']",Simple text reveals the benefits of planting a...,https://secure.syndetics.com/index.aspx?isbn=9...
4,792271351,https://seattle.bibliocommons.com/item/show/23...,31.0,30,,0.0,0.0,Hardcover,J811.54 L5871M 2005,"['Monuments Juvenile poetry', ""Children's poet...",Award-winning poet Lewis invites readers to cl...,https://secure.syndetics.com/index.aspx?isbn=9...


### Truncate DataSet

In [7]:
scrape_df = scrape_df[["isbn","page","dim","avg_rating","tot_ratings","tot_reviews"]]
scrape_df = scrape_df.rename({"isbn":"ISBN"},axis=1)
scrape_df.head()

Unnamed: 0,ISBN,page,dim,avg_rating,tot_ratings,tot_reviews
0,1620401371,185.0,21,3.7,4.0,1.0
1,985673486,269.0,22,,0.0,0.0
2,1618101110,24.0,26,,0.0,0.0
3,375864326,,27,7.8,16.0,
4,792271351,31.0,30,,0.0,0.0


In [8]:
scrape_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3103 entries, 0 to 3102
Data columns (total 6 columns):
ISBN           3103 non-null object
page           2822 non-null object
dim            3039 non-null object
avg_rating     1777 non-null float64
tot_ratings    3043 non-null float64
tot_reviews    2122 non-null float64
dtypes: float64(3), object(3)
memory usage: 145.5+ KB


### Clean Object Types to Numerical

In [9]:
def make_num(row):
    try:
        return round(row)
    except: -1   #If this happens there was an error

In [10]:
scrape_df["page"] = scrape_df["page"].apply(make_num)
scrape_df["dim"] = scrape_df["dim"].apply(make_num)
scrape_df = scrape_df.fillna(0)

In [11]:
scrape_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3103 entries, 0 to 3102
Data columns (total 6 columns):
ISBN           3103 non-null object
page           3103 non-null int64
dim            3103 non-null int64
avg_rating     3103 non-null float64
tot_ratings    3103 non-null float64
tot_reviews    3103 non-null float64
dtypes: float64(3), int64(2), object(1)
memory usage: 145.5+ KB


In [12]:
scrape_df[scrape_df["page"] == -1]     #Error Check. Should be Empty

Unnamed: 0,ISBN,page,dim,avg_rating,tot_ratings,tot_reviews


In [13]:
scrape_df[scrape_df["dim"] == -1]     #Error Check. Should be Empty

Unnamed: 0,ISBN,page,dim,avg_rating,tot_ratings,tot_reviews


### Export CSV

In [14]:
scrape_df.to_csv("../01_Data/03_Cleaned/Clean_Book_Data_Random.csv")