# Goodreads

In [1]:
%matplotlib inline 

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

#sets up pandas table display
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

In [2]:
df = pd.read_csv("data/all.csv", header=None,
                names=['rating', 'review_count', 'isbn',
                      'booktype','author_url', 'year',
                      'genre_urls', 'dir','rating_count', 'name']
)

df.head(10)

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.4,136455,439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008.0,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003.0,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005.0,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
3,4.23,47906,61120081,good_reads:book,https://www.goodreads.com/author/show/1825.Har...,1960.0,/genres/classics|/genres/fiction|/genres/histo...,dir01/2657.To_Kill_a_Mockingbird.html,2078123,To Kill a Mockingbird
4,4.23,34772,679783261,good_reads:book,https://www.goodreads.com/author/show/1265.Jan...,1813.0,/genres/classics|/genres/fiction|/genres/roman...,dir01/1885.Pride_and_Prejudice.html,1388992,Pride and Prejudice
5,4.25,12363,446675539,good_reads:book,https://www.goodreads.com/author/show/11081.Ma...,1936.0,/genres/classics|/genres/historical-fiction|/g...,dir01/18405.Gone_with_the_Wind.html,645470,Gone with the Wind
6,4.22,7205,66238501,good_reads:book,https://www.goodreads.com/author/show/1069006....,1949.0,/genres/classics|/genres/young-adult|/genres/c...,dir01/11127.The_Chronicles_of_Narnia.html,286677,The Chronicles of Narnia (Chronicles of Narnia...
7,4.38,10902,60256656,good_reads:book,https://www.goodreads.com/author/show/435477.S...,1964.0,/genres/childrens|/genres/young-adult|/genres/...,dir01/370493.The_Giving_Tree.html,502891,The Giving Tree
8,3.79,20670,452284244,good_reads:book,https://www.goodreads.com/author/show/3706.Geo...,1945.0,/genres/classics|/genres/fiction|/genres/scien...,dir01/7613.Animal_Farm.html,1364879,Animal Farm
9,4.18,12302,345391802,good_reads:book,https://www.goodreads.com/author/show/4.Dougla...,1979.0,/genres/science-fiction|/genres/humor|/genres/...,dir01/11.The_Hitchhiker_s_Guide_to_the_Galaxy....,724713,The Hitchhiker's Guide to the Galaxy (Hitchhik...


In [3]:
# Column Data Types
df.dtypes

rating          float64
review_count     object
isbn             object
booktype         object
author_url       object
year            float64
genre_urls       object
dir              object
rating_count     object
name             object
dtype: object

In [4]:
# Tuple representing the dimensionality of the DataFrame
# 6000 rows, 10 columns
print(df.shape)
print(df.shape[0])
print(df.shape[1])

(6000, 10)
6000
10


In [5]:
# Numpy representation of the DataFrame. Axes labels will be removed.
df.values

array([[4.4, '136455', '0439023483', ...,
        'dir01/2767052-the-hunger-games.html', '2958974',
        'The Hunger Games (The Hunger Games, #1)'],
       [4.41, '16648', '0439358078', ...,
        'dir01/2.Harry_Potter_and_the_Order_of_the_Phoenix.html',
        '1284478',
        'Harry Potter and the Order of the Phoenix (Harry Potter, #5)'],
       [3.56, '85746', '0316015849', ..., 'dir01/41865.Twilight.html',
        '2579564', 'Twilight (Twilight, #1)'],
       ...,
       [3.78, '540', '1620612321', ..., 'dir60/13503247-flawed.html',
        '2971', 'Flawed'],
       [3.91, '281', nan, ..., 'dir60/2750008.html', '3083',
        'Ø£Ø³Ø¹Ø¯ Ø§Ù\x85Ø±Ø£Ø© Ù\x81Ù\x8a Ø§Ù\x84Ø¹Ø§Ù\x84Ù\x85'],
       [4.35, '61', '0786929081', ...,
        'dir60/66677.Legacy_of_the_Drow_Collector_s_Edition.html',
        '3982',
        "Legacy of the Drow Collector's Edition (Legacy of the Drow, #1-4; Legend of Drizzt, #7-10)"]],
      dtype=object)

In [6]:
df.columns

Index(['rating', 'review_count', 'isbn', 'booktype', 'author_url', 'year', 'genre_urls', 'dir', 'rating_count', 'name'], dtype='object')

In [7]:
# Columnar Datatype for "rating" column
type(df[df.columns[0]]), type(df.rating)

(pandas.core.series.Series, pandas.core.series.Series)

In [8]:
# Query rows based on condition
df.year > 2000

0        True
1        True
2        True
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11       True
12       True
13      False
14      False
15      False
16       True
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26       True
27      False
28      False
29      False
        ...  
5970    False
5971    False
5972     True
5973     True
5974     True
5975     True
5976     True
5977    False
5978     True
5979     True
5980    False
5981     True
5982     True
5983    False
5984     True
5985     True
5986     True
5987    False
5988    False
5989     True
5990    False
5991     True
5992     True
5993     True
5994     True
5995     True
5996     True
5997     True
5998     True
5999     True
Name: year, Length: 6000, dtype: bool

In [9]:
# Query dataset based on year and show all columns (using query string)
df.query("year > 2000")

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.40,136455,0439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008.0,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,0439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003.0,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,0316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005.0,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
11,3.72,34959,0307277674,good_reads:book,https://www.goodreads.com/author/show/630.Dan_...,2003.0,/genres/mystery|/genres/thriller|/genres/suspe...,dir01/968.The_Da_Vinci_Code.html,1220657,"The Da Vinci Code (Robert Langdon, #2)"
12,4.36,69524,0375831002,good_reads:book,https://www.goodreads.com/author/show/11466.Ma...,2005.0,/genres/historical-fiction|/genres/young-adult...,dir01/19063.The_Book_Thief.html,675431,The Book Thief
16,3.92,38061,,good_reads:book,https://www.goodreads.com/author/show/498072.A...,2003.0,/genres/fiction|/genres/romance|/genres/fantas...,dir01/18619684-the-time-traveler-s-wife.html,927254,The Time Traveler's Wife
26,4.43,112279,0525478817,good_reads:book,https://www.goodreads.com/author/show/1406384....,2012.0,/genres/young-adult|/genres/book-club|/genres/...,dir01/11870085-the-fault-in-our-stars.html,1150626,The Fault in Our Stars
32,4.44,70247,0399155341,good_reads:book,https://www.goodreads.com/author/show/1943477....,2009.0,/genres/book-club|/genres/historical-fiction|/...,dir01/4667024-the-help.html,1091909,The Help
37,3.72,31131,0316166685,good_reads:book,https://www.goodreads.com/author/show/316.Alic...,2002.0,/genres/fiction|/genres/mystery|/genres/young-...,dir01/12232938-the-lovely-bones.html,1190963,The Lovely Bones
38,4.30,33912,1594489505,good_reads:book,https://www.goodreads.com/author/show/569.Khal...,2007.0,/genres/fiction|/genres/historical-fiction|/ge...,dir01/128029.A_Thousand_Splendid_Suns.html,563920,A Thousand Splendid Suns


In [10]:
# Query dataset based on year and show all columns (using bracket conditional)
df[df.year > 2000]

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.40,136455,0439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008.0,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,0439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003.0,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,0316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005.0,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
11,3.72,34959,0307277674,good_reads:book,https://www.goodreads.com/author/show/630.Dan_...,2003.0,/genres/mystery|/genres/thriller|/genres/suspe...,dir01/968.The_Da_Vinci_Code.html,1220657,"The Da Vinci Code (Robert Langdon, #2)"
12,4.36,69524,0375831002,good_reads:book,https://www.goodreads.com/author/show/11466.Ma...,2005.0,/genres/historical-fiction|/genres/young-adult...,dir01/19063.The_Book_Thief.html,675431,The Book Thief
16,3.92,38061,,good_reads:book,https://www.goodreads.com/author/show/498072.A...,2003.0,/genres/fiction|/genres/romance|/genres/fantas...,dir01/18619684-the-time-traveler-s-wife.html,927254,The Time Traveler's Wife
26,4.43,112279,0525478817,good_reads:book,https://www.goodreads.com/author/show/1406384....,2012.0,/genres/young-adult|/genres/book-club|/genres/...,dir01/11870085-the-fault-in-our-stars.html,1150626,The Fault in Our Stars
32,4.44,70247,0399155341,good_reads:book,https://www.goodreads.com/author/show/1943477....,2009.0,/genres/book-club|/genres/historical-fiction|/...,dir01/4667024-the-help.html,1091909,The Help
37,3.72,31131,0316166685,good_reads:book,https://www.goodreads.com/author/show/316.Alic...,2002.0,/genres/fiction|/genres/mystery|/genres/young-...,dir01/12232938-the-lovely-bones.html,1190963,The Lovely Bones
38,4.30,33912,1594489505,good_reads:book,https://www.goodreads.com/author/show/569.Khal...,2007.0,/genres/fiction|/genres/historical-fiction|/ge...,dir01/128029.A_Thousand_Splendid_Suns.html,563920,A Thousand Splendid Suns


In [11]:
# Aggregate function based off query
np.sum(df.year > 2000)

3177

In [12]:
# Calculate mean based off conditional
np.mean(df.year > 1950)

0.8676666666666667

In [13]:
# Multiple condiationals
(df.year > 1960) & (df.rating > 2)

0        True
1        True
2        True
3       False
4       False
5       False
6       False
7        True
8       False
9        True
10       True
11       True
12       True
13      False
14      False
15      False
16       True
17       True
18      False
19       True
20      False
21      False
22      False
23      False
24       True
25      False
26       True
27      False
28       True
29      False
        ...  
5970     True
5971     True
5972     True
5973     True
5974     True
5975     True
5976     True
5977     True
5978     True
5979     True
5980     True
5981     True
5982     True
5983     True
5984     True
5985     True
5986     True
5987    False
5988     True
5989     True
5990     True
5991     True
5992     True
5993     True
5994     True
5995     True
5996     True
5997     True
5998     True
5999     True
Length: 6000, dtype: bool

In [14]:
# Multiple conditionals
df[(df.year > 1960) & (df.rating > 2)]

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.40,136455,0439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008.0,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,0439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003.0,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,0316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005.0,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
7,4.38,10902,0060256656,good_reads:book,https://www.goodreads.com/author/show/435477.S...,1964.0,/genres/childrens|/genres/young-adult|/genres/...,dir01/370493.The_Giving_Tree.html,502891,The Giving Tree
9,4.18,12302,0345391802,good_reads:book,https://www.goodreads.com/author/show/4.Dougla...,1979.0,/genres/science-fiction|/genres/humor|/genres/...,dir01/11.The_Hitchhiker_s_Guide_to_the_Galaxy....,724713,The Hitchhiker's Guide to the Galaxy (Hitchhik...
10,4.03,20937,0739326228,good_reads:book,https://www.goodreads.com/author/show/614.Arth...,1997.0,/genres/fiction|/genres/historical-fiction|/ge...,dir01/930.Memoirs_of_a_Geisha.html,1042679,Memoirs of a Geisha
11,3.72,34959,0307277674,good_reads:book,https://www.goodreads.com/author/show/630.Dan_...,2003.0,/genres/mystery|/genres/thriller|/genres/suspe...,dir01/968.The_Da_Vinci_Code.html,1220657,"The Da Vinci Code (Robert Langdon, #2)"
12,4.36,69524,0375831002,good_reads:book,https://www.goodreads.com/author/show/11466.Ma...,2005.0,/genres/historical-fiction|/genres/young-adult...,dir01/19063.The_Book_Thief.html,675431,The Book Thief
16,3.92,38061,,good_reads:book,https://www.goodreads.com/author/show/498072.A...,2003.0,/genres/fiction|/genres/romance|/genres/fantas...,dir01/18619684-the-time-traveler-s-wife.html,927254,The Time Traveler's Wife
17,4.58,1314,0345538374,good_reads:book,https://www.goodreads.com/author/show/656983.J...,1973.0,/genres/fantasy|/genres/classics|/genres/scien...,dir01/30.J_R_R_Tolkien_4_Book_Boxed_Set.html,68495,J.R.R. Tolkien 4-Book Boxed Set


In [15]:
# Statistical methods based off data result
# Multiple conditionals
df[(df.year > 1960) & (df.rating > 2)].mean()

rating             4.049416
review_count            inf
year            2000.315590
rating_count            inf
dtype: float64

In [16]:
df[(df.year > 1960) & (df.rating > 2)].std()

rating     0.262485
year      12.777265
dtype: float64

## Clean datatypes

In [17]:
df.dtypes

rating          float64
review_count     object
isbn             object
booktype         object
author_url       object
year            float64
genre_urls       object
dir              object
rating_count     object
name             object
dtype: object

In [18]:
# Strip out null data first based on conversion error message when casting data types
df = df[(df.year.notnull()) & (df.rating.notnull()) & (df.review_count.notnull())]

df["rating_count"] = df.rating_count.astype(int)
df["review_count"] = df.review_count.astype(int)
df["year"] = df.year.astype(int)

df.dtypes

rating          float64
review_count      int64
isbn             object
booktype         object
author_url       object
year              int64
genre_urls       object
dir              object
rating_count      int64
name             object
dtype: object

In [19]:
# Determine which columns have Null or bad data
df[df.rating.isnull()]

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name


In [20]:
df[df.year.notnull()]

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.40,136455,0439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,0439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,0316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
3,4.23,47906,0061120081,good_reads:book,https://www.goodreads.com/author/show/1825.Har...,1960,/genres/classics|/genres/fiction|/genres/histo...,dir01/2657.To_Kill_a_Mockingbird.html,2078123,To Kill a Mockingbird
4,4.23,34772,0679783261,good_reads:book,https://www.goodreads.com/author/show/1265.Jan...,1813,/genres/classics|/genres/fiction|/genres/roman...,dir01/1885.Pride_and_Prejudice.html,1388992,Pride and Prejudice
5,4.25,12363,0446675539,good_reads:book,https://www.goodreads.com/author/show/11081.Ma...,1936,/genres/classics|/genres/historical-fiction|/g...,dir01/18405.Gone_with_the_Wind.html,645470,Gone with the Wind
6,4.22,7205,0066238501,good_reads:book,https://www.goodreads.com/author/show/1069006....,1949,/genres/classics|/genres/young-adult|/genres/c...,dir01/11127.The_Chronicles_of_Narnia.html,286677,The Chronicles of Narnia (Chronicles of Narnia...
7,4.38,10902,0060256656,good_reads:book,https://www.goodreads.com/author/show/435477.S...,1964,/genres/childrens|/genres/young-adult|/genres/...,dir01/370493.The_Giving_Tree.html,502891,The Giving Tree
8,3.79,20670,0452284244,good_reads:book,https://www.goodreads.com/author/show/3706.Geo...,1945,/genres/classics|/genres/fiction|/genres/scien...,dir01/7613.Animal_Farm.html,1364879,Animal Farm
9,4.18,12302,0345391802,good_reads:book,https://www.goodreads.com/author/show/4.Dougla...,1979,/genres/science-fiction|/genres/humor|/genres/...,dir01/11.The_Hitchhiker_s_Guide_to_the_Galaxy....,724713,The Hitchhiker's Guide to the Galaxy (Hitchhik...


## Cleaned Dataframe

In [21]:
df

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.40,136455,0439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,0439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,0316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
3,4.23,47906,0061120081,good_reads:book,https://www.goodreads.com/author/show/1825.Har...,1960,/genres/classics|/genres/fiction|/genres/histo...,dir01/2657.To_Kill_a_Mockingbird.html,2078123,To Kill a Mockingbird
4,4.23,34772,0679783261,good_reads:book,https://www.goodreads.com/author/show/1265.Jan...,1813,/genres/classics|/genres/fiction|/genres/roman...,dir01/1885.Pride_and_Prejudice.html,1388992,Pride and Prejudice
5,4.25,12363,0446675539,good_reads:book,https://www.goodreads.com/author/show/11081.Ma...,1936,/genres/classics|/genres/historical-fiction|/g...,dir01/18405.Gone_with_the_Wind.html,645470,Gone with the Wind
6,4.22,7205,0066238501,good_reads:book,https://www.goodreads.com/author/show/1069006....,1949,/genres/classics|/genres/young-adult|/genres/c...,dir01/11127.The_Chronicles_of_Narnia.html,286677,The Chronicles of Narnia (Chronicles of Narnia...
7,4.38,10902,0060256656,good_reads:book,https://www.goodreads.com/author/show/435477.S...,1964,/genres/childrens|/genres/young-adult|/genres/...,dir01/370493.The_Giving_Tree.html,502891,The Giving Tree
8,3.79,20670,0452284244,good_reads:book,https://www.goodreads.com/author/show/3706.Geo...,1945,/genres/classics|/genres/fiction|/genres/scien...,dir01/7613.Animal_Farm.html,1364879,Animal Farm
9,4.18,12302,0345391802,good_reads:book,https://www.goodreads.com/author/show/4.Dougla...,1979,/genres/science-fiction|/genres/humor|/genres/...,dir01/11.The_Hitchhiker_s_Guide_to_the_Galaxy....,724713,The Hitchhiker's Guide to the Galaxy (Hitchhik...


In [22]:
df.year.unique()

array([ 2008,  2003,  2005,  1960,  1813,  1936,  1949,  1964,  1945,
        1979,  1997,  1865,  1597,  1862,  1973,  1954,  1985,  1890,
        1866,  1952,  1908,  1988,  1953,  2012,  1937,  1962,  1897,
        1932,  2009,  1967,  1847,  1963,  1884,  2002,  2007,  1911,
        2006,  2011,  1859,  2001,  -800,  1877,  1955,  1969,  1961,
        1989,  1951,  1972,  1925,  1978,  2004,  1818,  1861,  1943,
        1892,  1996,  1998,  1938,  1982,  1880,  1940,  1957,  1991,
        1999,  1922,  1605,  1851,  1868,  1976,  1993,  1995,  1975,
        1926,  1876,  1981,  1994,  1850,  1942,  1977,  1980,  1965,
        1959,  1915,  1986,  1939,  1321,  1900,  1928,  2000,  1902,
        1931,  1971,  1895,  1891,  1983,  1923,  1843,  1595,  1899,
        1970,  1390,  1920,  1856,  1990,  1929,  1903,  1839,  2010,
        1883,  1992,  1950,  1987,  1905,  1946,  1958,  1910,  1934,
        1603,  1844,  1855,  1667,  1852,  1759,  1854,  1869,  1623,
        2013,  1916,