### Goal: use the Goodreads API to gather data about books and authors on the _New York Times_ Fiction Best Sellers list for a range of years between 1942-2016

* General Search
* Get data by author id
* Get data by book id

In [1]:
import os
import pickle
import re
import urllib
import xml.dom.minidom
import xml.etree.ElementTree as ET
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree

In [2]:
# Goodreads API key. Read it from the GOODREADS_API_KEY environment variable
# so the credential can be supplied without editing the notebook.
# NOTE(review): the literal fallback is a real key committed to source; rotate
# it and drop the fallback once every environment sets the variable.
key = os.environ.get('GOODREADS_API_KEY', '2Kp7IGlO7GKjmAASOT2pEw')

In [3]:
# Load the books dict (NYT title -> author, as iterated by the search-URL cell).
# pickle can run arbitrary code while loading; only open files you trust.
with open('../data/interim/books_dict.pkl', 'rb') as fh:
    books = pickle.load(fh)

In [4]:
#books['A Tree Grows in Brooklyn']

In [5]:
def try_url(url, timeout=30):
    '''
    Attempt to access a Goodreads page and parse the response as XML.

    url: full request URL, including any query string.
    timeout: seconds to wait on the server before requests raises a
        Timeout; without one a stalled connection blocks the scrape forever.

    Returns a BeautifulSoup object built with the 'xml' parser when the
    response status is 200. Otherwise prints the status code and returns
    None, so callers must check for None before using the result.
    '''
    response = requests.get(url, timeout=timeout)
    status = response.status_code
    if status != 200:
        # Single-argument call form prints the same on Python 2 and 3.
        print(status)
        return None
    return BeautifulSoup(response.text, 'xml')

### General Search
url = https://www.goodreads.com/search.xml?key=YOUR_KEY&q=Ender%27s+Game

Requires
* Title of book

Returns 
* first item in results>work
    * books_count: number of versions
    * id: Goodreads ID
    * original_publication_day: publication day
    * original_publication_month: publication month
    * original_publication_year: publication year
    * average_rating: average rating
    * ratings_count: number of ratings
    * text_reviews_count: number of text reviews
* first item in results>work>best_book>author
    * id: author ID

In [6]:
# Build one search-API URL per best seller, querying on "<title> <author>".
# search_urls maps each URL to [author, nyt_title] so the response can be
# matched back to the book it was requested for.
search_base_url = 'https://www.goodreads.com/search.xml?'
search_criteria = {'key': key, 'q': None}

search_urls = {}
for nyt_title in books:
    author = books[nyt_title]
    search_criteria['q'] = ' '.join((nyt_title, author))
    query_string = urllib.urlencode(search_criteria)
    url = search_base_url + query_string
    search_urls[url] = [author, nyt_title]

In [7]:
def get_search_data(works, author, nyt_title, df):
    '''
    Retrieve data from works xml tree from search results page.

    works: iterable of <work> elements from the search response.
    author: author name compared for exact equality with
        best_book/author/name.
    nyt_title: title as listed by the NYT, stored next to the Goodreads title.
    df: DataFrame of rows collected so far; it is not modified.

    Returns a new DataFrame with one row added for the first work whose
    author name equals `author`, or `df` itself when no work matches.
    Row values are positional (columns 0-11): b_id, gr_title, nyt_title,
    b_count, pub_day, pub_mon, pub_yr, b_avg_rating, b_ratings_count,
    b_txt_rev_count, a_id, author. A missing or empty element gives None
    for that field instead of raising.
    '''
    def text_at(node, path):
        # Element.find returns None for an absent element; keep None as the
        # value for that field rather than failing on `.text`.
        found = node.find(path)
        return None if found is None else found.text

    for work in works:
        name = text_at(work, 'best_book/author/name')
        if name is None or name != author:
            continue
        row = (
            text_at(work, 'best_book/id'),
            text_at(work, 'best_book/title'),
            nyt_title,
            text_at(work, 'books_count'),
            text_at(work, 'original_publication_day'),
            text_at(work, 'original_publication_month'),
            text_at(work, 'original_publication_year'),
            text_at(work, 'average_rating'),
            text_at(work, 'ratings_count'),
            text_at(work, 'text_reviews_count'),
            text_at(work, 'best_book/author/id'),
            author,
        )
        # DataFrame.append was removed in pandas 2.0; concat of a one-row
        # frame gives the same positional columns. Only the first match is kept.
        return pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    return df

In [8]:
#get data by title, author from search API
# Rows accumulate in books_df, one per title whose author matched a result.
books_df = pd.DataFrame(data=None)
count = 0

for url, (raw_author, nyt_title) in search_urls.items():
    soup = try_url(url)
    # try_url returns None on a non-200 response, and a response without a
    # <results> element cannot be parsed. Skip that title instead of aborting
    # the whole scrape; it will appear in the later missing-titles check.
    results_tag = soup.find('results') if soup is not None else None
    if results_tag is None:
        print('%s skipped (no search results returned)' % nyt_title)
    else:
        results = ET.fromstring(str(results_tag))
        works = results.findall('work')
        # Decode the stored author so it can be compared with the name
        # parsed from the XML.
        author = raw_author.decode('utf-8')
        books_df = get_search_data(works, author, nyt_title, books_df)
        count += 1
        print('%s %s' % (nyt_title, count))
    # Pause between requests.
    sleep(1.5)

Night Broken 1
Bay of Sighs 2
The Prodigal Daughter 3
Something of Value 4
Private L.A. 5
The Spy Who Came in from the Cold 6
Double Cross 7
Just Take My Heart 8
Scruples 9
3rd Degree 10
Tick Tock 11
Black Notice 12
Ten Big Ones 13
The 6th Target 14
The Icarus Agenda 15
The Wise Man's Fear 16
Changes 17
The Gift 18
Once Is Not Enough 19
Arch of Triumph 20
Smokin' Seventeen 21
The Chosen 22
Dreamcatcher 23
The Shadow of Your Smile 24
Threat Vector 25
Honeymoon 26
How Stella Got Her Groove Back 27
The Source 28
Bleachers 29
The Nanny Diaries 30
Promises in Death 31
Under the Dome 32
Just One Evil Act 33
Devices and Desires 34
Kingsblood Royal 35
By Love Possessed 36
Private Games 37
Point of Origin 38
The Talisman 39
Sincerely, Willis Wayde 40
Gerald's Game 41
Cross 42
Lean Mean Thirteen 43
Bloodline 44
Second Honeymoon 45
Fine Things 46
Politically Correct Bedtime Stories 47
A Feast for Crows 48
A Wanted Man 49
The Secret Pilgrim 50
Finger Lickin' Fifteen 51
Water for Elephants 52
Fearl

In [15]:
# Number of books for which the search returned a matching row.
books_df.shape[0]

742

In [16]:
# Name the positional columns; the order matches the row built in
# get_search_data.
columns = dict(enumerate([
    'b_id',
    'gr_title',
    'nyt_title',
    'b_count',
    'pub_day',
    'pub_mon',
    'pub_yr',
    'b_avg_rating',
    'b_ratings_count',
    'b_txt_rev_count',
    'a_id',
    'author',
]))
books_df.rename(columns=columns, inplace=True)

Check which books did not return results from Goodreads.

In [17]:
# nyt_titles: every title in the input dict.
# gr_titles: the NYT titles that produced a row in books_df.
nyt_titles = set(books)
gr_titles = set(books_df['nyt_title'].tolist())
missing_titles = nyt_titles - gr_titles

In [18]:
# Show each unmatched title followed by its author from the books dict.
for title in missing_titles:
    print('%s %s' % (title, books[title]))

Primary Colors Anonymous
Later revealed to be Joe Klein
Isle of Dog Patricia Cornwell
Gentlemen's Agreement Laura Z. Hobson
From Here to Eternity James Jones
Just One Evil Act Elizabeth George


In [19]:
# Number of titles from the books dict that have no row in books_df.
len(missing_titles)

5

In [22]:
# Save the search results so the later sections can reload them.
with open('../data/raw/books_scraped.pkl', 'wb') as fh:
    pickle.dump(books_df, fh)

### Get info about an author by id

url = https://www.goodreads.com/author/show/18541?format=xml&key=YOUR_KEY

Requires
* author ID

Returns
* in author
    * fans_count: number of fans on goodreads
    * works_count: number of works
    * gender: m/f
    * hometown: hometown
    * born_at: date of birth
    * died_at: date of death
 
_Also collected, from the first listed book's first author entry_:
* in author>books>book[1]>authors>author[1]
    * average_rating: average rating for author
    * ratings_count: total ratings for author
    * text_reviews_count: total number of reviews for author

In [None]:
# Reload the saved search results (lets this section run without repeating
# the search). pickle can run arbitrary code while loading; only open files
# you trust.
with open('../data/raw/books_scraped.pkl', 'rb') as fh:
    books_df = pickle.load(fh)

In [23]:
# Distinct author ids; one author-page URL is built per id.
a_ids = books_df.loc[:, 'a_id'].drop_duplicates()
a_ids.size

241

In [24]:
def get_author_data(author, a_id, df):
    '''
    Retrieve data from author xml tree on author page.

    author: <author> element parsed from the author page response.
    a_id: author id, stored as the first value of the row.
    df: DataFrame of rows collected so far; it is not modified.

    Returns a new DataFrame with one row added. Row values are positional
    (columns 0-9): a_id, a_fans_count, a_works_count, gender, hometown,
    birth_date, death_date, a_avg_rating, a_ratings_count,
    a_text_reviews_count. A missing or empty element gives None for that
    field instead of raising.
    '''
    def text_of(node, tag):
        # Tolerate an absent parent or child element.
        child = node.find(tag) if node is not None else None
        return None if child is None else child.text

    # The rating totals are read from the first author entry nested under
    # the first listed book, not from the top-level author element.
    first_book_author = author.find('books/book/authors/author')
    row = (
        a_id,
        text_of(author, 'fans_count'),
        text_of(author, 'works_count'),
        text_of(author, 'gender'),
        text_of(author, 'hometown'),
        text_of(author, 'born_at'),
        text_of(author, 'died_at'),
        text_of(first_book_author, 'average_rating'),
        text_of(first_book_author, 'ratings_count'),
        text_of(first_book_author, 'text_reviews_count'),
    )
    # DataFrame.append was removed in pandas 2.0; concat of a one-row frame
    # gives the same positional columns.
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

In [25]:
# Build one author-page URL per author id; author_urls maps URL -> a_id.
author_base_url = 'https://www.goodreads.com/author/show/'
search_criteria = {'format': 'xml', 'key': key}

# The query string is the same for every author, so encode it once.
query_string = urllib.urlencode(search_criteria)

author_urls = {}
for a_id in a_ids:
    url = ''.join([author_base_url, a_id, '?', query_string])
    author_urls[url] = a_id

In [26]:
#get data by author id from author API
# Rows accumulate in authors_df, one per author page that could be parsed.
authors_df = pd.DataFrame(data=None)
count = 0

for url, a_id in author_urls.items():
    soup = try_url(url)
    # try_url returns None on a non-200 response, and a response without an
    # <author> element cannot be parsed. Skip that id instead of aborting
    # the whole scrape.
    author_tag = soup.find('author') if soup is not None else None
    if author_tag is None:
        print('%s skipped (no author data returned)' % a_id)
    else:
        author = ET.fromstring(str(author_tag))
        authors_df = get_author_data(author, a_id, authors_df)
        count += 1
        print('%s %s' % (a_id, count))
    # Pause between requests.
    sleep(1.5)

29326 1
150377 2
4725841 3
3849415 4
24978 5
20248 6
1455 7
6098 8
9222 9
20704 10
1943477 11
13370 12
10746 13
819789 14
3670 15
858 16
7577 17
61105 18
3299 19
4610 20
12833 21
107767 22
19824 23
388840 24
1730 25
18344 26
33472 27
45314 28
1415338 29
59605 30
16893 31
1360780 32
33384 33
1063732 34
23356 35
12455 36
19823 37
32279 38
674062 39
12605 40
383606 41
2327917 42
3075 43
9291 44
8719 45
6417 46
3936 47
93127 48
14610 49
9629 50
569 51
75516 52
2578 53
861 54
4882127 55
21004 56
43626 57
6942 58
7385 59
1825 60
5091 61
10039 62
16904 63
7128 64
807271 65
2031 66
32155 67
7995 68
120490 69
3083854 70
86175 71
300708 72
2001717 73
656983 74
3513 75
5246 76
3505 77
4464118 78
92960 79
704 80
46097 81
32202 82
7464 83
1238 84
339820 85
5353 86
3617 87
104728 88
9355 89
3620 90
7565 91
40552 92
3780 93
27991 94
49699 95
2014 96
463 97
2384 98
2749 99
5088 100
3167 101
630 102
128499 103
3500 104
57432 105
3504 106
8258 107
40563 108
15516 109
7927 110
12479 111
99044 112
4711 11

In [28]:
# Name the positional columns; the order matches the row built in
# get_author_data.
columns = dict(enumerate([
    'a_id',
    'a_fans_count',
    'a_works_count',
    'gender',
    'hometown',
    'birth_date',
    'death_date',
    'a_avg_rating',
    'a_ratings_count',
    'a_text_reviews_count',
]))
authors_df.rename(columns=columns, inplace=True)

In [29]:
# Number of author rows collected.
authors_df.shape[0]

241

In [30]:
# Save the author data to the raw data folder.
with open('../data/raw/authors_scraped.pkl', 'wb') as fh:
    pickle.dump(authors_df, fh)

### Get info about book by book id

url = https://www.goodreads.com/book/show/1017661?key=YOUR_KEY

Requires
* book ID

Returns
* in book
    * publisher: Publisher
    * rating_dist: distribution of ratings

In [None]:
# Reload the saved search results (lets this section run without repeating
# the search). pickle can run arbitrary code while loading; only open files
# you trust.
with open('../data/raw/books_scraped.pkl', 'rb') as fh:
    books_df = pickle.load(fh)

In [31]:
# Goodreads book ids from the search results; one book-page URL is built per id.
b_ids = books_df['b_id']

In [32]:
# Build one book-page URL per book id; books_urls maps URL -> b_id.
books_base_url = 'https://www.goodreads.com/book/show/'
search_criteria = {'key': key}

# The query string is the same for every book, so encode it once.
query_string = urllib.urlencode(search_criteria)

books_urls = {}
for b_id in b_ids:
    url = ''.join([books_base_url, b_id, '?', query_string])
    books_urls[url] = b_id

In [33]:
def get_ratings_dist(rating_dist):
    '''
    Lazily produce the value part of the first five entries of rating_dist.

    rating_dist: indexable sequence whose first five items are strings
        containing ':'.

    Each produced item is the text between the first and second ':' of the
    entry (everything after the ':' when there is only one). Entries past
    the fifth are ignored.
    '''
    return (rating_dist[position].split(':')[1] for position in range(5))

In [34]:
def get_book_data(book, b_id, df):
    '''
    Retrieve publisher and rating distribution from book xml tree on book page.

    book: <book> element parsed from the book page response.
    b_id: book id, stored as the first value of the row.
    df: DataFrame of rows collected so far; it is not modified.

    Returns a new DataFrame with one row added. Row values are positional
    (columns 0-6): b_id, publisher, then the five rating counts.
    '''
    publisher = book.find('publisher').text
    # work/rating_dist is a single string of '|'-separated entries.
    rating_dist = book.find('work').find('rating_dist').text.split('|')
    # NOTE(review): assumes the entries run from 5-star down to 1-star, as
    # the variable names imply -- confirm against the API response.
    fives, fours, threes, twos, ones = get_ratings_dist(rating_dist)
    row = (b_id, publisher, fives, fours, threes, twos, ones)
    # DataFrame.append was removed in pandas 2.0; concat of a one-row frame
    # gives the same positional columns.
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

In [35]:
#get data by book id from book info API
# Rows accumulate in pubs_df, one per book page that could be parsed.
pubs_df = pd.DataFrame(data=None)
count = 0

for url, b_id in books_urls.items():
    soup = try_url(url)
    # try_url returns None on a non-200 response, and a response without a
    # <book> element cannot be parsed. Skip that id instead of aborting the
    # whole scrape.
    book_tag = soup.find('book') if soup is not None else None
    if book_tag is None:
        print('%s skipped (no book data returned)' % b_id)
    else:
        book = ET.fromstring(str(book_tag))
        pubs_df = get_book_data(book, b_id, pubs_df)
        count += 1
        print('%s %s' % (b_id, count))
    # Pause between requests.
    sleep(1.5)

7675 1
65948 2
114132 3
2211221 4
968 5
5551522 6
15791154 7
2336803 8
419346 9
368819 10
20448515 11
2029927 12
778285 13
6892870 14
13888 15
59851 16
15806231 17
7856305 18
864777 19
7905092 20
17707605 21
1629007 22
4527502 23
152402 24
5064 25
321552 26
18527947 27
140060 28
49501 29
46020 30
5509321 31
7936809 32
2306910 33
26216087 34
457457 35
85733 36
2003767 37
26236956 38
201972 39
2429135 40
7926242 41
479415 42
7156505 43
32692 44
5739373 45
21484 46
19288043 47
815721 48
6526 49
7277715 50
30347 51
18775247 52
552887 53
82807 54
9420 55
85958 56
124931 57
3268926 58
198331 59
1528385 60
84573 61
319136 62
7869 63
6425 64
350065 65
85322 66
43884 67
8437913 68
1215032 69
7096123 70
419542 71
278102 72
401942 73
28815364 74
282659 75
994663 76
3431 77
2029176 78
27525 79
13007638 80
5350 81
59897 82
5093 83
315511 84
179780 85
86424 86
40317 87
25324116 88
170641 89
32682 90
10920 91
279416 92
31623 93
79852 94
110694 95
7604 96
13600318 97
23346918 98
270521 99
722367 100
1

In [36]:
# Name the positional columns; the order matches the row built in
# get_book_data.
columns = dict(enumerate([
    'b_id',
    'publisher',
    'b_rating_5_count',
    'b_rating_4_count',
    'b_rating_3_count',
    'b_rating_2_count',
    'b_rating_1_count',
]))

pubs_df.rename(columns=columns, inplace=True)

In [37]:
# Attach publisher and rating counts to each book by b_id (default inner
# join, so books without a pubs_df row are dropped). This rebinds books_df,
# so run the cell only once per load.
books_df = pd.merge(books_df, pubs_df, on='b_id')

In [42]:
# Save the merged frame. This is the same path the search results were
# written to earlier in the notebook, so that file is replaced.
with open('../data/raw/books_scraped.pkl', 'wb') as fh:
    pickle.dump(books_df, fh)