In [1]:
import pandas as pd
import numpy as np
import pickle

%matplotlib inline

In [2]:
#load a pickled object (here, a saved dataframe) from disk
def open_pickle(pkl_file):
    """Read the pickle stored at `pkl_file` and return the unpickled object.

    pkl_file: path of the pickle file to open (read in binary mode).
    returns: whatever object was pickled into the file.

    NOTE: pickle.load can execute arbitrary code while loading, so this
    should only be pointed at trusted files.
    """
    picklefile = open(pkl_file, 'rb')
    try:
        contents = pickle.load(picklefile)
    finally:
        # always release the file handle, even if unpickling fails
        picklefile.close()
    return contents

In [3]:
#load data collected from wikipedia and goodreads
#(the nyt frame comes from the interim folder, books/authors from the raw folder)
nyt_path = '../data/interim/nyt_cleaned.pkl'
books_path = '../data/raw/books_scraped.pkl'
authors_path = '../data/raw/authors_scraped.pkl'

nyt = open_pickle(nyt_path)
books = open_pickle(books_path)
authors = open_pickle(authors_path)

In [4]:
#convert number strings to numeric values
def to_num(df):
    """Return a new dataframe with every fully-numeric column converted to numbers.

    Each column is passed to pd.to_numeric. A column holding any value that
    cannot be parsed as a number is returned unchanged.

    df: pandas DataFrame to convert.
    returns: a new DataFrame; `df` itself is not modified.
    """
    def _to_numeric_or_same(col):
        # explicit replacement for the deprecated errors='ignore' option:
        # keep the original column when the conversion fails
        try:
            return pd.to_numeric(col)
        except (ValueError, TypeError):
            return col

    return df.apply(_to_numeric_or_same)

#apply the numeric conversion to each of the three loaded dataframes
nyt, books, authors = to_num(nyt), to_num(books), to_num(authors)

### Clean author data

In [6]:
#add author names to authors df, using the unique (a_id, author) pairs found in books
a_ids = books.loc[:, ['a_id', 'author']].drop_duplicates()
authors = pd.merge(authors, a_ids, on='a_id')

In [8]:
#drop birth_date, death_date because of number of NANs
#first show the fraction of missing values in each column
#(print is called as a function so the cell runs on Python 2 and Python 3)
print(authors.isnull().sum()/len(authors))
labels = ['birth_date', 'death_date']
authors = authors.drop(labels=labels, axis=1)

a_id                    0.000000
a_fans_count            0.000000
a_works_count           0.000000
gender                  0.141079
hometown                0.136929
birth_date              0.319502
death_date              0.543568
a_avg_rating            0.000000
a_ratings_count         0.000000
a_text_reviews_count    0.000000
author                  0.000000
dtype: float64


In [10]:
#fill gender and hometown NANs with 'not reported'
#the filled column is assigned back: fillna(inplace=True) on a column selected
#from the frame is not guaranteed to change `authors` itself
authors['gender'] = authors['gender'].fillna('not reported')
authors['hometown'] = authors['hometown'].fillna('not reported')

In [13]:
#calculate number of weeks on the NYT lists, per book and per author
def _weeks_on_list(keys):
    """Count the nyt rows with a non-null date for each combination of `keys`.

    keys: list of nyt column names to group by.
    returns: dataframe holding the key columns plus the count in 'date'.
    """
    key_cols = list(keys)
    per_group = nyt.groupby(by=key_cols, as_index=False).count()
    return per_group[key_cols + ['date']]

nyt_by_book = _weeks_on_list(['title', 'author'])
nyt_by_author = _weeks_on_list(['author'])

In [16]:
#rename columns so the nyt aggregates can be joined with the goodreads dfs
columns = dict(title='nyt_title', date='b_wks_on_list')

nyt_by_book = nyt_by_book.rename(columns=columns)
nyt_by_author = nyt_by_author.rename(columns=dict(date='a_wks_on_list'))

In [21]:
#calculate number of books on nyt for each author
#step 1: reduce nyt to one row per (author, title) pair
#step 2: count the titles per author and name that count a_books_on_list
nyt_books_by_author = (
    nyt.groupby(['author','title'], as_index=False).count()[['author','title']]
       .groupby('author', as_index=False)
       .count()
       .rename(columns={'title': 'a_books_on_list'})
)

In [22]:
#add number of weeks each author has been on NYT list
#decode only byte strings: values that are already text are left as they are
#(text has no .decode on Python 3, and this keeps the cell safe to re-run)
nyt_by_author['author'] = nyt_by_author['author'].apply(
    lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
#left join keeps every author; authors with no nyt match get NaN
authors = authors.merge(nyt_by_author, on='author', how='left')

In [23]:
#add number of books each author has had on the NYT list
#decode only byte strings: values that are already text are left as they are
#(text has no .decode on Python 3, and this keeps the cell safe to re-run)
nyt_books_by_author['author'] = nyt_books_by_author['author'].apply(
    lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
#left join keeps every author; authors with no nyt match get NaN
authors = authors.merge(nyt_books_by_author, on='author', how='left')

In [26]:
#manually set the NYT totals for every author row whose name contains 'Betty'
#NOTE(review): the values 22 (weeks) and 1 (book) are hard-coded; presumably this
#author was not matched by the name-based merges -- confirm where they come from
betty_rows = authors['author'].str.contains('Betty')
authors.loc[betty_rows, 'a_wks_on_list'] = 22
authors.loc[betty_rows, 'a_books_on_list'] = 1

### Clean books data

In [27]:
#determine which decade each book was on the NYT list
#groupby sorts by title and then decade, so keeping the first row per title keeps
#the earliest decade for books that appear in more than one (about 6 books)
books_by_decade = (
    nyt.groupby(['title', 'decade'], as_index=False)
       .count()[['title', 'decade']]
       .drop_duplicates(subset='title')
)

In [28]:
#rename the title column so books_by_decade can be joined on nyt_title
books_by_decade = books_by_decade.rename(columns={'title': 'nyt_title'})

#build the combined books df in three joins:
# 1. weeks each book was on the NYT list (left join on nyt_title)
# 2. author data for each book (inner join on a_id)
# 3. decade in which each book was on the list (inner join on nyt_title)
books = (books
         .merge(nyt_by_book, on='nyt_title', how='left')
         .merge(authors, on='a_id')
         .merge(books_by_decade, on='nyt_title'))

In [29]:
#drop the suffixed author name columns (author_x, author_y) from books df
duplicate_name_cols = ['author_x', 'author_y']
books = books.drop(labels=duplicate_name_cols, axis='columns')

In [30]:
#fix typo in pub_yr: rows recorded as year 214 are set to 2014
typo_rows = books['pub_yr'] == 214
books.loc[typo_rows, 'pub_yr'] = 2014

In [31]:
#drop pub_day, pub_mon because of number of NANs
#first show the fraction of missing values in each column
#(print is called as a function so the cell runs on Python 2 and Python 3)
print(books.isnull().sum()/len(books))
labels = ['pub_day', 'pub_mon']
books = books.drop(labels=labels, axis=1)

b_id                    0.000000
gr_title                0.000000
nyt_title               0.000000
b_count                 0.000000
pub_day                 0.407008
pub_mon                 0.369272
pub_yr                  0.000000
b_avg_rating            0.000000
b_ratings_count         0.000000
b_txt_rev_count         0.000000
a_id                    0.000000
publisher               0.198113
b_rating_5_count        0.000000
b_rating_4_count        0.000000
b_rating_3_count        0.000000
b_rating_2_count        0.000000
b_rating_1_count        0.000000
b_wks_on_list           0.000000
a_fans_count            0.000000
a_works_count           0.000000
gender                  0.000000
hometown                0.000000
a_avg_rating            0.000000
a_ratings_count         0.000000
a_text_reviews_count    0.000000
author                  0.000000
a_wks_on_list           0.000000
a_books_on_list         0.000000
decade                  0.000000
dtype: float64


In [32]:
#fill publisher NANs with 'Unknown'
#the filled column is assigned back: fillna(inplace=True) on a column selected
#from the frame is not guaranteed to change `books` itself
books['publisher'] = books['publisher'].fillna('Unknown')

In [33]:
#standardize publisher names
#maps a substring to look for in the publisher name -> the standard name to use
publishers = {'Berkley': 'Berkley',
              'Brown' : 'Little Brown & Co',
              'Bantam': 'Bantam',
              'Putnam': 'G.P. Putnam\'s',
              'Dell': 'Dell',
              'Pocket': 'Pocket',
              'Scribner': 'Scribner',
              'Doubleday': 'Doubleday',
              'Arrow': 'Arrow',
              'Atria': 'Atria',
              'Vintage': 'Vintage',
              'Random': 'Random House',
              'Simon': 'Simon & Schuster',
              'Signet': 'Signet',
              'Martin': 'St. Martin\'s Press',
              'Penguin': 'Penguin',
              'Warner': 'Warner',
              'NAL': 'NAL',
              'Orion': 'Orion',
              'Dial': 'Dial',
              'Corgi': 'Corgi',
              'Harper': 'Harper Collins',
              'Collins': 'Harper Collins',
              'Ace': 'Ace',
              'Knopf': 'Knopf',
              'Fawcett': 'Fawcett',
              'Tor': 'Tor',
              'New English': 'New English',
              'Headline': 'Headline',
              'Avon': 'Avon',
              'Hachette': 'Hachette',
              'Viking': 'Viking',
              'Morrow': 'Morrow',
              'Hodder': 'Hodder & Stoughton',
              'Picador': 'Picador',
              'Piatkus': 'Piatkus',
              'MacMillan': 'MacMillan',
              'Houghton': 'Houghton Mifflin',
              'B E': 'B E Trice',
              'Dodd': 'Dodd Mead'
             }

#NOTE(review): replacements are applied one key at a time on the already-updated
#column, so a name containing several keys (e.g. both 'Bantam' and 'Dell') is set
#by whichever matching key is processed first; matching is case-sensitive and
#short keys such as 'Ace' or 'Tor' also match inside longer words -- confirm
#that this is intended
for old_pub, new_pub in publishers.items():
    #regex=False: the key is matched as a literal substring
    #na=False: rows with a missing publisher never match, so the mask is always boolean
    matches = books['publisher'].str.contains(old_pub, regex=False, na=False)
    books.loc[matches, 'publisher'] = new_pub

### Pickle cleaned books and authors dataframes to be used in analysis

In [35]:
#write the cleaned dataframes to the processed-data folder
books_out = '../data/processed/books_cleaned.pkl'
authors_out = '../data/processed/authors_cleaned.pkl'

books.to_pickle(books_out)
authors.to_pickle(authors_out)