In [3]:
import pandas as pd
import re
import pickle

### Clean names and titles in the wikipedia dataframe so that each author appears under a single name variant and each book under a single title variant.

In [4]:
# Load the scraped bestseller table; the first CSV column is used as the row index.
wikipedia = pd.read_csv(
    '../data/raw/nyt_scraped.csv',
    index_col=0,
)

Standardize author names:
* standardize initials (ex: R. R.-->R.R. and W. E. B. to W.E.B.)
* replace 'nan' author names for books that actually have authors
* standardize spelling
* remove annotation from pseudonyms

In [5]:
def fix_initials(name):
    """Collapse whitespace between consecutive initials in an author name.

    Examples: 'George R. R. Martin' -> 'George R.R. Martin' and
    'W. E. B. Du Bois' -> 'W.E.B. Du Bois'.

    Parameters
    ----------
    name : object
        Author name. It is converted with ``str()`` first, so a missing
        value (float NaN) comes back as the string 'nan'.

    Returns
    -------
    str
        The name with runs of initials written without internal spaces.
    """
    name = str(name)
    # First pass joins pairs of spaced initials ('R. R.' -> 'R.R.'); second pass
    # attaches a trailing third initial to an already-joined pair
    # ('W.E. B.' -> 'W.E.B.'). The second pass must see the output of the first.
    patterns = (
        r'([A-Z]\.\s[A-Z]\.)',
        r'([A-Z]\.[A-Z]\.\s[A-Z]+\.+)',
    )
    for pattern in patterns:
        for initials in re.findall(pattern, name):
            compact = re.sub(r'\s', '', initials)
            # Literal replacement: using the matched text as a regex would let
            # its '.' characters match arbitrary characters elsewhere in the name.
            name = name.replace(initials, compact)
    return name

In [6]:
# Normalise initials in every author name (missing authors become the string 'nan').
wikipedia['author'] = wikipedia['author'].map(fix_initials)

In [7]:
# Title -> author name to assign to every row with that title.
mispellings = {
    'The Black Rose': 'Thomas B. Costain',
    'House Divided': 'Ben Ames Williams',
    'The Cuckoo\'s Calling': 'Robert Galbraith',
    'Slow Waltz in Cedar Bend': 'Robert James Waller',
    'The Bridges of Madison County': 'Robert James Waller',
    'Harry Potter and the Chamber of Secrets': 'J.K. Rowling',
}

for book_title, correct_author in mispellings.items():
    title_matches = wikipedia['title'].eq(book_title)
    wikipedia.loc[title_matches, 'author'] = correct_author

Fix pseudonyms and missing middle initials

In [8]:
# Scraped author string -> author name to store instead (exact, whole-value match).
replacements = {
    'Richard Bachman\n(pseudonym for Stephen King)': 'Richard Bachman',
    'James Michener': 'James A. Michener',
    'Morris West': 'Morris L. West',
    'Elizabeth Chevalier': 'Elizabeth Pickett Chevalier',
    'Somerset Maugham': 'W. Somerset Maugham',
    'Mattie Stepanek': 'Mattie J.T. Stepanek',
    'Thomas Costain': 'Thomas B. Costain',
    'Lloyd Douglas': 'Lloyd C. Douglas',
    'Van Wyck Mason': 'F. van Wyck Mason',
    'Ross Lockridge': 'Ross Lockridge Jr.',
    'Lillian Smith': 'Lillian E. Smith',
}

for scraped_name, canonical_name in replacements.items():
    author_matches = wikipedia['author'].eq(scraped_name)
    wikipedia.loc[author_matches, 'author'] = canonical_name

# J.D. Robb is a pseudonym for Nora Roberts; these titles are stored under the pseudonym.
jd_robb_titles = ['Innocent in Death', 'Concealed in Death']
wikipedia.loc[wikipedia['title'].isin(jd_robb_titles), 'author'] = 'J.D. Robb'

Keep only first author for books with multiple authors.

In [9]:
def keep_first_name(name):
    """Return only the first author from a co-author string.

    The name is cut at the first whitespace-delimited 'and'; a name without
    such a separator is returned unchanged.

    Parameters
    ----------
    name : str
        Author string, e.g. 'James Patterson and Michael Ledwidge'.

    Returns
    -------
    str
        The text before the first ' and ' separator.
    """
    # Raw string avoids the invalid-escape warning for '\s'; maxsplit=1 because
    # only the part before the first separator is used.
    return re.split(r'\sand\s', name, maxsplit=1)[0]

# Reduce every co-author string to its first author.
wikipedia['author'] = wikipedia['author'].map(keep_first_name)

Remove rows whose author is 'nan', i.e. rows that have no author (title unknown, or no bestseller list published due to a newspaper strike).

In [10]:
# Turn the placeholder string 'nan' back into a real missing value so that
# dropna() removes those rows.
author_is_placeholder = wikipedia['author'].eq('nan')
wikipedia.loc[author_is_placeholder, 'author'] = None
# NOTE(review): dropna() has no subset, so rows with a missing value in any
# column are dropped, not only rows without an author - confirm this is intended.
wikipedia = wikipedia.dropna()

Change duplicate titles so that each title is a unique identifier

In [11]:
# Two different books share the title 'Gone'; append the series name to each so
# the title alone identifies the book.
is_gone = wikipedia['title'] == 'Gone'
wikipedia.loc[is_gone & (wikipedia['author'] == 'Jonathan Kellerman'), 'title'] = 'Gone (Alex Delaware, #20)'

# The earlier keep_first_name step has already cut co-author strings down to the
# first author, so the full 'James Patterson and Michael Ledwidge' string no
# longer occurs. Accept both forms so the rename works in either case.
patterson_names = ['James Patterson', 'James Patterson and Michael Ledwidge']
wikipedia.loc[is_gone & wikipedia['author'].isin(patterson_names), 'title'] = 'Gone (Michael Bennett, #6)'

Add decade

In [19]:
# Label each row with its decade: the first three characters of the year plus '0s'.
wikipedia['decade'] = wikipedia['year'].apply(
    lambda year: '{}0s'.format(str(year)[:3])
)

Pickle cleaned wikipedia dataframe to be used in analysis

In [20]:
# Persist the cleaned dataframe for the analysis notebooks.
with open('../data/interim/nyt_cleaned.pkl', 'wb') as cleaned_out:
    pickle.dump(wikipedia, cleaned_out)

### Create title:author dictionary to be used to get data from goodreads. 

In [11]:
# One row per distinct (title, author) pair.
title_author_columns = ['title', 'author']
authors_titles = wikipedia.loc[:, title_author_columns].drop_duplicates()

Create and pickle dictionary with titles as keys and authors as values to be used to get data from goodreads.

In [12]:
# Map each title to its author. Columns are read by name rather than by
# position (row[0] / row[1]), because integer keys on a label-indexed Series
# are deprecated in pandas 2.x. If a title occurs more than once, the last
# author seen wins, as in a plain loop assignment.
books = dict(zip(authors_titles['title'], authors_titles['author']))

In [13]:
# Override the author for 'A Tree Grows in Brooklyn' to match the typo on
# Goodreads (presumably the doubled space in the name - keep it exactly as written).
books.update({'A Tree Grows in Brooklyn': 'Betty  Smith'})

In [14]:
# Persist the title -> author dictionary for the Goodreads lookup step.
with open('../data/interim/books_dict.pkl', 'wb') as books_out:
    pickle.dump(books, books_out)