# Cleaning Data from API Pull
# Gillian Tatreau
# February 2023
### Most current API pull: 26 February 2024

In [1]:
# import libraries
from itertools import repeat
import requests
import urllib.request, urllib.parse
from urllib.error import HTTPError,URLError
import json
import pandas as pd

# if using jupyter notebook, to import config file if saved in .ipynb file
# import import_ipynb

# file with secret key
import config

In [2]:
def get_list_of_lists():
    """
    Function to get list of best seller's lists
    """
    
    url = 'https://api.nytimes.com/svc/books/v3/lists/names.json?api-key='+config.my_key
    # checking connection
    try: 
        uh = urllib.request.urlopen(url)
    except HTTPError as e:
        print("Sorry! Could not retrive anything")
        return None
    except URLError as e:
        print('Failed to reach a server.')
        print('Reason: ', e.reason)
        return None
    else:
        data = uh.read().decode()
        print("Retrieved data. Total {} characters read.".format(len(data)))
    x = json.loads(data)
    y = x['results']
    return y

In [3]:
def get_list_data(l):
    """
    Function to get data from each bestseller's list in list_of_lists
    """
    list_name = str(l)
    url = 'https://api.nytimes.com/svc/books/v3/lists/current/'+list_name+'.json?api-key='+config.my_key
    # checking connection
    try: 
        uh = urllib.request.urlopen(url)
    except HTTPError as e:
        print("Sorry! Could not retrive anything on {}".format(list_name))
        return None
    except URLError as e:
        print('Failed to reach a server.')
        print('Reason: ', e.reason)
        return None
    else:
        data = uh.read().decode()
        print("Retrieved data on {}. Total {} characters read.".format(list_name,len(data)))
        return data

In [4]:
def build_list_dataBase(partial_list):
    """
    Function to create pandas DataFrame from bestseller's lists data. 
    Due to the number of calls needed for the number of bestseller's lists, the function must be called multiple times, 
    each with a subset of the list_of_lists.
    """
    list_dict = {'bestsellers_date':[],'bestsellers_list':[],'rank':[],'rank_last_week':[],'weeks_on_list':[],
                  'isbn10':[],'isbn13':[],'publisher':[],'title':[],'author':[],
                  'contributor':[]}
    
    for l in partial_list:
        data = get_list_data(l)
        if data !=None:
            x = json.loads(data)
            y = x["results"]
            z = y["books"]
            list_dict['bestsellers_date'].extend(repeat(y['bestsellers_date'],len(z)))
            list_dict['bestsellers_list'].extend(repeat(y['list_name'],len(z)))
            for i in z:
                list_dict['rank'].append(i['rank'])
                list_dict['rank_last_week'].append(i['rank_last_week'])
                list_dict['weeks_on_list'].append(i['weeks_on_list'])
                list_dict['isbn10'].append(i['primary_isbn10'])
                list_dict['isbn13'].append(i['primary_isbn13'])
                list_dict['publisher'].append(i['publisher'])
                list_dict['title'].append(i['title'])
                list_dict['author'].append(i['author'])
                list_dict['contributor'].append(i['contributor'])
    
    return pd.DataFrame(list_dict)               

In [5]:
# create list_of_lists
a = get_list_of_lists()
list_of_lists = []
for i in a:
    list_of_lists.append(i["list_name_encoded"])
len(a)

Retrieved data. Total 12486 characters read.


59

In [6]:
# breaking up list_of_lists into sublists that the API will be able to handle
a1 = list_of_lists[:5]
a2 = list_of_lists[5:10]
a3 = list_of_lists[10:15]
a4 = list_of_lists[15:20]
a5 = list_of_lists[20:25]
a6 = list_of_lists[25:30]
a7 = list_of_lists[30:35]
a8 = list_of_lists[35:40]
a9 = list_of_lists[40:45]
a10 = list_of_lists[45:50]
a11 = list_of_lists[50:55]
a12 = list_of_lists[55:]

In [7]:
# call function to perform API calls and convert data to first pandas DataFrame
df1 = build_list_dataBase(a1)

Retrieved data on combined-print-and-e-book-fiction. Total 25069 characters read.
Retrieved data on combined-print-and-e-book-nonfiction. Total 26541 characters read.
Retrieved data on hardcover-fiction. Total 25241 characters read.
Retrieved data on hardcover-nonfiction. Total 24986 characters read.
Retrieved data on trade-fiction-paperback. Total 25410 characters read.


In [8]:
# each call must be about a minute apart
df2 = build_list_dataBase(a2)

Retrieved data on mass-market-paperback. Total 27106 characters read.
Retrieved data on paperback-nonfiction. Total 27095 characters read.
Retrieved data on e-book-fiction. Total 26128 characters read.
Retrieved data on e-book-nonfiction. Total 28302 characters read.
Retrieved data on hardcover-advice. Total 25369 characters read.


In [9]:
df3 = build_list_dataBase(a3)

Retrieved data on paperback-advice. Total 26084 characters read.
Retrieved data on advice-how-to-and-miscellaneous. Total 16385 characters read.
Retrieved data on hardcover-graphic-books. Total 17561 characters read.
Retrieved data on paperback-graphic-books. Total 17830 characters read.
Retrieved data on manga. Total 17158 characters read.


In [10]:
df4 = build_list_dataBase(a4)

Retrieved data on combined-print-fiction. Total 37762 characters read.
Retrieved data on combined-print-nonfiction. Total 37681 characters read.
Retrieved data on chapter-books. Total 17313 characters read.
Retrieved data on childrens-middle-grade. Total 25759 characters read.
Retrieved data on childrens-middle-grade-e-book. Total 8497 characters read.


In [11]:
df5 = build_list_dataBase(a5)

Retrieved data on childrens-middle-grade-hardcover. Total 16942 characters read.
Retrieved data on childrens-middle-grade-paperback. Total 17154 characters read.
Retrieved data on paperback-books. Total 18320 characters read.
Retrieved data on picture-books. Total 16399 characters read.
Retrieved data on series-books. Total 36798 characters read.


In [12]:
df6 = build_list_dataBase(a6)

Retrieved data on young-adult. Total 26865 characters read.
Retrieved data on young-adult-e-book. Total 8081 characters read.
Retrieved data on young-adult-hardcover. Total 15779 characters read.
Retrieved data on young-adult-paperback. Total 17381 characters read.
Retrieved data on animals. Total 17693 characters read.


In [13]:
df7 = build_list_dataBase(a7)

Retrieved data on audio-fiction. Total 26539 characters read.
Retrieved data on audio-nonfiction. Total 27811 characters read.
Retrieved data on business-books. Total 16491 characters read.
Retrieved data on celebrities. Total 18336 characters read.
Retrieved data on crime-and-punishment. Total 19205 characters read.


In [14]:
df8 = build_list_dataBase(a8)

Retrieved data on culture. Total 18252 characters read.
Retrieved data on education. Total 18347 characters read.
Retrieved data on espionage. Total 18008 characters read.
Retrieved data on expeditions-disasters-and-adventures. Total 19508 characters read.
Retrieved data on fashion-manners-and-customs. Total 17844 characters read.


In [15]:
df9 = build_list_dataBase(a9)

Retrieved data on food-and-fitness. Total 17455 characters read.
Retrieved data on games-and-activities. Total 17446 characters read.
Retrieved data on graphic-books-and-manga. Total 24891 characters read.
Retrieved data on hardcover-business-books. Total 25793 characters read.
Retrieved data on health. Total 18309 characters read.


In [16]:
df10 = build_list_dataBase(a10)

Retrieved data on humor. Total 17937 characters read.
Retrieved data on indigenous-americans. Total 9606 characters read.
Retrieved data on relationships. Total 18811 characters read.
Retrieved data on mass-market-monthly. Total 26753 characters read.
Retrieved data on middle-grade-paperback-monthly. Total 17185 characters read.


In [17]:
df11 = build_list_dataBase(a11)

Retrieved data on paperback-business-books. Total 19571 characters read.
Retrieved data on family. Total 17940 characters read.
Retrieved data on hardcover-political-books. Total 19215 characters read.
Retrieved data on race-and-civil-rights. Total 19008 characters read.
Retrieved data on religion-spirituality-and-faith. Total 17662 characters read.


In [18]:
df12 = build_list_dataBase(a12)

Retrieved data on science. Total 18928 characters read.
Retrieved data on sports. Total 17207 characters read.
Retrieved data on travel. Total 18236 characters read.
Retrieved data on young-adult-paperback-monthly. Total 16324 characters read.


## Step 1
Combining all 7 DataFrames into 1 consolidated DataFrame

In [19]:
df_list = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12]

In [20]:
df = pd.concat(df_list, ignore_index=True)

In [21]:
df

Unnamed: 0,bestsellers_date,bestsellers_list,rank,rank_last_week,weeks_on_list,isbn10,isbn13,publisher,title,author,contributor
0,2024-02-17,Combined Print and E-Book Fiction,1,1,2,1250178630,9781250178633,St. Martin's,THE WOMEN,Kristin Hannah,by Kristin Hannah
1,2024-02-17,Combined Print and E-Book Fiction,2,5,42,1649374046,9781649374042,Red Tower,FOURTH WING,Rebecca Yarros,by Rebecca Yarros
2,2024-02-17,Combined Print and E-Book Fiction,3,4,2,0593550404,9780593550403,Berkley,BRIDE,Ali Hazelwood,by Ali Hazelwood
3,2024-02-17,Combined Print and E-Book Fiction,4,6,15,1649374178,9781649374172,Red Tower,IRON FLAME,Rebecca Yarros,by Rebecca Yarros
4,2024-02-17,Combined Print and E-Book Fiction,5,3,3,1635574102,9781635574104,Bloomsbury,HOUSE OF FLAME AND SHADOW,Sarah J. Maas,by Sarah J. Maas
...,...,...,...,...,...,...,...,...,...,...,...
680,2024-02-03,Young Adult Paperback Monthly,6,0,0,0593619919,9780593619919,Putnam,CHECK & MATE,Ali Hazelwood,by Ali Hazelwood
681,2024-02-03,Young Adult Paperback Monthly,7,0,0,1419760874,9781419760877,Amulet,LIGHTLARK,Alex Aster,by Alex Aster
682,2024-02-03,Young Adult Paperback Monthly,8,0,0,,9781665951388,Simon & Schuster,THE DO-OVER,Lynn Painter,by Lynn Painter
683,2024-02-03,Young Adult Paperback Monthly,9,0,0,1728205476,9781728205472,Sourcebooks Fire,MY LIFE WITH THE WALTER BOYS,Ali Novak,by Ali Novak


## Step 2
Check for outliers, unique values, number of NAs, and data types for each column.

In [22]:
# exploring datatypes of each column
df.dtypes

bestsellers_date    object
bestsellers_list    object
rank                 int64
rank_last_week       int64
weeks_on_list        int64
isbn10              object
isbn13              object
publisher           object
title               object
author              object
contributor         object
dtype: object

In [23]:
# number of unique values
df.nunique()

bestsellers_date     11
bestsellers_list     59
rank                 20
rank_last_week       16
weeks_on_list       102
isbn10              549
isbn13              591
publisher           223
title               545
author              477
contributor         480
dtype: int64

In [24]:
duplicate = df[df.duplicated('title')]
duplicate.head(60)

Unnamed: 0,bestsellers_date,bestsellers_list,rank,rank_last_week,weeks_on_list,isbn10,isbn13,publisher,title,author,contributor
30,2024-02-17,Hardcover Fiction,1,1,2,1250178630,9781250178633,St. Martin's,THE WOMEN,Kristin Hannah,by Kristin Hannah
31,2024-02-17,Hardcover Fiction,2,3,41,1649374046,9781649374042,Red Tower,FOURTH WING,Rebecca Yarros,by Rebecca Yarros
32,2024-02-17,Hardcover Fiction,3,4,15,1649374178,9781649374172,Red Tower,IRON FLAME,Rebecca Yarros,by Rebecca Yarros
33,2024-02-17,Hardcover Fiction,4,2,3,1635574102,9781635574104,Bloomsbury,HOUSE OF FLAME AND SHADOW,Sarah J. Maas,by Sarah J. Maas
34,2024-02-17,Hardcover Fiction,5,5,26,0593422945,9780593422946,Riverhead,THE HEAVEN & EARTH GROCERY STORE,James McBride,by James McBride
35,2024-02-17,Hardcover Fiction,6,6,7,0593492919,9780593492918,Pamela Dorman,FIRST LIE WINS,Ashley Elston,by Ashley Elston
36,2024-02-17,Hardcover Fiction,7,0,1,0316403385,9780316403382,"Little, Brown",CROSSHAIRS,James Patterson and James O. Born,by James Patterson and James O. Born
45,2024-02-17,Hardcover Nonfiction,1,2,43,0385534264,9780385534260,Doubleday,THE WAGER,David Grann,by David Grann
46,2024-02-17,Hardcover Nonfiction,2,3,47,0593236599,9780593236598,Harmony,OUTLIVE,Peter Attia with Bill Gifford,by Peter Attia with Bill Gifford
47,2024-02-17,Hardcover Nonfiction,3,1,2,0063068796,9780063068797,Mariner,MEDGAR & MYRLIE,Joy-Ann Reid,by Joy-Ann Reid


In [25]:
# number of NAs in each column
df.isna().sum()

bestsellers_date    0
bestsellers_list    0
rank                0
rank_last_week      0
weeks_on_list       0
isbn10              0
isbn13              0
publisher           0
title               0
author              0
contributor         0
dtype: int64

## Step 3
Change all strings in DataFrame to lowercase.

In [26]:
# make all strings lowercase
df = df.applymap(lambda s: s.lower() if type(s) == str else s)

In [27]:
df.head()

Unnamed: 0,bestsellers_date,bestsellers_list,rank,rank_last_week,weeks_on_list,isbn10,isbn13,publisher,title,author,contributor
0,2024-02-17,combined print and e-book fiction,1,1,2,1250178630,9781250178633,st. martin's,the women,kristin hannah,by kristin hannah
1,2024-02-17,combined print and e-book fiction,2,5,42,1649374046,9781649374042,red tower,fourth wing,rebecca yarros,by rebecca yarros
2,2024-02-17,combined print and e-book fiction,3,4,2,593550404,9780593550403,berkley,bride,ali hazelwood,by ali hazelwood
3,2024-02-17,combined print and e-book fiction,4,6,15,1649374178,9781649374172,red tower,iron flame,rebecca yarros,by rebecca yarros
4,2024-02-17,combined print and e-book fiction,5,3,3,1635574102,9781635574104,bloomsbury,house of flame and shadow,sarah j. maas,by sarah j. maas


## Step 4
Change datatype of bestsellers_date to datetime. Change datatype of isbn13 to numeric.

In [28]:
df["isbn13"] = pd.to_numeric(df["isbn13"], errors= "coerce")
df["bestsellers_date"] = pd.to_datetime(df["bestsellers_date"])
df.dtypes

bestsellers_date    datetime64[ns]
bestsellers_list            object
rank                         int64
rank_last_week               int64
weeks_on_list                int64
isbn10                      object
isbn13                     float64
publisher                   object
title                       object
author                      object
contributor                 object
dtype: object

## Step 5
Strip "by" from contributor. keep only first listed author as primary author in "author" column. All others will remain only in "contributor". Replace "and"/"with" to "/" in "contributor" column, to match format from CSV data. 

In [29]:
df['contributor'] = df['contributor'].str.replace('by', '')

In [30]:
new = df["author"].str.split("and", n = 1, expand = True)
df["first_listed_author"] = new[0]
df.drop(columns = ["author"], inplace = True)

In [31]:
df['contributor'] = df['contributor'].str.replace('with', '/')
df['contributor'] = df['contributor'].str.replace('and', '/')
df.head()

Unnamed: 0,bestsellers_date,bestsellers_list,rank,rank_last_week,weeks_on_list,isbn10,isbn13,publisher,title,contributor,first_listed_author
0,2024-02-17,combined print and e-book fiction,1,1,2,1250178630,9781250000000.0,st. martin's,the women,kristin hannah,kristin hannah
1,2024-02-17,combined print and e-book fiction,2,5,42,1649374046,9781649000000.0,red tower,fourth wing,rebecca yarros,rebecca yarros
2,2024-02-17,combined print and e-book fiction,3,4,2,593550404,9780594000000.0,berkley,bride,ali hazelwood,ali hazelwood
3,2024-02-17,combined print and e-book fiction,4,6,15,1649374178,9781649000000.0,red tower,iron flame,rebecca yarros,rebecca yarros
4,2024-02-17,combined print and e-book fiction,5,3,3,1635574102,9781636000000.0,bloomsbury,house of flame and shadow,sarah j. maas,sarah j. maas


## Step 6
Change order of dataframe to be more easily read.

In [32]:
# rearrange dataframe column order
df = df.iloc[:,[8,10,9,5,6,1,2,3,4,0,7]]

In [33]:
df.to_csv("api_data.csv", index = False)

## Ethical Implications

Even though there were duplicates within the datset, upon investigation I did not remove any of the duplicates because they either represented different versions of the same title (i.e. hardcover, paperback, audiobook, or ebook) which each corresponds to a different ISBN. However, even for titles that had the same ISBN and version, if they appeared on different bestseller's list I still kept them as seperate entities because while it would be straightforward if the dataset only included the bestseller's lists the title appears on, it becomes much more complicated to contain several different values for rank, weeks_on_list, and rank_last_week. Removing all authors except the first listed from the authors column presents an interesting dilemna: ultimately, the choice was made to simplify the column at the risk of undervaluing some authors' contributions to the title presented by excluding them from the author column and delegating them to the contributor column only. This also presents problems when titles are co-written, or are a collection of stories and the author column would imply that the author listed first is the primary contributor; therefore, "author" becomes "first_listed_author" to maintain that transparency clearly in the final dataset. 

## Readable Dataset

In [34]:
print(df.head(60))

                                          title  \
0                                     the women   
1                                   fourth wing   
2                                         bride   
3                                    iron flame   
4                     house of flame and shadow   
5                                    crosshairs   
6                                   the teacher   
7              the heaven & earth grocery store   
8                      house of earth and blood   
9                      a court of silver flames   
10                                   icebreaker   
11                                the housemaid   
12                               first lie wins   
13                                 fangirl down   
14                      house of sky and breath   
15                   killers of the flower moon   
16                                    the wager   
17                     the body keeps the score   
18                         the 

In [35]:
print(df.tail(60))

                                    title  \
625                          born a crime   
626                        hidden figures   
627              between the world and me   
628                      the new jim crow   
629                            just mercy   
630                         march trilogy   
631                       march: book one   
632   misadventures of awkward black girl   
633               you can't touch my hair   
634            stamped from the beginning   
635                                shaken   
636                       the book of joy   
637             think better, live better   
638                   the four agreements   
639                             uninvited   
640                        the broken way   
641                 the book of mysteries   
642                  present over perfect   
643                      the power of now   
644                  the american miracle   
645                               sapiens   
646       