### Goal: write a web scraper for Wikipedia that collects _The New York Times_ Fiction Best Sellers for each year from 1942 to 2016

Sample URLs:
* https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_1945
* https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_1970
* https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_1995
* https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_2015

Data to collect:
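* year = year of the best sellers list (taken from the last four characters of the page URL)
* date = Sunday of the week during which the book was on the NYT best sellers list (month and day, e.g. "January 4")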
* title = title of the book
* author = author of the book (first name first)

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import urllib
import pandas as pd

In [2]:
def try_url(url, timeout=30):
    '''
    Attempt to access a webpage and parse it.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        requests.get). Without a timeout, requests waits indefinitely on an
        unresponsive server, which would stall a multi-page scrape.

    Returns
    -------
    BeautifulSoup or int
        The page parsed with the 'lxml' parser when the response status is
        200; otherwise the integer HTTP status code.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from requests.get on connection errors or on timeout.
    '''
    response = requests.get(url, timeout=timeout)
    status = response.status_code
    if status != 200:
        # Callers must check for an int before treating the result as soup.
        return status
    return BeautifulSoup(response.text, 'lxml')

In [3]:
def get_data_from_table(df, bestsellers_table, year):
    '''
    Append the best-seller rows of one Wikipedia table to df.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows accumulated so far (may be empty). It is not modified.
    bestsellers_table : sequence
        The table's <tr> elements (e.g. the result of
        table.find_all('tr')). The first element is treated as the header
        row and skipped.
    year : str or int
        Value stored in the first column of every appended row.

    Returns
    -------
    pandas.DataFrame
        df followed by one row per dated table row, using the positional
        column labels 0-3 for (year, date, title, author) and a fresh
        0..n-1 row index. df itself is returned when no dated row is found.
    '''
    # Raw string: '\s' and '\d' are invalid escapes in a plain str literal.
    date_pattern = re.compile(r'([A-Za-z]+\s\d*)')

    # A row with neither 2 nor 3 cells keeps the title/author of the row
    # before it; these defaults cover the case where no such row exists yet.
    title = 'N/A'
    author = 'N/A'

    records = []
    for table_row in bestsellers_table[1:]:
        cells = table_row.find_all('td')  # all columns in row
        if not cells:
            # e.g. a row made only of <th> cells: there is no date to read
            continue
        dates = date_pattern.findall(cells[0].text)
        if not dates:
            continue
        if len(cells) == 2:
            title = cells[1].text
            author = 'N/A'
        elif len(cells) == 3:
            title = cells[1].text
            author = cells[2].text
        records.append((year, dates[0], title, author))

    if not records:
        return df

    # Build the new rows once and concatenate once instead of calling
    # DataFrame.append per row (quadratic, and removed in pandas 2.0).
    new_rows = pd.DataFrame(records)
    if df.shape == (0, 0):
        return new_rows
    return pd.concat([df, new_rows], ignore_index=True)

In [4]:
# One Wikipedia article per year; np.arange excludes its stop value, so the
# last year covered is 2016.
base_url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_'
years = np.arange(1942, 2017)

# Each article address is the shared prefix followed by the year.
urls = [base_url + str(year) for year in years]

In [5]:
#get all data for all urls
wiki_df = pd.DataFrame(data=None)

for url in urls:
    year = url[-4:]  # each url ends with its year
    soup = try_url(url)
    if isinstance(soup, int):
        # try_url returns the HTTP status code instead of a parsed page when
        # the request did not succeed; stop with a message naming the page
        # rather than failing later on an attribute lookup on an int.
        raise RuntimeError('Request for {} failed with HTTP status {}'.format(url, soup))
    #table of best sellers: rows of the first 'wikitable' inside #bodyContent
    body_content = soup.find(id='bodyContent')
    if body_content is None:
        tables = []
    else:
        tables = body_content.find_all('table', attrs={'class': 'wikitable'})
    if not tables:
        raise RuntimeError('No wikitable found on {}'.format(url))
    bestsellers_table = tables[0].find_all('tr')
    wiki_df = get_data_from_table(wiki_df, bestsellers_table, year)

In [6]:
# Sanity check: the total row count divided by 52 should equal the number of years scraped.
# NOTE(review): assumes exactly 52 dated rows per year; under Python 2 this `/` is
# integer division, so up to 51 extra rows would still compare equal -- confirm.
len(wiki_df)/52 == len(years)

True

In [7]:
# Swap the positional labels 0-3 left by the scrape for descriptive names.
columns = dict(enumerate(['year', 'date', 'title', 'author']))
wiki_df.rename(columns=columns, inplace=True)

In [8]:
# Preview the first rows to eyeball the scraped values under the renamed columns.
wiki_df.head()

Unnamed: 0,year,date,title,author
0,1942,January 4,The Keys of the Kingdom,A. J. Cronin
1,1942,January 11,Windswept,Mary Ellen Chase
2,1942,January 18,Windswept,Mary Ellen Chase
3,1942,January 25,Windswept,Mary Ellen Chase
4,1942,February 1,Windswept,Mary Ellen Chase


In [9]:
# Save the scraped table as UTF-8 CSV at the relative path 'nyt_scraped.csv'.
# index=False is not passed, so the row index is written as the first, unnamed column.
wiki_df.to_csv('nyt_scraped.csv', encoding='utf-8')