In [4]:
import datetime
import time
from urllib.parse import quote

import requests
import pandas as pd
from sqlalchemy import create_engine

# Archive.org crawling

In [1]:
#https://stackoverflow.com/questions/28154066/how-to-convert-datetime-to-integer-in-python
def to_integer(dt_time):
    '''Pack a date into a single YYYYMMDD integer, to match archive.org timestamp format

    dt_time: any object exposing .year, .month and .day (date or datetime).
    Returns an int such as 20171230 for December 30, 2017.
    '''
    year_part = dt_time.year * 10000
    month_part = dt_time.month * 100
    return year_part + month_part + dt_time.day

In [None]:
#https://stackoverflow.com/questions/993358/creating-a-range-of-dates-in-python
#Walk backwards one day at a time from 12/30/2017, newest first, for 1500 days in total
start = datetime.datetime(year=2017, month=12, day=30)
dates = [start - datetime.timedelta(days=1) * back for back in range(1500)]

In [None]:
#Encode every date as its integer form, keeping the same order as `dates`
intdates = list(map(to_integer, dates))

In [None]:
#Capture the point at which A/B testing started
not_testing = []
for i in intdates:
    #Pause before every day's requests, whatever the previous outcome, so archive.org is not hammered
    time.sleep(2)
    try:
        #Get a snapshot for a given day
        r = requests.get('http://archive.org/wayback/available?url=nytimes.com&timestamp={}'.format(str(i)), timeout=30)
        r.raise_for_status()
        #Pull the URL for that snapshot from the metadata the API gives us.
        #When nothing is archived there is no 'closest' entry, so look it up defensively.
        closest = r.json().get('archived_snapshots', {}).get('closest')
        if not closest:
            print('No snapshot found for {}'.format(i))
            continue
        arc = closest['url']
        #Scrape the HTML of the snapshot
        r2 = requests.get(arc, timeout=30)
        #An error page would not contain the script and would be miscounted as "not testing"
        r2.raise_for_status()
    except (requests.RequestException, ValueError) as err:
        #Network failure, HTTP error or an unparseable API response: report it and move on
        print('Skipping {}: {}'.format(i, err))
        continue
    #Check if the Optimizely script is present. If not, add the snapshot to the list.
    if 'optimizely.com' not in r2.text:
        print(arc)
        not_testing.append(arc)
    else:
        print(i)

I'm looking for the point where a long consecutive run of URLs shows up in the `not_testing` list, and I treat the day after the latest date in that run as the starting point for testing. The latest URL in this run was `http://web.archive.org/web/20150805230843/http://www.nytimes.com:80/`, so testing would have started on August 6, 2015.

# NYT headlines

I'm using the NYT archive API to pull headline data by month. Format: `http://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={your-api-key}`

In [52]:
#Read the NYT API key from the KEY environment variable (used as the api-key query parameter below)
key = %env KEY

def query_range(startmo, startyr, endmo, endyr):
    '''Build a range of [month, year] pairs to use for querying the archive API

    Covers every month from startmo/startyr through endmo/endyr inclusive, in
    chronological order. Months are 1-12. A range that starts and ends in the
    same year yields only the months between startmo and endmo; a range whose
    end precedes its start yields an empty list.
    '''
    #Number the months consecutively (year * 12 + zero-based month) so that ranges
    #inside one year and ranges spanning several years share the same arithmetic
    first = startyr * 12 + (startmo - 1)
    last = endyr * 12 + (endmo - 1)
    return [[idx % 12 + 1, idx // 12] for idx in range(first, last + 1)]

def cleaning(entry):
    '''Clean the response of the archive API so that data can be uniformly written into a PostgreSQL table

    entry: one article dict from the API response. It is modified in place and
    also returned.

    After cleaning:
      - 'print_headline' is copied out of the headline object when present
      - 'headline' is the main headline text, or the string form of whatever
        was there; '' if the field is absent
      - 'byline' is the string form of the byline; '' if the field is absent
      - 'keywords' is a comma-separated string of keyword values; '' if the
        field is absent or empty
      - 'new_desk' is renamed to 'news_desk'
      - 'multimedia', 'blog', 'score' and 'uri' are dropped
    '''
    headline = entry.get('headline', '')
    if isinstance(headline, dict):
        #Check for a print headline
        if 'print_headline' in headline:
            entry['print_headline'] = headline['print_headline']
        #Check for a headline; fall back to the raw object's text if there is no main headline
        entry['headline'] = headline['main'] if 'main' in headline else str(headline)
    else:
        #Headline is not an object (or is missing): store its text form
        entry['headline'] = str(headline)
    #Check for a byline object
    entry['byline'] = str(entry['byline']) if 'byline' in entry else ''
    #Turn keywords - returned as a list of objects - into a comma-separated string
    entry['keywords'] = ','.join(kw['value'] for kw in (entry.get('keywords') or []))
    #Resolve change over time from 'news_desk' to 'new_desk'
    if 'new_desk' in entry:
        entry['news_desk'] = entry.pop('new_desk')
    #Remove multimedia metadata, largely empty blog field, score (unclear purpose)
    #and inconsistently applied URI
    for unwanted in ('multimedia', 'blog', 'score', 'uri'):
        entry.pop(unwanted, None)
    return entry

In [None]:
#Database configuration
user = %env USER
password = %env PASSWORD
db = %env DATABASE
engine = create_engine('postgresql://{}:{}@localhost:5432/{}'.format(user, password, db))

In [None]:
#Generate range of months from May 2012 through September 2018
morange = query_range(5, 2012, 9, 2018)

for i in morange:
    #Keep track of current month
    print(i)
    #Query archive API for current month (i is a [month, year] pair).
    #HTTPS keeps the API key out of cleartext traffic; the timeout stops a stalled request from hanging the loop.
    h = requests.get('https://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'.format(i[1], i[0], key), timeout=120)
    #Fail with the HTTP error itself (bad key, rate limit) rather than a KeyError on the payload below
    h.raise_for_status()
    items = h.json()['response']['docs']
    #Nothing to store for a month with no documents
    if not items:
        continue
    #Clean results
    cleaned = [cleaning(doc) for doc in items]
    #Convert results to DataFrame
    cdf = pd.DataFrame(cleaned)
    #Write DataFrame to PostgreSQL table
    cdf.to_sql('results', engine, if_exists = 'append')