In [42]:
import datetime
import time
import requests
import pandas as pd
import spacy
from sqlalchemy import create_engine

#Load spaCy's small English model once; the headline feature cells below call nlp() on every headline
nlp = spacy.load('en_core_web_sm')

# Archive.org crawling

In [1]:
#https://stackoverflow.com/questions/28154066/how-to-convert-datetime-to-integer-in-python
def to_integer(dt_time):
    '''Encode a date as a YYYYMMDD integer, for use as the archive.org timestamp parameter

    dt_time: any object with year, month and day attributes (date or datetime).
    Returns an int such as 20171230 for December 30, 2017.
    '''
    #Shift the year left by two decimal digits to make room for the month, then again for the day
    year_month = dt_time.year * 100 + dt_time.month
    return year_month * 100 + dt_time.day

In [None]:
#https://stackoverflow.com/questions/993358/creating-a-range-of-dates-in-python
#1500 consecutive days, newest first: 12/30/2017 itself and the 1499 days before it
start = datetime.datetime(2017, 12, 30)
dates = [start - days_back * datetime.timedelta(days=1) for days_back in range(1500)]

In [None]:
#Convert every date to its integer encoding, keeping the same order as dates
intdates = list(map(to_integer, dates))

In [None]:
#Capture the point at which A/B testing started
#not_testing collects the snapshot URLs whose HTML does not mention the Optimizely script
not_testing = []
for i in intdates:
    #Get a snapshot for a given day (the timeout keeps a stalled connection from hanging the crawl)
    r = requests.get('http://archive.org/wayback/available?url=nytimes.com&timestamp={}'.format(str(i)), timeout=30)
    #Pull the metadata of the closest snapshot from the API response.
    #'archived_snapshots' has no 'closest' entry when no capture is available, so look it up defensively
    closest = r.json().get('archived_snapshots', {}).get('closest') if r.ok else None
    if closest is None:
        print('{}: no snapshot available'.format(i))
    else:
        arc = closest['url']
        #Scrape the HTML of the snapshot
        r2 = requests.get(arc, timeout=30)
        if not r2.ok:
            #An error page does not contain the script either; do not count it as a day without testing
            print('{}: snapshot request failed with status {}'.format(i, r2.status_code))
        #Check if the Optimizely script is present. If it is not, add the snapshot to the list.
        elif 'optimizely.com' not in r2.text:
            print(arc)
            not_testing.append(arc)
        else:
            print(i)
    #Wait before going to the next day, whatever the outcome, to avoid back-to-back requests to archive.org
    time.sleep(2)

I'm looking for the point where several consecutive URLs show up in the `not_testing` list, and treating the day after the latest date in that run as the start of testing. The latest URL in this run was `http://web.archive.org/web/20150805230843/http://www.nytimes.com:80/`, so testing would have started on August 6, 2015.

# NYT headlines

I'm using the NYT archive API to pull headline data by month. Format: `http://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={your-api-key}`

In [52]:
#Read the API key from the KEY environment variable (%env is an IPython magic, so this line only runs in IPython/Jupyter)
key = %env KEY

def query_range(startmo, startyr, endmo, endyr):
    '''Build a range of [month, year] pairs to use for querying the archive API

    startmo, startyr: first month (1-12) and year of the range, inclusive.
    endmo, endyr: last month (1-12) and year of the range, inclusive.
    Returns a chronologically ordered list of [month, year] lists. A range that starts
    and ends in the same year only covers the months in between, and a start that
    lies after the end gives an empty list.
    '''
    #Number the months consecutively so that the range can be walked without special-casing year boundaries
    first = startyr * 12 + (startmo - 1)
    last = endyr * 12 + (endmo - 1)
    return [[idx % 12 + 1, idx // 12] for idx in range(first, last + 1)]

def cleaning(entry):
    '''Clean the response of the archive API so that data can be uniformly written into a PostgreSQL table

    entry: dict for one document, as found in the 'docs' list of the API response.
    Nested fields are flattened into plain strings, the desk field name is unified and
    unused fields are dropped. The dict is modified in place and also returned.
    Missing 'headline', 'byline' or 'keywords' fields become empty strings, and an entry
    that was already cleaned passes through unchanged.
    '''
    headline = entry.get('headline', '')
    if isinstance(headline, dict):
        #Check for a print headline
        if 'print_headline' in headline:
            entry['print_headline'] = headline['print_headline']
        #Check for a headline, falling back to the text form of the whole object
        entry['headline'] = headline['main'] if 'main' in headline else str(headline)
    else:
        #Anything that is not a headline object is stored as text
        entry['headline'] = str(headline)
    #Check for a byline object
    entry['byline'] = str(entry['byline']) if 'byline' in entry else ''
    #Turn keywords - returned as a list of objects - into a comma-separated string
    keywords = entry.get('keywords') or []
    if not isinstance(keywords, str):
        keywords = ','.join(i['value'] for i in keywords)
    entry['keywords'] = keywords
    #Resolve change over time from 'news_desk' to 'new_desk'
    if 'new_desk' in entry:
        entry['news_desk'] = entry.pop('new_desk')
    #Remove multimedia metadata, the largely empty blog field, the score (unclear purpose)
    #and the inconsistently applied URI
    for unused in ('multimedia', 'blog', 'score', 'uri'):
        entry.pop(unused, None)
    return entry

In [None]:
#Database configuration
user = %env USER
password = %env PASSWORD
db = %env DATABASE
engine = create_engine('postgresql://{}:{}@localhost:5432/{}'.format(user, password, db))

In [None]:
#Generate range of months from May 2012 through September 2018
morange = query_range(5, 2012, 9, 2018)

for i in morange:
    #Keep track of current month
    print(i)
    #Query archive API for current month; each pair is [month, year] while the URL wants year first
    h = requests.get('http://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'.format(i[1], i[0], key), timeout=120)
    #Stop on an HTTP error status rather than trying to parse an error payload as results
    h.raise_for_status()
    #Clean results
    items = h.json()['response']['docs']
    cleaned = [cleaning(doc) for doc in items]
    #A month without documents has nothing to store
    if not cleaned:
        continue
    #Convert results to DataFrame
    cdf = pd.DataFrame(cleaned)
    #Write DataFrame to PostgreSQL table
    cdf.to_sql('results', engine, if_exists = 'append')

# Generating variables

In [4]:
#https://github.com/pandas-dev/pandas/issues/12265#issuecomment-181838631
#Database configuration
user = %env USER
password = %env PASSWORD
db = %env DATABASE
engine = create_engine('postgresql://{}:{}@localhost:5432/{}'.format(user, password, db), execution_options=dict(stream_results=True))

In [6]:
#Get sample of data: keep the first chunk (at most 1000 rows) of the results table and stop reading
chunks = pd.read_sql_query(sql='SELECT * FROM results', con=engine, chunksize=1000)
for table in chunks:
    test = table
    break

In [None]:
#H1: Count quotation marks (single or double) in each headline
(
    test['headline']
    .str.count(pat='\'|"')
)

In [20]:
#Rows whose headline contains a number, including comma-separated groups such as 25,000.
#The group is non-capturing so that str.contains does not warn about match groups in the pattern.
test[test['headline'].str.contains('[0-9]+(?:,[0-9]+)*')]

  if __name__ == '__main__':


Unnamed: 0,index,_id,abstract,byline,document_type,headline,keywords,lead_paragraph,news_desk,print_headline,print_page,pub_date,section_name,slideshow_credits,snippet,source,subsection_name,type_of_material,web_url,word_count
1,1,53f7702d38f0d835538dbd7e,,,multimedia,"A Look at New York City, From 1940 to Today","Census,New York City,Corona (NYC),Harlem (NYC)...",The New York Times has compared statistics and...,N.Y. / Region,,,2012-05-01T00:00:00Z,N.Y. / Region,,The New York Times has compared statistics and...,The New York Times,,Interactive Feature,https://www.nytimes.com/interactive/2012/05/01...,0
3,3,4fd3a4958eb7c8105d8ef5c6,Highfields Capital Management on Tuesday publi...,"{'person': [{'firstname': 'Michael', 'middlena...",blogpost,Highfields Names 3 Candidates for CoreLogic Board,,,,,,2012-05-01T22:08:26Z,Business Day,,Highfields Capital Management on Tuesday publi...,The New York Times,Dealbook,Blog,https://dealbook.nytimes.com/2012/05/01/highfi...,74
5,5,4fd3a47a8eb7c8105d8ef0ee,"Charming Shoppes, the owner of the Lane Bryant...","{'person': [{'firstname': 'Michael', 'middlena...",blogpost,Owner of Lane Bryant Sold to Rival for $890 Mi...,,,,,,2012-05-01T21:48:50Z,Business Day,,"Charming Shoppes, the owner of the Lane Bryant...",The New York Times,Dealbook,Blog,https://dealbook.nytimes.com/2012/05/01/owner-...,283
10,10,4fd3a4aa8eb7c8105d8ef82c,"Fingerprinting, photography, bertillonage and ...","{'person': [{'organization': '', 'role': 'repo...",blogpost,What's in a Name? (Part 2),"Fingerprinting,Forensic Science,Identification...",,,,,2012-05-01T21:00:20Z,Opinion,,"Fingerprinting, photography, bertillonage and ...",The New York Times,,Blog,https://opinionator.blogs.nytimes.com/2012/05/...,7004
39,38,4fd3a4358eb7c8105d8ee8f9,"Restore Our Future, the ""super PAC"" supporting...","{'person': [{'organization': '', 'role': 'repo...",blogpost,Pro-Romney 'Super PAC' Begins $4 Million Ad Buy,Presidential Election of 2012,,,,,2012-05-01T17:07:54Z,U.S.,,"Restore Our Future, the ""super PAC"" supporting...",The New York Times,Politics,Blog,https://thecaucus.blogs.nytimes.com/2012/05/01...,205
61,58,4fd3a4958eb7c8105d8ef5ac,The Nashville Predators suspended Alexander Ra...,"{'person': [{'organization': '', 'role': 'repo...",blogpost,Predators Suspend Two for Game 3 Against Coyotes,"Hockey, Ice",,,,,2012-05-01T15:20:40Z,Sports,,The Nashville Predators suspended Alexander Ra...,The New York Times,Hockey,Blog,https://slapshot.blogs.nytimes.com/2012/05/01/...,255
66,63,4fd3a4868eb7c8105d8ef2e9,The Dearborn automaker announced pricing and p...,"{'person': [{'organization': '', 'role': 'repo...",blogpost,"Ford Focus ST Is Priced Below $25,000",Automobiles,,,,,2012-05-01T15:03:05Z,Autos,,The Dearborn automaker announced pricing and p...,The New York Times,,Blog,https://wheels.blogs.nytimes.com/2012/05/01/fo...,222
69,66,4fd3a4b98eb7c8105d8ef94b,"Once known as the Kodak, the Dolby Theater wil...","{'person': [{'organization': '', 'role': 'repo...",blogpost,Oscars Get New Name (and 20-Year Deal) for Its...,Academy Awards (Oscars),,,,,2012-05-01T14:50:27Z,Arts,,"Once known as the Kodak, the Dolby Theater wil...",The New York Times,,Blog,https://artsbeat.blogs.nytimes.com/2012/05/01/...,201
71,68,4fd3a4a28eb7c8105d8ef67d,N.Y. See: A presidential candidate and a forme...,"{'person': [], 'original': 'By THE NEW YORK TI...",blogpost,"Carmine Street, Circa 1 P.M.","Giuliani, Rudolph W,Romney, Mitt",,,,,2012-05-01T14:35:39Z,N.Y. / Region,,N.Y. See: A presidential candidate and a forme...,The New York Times,,Blog,https://cityroom.blogs.nytimes.com/2012/05/01/...,24
87,138,513e0664cf28d04ae0002215,,[],multimedia,2012 Tony Nominations: Musicals,"Theater,Tony Awards (Theater Awards)",Photos of shows in the best musical and best r...,Theater / Tony Awards,,,2012-05-01T00:00:00Z,Theater,,Photos of shows in the best musical and best r...,The New York Times,Tony Awards,Slideshow,https://www.nytimes.com/slideshow/2012/05/01/t...,11


In [21]:
#H2a: Count of numbers (a digit run plus any comma-separated groups counts as one number)
(
    test['headline']
    .str.count(pat='[0-9]+(,[0-9]+)*')
)

0      0
1      1
2      0
3      1
4      0
5      1
6      0
7      0
8      0
9      0
10     1
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
970    0
971    0
972    0
973    0
974    0
975    0
976    1
977    0
978    0
979    0
980    0
981    0
982    0
983    2
984    0
985    0
986    0
987    0
988    0
989    0
990    0
991    0
992    0
993    1
994    0
995    0
996    0
997    0
998    0
999    0
Name: headline, Length: 1000, dtype: int64

In [31]:
#H2b: Count of numbers starting headlines (digit runs in the first space-separated word)
(
    test['headline']
    .str.split(' ')
    .map(lambda words: words[0])
    .str.count(pat='[0-9]+')
)

0      0
655    0
656    0
657    0
658    0
659    0
660    0
661    0
662    0
663    0
664    0
665    0
666    0
667    0
668    0
669    0
670    0
671    0
672    0
673    0
674    0
675    0
676    0
677    0
678    0
679    0
680    0
681    0
654    0
682    0
      ..
318    0
319    0
320    0
321    0
322    0
338    0
324    0
325    0
323    0
999    0
328    0
329    0
336    0
330    0
335    0
331    0
332    0
334    0
333    0
747    1
774    1
326    1
861    1
327    1
941    1
944    1
573    1
149    1
424    1
87     1
Name: headline, Length: 1000, dtype: int64

In [68]:
#H3a: Count of interrogatives (WP, WRB) https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#Tag each headline with spaCy and count the tokens whose tag is exactly WP or WRB
test['headline'].map(lambda text: sum(tok.tag_ in ('WP', 'WRB') for tok in nlp(text)))

0      1
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     1
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     1
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
970    0
971    0
972    0
973    0
974    0
975    0
976    0
977    0
978    0
979    1
980    0
981    0
982    0
983    0
984    0
985    0
986    0
987    0
988    0
989    0
990    0
991    0
992    0
993    0
994    0
995    0
996    0
997    0
998    0
999    0
Name: headline, Length: 1000, dtype: int64

In [70]:
#H3b: Count of interrogatives starting headlines
#1 when the first token is tagged exactly WP or WRB (the same tags that H3a counts), otherwise 0.
#A headline without any tokens scores 0 instead of raising an IndexError.
test['headline'].map(nlp).map(lambda doc: int(len(doc) > 0 and doc[0].tag_ in ('WP', 'WRB')))

0      1
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     1
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     1
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
970    0
971    0
972    0
973    0
974    0
975    0
976    0
977    0
978    0
979    0
980    0
981    0
982    0
983    0
984    0
985    0
986    0
987    0
988    0
989    0
990    0
991    0
992    0
993    0
994    0
995    0
996    0
997    0
998    0
999    0
Name: headline, Length: 1000, dtype: int64

In [78]:
#H4: Count of effective social media phrases
#Each entry is a regular expression fragment; [0-9]+ stands for any number
phrases = [
    'will make you',
    'this is why',
    'can we guess',
    'only [0-9]+ in',
    'the reason is',
    'are freaking out',
    '[0-9]+ stunning photos',
    'tears of joy',
    'is what happens',
    'make you cry',
    'give you goosebumps',
    'talking about it',
    'is too cute',
    'shocked to see',
    'melt your heart',
    '[0-9]+ things only',
    'can\'t stop laughing',
    'top [0-9]+ songs',
    'twitter reacts to',
    'what happened next',
]
#One alternation that matches if any of the phrases occurs
pattern = '|'.join(phrases)

#Case-insensitive flag per headline: True when at least one phrase is present
test['headline'].str.contains(pat=pattern, case=False)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
970    False
971    False
972    False
973    False
974    False
975    False
976    False
977    False
978    False
979    False
980    False
981    False
982    False
983    False
984    False
985    False
986    False
987    False
988    False
989    False
990    False
991    False
992    False
993    False
994    False
995    False
996    False
997    False
998    False
999    False
Name: headline, Length: 1000, dtype: bool

In [88]:
#H6: Average word length (characters per space-separated word)
(
    test['headline']
    .str.split(' ')
    .map(lambda words: sum(map(len, words)) / len(words))
)

0       5.333333
1       3.400000
2       5.250000
3       6.142857
4       3.333333
5       4.200000
6       7.333333
7       5.000000
8       4.545455
9       5.111111
10      3.500000
11      3.857143
12      6.000000
13      5.166667
14      4.700000
15      8.600000
16      3.750000
17     15.000000
18      5.222222
19      5.500000
20      5.100000
21      5.375000
22      3.571429
23     12.000000
24      6.571429
25      5.000000
26      4.000000
27      5.571429
28      4.500000
29      7.000000
         ...    
970     5.142857
971     5.000000
972     4.333333
973     5.500000
974     4.625000
975     5.500000
976     5.777778
977     4.000000
978     5.272727
979     5.875000
980     7.000000
981     4.714286
982     5.666667
983     4.857143
984     4.555556
985     5.285714
986     7.800000
987     6.333333
988     5.428571
989     7.000000
990     6.200000
991     5.666667
992    11.000000
993     5.200000
994     3.571429
995     4.888889
996     4.400000
997     6.2500

In [90]:
#H7a: Count of personal/possessive pronouns (PRP, PRP$) https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#Tag each headline with spaCy and count the tokens whose tag is exactly PRP or PRP$
test['headline'].map(lambda text: sum(tok.tag_ in ('PRP', 'PRP$') for tok in nlp(text)))

0      1
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     1
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     1
27     0
28     0
29     0
      ..
970    0
971    0
972    0
973    0
974    0
975    0
976    0
977    0
978    0
979    0
980    0
981    0
982    0
983    0
984    0
985    0
986    1
987    0
988    0
989    0
990    0
991    0
992    0
993    0
994    0
995    0
996    0
997    0
998    0
999    0
Name: headline, Length: 1000, dtype: int64

In [91]:
#H7b: Count of personal/possessive pronouns starting headlines
#1 when the first token is tagged exactly PRP or PRP$, otherwise 0. The tags are compared as plain
#strings because an unescaped $ in a regular expression is an end-of-string anchor, not a literal.
#A headline without any tokens scores 0 instead of raising an IndexError.
test['headline'].map(nlp).map(lambda doc: int(len(doc) > 0 and doc[0].tag_ in ('PRP', 'PRP$')))

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
970    0
971    0
972    0
973    0
974    0
975    0
976    0
977    0
978    0
979    0
980    0
981    0
982    0
983    0
984    0
985    0
986    0
987    0
988    0
989    0
990    0
991    0
992    0
993    0
994    0
995    0
996    0
997    0
998    0
999    0
Name: headline, Length: 1000, dtype: int64

In [38]:
#H8: Count of uncommon words


In [39]:
#H9: Count of proper nouns


In [None]:
#H10: Count of active verbs


In [None]:
#H5: Emotional intensity


In [None]:
#Word count


In [None]:
#Character count
