In [2]:
import datetime
import time
import requests
import pandas as pd
import spacy
from sqlalchemy import create_engine

nlp = spacy.load('en_core_web_sm')

# Archive.org crawling

In [None]:
#https://stackoverflow.com/questions/28154066/how-to-convert-datetime-to-integer-in-python
def to_integer(dt_time):
    '''Encode date as monthyear integer, to match archive.org timestamp format'''
    return 10000*dt_time.year + 100*dt_time.month + dt_time.day

In [None]:
#https://stackoverflow.com/questions/993358/creating-a-range-of-dates-in-python
#Build date range of 1500 days back from 12/30/2017
start = datetime.datetime(2017, 12, 30)
dates = [start - datetime.timedelta(days=x) for x in range(0, 1500)]

In [None]:
#Convert to integer encoding
intdates = [to_integer(date) for date in dates]

In [None]:
#Capture the point at which A/B testing started
not_testing = []
for i in intdates:
    #Get a snapshot for a given day
    r = requests.get('http://archive.org/wayback/available?url=nytimes.com&timestamp={}'.format(str(i)))
    #Pull the URL for that snapshot from the metadata the API gives us
    arc = r.json()['archived_snapshots']['closest']['url']
    #Scrape the HTML of the snapshot
    r2 = requests.get(arc)
    #Check if the Optimizely script is present. If it is, wait and go to the next day. If not, add it to the list.
    if 'optimizely.com' not in r2.text:
        print(arc)
        not_testing.append(arc)
    else:
        print(i)
        time.sleep(2)

I'm looking for the point where a bunch of URLs show up in the `not_testing` list in a row, then calling the day after the latest date in that run the start point for testing. The latest URL in this run was `http://web.archive.org/web/20150805230843/http://www.nytimes.com:80/`, so testing would have started on August 6, 2015.

# NYT headlines

I'm using the NYT archive API to pull headline data by month. Format: `http://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={your-api-key}`

In [None]:
key = %env KEY

def query_range(startmo, startyr, endmo, endyr):
    '''Build a range of [month, year] pairs to use for querying the archive API'''
    startmos = [[i, startyr] for i in range(startmo, 13)]
    middle = [[j, i] for i in range(startyr, endyr+1) if i != startyr and i != endyr for j in range(1, 13)]
    endmos = [[i, endyr] for i in range(1, endmo+1)]
    allmos = [i for sub in [startmos, middle, endmos] for i in sub]
    return allmos

def cleaning(entry):
    '''Clean the response of the archive API so that data can be uniformly written into a PostgreSQL table'''
    #Check for a print headline
    try:
        entry['print_headline'] = entry['headline']['print_headline']
    except Exception:
        pass
    #Check for a headline
    try:
        entry['headline'] = entry['headline']['main']
    except Exception:
        entry['headline'] = str(entry['headline'])
    #Check for a byline object
    try:
        entry['byline'] = str(entry['byline'])
    except Exception:
        entry['byline'] = ''
    #Turn keywords - returned as a list of objects - into a comma-separated string
    entry['keywords'] = ','.join(i['value'] for i in entry['keywords'])
    #Resolve change over time from 'news_desk' to 'new_desk'
    if 'new_desk' in entry:
        entry['news_desk'] = entry['new_desk']
        entry.pop('new_desk', None)
    #Remove multimedia metadata
    entry.pop('multimedia', None)
    #Remove largely empty blog field
    entry.pop('blog', None)
    #Remove score - unclear purpose
    entry.pop('score', None)
    #Remove inconsistently applied URI
    entry.pop('uri', None)
    return entry

In [None]:
#Database configuration
user = %env USER
password = %env PASSWORD
db = %env DATABASE
engine = create_engine('postgresql://{}:{}@localhost:5432/{}'.format(user, password, db))

In [None]:
#Generate range of months from mid-2012 to last month
morange = query_range(5, 2012, 9, 2018)

for i in morange:
    #Keep track of current month
    print(i)
    #Query archive API for current month
    h = requests.get('http://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'.format(i[1], i[0], key))
    #Clean results
    items = h.json()['response']['docs']
    cleaned = [cleaning(i) for i in items]
    #Convert results to DataFrame
    cdf = pd.DataFrame(cleaned)
    #Write DataFrame to PostgreSQL table
    cdf.to_sql('results', engine, if_exists = 'append')

# Generating variables

In [4]:
#https://github.com/pandas-dev/pandas/issues/12265#issuecomment-181838631
#Database configuration


user = %env USER
password = %env PASSWORD
db = %env DATABASE
engine = create_engine('postgresql://{}:{}@localhost:5432/{}'.format(user, password, db), execution_options=dict(stream_results=True))

  """)


In [5]:
#Get sample of data
for table in pd.read_sql_query('SELECT * FROM results', engine, chunksize=1000):
    test = table
    break

In [None]:
#H1: Count quotation marks
test['headline'].str.count('\'|"')

In [None]:
#H2a: Count of numbers
test['headline'].str.count('[0-9]+(,[0-9]+)*')

In [None]:
#H2b: Count of numbers starting headlines
test['headline'].str.split(' ').map(lambda x: x[0]).str.count('[0-9]+')

In [None]:
#H3a: Count of interrogatives (WP, WRB) https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
test['headline'].map(lambda x: nlp(x)).map(lambda x: [i.tag_ for i in x]).map(lambda x: x.count('WP') + x.count('WRB'))

In [None]:
#H3b: Count of interrogatives starting headlines
test['headline'].map(lambda x: nlp(x)).map(lambda x: [i.tag_ for i in x][0]).str.count('WP|WRB')

In [None]:
#H4: Count of effective social media phrases https://buzzsumo.com/blog/most-shared-headlines-study/#gs.E5zXvW8
phrases = ['will make you', 
           'this is why', 
           'can we guess', 
           'only [0-9]+ in',
           'the reason is',
           'are freaking out',
           '[0-9]+ stunning photos',
           'tears of joy',
           'is what happens',
           'make you cry',
           'give you goosebumps',
           'talking about it',
           'is too cute',
           'shocked to see',
           'melt your heart',
           '[0-9]+ things only',
           'can\'t stop laughing',
           'top [0-9]+ songs',
           'twitter reacts to',
           'what happened next']
pattern = '|'.join(phrases)

test['headline'].str.contains(pattern, case = False)

In [None]:
#H6: Average word length
test['headline'].str.split(' ').map(lambda x: sum([len(i) for i in x])/len(x))

In [None]:
#H7a: Count of personal/possessive nouns (PRP, PRP$) https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
test['headline'].map(lambda x: nlp(x)).map(lambda x: [i.tag_ for i in x]).map(lambda x: x.count('PRP') + x.count('PRP$'))

In [None]:
#H7b: Count of personal/possessive nouns starting headlines
test['headline'].map(lambda x: nlp(x)).map(lambda x: [i.tag_ for i in x][0]).str.count('PRP|PRP$')

In [13]:
#Word count
test['headline'].map(lambda x: len(x.split(' ')))

0      10
1       8
2       5
3       8
4       7
5       6
6       7
7      13
8       5
9       8
10      3
11      6
12      9
13      7
14      5
15      6
16      9
17      6
18      7
19     12
20      8
21      5
22      8
23      5
24      6
25      6
26      6
27     11
28      7
29      7
       ..
970     3
971     5
972     6
973     5
974     4
975     6
976     4
977     7
978     7
979     9
980     8
981     4
982     9
983     3
984     4
985     7
986     9
987     4
988     7
989     6
990     3
991     6
992     7
993     3
994     3
995     2
996     5
997     5
998     3
999     3
Name: headline, Length: 1000, dtype: int64

In [14]:
#Character count
test['headline'].map(lambda x: len(x))

0      67
1      54
2      33
3      37
4      47
5      31
6      42
7      72
8      37
9      53
10     25
11     35
12     55
13     39
14     31
15     34
16     60
17     40
18     36
19     54
20     59
21     36
22     45
23     34
24     36
25     37
26     37
27     76
28     35
29     24
       ..
970    22
971    35
972    33
973    33
974    22
975    40
976    10
977    37
978    41
979    42
980    43
981    19
982    45
983    17
984    27
985    37
986    48
987    22
988    28
989    39
990    19
991    32
992    30
993    17
994    25
995    11
996    24
997    22
998    20
999    17
Name: headline, Length: 1000, dtype: int64

## Follow up

In [63]:
#H8: Count of uncommon words
'''List of popular words taken from https://github.com/dolph/dictionary/blob/master/popular.txt
Disregards cardinal numbers (CD), foreign words (FW), possessive endings (POS), and symbols (SYM/$)'''
#This approach doesn't work very well. Entity analysis?
words = open('popular.txt').read().splitlines()
test['headline'].map(lambda x: nlp(x)) \
    .map(lambda x: [i for i in x if i.tag_ not in ['CD', 'FW', 'POS', 'SYM', '$', '``', ',', ':', 'HYPH', '.']]) \
    .map(lambda x: sum([1 if i.text in words else 0 for i in x]))

0      1
1      0
2      2
3      2
4      1
5      2
6      0
7      0
8      0
9      2
10     1
11     0
12     0
13     2
14     0
15     1
16     1
17     1
18     0
19     1
20     2
21     0
22     1
23     0
24     2
25     1
26     1
27     0
28     0
29     3
      ..
970    2
971    2
972    1
973    0
974    2
975    2
976    1
977    1
978    0
979    2
980    2
981    1
982    0
983    0
984    2
985    1
986    1
987    1
988    1
989    1
990    1
991    1
992    0
993    2
994    2
995    1
996    1
997    2
998    1
999    2
Name: headline, Length: 1000, dtype: int64

In [None]:
#H9: Count of proper nouns (NNP and NNPS) https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#Entity analysis?
test['headline'].map(lambda x: nlp(x)).map(lambda x: [i.tag_ for i in x]).map(lambda x: x.count('NNP') + x.count('NNPS'))

In [None]:
#H10: Count of active verbs


In [None]:
#H5: Emotional intensity
