In [44]:
# Modules
import pandas as pd
import requests
import matplotlib as plt
import json
import time
from datetime import date
from dateutil.relativedelta import relativedelta

In [45]:
# API Key
from keys import NY_Times_API_KEY

In [46]:
# Request Settings
from settings import startDate, endDate

In [47]:
# Base URLs for the NY Times API endpoints used in this notebook.

# Archive endpoint prefix; request_articles() completes it with
# '<year>/<month>.json?api-key=<key>'.
NY_Times_ARCHIVE_BASE_URL = 'https://api.nytimes.com/svc/archive/v1/'  # + YYYY/MM.json?api-key=<key>
# Article Search endpoint prefix; ends with 'q=' so a query term can be appended directly.
NY_Times_ARTICLE_SEARCH_BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?q='

Hypothesis: Articles published by the NY Times influence the performance of stocks (identified by their ticker symbols).
Null Hypothesis: Articles published by the NY Times have no measurable relationship with the performance of those stocks.

In [48]:
def request_articles(date, delay=5, timeout=30):
    """Fetch one month of articles from the NY Times Archive endpoint.

    Parameters
    ----------
    date : object with ``year`` and ``month`` attributes (e.g. ``datetime.date``)
        Selects the month to download; the day is ignored.
    delay : float, default 5
        Seconds to pause after the request (successful or not) so that
        consecutive calls are spaced out.
    timeout : float, default 30
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict or tuple
        The decoded JSON payload on success. On failure, the tuple
        ``(exception, '')`` -- callers must check for a tuple before
        indexing into the result.
    """
    url = f'{NY_Times_ARCHIVE_BASE_URL}{date.year}/{date.month}.json'
    try:
        response = requests.get(
            url,
            params={'api-key': NY_Times_API_KEY},
            timeout=timeout,
        )
        # Turn HTTP error statuses (rate limiting, bad key, ...) into
        # exceptions so they are reported as failures rather than being
        # returned as if they were a valid article payload.
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        return (e, '')
    finally:
        # Pause on every path; a failed request still counts as a call.
        time.sleep(delay)
    return payload

In [49]:
def has_headline(article):
    """Return True when ``article`` carries a main headline.

    An article qualifies when its ``'headline'`` entry is a dict that
    contains the key ``'main'``. A missing ``'headline'`` key, a
    non-dict headline, or a non-dict ``article`` all yield False
    instead of raising.
    """
    if not isinstance(article, dict):
        return False
    headline = article.get('headline')
    return isinstance(headline, dict) and 'main' in headline

In [50]:
def first_of_months_in_date_range(startDate=None, endDate=None):
    """List the first day of every month touched by a date range.

    Parameters
    ----------
    startDate, endDate : date-like, optional
        Anything ``pd.Timestamp`` accepts (``datetime.date``, ISO string,
        ...). Each defaults to today's date, evaluated at call time.

    Returns
    -------
    list of [str, str, str]
        One ``[YYYY, MM, DD]`` entry (zero-padded strings, ``DD`` always
        ``'01'``) per month from the month containing ``startDate``
        through the month containing ``endDate``. Empty when
        ``startDate`` is after ``endDate``.
    """
    today = date.today()
    start = pd.Timestamp(today if startDate is None else startDate)
    end = pd.Timestamp(today if endDate is None else endDate)
    if start > end:
        return []
    # freq='MS' only yields month starts that fall inside [start, end], so a
    # mid-month start would silently drop its own month. Snap the start back
    # to the first of its month to keep that month in the result.
    first_month = start.normalize().replace(day=1)
    month_starts = pd.date_range(first_month, end, freq='MS')
    return [stamp.split(' ') for stamp in month_starts.strftime("%Y %m %d").tolist()]

In [51]:
# One [year, month, day] entry per month to request, derived from the
# startDate/endDate values imported from settings.
article_date_range = first_of_months_in_date_range(startDate, endDate)


In [52]:
# Column-oriented store for the collected articles: each key becomes a
# dataframe column and starts out with its own empty list.
articles = {
    column: []
    for column in (
        'id',
        'date',
        'url',
        'headline',
        'abstract',
        'leadParagraph',
        'keywords',
        'author',
    )
}

In [53]:
# Download every month in article_date_range and append each returned
# article to the column lists in `articles`.
#
# NOTE: this cell appends to `articles`; re-run the cell that creates
# `articles` first, otherwise a second run duplicates every row.
# NOTE(review): no has_headline() filtering is applied here -- every returned
# doc is stored. Add the filter explicitly if that is the intent.

# Maps each column in `articles` to the key it is read from in an API doc.
column_sources = {
    'id': '_id',
    'date': 'pub_date',
    'url': 'web_url',
    'headline': 'headline',
    'abstract': 'abstract',
    'leadParagraph': 'lead_paragraph',
    'keywords': 'keywords',
    'author': 'byline',
}

for d in article_date_range:
    year, month, day = (int(part) for part in d)
    result = request_articles(date(year, month, day))

    # request_articles signals failure by returning (exception, '') -- check
    # for that before indexing into the payload.
    if isinstance(result, tuple):
        print(f'REQUEST ERROR: {result[0]}')
        continue

    response = result.get('response') if isinstance(result, dict) else None
    docs = response.get('docs') if isinstance(response, dict) else None
    if not isinstance(docs, list):
        print(f'REQUEST ERROR: no article list in the response for {year}-{month:02d}')
        continue

    # Iterate over every doc (starting at the first one). .get() stores None
    # for a missing field so all columns keep the same length; no separate
    # running-index column is kept because a dataframe numbers its own rows.
    for doc in docs:
        for column, source_key in column_sources.items():
            articles[column].append(doc.get(source_key))

In [54]:
# a['response']['docs'][2]

In [57]:
# Show the current contents of `articles` as this cell's output.
articles

{'id': 'nyt://article/6a4fbc8d-a0fc-50b7-934a-b96fc4abf30b',
 'date': '2021-03-31T23:11:05+0000',
 'url': 'https://www.nytimes.com/2021/03/31/us/minnesota-supreme-court-rape-ruling.html',
 'headline': {'main': 'Minnesota Court Ruling Fuels Calls to Change Sexual Assault Law',
  'kicker': None,
  'content_kicker': None,
  'print_headline': '',
  'name': None,
  'seo': None,
  'sub': None},
 'abstract': 'The state Supreme Court tossed out a man’s conviction on a third-degree sexual conduct charge because the woman he was accused of assaulting was “voluntarily intoxicated” at the time.',
 'leadParagraph': 'On the night of May 13, 2017, a woman consumed five shots of vodka and a prescription narcotic before heading to the Dinkytown neighborhood of Minneapolis with a friend. After a bar turned them away, they met three men who invited them to a party.',
 'keywords': [{'name': 'subject',
   'value': 'Law and Legislation',
   'rank': 1,
   'major': 'N'},
  {'name': 'subject', 'value': 'State 

In [56]:
# Turn the collected columns into a dataframe and preview the first rows.
allArticlesDataFrame = pd.DataFrame(articles)
allArticlesDataFrame.head(n=5)

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.

In [None]:
# Write the collected articles to disk. to_csv() returns None when given a
# path, so there is nothing to capture; assigning to `_` would also shadow
# IPython's "last output" variable. The trailing semicolon keeps the cell
# output empty.
allArticlesDataFrame.to_csv('articles.csv');