In [68]:
# Modules
import pandas as pd
import requests
import time
from datetime import date

In [69]:
# API Key
from keys import NY_Times_API_KEY

In [70]:
# Request Settings
from settings import startDate, endDate

In [71]:
# Base URLs

NY_Times_ARCHIVE_BASE_URL = 'https://api.nytimes.com/svc/archive/v1/'#YYYY/MM.json?api-key=
NY_Times_ARTICLE_SEARCH_BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?q='

Hypothesis: Articles released by the NY Times influence the performance of stock symbols.
Null Hypothesis: Articles released by the NY Times have no correlation with the performance of stock symbols.

In [72]:
def request_articles(date):
    base = NY_Times_ARCHIVE_BASE_URL
    url = f'{base}{date.year}/{date.month}.json?api-key={NY_Times_API_KEY}'
    try:
        req = requests.get(url).json()
    except Exception as e:
        return (e,'')
    time.sleep(5)
    return req

In [73]:
def has_headline(article):
    article_has_headline = type(article[ 'headline']) == dict and 'main' in article['headline'].keys()
    return article_has_headline

In [74]:
def first_of_months_in_date_range(startDate=date.today(),endDate=date.today()):
    months_in_range = [x.split(' ') for x in pd.date_range(startDate, endDate, freq='MS').strftime("%Y %m %d").tolist()]
    return months_in_range

In [76]:
articlesDict = { # Column headers for dataframe
    'id':[],
    'date':[],
    'url':[],
    'headline':[],
    'abstract':[],
    'leadParagraph':[],
    'keywords':[],
    'author':[]
}

In [86]:
def fetch_articles_df(article_date_range,articles):
    b = tuple()
    for d in article_date_range: # requests articles for every month in the range of startDate to endDate variables defined above. Filters by has_headline
        year, month, day  = int(d[0]), int(d[1]), int(d[2])
        article = request_articles(date(year,month,day))['response']['docs']
        if type(article) == type(b): # If article is returned as a tuple, its because request_articles ran into an error. (errorMessage,json)
            print(f'REQUEST ERROR: {article[0]}')
        else:
            for i in range(1,len(article)):
                articles['id'].append(article[i]['_id'])
                articles['date'].append(article[i]['pub_date'])
                articles['url'].append(article[i]['web_url'])
                articles['headline'].append(article[i]['headline'])
                articles['abstract'].append(article[i]['abstract'])
                articles['leadParagraph'].append(article[i]['lead_paragraph'])
                articles['keywords'].append(article[i]['keywords'])
                articles['author'].append(article[i]['byline'])
    articlesDataframe = pd.DataFrame(data=articles)
    return articlesDataframe

In [84]:
article_date_range = first_of_months_in_date_range(startDate=startDate,endDate=endDate)
a = fetch_articles_df(article_date_range=article_date_range,articles=articlesDict)

In [81]:
_ = a.to_csv('articles.csv')

In [82]:
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16044 entries, 0 to 16043
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             16044 non-null  object
 1   date           16044 non-null  object
 2   url            16044 non-null  object
 3   headline       16044 non-null  object
 4   abstract       16044 non-null  object
 5   leadParagraph  16044 non-null  object
 6   keywords       16044 non-null  object
 7   author         16044 non-null  object
dtypes: object(8)
memory usage: 1002.9+ KB
