In [68]:
# Modules
import pandas as pd
import requests
import matplotlib as plt
import json
import time
from datetime import date
from dateutil.relativedelta import relativedelta

In [69]:
# API Key
from keys import NY_Times_API_KEY

In [70]:
# Request Settings
from settings import startDate, endDate

In [71]:
# Base URLs

NY_Times_ARCHIVE_BASE_URL = 'https://api.nytimes.com/svc/archive/v1/'#YYYY/MM.json?api-key=
NY_Times_ARTICLE_SEARCH_BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?q='

Hypothesis: Articles released by the NY Times influence the performance of stock symbols.
Null Hypothesis: Articles released by the NY Times have no correlation with the performance of stock symbols.

In [72]:
def request_articles(date):
    base = NY_Times_ARCHIVE_BASE_URL
    url = f'{base}{date.year}/{date.month}.json?api-key={NY_Times_API_KEY}'
    try:
        req = requests.get(url).json()
    except Exception as e:
        return (e,'')
    time.sleep(5)
    return req

In [73]:
def has_headline(article):
    article_has_headline = type(article[ 'headline']) == dict and 'main' in article['headline'].keys()
    return article_has_headline

In [74]:
def first_of_months_in_date_range(startDate=date.today(),endDate=date.today()):
    months_in_range = [x.split(' ') for x in pd.date_range(startDate, endDate, freq='MS').strftime("%Y %m %d").tolist()]
    return months_in_range

In [75]:
article_date_range = first_of_months_in_date_range(startDate=startDate,endDate=endDate)


In [76]:
articles = { # Column headers for dataframe
    'id':[],
    'date':[],
    'url':[],
    'headline':[],
    'abstract':[],
    'leadParagraph':[],
    'keywords':[],
    'author':[]
}

In [77]:
y = 1
b = tuple()
for d in article_date_range: # requests articles for every month in the range of startDate to endDate variables defined above. Filters by has_headline
    year, month, day  = int(d[0]), int(d[1]), int(d[2])
    article = request_articles(date(year,month,day))['response']['docs']
    if type(article) == type(b): # If article is returned as a tuple, its because request_articles ran into an error. (errorMessage,json)
        print(f'REQUEST ERROR: {article[0]}')
    else:
        for i in range(1,len(article)):
            # try:
            # articles['index'].append(y)
            articles['id'].append(article[i]['_id'])
            articles['date'].append(article[i]['pub_date'])
            articles['url'].append(article[i]['web_url'])
            articles['headline'].append(article[i]['headline'])
            articles['abstract'].append(article[i]['abstract'])
            articles['leadParagraph'].append(article[i]['lead_paragraph'])
            articles['keywords'].append(article[i]['keywords'])
            articles['author'].append(article[i]['byline'])
                # except Exception as e:
                #     print(f'Exception when assigning data to articles dict: {e}')
            y += 1

In [78]:
# a['response']['docs'][2]

In [79]:
articles

{'id': ['nyt://article/8f633850-09ab-53ff-8753-bc72eaba01a2',
  'nyt://article/35e85454-abfc-5376-b4b5-7d99316d037a',
  'nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef0f27',
  'nyt://article/d75205a5-201f-5dc3-b8fe-278c3d645537',
  'nyt://article/5107fda2-e2c7-522b-ab92-239eeba4c06b',
  'nyt://article/9566517e-b7eb-55db-bb31-05c1bdab2d2c',
  'nyt://article/7eb8fed1-bd1f-5a31-a0c0-391e888e7a53',
  'nyt://article/1af40531-68dc-5743-a7d8-b0a4a369c41d',
  'nyt://article/57a7af4b-6fff-5b3c-9486-eee0fc484cce',
  'nyt://article/f378c8da-3054-5a13-a615-bd71dae72771',
  'nyt://article/c29df2db-63c0-5c92-b646-b4f6a09c0f78',
  'nyt://article/a62f5895-53b6-560a-b388-7cc9c9390d0a',
  'nyt://article/19223e43-e196-58c8-b57b-7be7c94225b2',
  'nyt://article/bf7ed6a9-1868-5763-99d9-3e96de49bd7e',
  'nyt://article/0b526ba8-2ce8-544f-a59d-3092fe81ac04',
  'nyt://article/f9171427-eed3-5031-88fa-f3ba050a5cf3',
  'nyt://article/1cd8a80e-034a-572b-b25f-1a8283a03ae0',
  'nyt://interactive/22b2316e-59ad-5af2-a5

In [80]:
allArticlesDataFrame = pd.DataFrame(data=articles)
allArticlesDataFrame.head()

Unnamed: 0,id,date,url,headline,abstract,leadParagraph,keywords,author
0,nyt://article/8f633850-09ab-53ff-8753-bc72eaba...,2021-01-01T00:16:53+0000,https://www.nytimes.com/2020/12/31/us/george-f...,{'main': 'Minneapolis Police Release Body Came...,The video shows a man raising something to his...,The Minneapolis Police Department released bod...,"[{'name': 'persons', 'value': 'Idd, Dolal B', ...","{'original': 'By Nicholas Bogel-Burroughs', 'p..."
1,nyt://article/35e85454-abfc-5376-b4b5-7d99316d...,2021-01-01T00:58:19+0000,https://www.nytimes.com/2020/12/31/us/resolvin...,{'main': 'Resolving to live a lot better than ...,"Every December since 2017, Ada Rojas has guide...","Every December since 2017, Ada Rojas has guide...",[],"{'original': 'By Concepción de León', 'person'..."
2,nyt://article/5fc45bbd-d622-56da-875c-b5b7c4ef...,2021-01-01T01:24:55+0000,https://www.nytimes.com/2020/12/31/us/politics...,{'main': 'Justice Dept. Asks Judge to Toss Ele...,"The suit, led by Representative Louie Gohmert ...",[Here’s what you need to know about President-...,"[{'name': 'organizations', 'value': 'Justice D...",{'original': 'By Maggie Haberman and Katie Ben...
3,nyt://article/d75205a5-201f-5dc3-b8fe-278c3d64...,2021-01-01T01:28:22+0000,https://www.nytimes.com/2020/12/31/world/the-u...,"{'main': 'The U.S. reaches 20 million cases.',...",The United States recorded its 20 millionth ca...,The United States recorded its 20 millionth ca...,"[{'name': 'subject', 'value': 'internal-essent...","{'original': 'By Kate Taylor', 'person': [{'fi..."
4,nyt://article/5107fda2-e2c7-522b-ab92-239eeba4...,2021-01-01T03:00:05+0000,https://www.nytimes.com/2020/12/31/crosswords/...,"{'main': 'Party Hearty', 'kicker': 'Wordplay, ...",Milo Beckman hides some pleasant surprises in ...,FRIDAY PUZZLE — I hope people have a lot of fu...,"[{'name': 'subject', 'value': 'Crossword Puzzl...","{'original': 'By Caitlin Lovinger', 'person': ..."


In [81]:
_ = allArticlesDataFrame.to_csv('articles.csv')

In [82]:
allArticlesDataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16044 entries, 0 to 16043
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             16044 non-null  object
 1   date           16044 non-null  object
 2   url            16044 non-null  object
 3   headline       16044 non-null  object
 4   abstract       16044 non-null  object
 5   leadParagraph  16044 non-null  object
 6   keywords       16044 non-null  object
 7   author         16044 non-null  object
dtypes: object(8)
memory usage: 1002.9+ KB
