In [168]:
# Modules
import pandas as pd
import requests
import matplotlib as plt
import json
import time
from datetime import date
from dateutil.relativedelta import relativedelta

In [178]:
# API Keys
from keys import MARKET_STACK_API_KEY, NY_Times_API_KEY, NY_Times_API_SECRET_KEY

params = {
  'access_key': MARKET_STACK_API_KEY
}

In [170]:
# Base URLs
MARKET_STACK_BASE_URL = 'http://api.marketstack.com/v1/'
NY_Times_ARCHIVE_BASE_URL = 'https://api.nytimes.com/svc/archive/v1/'#YYYY/MM.json?api-key=
NY_Times_ARTICLE_SEARCH_BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?q='

Hypothesis: Articles released by the NY Times influence the performance of stock symbols.
Null Hypothesis: Articles released by the NY Times have no correlation with the performance of stock symbols.

In [171]:
def request_articles(date):
    base = NY_Times_ARCHIVE_BASE_URL
    url = f'{base}{date.year}/{date.month}.json?api-key={NY_Times_API_KEY}'
    try:
        req = requests.get(url).json()
    except Exception as e:
        return (e,'')
    time.sleep(5)
    return req

In [172]:
def has_headline(article):
    article_has_headline = type(article[ 'headline']) == dict and 'main' in article['headline'].keys()
    return article_has_headline

In [173]:
def first_of_months_in_date_range(startDate=date.today(),endDate=date.today()):
    months_in_range = [x.split(' ') for x in pd.date_range(startDate, endDate, freq='MS').strftime("%Y %m %d").tolist()]
    return months_in_range

In [204]:
def request_ticker(tickerSymbol,tickerDate=None,params=params):
    data = {
        'date':[],
        'symbol':[],
        'high':[],
        'low':[],
        'close':[],
        'volume':[],
        'dividend':[],
        'split-factor':[],
        'exchange':[]
    }
    try:
        api_result   = requests.get(f'http://api.marketstack.com/v1/tickers/{tickerSymbol}/eod/{tickerDate}', params) #/tickers/[symbol]/eod/[date] YYYY-MM-DD
    except Exception as e:
        print(f'API REQUEST FAILED: {api_result.status_code}')
        print(f'TICKER SYMBOL: {tickerSymbol} | DATE: {tickerDate}')
        return [e,api_result]
    stock_data = api_result.json()
    # return stock_data
    data['date']         = stock_data['date']
    data['tickerSymbol'] = stock_data['symbol']
    data['high']         = stock_data['high']
    data['low']          = stock_data['low']
    data['close']        = stock_data['close']
    data['volume']       = stock_data['volume']
    data['dividend']     = stock_data['dividend']
    data['split-factor'] = stock_data['split_factor']
    data['exchange']     = stock_data['exchange']
    return data

In [184]:
startDate = date(2021,1,1) #(YYYY, MM, DD)
endDate   = date(2021,3,1)
tickers = [
    'AAPL',
    'MSFT',
    'META'
]

In [175]:
date_range = first_of_months_in_date_range(startDate=startDate,endDate=endDate)

In [176]:
articles = { # Column headers for dataframe
    'id':[],
    'date':[],
    'url':[],
    'headline':[],
    'abstract':[],
    'leadParagraph':[],
    'keywords':[],
    'author':[]
}

In [177]:
y = 1
b = tuple()
for d in date_range: # requests articles for every month in the range of startDate to endDate variables defined above. Filters by has_headline
    year, month, day  = int(d[0]), int(d[1]), int(d[2])
    a = request_articles(date(year,month,day)) # Hands back either the request json 
    if type(a) == type(b):
        print(f'REQUEST ERROR: {a[0]}')
    else:
        for article in a:
            if has_headline(article):
                # totalColumns = r[1].keys()
                for i in range(1,len(article)):
                    articles['index']         = y
                    articles['id']            = article[i]['_id']
                    articles['date']          = article[i]['pub_date']
                    articles['url']           = article[i]['web_url']
                    articles['headline']      = article[i]['headline']
                    articles['abstract']      = article[i]['abstract']
                    articles['leadParagraph'] = article[i]['lead_paragraph']
                    articles['keywords']      = article[i]['keywords']
                    articles['author']        = article[i]['byline']
                y += 1
            else:
                print(f'NO HEADLINE: {article}')


Request Error: HTTPSConnectionPool(host='api.nytimes.com', port=443): Max retries exceeded with url: /svc/archive/v1/2021/1.json?api-key=16Yx1EgtPXYKbhzeDX8YHhBIqS5cA7dL (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001C81550A7A0>, 'Connection to api.nytimes.com timed out. (connect timeout=None)'))
Request Error: HTTPSConnectionPool(host='api.nytimes.com', port=443): Max retries exceeded with url: /svc/archive/v1/2021/2.json?api-key=16Yx1EgtPXYKbhzeDX8YHhBIqS5cA7dL (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001C81550BC10>, 'Connection to api.nytimes.com timed out. (connect timeout=None)'))


KeyboardInterrupt: 

In [None]:
allArticlesDataFrame = pd.DataFrame(data=articles,index=articles['date'])
articlesAndStocksDataFrame = allArticlesDataFrame
allArticlesDataFrame

In [203]:
tickerR = request_ticker(tickerSymbol='AAPL',tickerDate='2022-02-01')
tickerR

200


174.01