In [1]:
#
#   Overreaction Data Code
#   Beta Zero Capital
#   Strategy Team Overreaction
#    
#   C. Delahanty, H. Fu, J. Kurlander
# 
#
#   April, 2020
#

In [2]:
#
#   As a preliminary to NLP strategy development,
#   we've created this script to generate an excel / csv
#   file relating news (a stimulus) and equity
#   pricing information pre- and post-stimulus
#
#   The goal is to support both our team and group-wide NLP strategies
#
#   Output will currently take the format of:
#   Ticker, DateOfDrop, PercentSizeOfDropFromRecentHigh, NewsPriorToDrop(1), ...
#   NewsPriorToDrop(2), NewsPriorToDrop(3), Price5minFromLow, Price30minFromLow, ...
#   Price1hFromLow, Price4hFromLow, Price1DFromLow, Price3DFromLow
#
#   Time increments that are during close are treated as the most recent price
#   since market close. 

In [3]:
#   ONGOING COMMENTS HERE
#   
#   Source price data from yfinance at minutely scale for drops / highs
#   Source news data from https://stocknewsapi.com/ (?) - free trial for 1 month
#   
#   A recent high is the most recent max within 24 hours (notice, if a price is monotonically decreasing
#   this could be extremely far back - thus 24hour arbitrary cutoff)
#
#   To avoid output clutter, we limit search for prices with at least a 5% drop.
#   
#   Should we record both > 5% price drops AND increases? Might be useful to verify the model
#
#   Finally, should have market-normalized performance. We should also record general market data for the 
#   fields mentioned above, and potentially duplicate fields after having subtracted out the markets movement
#   in that time period (so we don't consider a stock simply when general market favorability drops)
#

In [89]:
import pandas as pd
import yfinance as yf # reading minutely data
import requests # reading news data from ContextualWeb.io
import datetime as dt
import numpy as np

In [184]:
#   MODEL PARAMETERS
#
#   Every tunable value lives in this one cell so that the cells
#   below can stay as general as possible.
#

start_date = "2018-05-30"       # first day of daily price history to request
end_date = dt.date.today()      # request history up to the day the notebook is run
market = "SPY"                  # ticker downloaded as the market benchmark
drop = 0.05                     # minimum absolute fractional move that flags a date
n_headlines = 3                 # maximum number of headlines kept per flagged date
normalized = True               # subtract the beta-scaled market move from the stock move
dayOf = True                    # when looking for news, do we get news the day of the drop or the day after

In [185]:
#   IMPORTANT USAGE
#
#   API is available online, but some important usage:
#
#   yf.download(tickers = [array or string separated by " " of tickers], start = <date to start>, end = <date to end>, 
#       interval = <how often>)

In [186]:
# ticker universe: the NASDAQ and NYSE listing files stacked into one frame
nasdaq, nyse = (pd.read_csv(listing) for listing in ("nasdaq.csv", "nyse.csv"))
tickers = pd.concat([nasdaq, nyse])

# news API credentials: the "auth" column holds the host in row 0 and the key in row 1
auth = pd.read_csv("auth.csv")
host, apikey = auth["auth"][0], auth["auth"][1]
url = "https://contextualwebsearch-websearch-v1.p.rapidapi.com/api/Search/NewsSearchAPI"
headers = {"x-rapidapi-host": host, "x-rapidapi-key": apikey}

In [187]:
# given a ticker, download its price history (drop detection is still to be written)
def get_drops_from_ticker(ticker):
    """Download the price history for ``ticker`` via ``yf.download``.

    NOTE(review): the function name and the original comment say this should
    return every drop together with its date, but no drop detection exists
    yet; for now the downloaded data is returned unchanged.

    Parameters
    ----------
    ticker : str or list
        Symbol(s) forwarded to ``yf.download`` as its ``tickers`` argument.

    Returns
    -------
    The object returned by ``yf.download`` for the requested symbol(s).
    """
    # use the argument, not the notebook-level `tickers` listing frame
    data = yf.download(tickers=ticker)
    return data

In [188]:
# example yfinance pull: daily opening prices for the stock and for the market benchmark
sdata = yf.download(tickers="AAPL", start=start_date, end=end_date, interval="1d")["Open"]
mdata = yf.download(tickers=market, start=start_date, end=end_date, interval="1d")["Open"]
# beta as reported in the ticker's info dictionary
beta = yf.Ticker("AAPL").info["beta"]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [189]:
# peek at the first rows of the daily opening-price series
sdata.head(5)

Date
2018-05-30    187.720001
2018-05-31    187.220001
2018-06-01    187.990005
2018-06-04    191.639999
2018-06-05    193.070007
Name: Open, dtype: float64

In [190]:
# row-over-row change in the opening price
sdata.diff(periods=1)

Date
2018-05-30         NaN
2018-05-31   -0.500000
2018-06-01    0.770004
2018-06-04    3.649994
2018-06-05    1.430008
                ...   
2020-04-21   -1.670013
2020-04-22   -2.670013
2020-04-23    2.260010
2020-04-24    1.330017
2020-04-27    4.599976
Name: Open, Length: 481, dtype: float64

In [191]:
# fractional row-over-row move of the stock's opening price ...
sperdiff = sdata.diff() / sdata.shift()
if normalized:
    # ... less the market's fractional move scaled by the stock's beta
    sperdiff = sperdiff - (mdata.diff() / mdata.shift()) * beta

In [192]:
# dates of the critical moves: rows whose absolute move exceeds `drop`
# (with dayOf set, the row *preceding* each large move is selected instead,
# because shift(-1) lines every row up with the following row's move)
move_sizes = sperdiff.shift(-1).abs() if dayOf else sperdiff.abs()
critdates = pd.to_datetime(sperdiff[move_sizes > drop].index).date
# the calendar day after each critical date
ncritdates = pd.to_datetime(critdates + pd.DateOffset(1)).date

In [208]:
# opening price on the first critical date
sdata[critdates[0]]

219.0500030517578

In [194]:
# returns two arrays, one with headings and one with body
def parseNewsResults(newsData, size, limit=None):
    """Split the leading news items into parallel title and body lists.

    Parameters
    ----------
    newsData : sequence of dict
        News items; every item that is read must have 'title' and 'body' keys.
    size : int
        Number of results the caller reports as available. This may be
        larger than ``len(newsData)`` (e.g. a total count across all pages).
    limit : int, optional
        Maximum number of items to keep. Defaults to the notebook-level
        ``n_headlines`` setting.

    Returns
    -------
    tuple of (list, list)
        ``(titles, bodies)``, both of the same length and in input order.
    """
    if limit is None:
        limit = n_headlines
    # Never read past the items actually present: `size` alone is not a safe
    # bound when the reported count exceeds the number of items supplied.
    count = max(0, min(limit, size, len(newsData)))
    selected = newsData[:count]
    titles = [item['title'] for item in selected]
    bodies = [item['body'] for item in selected]
    return titles, bodies

In [217]:
# declaring an output pandas dataframe
# NOTE(review): prototype of the row-building step. `i`, `p_open`, `p_1h`, `p_2h`,
# `p_4h`, `p_1d`, `p_2d`, `titles` and `bodies` are not defined in this cell and
# must already exist in the kernel for it to run.
output_df = pd.DataFrame()  # instantiate: the bare class `pd.DataFrame` is not a frame
d = {
    'ticker': "AAPL",
    'dropDate': critdates[i],
    'pOpen': p_open,
    'p1h': p_1h,
    'p2h': p_2h,
    'p4h': p_4h,
    'p1d': p_1d,
    'p2d': p_2d,
    'headlines': titles,
    'bodies': bodies 
}
# scalar entries are broadcast across one row per headline
temp = pd.DataFrame(data=d)
# append the new rows to the output rather than concatenating `temp` with itself
output_df = pd.concat([output_df, temp])

In [246]:
# NOTE(review): this lookup fails with the IndexError shown below: critdates
# holds only 3 entries, so the valid positions are 0-2
sdata[critdates[3]]

IndexError: index 3 is out of bounds for axis 0 with size 3

In [247]:
# start from an empty output frame
output_df = pd.DataFrame()

In [248]:
# iterating through the critical dates
ticker = "AAPL"  # symbol used for the news query, the hourly download and the output rows


def _price_at(series, position):
    """Return the value at integer ``position`` of ``series``, or NaN when it has too few rows."""
    # .iloc is explicitly positional; plain [] on a date-indexed Series is ambiguous
    return series.iloc[position] if position < len(series) else np.nan


frames = []  # one small frame per critical date, concatenated once after the loop
for i, (drop_date, next_date) in enumerate(zip(critdates, ncritdates)):
    # use news API
    querystring = {
        "fromPublishedDate": drop_date,
        "toPublishedDate": next_date,
        "autoCorrect": "false",
        "pageNumber": "1",
        "pageSize": "10",
        "q": ticker,
        "safeSearch": "false"}
    # a timeout keeps one stalled request from hanging the whole run
    response = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
    # fail loudly on an HTTP error instead of on a missing key further down
    response.raise_for_status()
    # getting news data (decode the body once)
    payload = response.json()
    news_items = payload['value']
    # 'totalCount' can exceed the number of items on this page, so cap it
    titles, bodies = parseNewsResults(news_items, min(payload['totalCount'], len(news_items)))
    # get hourly data in that range
    sdata_h = yf.download(tickers=ticker, interval="1h", start=next_date, end=next_date + dt.timedelta(5))['Open']

    # TODO: if dayOf is false, then need to move this date back one day
    p_prior = sdata[drop_date]

    # positions count hourly bars from the start of the download window;
    # bars that are not available are recorded as NaN
    p_open = _price_at(sdata_h, 0)
    p_1h = _price_at(sdata_h, 1)
    p_2h = _price_at(sdata_h, 2)
    p_4h = _price_at(sdata_h, 4)
    p_1d = _price_at(sdata_h, 7)  # 7 hours in trade day
    p_2d = _price_at(sdata_h, 14)

    d = {
        'ticker': ticker,
        'dropDate': drop_date,
        'pPrior': p_prior,
        'pOpen': p_open,
        'p1h': p_1h,
        'p2h': p_2h,
        'p4h': p_4h,
        'p1d': p_1d,
        'p2d': p_2d,
        'headlines': titles,
        'bodies': bodies
    }
    # creating temporary dataframe (scalars are broadcast across one row per headline)
    temp = pd.DataFrame(data=d)
    frames.append(temp)

# single concatenation: same rows and index as appending inside the loop, without the repeated copying
output_df = pd.concat([output_df, *frames])
    

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [249]:
# display the collected rows (one row per headline for each critical date)
output_df

Unnamed: 0,ticker,dropDate,pPrior,pOpen,p1h,p2h,p4h,p1d,p2d,headlines,bodies
0,AAPL,2018-11-01,219.050003,209.699997,207.470001,207.259995,206.419998,204.25,201.869995,Apple Inc. (<b>AAPL</b>) Is Down Following Q4 ...,Apple Inc. (AAPL) reported fourth quarter EPS ...
1,AAPL,2018-11-01,219.050003,209.699997,207.470001,207.259995,206.419998,204.25,201.869995,Apple Earnings: <b>AAPL</b> Stock Slides on Q4...,Apple earnings (AAPL) is sliding after hours e...
2,AAPL,2018-11-01,219.050003,209.699997,207.470001,207.259995,206.419998,204.25,201.869995,Apple down 7% on soft guidance,)\ndrops 6.8%\non Q4 results that beat EPS and...
0,AAPL,2019-01-02,154.889999,144.039993,143.949997,144.377502,144.229996,144.419998,148.689896,Apple Inc. (<b>AAPL</b>) Is Sharply Lower Afte...,Apple Inc. (AAPL) announced after the bell Wed...
1,AAPL,2019-01-02,154.889999,144.039993,143.949997,144.377502,144.229996,144.419998,148.689896,Apple Announces Revenue Is Unraveling,Source: hocus-focus / Getty ImagesInvestors wh...
2,AAPL,2019-01-02,154.889999,144.039993,143.949997,144.377502,144.229996,144.419998,148.689896,Apple has cut its Q1 revenue guidance,One for the AAPL traders\r\n- Apple cuts Q1 re...
0,AAPL,2020-02-28,257.26001,283.380005,281.304993,289.394012,288.450012,303.630005,296.700012,"Apple's Cook says virus issues 'temporary,' pr...",AAPL\n-0.7%\n) chief Tim Cook isn't sure wheth...
1,AAPL,2020-02-28,257.26001,283.380005,281.304993,289.394012,288.450012,303.630005,296.700012,"<b>AAPL</b>, MSFT Stock Down 7%, Coronavirus F...",Apple Inc\n(NASDAQ: AAPL) was down by 3.48% in...
2,AAPL,2020-02-28,257.26001,283.380005,281.304993,289.394012,288.450012,303.630005,296.700012,"Airline, Hotel, Casino Stocks Likely To Be In ...",What the market has been doing this week is re...
