In [None]:
import pandas as pd
import numpy as np
import json
import urllib.request
import requests
import datetime
from datetime import datetime, timedelta,date
from bs4 import BeautifulSoup
import re
import dill
from ediblepickle import checkpoint
from retrying import retry
import os

import searchtweets
from searchtweets import ResultStream, gen_rule_payload, load_credentials
from updates import get_response, get_EOD_data, get_st_messages

#### 1. Get historical unemployment rates

In [None]:
# Scrape the BLS series page and keep the table at position 1 of the parsed
# tables, relabelling its unnamed first column as 'Year'.
bls_tables = pd.read_html('https://data.bls.gov/timeseries/LNU04000000')
df_UR = bls_tables[1].rename(columns={'Unnamed: 0': 'Year'})
df_UR

In [None]:
# Cache the scraped table on disk; the context manager closes the file handle
# even if serialisation fails.
with open('data/df_UR.pkd', 'wb') as pkd_file:
    dill.dump(df_UR, pkd_file)

In [None]:
# Reload the cached table instead of re-scraping.
# NOTE: dill/pickle files can execute code on load -- only open files this
# notebook produced.
with open('data/df_UR.pkd', 'rb') as pkd_file:
    df_UR = dill.load(pkd_file)

#### 2. Organize historical unemployment rates into dataframe

In [None]:
# Flatten the year-by-month table into one chronological series of monthly
# rates, plus a parallel series holding the year of every observation.
# pd.concat is used because Series.append was removed in pandas 2.0.
monthly_rows = []
year_rows = []
for row in range(len(df_UR)):
    record = df_UR.iloc[row]
    monthly_rows.append(record['Jan':])
    year_rows.append(pd.Series([int(record['Year'])] * 12))
rates = pd.concat(monthly_rows) if monthly_rows else pd.Series(dtype=float)
years = pd.concat(year_rows) if year_rows else pd.Series(dtype=int)

# Month-over-month percent change. Access is positional (.iloc) because the
# index of `rates` consists of repeated month labels, not integers.
changes = [
    round((rates.iloc[i] - rates.iloc[i - 1]) / rates.iloc[i - 1] * 100, 2)
    for i in range(1, len(rates))
]
# The first observation has no predecessor, so its change is recorded as 0.
changes.insert(0, 0)

In [None]:
# Long-format helper frame: one row per month with its label, rate and year.
df_temp = (
    pd.DataFrame(rates)
    .reset_index()
    .rename(columns={'index': 'Month', 0: 'Revised Rate'})
)
df_temp['Year'] = list(years)

# 'Period' is the month label followed by the year, e.g. 'Jan 2010': the two
# are glued together first and a space is then inserted after 3 characters.
period_glued = df_temp['Month'] + df_temp['Year'].apply(str)
df_UR_Releases = pd.DataFrame({
    'Period': period_glued.apply(lambda label: label[:3] + ' ' + label[3:]),
    'Revised Rate': df_temp['Revised Rate'],
    'Percent Change': list(changes),
})
# Discard rows containing missing values.
df_UR_Releases = df_UR_Releases.dropna()

#### 3. Add unemployment releases to data frame

###### 3.1 Retrieve links to text for each release

In [None]:
# Download the BLS index page of Employment Situation releases and parse it.
# The response is used as a context manager so the connection is closed.
with urllib.request.urlopen('https://www.bls.gov/bls/news-release/empsit.htm') as response:
    linkpage = BeautifulSoup(response.read(), 'lxml')
linkpage

In [None]:
# Collect absolute URLs of archived "Employment Situation" releases.
list_UR_links = []
for anchor in linkpage.find_all('a'):
    # .get() tolerates anchors without an href attribute, which would
    # otherwise raise KeyError.
    href = anchor.get('href', '')
    if 'Employment Situation' in anchor.text and '/news.release/archives/empsit_' in href:
        # Keep everything from '/news' onwards and prefix the site root.
        list_UR_links.append('https://www.bls.gov' + re.search('(/news.*)', href).group(0))
list_UR_links

###### 3.2 Extract relevant text from each release

In [None]:
# For every release dated on or after 2010-02-01, pull the sentence fragment
# that states the unemployment rate ("rate ... percent") out of the release.
announcements = []
dates = []
for link in list_UR_links:
    # The release date is encoded in the file name as MMDDYYYY.
    d = datetime.strptime(re.search(r'/empsit_(\d{8})\.', link).group(1), '%m%d%Y')
    if d < datetime(2010, 2, 1):
        continue
    release_body = BeautifulSoup(requests.get(link).text, 'lxml').find('pre')
    # Fail loudly rather than skip: these lists are later assigned as columns
    # of df_UR_Releases, so a silently dropped release would misalign rows.
    if release_body is None:
        raise ValueError('No <pre> block found in release ' + link)
    rate_sentence = re.search(r'rate[,\s]?[\w\s\d\.]+ percent',
                              release_body.text.replace('\n', ' '))
    if rate_sentence is None:
        raise ValueError('No unemployment-rate sentence found in release ' + link)
    announcements.append(rate_sentence.group(0))
    dates.append(d)
# Flip both lists so they run in the opposite order to the link list.
# NOTE(review): presumably the index page lists newest first -- confirm.
announcements.reverse()
dates.reverse()

In [None]:
# Attach the scraped release dates and announcement text as new columns.
# Both lists must have exactly one entry per row of the frame.
df_UR_Releases = df_UR_Releases.assign(**{
    'Release Date': dates,
    'Announcement': announcements,
})
df_UR_Releases

###### 3.2.1 Assign sentiment to each release

In [None]:
# List the wording used in each announcement once 'rate' and 'percent' are
# removed; the first run of letters/whitespace is kept. The set shows which
# phrases release_tone needs to recognise.
sentiment_words = [
    re.search(r'[A-Za-z\s]+', text.replace('rate', '').replace('percent', '')).group(0)
    for text in df_UR_Releases['Announcement']
]
set(sentiment_words)

In [None]:
def release_tone(txt):
    """Score the wording of an unemployment-rate announcement.

    A falling unemployment rate is good news, so the sign is the opposite of
    the direction in which the rate moved.

    Returns:
        -1 if `txt` contains 'edged up' or 'rose';
        otherwise 1 if it contains 'declined', 'decreased', 'edged down'
        or 'fell'; otherwise 0.
    """
    rate_up_phrases = ('edged up', 'rose')
    rate_down_phrases = ('declined', 'decreased', 'edged down', 'fell')
    # Rising-rate phrases are checked first, so they win when both appear.
    if any(phrase in txt for phrase in rate_up_phrases):
        return -1
    if any(phrase in txt for phrase in rate_down_phrases):
        return 1
    return 0

###### 3.2.2 Calculate unemployment rate changes

In [None]:
def calc_changes(alist):
    """Return the percent change between consecutive elements of `alist`.

    Each entry is (current - previous) / previous * 100, rounded to two
    decimals, so the result has one element fewer than the input (and is
    empty for inputs shorter than two).
    """
    pct_changes = []
    for position in range(1, len(alist)):
        current, previous = alist[position], alist[position - 1]
        pct_changes.append(round((current - previous) / previous * 100, 2))
    return pct_changes
# Score the wording of every announcement with release_tone.
announcement_text = df_UR_Releases['Announcement']
df_UR_Releases['Tone'] = announcement_text.apply(release_tone)

# The announced figure is the number directly before the trailing ' percent'.
announced_rate = announcement_text.str.extract(r'(\d+\.\d)( percent$)')[0].apply(float)
df_UR_Releases['Announced Value'] = announced_rate

# NOTE(review): -3 is a hard-coded change for the first row, which has no
# predecessor in this frame -- confirm where the value comes from.
df_UR_Releases['Announced Percent Change'] = [-3] + calc_changes(list(df_UR_Releases['Announced Value']))

# Difference between the announced figure and the revised rate.
df_UR_Releases['Discrepancy'] = df_UR_Releases['Announced Value'] - df_UR_Releases['Revised Rate']

# Re-index by period label. This consumes the 'Period' column, so the cell
# cannot be run twice on the same frame.
df_UR_Releases = df_UR_Releases.set_index('Period')
df_UR_Releases

In [None]:
# Persist the assembled release table; `with` guarantees the handle is closed.
with open('data/df_UR_Releases.pkd', 'wb') as pkd_file:
    dill.dump(df_UR_Releases, pkd_file)

#### 4. Get list of tickers for stock sample

In [None]:
# Parse the Wikipedia constituents page and keep the first table on it.
sp500_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
df_SP500 = sp500_tables[0]
df_SP500

In [None]:
# Cache the constituents table; `with` guarantees the handle is closed.
with open('data/df_SP500.pkd', 'wb') as pkd_file:
    dill.dump(df_SP500, pkd_file)

In [None]:
# Ticker universe: the 'Symbol' column of the constituents table.
tickers = [symbol for symbol in df_SP500['Symbol']]
# NOTE(review): this REPLACES the first constituent with 'SPY' rather than
# adding SPY to the list -- confirm that dropping that symbol is intended.
tickers[0] = 'SPY'

In [None]:
# Cache the ticker list; `with` guarantees the handle is closed.
with open('data/tickers.pkd', 'wb') as pkd_file:
    dill.dump(tickers, pkd_file)

#### 5. Retrieve daily historical data for stocks (https://www.tiingo.com API)

In [None]:
# Date window requested from the end-of-day endpoint for every ticker.
EOD_START, EOD_END = '2010-1-1', '2020-10-17'

with open('data/tickers.pkd', 'rb') as pkd_file:
    tickers = dill.load(pkd_file)

# Best-effort download: a ticker whose request or parsing fails is recorded
# in `errors` together with the exception, and the loop moves on.
d_all_EOD = {}
errors = []
for ticker in tickers:
    try:
        df_EOD = pd.DataFrame(get_EOD_data(ticker, EOD_START, EOD_END).json())
        # Keep only the leading YYYY-MM-DD of each timestamp string.
        df_EOD['date'] = df_EOD['date'].apply(
            lambda stamp: datetime.strptime(stamp[:10], '%Y-%m-%d'))
        d_all_EOD[ticker] = df_EOD
    except Exception as err:
        errors.append((ticker, err))

In [None]:
# Persist the downloaded price frames and the per-ticker failures; each
# handle is closed as soon as its dump completes.
with open('data/d_all_EOD.pkd', 'wb') as pkd_file:
    dill.dump(d_all_EOD, pkd_file)
with open('data/errors.pkd', 'wb') as pkd_file:
    dill.dump(errors, pkd_file)

In [None]:
# Reload the cached price frames and failure list.
# NOTE: dill/pickle files can execute code on load -- only open files this
# notebook produced.
with open('data/d_all_EOD.pkd', 'rb') as pkd_file:
    d_all_EOD = dill.load(pkd_file)
with open('data/errors.pkd', 'rb') as pkd_file:
    errors = dill.load(pkd_file)

In [None]:
# Tickers that have an entry in the downloaded price data.
valid_tickers = [ticker for ticker in d_all_EOD]
valid_tickers

In [None]:
# Cache the list of usable tickers; `with` guarantees the handle is closed.
with open('data/valid_tickers.pkd', 'wb') as pkd_file:
    dill.dump(valid_tickers, pkd_file)

In [None]:
# Reload the cached list of usable tickers.
# NOTE: dill/pickle files can execute code on load -- only open files this
# notebook produced.
with open('data/valid_tickers.pkd', 'rb') as pkd_file:
    valid_tickers = dill.load(pkd_file)

#### 6. Retrieve data from stocktwits

In [None]:
# Cache directory for StockTwits data. makedirs also creates the parent
# 'data' directory when it is missing and is a no-op when the directory
# already exists, avoiding the check-then-create race of exists()+mkdir().
cache_dir = 'data/STcache1'
os.makedirs(cache_dir, exist_ok=True)

In [None]:
with open('data/tickers.pkd', 'rb') as pkd_file:
    tickers = dill.load(pkd_file)

# Fetch the StockTwits symbol stream for every ticker.
d_ST_messages = {}
for ticker in tickers:
    url = 'https://api.stocktwits.com/api/2/streams/symbol/' + ticker + '.json'
    try:
        d_ST_messages[ticker] = get_st_messages(url, ticker)
    except NameError:
        # NOTE(review): presumably get_st_messages raises NameError for an
        # unknown symbol -- confirm against the `updates` module.
        print('URL not found for', ticker)

In [None]:
# The dated archive is never created elsewhere in this notebook, so on a
# fresh kernel resume it from disk (or start empty) instead of failing with
# NameError. An archive already in memory is left untouched.
if 'd_ST_messages_alldat' not in globals():
    if os.path.exists('data/d_ST_messages_alldat.pkd'):
        with open('data/d_ST_messages_alldat.pkd', 'rb') as pkd_file:
            d_ST_messages_alldat = dill.load(pkd_file)
    else:
        d_ST_messages_alldat = {}

# Key the snapshot by today's date at midnight. One now() call is used so
# year, month and day cannot come from different sides of midnight.
snapshot_day = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
d_ST_messages_alldat[snapshot_day] = d_ST_messages

In [None]:
# Persist the dated StockTwits archive; `with` guarantees the handle is closed.
with open('data/d_ST_messages_alldat.pkd', 'wb') as pkd_file:
    dill.dump(d_ST_messages_alldat, pkd_file)

#### 7. Retrieve data from twitter

In [None]:
def convert_date_to_string(date,form):
    """Format the year/month/day of `date` as a string.

    Month and day are zero-padded to two digits.

    Args:
        date: object exposing integer `year`, `month` and `day` attributes.
        form: 0 for 'YYYYMMDD0000', 1 for 'YYYY-MM-DD'.

    Returns:
        The formatted string, or None for any other `form`.
    """
    year_part = str(date.year)
    month_part = str(date.month).zfill(2)
    day_part = str(date.day).zfill(2)
    if form == 0:
        return ''.join((year_part, month_part, day_part, '0000'))
    if form == 1:
        return '-'.join((year_part, month_part, day_part))
    return None

In [None]:
def get_tweets(ticker_list,date):
    """Fetch English tweets mentioning any of the given tickers.

    The search window runs from midnight of the day before `date` to
    midnight of `date`.

    Args:
        ticker_list: ticker strings to search for (OR-combined).
        date: date/datetime marking the end of the window.

    Returns:
        List of results yielded by the search stream; an empty list when
        `ticker_list` is empty (no request is made).
    """
    if not ticker_list:
        # An empty OR-group would produce an invalid rule.
        return []
    premium_search_args = load_credentials(filename="twitter_keysG.yml",
                                           yaml_key="search_tweets_fullarchive_dev",
                                           env_overwrite=False)
    # Parenthesise the OR-group: an implicit AND (space) binds tighter than
    # OR, so without the parentheses 'lang:en' would only constrain the last
    # ticker in the list.
    rule_str = '(' + ' OR '.join(ticker_list) + ') lang:en'
    date1 = convert_date_to_string(date - timedelta(days=1), 0)
    date2 = convert_date_to_string(date, 0)
    rule = gen_rule_payload(pt_rule=rule_str, from_date=date1, to_date=date2)
    rs = ResultStream(rule_payload=rule, **premium_search_args)
    return list(rs.stream())

In [None]:
# Collect tweets in batches of tickers, starting at a hard-coded offset.
# Expects `tickers` and `df_UR_Releases` to be loaded already.
# NOTE(review): 130 looks like a manual resume point from an earlier
# partial run -- confirm before a full re-run.
first_index = 130
batch_size = 10

if 'tweets' not in globals():
    tweets = {}

# The target date (second-to-last release) is the same for every batch.
date = list(df_UR_Releases['Release Date'])[-2]
tweets.setdefault(date, {})

# range() also covers the final, possibly shorter, batch; the previous
# `while second_index<len(tickers)` condition silently skipped it.
for batch_start in range(first_index, len(tickers), batch_size):
    batch = tickers[batch_start:batch_start + batch_size]
    print(batch)
    tweet_list = get_tweets(batch, date)
    for ticker in batch:
        # Keep only tweets that mention the ticker as a cashtag.
        tweets[date][ticker] = [tweet['text'] for tweet in tweet_list if '$' + ticker in tweet['text']]

In [None]:
with open('data/df_UR_Releases.pkd', 'rb') as pkd_file:
    df_UR_Releases = dill.load(pkd_file)

# `tweets` is never created elsewhere in this notebook; start an empty
# archive on a fresh kernel instead of failing with NameError.
if 'tweets' not in globals():
    tweets = {}

# Target the second-to-last release date and reset its bucket.
# NOTE: the name `date` shadows the `date` class imported from datetime.
date = list(df_UR_Releases['Release Date'])[-2]
tweets[date] = {}
# Fetch tweets for the first ten tickers.
tweet_list = get_tweets(tickers[:10], date)

In [None]:
# Persist the per-date tweet archive and the most recent raw result list;
# each handle is closed as soon as its dump completes.
with open('data/tweets.pkd', 'wb') as pkd_file:
    dill.dump(tweets, pkd_file)
with open('data/tweet_list.pkd', 'wb') as pkd_file:
    dill.dump(tweet_list, pkd_file)

#### 8. Get news from Benzinga

In [None]:
# Cache directory for Benzinga news. makedirs also creates the parent 'data'
# directory when it is missing and is a no-op when the directory already
# exists, avoiding the check-then-create race of exists()+mkdir().
cache_dir = 'data/BenzNewscache'
os.makedirs(cache_dir, exist_ok=True)

@checkpoint(key=lambda args,kwargs:args[0], work_dir=cache_dir)
def get_news(ticker):
    """Scrape all Benzinga news headlines for `ticker`, following pagination.

    Results are cached per ticker by the `checkpoint` decorator.

    Args:
        ticker: stock symbol used to build the listing URL.

    Returns:
        Dict mapping each listing date (datetime) to the list of headline
        strings found under it, or None when fetching a page raises
        NameError.
    """
    url = 'https://www.benzinga.com/stock-articles/' + ticker + '/news'
    all_headlines = {}
    while True:
        try:
            print(url)
            newspage = BeautifulSoup(get_response(url).text, 'lxml')
        except NameError:
            # NOTE(review): presumably get_response signals a missing page
            # with NameError -- confirm against the `updates` module.
            return None
        for item in newspage.find_all('div', attrs={'class': 'item-list'}):
            day_heading = item.find('h3')
            if day_heading is None:
                continue
            date = datetime.strptime(day_heading.text, '%A, %B %d, %Y')
            # Plain list of headline strings; spans without a link are skipped.
            headlines = []
            for span in item.find_all('span', attrs={'class': 'field-content'}):
                link = span.find('a')
                if link is not None:
                    headlines.append(link.text)
            # A day can continue on the next page: extend rather than
            # overwrite so its earlier headlines are not lost.
            all_headlines.setdefault(date, []).extend(headlines)
            print(date, headlines)
        next_page = newspage.find('a', attrs={'title': "Go to next page"})
        if next_page is None:
            break
        url = 'https://www.benzinga.com' + next_page['href']
    return all_headlines
# Try the scraper on a single ticker; the returned dict is shown as the
# cell output.
get_news('AAPL')

In [None]:
with open('data/tickers.pkd', 'rb') as pkd_file:
    tickers = dill.load(pkd_file)

# `all_ticker_news` is never created elsewhere in this notebook; start an
# empty mapping on a fresh kernel instead of failing with NameError.
if 'all_ticker_news' not in globals():
    all_ticker_news = {}

# Scrape (or read from the checkpoint cache) the headlines of every ticker.
for ticker in tickers:
    all_ticker_news[ticker] = get_news(ticker)

In [None]:
# Persist the per-ticker news mapping; `with` guarantees the handle is closed.
with open('data/all_ticker_news.pkd', 'wb') as pkd_file:
    dill.dump(all_ticker_news, pkd_file)

#### 9. Get unemployment predictions from Trading Economics

In [None]:
def get_UR_prediction():
    """Look up the next scheduled unemployment release and its forecast.

    Reads the table at position 1 of the Trading Economics unemployment
    page and picks the first 'Calendar' date that is not earlier than the
    current time.

    Returns:
        (next_release_date, predicted_rate) where next_release_date is a
        'YYYY-MM-DD' string and predicted_rate is the 'TEForecast' value as
        a float with its trailing character (the percent sign) stripped.
        predicted_rate is 0 when the forecast is missing or unparseable.
        Returns (None, 0) when the table lists no upcoming date.
    """
    now = datetime.now()
    table = pd.read_html('https://tradingeconomics.com/united-states/unemployment-rate')[1]
    release_days = [datetime.strptime(day, '%Y-%m-%d') for day in table['Calendar']]
    forecasts = table.set_index('Calendar')
    for release_day in release_days:
        if release_day < now:
            continue
        day_label = release_day.strftime('%Y-%m-%d')
        try:
            predicted_rate = float(forecasts.loc[day_label, 'TEForecast'][:-1])
        except (KeyError, IndexError, TypeError, ValueError):
            # Missing column/row, NaN forecast, or non-numeric text.
            predicted_rate = 0
        return day_label, predicted_rate
    # No date in the table lies in the future.
    return None, 0

In [None]:
# Show the next release date and forecast as the cell output.
get_UR_prediction()

In [None]:
# Reload the cached release table.
# NOTE: dill/pickle files can execute code on load -- only open files this
# notebook produced.
with open('data/df_UR_Releases.pkd', 'rb') as pkd_file:
    df_UR_Releases = dill.load(pkd_file)
def get_UR_current():
    """Return ('Revised Rate', 'Percent Change') for the last row label of
    the module-level `df_UR_Releases` frame."""
    latest_period = df_UR_Releases.index[-1]
    return tuple(
        df_UR_Releases.loc[latest_period, column]
        for column in ('Revised Rate', 'Percent Change')
    )

In [None]:
# Show the latest revised rate and its percent change as the cell output.
get_UR_current()

#### 10. Get sentiment indicators from Benzinga (Stock Snips)

In [None]:
def get_sentiment(ticker):
    """Scrape the sentiment percentage shown on a ticker's Benzinga page.

    Args:
        ticker: stock symbol appended to the Benzinga stock URL.

    Returns:
        The first 'NN.NN%'-style figure found in the 'stock-snips-content'
        div, as a float without the percent sign. When the div or the
        figure is missing, the ticker is printed and 0 is returned.
    """
    page = BeautifulSoup(get_response('https://www.benzinga.com/stock/' + ticker).text, 'lxml')
    try:
        snips_text = page.find('div', attrs={'class': "stock-snips-content"}).text
        # Up to three integer digits so that '100.00%' is read as 100.0
        # instead of matching only its '00.00%' tail.
        sentiment = re.search(r'\d{1,3}\.\d{1,2}%', snips_text).group(0)[:-1]
    except AttributeError:
        # Either find() or re.search() returned None.
        print(ticker)
        return 0
    return float(sentiment)

In [None]:
# Resume the dated sentiment archive, or start a new one on the first run
# when the file does not exist yet.
if os.path.exists('data/sentiment_dict.pkd'):
    with open('data/sentiment_dict.pkd', 'rb') as pkd_file:
        sentiment_dict = dill.load(pkd_file)
else:
    sentiment_dict = {}
with open('data/tickers.pkd', 'rb') as pkd_file:
    tickers = dill.load(pkd_file)

# Today's date at midnight, from a single now() call so year, month and day
# cannot come from different sides of midnight.
today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
sent_today = {ticker: get_sentiment(ticker) for ticker in tickers}
sentiment_dict[today] = sent_today
with open('data/sentiment_dict.pkd', 'wb') as pkd_file:
    dill.dump(sentiment_dict, pkd_file)

In [None]:
# Persist the sentiment archive (the cell above writes the same file).
# `with` guarantees the handle is closed.
with open('data/sentiment_dict.pkd', 'wb') as pkd_file:
    dill.dump(sentiment_dict, pkd_file)