In [1]:
import pandas as pd
import requests
import logging
import json
from alpha_vantage.timeseries import TimeSeries
import matplotlib.pyplot as plt
from datetime import date, timedelta
import time
from bs4 import BeautifulSoup
from config import vantage_key, census_key, bls_key
import asyncio
import aiohttp
import nest_asyncio
from pandas import json_normalize

In [None]:
logging.basicConfig(format='%(asctime)s : %(lineno)d : %(levelname)s : %(message)s', level=logging.DEBUG)

In [4]:
covid_response = requests.get("https://covidtracking.com/api/states/daily").json()
covid_dataframe = pd.DataFrame(covid_response)
covid_dataframe['date'] = covid_dataframe['date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d')) 
covid_dataframe

Unnamed: 0,date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,2020-05-10,AK,379.0,26985.0,,8.0,,,,,...,,27364,27364,27364,02,0.0,0.0,914.0,1.0,915.0
1,2020-05-10,AL,9777.0,117644.0,,,1240.0,,460.0,,...,1240.0,127421,127421,127421,01,5.0,12.0,1717.0,210.0,1927.0
2,2020-05-10,AR,3747.0,61781.0,,64.0,471.0,,,14.0,...,471.0,65528,65528,65528,05,0.0,0.0,0.0,0.0,0.0
3,2020-05-10,AS,0.0,83.0,,,,,,,...,,83,83,83,60,0.0,0.0,0.0,0.0,0.0
4,2020-05-10,AZ,11119.0,126620.0,,713.0,1528.0,300.0,,195.0,...,1528.0,137739,137739,137739,04,4.0,14.0,8640.0,159.0,8799.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3708,2020-01-26,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0
3709,2020-01-25,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0
3710,2020-01-24,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0
3711,2020-01-23,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0


In [5]:
#Scrapes Wikipedia Table
wikipedia_response = requests.get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies").text
soup = BeautifulSoup(wikipedia_response,'lxml')
wiki_table = soup.find('table',{'class':'wikitable sortable'})

#This creates a list with the names of the headers
headerslist = []
headers = wiki_table.findAll('th')
for head in headers:
    headerslist.append(head.text)

#This adds the content to a list    
content = wiki_table.findAll('td')
dataframelist = []
entrylist = []
while (content):
    if len(entrylist) < 9:
        contentvalue = content.pop(0).text
        entrylist.append(contentvalue)
    else:
        dataframelist.append(entrylist)
        entrylist = []


In [6]:
#This creates the S&P dataframe and formats the datetime row
sp500DF = pd.DataFrame(dataframelist, columns=headerslist)
sp500DF['Symbol\n'] = sp500DF['Symbol\n'].str[0:-1]
sp500DF['Founded\n'] = sp500DF['Founded\n'].str[0:-1]
sp500DF['Date first added'] = pd.to_datetime(sp500DF['Date first added'], infer_datetime_format = True, errors='coerce')
sp500DF = sp500DF.rename(columns={'Symbol\n':'Symbol','Founded\n':'Founded'})
sp500DF

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,0000066740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,0000001800,1888
2,ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,0001551152,2013 (1888)
3,ABMD,ABIOMED Inc,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,0000815094,1981
4,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,0001467373,1989
...,...,...,...,...,...,...,...,...,...
499,XYL,Xylem Inc.,reports,Industrials,Industrial Machinery,"White Plains, New York",2011-11-01,0001524472,2011
500,YUM,Yum! Brands Inc,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,0001041061\n,
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,0000877212\n,1969
502,ZBH,Zimmer Biomet Holdings,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,0001136869\n,


In [None]:
#Alpha Vantage API
#Note that the S&P changes throughout the year
#Also, we're getting the past 100 days on a rolling basis

ticker_list = list(sp500DF['Symbol'].values)

completeurls = [] # completed urls go here

# stockDF = pd.DataFrame(columns=list(stockDF.columns.values))
stockdata = []

# This generates all URLs and places them in a list so that async can create futures out of them
logging.info("Generating URLs")
for stocks in ticker_list:
    completeurls.append(f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={stocks}&outputsize=compact&apikey={vantage_key}")


nest_asyncio.apply()
async def get_stocks(url):
    logging.DEBUG(f"GET: {url}")
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector, raise_for_status=True) as session:
        async with session.get(url) as response:
            response_content = await response.json()
            response_content['Time Series (Daily)']['symbol'] = url.split("&")[1].split('=')[1] #This extracts the symbol from the URL
            return stockdata.append(response_content['Time Series (Daily)'])


#set up the event loop
asyncio.set_event_loop(asyncio.new_event_loop())
loop = asyncio.get_event_loop()

#add tasks to run
tasks = []
for urls in completeurls:
    future = asyncio.ensure_future(get_stocks(urls))
    tasks.append(future)


#Run
loop.run_until_complete(asyncio.wait(tasks))


In [None]:
#This creates the final stockdata dataframe
stockdata = json_normalize(stockdata)
stockDF = pd.DataFrame(stockdata)
stockDF = stockDF.set_index(['symbol'])
stockDF = stockDF.filter(regex='close')
stockDF

In [3]:
#Population data - US Census Bureau
census_response = requests.get(f"https://api.census.gov/data/2019/pep/population?get=COUNTY,DATE_CODE,DATE_DESC,DENSITY,POP,NAME,STATE&for=state:*&key={census_key}").json()
census_headers = census_response.pop(0)
populationDF = pd.DataFrame(census_response, columns = census_headers)
populationDF = populationDF[populationDF['DATE_CODE']=='11']
populationDF = populationDF[['DENSITY','POP','NAME','STATE']]

# https://www.bls.gov/oes/home.htm
# https://www.bls.gov/oes/current/oessrcst.htm
# https://www.bls.gov/oes/current/oes_hi.htm#otherlinks
# total jobs 2018 - from bureau of economic analysis (https://apps.bea.gov/iTable/iTable.cfm?reqid=70&step=1&isuri=1)
empByIndDF = pd.read_csv("totalEmpIndustry2018.csv")
workers_by_indDF = pd.read_csv("totaljobs2018.csv")

#Master Dataframe for 2018 data
master_industryDF = pd.merge(empByIndDF, populationDF, right_on='NAME', left_on='GeoName', how='left')
master_industryDF = pd.merge(master_industryDF, workers_by_indDF, left_on='NAME', right_on='GeoName', how='left')
master_industryDF = master_industryDF[['NAME','STATE','LineCode','Description','2018_x','2018_y','POP']]
master_industryDF = master_industryDF.rename(columns={'2018_x':'2018 Workers In Industry', '2018_y':'2018 Working Pop', 'POP':'2019 Overall Population'})
master_industryDF

Unnamed: 0,NAME,STATE,LineCode,Description,2018 Workers In Industry,2018 Working Pop,2019 Overall Population
0,Alabama,01,70,Farm employment,42220,2691517,4887681
1,Alabama,01,100,"Forestry, fishing, and related activities",15895,2691517,4887681
2,Alabama,01,200,"Mining, quarrying, and oil and gas extra...",10560,2691517,4887681
3,Alabama,01,300,Utilities,14176,2691517,4887681
4,Alabama,01,400,Construction,148326,2691517,4887681
...,...,...,...,...,...,...,...
1066,Wyoming,56,1600,Health care and social assistance,30617,405010,577601
1067,Wyoming,56,1700,"Arts, entertainment, and recreation",8255,405010,577601
1068,Wyoming,56,1800,Accommodation and food services,35917,405010,577601
1069,Wyoming,56,1900,Other services (except government and go...,17920,405010,577601


In [2]:
# US Bureau of Labor Statistics
areaCodesDF = pd.read_csv("bureauoflaborstats.csv")
fips_dict = pd.read_csv("fipscodes.csv")
fips_dict = fips_dict[["State Abbreviation","FIPS Code"]]
fips_dict = fips_dict.to_dict()

#If we want to get more granular, we can switch the area_type_code from A to N
areaCodesList = areaCodesDF[areaCodesDF["area_type_code"]=="A"]
areaCodesList = list(areaCodesList["area_code"].values)

master_unemp_DF = pd.DataFrame({"year":[],"periodName":[],"value":[],"state":[]})

for x in areaCodesList:
    try:
        seriesid = f"LAU{x}03"
        fips_code = x[2:4]
        headers = {'Content-type': 'application/json'} 
        logging.debug(f"RETRIEVING: {x}, {fips_code}")
        data = json.dumps({"seriesid": [seriesid],"startyear":"2010", "endyear":"2020","registrationkey":bls_key})
        response = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers).json()
        state_unemp_DF = pd.DataFrame(response['Results']['series'][0]['data'])
        state_unemp_DF['state'] = fips_dict['State Abbreviation'][int(fips_code)]
        state_unemp_DF = state_unemp_DF[["year","periodName","value","state"]]
        state_unemp_DF['value'] = state_unemp_DF['value'].astype('float')
        master_unemp_DF = master_unemp_DF.append(state_unemp_DF)
    except:
        logging.error(f"Error retrieving data: {x}")


ERROR:root:Error retrieving data: ST0100000000000
ERROR:root:Error retrieving data: ST0200000000000
ERROR:root:Error retrieving data: ST0400000000000
ERROR:root:Error retrieving data: ST0500000000000
ERROR:root:Error retrieving data: ST0600000000000
ERROR:root:Error retrieving data: ST0800000000000
ERROR:root:Error retrieving data: ST0900000000000
ERROR:root:Error retrieving data: ST1000000000000
ERROR:root:Error retrieving data: ST1100000000000
ERROR:root:Error retrieving data: ST1200000000000
ERROR:root:Error retrieving data: ST1300000000000
ERROR:root:Error retrieving data: ST1500000000000
ERROR:root:Error retrieving data: ST1600000000000
ERROR:root:Error retrieving data: ST1700000000000
ERROR:root:Error retrieving data: ST1800000000000
ERROR:root:Error retrieving data: ST1900000000000
ERROR:root:Error retrieving data: ST2000000000000
ERROR:root:Error retrieving data: ST2100000000000
ERROR:root:Error retrieving data: ST2200000000000
ERROR:root:Error retrieving data: ST2300000000000


In [None]:
# Create Unemployment rate DF and format month name to numeric
master_unemp_DF['periodName']  = master_unemp_DF['periodName'].apply(lambda x: time.strptime(x, '%B').tm_mon)
master_unemp_DF = master_unemp_DF.rename(columns={'periodName':'month'})
master_unemp_DF = pd.pivot_table(master_unemp_DF, index=['state','month'], columns=['year'], values=['value'])
master_unemp_DF