In [1]:
import pandas as pd
import requests
import logging
import json
from alpha_vantage.timeseries import TimeSeries
import matplotlib.pyplot as plt
from datetime import date, timedelta
import time
from bs4 import BeautifulSoup
from config import vantage_key, census_key, bls_key
import asyncio
import aiohttp
import nest_asyncio
from pandas import json_normalize

In [2]:
# logging.basicConfig(format='%(asctime)s : %(lineno)d : %(levelname)s : %(message)s', level=logging.DEBUG)

In [3]:
covid_response = requests.get("https://covidtracking.com/api/states/daily").json()
covid_dataframe = pd.DataFrame(covid_response)
covid_dataframe['date'] = covid_dataframe['date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d')) 
covid_dataframe

Unnamed: 0,date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,2020-05-11,AK,381.0,28299.0,,7.0,,,,,...,,28680,28680,28680,02,0.0,0.0,1314.0,2.0,1316.0
1,2020-05-11,AL,10009.0,119435.0,,,1256.0,,463.0,,...,1256.0,129444,129444,129444,01,8.0,16.0,1791.0,232.0,2023.0
2,2020-05-11,AR,4034.0,64996.0,,61.0,480.0,,,11.0,...,480.0,69030,69030,69030,05,6.0,9.0,3215.0,287.0,3502.0
3,2020-05-11,AS,0.0,105.0,,,,,,,...,,105,105,105,60,0.0,0.0,22.0,0.0,22.0
4,2020-05-11,AZ,11380.0,138861.0,,717.0,1537.0,297.0,,201.0,...,1537.0,150241,150241,150241,04,6.0,9.0,12241.0,261.0,12502.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3764,2020-01-26,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0
3765,2020-01-25,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0
3766,2020-01-24,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0
3767,2020-01-23,WA,1.0,,,,,,,,...,,1,1,1,53,0.0,0.0,0.0,0.0,0.0


In [4]:
#Scrapes Wikipedia Table
wikipedia_response = requests.get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies").text
soup = BeautifulSoup(wikipedia_response,'lxml')
wiki_table = soup.find('table',{'class':'wikitable sortable'})

#This creates a list with the names of the headers
headerslist = []
headers = wiki_table.findAll('th')
for head in headers:
    headerslist.append(head.text)

#This adds the content to a list    
content = wiki_table.findAll('td')
dataframelist = []
entrylist = []
while (content):
    if len(entrylist) < 9:
        contentvalue = content.pop(0).text
        entrylist.append(contentvalue)
    else:
        dataframelist.append(entrylist)
        entrylist = []


In [5]:
#This creates the S&P dataframe and formats the datetime row
sp500DF = pd.DataFrame(dataframelist, columns=headerslist)
sp500DF['Symbol\n'] = sp500DF['Symbol\n'].str[0:-1]
sp500DF['Founded\n'] = sp500DF['Founded\n'].str[0:-1]
sp500DF['Date first added'] = pd.to_datetime(sp500DF['Date first added'], infer_datetime_format = True, errors='coerce')
sp500DF = sp500DF.rename(columns={'Symbol\n':'Symbol','Founded\n':'Founded'})
sp500DF

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,0000066740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,0000001800,1888
2,ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,0001551152,2013 (1888)
3,ABMD,ABIOMED Inc,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,0000815094,1981
4,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,0001467373,1989
...,...,...,...,...,...,...,...,...,...
499,XYL,Xylem Inc.,reports,Industrials,Industrial Machinery,"White Plains, New York",2011-11-01,0001524472,2011
500,YUM,Yum! Brands Inc,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,0001041061\n,
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,0000877212\n,1969
502,ZBH,Zimmer Biomet Holdings,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,0001136869\n,


In [12]:
#Alpha Vantage API
#Note that the S&P changes throughout the year
#Also, we're getting the past 100 days on a rolling basis

ticker_list = list(sp500DF['Symbol'].values)

completeurls = [] # completed urls go here

# stockDF = pd.DataFrame(columns=list(stockDF.columns.values))
stockdata = []

# This generates all URLs and places them in a list so that async can create futures out of them
for stocks in ticker_list:
    completeurls.append(f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={stocks}&outputsize=compact&apikey={vantage_key}")

nest_asyncio.apply()
async def get_stocks(url):
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector, raise_for_status=True) as session:
        async with session.get(url) as response:
            response_content = await response.json()
            response_content['Time Series (Daily)']['symbol'] = url.split("&")[1].split('=')[1] #This extracts the symbol from the URL
            return stockdata.append(response_content['Time Series (Daily)'])

#set up the event loop
asyncio.set_event_loop(asyncio.new_event_loop())
loop = asyncio.get_event_loop()

#add tasks to run
tasks = []
for urls in completeurls:
    future = asyncio.ensure_future(get_stocks(urls))
    tasks.append(future)


#Run
loop.run_until_complete(asyncio.wait(tasks))
# loop.close()


({<Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> exception=KeyError('Time Series (Daily)')>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, defined at <ipython-input-12-0881c3b7a87a>:17> result=None>,
  <Task finished coro=<get_stocks() done, d

In [13]:
# #This creates the final stockdata dataframe
stockdata = json_normalize(stockdata)
stockDF = pd.DataFrame(stockdata)
stockDF = stockDF.set_index(['symbol'])
stockDF = stockDF.filter(regex='close')
stockDF
# url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=AAPL&outputsize=compact&apikey={vantage_key}"
# response = requests.get(url).json()
# response['Time Series (Daily)']['stocks'] = url.split("&")[1].split('=')[1]
# response

Unnamed: 0_level_0,2020-05-11.4. close,2020-05-08.4. close,2020-05-07.4. close,2020-05-06.4. close,2020-05-05.4. close,2020-05-04.4. close,2020-05-01.4. close,2020-04-30.4. close,2020-04-29.4. close,2020-04-28.4. close,...,2019-12-30.4. close,2019-12-27.4. close,2019-12-26.4. close,2019-12-24.4. close,2019-12-23.4. close,2019-12-20.4. close,2019-12-19.4. close,2019-12-18.4. close,2019-12-17.4. close,2019-12-16.4. close
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BAX,88.9300,88.0800,88.4400,87.9200,88.7400,87.2700,87.8400,88.7800,91.4900,91.0000,...,83.4300,84.0500,84.3300,84.6900,84.5200,82.4900,82.2200,83.6600,83.2500,
AME,83.2400,84.3200,83.3100,81.7200,81.1900,79.6700,80.4900,83.8700,84.5000,80.7600,...,99.9300,99.9500,100.0700,99.7100,100.1400,99.7800,99.5600,98.9600,99.2000,
CI,188.9900,189.2700,184.1800,183.4300,190.3900,184.6100,187.0900,195.7800,195.8700,189.5900,...,202.9700,204.0000,204.3800,204.0100,206.4700,204.5900,198.7200,198.2000,193.5200,
DAL,22.0100,22.7200,21.6800,21.0000,21.7100,22.5700,24.1200,25.9100,27.3200,24.3400,...,58.6600,59.0800,59.5600,59.3900,59.4400,58.9700,59.1400,58.5800,58.6900,
CARR,18.3500,18.3600,16.4100,16.2900,15.9700,16.4000,16.7700,17.7100,17.6400,17.0000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NDAQ,109.2400,107.5900,107.3300,105.6300,108.8300,106.1300,106.6200,109.6700,111.6000,111.2100,...,107.0200,107.9900,107.7900,106.6900,107.1700,107.6000,107.0200,105.9200,106.0000,
HII,178.1100,180.5800,173.3100,178.6500,182.6700,179.9800,185.6700,191.4100,195.5500,191.2700,...,251.8100,251.0700,252.6000,253.9900,255.4900,254.1100,254.2400,251.7100,252.4000,
DRI,73.9300,74.7200,70.5300,68.2100,68.9900,70.6200,69.8500,73.7900,78.2100,74.8000,...,108.9600,108.0100,108.9100,108.5000,108.7000,110.2400,109.0300,116.3100,116.0100,
L,30.0100,31.0200,30.3400,29.6800,30.7900,31.7600,31.6200,34.6600,36.7000,34.1100,...,52.0100,51.8900,51.9400,51.4500,51.4000,51.6600,51.1700,51.3200,51.4000,


In [8]:
#Population data - US Census Bureau
census_response = requests.get(f"https://api.census.gov/data/2019/pep/population?get=COUNTY,DATE_CODE,DATE_DESC,DENSITY,POP,NAME,STATE&for=state:*&key={census_key}").json()
census_headers = census_response.pop(0)
populationDF = pd.DataFrame(census_response, columns = census_headers)
populationDF = populationDF[populationDF['DATE_CODE']=='11']
populationDF = populationDF[['DENSITY','POP','NAME','STATE']]

# https://www.bls.gov/oes/home.htm
# https://www.bls.gov/oes/current/oessrcst.htm
# https://www.bls.gov/oes/current/oes_hi.htm#otherlinks
# total jobs 2018 - from bureau of economic analysis (https://apps.bea.gov/iTable/iTable.cfm?reqid=70&step=1&isuri=1)
empByIndDF = pd.read_csv("totalEmpIndustry2018.csv")
workers_by_indDF = pd.read_csv("totaljobs2018.csv")

#Master Dataframe for 2018 data
master_industryDF = pd.merge(empByIndDF, populationDF, right_on='NAME', left_on='GeoName', how='left')
master_industryDF = pd.merge(master_industryDF, workers_by_indDF, left_on='NAME', right_on='GeoName', how='left')
master_industryDF = master_industryDF[['NAME','STATE','LineCode','Description','2018_x','2018_y','POP']]
master_industryDF = master_industryDF.rename(columns={'2018_x':'2018 Workers In Industry', '2018_y':'2018 Working Pop', 'POP':'2019 Overall Population'})
master_industryDF

Unnamed: 0,NAME,STATE,LineCode,Description,2018 Workers In Industry,2018 Working Pop,2019 Overall Population
0,Alabama,01,70,Farm employment,42220,2691517,4887681
1,Alabama,01,100,"Forestry, fishing, and related activities",15895,2691517,4887681
2,Alabama,01,200,"Mining, quarrying, and oil and gas extra...",10560,2691517,4887681
3,Alabama,01,300,Utilities,14176,2691517,4887681
4,Alabama,01,400,Construction,148326,2691517,4887681
...,...,...,...,...,...,...,...
1066,Wyoming,56,1600,Health care and social assistance,30617,405010,577601
1067,Wyoming,56,1700,"Arts, entertainment, and recreation",8255,405010,577601
1068,Wyoming,56,1800,Accommodation and food services,35917,405010,577601
1069,Wyoming,56,1900,Other services (except government and go...,17920,405010,577601


In [9]:
# US Bureau of Labor Statistics
areaCodesDF = pd.read_csv("bureauoflaborstats.csv")
fips_dict = pd.read_csv("fipscodes.csv")
fips_dict = fips_dict[["State Abbreviation","FIPS Code"]]
fips_dict = fips_dict.to_dict()

#If we want to get more granular, we can switch the area_type_code from A to N
areaCodesList = areaCodesDF[areaCodesDF["area_type_code"]=="A"]
areaCodesList = list(areaCodesList["area_code"].values)

master_unemp_DF = pd.DataFrame({"year":[],"periodName":[],"value":[],"state":[]})

for x in areaCodesList:
    try:
        seriesid = f"LAU{x}03"
        fips_code = x[2:4]
        headers = {'Content-type': 'application/json'} 
        logging.debug(f"RETRIEVING: {x}, {fips_code}")
        data = json.dumps({"seriesid": [seriesid],"startyear":"2010", "endyear":"2020","registrationkey":bls_key})
        response = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers).json()
        state_unemp_DF = pd.DataFrame(response['Results']['series'][0]['data'])
        state_unemp_DF['state'] = fips_dict['State Abbreviation'][int(fips_code)]
        state_unemp_DF = state_unemp_DF[["year","periodName","value","state"]]
        state_unemp_DF['value'] = state_unemp_DF['value'].astype('float')
        master_unemp_DF = master_unemp_DF.append(state_unemp_DF)
    except:
        logging.error(f"Error retrieving data: {x}")


ERROR:root:Error retrieving data: ST5500000000000
ERROR:root:Error retrieving data: ST5600000000000
ERROR:root:Error retrieving data: ST7200000000000


In [None]:
# Create Unemployment rate DF and format month name to numeric
master_unemp_DF['periodName']  = master_unemp_DF['periodName'].apply(lambda x: time.strptime(x, '%B').tm_mon)
master_unemp_DF = master_unemp_DF.rename(columns={'periodName':'month'})
master_unemp_DF = pd.pivot_table(master_unemp_DF, index=['state','month'], columns=['year'], values=['value'])
master_unemp_DF