In [4]:
import pandas as pd
from pandas import json_normalize
import requests
import logging
import json
from alpha_vantage.timeseries import TimeSeries
import matplotlib.pyplot as plt
from datetime import date
import time
from bs4 import BeautifulSoup
from config import vantage_key, census_key, bls_key
import asyncio
import aiohttp
import nest_asyncio

In [None]:
# logging.basicConfig(format='%(asctime)s : %(lineno)d : %(levelname)s : %(message)s', level=logging.DEBUG)

In [None]:
#Create covid dataframe
covid_response = requests.get("https://covidtracking.com/api/states/daily").json()
covid_dataframe = pd.DataFrame(covid_response)
covid_dataframe['date'] = covid_dataframe['date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d')) 
covid_dataframe

In [None]:
#Scrapes Wikipedia Table for S&P 500 data
wikipedia_response = requests.get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies").text
soup = BeautifulSoup(wikipedia_response,'lxml')
wiki_table = soup.find('table',{'class':'wikitable sortable'})

#This creates a list with the names of the headers
headerslist = []
headers = wiki_table.findAll('th')
for head in headers:
    headerslist.append(head.text)

#This adds the content to a list    
content = wiki_table.findAll('td')
dataframelist = []
entrylist = []
while (content):
    if len(entrylist) < 9:
        contentvalue = content.pop(0).text
        entrylist.append(contentvalue)
    else:
        dataframelist.append(entrylist)
        entrylist = []


In [None]:
#This creates the S&P dataframe and formats the datetime row
sp500DF = pd.DataFrame(dataframelist, columns=headerslist)
sp500DF['Symbol\n'] = sp500DF['Symbol\n'].str[0:-1]
sp500DF['Founded\n'] = sp500DF['Founded\n'].str[0:-1]
sp500DF['Date first added'] = pd.to_datetime(sp500DF['Date first added'], infer_datetime_format = True, errors='coerce')
sp500DF = sp500DF.rename(columns={'Symbol\n':'Symbol','Founded\n':'Founded'})
# sp500DF

In [None]:
#This joins the industries csv to the sp500 DF
industriesDF = pd.read_csv("Industries.csv")
industriesDF = pd.merge(sp500DF, industriesDF, how='left', left_on="GICS Sub Industry", right_on="GICS Codes/ Sub-industries")
industriesDF = industriesDF[['Symbol', 'Security', 'GICS Sector', 'GICS Sub Industry', 'Headquarters Location', 'Date first added', 'GICS Codes/ Sub-industries', 'NAICS']]
industriesDF

In [None]:
#This pulls S&P data from the Alpha Vantage API
#Note that the S&P changes throughout the year
#Also, we're getting the past 100 days on a rolling basis

ticker_list = list(sp500DF['Symbol'].values)

completeurls = [] # completed urls go here

# stockDF = pd.DataFrame(columns=list(stockDF.columns.values))
stockdata = []

# This generates all URLs and places them in a list so that async can create futures out of them
for stocks in ticker_list:
    completeurls.append(f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={stocks}&outputsize=compact&apikey={vantage_key}")

nest_asyncio.apply()
async def get_stocks(url):
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector, raise_for_status=True) as session:
        async with session.get(url) as response:
            response_content = await response.json()
            response_content['Time Series (Daily)']['symbol'] = url.split("&")[1].split('=')[1] #This extracts the symbol from the URL
            return stockdata.append(response_content['Time Series (Daily)'])

#set up the event loop
asyncio.set_event_loop(asyncio.new_event_loop())
loop = asyncio.get_event_loop()

#add tasks to run
tasks = []
for urls in completeurls:
    future = asyncio.ensure_future(get_stocks(urls))
    tasks.append(future)


#Run
loop.run_until_complete(asyncio.wait(tasks))
# loop.close()


In [None]:
#This creates the final stockdata dataframe
stockdata = json_normalize(stockdata)
stockDF = pd.DataFrame(stockdata)
stockDF = stockDF.set_index(['symbol'])
stockDF = stockDF.filter(regex='close')
stockDF

In [34]:
#2019 Population data - US Census Bureau
census_response = requests.get(f"https://api.census.gov/data/2019/pep/population?get=COUNTY,DATE_CODE,DATE_DESC,DENSITY,POP,NAME,STATE&for=state:*&key={census_key}").json()
census_headers = census_response.pop(0)
populationDF = pd.DataFrame(census_response, columns = census_headers)
populationDF = populationDF[populationDF['DATE_CODE']=='12']
populationDF = populationDF[['DENSITY','POP','NAME','STATE']]
# populationDF

#This is 2018 population data
census_response2 = requests.get(f"https://api.census.gov/data/2018/pep/population?get=COUNTY,DATE_CODE,DATE_DESC,DENSITY,POP,STATE&for=state:*&key={census_key}").json()
census_headers2 = census_response2.pop(0)
populationDF2 = pd.DataFrame(census_response2, columns = census_headers2)
populationDF2 = populationDF2[populationDF2['DATE_CODE']=='11']
populationDF2 = populationDF2[['DENSITY','POP','STATE']]
# populationDF2

# total jobs 2018 - from bureau of economic analysis (https://apps.bea.gov/iTable/iTable.cfm?reqid=70&step=1&isuri=1)
empByIndDF = pd.read_csv("totalEmpIndustry2018.csv")
workers_by_indDF = pd.read_csv("totaljobs2018.csv")

#Create master Dataframe for 2018 data
populationDF = pd.merge(populationDF, populationDF2, how='left', on="STATE", suffixes=['_2019','_2018'])
master_industryDF = pd.merge(empByIndDF, populationDF, right_on='NAME', left_on='GeoName', how='left')
master_industryDF = pd.merge(master_industryDF, workers_by_indDF, left_on='NAME', right_on='GeoName', how='left')
master_industryDF = master_industryDF[['NAME','STATE','LineCode','Description','2018_x','2018_y','POP_2018','POP_2019']]
master_industryDF = master_industryDF.rename(columns={'2018_x':'2018 Workers In Industry',
                                                      '2018_y':'2018 Working Pop',
                                                      'POP_2019':'2019 Overall Population',
                                                      'POP_2018':'2018 Overall Population',
                                                     })

#Imports state list and attaches the abbreviation
stateDF = pd.read_csv("stateabbrs.csv")
master_industryDF = pd.merge(master_industryDF, stateDF, how='left', left_on="NAME", right_on="State")
master_industryDF = master_industryDF[['NAME',
                                       'Code',
                                       'STATE',
                                       'LineCode',
                                       'Description',
                                       '2018 Workers In Industry',
                                       '2018 Working Pop',
                                       '2018 Overall Population',
                                       '2019 Overall Population'
                                       ]]
master_industryDF['2018 Overall Population'] = pd.to_numeric(master_industryDF['2018 Overall Population'])
master_industryDF['2019 Overall Population'] = pd.to_numeric(master_industryDF['2019 Overall Population'])
master_industryDF['2018 Workers In Industry'] = pd.to_numeric(master_industryDF['2018 Workers In Industry'], errors='coerce')
master_industryDF = master_industryDF.dropna(subset=['2018 Workers In Industry']) #This dropped Delaware's logging industry entry. Not sure of the downstream effects
master_industryDF['2018 Percentage of Working Pop'] = (master_industryDF['2018 Workers In Industry'].astype('int64') / master_industryDF['2018 Working Pop']) * 100
# master_industryDF['2019 Working Pop'] = round(master_industryDF['2019 Overall Population'] * (master_industryDF['2018 Percentage of Working Pop'] / 100),0) 
master_industryDF

Unnamed: 0,NAME,Code,STATE,LineCode,Description,2018 Workers In Industry,2018 Working Pop,2018 Overall Population,2019 Overall Population,2018 Percentage of Working Pop
0,Alabama,AL,01,70,Farm employment,42220.0,2691517,4887871,4903185,1.568632
1,Alabama,AL,01,100,"Forestry, fishing, and related activities",15895.0,2691517,4887871,4903185,0.590559
2,Alabama,AL,01,200,"Mining, quarrying, and oil and gas extra...",10560.0,2691517,4887871,4903185,0.392344
3,Alabama,AL,01,300,Utilities,14176.0,2691517,4887871,4903185,0.526692
4,Alabama,AL,01,400,Construction,148326.0,2691517,4887871,4903185,5.510870
...,...,...,...,...,...,...,...,...,...,...
1066,Wyoming,WY,56,1600,Health care and social assistance,30617.0,405010,577737,578759,7.559566
1067,Wyoming,WY,56,1700,"Arts, entertainment, and recreation",8255.0,405010,577737,578759,2.038221
1068,Wyoming,WY,56,1800,Accommodation and food services,35917.0,405010,577737,578759,8.868176
1069,Wyoming,WY,56,1900,Other services (except government and go...,17920.0,405010,577737,578759,4.424582


In [29]:
# US Bureau of Labor Statistics - Returns unadjusted unemployment rates
areaCodesDF = pd.read_csv("bureauoflaborstats.csv")
fips_dict = pd.read_csv("fipscodes.csv")
fips_dict = fips_dict[["State Abbreviation","FIPS Code"]]
fips_dict = fips_dict.to_dict()

#If we want to get more granular, we can switch the area_type_code from A to N
areaCodesList = areaCodesDF[areaCodesDF["area_type_code"]=="A"]
areaCodesList = list(areaCodesList["area_code"].values)

master_unemp_DF = pd.DataFrame({"year":[],"periodName":[],"value":[],"state":[]})

for x in areaCodesList:
    try:
        seriesid = f"LAU{x}03"
        fips_code = x[2:4]
        headers = {'Content-type': 'application/json'} 
        logging.debug(f"RETRIEVING: {x}, {fips_code}")
        data = json.dumps({"seriesid": [seriesid],"startyear":"2010", "endyear":"2020","registrationkey":bls_key})
        response = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers).json()
        state_unemp_DF = pd.DataFrame(response['Results']['series'][0]['data'])
        state_unemp_DF['state'] = fips_dict['State Abbreviation'][int(fips_code)]
        state_unemp_DF = state_unemp_DF[["year","periodName","value","state"]]
        state_unemp_DF['value'] = state_unemp_DF['value'].astype('float')
        master_unemp_DF = master_unemp_DF.append(state_unemp_DF)
    except:
        logging.error(f"Error retrieving data: {x}")


ERROR:root:Error retrieving data: ST5500000000000
ERROR:root:Error retrieving data: ST5600000000000
ERROR:root:Error retrieving data: ST7200000000000


In [30]:
# Create Unemployment rate DF and format month name to numeric
master_unemp_DF['periodName']  = master_unemp_DF['periodName'].apply(lambda x: time.strptime(x, '%B').tm_mon)
master_unemp_DF = master_unemp_DF.rename(columns={'periodName':'month'})
master_unemp_DF = pd.pivot_table(master_unemp_DF, index=['state','month'], columns=['year'], values=['value'])
master_unemp_DF

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,year,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
state,month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
AL,1,12.3,11.0,8.3,8.2,7.6,6.4,6.3,6.0,4.3,4.2,3.2
AL,2,12.1,10.5,8.4,7.9,7.8,6.3,6.2,5.6,4.4,3.8,2.9
AL,3,11.5,9.9,7.9,7.1,7.3,5.9,5.9,4.9,4.0,3.4,3.3
AL,4,10.4,9.4,7.2,6.3,6.2,5.6,5.3,4.1,3.5,2.6,
AL,5,9.9,9.6,7.8,6.6,6.5,6.0,5.3,3.9,3.5,2.5,
...,...,...,...,...,...,...,...,...,...,...,...,...
WY,8,8.6,7.9,7.6,6.6,6.6,6.7,6.0,5.3,5.0,4.8,
WY,9,8.1,7.5,7.0,6.0,5.9,5.9,5.4,4.6,4.5,4.3,
WY,10,8.0,7.2,7.0,6.2,5.8,5.7,5.3,4.6,4.5,4.6,
WY,11,8.3,7.1,7.0,6.3,6.0,5.9,5.1,4.9,4.5,4.8,
