In [7]:
import numpy as np
import pandas as pd

In [8]:
# population
# https://www.census.gov/data/tables/time-series/demo/popest/2010s-state-detail.html
# health care worker
# https://www.kff.org/other/state-indicator/total-health-care-employment           

In [25]:
# Directory that holds the raw input files and receives the processed CSVs.
# The loader functions in this notebook build file names with plain string
# concatenation (`_DATA_PATH_ + 'file'`), so the trailing slash is required.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
_DATA_PATH_ = "/Users/xingruchen/Dropbox/COV/Vaccine/data/"

In [28]:
################
# load the raw data of Health Care Worker and save the processed data
################
# notes
################
# Estimates do not include self-employed workers.
# The Occupational Employment Statistics (OES) program provides wage and employment estimates by state and industry. 
# These estimates are intended for research purposes and users should be aware of the limitations of the data. 
# For more information, please see the Bureau of Labor Statistics OES Research Estimates by State and Industry.
################ sources
# Bureau of Labor Statistics, State Occupational Employment Statistics Survey, May 2018. 
# Available at http://www.bls.gov/oes/tables.htm.
################ definitions
# Health Care Employment includes the following subsectors in the National Industry-Specific Occupational Employment and Wage Estimates: 
# Ambulatory Health Care Services, Hospitals, and Nursing and Residential Care Facilities.
################ footnotes
# US totals include the territories.
def load_hcw_raw():
    """Trim the raw health care worker table and save it as ``hcw.csv``.

    Reads ``hcw_raw.csv`` from ``_DATA_PATH_`` with the columns labelled
    ``jurisdiction`` and ``number``, discards the first 3 and the last 10
    rows, renumbers the remaining rows from zero, and writes the result to
    ``hcw.csv`` in the same directory without the index column.

    Returns:
        None. The only effect is the CSV file written to disk.
    """
    source_file = _DATA_PATH_ + 'hcw_raw.csv'
    # The raw-string prefix changes nothing here (the literal has no
    # backslashes); it is kept so the literal matches the original.
    target_file = _DATA_PATH_ + r'hcw.csv'
    column_names = ["jurisdiction", "number"]

    # NOTE(review): the 3 leading / 10 trailing rows are presumably the
    # title/header lines and footnotes of the downloaded file -- confirm
    # against hcw_raw.csv if the source export changes.
    trimmed = (
        pd.read_csv(source_file, names = column_names)
        .iloc[3:-10]
        .reset_index(drop = True)
    )
    trimmed.to_csv(target_file, index = False)

In [29]:
# Run the health care worker preprocessing. The call returns nothing; its
# effect is the hcw.csv file written under _DATA_PATH_.
load_hcw_raw()

In [44]:
################
# load the raw data of Population and save the processed data
################
# notes
# Estimates of the Total Resident Population and Resident Population Age 18 
# Years and Older for the United States, States, and Puerto Rico: July 1, 2019
# Note: The estimates are based on the 2010 Census and reflect changes to the April 1, 2010 population 
# due to the Count Question Resolution program and geographic program revisions. 
# See Geographic Terms and Definitions at 
# https://www.census.gov/programs-surveys/popest/guidance-geographies/terms-and-definitions.html 
# for a list of the states that are included in each region.  
# All geographic boundaries for the 2019 population estimates series 
# except statistical area delineations are as of January 1, 2019.  
# For population estimates methodology statements, see 
# https://www.census.gov/programs-surveys/popest/technical-documentation/methodology.html.  
# The estimates add births to, subtract deaths from, 
# and add net migration to the enumerated resident population from the 2010 Census.  
# The enumerated resident population is the total population (citizen and noncitizen) 
# with usual residence in the 50 states and the District of Columbia.  
# See https://www.census.gov/glossary/#term_Apportionmentpopulation and 
# https://www.census.gov/glossary/#term_Residentpopulation.
def load_population_raw():
    """Clean the raw population workbook and save it as ``population.csv``.

    Reads ``population_raw.xlsx`` from ``_DATA_PATH_``, labels its columns
    ``jurisdiction``, ``number``, ``adult_number`` and ``adult_percent``,
    drops the first 3 and last 5 rows plus every row with a missing value,
    renumbers the remaining rows from zero, removes a leading ``.`` from
    jurisdiction names, and writes the result to ``population.csv`` in the
    same directory without the index column.

    Returns:
        None. The only effect is the CSV file written to disk.
    """
    raw = pd.read_excel(_DATA_PATH_ + r'population_raw.xlsx', names = ["jurisdiction", "number", "adult_number", "adult_percent"])
    # NOTE(review): the 3 leading / 5 trailing rows are presumably the
    # remaining header lines and the footnotes of the workbook -- confirm
    # against population_raw.xlsx if the source file changes.
    raw = raw.iloc[3:-5]
    raw = raw.dropna()
    raw = raw.reset_index(drop = True)

    def strip_leading_dot(name):
        # Only a dot in first position is a prefix to remove. Testing for a
        # dot anywhere in the name would also chop the first letter off
        # names that merely contain one (e.g. an abbreviation).
        if isinstance(name, str) and name.startswith('.'):
            return name[1:]
        return name

    # Element-wise map over the single column instead of a row-wise
    # DataFrame.apply: only this column is read, and the result is always a
    # Series, even when no rows remain.
    raw['jurisdiction'] = raw['jurisdiction'].map(strip_leading_dot)
    raw.to_csv(_DATA_PATH_ + r'population.csv', index = False)


In [45]:
# Run the population preprocessing. The call returns nothing; its effect is
# the population.csv file written under _DATA_PATH_.
load_population_raw()