# Data 4 Black Lives - COVID-19 Case/Death Disparities

Objective: Extract COVID-19 cases and deaths for each geographic location, both overall and for Black/African-Americans only.

# Setup

## Install modules

## Import modules

In [1]:
import os
import shutil
import subprocess
from datetime import datetime
from getpass import getpass
from importlib import reload

import numpy as np
import pandas as pd

## Set up directories

In [2]:
## On Google Colab
#home_dir = '/content'

## If on your laptop, set the path to the 'python' subfolder in your local copy of the repo
#home_dir_raw = '~/Documents/GitHub/d4bl_covid_tracker/workflow/python'
home_dir_raw = '~/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python'

# Expand the leading '~' into the current user's home directory, then make that
# folder the working directory: the 'data' and 'output' folders created further
# down are addressed with relative paths.
home_dir = os.path.expanduser(home_dir_raw)
os.chdir(home_dir)

In [3]:
home_dir

'/Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python'

In [4]:
# Create new data folders
# Start from an empty 'data' tree. Errors from rmtree are deliberately NOT
# ignored: if the old tree cannot be removed we want to see the real cause
# rather than a confusing FileExistsError from the folder creation below.
if os.path.isdir('data'):
    print('removing existing data files')
    shutil.rmtree('data')

locations = ['mass', 'virginia', 'dc', 'ga', 'michigan', 'minnesota',
             'north carolina', 'texas_bexar', 'wisconsin_milwaukee', 'san_diego', 'florida']

# One sub-folder per location. makedirs also creates the parent 'data' folder,
# and a plain loop avoids building a throw-away list of None values.
for loc in locations:
    os.makedirs(os.path.join('data', loc), exist_ok=True)

# Last expression of the cell: display the folders that now exist.
os.listdir('data')

removing existing data files


['mass',
 'virginia',
 'ga',
 'michigan',
 'minnesota',
 'florida',
 'dc',
 'north carolina',
 'san_diego',
 'wisconsin_milwaukee',
 'texas_bexar']

In [5]:
## Create output folder
# exist_ok=True makes this a no-op when the folder is already present.
os.makedirs('output', exist_ok=True)

In [6]:
!ls data

[34mdc[m[m                  [34mmass[m[m                [34mnorth carolina[m[m      [34mvirginia[m[m
[34mflorida[m[m             [34mmichigan[m[m            [34msan_diego[m[m           [34mwisconsin_milwaukee[m[m
[34mga[m[m                  [34mminnesota[m[m           [34mtexas_bexar[m[m


## Helper functions

In [7]:
## Source: https://stackoverflow.com/questions/437589/how-do-i-unload-reload-a-module/487718#487718

# Drop-in replacement for `from <module> import <name>, ...` that also picks up
# edits made to the module since it was last imported, e.g.
#importOrReload("dfly_parser", "parseMessages")

def importOrReload(module_name, *names):
    """Import `module_name` (or reload it if already imported) and bind `names` as globals.

    Parameters
    ----------
    module_name : str
        Name of the module as it would appear in an `import` statement.
    *names : str
        Attributes of the module to copy into this module's global namespace.

    Raises
    ------
    AttributeError
        If the module does not define one of `names`.
    """
    import sys

    if module_name not in sys.modules:
        # First use in this session: a regular import registers it in sys.modules.
        __import__(module_name, fromlist=names)
    else:
        # Already loaded: re-execute the module so source edits take effect.
        reload(sys.modules[module_name])

    namespace = globals()
    for attr in names:
        namespace[attr] = getattr(sys.modules[module_name], attr)

In [8]:
def get_github_file(file_link, new_file_name, token):
    """Download one file from Github with curl, authenticating with `token`.

    Parameters
    ----------
    file_link : str
        'https://...' URL of the raw file.
    new_file_name : str
        Path the downloaded content is written to.
    token : str
        Github token; it is inserted into the URL as 'https://<token>@...'.

    Returns
    -------
    None
        The outcome is reported on stdout; a failed download does not raise.
    """
    auth_link = file_link.replace('https://', 'https://' + token +  '@')

    # Pass the arguments as a list (no shell) so that paths or tokens containing
    # spaces or shell metacharacters cannot break or alter the command.
    # --fail makes curl exit non-zero on HTTP errors instead of saving the
    # server's error page as if it were the requested file.
    result = subprocess.run(
        ['curl', '--fail', '--silent', '--output', new_file_name, auth_link]
    )

    # Only claim success when curl actually succeeded.
    if result.returncode == 0:
        print('Successfully downloaded {}'.format(new_file_name))
    else:
        print('Failed to download {} (curl exit code {})'.format(new_file_name, result.returncode))

In [9]:
def delete_file(file_name):
    """Remove `file_name` from disk; do nothing when no such path exists."""
    if not os.path.exists(file_name):
        return
    os.remove(file_name)

## Set parameters

In [10]:
## Replicate results in 4/9/2020 D4BL report?
## This value is passed as `validation` to every data_extract_* call below.
validation_flag = False

## Download from Github (True) or use local modules (False)?
## When True, the '*_gh' copies of the function modules are downloaded and
## imported instead of the local files.
github_flag = False

## Download files from Github (optional)

In [11]:
if github_flag:
    ## Prompt for Github token
    token = getpass("Provide your Github token: ")

    ## Select Github account name and repository name
    acct_name = 'd4bl'
    repo_name = 'COVID19_tracker_data_extraction'

    ## Path to repository
    repo_path = 'https://raw.githubusercontent.com/{}/{}/master'.format(acct_name, repo_name)

    ## Download the files
    ## Each module is saved into home_dir with a '_gh' suffix, next to the
    ## local file of the same base name.
    for module_stem in ('functions_case1_csv_xlsx',
                        'functions_case2_html',
                        'functions_case3_pdf_table',
                        'misc_helper_functions'):
        get_github_file(file_link = '{}/workflow/python/{}.py'.format(repo_path, module_stem),
                        new_file_name = os.path.join(home_dir, module_stem + '_gh.py'),
                        token = token)


In [12]:
os.listdir()

['functions_case1_csv_xlsx.py',
 'functions_case2_html.py',
 '.DS_Store',
 'misc_helper_functions.py',
 'output',
 '__pycache__',
 'functions_case3_pdf_table.py',
 '.ipynb_checkpoints',
 'functions_case2_html_ns.py',
 'data',
 'COVID_DataExtraction_Workflow.ipynb']

In [13]:
pwd

'/Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python'

In [14]:
#os.chdir(home_dir)
#! cat functions_case1_csv_xlsx_gh.py

In [15]:
!ls

COVID_DataExtraction_Workflow.ipynb functions_case2_html_ns.py
[34m__pycache__[m[m                         functions_case3_pdf_table.py
[34mdata[m[m                                misc_helper_functions.py
functions_case1_csv_xlsx.py         [34moutput[m[m
functions_case2_html.py


## Load python scripts

In [16]:
case1_locations = ["massachusetts", "virginia", "washingtonDC", "georgia"]
case2_locations = ['michigan', 'minnesota', 'north_carolina', 'texas_bexar_county', 'wisconsin_milwaukee']
case3_locations = ['san_diego', 'florida']
functions_case1 = ['data_extract_' + loc for loc in case1_locations]
functions_case2 = ['data_extract_' + loc for loc in case2_locations]
functions_case3 = ['data_extract_' + loc for loc in case3_locations]

# Modules downloaded from Github are saved with a '_gh' suffix.
module_suffix = '_gh' if github_flag else ''

# Import (or reload) every module exactly once and bind all of its extraction
# functions in that single call. Calling importOrReload once per function name
# would reload the module after the first name, leaving earlier globals bound
# to function objects from the module state before the reload.
importOrReload('functions_case1_csv_xlsx' + module_suffix, *functions_case1)
importOrReload('functions_case2_html' + module_suffix, *functions_case2)
importOrReload('functions_case3_pdf_table' + module_suffix, *functions_case3)

## Create empty dictionary to hold results

In [17]:
## Results keyed by location label; filled in by the extraction cells below
output = {}

# Data Extraction

## Case 1: Read from CSV and XLSX files

In [18]:
output['Massachusetts'] = data_extract_massachusetts(
    validation=validation_flag,
    home_dir=home_dir,
)

Navigate to Massachusetts data folder
Get URLs on Massachusetts COVID-19 response reporting page
['/doc/covid-19-raw-data-may-10-2020/download']
Find the URL corresponding to the COVID-19 data file
https://www.mass.gov/doc/covid-19-raw-data-may-10-2020/download
Download the file
/Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python/data/mass/massachusetts.zip
file download success!
Unzip the file
Get the race/ethnicity breakdown
Get date of most recent data published
Get the data for only most recent data published (or validation date)
Total cases
Total deaths
AA cases
AA deaths
Success!


In [19]:
output['Virginia'] = data_extract_virginia(
    validation=validation_flag,
    home_dir=home_dir,
)

Navigate to Virginia data folder
Download the CSV for race
file download success!
Read in the file
Get only the most recent data published
Roll up counts to race
Total cases
Total deaths
AA cases
AA deaths
Success!


In [20]:
output['Washington DC'] = data_extract_washingtonDC(
    validation=validation_flag,
    home_dir=home_dir,
)

Navigate to Washington, DC data folder
Find links to all Washington, DC COVID data files
Find date strings in data files
Convert date strings to date
Find most recent date
Convert to date format expected in data file
Download the most recent data file
file download success!
Load the race/ethnicity breakdown of cases
Set column names
Get date of most recent data published
Get cases associated with desired timestamp (most recent or 4/9/2020 validation date)
Load the race/ethnicity breakdown of deaths
Set column names
Get deaths associated with desired timestamp (most recent or 4/9/2020 validation date)
Get report date, formatted for output
Total cases
Total deaths
AA cases
AA deaths
Success!


In [21]:
output['Georgia'] = data_extract_georgia(
    validation=validation_flag,
    home_dir=home_dir,
)

Navigate to Georgia data folder
Download file
Read contents of the zip
Report date = last update of the demographics.csv file in the ZIP archive
Load demographics CSV
African American cases and deaths
Success!


## Case 2: Extract data from HTML source code

In [22]:
output['Michigan'] = data_extract_michigan(
    validation=validation_flag,
    home_dir=home_dir,
)

request successful
Success!


In [23]:
output['Minnesota'] = data_extract_minnesota(
    validation=validation_flag,
    home_dir=home_dir,
)

request successful
Date: 5/10/2020
Number Cases: 11271
Number Deaths: 578
Pct Cases Black/AA: 
Pct Deaths Black/AA: 
Success!


In [24]:
# NOTE(review): the saved output of this cell reports "Number Cases: 547" and
# "Number Deaths: 14764" (fewer cases than deaths), and the combined table
# carries the same values. The two numbers look swapped -- confirm the parsing
# inside data_extract_north_carolina before relying on this row.
output['North Carolina'] = data_extract_north_carolina(validation = validation_flag, 
                                         home_dir = home_dir)

request successful
Date: 5/10/2020
Number Cases: 547
Number Deaths: 14764
Pct Cases Black/AA: 35
Pct Deaths Black/AA: 34
Success!


In [25]:
output['Texas -- Bexar County'] = data_extract_texas_bexar_county(
    validation=validation_flag,
    home_dir=home_dir,
)

Success!


In [26]:
output['Wisconsin -- Milwaukee'] = data_extract_wisconsin_milwaukee(
    validation=validation_flag,
    home_dir=home_dir,
)

Success!


## Case 3: Extract data from PDF tables

In [27]:
output['San Diego'] = data_extract_san_diego(
    validation=validation_flag,
    home_dir=home_dir,
)

file download success!


'pages' argument isn't specified.Will extract only from page 1 by default.


file download success!


'pages' argument isn't specified.Will extract only from page 1 by default.


4926


Got stderr: May 10, 2020 7:19:45 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
May 10, 2020 7:19:46 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
May 10, 2020 7:19:46 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



175


### Note about Florida:

The Florida PDF is large and takes a long time to download. Once today's file has been downloaded, you can set `refresh = False` on subsequent runs to reuse the copy already saved locally.

In [35]:
# refresh = True requests a new download of the (large) Florida PDF; per the
# note above, switch to refresh = False to reuse a copy saved earlier today.
# NOTE(review): the saved output of this cell ends with
# "Execution error! could not convert string to float: '16%'", so the PDF
# parser presumably meets a percentage string where it expects a number --
# confirm inside data_extract_florida.
output['Florida'] = data_extract_florida(validation = validation_flag, 
                                         home_dir = home_dir, refresh = True)  

Find daily Florida URL
https://floridadisaster.org/globalassets/covid19/dailies/covid-19-data---daily-report-2020-05-10-10am.pdf
file download success!
Parse the PDF
Execution error!
could not convert string to float: '16%'


# Combining into a single dataframe

In [29]:
# pd.DataFrame(output) puts each location key in its own column; transposing
# yields one row per location.
# NOTE(review): the saved table below has 10 rows (no Florida) while the saved
# 'Status code' array further down has 11 entries, so the stored outputs come
# from different runs -- re-run the notebook top to bottom before publishing.
output_df = pd.DataFrame(output).T
output_df

Unnamed: 0,Location,Date Published,Total Cases,Total Deaths,Count Cases Black/AA,Count Deaths Black/AA,Pct Cases Black/AA,Pct Deaths Black/AA,Status code
Massachusetts,Massachusetts,5/9/2020,76743.0,4840.0,6288.0,252.0,8.19,5.21,Success!
Virginia,Virginia,5/10/2020,24081.0,839.0,4031.0,187.0,16.74,22.29,Success!
Washington DC,"Washington, DC",5/8/2020,5899.0,304.0,2793.0,241.0,47.35,79.28,Success!
Georgia,Georgia,5/10/2020,33508.0,1405.0,11857.0,697.0,35.39,49.61,Success!
Michigan,Michigan,5/10/2020,47138.0,4551.0,15084.0,1866.0,32.0,41.0,Success!
Minnesota,Minnesota,5/10/2020,11271.0,578.0,1916.0,35.0,17.0,6.0,Success!
North Carolina,North Carolina,5/10/2020,547.0,14764.0,191.0,5020.0,35.0,34.0,Success!
Texas -- Bexar County,Texas -- Bexar County,5/9/2020,1805.0,54.0,168.0,13.0,9.31,24.07,Success!
Wisconsin -- Milwaukee,Wisconsin -- Milwaukee,5/10/2020,3981.0,217.0,1544.0,106.0,38.78,48.85,Success!
San Diego,California - San Diego,5/9/2020,4926.0,175.0,167.0,4.0,3.39,2.29,Success!


In [30]:
output_df['Status code'].values

array(['Success!', 'Success!', 'Success!', 'Success!', 'Success!',
       'Success!', 'Success!', 'Success!', 'Success!', 'Success!',
       "An error occured. ... FileNotFoundError(2, 'No such file or directory')"],
      dtype=object)

In [31]:
date_stamp = datetime.now().strftime('%Y-%m-%d')

# The 'csv' sub-folder is not created anywhere else in the notebook, and
# to_csv does not create missing parent directories, so make sure it exists.
csv_dir = os.path.join(home_dir, 'output', 'csv')
os.makedirs(csv_dir, exist_ok=True)

out_file = os.path.join(csv_dir, 'covid_disparities_output_' + date_stamp + '.csv')
output_df.to_csv(out_file)

In [32]:
# The 'xlsx' sub-folder is not created anywhere else in the notebook, and
# to_excel does not create missing parent directories, so make sure it exists.
xlsx_dir = os.path.join(home_dir, 'output', 'xlsx')
os.makedirs(xlsx_dir, exist_ok=True)

out_file = os.path.join(xlsx_dir, 'covid_disparities_output_' + date_stamp + '.xlsx')
output_df.to_excel(out_file)

In [33]:
# Show what has been written to the output folder. The folder is listed by
# path instead of changing into it, so the working directory stays at home_dir
# and relative paths elsewhere in the notebook keep resolving there on re-runs.
sorted(os.listdir(os.path.join(home_dir, 'output')))

covid_disparities_output_2020-05-08.csv
covid_disparities_output_2020-05-08.xlsx
covid_disparities_output_2020-05-09.csv
covid_disparities_output_2020-05-09.xlsx
covid_disparities_output_2020-05-10.csv
covid_disparities_output_2020-05-10.xlsx


## Clean up downloaded files (if applicable)

In [36]:
# The '*_gh.py' copies are downloaded into home_dir, but an earlier cell changes
# the working directory to the output folder, so bare file names would point
# at the wrong place and nothing would be deleted. Use absolute paths.
for gh_module in ('functions_case1_csv_xlsx_gh.py',
                  'functions_case2_html_gh.py',
                  'functions_case3_pdf_table_gh.py',
                  'misc_helper_functions_gh.py'):
    delete_file(os.path.join(home_dir, gh_module))