# Data 4 Black Lives - COVID-19 Case/Death Disparities

Objective: Extract COVID-19 cases and deaths for each geographic location, both overall and for Black/African-Americans only.

# Setup

## Install modules

## Import modules

In [1]:
import os
import shutil
import subprocess
from datetime import datetime
from getpass import getpass
from importlib import reload

import numpy as np
import pandas as pd

## Set up directories

In [2]:
## On Google Colab
#home_dir = '/content'

## If on your laptop, set the path to the 'python' subfolder in your local copy of the repo
#home_dir_raw = '~/Documents/GitHub/d4bl_covid_tracker/workflow/python'
home_dir_raw = '~/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python'

## Expand '~' and make that folder the working directory; the relative
## 'data' and 'output' paths used in the following cells resolve against it
home_dir = os.path.expanduser(home_dir_raw)
os.chdir(home_dir)

In [3]:
# Create new data folders
# Start every run from an empty 'data' folder so files left over from a
# previous run are never picked up again.
if os.path.isdir('data'):
    print('removing existing data files')
    shutil.rmtree('data', ignore_errors=True)

# One sub-folder per location (e.g. the Massachusetts download lands in data/mass)
locations = ['mass', 'virginia', 'dc', 'ga', 'michigan', 'minnesota',
             'north carolina', 'texas_bexar', 'wisconsin_milwaukee']

# makedirs also creates the parent 'data' folder, and exist_ok keeps the cell
# re-runnable even if rmtree (which ignores errors) could not remove everything.
for loc in locations:
    os.makedirs(os.path.join('data', loc), exist_ok=True)

os.listdir('data')

removing existing data files


['mass',
 'virginia',
 'ga',
 'michigan',
 'minnesota',
 'dc',
 'north carolina',
 'wisconsin_milwaukee',
 'texas_bexar']

In [4]:
## Create output folder (no-op when it already exists, so the cell is safe to re-run)
os.makedirs('output', exist_ok=True)

In [5]:
!ls data

[34mdc[m[m                  [34mmichigan[m[m            [34mtexas_bexar[m[m
[34mga[m[m                  [34mminnesota[m[m           [34mvirginia[m[m
[34mmass[m[m                [34mnorth carolina[m[m      [34mwisconsin_milwaukee[m[m


## Helper functions

In [6]:
## Source: https://stackoverflow.com/questions/437589/how-do-i-unload-reload-a-module/487718#487718

# use instead of: from dfly_parser import parseMessages
#importOrReload("dfly_parser", "parseMessages")

def importOrReload(module_name, *names):
    """Import ``module_name`` (or reload it if already imported) and bind ``names``.

    Acts like ``from module_name import name1, name2, ...`` but picks up edits
    made to the module file since it was first imported.

    Parameters
    ----------
    module_name : str
        Name of the module to import or reload.
    *names : str
        Attributes of the module to copy into this module's global namespace.

    Raises
    ------
    AttributeError
        If one of ``names`` is not defined by the module.
    """
    import sys

    if module_name not in sys.modules:
        # First use: a regular import registers the module in sys.modules
        __import__(module_name, fromlist=names)
    else:
        # Already imported: re-execute the module so source changes take effect
        reload(sys.modules[module_name])

    # Re-bind every requested name to the freshly loaded objects
    target_namespace = globals()
    for attr in names:
        target_namespace[attr] = getattr(sys.modules[module_name], attr)

In [7]:
def get_github_file(file_link, new_file_name, token):
    """Download one file from a Github repository with curl.

    Parameters
    ----------
    file_link : str
        ``https://`` URL of the file to download.
    new_file_name : str
        Local path the downloaded content is written to.
    token : str
        Github token; it is inserted into the URL as ``https://<token>@...``
        to authenticate the request.

    Returns
    -------
    None
        The outcome is reported on stdout. The token is never printed.
    """
    # Only the scheme prefix is rewritten, hence the count of 1
    auth_link = file_link.replace('https://', 'https://' + token + '@', 1)

    # Arguments are passed as a list (no shell), so paths containing spaces work
    # and neither the token nor the file name is interpreted by a shell.
    # --fail makes curl exit non-zero on HTTP errors instead of saving the
    # server's error page as the downloaded file.
    curl_cmd = ['curl', '--fail', '-o', new_file_name, '-s', auth_link]

    try:
        exit_code = subprocess.call(curl_cmd)
    except OSError as err:
        # e.g. curl is not installed on this machine
        print('Failed to download {}: {}'.format(new_file_name, err))
        return

    if exit_code == 0:
        print('Successfully downloaded {}'.format(new_file_name))
    else:
        print('Failed to download {} (curl exit code {})'.format(new_file_name, exit_code))

In [8]:
def delete_file(file_name):
    """Remove ``file_name`` from disk; do nothing when the path does not exist."""
    if not os.path.exists(file_name):
        return
    os.remove(file_name)

## Set parameters

In [9]:
## Replicate the results in the 4/9/2020 D4BL report (True),
## or extract the most recently published data (False)?
validation_flag = False

## Download the helper modules from Github (True) or use the local modules (False)?
github_flag = False

## Download files from Github (optional)

In [10]:
if github_flag:
    ## Prompt for Github token (getpass does not echo what is typed)
    token = getpass("Provide your Github token: ")

    ## Select account name and repository name
    acct_name = 'd4bl'
    repo_name = 'COVID19_tracker_data_extraction'

    ## Path to repository
    repo_path = 'https://raw.githubusercontent.com/{}/{}/master'.format(acct_name, repo_name)

    ## Modules to download; each is saved next to the local copy with a '_gh' suffix.
    ## functions_case2_html is included because the "Load python scripts" cell
    ## imports functions_case2_html_gh when github_flag is True.
    github_modules = ['functions_case1_csv_xlsx', 'functions_case2_html', 'misc_helper_functions']

    ## Download the files
    for module_name in github_modules:
        local_copy = os.path.join(home_dir, module_name + '_gh.py')
        # Delete first so a failed download cannot leave a stale copy behind
        delete_file(local_copy)
        get_github_file(file_link = '{}/workflow/python/{}.py'.format(repo_path, module_name),
                        new_file_name = local_copy, token = token)

In [11]:
os.listdir()

['functions_case1_csv_xlsx.py',
 'functions_case2_html.py',
 '.DS_Store',
 'misc_helper_functions.py',
 'output',
 'helper_functions',
 '__pycache__',
 'case1_csv_xlsx',
 'case3_pdf_table',
 '.ipynb_checkpoints',
 'case2_html',
 'data',
 'COVID_DataExtraction_Workflow.ipynb',
 'misc_helper_functions_gh.py',
 'functions_case1_csv_xlsx_gh.py']

In [12]:
pwd

'/Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python'

In [13]:
#os.chdir(home_dir)
#! cat functions_case1_csv_xlsx_gh.py

In [14]:
!ls

COVID_DataExtraction_Workflow.ipynb functions_case1_csv_xlsx_gh.py
__pycache__                         functions_case2_html.py
case1_csv_xlsx                      helper_functions
case2_html                          misc_helper_functions.py
case3_pdf_table                     misc_helper_functions_gh.py
data                                output
functions_case1_csv_xlsx.py


## Load python scripts

In [53]:
# Extraction functions for the locations whose data comes from CSV/XLSX files
functions_case1 = ["data_extract_massachusetts", "data_extract_virginia",
                   "data_extract_washingtonDC", "data_extract_georgia"]

# Extraction functions for the locations whose data is read from HTML source code
case2_locations = ['michigan', 'minnesota', 'north_carolina', 'texas_bexar_county', 'wisconsin_milwaukee']
functions_case2 = ['data_extract_' + loc for loc in case2_locations]

# Modules downloaded from Github are saved with a '_gh' suffix
module_suffix = '_gh' if github_flag else ''

# importOrReload accepts any number of names, so each module is imported
# (or reloaded) exactly once instead of once per function.
importOrReload("functions_case1_csv_xlsx" + module_suffix, *functions_case1)
importOrReload("functions_case2_html" + module_suffix, *functions_case2)

## Create empty dictionary to hold results

In [40]:
## Results keyed by location name; each extraction cell below adds one entry
output = {}

# Data Extraction

## Case 1: Read from CSV and XLSX files

In [41]:
# Massachusetts: store the extracted summary, then show all results so far
output['Massachusetts'] = data_extract_massachusetts(
    validation=validation_flag,
    home_dir=home_dir,
)

output

Navigate to Massachusetts data folder
Get URLs on Massachusetts COVID-19 response reporting page
['/doc/covid-19-raw-data-may-8-2020/download']
Find the URL corresponding to the COVID-19 data file
https://www.mass.gov/doc/covid-19-raw-data-may-8-2020/download
Download the file
/Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python/data/mass/massachusetts.zip
file download success!
Unzip the file
Get the race/ethnicity breakdown
Get date of most recent data published
Get the data for only most recent data published (or validation date)
Total cases
Total deaths
AA cases
AA deaths
Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'}}

In [42]:
# Virginia: store the extracted summary, then show all results so far
output['Virginia'] = data_extract_virginia(
    validation=validation_flag,
    home_dir=home_dir,
)

output

Navigate to Virginia data folder
Download the CSV for race
file download success!
Read in the file
Get only the most recent data published
Roll up counts to race
Total cases
Total deaths
AA cases
AA deaths
Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'}}

In [43]:
# Washington, DC: store the extracted summary, then show all results so far
output['Washington DC'] = data_extract_washingtonDC(
    validation=validation_flag,
    home_dir=home_dir,
)

output

Navigate to Washington, DC data folder
Find links to all Washington, DC COVID data files
Find date strings in data files
Convert date strings to date
Find most recent date
Convert to date format expected in data file
Download the most recent data file
file download success!
Load the race/ethnicity breakdown of cases
Set column names
Get date of most recent data published
Get cases associated with desired timestamp (most recent or 4/9/2020 validation date)
Load the race/ethnicity breakdown of deaths
Set column names
Get deaths associated with desired timestamp (most recent or 4/9/2020 validation date)
Get report date, formatted for output
Total cases
Total deaths
AA cases
AA deaths
Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28,
  'Status code': 'Success!'}}

In [44]:
# Georgia: store the extracted summary, then show all results so far
output['Georgia'] = data_extract_georgia(
    validation=validation_flag,
    home_dir=home_dir,
)

output

Navigate to Georgia data folder
Download file
Read contents of the zip
Report date = last update of the demographics.csv file in the ZIP archive
Load demographics CSV
African American cases and deaths
Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28,
  'Status code': 'Success!'},
 'Georgia': {'Location': 'Georgia',
  'Date Published': '5/9/2020',
  'Total Cases': 32504,
  'Total Deaths': 1400,
  'Pct Cases Black/AA': 35.76,
  'Pct Deaths Black/AA': 49.57,
  'Status code': 'Success!'}}

## Case 2: Extract data from HTML source code

In [54]:
# Michigan: store the extracted summary, then show all results so far
# NOTE(review): the combined table lists Michigan's Pct Cases/Deaths Black/AA as
# 0.07 and 0.93 while every other location is on a 0-100 scale -- confirm what
# data_extract_michigan returns for these two fields.
output['Michigan'] = data_extract_michigan(
    validation=validation_flag,
    home_dir=home_dir,
)

output

request successful
Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28,
  'Status code': 'Success!'},
 'Georgia': {'Location': 'Georgia',
  'Date Published': '5/9/2020',
  'Total Cases': 32504,
  'Total Deaths': 1400,
  'Pct Cases Black/AA': 35.76,
  'Pct Deaths Black/AA': 49.57,
  'Status code': 'Success!'},
 'Michigan': {'Location': 'Michigan',
  'Date Published': '5/8/2020',
  'Total Cases': 46326,
  'Total Deaths': 4393,
  'Pct Cases Black/AA': 0

In [46]:
# Minnesota: store the extracted summary, then show all results so far
output['Minnesota'] = data_extract_minnesota(
    validation=validation_flag,
    home_dir=home_dir,
)

output

request successful
Date: 5/9/2020
Number Cases: 10,790
    
Number Deaths: 558
	
Pct Cases Black/AA: 17
Pct Deaths Black/AA: 6
Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28,
  'Status code': 'Success!'},
 'Georgia': {'Location': 'Georgia',
  'Date Published': '5/9/2020',
  'Total Cases': 32504,
  'Total Deaths': 1400,
  'Pct Cases Black/AA': 35.76,
  'Pct Deaths Black/AA': 49.57,
  'Status code': 'Success!'},
 'Michigan': {'Location': 'Michigan',
  'Date Published': '5/8/2020',
  'Total Cases': 46326,
  'Total Deaths': 4393,
  'Pct Cases Black/AA': 0

In [47]:
# North Carolina: store the extracted summary, then show all results so far
# NOTE(review): the recorded run reports 544 cases and 14,360 deaths -- the two
# values look swapped; check the parsing in data_extract_north_carolina.
output['North Carolina'] = data_extract_north_carolina(
    validation=validation_flag,
    home_dir=home_dir,
)

output

request successful
Date: 5/9/2020
Number Cases: 544
Number Deaths: 14,360
Pct Cases Black/AA: 35
Pct Deaths Black/AA: 34
Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28,
  'Status code': 'Success!'},
 'Georgia': {'Location': 'Georgia',
  'Date Published': '5/9/2020',
  'Total Cases': 32504,
  'Total Deaths': 1400,
  'Pct Cases Black/AA': 35.76,
  'Pct Deaths Black/AA': 49.57,
  'Status code': 'Success!'},
 'Michigan': {'Location': 'Michigan',
  'Date Published': '5/8/2020',
  'Total Cases': 46326,
  'Total Deaths': 4393,
  'Pct Cases Black/AA': 0

In [48]:
# Texas (Bexar County): store the extracted summary, then show all results so far
output['Texas -- Bexar County'] = data_extract_texas_bexar_county(
    validation=validation_flag,
    home_dir=home_dir,
)

output

Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28,
  'Status code': 'Success!'},
 'Georgia': {'Location': 'Georgia',
  'Date Published': '5/9/2020',
  'Total Cases': 32504,
  'Total Deaths': 1400,
  'Pct Cases Black/AA': 35.76,
  'Pct Deaths Black/AA': 49.57,
  'Status code': 'Success!'},
 'Michigan': {'Location': 'Michigan',
  'Date Published': '5/8/2020',
  'Total Cases': 46326,
  'Total Deaths': 4393,
  'Pct Cases Black/AA': 0

In [49]:
# Wisconsin (Milwaukee): store the extracted summary, then show all results so far
output['Wisconsin -- Milwaukee'] = data_extract_wisconsin_milwaukee(
    validation=validation_flag,
    home_dir=home_dir,
)

output

Success!


{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25,
  'Status code': 'Success!'},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/9/2020',
  'Total Cases': 23196,
  'Total Deaths': 827,
  'Pct Cases Black/AA': 17.0,
  'Pct Deaths Black/AA': 22.37,
  'Status code': 'Success!'},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28,
  'Status code': 'Success!'},
 'Georgia': {'Location': 'Georgia',
  'Date Published': '5/9/2020',
  'Total Cases': 32504,
  'Total Deaths': 1400,
  'Pct Cases Black/AA': 35.76,
  'Pct Deaths Black/AA': 49.57,
  'Status code': 'Success!'},
 'Michigan': {'Location': 'Michigan',
  'Date Published': '5/8/2020',
  'Total Cases': 46326,
  'Total Deaths': 4393,
  'Pct Cases Black/AA': 0

# Combining into a single dataframe

In [55]:
# One row per location: the outer keys of `output` become the index after the transpose
output_df = pd.DataFrame(output).T

# Sanity check: a location cannot report more deaths than cases. A violation
# usually means an extraction function mixed up the two fields, so flag it
# here (without raising) before the table is written to disk.
if {'Total Cases', 'Total Deaths'}.issubset(output_df.columns):
    total_cases = pd.to_numeric(output_df['Total Cases'], errors='coerce')
    total_deaths = pd.to_numeric(output_df['Total Deaths'], errors='coerce')
    deaths_exceed_cases = total_deaths > total_cases
    suspect_locations = [str(loc) for loc in deaths_exceed_cases[deaths_exceed_cases].index]
    if suspect_locations:
        print('WARNING: Total Deaths exceeds Total Cases for: {}'.format(', '.join(suspect_locations)))

output_df

Unnamed: 0,Location,Date Published,Total Cases,Total Deaths,Pct Cases Black/AA,Pct Deaths Black/AA,Status code
Massachusetts,Massachusetts,5/8/2020,75333,4702,8.18,5.25,Success!
Virginia,Virginia,5/9/2020,23196,827,17.0,22.37,Success!
Washington DC,"Washington, DC",5/8/2020,5899,304,47.35,79.28,Success!
Georgia,Georgia,5/9/2020,32504,1400,35.76,49.57,Success!
Michigan,Michigan,5/8/2020,46326,4393,0.07,0.93,Success!
Minnesota,Minnesota,5/9/2020,10790,558,17.0,6.0,Success!
North Carolina,North Carolina,5/9/2020,544,14360,35.0,34.0,Success!
Texas -- Bexar County,Texas - Bexar County,5/8/2020,1805,54,8.92,24.07,Success!
Wisconsin -- Milwaukee,Wisconsin - Milwaukee,5/9/2020,3839,212,39.52,49.53,Success!


In [56]:
# Stamp the export with today's date so each day's run gets its own file
date_stamp = datetime.now().strftime('%Y-%m-%d')
csv_name = 'covid_disparities_output_' + date_stamp + '.csv'
out_file = os.path.join(home_dir, 'output', csv_name)
output_df.to_csv(out_file)

In [57]:
# Same table as an Excel workbook; date_stamp comes from the CSV export cell
xlsx_name = 'covid_disparities_output_' + date_stamp + '.xlsx'
out_file = os.path.join(home_dir, 'output', xlsx_name)
output_df.to_excel(out_file)

In [58]:
# List the exported files without changing the working directory, so that
# re-running earlier cells (which use relative 'data'/'output' paths) still
# operates in home_dir.
sorted(os.listdir(os.path.join(home_dir, 'output')))

covid_disparities_output_2020-05-08.csv
covid_disparities_output_2020-05-08.xlsx
covid_disparities_output_2020-05-09.csv
covid_disparities_output_2020-05-09.xlsx
