# Data 4 Black Lives - COVID-19 Case/Death Disparities

Objective: Extract total COVID-19 cases and deaths for each geographic location, along with the percentage of cases and deaths among Black/African American residents.

# Setup

## Install modules

## Import modules

In [1]:
import os
import shutil
import subprocess
from datetime import datetime
from getpass import getpass
from importlib import reload

import numpy as np
import pandas as pd

## Set up directories

In [2]:
# Working directory for the notebook: the folder holding the workflow's Python
# modules. Alternative locations used previously are kept below for reference.
#home_dir = '/content'
#home_dir = '~/GitHub/d4bl_covid_tracker/workflow/python'

# os.chdir does not expand '~', so resolve it to the user's home directory once
# here; every later use of home_dir then receives a real filesystem path.
home_dir = os.path.expanduser('~/GitHub/COVID19_tracker_data_extraction/workflow/python')
os.chdir(home_dir)

In [3]:
# Create new data folders, starting from a clean slate: anything left over from
# a previous run is removed first.
if os.path.isdir('data'):
    print('removing existing data files')
    shutil.rmtree('data', ignore_errors=True)

# rmtree above ignores errors, so part of the old tree may survive; makedirs with
# exist_ok=True creates 'data' and each location sub-folder without failing on
# directories that are still present.
for location_dir in ('mass', 'virginia', 'dc', 'ga'):
    os.makedirs(os.path.join('data', location_dir), exist_ok=True)

os.listdir('data')

['mass', 'virginia', 'ga', 'dc']

In [4]:
## Make sure the output folder exists; an existing folder is left untouched
os.makedirs('output', exist_ok=True)

In [5]:
# Show the data sub-folders with pure Python instead of a shell escape, so the
# cell works on any platform and the output carries no terminal colour codes.
sorted(os.listdir('data'))

dc       ga       mass     virginia


## Helper functions

In [6]:
## Source: https://stackoverflow.com/questions/437589/how-do-i-unload-reload-a-module/487718#487718

# use instead of: from dfly_parser import parseMessages
#importOrReload("dfly_parser", "parseMessages")

def importOrReload(module_name, *names):
    """Import a module, or reload it if it is already loaded, then bind names from it.

    Parameters
    ----------
    module_name : str
        Name of the module to import or reload.
    *names : str
        Attributes of the module to copy into this module's global namespace,
        mimicking ``from module_name import name1, name2, ...``.
    """
    import sys

    module_cache = sys.modules
    if module_name not in module_cache:
        # Not loaded yet: perform a regular import.
        __import__(module_name, fromlist=names)
    else:
        # Already loaded: re-execute the module so edits on disk are picked up.
        reload(module_cache[module_name])

    target_namespace = globals()
    for attr in names:
        target_namespace[attr] = getattr(module_cache[module_name], attr)

In [7]:
def get_github_file(file_link, new_file_name, token):
    """Download a file from Github with curl, authenticating with a token.

    Parameters
    ----------
    file_link : str
        HTTPS URL of the file to fetch.
    new_file_name : str
        Destination path for the downloaded file; a leading ``~`` is expanded
        to the user's home directory.
    token : str
        Github token, inserted into the URL as ``https://<token>@...``.

    Raises
    ------
    RuntimeError
        If curl exits with a non-zero status (for example on an HTTP error).
    """
    auth_link = file_link.replace('https://', 'https://' + token + '@')
    target_path = os.path.expanduser(new_file_name)

    # Run curl from an argument list rather than a shell command string, so the
    # token and the paths are never interpreted by a shell. --fail makes curl
    # exit non-zero on HTTP errors instead of saving the error page as the file.
    result = subprocess.run(['curl', '--fail', '-s', '-o', target_path, auth_link])
    if result.returncode != 0:
        # The authenticated URL is kept out of the message on purpose so the
        # token does not end up in the notebook output.
        raise RuntimeError(
            'Failed to download {} (curl exit code {})'.format(new_file_name, result.returncode))
    print('Successfully downloaded {}'.format(new_file_name))

## Set parameters

In [8]:
## Module source: True downloads the helper modules from Github, False uses the local copies
github_flag = True

## Validation mode: set to True to replicate the results in the 4/9/2020 D4BL report
validation_flag = False

## Download files from Github (optional)

In [9]:
if github_flag:
    ## Ask for the Github token without echoing it to the notebook
    token = getpass("Provide your Github token: ")

    ## (remote URL, local file name) for each helper module to download
    downloads = [
        ('https://raw.githubusercontent.com/sydeaka/d4bl_covid_tracker/master/workflow/python/functions_case1_csv_xlsx.py',
         'functions_case1_csv_xlsx_gh.py'),
        ('https://raw.githubusercontent.com/sydeaka/d4bl_covid_tracker/master/workflow/python/misc_helper_functions.py',
         'misc_helper_functions_gh.py'),
    ]

    ## Download the files into home_dir
    for remote_url, local_name in downloads:
        get_github_file(file_link = remote_url,
                        new_file_name = os.path.join(home_dir, local_name),
                        token = token)

Provide your Github token: ········
Successfully downloaded /Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python/functions_case1_csv_xlsx_gh.py
Successfully downloaded /Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python/misc_helper_functions_gh.py


In [10]:
# List the current working directory to check which module files are present.
os.listdir()

['functions_case1_csv_xlsx.py',
 '.DS_Store',
 'misc_helper_functions.py',
 'output',
 'helper_functions',
 'case1_csv_xlsx',
 'case3_pdf_table',
 'case2_html',
 'data',
 'misc_helper_functions_gh.py',
 'functions_case1_csv_xlsx_gh.py']

In [None]:
#os.chdir(home_dir)
#! cat functions_case1_csv_xlsx_gh.py

## Load python scripts

In [11]:
# Extraction functions to expose in the notebook namespace.
functions_case1 = ["data_extract_massachusetts", "data_extract_virginia", "data_extract_washingtonDC", "data_extract_georgia"]

# Use the copy downloaded from Github ('_gh' suffix) or the local module.
module_case1 = "functions_case1_csv_xlsx_gh" if github_flag else "functions_case1_csv_xlsx"

# importOrReload accepts several names, so one call imports (or reloads) the
# module once and binds every function, instead of reloading it once per name.
importOrReload(module_case1, *functions_case1)

## Create empty dictionary to hold results

In [12]:
## Results container: one entry per location, filled in by the extraction cells
output = {}

# Data Extraction

## Case 1: Read from CSV and XLSX files

In [13]:
## Run the Massachusetts extraction and store its result under 'Massachusetts'
output.update({
    'Massachusetts': data_extract_massachusetts(validation = validation_flag, home_dir = home_dir),
})

output

Navigate to Massachusetts data folder
Get URLs on Massachusetts COVID-19 response reporting page
['/doc/covid-19-raw-data-may-8-2020/download']
Find the URL corresponding to the COVID-19 data file
https://www.mass.gov/doc/covid-19-raw-data-may-8-2020/download
Download the file
/Users/poisson/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python/data/mass/massachusetts.zip
file download success!
Unzip the file
Get the race/ethnicity breakdown
Get date of most recent data published
Get the data for only most recent data published (or validation date)
Total cases
Total deaths
AA cases
AA deaths

Success!



{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25}}

In [14]:
## Run the Virginia extraction and store its result under 'Virginia'
output.update({
    'Virginia': data_extract_virginia(validation = validation_flag, home_dir = home_dir),
})

output

Navigate to Virginia data folder
Download the CSV for race
file download success!
Read in the file
Get only the most recent data published
Roll up counts to race
Total cases
Total deaths
AA cases
AA deaths

Success!



{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/8/2020',
  'Total Cases': 22342,
  'Total Deaths': 812,
  'Pct Cases Black/AA': 17.15,
  'Pct Deaths Black/AA': 22.41}}

In [15]:
## Run the Washington, DC extraction and store its result under 'Washington DC'
output.update({
    'Washington DC': data_extract_washingtonDC(validation = validation_flag, home_dir = home_dir),
})

output

Navigate to Washington, DC data folder
Find links to all Washington, DC COVID data files
Find date strings in data files
Convert date strings to date
Find most recent date
Convert to date format expected in data file
Download the most recent data file
file download success!
Load the race/ethnicity breakdown of cases
Set column names
Get date of most recent data published
Get cases associated with desired timestamp (most recent or 4/9/2020 validation date)
Load the race/ethnicity breakdown of deaths
Set column names
Get deaths associated with desired timestamp (most recent or 4/9/2020 validation date)
Get report date, formatted for output
Total cases
Total deaths
AA cases
AA deaths

Success!



{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/8/2020',
  'Total Cases': 22342,
  'Total Deaths': 812,
  'Pct Cases Black/AA': 17.15,
  'Pct Deaths Black/AA': 22.41},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28}}

In [16]:
## Run the Georgia extraction and store its result under 'Georgia'
output.update({
    'Georgia': data_extract_georgia(validation = validation_flag, home_dir = home_dir),
})

output

Navigate to Georgia data folder
Download file
Read contents of the zip
Report date = last update of the demographics.csv file in the ZIP archive
Load demographics CSV
African American cases and deaths

Success!



{'Massachusetts': {'Location': 'Massachusetts',
  'Date Published': '5/8/2020',
  'Total Cases': 75333,
  'Total Deaths': 4702,
  'Pct Cases Black/AA': 8.18,
  'Pct Deaths Black/AA': 5.25},
 'Virginia': {'Location': 'Virginia',
  'Date Published': '5/8/2020',
  'Total Cases': 22342,
  'Total Deaths': 812,
  'Pct Cases Black/AA': 17.15,
  'Pct Deaths Black/AA': 22.41},
 'Washington DC': {'Location': 'Washington, DC',
  'Date Published': '5/8/2020',
  'Total Cases': 5899,
  'Total Deaths': 304,
  'Pct Cases Black/AA': 47.35,
  'Pct Deaths Black/AA': 79.28},
 'Georgia': {'Location': 'Georgia',
  'Date Published': '5/8/2020',
  'Total Cases': 32179,
  'Total Deaths': 1399,
  'Pct Cases Black/AA': 36.0,
  'Pct Deaths Black/AA': 49.61}}

# Combining into a single dataframe

In [17]:
# Build one row per location directly from the results dictionary.
# orient='index' turns each outer key into a row, so no transpose is needed;
# transposing a mixed-type frame would turn every column into dtype `object`,
# whereas this keeps the count and percentage columns numeric.
output_df = pd.DataFrame.from_dict(output, orient='index')
output_df

Unnamed: 0,Location,Date Published,Total Cases,Total Deaths,Pct Cases Black/AA,Pct Deaths Black/AA
Massachusetts,Massachusetts,5/8/2020,75333,4702,8.18,5.25
Virginia,Virginia,5/8/2020,22342,812,17.15,22.41
Washington DC,"Washington, DC",5/8/2020,5899,304,47.35,79.28
Georgia,Georgia,5/8/2020,32179,1399,36.0,49.61


In [18]:
# Stamp the output file with today's date (YYYY-MM-DD) and write the CSV
date_stamp = datetime.now().strftime('%Y-%m-%d')
out_file = os.path.join(
    home_dir, 'output', 'covid_disparities_output_{}.csv'.format(date_stamp))
output_df.to_csv(out_file)

In [19]:
# Write the same table as an Excel workbook, reusing the date stamp from the CSV export
out_file = os.path.join(
    home_dir, 'output', 'covid_disparities_output_{}.xlsx'.format(date_stamp))
output_df.to_excel(out_file)

In [20]:
# List the generated files without changing the working directory: earlier cells
# use paths relative to home_dir, so switching into 'output' here would break
# them if they were re-run. expanduser resolves a leading '~' in home_dir, which
# os.listdir does not do on its own.
sorted(os.listdir(os.path.join(os.path.expanduser(home_dir), 'output')))

covid_disparities_output_2020-05-08.csv
covid_disparities_output_2020-05-08.xlsx
