# Data 4 Black Lives - COVID-19 Case/Death Disparities

Objective: Extract COVID-19 cases and deaths for each geographic location, both overall and for Black/African-Americans only.

Data sources for three locations (San Diego, California; Florida; and New York City) are provided as tables embedded in PDFs. Tools exist that can extract tables from PDFs; pinpointing the exact location of a table within a document can be a bit tricky, but it can certainly be done.

# Setup

## Install modules

In [1]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=6ef9ac9a3e07ae91fc3ea80e4764c560dccbb78016e00da7ecab8f51d88a6502
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


## Import modules

In [0]:
## Misc utilities
import os
import subprocess
import zipfile
from datetime import datetime, timedelta
import datetime  # keep AFTER the `from datetime import ...` line: the code below uses `datetime.datetime`
import numpy as np
import pandas as pd
import wget

## Read webpage
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

## Display pandas dataframe
from IPython.display import display, HTML

## Create subdirectories

In [0]:
## home directory: every data folder and the final CSV are created underneath it
## NOTE(review): '/content' looks like the Google Colab working directory — adjust when running elsewhere
home_dir = '/content'

In [0]:
# Create new data folders (one per location) underneath home_dir.
# exist_ok=True keeps this cell re-runnable, and anchoring on home_dir means
# the result does not depend on the current working directory.
for location_folder in ('mass', 'virginia', 'dc'):
    os.makedirs(os.path.join(home_dir, 'data', location_folder), exist_ok=True)

## Helper functions

In [0]:
# Source: https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup

def find_all_links(url, search_string=None):
    """Return the href of every anchor tag found on a web page.

    Args:
        url: Address of the page to fetch.
        search_string: Optional substring. When given (and non-empty), only
            hrefs containing it are returned.

    Returns:
        List of href strings, in document order.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: The request failed or timed out.
    """
    ## A timeout stops a stalled server from hanging the notebook forever
    resp = requests.get(url, timeout=60)
    ## Fail loudly instead of scraping links out of an error page
    resp.raise_for_status()

    ## Prefer the encoding declared inside the HTML, then the HTTP header charset
    content_type = resp.headers.get('content-type', '').lower()
    http_encoding = resp.encoding if 'charset' in content_type else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding

    ## Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    ## happens to be installed (and warns), so results can differ by environment
    soup = BeautifulSoup(resp.content, 'html.parser', from_encoding=encoding)

    link_list = [anchor['href'] for anchor in soup.find_all('a', href=True)]

    if search_string:
        return [href for href in link_list if search_string in href]
    return link_list

## Dictionary to hold results

In [0]:
## Per-location results, keyed by location name
output = {}

In [0]:
## Passed as the `validation` argument of the extract functions; when True, functions
## that support it use a fixed historical date instead of the latest published data
validation_flag = False

# Analyses

## Massachusetts

In [0]:
## Navigate to Massachusetts data folder
## (mass_dir is also where data_extract_massachusetts saves its download)
mass_dir = os.path.join(home_dir, 'data', 'mass')
os.chdir(mass_dir)

In [0]:
def data_extract_massachusetts(validation=False):
    """Download the Massachusetts raw-data archive and summarise cases/deaths by race.

    Args:
        validation: When truthy, use the rows dated 4/9/2020 instead of the
            most recent date in the file.

    Returns:
        dict with keys 'Location', 'Date Published', 'Total Cases',
        'Total Deaths', 'Pct Cases Black/AA' and 'Pct Deaths Black/AA'.
        If any step fails, the error is printed and the same keys are
        returned with an empty date and NaN values.
    """
    try:
        print('Get URLs on Massachusetts COVID-19 response reporting page')
        mass_urls = find_all_links(url='https://www.mass.gov/info-details/covid-19-response-reporting', search_string='covid-19-raw-data')

        print('Find the URL corresponding to the COVID-19 data file')
        mass_url_fragment = mass_urls[0].split('/')[2]
        mass_url = 'https://www.mass.gov/doc/{}/download'.format(mass_url_fragment)

        print('Download the file')
        ## Cumulative number of cases / deaths
        mass_file = os.path.join(mass_dir, 'massachusetts.zip')
        ## Argument list (no shell): the URL fragment is scraped from a web page.
        ## check=True raises on a failed download instead of reading a stale file.
        subprocess.run(['wget', '-O', mass_file, mass_url], check=True)

        print('Unzip the file')
        ## Extract next to the archive so the result does not depend on the working directory
        with zipfile.ZipFile(mass_file) as archive:
            archive.extractall(mass_dir)

        print('Get the race/ethnicity breakdown')
        df_mass_raw = pd.read_csv(os.path.join(mass_dir, 'RaceEthnicity.csv'))

        print('Get date of most recent data published')
        ## Compare parsed dates, not strings: as text, '4/9/2020' sorts after '4/10/2020'
        report_dates = pd.to_datetime(df_mass_raw['Date'])
        ## If desired (validation = True), verify that calculations as of D4BL's last refresh match these calculations
        if validation:
            target_date = pd.Timestamp('2020-04-09')
        else:
            target_date = report_dates.max()

        print('Get the data for only most recent data published (or validation date)')
        df_mass = df_mass_raw[report_dates == target_date]

        ## Report the date exactly as it is written in the source file
        if validation:
            mass_max_date = '4/9/2020'
        else:
            mass_max_date = str(df_mass['Date'].iloc[0])

        ##### Intermediate calculations #####

        print('total cases')
        mass_total_cases = df_mass['All Cases'].sum()

        print('total deaths')
        mass_total_deaths = df_mass['Deaths'].sum()

        df_mass_aa = df_mass[df_mass['Race/Ethnicity'] == 'Non-Hispanic Black/African American']

        print('AA cases')
        mass_aa_cases = df_mass_aa['All Cases'].tolist()[0]
        mass_aa_cases_pct = round(100 * mass_aa_cases / mass_total_cases, 2)

        print('AA deaths')
        mass_aa_deaths = df_mass_aa['Deaths'].tolist()[0]
        mass_aa_deaths_pct = round(100 * mass_aa_deaths / mass_total_deaths, 2)

        ## Printed before returning (it used to sit after the return and never ran)
        print('Success!')

        return {
            'Location': 'Massachusetts',
            'Date Published': mass_max_date,
            'Total Cases': mass_total_cases,
            'Total Deaths': mass_total_deaths,
            'Pct Cases Black/AA': mass_aa_cases_pct,
            'Pct Deaths Black/AA': mass_aa_deaths_pct
        }

    except Exception as inst:
        print('Execution error!')
        print(inst)

        return {
            'Location': 'Massachusetts',
            'Date Published': '',
            'Total Cases': np.nan,
            'Total Deaths': np.nan,
            'Pct Cases Black/AA': np.nan,
            'Pct Deaths Black/AA': np.nan
        }



In [23]:
## Store the Massachusetts summary under its own key; the trailing `output` displays the results so far
output['Massachusetts'] = data_extract_massachusetts(validation = validation_flag); output

Get URLs on Massachusetts COVID-19 response reporting page
Find the URL corresponding to the COVID-19 data file
Download the file
Unzip the file
Get the race/ethnicity breakdown
Get date of most recent data published
Get the data for only most recent data published (or validation date)
total cases
total deaths
AA cases
AA deaths


{'Massachusetts': {'Date Published': '5/6/2020',
  'Location': 'Massachusetts',
  'Pct Cases Black/AA': 7.33,
  'Pct Deaths Black/AA': 5.48,
  'Total Cases': 72025,
  'Total Deaths': 4420}}

## Virginia

In [0]:
## Navigate to Virginia data folder
virginia_dir = os.path.join(home_dir, 'data', 'virginia')
os.chdir(virginia_dir)

In [0]:
def data_extract_virginia(validation=False):
    """Download Virginia's cases-by-race CSV and summarise cases/deaths by race.

    Args:
        validation: Accepted for parity with the other extract functions; it
            has no effect here (see the note below).

    Returns:
        dict with keys 'Location', 'Date Published', 'Total Cases',
        'Total Deaths', 'Pct Cases Black/AA' and 'Pct Deaths Black/AA'.
        If any step fails, the error is printed and the same keys are
        returned with an empty date and NaN values.
    """
    ## No validation of 4/9/2020 available since data appear to be overwritten daily
    ## Thus, validation parameter setting has no effect
    try:
        ## Download the CSV for race
        va_url = 'https://www.vdh.virginia.gov/content/uploads/sites/182/2020/03/VDH-COVID-19-PublicUseDataset-Cases_By-Race.csv'
        va_file = os.path.join(virginia_dir, 'VDH-COVID-19-PublicUseDataset-Cases_By-Race.csv')
        ## -O overwrites the previous download; without it wget saves re-downloads as
        ## '<name>.1', '<name>.2', ... and the stale first copy would be read again.
        ## check=True raises on a failed download.
        ## NOTE(review): --no-check-certificate (kept from the original) disables TLS verification
        subprocess.run(['wget', '-q', '--no-check-certificate', '-O', va_file, va_url], check=True)

        ## Read in the file
        df_va_raw = pd.read_csv(va_file)

        ## Get only the most recent data published
        ## Compare parsed dates, not strings: as text, '4/9/2020' sorts after '4/10/2020'
        report_dates = pd.to_datetime(df_va_raw['Report Date'])
        df_va_latest = df_va_raw[report_dates == report_dates.max()]

        ## Report the date exactly as it is written in the source file
        va_max_date = str(df_va_latest['Report Date'].iloc[0])

        ## Roll up counts to race (count columns only, so text columns are not "summed")
        df_va = df_va_latest.groupby('Race')[['Number of Cases', 'Number of Deaths']].sum()

        ##### Intermediate calculations #####

        ## total cases
        va_total_cases = df_va['Number of Cases'].sum()

        ## total deaths
        va_total_deaths = df_va['Number of Deaths'].sum()

        ## AA cases
        va_aa_cases = df_va.loc['Black or African American', 'Number of Cases']
        va_aa_cases_pct = round(100 * va_aa_cases / va_total_cases, 2)

        ## AA deaths
        va_aa_deaths = df_va.loc['Black or African American', 'Number of Deaths']
        va_aa_deaths_pct = round(100 * va_aa_deaths / va_total_deaths, 2)

        print('Success!')

        return {
            'Location': 'Virginia',
            'Date Published': va_max_date,
            'Total Cases': va_total_cases,
            'Total Deaths': va_total_deaths,
            'Pct Cases Black/AA': va_aa_cases_pct,
            'Pct Deaths Black/AA': va_aa_deaths_pct
            }

    except Exception as inst:
        print('Execution error!')
        print(inst)

        ## np.nan, not pd.nan: pandas has no `nan` attribute, so the old fallback raised
        return {
            'Location': 'Virginia',
            'Date Published': '',
            'Total Cases': np.nan,
            'Total Deaths': np.nan,
            'Pct Cases Black/AA': np.nan,
            'Pct Deaths Black/AA': np.nan
        }

        

In [26]:
## Store the Virginia summary under its own key; the trailing `output` displays the results so far
output['Virginia'] = data_extract_virginia(validation = validation_flag); output

Success!


{'Massachusetts': {'Date Published': '5/6/2020',
  'Location': 'Massachusetts',
  'Pct Cases Black/AA': 7.33,
  'Pct Deaths Black/AA': 5.48,
  'Total Cases': 72025,
  'Total Deaths': 4420},
 'Virginia': {'Date Published': '5/5/2020',
  'Location': 'Virginia',
  'Pct Cases Black/AA': 17.74,
  'Pct Deaths Black/AA': 23.0,
  'Total Cases': 20256,
  'Total Deaths': 713}}

In [27]:
## Preview the results so far as a table: one row per location, one column per reported field
pd.DataFrame(output).T

Unnamed: 0,Location,Date Published,Total Cases,Total Deaths,Pct Cases Black/AA,Pct Deaths Black/AA
Massachusetts,Massachusetts,5/6/2020,72025,4420,7.33,5.48
Virginia,Virginia,5/5/2020,20256,713,17.74,23.0


## Washington DC

In [0]:
## Navigate to Washington, DC data folder
## (dc_dir is also where data_extract_washingtonDC saves its download)
dc_dir = os.path.join(home_dir, 'data', 'dc')
os.chdir(dc_dir)

In [0]:
def _read_dc_race_sheet(workbook, sheet_name, skiprows=None):
    """Load one by-race sheet and reshape it to one row per original column.

    The sheet is transposed, its first data row is dropped, and the values of
    the 'Unnamed: 0' row become the column labels. The former column headers
    end up in a column called 'index' (presumably the report dates — the
    caller compares that column against a timestamp).
    """
    sheet = pd.read_excel(workbook, sheet_name=sheet_name, skiprows=skiprows).T.drop(columns=[0])
    sheet.columns = sheet.loc['Unnamed: 0'].tolist()
    return sheet.drop(index=['Unnamed: 0']).reset_index()


def data_extract_washingtonDC(validation=False):
    """Download the newest Washington, DC workbook and summarise cases/deaths by race.

    Args:
        validation: When truthy, use the 2020-04-08 column of the workbook
            instead of the most recent one.

    Returns:
        dict with keys 'Location', 'Date Published', 'Total Cases',
        'Total Deaths', 'Pct Cases Black/AA' and 'Pct Deaths Black/AA'.
        'Date Published' is the day after the data date that was used.
        If any step fails, the error is printed and the same keys are
        returned with an empty date and NaN values.
    """
    try:
        ## Find every data workbook linked from the DC coronavirus data page
        prefix = 'https://coronavirus.dc.gov/sites/default/files/dc/sites/coronavirus/page_content/attachments/'
        dc_links_raw = find_all_links('https://coronavirus.dc.gov/page/coronavirus-data',
                    prefix + 'DC-COVID-19-Data')

        ## Only Excel workbooks can be read below (the sheets are addressed by name)
        dc_links = [x for x in dc_links_raw if '.xlsx' in x]

        ## Map each file's date (ISO text, so max() is chronological) to its real link
        dc_link_by_date = {}
        for link in dc_links:
            date_string = link.replace('forApril', 'for-April'). \
                               replace(prefix + 'DC-COVID-19-Data-for-', ''). \
                               replace('-updated', '').replace('.xlsx', '')
            try:
                iso_date = datetime.datetime.strptime(date_string, '%B-%d-%Y').strftime('%Y-%m-%d')
            except ValueError:
                ## A link with an unexpected name should not abort the whole extract
                continue
            ## When a date has several files, prefer the one marked '-updated'
            if iso_date not in dc_link_by_date or '-updated' in link:
                dc_link_by_date[iso_date] = link

        ## Download the file
        ## Cumulative number of cases / deaths
        ## Use the link exactly as published: rebuilding the URL from the date drops
        ## variants such as '-updated' or 'forApril' and can point at a missing file
        dc_url = dc_link_by_date[max(dc_link_by_date)]
        dc_file = os.path.join(dc_dir, 'dc_data.xlsx')
        ## check=True raises on a failed download instead of reading a stale file
        subprocess.run(['wget', '-O', dc_file, dc_url], check=True)

        ## Cases by race
        df_dc_cases_raw = _read_dc_race_sheet(dc_file, 'Total Cases by Race', skiprows=[0])

        ## Get date of most recent data published
        ## If desired (validation = True), verify that calculations as of D4BL's last refresh match these calculations
        if validation:
            max_case_ts = pd.Timestamp('2020-04-08 00:00:00')
        else:
            max_case_ts = max(df_dc_cases_raw['index'])

        df_dc_cases = df_dc_cases_raw[df_dc_cases_raw['index'] == max_case_ts]

        ## Deaths by race, for the same date as the cases
        df_dc_deaths_raw = _read_dc_race_sheet(dc_file, 'Lives Lost by Race')
        df_dc_deaths = df_dc_deaths_raw[df_dc_deaths_raw['index'] == max_case_ts]

        ## Published date is reported as the day after the data date.
        ## Built by hand because '%-m/%-d' only works with glibc's strftime.
        published = max_case_ts + timedelta(days=1)
        dc_max_date = '{}/{}/{}'.format(published.month, published.day, published.year)

        ##### Intermediate calculations #####

        ## total cases
        dc_total_cases = df_dc_cases['All'].astype('int').tolist()[0]

        ## total deaths
        dc_total_deaths = df_dc_deaths['All'].astype('int').tolist()[0]

        ## AA cases
        dc_aa_cases = df_dc_cases['Black/African American'].astype('int').tolist()[0]
        dc_aa_cases_pct = round(100 * dc_aa_cases / dc_total_cases, 2)

        ## AA deaths
        dc_aa_deaths = df_dc_deaths['Black/African American'].astype('int').tolist()[0]
        dc_aa_deaths_pct = round(100 * dc_aa_deaths / dc_total_deaths, 2)

        print('Success!')

        return {
            'Location': 'Washington, DC',
            'Date Published': dc_max_date,
            'Total Cases': dc_total_cases,
            'Total Deaths': dc_total_deaths,
            'Pct Cases Black/AA': dc_aa_cases_pct,
            'Pct Deaths Black/AA': dc_aa_deaths_pct
            }

    except Exception as inst:
        print('Execution error!')
        print(inst)

        return {
            'Location': 'Washington, DC',
            'Date Published': '',
            'Total Cases': np.nan,
            'Total Deaths': np.nan,
            'Pct Cases Black/AA': np.nan,
            'Pct Deaths Black/AA': np.nan
        }

        

In [31]:
## Pass the shared validation_flag, as the Massachusetts and Virginia calls do,
## so one switch controls every location
output['Washington DC'] = data_extract_washingtonDC(validation=validation_flag); output

Success!


{'Massachusetts': {'Date Published': '5/6/2020',
  'Location': 'Massachusetts',
  'Pct Cases Black/AA': 7.33,
  'Pct Deaths Black/AA': 5.48,
  'Total Cases': 72025,
  'Total Deaths': 4420},
 'Virginia': {'Date Published': '5/5/2020',
  'Location': 'Virginia',
  'Pct Cases Black/AA': 17.74,
  'Pct Deaths Black/AA': 23.0,
  'Total Cases': 20256,
  'Total Deaths': 713},
 'Washington DC': {'Date Published': '5/6/2020',
  'Location': 'Washington, DC',
  'Pct Cases Black/AA': 46.69,
  'Pct Deaths Black/AA': 79.42,
  'Total Cases': 5461,
  'Total Deaths': 277}}

In [33]:
## Collect every location's summary into one table (one row per location)
output_df = pd.DataFrame(output).T
output_df

Unnamed: 0,Location,Date Published,Total Cases,Total Deaths,Pct Cases Black/AA,Pct Deaths Black/AA
Massachusetts,Massachusetts,5/6/2020,72025,4420,7.33,5.48
Virginia,Virginia,5/5/2020,20256,713,17.74,23.0
Washington DC,"Washington, DC",5/6/2020,5461,277,46.69,79.42


In [0]:
## Save the summary table in the home directory, stamped with today's date
run_date = datetime.datetime.now().strftime('%Y-%m-%d')
out_file = os.path.join(home_dir, ''.join(['covid_disparities_output_', run_date, '.csv']))
output_df.to_csv(out_file)

In [36]:
## Return to the home directory and list its contents to confirm the CSV was written
os.chdir(home_dir)
!ls

covid_disparities_output_2020-05-07.csv  data  sample_data
