# Data 4 Black Lives - COVID-19 Case/Death Disparities

Objective: Extract COVID-19 case and death counts for each geographic location, both overall and for Black/African American residents only.

# Setup

## Install modules

## Import modules

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
from getpass import getpass
from importlib import reload 

## Set up directories

In [None]:
## On Google Colab
#home_dir = '/content'

## If on your laptop, set the path to the 'python' subfolder in your local copy of the repo
#home_dir_raw = '~/Documents/GitHub/d4bl_covid_tracker/workflow/python'
home_dir_raw = '~/Documents/GitHub/COVID19_tracker_data_extraction/workflow/python'

## Expand the leading '~' and make the result the working directory,
## so the relative paths used in later cells resolve inside it
home_dir = os.path.expanduser(home_dir_raw)
os.chdir(home_dir)

In [None]:
## Display the expanded working directory to confirm the path is right
home_dir

In [None]:
# Start every run from an empty `data` folder holding one subfolder per location
import shutil

if os.path.isdir('data'):
    print('removing existing data files')
    shutil.rmtree('data', ignore_errors=True)

# Subfolder names, one per location.
# NOTE(review): 'north carolina' contains a space while the extractor is named
# 'north_carolina' -- confirm which folder name the extractor expects before renaming.
locations = ['mass', 'virginia', 'dc', 'ga', 'michigan', 'minnesota',
             'north carolina', 'texas_bexar', 'wisconsin_milwaukee', 'san_diego', 'florida']

# rmtree above ignores errors, so `data` (or parts of it) may still exist;
# exist_ok keeps the cell re-runnable instead of raising FileExistsError.
os.makedirs('data', exist_ok=True)
for loc in locations:
    os.makedirs(os.path.join('data', loc), exist_ok=True)

# Show the folders that now exist
os.listdir('data')

In [None]:
## Make sure the output folder exists (left untouched when already present)
if not os.path.isdir('output'):
    os.mkdir('output')

In [None]:
!ls data

## Helper functions

In [None]:
## Source: https://stackoverflow.com/questions/437589/how-do-i-unload-reload-a-module/487718#487718

# Stand-in for `from dfly_parser import parseMessages` that also picks up
# edits made to a module that is already loaded:
#importOrReload("dfly_parser", "parseMessages")

def importOrReload(module_name, *names):
    """Import or reload a module and bind the requested names as globals.

    Parameters
    ----------
    module_name : str
        Name of the module to load.
    *names : str
        Attributes of the module to copy into this namespace's globals.

    A module already present in ``sys.modules`` is reloaded; otherwise it is
    imported for the first time.
    """
    import sys

    if module_name not in sys.modules:
        # First use: a plain import is enough
        __import__(module_name, fromlist=names)
    else:
        # Already loaded: re-execute it so source changes take effect
        reload(sys.modules[module_name])

    namespace = globals()
    for attr in names:
        namespace[attr] = getattr(sys.modules[module_name], attr)

In [None]:
def get_github_file(file_link, new_file_name, token):
    """Download one file from Github with curl, authenticating with a token.

    Parameters
    ----------
    file_link : str
        HTTPS URL of the file to download.
    new_file_name : str
        Local path the download is written to.
    token : str
        Github token; it is inserted into the URL right after ``https://``.

    Raises
    ------
    RuntimeError
        If curl cannot be started, or exits with a non-zero status
        (including HTTP errors, because of ``-f``).
    """
    import subprocess

    auth_link = file_link.replace('https://', 'https://' + token + '@')

    # Pass an argument list and skip the shell: paths containing spaces or
    # shell metacharacters are handed to curl verbatim.
    # -f makes curl fail on HTTP errors instead of saving the error page.
    cmd = ['curl', '-f', '-o', new_file_name, '-s', auth_link]
    try:
        result = subprocess.run(cmd)
    except OSError as err:
        # Typically curl is not installed / not on PATH
        raise RuntimeError('Could not run curl to download {}'.format(file_link)) from err

    if result.returncode != 0:
        # Report the link without the token so the secret never reaches the logs
        raise RuntimeError('Failed to download {} (curl exit code {})'.format(
            file_link, result.returncode))

    print('Successfully downloaded {}'.format(new_file_name))

In [None]:
def delete_file(file_name):
    """Remove ``file_name`` if it exists; do nothing when the path is absent."""
    if not os.path.exists(file_name):
        return
    os.remove(file_name)

## Set parameters

In [None]:
## Replicate results in 4/9/2020 D4BL report?
## (passed as `validation` to every data_extract_* call below)
validation_flag = False

## Download from Github (True) or use local modules (False)?
## (True also switches the imports below to the downloaded `*_gh` modules)
github_flag = False

## Download files from Github (optional)

In [None]:
if github_flag:
    ## Prompt for Github token
    token = getpass("Provide your Github token: ")

    ## Select Github account name and repository name
    acct_name = 'd4bl'
    repo_name = 'COVID19_tracker_data_extraction'

    ## Path to repository
    repo_path = 'https://raw.githubusercontent.com/{}/{}/master'.format(acct_name, repo_name)

    ## Download each module into home_dir; the `_gh` suffix keeps the
    ## downloaded copy separate from the local module of the same name
    for module_stem in ('functions_case1_csv_xlsx',
                        'functions_case2_html',
                        'functions_case3_pdf_table',
                        'misc_helper_functions'):
        get_github_file(
            file_link='{}/workflow/python/{}.py'.format(repo_path, module_stem),
            new_file_name=os.path.join(home_dir, module_stem + '_gh.py'),
            token=token,
        )


In [None]:
## List the current working directory (any downloaded `*_gh.py` files show up here)
os.listdir()

In [None]:
pwd

In [None]:
#os.chdir(home_dir)
#! cat functions_case1_csv_xlsx_gh.py

In [None]:
!ls

## Load python scripts

In [None]:
## Extractor function names, grouped by the module that defines them
case1_locations = ["massachusetts", "virginia", "washingtonDC", "georgia"]
case2_locations = ['michigan', 'minnesota', 'north_carolina', 'texas_bexar_county', 'wisconsin_milwaukee']
case3_locations = ['san_diego', 'florida']
functions_case1 = ['data_extract_' + loc for loc in case1_locations]
functions_case2 = ['data_extract_' + loc for loc in case2_locations]
functions_case3 = ['data_extract_' + loc for loc in case3_locations]

## Modules downloaded from Github carry a `_gh` suffix; local ones do not
module_suffix = '_gh' if github_flag else ''

## importOrReload accepts several names at once, so each module is imported
## (or reloaded) exactly once instead of once per function
importOrReload('functions_case1_csv_xlsx' + module_suffix, *functions_case1)
importOrReload('functions_case2_html' + module_suffix, *functions_case2)
importOrReload('functions_case3_pdf_table' + module_suffix, *functions_case3)

## Create empty dictionary to hold results

In [None]:
## Results keyed by location name; each extractor call below adds one entry
output = {}

# Data Extraction

## Case 1: Read from CSV and XLSX files

In [None]:
## Run the Massachusetts extractor and store its result under 'Massachusetts'
output['Massachusetts'] = data_extract_massachusetts(
    validation=validation_flag,
    home_dir=home_dir,
)

In [None]:
## Run the Virginia extractor and store its result under 'Virginia'
output['Virginia'] = data_extract_virginia(
    validation=validation_flag,
    home_dir=home_dir,
)

In [None]:
## Run the Washington DC extractor and store its result under 'Washington DC'
output['Washington DC'] = data_extract_washingtonDC(
    validation=validation_flag,
    home_dir=home_dir,
)

In [None]:
## Run the Georgia extractor and store its result under 'Georgia'
output['Georgia'] = data_extract_georgia(
    validation=validation_flag,
    home_dir=home_dir,
)

## Case 2: Extract data from HTML source code

In [None]:
## Run the Michigan extractor and store its result under 'Michigan'
output['Michigan'] = data_extract_michigan(
    validation=validation_flag,
    home_dir=home_dir,
)

In [None]:
## Run the Minnesota extractor and store its result under 'Minnesota'
output['Minnesota'] = data_extract_minnesota(
    validation=validation_flag,
    home_dir=home_dir,
)

In [None]:
## Run the North Carolina extractor and store its result under 'North Carolina'
output['North Carolina'] = data_extract_north_carolina(
    validation=validation_flag,
    home_dir=home_dir,
)

In [None]:
## Run the Bexar County (Texas) extractor and store its result under 'Texas -- Bexar County'
output['Texas -- Bexar County'] = data_extract_texas_bexar_county(
    validation=validation_flag,
    home_dir=home_dir,
)

In [None]:
## Run the Milwaukee (Wisconsin) extractor and store its result under 'Wisconsin -- Milwaukee'
output['Wisconsin -- Milwaukee'] = data_extract_wisconsin_milwaukee(
    validation=validation_flag,
    home_dir=home_dir,
)

## Case 3: Extract data from PDF tables

In [None]:
## Run the San Diego extractor and store its result under 'San Diego'
output['San Diego'] = data_extract_san_diego(
    validation=validation_flag,
    home_dir=home_dir,
)

### Note about Florida:

The Florida PDF is quite large and takes a long time to download. Once you have downloaded today's file, you can set `refresh = False` on subsequent runs to reuse the copy already saved locally.

In [None]:
## Run the Florida extractor and store its result under 'Florida'
## (`refresh` is passed as True here)
output['Florida'] = data_extract_florida(
    validation=validation_flag,
    home_dir=home_dir,
    refresh=True,
)

# Combining into a single dataframe

In [None]:
## Build the frame with one column per `output` key, then transpose it so
## that each location becomes a row
output_df = pd.DataFrame(output).transpose()
output_df

In [None]:
## Show the 'Status code' column as an array, one value per location
## (presumably filled in by the data_extract_* functions -- confirm in their modules)
output_df['Status code'].values

In [None]:
## Stamp today's date into the file name
date_stamp = datetime.now().strftime('%Y-%m-%d')

## Only `output` itself is created earlier in the notebook, so make sure the
## `csv` subfolder exists before writing into it
csv_dir = os.path.join(home_dir, 'output', 'csv')
os.makedirs(csv_dir, exist_ok=True)

out_file = os.path.join(csv_dir, 'covid_disparities_output_' + date_stamp + '.csv')
output_df.to_csv(out_file)

In [None]:
## Only `output` itself is created earlier in the notebook, so make sure the
## `xlsx` subfolder exists before writing into it
xlsx_dir = os.path.join(home_dir, 'output', 'xlsx')
os.makedirs(xlsx_dir, exist_ok=True)

## Reuses `date_stamp` from the CSV export cell
out_file = os.path.join(xlsx_dir, 'covid_disparities_output_' + date_stamp + '.xlsx')
output_df.to_excel(out_file)

In [None]:
## Move into the output folder so the listing below shows the exported files.
## NOTE: this changes the working directory for every cell that runs afterwards.
os.chdir(os.path.join(home_dir, 'output'))
!ls

## Clean up downloaded files (if applicable)

In [None]:
## The Github copies were saved into home_dir, but the working directory is
## `output` by the time this cell runs, so build absolute paths; bare file
## names would be looked up in the wrong folder and silently left in place.
for downloaded_name in ('functions_case1_csv_xlsx_gh.py',
                        'functions_case2_html_gh.py',
                        'functions_case3_pdf_table_gh.py',
                        'misc_helper_functions_gh.py'):
    delete_file(os.path.join(home_dir, downloaded_name))