# Data For Black Lives COVID-19 Webscraping


<hr style="height:2pt">

We manually gathered data on states reporting COVID-19 cases and deaths by race. Below, we work on automatically scraping data from websites to update data daily.

In [None]:
# Import packages needed to run

import datetime
import email.utils as eut
import re
import zipfile
from io import BytesIO
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Download the webpage into a `requests.Response` object (called `webpage` below), which has the following attributes:

- `webpage.text`: the response body decoded into a string. This is what we pass to BeautifulSoup
- `webpage.status_code`: the HTTP status code, which tells you whether your request was successful (200) or not
- `webpage.content`: the raw, undecoded bytes of the response body


In [None]:
def url_to_soup(data_url):
    """
    Downloads a webpage and converts it into a beautiful soup object for parsing

    Parameters
    ----------
    data_url: string
        website link

    Returns
    -------
    data_soup: Beautifulsoup object
        HTML code from webpage. The response body is parsed even when the
        status code is not 200, so on a failed request this is the soup of
        whatever body the server sent back.
    """
    data_page = requests.get(data_url)
    if data_page.status_code == 200:
        print('request successful')
    else:
        # Say which URL failed and with what status so the problem can be
        # traced when several pages are scraped in one run.
        print('request failed for', data_url,
              '- status code', data_page.status_code)

    # Create a Beautiful Soup object from the decoded response text
    data_soup = BeautifulSoup(data_page.text, "html.parser")

    return data_soup

In [None]:
def get_json(url):
    """Fetch ``url`` and return the response body parsed as JSON.

    Raises requests.RequestException when the request itself fails or the
    server answers with an HTTP error status, and ValueError when the body
    cannot be parsed as JSON.
    """
    response = requests.get(url)
    # Turn HTTP error statuses into an exception before trying to parse
    response.raise_for_status()
    payload = response.json()
    return payload

In [None]:
def get_metadata_date(metadata_url):
    """Return the last-edit date from an ESRI web service's field metadata.

    The JSON document at ``metadata_url`` is fetched with get_json and its
    ['editingInfo']['lastEditDate'] value is read. That value is divided by
    1000 (it is treated as milliseconds) and converted to a datetime.date.
    Exceptions raised by get_json propagate; the conversion itself can
    raise OverflowError.
    """
    editing_info = get_json(metadata_url)['editingInfo']
    last_edit_seconds = editing_info['lastEditDate'] / 1000
    # date.fromtimestamp interprets the timestamp in the machine's local time
    return datetime.date.fromtimestamp(last_edit_seconds)

Each dictionary we create corresponds to a state and has the following data:
- `name`: the name of the state
- `Date Published`: date webpage updated or published
- `Total Cases`: total number of COVID-19 cases
- `Total Deaths`: total number of COVID-19 deaths
- `Pct Cases Black/AA`: percentage of cases that are Black/African American
- `Pct Deaths Black/AA`: percentage of deaths that are Black/African American

# Georgia

In [None]:
r = requests.get('https://ga-covid19.ondemand.sas.com/docs/ga_covid_data.zip')
# Fail fast on an HTTP error status instead of handing an error page to
# zipfile in a later cell (get_json performs the same check).
r.raise_for_status()
# Since we are downloading a ZIP file whose CSVs are not date-tagged,
# we might try to use the HTTP Date header as an approximation
r.headers['Date']

In [None]:
# HTTP Date headers use the email date format, so email.utils.parsedate
# can break the header value into a time tuple for us
date_header = r.headers['Date']
http_date_tuple = eut.parsedate(date_header)
http_date_tuple

In [None]:
# The first three elements of the tuple are passed to datetime.date
# as year, month and day
http_year, http_month, http_day = http_date_tuple[:3]
http_date = datetime.date(http_year, http_month, http_day)

In [None]:
# Wrap the downloaded bytes in an in-memory buffer so zipfile can read the
# archive without writing it to disk
zip_buffer = BytesIO(r.content)
z = zipfile.ZipFile(zip_buffer)

In [None]:
# Another date could be the last update of the demographics.csv file in the ZIP archive:
# getinfo() returns the archive's ZipInfo record for that member, and its
# date_time attribute is a (year, month, day, hour, minute, second) tuple.
info = z.getinfo('demographics.csv')
info.date_time

In [None]:
# The archive member's timestamp looks better, so use its
# year, month and day as the publication date
zip_year, zip_month, zip_day = info.date_time[:3]
zip_date = datetime.date(zip_year, zip_month, zip_day)

In [None]:
# Read the demographics table straight out of the in-memory ZIP archive
with z.open('demographics.csv') as demographics_file:
    data = pd.read_csv(demographics_file)

# Sum cases and deaths per race category, then across all categories
ga_columns = ['race', 'Confirmed_Cases', 'Deaths']
by_race = data[ga_columns].groupby('race').sum()
totals = by_race.sum(axis=0)

ga_total_cases = totals['Confirmed_Cases']
ga_total_deaths = totals['Deaths']
ga_black_cases = by_race.loc['AFRICAN-AMERICAN', 'Confirmed_Cases']
ga_black_deaths = by_race.loc['AFRICAN-AMERICAN', 'Deaths']

# The 'Pct' entries are stored as fractions of the totals (count / total)
GA = {
    'name': 'Georgia',
    'Date Published': zip_date,
    'Total Cases': ga_total_cases,
    'Total Deaths': ga_total_deaths,
    'Pct Cases Black/AA': ga_black_cases / ga_total_cases,
    'Pct Deaths Black/AA': ga_black_deaths / ga_total_deaths,
}
GA

# Delaware

In [None]:
# Download and parse the Delaware state page with url_to_soup.
# NOTE(review): DE_soup is not used anywhere else in the visible part of this
# file, so no Delaware figures are extracted from it yet.
DE_url = "https://myhealthycommunity.dhss.delaware.gov/locations/state"
DE_soup = url_to_soup(DE_url)

In [None]:
# # Michigan
def _mi_fraction(cell_text, total):
    """Return a 'Cases by Race' cell as a fraction of ``total``.

    A cell ending in '%' is read as a percentage and divided by 100; any
    other cell is read as a raw count and divided by ``total``.
    """
    cell_text = cell_text.strip()
    if cell_text.endswith('%'):
        return float(cell_text.rstrip('% ')) / 100
    return int(cell_text.replace(',', '')) / total


MI = {
    'name': 'Michigan',
}
MI_url = "https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173---,00.html"
MI_soup = url_to_soup(MI_url)
tables = MI_soup.find_all('table')

# Filled in by the loop below; left as None when the table is not on the page
total_cases = None
total_deaths = None
aa_cases_text = None
aa_deaths_text = None
for table in tables:
    caption = table.find('caption')
    # Skip tables that have no caption, or whose caption is not plain text
    caption_text = caption.string if caption is not None else None
    if caption_text is None:
        continue
    if 'Confirmed COVID-19 Case' in caption_text:
        # The caption carries the update date as month/day/year
        m = re.search(r'updated (\d+)/(\d+)/(\d+)', caption_text)
        if m is None:
            raise ValueError('No update date found in caption: ' + caption_text)
        mon, day, year = map(int, m.groups())
        MI['Date Published'] = str(datetime.date(year, mon, day))
        # The totals are read from the last row of the table body
        trs = table.find('tbody').find_all('tr')
        tds = trs[-1].find_all('td')
        total_cases = int(tds[1].string.replace(',', ''))
        total_deaths = int(tds[2].string.replace(',', ''))
    elif caption_text == 'Cases by Race':
        for tr in table.find('tbody').find_all('tr'):
            tds = tr.find_all('td')
            if tds and tds[0].string == 'Black or African American':
                aa_cases_text = tds[1].string
                aa_deaths_text = tds[2].string

if total_cases is None:
    raise ValueError('Michigan totals table not found')
if aa_cases_text is None or aa_deaths_text is None:
    raise ValueError('Michigan "Cases by Race" row for Black or African American not found')

MI['Total Cases'] = total_cases
MI['Total Deaths'] = total_deaths
# The race cells were previously stripped of '%' and then divided by the
# total counts, which mixes a percentage with a count. Store them as
# fractions instead, matching the count / total values used for the other
# locations in this notebook.
MI['Pct Cases Black/AA'] = _mi_fraction(aa_cases_text, total_cases)
MI['Pct Deaths Black/AA'] = _mi_fraction(aa_deaths_text, total_deaths)
MI

# Minnesota

In [None]:
MN_url = "https://www.health.state.mn.us/diseases/coronavirus/situation.html#raceeth1"
MN_soup = url_to_soup(MN_url)

# find date and total number of cases and deaths
date_text = None
num_cases = ''
num_deaths = ''
for position, strong_tag in enumerate(MN_soup.find_all('strong')):
    this_heading = strong_tag.text
    if position == 0:
        # The first <strong> tag is taken to hold the update date: strip the
        # surrounding periods and skip the first 11 characters of the text
        date_text = this_heading.strip('.')[11:]
    if this_heading == 'Total positive: ':
        num_cases = strong_tag.next_sibling
    if this_heading == 'Deaths: ':
        num_deaths = strong_tag.next_sibling

if date_text is None:
    raise ValueError('No <strong> tags found on the Minnesota page')

date_time_obj = datetime.datetime.strptime(date_text, "%B %d, %Y")
date_formatted = date_time_obj.strftime("%m/%d/%Y")
print('Date:', date_formatted)
print('Number Cases:', num_cases)
print('Number Deaths:', num_deaths)

# find number of Black/AA cases and deaths
table = MN_soup.find("div", attrs={"id":"raceeth"})
if table is None:
    raise ValueError('No element with id "raceeth" found on the Minnesota page')
pct_cases = ''
pct_deaths = ''
# Collect the data cells once instead of searching again for every header
race_cells = table.find_all('td')
for th_index, th in enumerate(table.find_all('th')):
    if th.text == "Black":
        # NOTE(review): the cells are located by offsetting the header's
        # position among all <th> tags, which depends on the table layout
        # at the time of writing - confirm against the live page.
        pct_cases = race_cells[th_index - 2].text.strip('%')
        pct_deaths = race_cells[th_index - 1].text.strip('%')

print('Pct Cases Black/AA:', pct_cases)
print('Pct Deaths Black/AA:', pct_deaths)

# North Carolina

In [None]:
NC_url = "https://www.ncdhhs.gov/divisions/public-health/covid19/covid-19-nc-case-count#by-race-ethnicity"
NC_soup = url_to_soup(NC_url)

# find date and total number of cases and deaths
# The first "field-item" div holds both the date paragraph and the summary
# table, so look it up once and reuse it.
field_item = NC_soup.find("div", attrs={"class":"field-item"})
# The date text starts 50 characters into the first paragraph
date_text = field_item.p.text[50:]
date_time_obj = datetime.datetime.strptime(date_text, "%B %d, %Y. ")
date_formatted = date_time_obj.strftime("%m/%d/%Y")

# The second table row holds the summary figures
items = field_item.find_all("tr")[1]
summary_cells = items.find_all("td")
# NOTE(review): cases are read from the second cell and deaths from the
# first; an earlier version of this code read cases from the first cell.
# Confirm the column order against the live page.
num_cases = summary_cells[1].text
num_deaths = summary_cells[0].text

print('Date:', date_formatted)
print('Number Cases:', num_cases)
print('Number Deaths:', num_deaths)

# find number of Black/AA cases and deaths
tables = NC_soup.find_all("table")
# The race breakdown is taken from the fifth table on the page; all cell
# positions below are fixed offsets into that table's <td> list.
race_data = tables[4]
race_cells = race_data.find_all("td")
num_race_cases = race_cells[6]
num_race_deaths = race_cells[8]
pct_cases = race_cells[22].text.strip('%')
pct_deaths = race_cells[24].text.strip('%')

print('Pct Cases Black/AA:', pct_cases)
print('Pct Deaths Black/AA:', pct_deaths)

# Texas - Bexar County

In [None]:
TX_Bexar = {
    'name': 'Texas - Bexar County',
}
try:
    # Start by fetching the metadata to get the likely timestamp
    md_date = get_metadata_date('https://services.arcgis.com/g1fRTDLeMgspWrYp/arcgis/rest/services/vRaceEthnicity/FeatureServer/0?f=json')
    TX_Bexar['Date Published'] = str(md_date)

    # Next get the cumulative case and death counts for the day the data was
    # last edited. The query used to be pinned to the 2020-05-07 window, so
    # later runs kept dividing current race counts by stale totals.
    # NOTE(review): the 05:00:00 - 04:59:59 boundaries are kept from the
    # original query (presumably local midnight expressed in UTC) - confirm.
    window_start = md_date.isoformat() + ' 05:00:00'
    window_end = (md_date + datetime.timedelta(days=1)).isoformat() + ' 04:59:59'
    where_clause = quote(
        "Date BETWEEN timestamp '" + window_start + "' AND timestamp '" + window_end + "'",
        safe='',
    )
    total = get_json(
        'https://services.arcgis.com/g1fRTDLeMgspWrYp/arcgis/rest/services/vDateCOVID19_Tracker_Public/FeatureServer/0/query?f=json&where='
        + where_clause
        + '&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&resultOffset=0&resultRecordCount=50&resultType=standard&cacheHint=true'
    )
    if not total.get('features'):
        raise ValueError('No cumulative totals found for ' + str(md_date))
    totals_row = total['features'][0]['attributes']
    TX_Bexar['Total Cases'] = totals_row['ReportedCum']
    TX_Bexar['Total Deaths'] = totals_row['DeathsCum']

    # And finally the race/ethnicity breakdowns
    data = get_json('https://services.arcgis.com/g1fRTDLeMgspWrYp/arcgis/rest/services/vRaceEthnicity/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&resultOffset=0&resultRecordCount=20&resultType=standard&cacheHint=true')
    for feature in data.get('features', []):
        if feature['attributes']['RaceEthnicity'] == 'Black':
            TX_Bexar['Pct Cases Black/AA'] = feature['attributes']['CasesConfirmed'] / TX_Bexar['Total Cases']
            TX_Bexar['Pct Deaths Black/AA'] = feature['attributes']['Deaths'] / TX_Bexar['Total Deaths']
            break
    if 'Pct Cases Black/AA' not in TX_Bexar:
        raise ValueError('No data found for Black RaceEthnicity category')

except OverflowError as e:
    print("Error processing last update timstamp for TX_Bexar")
except ValueError as e:
    print("Error processing data for TX_Bexar", e)
except (KeyError, IndexError, TypeError, ZeroDivisionError) as e:
    # Missing or null fields in the service response, or a zero total
    print("Error processing data for TX_Bexar", repr(e))
except requests.RequestException as e:
    # e.request can be None, so do not assume it has a url attribute
    print("Error retrieving URL for TX_Bexar:", getattr(e.request, 'url', None))
TX_Bexar

# WI - Milwaukee

In [None]:
WI_Milwaukee = {
    'name': 'Wisconsin - Milwaukee',
}
try:
    # Get the timestamp; cases and deaths live in separate services, so
    # compare their last-edit dates and report any mismatch
    cases_date = get_metadata_date('https://services5.arcgis.com/8Q02ELWlq5TYUASS/arcgis/rest/services/Cases_View/FeatureServer/0?f=json')
    deaths_date = get_metadata_date('https://services5.arcgis.com/8Q02ELWlq5TYUASS/arcgis/rest/services/Deaths_View1/FeatureServer/0?f=json')
    if cases_date != deaths_date:
        print('Unexpected mismath between cases and deaths metadata dates:', cases_date, '!=', deaths_date)
    WI_Milwaukee['Date Published'] = str(cases_date)

    # Totals: each query asks the service for a count of ObjectId named "value"
    cases_total = get_json('https://services5.arcgis.com/8Q02ELWlq5TYUASS/arcgis/rest/services/Cases_View/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&outStatistics=%5B%7B%22statisticType%22%3A%22count%22%2C%22onStatisticField%22%3A%22ObjectId%22%2C%22outStatisticFieldName%22%3A%22value%22%7D%5D&resultType=standard&cacheHint=true')
    WI_Milwaukee['Total Cases'] = cases_total['features'][0]['attributes']['value']
    deaths_total = get_json('https://services5.arcgis.com/8Q02ELWlq5TYUASS/arcgis/rest/services/Deaths_View1/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&outStatistics=%5B%7B%22statisticType%22%3A%22count%22%2C%22onStatisticField%22%3A%22ObjectId%22%2C%22outStatisticFieldName%22%3A%22value%22%7D%5D&resultType=standard&cacheHint=true')
    WI_Milwaukee['Total Deaths'] = deaths_total['features'][0]['attributes']['value']

    # Breakdowns: the same count grouped by the Race_Eth field
    cases_by_race = get_json('https://services5.arcgis.com/8Q02ELWlq5TYUASS/arcgis/rest/services/Cases_View/FeatureServer/0/query?f=json&where=Race_Eth%20NOT%20LIKE%20%27%25%23N%2FA%27&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&groupByFieldsForStatistics=Race_Eth&orderByFields=value%20desc&outStatistics=%5B%7B%22statisticType%22%3A%22count%22%2C%22onStatisticField%22%3A%22ObjectId%22%2C%22outStatisticFieldName%22%3A%22value%22%7D%5D&resultType=standard&cacheHint=true')
    for feature in cases_by_race['features']:
        if feature['attributes']['Race_Eth'] == 'Black Alone':
            WI_Milwaukee['Pct Cases Black/AA'] = feature['attributes']['value'] / WI_Milwaukee['Total Cases']
            break

    deaths_by_race = get_json('https://services5.arcgis.com/8Q02ELWlq5TYUASS/arcgis/rest/services/Deaths_View1/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&groupByFieldsForStatistics=Race_Eth&orderByFields=value%20desc&outStatistics=%5B%7B%22statisticType%22%3A%22count%22%2C%22onStatisticField%22%3A%22ObjectId%22%2C%22outStatisticFieldName%22%3A%22value%22%7D%5D&resultType=standard&cacheHint=true')
    for feature in deaths_by_race['features']:
        if feature['attributes']['Race_Eth'] == 'Black Alone':
            WI_Milwaukee['Pct Deaths Black/AA'] = feature['attributes']['value'] / WI_Milwaukee['Total Deaths']
            break

    # Report a missing 'Black Alone' row instead of silently leaving the
    # percentage out of the result. The check runs last so that everything
    # that was found is still stored in WI_Milwaukee.
    missing = [
        key for key in ('Pct Cases Black/AA', 'Pct Deaths Black/AA')
        if key not in WI_Milwaukee
    ]
    if missing:
        raise ValueError('No data found for Black Alone Race_Eth category: ' + ', '.join(missing))
except OverflowError as e:
    print("Error processing last update timstamp for WI_Milwaukee")
except ValueError as e:
    print("Error processing data for WI_Milwaukee", e)
except (KeyError, IndexError, TypeError, ZeroDivisionError) as e:
    # Missing or null fields in the service response, or a zero total
    print("Error processing data for WI_Milwaukee", repr(e))
except requests.RequestException as e:
    # e.request can be None, so do not assume it has a url attribute
    print("Error retrieving URL for WI_Milwaukee:", getattr(e.request, 'url', None))
WI_Milwaukee