# Data For Black Lives COVID-19 Webscraping


<hr style="height:2pt">

We manually gathered data on states reporting COVID-19 cases and deaths by race. Below, we work on automatically scraping data from websites to update data daily.

In [1]:
## Commented code is just FORMATTING 
import requests
# from IPython.core.display import HTML
# styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
# HTML(styles)

In [2]:
# Import packages needed to run

import numpy as np
from bs4 import BeautifulSoup
import datetime

Download the webpage into a `requests` object with the following attributes: 

- `webpage.text`: attribute that is a string. We need this for input to BS
- `webpage.status_code`: The `status_code` attribute returns the HTTP status code, which tells you whether your request was successful (200), or not
- `webpage.content`: The `content` attribute gives you the raw HTML page


In [3]:
def url_to_soup(data_url):
    """
    Converts string into beautiful soup object for parsing
    
    Parameters
    ----------
    data_url: string
        website link
    
    Returns
    -------
    data_soup: Beautifulsoup object
        HMTL code from webpage
    """
    data_page = requests.get(data_url)
    if (data_page.status_code) == 200:
        print('request successful')
    else:
        print('request failed')

    # Create a Beautiful Soup object
    data_text = data_page.text
    data_soup = BeautifulSoup(data_text, "html.parser")

    # check to see a familiar HTML code
#     print(data_soup.prettify()[:])
    
    return data_soup

Create dictionary corresponds to a state and has the following data:
- `name`: the name of the state
- `Date Published`: date webpage updated or published
- `Total Cases`: total number of COVID-19 cases
- `Total Deaths`: total number of COVID-19 deaths
- `Pct Cases Black/AA`: percentage of cases that are Black/African American
- `Pct Deaths Black/AA`: percentage of deaths that are Black/African American

In [5]:
# # Georgia
GA_page = requests.get("https://dph.georgia.gov/covid-19-daily-status-report")
GA_soup = url_to_soup(GA_page)

InvalidURL: Failed to parse: <Response [200]>

In [6]:
# # Delaware
DE_page = requests.get("https://myhealthycommunity.dhss.delaware.gov/locations/state")
DE_soup = url_to_soup(DE_page)

InvalidURL: Failed to parse: <Response [200]>

In [7]:
# # Michigan
MI_page = requests.get("https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173---,00.html")
MI_soup = url_to_soup(MI_page)

InvalidURL: Failed to parse: <Response [200]>

In [125]:
# Minnesota Webscraping
MN_url = "https://www.health.state.mn.us/diseases/coronavirus/situation.html#raceeth1"
MN_soup = url_to_soup(MN_url)

# find date and total number of cases and deaths
counter = 0
num_cases = ''
num_deaths = ''
for strong_tag in MN_soup.find_all('strong'):
    this_string = strong_tag.text, strong_tag.next_sibling
    this_heading = strong_tag.text
    if counter == 0:
        date_text = strong_tag.text.strip('.')[11:]
    if this_heading == 'Total positive: ':
        num_cases = strong_tag.next_sibling
    if this_heading == 'Deaths: ':
        num_deaths = strong_tag.next_sibling
    counter += 1
    
date_time_obj = datetime.datetime.strptime(date_text, "%B %d, %Y")
date_formatted = date_time_obj.strftime("%m/%d/%Y")
print('Date:', date_formatted)
print('Number Cases:', num_cases)
print('Number Deaths:', num_deaths)

# find number of Black/AA cases and deaths
table = MN_soup.find("div", attrs={"id":"raceeth"})
counter = 0
pct_cases = ''
pct_deaths = ''
for th in table.find_all('th'):
    text = th.text
#     print(th.next_sibling)
    if text == "Black":
#         print(table.find_all('td'))
        pct_cases = table.find_all('td')[counter-2].text.strip('%')
        pct_deaths = table.find_all('td')[counter-1].text.strip('%')
    counter += 1

print('Pct Cases Black/AA:', pct_cases)
print('Pct Deaths Black/AA:', pct_deaths)

request successful
Date: 05/03/2020
Number Cases: 6,663
Number Deaths: 419
	
Pct Cases Black/AA: 15
Pct Deaths Black/AA: 4


In [210]:
# North Carollina
NC_url = "https://www.ncdhhs.gov/divisions/public-health/covid19/covid-19-nc-case-count#by-race-ethnicity"
NC_soup = url_to_soup(NC_url)

# find date and total number of cases and deaths
date_text = NC_soup.find("div", attrs={"class":"field-item"}).p.text[50:]
date_time_obj = datetime.datetime.strptime(date_text, "%B %d, %Y. ")
date_formatted = date_time_obj.strftime("%m/%d/%Y")


field_item = NC_soup.find("div", attrs={"class":"field-item"})
# num_cases = field_item.findAll("tr")[1].td.text
items = field_item.findAll("tr")[1]
num_cases = items.findAll("td")[1].text
num_deaths = items.findAll("td")[0].text

print('Date:', date_formatted)
print('Number Cases:', num_cases)
print('Number Deaths:', num_deaths)

# find number of Black/AA cases and deaths
tables = NC_soup.findAll("table")
race_data = tables[4]
num_race_cases = race_data.findAll("td")[6]
num_race_deaths = race_data.findAll("td")[8]
pct_cases = race_data.findAll("td")[22].text.strip('%')
pct_deaths = race_data.findAll("td")[24].text.strip('%')

print('Pct Cases Black/AA:', pct_cases)
print('Pct Deaths Black/AA:', pct_deaths)

request successful
Date: 05/04/2020
Number Cases: 430
Number Deaths: 11,848
Pct Cases Black/AA: 37
Pct Deaths Black/AA: 34


In [211]:
# Texas
TX_url = "https://cosagis.maps.arcgis.com/apps/opsdashboard/index.html#/d2c7584fe9fd4da1b30cb9d6cc311163"
TX_soup = url_to_soup(TX_url)

request successful


In [247]:
# TX_soup.find("div", attrs={"id":"ember522"})
# TX_soup.body.find("div", attrs={"id":"ember53"})
# TX_soup.findAll("body")

[<body class="claro">
 <script data-amd="true" src="https://js.arcgis.com/3.32/init.js"></script>
 <script data-amd-loading="true" src="assets/amd-loading-d8029d0343fa400ebae9865c42984750.js"></script>
 <div class="full-height flex-vertical flex-justify-center flex-align-items-center" id="initialLoadingContainer">
 <div class="loader is-active">
 <div class="loader-bars"></div>
 </div>
 </div>
 </body>]

In [250]:
#ember522 > svg > g.responsive-text-label > svg > text
TX_soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>ArcGIS Dashboards</title>
<meta content="" name="description"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="assets/images/favicon.ico?" rel="icon" type="image/x-icon"/>
<link href="https://js.arcgis.com/3.32/dijit/themes/claro/claro.css" rel="stylesheet" type="text/css"/>
<link href="https://js.arcgis.com/3.32/esri/css/esri.css" rel="stylesheet" type="text/css"/>
<link href="assets/vendor-ff6a5e0c0264e398e1ffaeb015926635.css" rel="stylesheet"/>
<link href="assets/app-light-7137f008b303d663c3645f07f162e89f.css" rel="stylesheet"/>
<script src="assets/amd-config-7e9801fc9c916a27bb75c6f356e09e0d.js"></script>
</head>
<body class="claro">
<script data-amd="true" src="https://js.arcgis.com/3.32/init.js"></script>
<script data-amd-loading="true" src="assets/amd-loading-d8029d0343fa400ebae9865c42984750.js"></script>
<div class="full-height f

In [259]:
# raw_data = TX_soup.find("body", attrs={"class":"claro"})
# raw_data.findAll('script')[1]

In [260]:
# Wisconsin (Milwaukee)
WI_url = "https://county.milwaukee.gov/EN/COVID-19"
WI_soup = url_to_soup(WI_url)

request successful


In [265]:
# WI_soup.findAll("div", attrs={"class":"embed-container"})
print(WI_soup.prettify()[:])

<!DOCTYPE doctype html>
<html class="Unknown 0_0 Unknown Unknown titanDisplay" id="htmlTag" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="initial-scale=1,maximum-scale=4.0,minimum-scale=1.0,user-scalable=0,width=device-width" name="viewport"/>
  <link href="//fonts.googleapis.com/css?family=Lato:400,400italic,700|Oswald:400,300,700" rel="stylesheet" type="text/css"/>
  <script src="//ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js">
  </script>
  <script src="/CommonScripts/NWS.js">
  </script>
  <script language="JavaScript" src="/CommonScripts/jquery.bxslider.js" type="text/javascript">
  </script>
  <script src="//code.jquery.com/ui/1.12.1/jquery-ui.js">
  </script>
  <link href="/CommonScripts/fancybox/jquery.fancybox.css?v=2.1.5" media="screen" rel="stylesheet" type="text/css">
   <script src="/CommonScripts/fancybox/jquery.fancybox.pack.js?v=2.1.5" type="text/javascript">
   </script>
   <script language="JavaScript" src="/CommonScripts/MilwaukeeCounty.