Code formatting with [black](https://pypi.org/project/nb-black/).

In [44]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


Import the standard-library modules used for file paths and globbing, followed by the third-party libraries used to fetch, parse and tabulate the page.

In [73]:
import os
import sys
import glob

In [74]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [47]:
# Absolute path of the "data" folder that sits in the current working directory.
data_dir = os.path.abspath("data")

Retrieve the page.

In [48]:
# State health department page that publishes the race/ethnicity tables.
url = (
    "https://www.cdph.ca.gov"
    "/Programs/CID/DCDC/Pages/COVID-19/Race-Ethnicity.aspx"
)

In [49]:
# A timeout keeps an unresponsive server from hanging the notebook forever, and
# raise_for_status() stops here on an HTTP error instead of letting a 4xx/5xx
# error page flow into the parsing cells below.
page = requests.get(url, timeout=30)
page.raise_for_status()

Parse it.

In [50]:
# Build the parse tree from the raw response bytes using Python's built-in parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

Focus in on the content well.

In [51]:
# The <div> with this id wraps the page body; every later lookup is scoped to it.
content = soup.find("div", id="s4-bodyContainer")

Get the timestamp.

In [52]:
# The <span> carrying the article date shown on the page.
date_container = content.find("span", class_="article-date-title")

In [53]:
# Parse the displayed date text and keep only the calendar date (no time part).
date_text = date_container.text.strip()
latest_date = pd.to_datetime(date_text).date()

Get tables.

In [54]:
# Every data table on the page carries this CSS class.
table_list = content.find_all("table", class_="ms-rteTable-4")

Verify there are eight of them.

In [55]:
# One table is expected per age slug defined below; report the actual count so a
# page-layout change is easy to diagnose.
assert (
    len(table_list) == 8
), f"Expected 8 tables on the page, found {len(table_list)}"

Read them in.

In [56]:
# Age-group label for each table, in the order the tables appear on the page.
table_slugs = "all,18+,0-17,18-34,35-49,50-64,65-79,80+".split(",")

In [57]:
# Column names for the six cells of each table row, left to right.
table_headers = (
    "race cases_total cases_pct deaths_total deaths_pct population_pct"
).split()

In [58]:
def safetxt(element):
    """Return an element's text without zero-width spaces or surrounding whitespace.

    Args:
        element: any object exposing a ``.text`` string attribute.

    Returns:
        The cleaned text as a ``str``.
    """
    # Remove zero-width spaces BEFORE stripping: str.strip() does not treat U+200B
    # as whitespace, so stripping first leaves behind any whitespace that sits
    # between a leading/trailing zero-width space and the real text.
    return element.text.replace("\u200b", "").strip()

In [59]:
def safenumber(element):
    """Return an element's cleaned text with commas and spaces removed.

    The text is first cleaned by ``safetxt``; the result is still a string,
    not a number.
    """
    drop_separators = str.maketrans("", "", ", ")
    return safetxt(element).translate(drop_separators)

In [60]:
def parse_table(soup):
    """Convert one race/ethnicity HTML table into a DataFrame.

    Args:
        soup: the parsed ``<table>`` element.

    Returns:
        A DataFrame with one row per data row of the table. Its columns are the
        names in the notebook-level ``table_headers`` (values kept as cleaned
        strings) followed by a ``date`` column set to the notebook-level
        ``latest_date``.

    Raises:
        IndexError: if a data row has fewer cells than ``table_headers``.
    """
    # The markup may not contain an explicit <tbody>; search the table itself then.
    body = soup.tbody if soup.tbody is not None else soup
    records = []
    # The first row is treated as the header row and skipped.
    for row in body.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) < len(table_headers):
            raise IndexError(
                f"Expected {len(table_headers)} cells in table row, found {len(cells)}"
            )
        # First cell is the race label; the remaining ones are numeric strings.
        label, *numbers = cells[: len(table_headers)]
        values = [safetxt(label)] + [safenumber(cell) for cell in numbers]
        records.append(dict(zip(table_headers, values)))
    # Passing columns keeps the schema stable even when a table has no data rows.
    df = pd.DataFrame(records, columns=table_headers)
    df["date"] = latest_date
    return df

In [61]:
# Parse each table and tag its rows with the matching age-group slug.
df_list = [
    parse_table(table).assign(age=slug)
    for slug, table in zip(table_slugs, table_list)
]

Combine them.

In [62]:
# Stack the per-age-group frames vertically into a single frame.
concat_df = pd.concat(df_list, axis=0)

Verify that all the values can be converted to floats.

In [63]:
# Sanity check only: the cast result is discarded and the frame is left as-is.
# Any failure is surfaced as an AssertionError, chained to the original error
# so the traceback shows the real cause.
try:
    concat_df.set_index(["date", "race", "age"]).astype(float)
except Exception as e:
    raise AssertionError(f"Scraped tables failed numeric validation: {e}") from e

Write them out.

In [64]:
# Create the output folder if needed so a fresh checkout does not fail on write.
totals_dir = os.path.join(data_dir, "totals")
os.makedirs(totals_dir, exist_ok=True)
concat_df.to_csv(
    os.path.join(
        totals_dir, f"race-ethnicity-{latest_date.strftime('%Y%m%d')}.csv"
    ),
    index=False,
)

In [78]:
# glob returns paths in a filesystem-dependent order; sorting them (file names end
# in YYYYMMDD) keeps the combined time series in a stable, chronological order
# from run to run. The previously written timeseries file itself is excluded.
totals_df = pd.concat(
    [
        pd.read_csv(i, parse_dates=["date"])
        for i in sorted(glob.glob(os.path.join(data_dir, "totals", "*.csv")))
        if not str(i).endswith("timeseries.csv")
    ]
)

In [80]:
# Save the combined history next to the daily files.
totals_timeseries_path = os.path.join(data_dir, "totals/timeseries.csv")
totals_df.to_csv(totals_timeseries_path, index=False)

In [65]:
# Collect the text of every <h4> heading inside the content well.
missing_text = [heading.text for heading in content.find_all("h4")]

In [66]:
# Drop headings whose text is empty.
missing_text = [text for text in missing_text if text]

In [67]:
def safetxt2(element):
    """Clean a heading string.

    Strips surrounding whitespace, then turns every non-breaking space and
    every asterisk into a plain space.

    Args:
        element: the raw heading text (a ``str``).

    Returns:
        The cleaned ``str``.
    """
    to_space = {0xA0: " ", ord("*"): " "}
    return element.strip().translate(to_space)

In [68]:
# Clean each heading, then drop the ones that end up blank: headings made only of
# whitespace or non-breaking spaces are non-empty before cleaning, so they would
# otherwise be written out as empty rows.
missing_text = [
    cleaned for cleaned in map(safetxt2, missing_text) if cleaned.strip()
]

In [69]:
# One row per heading; the single column gets pandas' default name.
mt_df = pd.DataFrame(data=missing_text)

In [70]:
# Stamp every row with the date scraped from the page.
mt_df = mt_df.assign(date=latest_date)

In [71]:
# Create the output folder if needed so a fresh checkout does not fail on write.
missing_dir = os.path.join(data_dir, "missing")
os.makedirs(missing_dir, exist_ok=True)
mt_df.to_csv(
    os.path.join(
        missing_dir,
        f"raceethnicity-missingtext-{latest_date.strftime('%Y%m%d')}.csv",
    ),
    index=False,
)

In [82]:
# glob returns paths in a filesystem-dependent order; sorting them (file names end
# in YYYYMMDD) keeps the combined time series in a stable, chronological order
# from run to run. The previously written timeseries file itself is excluded.
missing_df = pd.concat(
    [
        pd.read_csv(i, parse_dates=["date"])
        for i in sorted(glob.glob(os.path.join(data_dir, "missing", "*.csv")))
        if not str(i).endswith("timeseries.csv")
    ]
)

In [83]:
# Save the combined history next to the daily files.
missing_timeseries_path = os.path.join(data_dir, "missing/timeseries.csv")
missing_df.to_csv(missing_timeseries_path, index=False)