## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


Resolve the notebook's working directory and its `data` subdirectory so the cells below can read and write the project's CSV files.

In [139]:
import os
import pathlib

In [140]:
# Absolute path of the directory the notebook kernel is running in.
this_dir = pathlib.Path.cwd()

In [141]:
# All inputs and outputs of this notebook live under <working dir>/data.
data_dir = this_dir.joinpath("data")

In [142]:
import os
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

Retrieve the CDCR/CCHCS COVID-19 status page.

In [143]:
# CDCR/CCHCS COVID-19 status page that the rest of this notebook scrapes.
url = "https://www.cdcr.ca.gov/covid19/cdcr-cchcs-covid-19-status/"

In [144]:
# requests has no default timeout, so without one a stalled connection would
# hang the notebook indefinitely. 30 seconds is a generous upper bound for a
# single page fetch.
page = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response instead of parsing an error page further down.
page.raise_for_status()

Parse it.

In [145]:
# Build the parse tree from the raw response bytes with the stdlib HTML parser.
soup = BeautifulSoup(page.content, features="html.parser")

Focus in on the content well.

In [146]:
# First <div> carrying the "page-standard__content" class; None if absent.
content = soup.find("div", class_="page-standard__content")

### Get the timestamp

In [147]:
# Text of the first <p> whose string contains "Updated as of ".
# `string=` is the current name of the filter; the older `text=` spelling is
# deprecated in recent Beautiful Soup releases.
# NOTE: find() returns None when no paragraph matches, in which case `.text`
# raises AttributeError.
update_date_string = content.find("p", string=re.compile("Updated as of ")).text

In [148]:
# Remove the "Updated as of " label, then keep only what precedes the first
# " (" (the whole remainder is kept when there is no parenthetical).
update_date_string_trim = update_date_string.replace("Updated as of ", "").partition(
    " ("
)[0]

In [149]:
# Parse the trimmed text into a pandas timestamp and keep only its calendar date.
latest_date = pd.to_datetime(update_date_string_trim).date()

### Get table of employee cases

In [150]:
# First <figure> with the "wp-block-table" class inside the content well.
table_container = content.find("figure", class_="wp-block-table")

In [151]:
# Attribute access on a tag returns its first descendant with that tag name,
# the same element `find("table")` would return.
table = table_container.table

In [152]:
def safetxt(element):
    """Return the element's text without zero-width spaces or surrounding whitespace.

    Args:
        element: Any object exposing a ``text`` string attribute, such as a
            BeautifulSoup tag.

    Returns:
        str: The cleaned text.
    """
    # Remove zero-width spaces *before* stripping: str.strip() does not treat
    # U+200B as whitespace, so one sitting at either edge would otherwise
    # shield the real whitespace next to it from being trimmed.
    return element.text.replace("\u200b", "").strip()

In [153]:
def safenumber(element):
    """Return the element's cleaned text with every comma and space removed.

    The text is first cleaned by ``safetxt``. The result is still a string;
    no numeric conversion is performed.

    Args:
        element: Any object accepted by ``safetxt``.

    Returns:
        str: The cleaned text without "," or " " characters.
    """
    cleaned = safetxt(element)
    # Mapping a code point to None deletes it, so one translate pass removes
    # both characters.
    return cleaned.translate({ord(","): None, ord(" "): None})

In [154]:
def parse_table(soup, update_date=None):
    """Convert the employee-cases HTML table into a DataFrame.

    The first ``<tr>`` is treated as a header and skipped. Every remaining row
    must contain at least five ``<td>`` cells, read in order as institution
    name, confirmed cases, staff returned to work, active cases and new
    confirmed cases (14-day total); a shorter row raises ``IndexError``.

    Args:
        soup: The ``<table>`` tag to parse.
        update_date: Value written to the ``update_date`` column. Defaults to
            the notebook-level ``latest_date``.

    Returns:
        pandas.DataFrame: One row per table row. The count columns hold the
        cleaned strings returned by ``safenumber``, not numeric values.
    """
    # A <tbody> is not guaranteed to exist in the parsed tree; when it is
    # missing, search for rows directly under the table instead of failing on
    # None. (Tags are compared against None explicitly rather than by truthiness.)
    body = soup.tbody if soup.tbody is not None else soup
    count_columns = (
        "confirmed_cases",
        "staff_returned_to_work",
        "active_cases",
        "new_confirmed_cases_14_day_total",
    )
    records = []
    for row in body.find_all("tr")[1:]:
        cells = row.find_all("td")
        record = {"institution_name": safetxt(cells[0])}
        # Count columns occupy cell positions 1-4, in the order listed above.
        for position, column in enumerate(count_columns, start=1):
            record[column] = safenumber(cells[position])
        records.append(record)
    df = pd.DataFrame(records)
    df["update_date"] = latest_date if update_date is None else update_date
    return df

In [155]:
# Turn the scraped employee-cases table into a DataFrame; parse_table also
# stamps each row with the page's update date.
df = parse_table(table)

Read in the timeseries CSV and append the latest scrape, unless its update date is already recorded.

In [156]:
# Load the existing employee-cases timeseries from the data directory.
employee_timeseries = pd.read_csv(data_dir / "employees/employee-cases.csv")

In [157]:
# Work on a copy so the freshly scraped frame `df` is left untouched.
e_timeseries = df.copy()

In [158]:
# Stamp every row with the page's update date. parse_table already sets this
# column from the same `latest_date`, so this re-assigns the same value.
e_timeseries["update_date"] = latest_date

In [159]:
# Append the latest scrape only when its update date is not yet in the
# timeseries. The check must look at the stored timeseries: comparing the
# scrape with itself is always true, so nothing would ever be appended.
# Dates read back from the CSV are strings, so both sides are compared as text
# (assumes the CSV holds ISO dates as written by to_csv from date objects).
recorded_dates = set(employee_timeseries["update_date"].astype(str))
if str(latest_date) not in recorded_dates:
    employee_timeseries = pd.concat([employee_timeseries, e_timeseries])

### Get list of employee deaths

In [160]:
# First ordered list in the content well (attribute access is shorthand for
# find("ol")); None when the page has no <ol>.
deaths_ol = content.ol

In [161]:
# Collect the text of each list entry. Iterating the <ol> tag directly walks
# every child node, including whitespace text nodes between the <li> elements,
# so only the direct <li> children are requested here.
raw_deaths_list = [
    list_item.text for list_item in deaths_ol.find_all("li", recursive=False)
]

In [162]:
# Single-column frame with one row per entry of raw_deaths_list.
deaths_df = pd.DataFrame(raw_deaths_list)

## Export

Employee cases

In [163]:
# Write the timeseries back to the same CSV it was read from, without the
# DataFrame index.
employee_timeseries.to_csv(data_dir / "employees/employee-cases.csv", index=False)

Employee death list 

In [164]:
# Write the deaths list to CSV without the DataFrame index.
deaths_df.to_csv(data_dir / "employees/employee-deaths.csv", index=False)