Code formatting with [black](https://pypi.org/project/nb-black/).

Add our `utils` directory to Python's module search path (`sys.path`) so we can import Python files from sibling directories.

In [2]:
import os
import sys
import glob

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

In [4]:
# Output folder: the "data" directory under the current working directory
# (os.path.abspath("") resolves to the directory the notebook runs in).
data_dir = os.path.join(os.path.abspath(""), "data")

Retrieve the page

In [5]:
# Web page that is downloaded and scraped by the cells below.
url = "https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/COVID-19/Regional-ICU-Capacity.aspx"

In [6]:
# Bound the wait so an unresponsive server cannot hang the notebook, and stop
# on HTTP errors instead of parsing an error page as if it were the real one.
page = requests.get(url, timeout=30)
page.raise_for_status()

Parse it

In [7]:
# Parse the raw response body with Python's built-in "html.parser" backend.
soup = BeautifulSoup(page.content, "html.parser")

Focus in on the content well.

In [8]:
# Narrow the search to the div with id "s4-bodyContainer"; every later lookup
# (date and table) is made inside this element.
content = soup.find("div", {"id": "s4-bodyContainer"})

Get the timestamp

In [9]:
# The span whose text is parsed as the page's date in the next cell.
date_container = content.find("span", {"class": "article-date-title"})

In [10]:
# Parse the span's text as a timestamp and keep only the calendar date.
# This value stamps every scraped row and names the daily output file.
latest_date = pd.to_datetime(date_container.text.strip()).date()

In [11]:
latest_date

datetime.date(2021, 9, 6)

Get table

In [12]:
# First table with class "ms-rteTable-4" inside the content well
# (find() returns a single element, or None if nothing matches).
table = content.find("table", {"class": "ms-rteTable-4"})

Verify the table is there

In [13]:
# `find` returns a single Tag, or None when nothing matches, so check for
# presence directly. len() of a Tag counts its child nodes, not the number of
# tables found, so it is the wrong test here.
assert table is not None, "ICU capacity table not found on the page"

Read it in

In [14]:
# Column names for the scraped table, listed in the same order that
# parse_table reads the cells of each row. The trailing comments record the
# heading text each name stands for.
# NOTE(review): this list is not referenced by any other cell visible here.
table_headers = [
    "region",
    "pct_icu_beds_available",  # % of Staffed Adult ICU Beds Available
    "pct_covid_positive_patients",  # % COVID+ in Adult ICU Beds
    "consecutive_days_under_10_pct",  # Number of  Consecutive Days Under 10 %
    "health_order_effective_date",  # Date Health Order Effective
    "health_order_expiration_date",  # Date Health Order Set to Expire
]

In [15]:
def safetxt(element):
    """
    Return an element's text without zero-width spaces or surrounding whitespace.

    Args:
        element: any object exposing a string `.text` attribute
            (for example a BeautifulSoup tag).

    Returns:
        The cleaned string; internal whitespace is left untouched.
    """
    # Remove zero-width spaces BEFORE stripping: str.strip() does not treat
    # U+200B as whitespace, so stripping first would leave any spaces that sit
    # next to a leading or trailing zero-width space in the result.
    return element.text.replace("\u200b", "").strip()

In [16]:
def safenumber(element):
    """
    Return an element's cleaned text with every comma and space removed.

    The text is first cleaned by `safetxt`; the result is still a string,
    not a numeric type.
    """
    cleaned = safetxt(element)
    for unwanted in (",", " "):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [17]:
def parse_table(soup):
    """
    Convert the scraped HTML table into a DataFrame with one row per table row.

    Args:
        soup: element whose `tbody` holds the `tr` rows. The first `tr` is
            skipped (presumably the header row), and every remaining row must
            have at least six `td` cells.

    Returns:
        DataFrame with the six scraped columns as cleaned strings, plus a
        `date` column set to the notebook-level `latest_date`.
    """
    tbody = soup.tbody
    row_list = tbody.find_all("tr")[1:]
    dict_list = []
    for row in row_list:
        cell_list = row.find_all("td")
        d = dict(
            region=safetxt(cell_list[0]),
            pct_icu_beds_available=safenumber(cell_list[1]),
            pct_covid_positive_patients=safenumber(cell_list[2]),
            consecutive_days_under_10_pct=safenumber(cell_list[3]),
            # Dates are text, not numbers. Keep their internal spaces and
            # commas so that pd.to_datetime sees the value as published;
            # safenumber would glue the parts together.
            health_order_effective_date=safetxt(cell_list[4]),
            health_order_expiration_date=safetxt(cell_list[5]),
        )
        dict_list.append(d)
    df = pd.DataFrame(dict_list)
    # Stamp every row with the date read from the page.
    df["date"] = latest_date
    return df

In [18]:
df = parse_table(table)

Clean up missing values

In [19]:
# errors="coerce" turns any value that cannot be parsed as a date into NaT
# instead of raising.
df["health_order_effective_date"] = pd.to_datetime(
    df["health_order_effective_date"], errors="coerce"
)

In [20]:
# errors="coerce" turns any value that cannot be parsed as a date into NaT
# instead of raising.
df["health_order_expiration_date"] = pd.to_datetime(
    df["health_order_expiration_date"], errors="coerce"
)

In [21]:
# Cells that are exactly "--" (presumably the page's placeholder for "no
# value") become the string "0" so the column can later be cast to a number.
df["consecutive_days_under_10_pct"] = df["consecutive_days_under_10_pct"].replace(
    "--", "0"
)

In [22]:
df

Unnamed: 0,region,pct_icu_beds_available,pct_covid_positive_patients,consecutive_days_under_10_pct,health_order_effective_date,health_order_expiration_date,date
0,California Statewide,20.3,36.7,0,NaT,NaT,2021-09-06
1,Bay Area,24.6,31.4,0,NaT,NaT,2021-09-06
2,Greater Sacramento,14.8,43.2,0,NaT,NaT,2021-09-06
3,Northern California,19.7,54.7,0,NaT,NaT,2021-09-06
4,San Joaquin Valley,8.4,52.7,7,2021-09-03,2021-09-09,2021-09-06
5,Southern California,21.8,33.4,0,NaT,NaT,2021-09-06


Verify that all the values can be converted to floats

In [23]:
# Move the date and region columns into the index so that only the value
# columns remain, then attempt the float cast. The converted frame is thrown
# away; this cell is purely a check and fails with AssertionError.
try:
    df.set_index(
        ["date", "health_order_effective_date", "health_order_expiration_date", "region"]
    ).astype(float)
except Exception as err:
    raise AssertionError(err)

Write it out

In [24]:
# Write today's snapshot to data/daily/icu-capacity-<date>.csv.
daily_csv_path = os.path.join(data_dir, f"daily/icu-capacity-{latest_date}.csv")
# Create the output folder if needed so a fresh checkout can run end to end.
os.makedirs(os.path.dirname(daily_csv_path), exist_ok=True)
df.to_csv(daily_csv_path, index=False)

### Concatenate

Combine all scraped tables into one timeseries

In [25]:
# Collect every daily CSV. `path` is empty, so the pattern is resolved
# relative to the current working directory. glob returns matches in
# arbitrary order; the combined frame is sorted further down.
path = ""
files = glob.glob(os.path.join(path, "data/daily/*.csv"))

In [26]:
# Read each daily file into its own DataFrame, parsing the three date columns.
# This is a list rather than a generator: a generator is exhausted by the
# first pd.concat, so re-running the concat cell on its own would fail with
# "No objects to concatenate".
file_df = [
    pd.read_csv(
        f,
        low_memory=False,
        parse_dates=[
            "date",
            "health_order_effective_date",
            "health_order_expiration_date",
        ],
    )
    for f in files
]

In [27]:
# Stack all daily frames into one; ignore_index=True discards each file's own
# row numbers and assigns a fresh 0..n-1 index.
concat_df = pd.concat(
    file_df,
    ignore_index=True,
)

In [28]:
# Order rows by date, then by region within each date. sort_values returns a
# new frame, so concat_df is left unchanged.
sort_df = concat_df.sort_values(["date", "region"])

In [29]:
# Write the combined timeseries, creating the output folder first so the
# notebook also runs on a checkout where data/all does not exist yet.
all_csv_path = "data/all/all.csv"
os.makedirs(os.path.dirname(all_csv_path), exist_ok=True)
sort_df.to_csv(all_csv_path, index=False)