## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pathlib

In [3]:
# Absolute path of the directory the notebook is executed from.
notebook_root = os.path.abspath("")
this_dir = pathlib.Path(notebook_root)

In [4]:
# Scraper outputs are written below <notebook dir>/data.
data_dir = this_dir.joinpath("data")

In [5]:
import re
import pytz
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, date, timedelta

## Download

In [6]:
# Published-to-web HTML view of the Google Sheet that is scraped below.
GOOGLE_DOC_URL = (
    "https://docs.google.com/spreadsheets/d/e/"
    "2PACX-1vRumJLxZ2OIN9TBV14uUbtdhxLDxddnDh0J3ASGUHMYvoS_dyal8OW2xOkbspt_hx3gj2tM5ByyVfTK"
    "/pubhtml?gid=2002140219&single=true"
)

In [7]:
# Title text of the zip-code table; the Parse section searches the page for
# this same text.
ZIPS = [
    "Case Count by Zip Code",
]

In [8]:
# Fetch the published sheet. The timeout keeps a stalled connection from
# hanging the notebook indefinitely, and raise_for_status() stops the run on
# an HTTP error instead of letting an error page be parsed further down.
response = requests.get(GOOGLE_DOC_URL, timeout=60)
response.raise_for_status()

## Parse

In [9]:
# Parse the downloaded page with Python's built-in HTML parser.
page_html = response.text
soup = BeautifulSoup(page_html, "html.parser")

In [10]:
# Collect every text node that mentions the zip-code table's title.
header_matches = soup(text=re.compile(r"Case Count by Zip Code"))
if not header_matches:
    # Without this guard `table_header` would stay unbound (or keep a stale
    # value from an earlier run in the same kernel) and the next cell would
    # fail with a confusing error.
    raise ValueError('Could not find the "Case Count by Zip Code" table')
# As before, the last match wins. Two levels up from the text node is
# presumably the enclosing <tr> (td -> tr) -- verify against the page.
table_header = header_matches[-1].parent.parent

In [11]:
# First <tr> after the title row. The row loop below starts from the siblings
# *after* this row, so this row itself is never read as data (presumably it
# holds the column labels -- verify against the sheet).
first_tr = table_header.find_next_sibling(name="tr")

In [12]:
# Accumulates one dict per scraped table row.
rows = list()

In [13]:
# Walk the rows that follow `first_tr`, reading city, zip code and case count
# from the 3rd, 4th and 5th cells of each row.
for sibling in first_tr.next_siblings:
    cells = sibling.find_all("td")
    city_cell = cells[2]
    rows.append(
        {
            "city": city_cell.text,
            "zip_code": cells[3].text,
            "confirmed_cases": cells[4].text,
        }
    )
    # A row with an empty city cell ends the scan. That row has already been
    # appended; blank rows are filtered out after the DataFrame is built.
    if not city_cell.contents:
        break

In [14]:
# Build the frame with a fixed column order taken from the scraped dict keys.
scraped_columns = ["city", "zip_code", "confirmed_cases"]
df = pd.DataFrame(data=rows, columns=scraped_columns)

Drop empty rows

In [15]:
# Keep only rows that have a case count. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
has_case_count = df["confirmed_cases"] != ""
df = df[has_case_count].copy()

In [16]:
# Label each row as "<zip code>: <city>".
zip_prefix = df["zip_code"] + ": "
df["area"] = zip_prefix + df["city"]

In [17]:
# Put a constant county column first. Note that insert() modifies `df` in
# place, so re-running this cell on the same frame fails because the column
# already exists.
df.insert(loc=0, column="county", value="Humboldt")

Get the date the county last updated the data (the sheet is updated weekly)

In [18]:
# Locate the cell holding the "last updated" label; the date itself is read
# from the cell next to it.
last_updated_label = "Weekly Data - Last Updated:"
county_date_adjacent = soup.find("td", text=last_updated_label)

In [19]:
# The element immediately after the label cell carries the date text.
date_cell = county_date_adjacent.next_sibling
raw_date = date_cell.text

In [20]:
# Parse the month/day/year string and stamp the same date on every row.
county_date = datetime.strptime(raw_date, "%m/%d/%Y")
df["county_date"] = county_date

Clean up

In [21]:
# Keep only the export columns, in export order, then rename zip_code -> zip.
export_columns = ["county", "area", "confirmed_cases", "county_date", "zip_code"]
df = df[export_columns]
df = df.rename(columns={"zip_code": "zip"})

## Vet

In [22]:
# Fail if the scrape produced more rows than expected. Together with the
# missing-rows check this pins the frame to exactly 35 rows.
# An explicit raise is used instead of `assert` because assert statements are
# stripped when Python runs with -O, which would silently disable the check.
if len(df) > 35:
    raise AssertionError("Humboldt County's scraper has extra rows")

In [23]:
# Fail if the scrape produced fewer rows than expected. Together with the
# extra-rows check this pins the frame to exactly 35 rows.
# An explicit raise is used instead of `assert` because assert statements are
# stripped when Python runs with -O, which would silently disable the check.
if len(df) < 35:
    raise AssertionError("Humboldt County's scraper is missing rows")

## Export

Set the date

In [24]:
# Time zone used to decide what "today" is when naming the export file.
local_zone_name = "America/Los_Angeles"
tz = pytz.timezone(local_zone_name)

In [25]:
# Current calendar date in the `tz` time zone.
now_local = datetime.now(tz=tz)
today = now_local.date()

In [26]:
# Name of this scraper's sub-directory under `data_dir`.
slug: str = "humboldt"

In [27]:
# Write today's snapshot to data/<slug>/<YYYY-MM-DD>.csv without the index.
daily_csv_path = data_dir / slug / f"{today}.csv"
df.to_csv(daily_csv_path, index=False)

## Combine

In [28]:
# Gather every CSV in this scraper's folder except the combined timeseries
# file, which is the output of this section.
csv_pattern = str(data_dir / slug / "*.csv")
csv_list = []
for csv_path in glob.glob(csv_pattern):
    if csv_path.endswith("timeseries.csv"):
        continue
    csv_list.append(csv_path)

In [29]:
# Load every CSV into one list of frames, each carrying a `date` column.
#   * "manual" files already contain a `date` column, which is parsed on read.
#   * Scraped files are named <date>.csv, so `date` is derived from the name.
# The derived date is converted to a timestamp so the column has one dtype
# across both kinds of file; mixing strings with parsed dates would make the
# later sort and CSV export inconsistent. A scraped file whose name is not a
# date now raises here instead of passing through as text.
df_list = []
for csv_path in csv_list:
    # pathlib handles both "/" and "\\" separators, and checking only the file
    # name avoids a false "manual" match on a parent directory.
    csv_file = pathlib.Path(csv_path)
    if "manual" in csv_file.name:
        frame = pd.read_csv(csv_path, parse_dates=["date"])
    else:
        frame = pd.read_csv(csv_path, parse_dates=["county_date"])
        frame["date"] = pd.to_datetime(csv_file.stem)
    df_list.append(frame)

In [30]:
# Stack all snapshots and order them by date, then by area label.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"])

In [31]:
# Write the combined history next to the daily snapshots, without the index.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)