## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pathlib

In [3]:
this_dir = pathlib.Path(os.path.abspath(""))

In [4]:
data_dir = this_dir / "data"

In [5]:
import re
import pytz
import glob
import requests
import unicodedata
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

### Download / retrieve the page

In [6]:
url = "http://health.co.lake.ca.us/Coronavirus/COVID-19_Data.htm"

In [7]:
page = requests.get(url)

### Parse with BeautifulSoup

In [8]:
soup = BeautifulSoup(page.content, "html.parser")

### Get content well

In [9]:
content = soup.find("div", class_="content container")

### Extract timestamp from content

Finding h3 after h2 demographics head

In [10]:
prev_tag = content.find_all("h2")[3]

Find next h3

In [11]:
next_tag = prev_tag.find_next("p").text

In [12]:
date = next_tag.replace("Last Update: ", "").split(",")[0]

### Convert timestamp to python date obj

In [13]:
tz = pytz.timezone("America/Los_Angeles")

In [14]:
today = datetime.now(tz).date()

In [15]:
date = date + ", " + str(today.year)

In [16]:
latest_date = pd.to_datetime(date).date()

### Get table

In [17]:
table = content.find_all(text=re.compile("Zip Code"))[0].findParent("table")

IndexError: list index out of range

In [78]:
tbody = table.tbody

In [79]:
row_list = tbody.find_all("tr")

In [80]:
def safetxt(element):
    v = element.text.strip()
    v = v.replace("\u200b", "")
    return v

In [81]:
def safenumber(element):
    v = safetxt(element)
    v = v.replace(",", "")
    v = v.replace(" ", "")
    return v

In [82]:
dict_list = []
for row in row_list:
    zip_codes = row.find_all("td")[-2].text
    totals = row.find_all("td")[-1]
    # col_one = cell_list.select("td")[1].select("td")[0]
    # zip_codes = cell_list.find_all("td", text=re.compile("^\d{5}(?:[-\s]\d{4})?$"))
    d = dict(
        county="Lake",
        area=zip_codes,
        confirmed_cases=safetxt(totals),
    )
    dict_list.append(d)

In [83]:
df = pd.DataFrame(dict_list)

In [84]:
df.drop(df[df["area"] == "Zip Code"].index, inplace=True)

In [85]:
df["county_date"] = latest_date

### Vet

Lake County has 16 ZIP code areas, including "Unknown"

In [86]:
default_lake_len = 16

Replacing with two assertion tests

In [87]:
try:
    assert not len(df) > default_lake_len
except AssertionError:
    raise AssertionError(
        "Lake County scraper: spreadsheet has additional zip code(s) than previous"
    )

In [88]:
try:
    assert not len(df) < default_lake_len
except AssertionError:
    raise AssertionError("Lake County scraper: spreadsheet is missing row(s)")

### Export csv file

Write csv file timestamped with today's date

In [89]:
slug = "lake"

In [90]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [91]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [92]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [93]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [94]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)