## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [38]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [39]:
import os
import pathlib

In [40]:
# Absolute path of the directory the notebook is running in.
this_dir = pathlib.Path.cwd()

In [41]:
data_dir = this_dir / "data"

In [42]:
import pytz
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import unicodedata
from datetime import datetime

## Download

Retrieve the page

In [43]:
url = "https://www.mendocinocounty.org/community/novel-coronavirus/covid-19-case-data"

In [44]:
# Fetch the county page. The timeout (seconds) keeps a stalled connection from
# hanging the run indefinitely, and raise_for_status() stops here on a 4xx/5xx
# response instead of letting an error page flow into the parser.
page = requests.get(url, timeout=60)
page.raise_for_status()

## Parse

In [45]:
soup = BeautifulSoup(page.content, "html.parser")

Get content well

In [46]:
# Locate the widget <div> that the rest of the notebook parses. find() returns
# None when the id is absent, so fail here with a clear message rather than
# with an AttributeError in a later cell.
content = soup.find("div", {"id": "widget_4_7164_4508"})
if content is None:
    raise ValueError(
        "Mendocino's content widget was not found; the page layout may have changed"
    )

Get timestamp

In [47]:
date_text = content.find("p", text=re.compile("Last Updated")).text

In [48]:
date = re.sub("Last Updated:", "", date_text)

In [49]:
# Separate the date from the time on the standalone word "at". The word
# boundaries stop the split from firing inside words that contain "at"
# (e.g. "Saturday"), which would truncate the date portion.
date = re.split(r"\bat\b", date)

In [50]:
date = re.split(" - ", date[0])

In [51]:
latest_date = pd.to_datetime(date[0]).date()

Get table

In [52]:
# Find the table that contains the "Region" header text. When several nodes
# match, the last one wins. Start from None so a page without a match cannot
# silently reuse a `table` left over from an earlier run of this cell.
table = None
for tag in content.find_all(text=re.compile("Region")):
    table = tag.findParent("table")
if table is None:
    raise ValueError("Mendocino's region table was not found on the page")

In [53]:
tbody = table.tbody

In [54]:
row_list = tbody.find_all("tr")[1:]

In [55]:
dict_list = []

In [56]:
def safetxt(element):
    """Return the element's text without zero-width spaces or surrounding whitespace.

    Zero-width spaces (U+200B) are removed before trimming: ``str.strip`` does
    not treat them as whitespace, so trimming first would leave any spaces that
    sit next to a leading or trailing U+200B.

    Args:
        element: Any object exposing a ``text`` string attribute.

    Returns:
        The cleaned string.
    """
    return element.text.replace("\u200b", "").strip()

In [57]:
def safenumber(element):
    """Return the element's cleaned text with commas and spaces removed.

    The value is returned as a string; no numeric conversion is performed.
    """
    digits = safetxt(element)
    for junk in (",", " "):
        digits = digits.replace(junk, "")
    return digits

In [58]:
# Build one record per table row: the first cell is the area name and the
# second is its confirmed case count.
for tr in row_list:
    tds = tr.find_all("td")
    dict_list.append(
        {
            "county": "Mendocino",
            "area": safetxt(tds[0]),
            "confirmed_cases": safenumber(tds[1]),
        }
    )

In [59]:
df = pd.DataFrame(dict_list)

In [60]:
df["county_date"] = latest_date

## Vet

In [61]:
# Fail the run when the scrape yields more than the five expected rows.
# An explicit raise gives a single clean traceback and is not stripped when
# Python runs with assertions disabled.
if len(df) > 5:
    raise AssertionError("Mendocino's area scraper has extra rows")

In [62]:
# Fail the run when the scrape yields fewer than the five expected rows.
# An explicit raise gives a single clean traceback and is not stripped when
# Python runs with assertions disabled.
if len(df) < 5:
    raise AssertionError("Mendocino's area scraper is missing rows")

## Export

Set date

In [63]:
tz = pytz.timezone("America/Los_Angeles")

In [64]:
today = datetime.now(tz).date()

In [65]:
slug = "mendocino"

In [66]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [67]:
# Collect every CSV in this county's data folder except the combined
# timeseries file, which is an output of this section.
csv_list = []
for csv_path in glob.glob(str(data_dir / slug / "*.csv")):
    if csv_path.endswith("timeseries.csv"):
        continue
    csv_list.append(csv_path)

In [68]:
# Read every snapshot CSV into a frame that carries a "date" column.
df_list = []
for csv_path in csv_list:
    csv_file = pathlib.Path(csv_path)
    # Test only the filename, so a parent directory that happens to contain
    # "manual" cannot send every file down the wrong branch.
    if "manual" in csv_file.name:
        # These files are read with their own existing "date" column.
        frame = pd.read_csv(csv_file, parse_dates=["date"])
    else:
        # Other files take their date from the filename ("<date>.csv").
        # Path.stem is separator-agnostic, unlike splitting on "/".
        frame = pd.read_csv(csv_file, parse_dates=["county_date"])
        # NOTE(review): this is a plain string, while the branch above parses
        # "date" as datetimes — confirm the mixed types sort as intended.
        frame["date"] = csv_file.stem
    df_list.append(frame)

In [69]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [70]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)

In [71]:
# NOTE(review): `env` is not imported or defined in any visible cell, and the
# saved output of this cell is a NameError. TODO: import the intended helper
# or remove this cell so the notebook survives Restart & Run All.
now = env.get_today()

NameError: name 'env' is not defined

In [None]:
# NOTE(review): `writer` is not imported or defined in any visible cell, and
# `now` is only assigned by the preceding cell, which currently fails with a
# NameError — this cell has never run (execution count is None). TODO confirm
# the intended helper or remove.
writer.to_csv(df, f"places-scrapers/mendocino/{now}.csv")