## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [2]:
import csv
import glob
import os
import pathlib

import pytz

# Paths are anchored at the directory the notebook is executed from.
this_dir = pathlib.Path(os.getcwd())
data_dir = this_dir.joinpath("data")

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
from datetime import datetime

## Download

Retrieve the page

In [12]:
# Datawrapper share link for the chart; the page's og:url meta tag is read
# below to locate the published chart and its dataset.
url = "https://www.datawrapper.de/_/b9OvW/"

In [13]:
# Fetch the share page. The timeout keeps a stalled connection from hanging the
# run, and raise_for_status() stops here on an HTTP error instead of letting an
# error page flow into the parser.
page = requests.get(url, timeout=30)
page.raise_for_status()

### Parse with BeautifulSoup

In [14]:
# Parse the raw response bytes with the standard-library HTML parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [15]:
# First <meta> tag whose property attribute is "og:url" (None when absent).
meta = soup.find(name="meta", attrs={"property": "og:url"})

In [16]:
# Fail with a readable message when the tag (or its content) is missing,
# rather than with a TypeError/KeyError from subscripting.
if meta is None or not meta.get("content"):
    raise ValueError("Trinity County's page is missing the og:url meta tag")
dw_url = meta["content"]

### Parse Datawrapper

In [17]:
# Normalize the trailing slash so the join is valid whether or not og:url ends
# with one.
dataset_url = dw_url.rstrip("/") + "/dataset.csv"

In [18]:
# Download the chart's CSV. Bound the wait and stop on an HTTP error so an
# error body is never parsed as data.
dw_r = requests.get(dataset_url, timeout=30)
dw_r.raise_for_status()

In [19]:
# Response body decoded to text.
string = dw_r.text

In [20]:
# splitlines() accepts both "\r\n" and "\n" endings and does not leave an empty
# trailing element when the text ends with a newline.
parsed = string.splitlines()

In [21]:
# csv.reader honors quoted fields (e.g. a value such as "1,234"), which a plain
# split(",") would break apart. Lines that yield no fields are skipped.
data_list = [cell for cell in csv.reader(parsed) if cell]

In [22]:
# Show the parsed rows for a visual check; the header row comes first.
data_list

[['Place of residence (by region)', 'Number ', 'Percent'],
 ['North County ', '27', '4'],
 ['South County', '191', '25'],
 ['East County ', '482', '63'],
 ['West County ', '58', '8']]

In [40]:
# Build the frame from the parsed rows. Header names are stripped before
# renaming so the mapping still matches if the source adds or removes padding
# around a label (it currently publishes "Number " with a trailing space).
header = [name.strip() for name in data_list[0]]
df = (
    pd.DataFrame(data_list[1:], columns=header)
    .rename(
        columns={
            "Place of residence (by region)": "area",
            "Number": "confirmed_cases",
        }
    )
    .drop(columns=["Percent"])
)

# Cells arrive as strings. Convert counts to numbers, dropping thousands
# separators first; anything that still fails to parse becomes NaN so the null
# check in the Vet section reports it.
df["confirmed_cases"] = pd.to_numeric(
    df["confirmed_cases"].str.replace(",", "", regex=False), errors="coerce"
)

In [42]:
# Stamp rows with today's date in California time, the same zone the export
# step uses for the file name, so the two dates agree even when the kernel runs
# in another time zone (e.g. UTC).
df["county_date"] = datetime.now(pytz.timezone("America/Los_Angeles")).date()

In [43]:
# Put a constant county label in the first column.
df.insert(loc=0, column="county", value="Trinity")

## Vet

In [45]:
# No more than five rows are expected from the source table.
assert len(df) <= 5, "Trinity County's scraper has extra rows"

In [48]:
# At least four rows are expected from the source table.
assert len(df) >= 4, "Trinity County's scraper is missing rows"

In [49]:
# Every row must carry a case count.
assert (
    df["confirmed_cases"].notnull().all()
), "Trinity County's scraper has registered null case values"

## Export

Set date

In [50]:
# Time zone used to compute the export date.
tz = pytz.timezone("America/Los_Angeles")

In [51]:
# Current calendar date in the configured time zone.
today = datetime.now(tz=tz).date()

In [52]:
# Name of this county's folder under the data directory.
slug = "trinity"

In [53]:
# Write today's snapshot, creating the county folder first so a fresh checkout
# doesn't fail on a missing directory.
output_dir = data_dir / slug
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f"{today}.csv", index=False)

## Combine

In [54]:
# Every CSV in the county folder except the combined timeseries file.
snapshot_pattern = str(data_dir / slug / "*.csv")
csv_list = []
for found_path in glob.glob(snapshot_pattern):
    if found_path.endswith("timeseries.csv"):
        continue
    csv_list.append(found_path)

In [55]:
# Load every snapshot. Files with "manual" in their name are read with an
# existing "date" column; all other files take their date from the file name.
df_list = []
for csv_path in csv_list:
    csv_file = pathlib.Path(csv_path)
    # Match on the file name only, so a parent folder that happens to contain
    # "manual" cannot misclassify every file.
    if "manual" in csv_file.name:
        snapshot = pd.read_csv(csv_file, parse_dates=["date"])
    else:
        snapshot = pd.read_csv(csv_file, parse_dates=["county_date"])
        # Path.stem drops the folder and the extension on any OS, unlike
        # splitting on "/". The value is parsed to a timestamp so it sorts
        # alongside the parsed dates coming from manual files.
        snapshot["date"] = pd.to_datetime(csv_file.stem)
    df_list.append(snapshot)

In [58]:
# Stack all snapshots, order by date then area, and discard exact duplicate rows.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"]).drop_duplicates()

In [59]:
# Save the combined history next to the daily snapshots.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)