## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
from datetime import datetime

## Download

Retrieve the page

Link to the infogram dashboard retrieved 12/10

In [4]:
# url = "https://e.infogram.com/_/1TTEOGw8qO13uPts9wb4?parent_url=https%3A%2F%2Fwww.trinitycounty.org%2FCOVID-19&src=embed#async_embed"
# url = "https://infogram.com/1pg2x02vqkjy26c95d9p1rle6juwpe7ynnp"
url = "https://e.infogram.com/_/wsionIftGzbdz73rhj6f?parent_url=https%3A%2F%2Fwww.trinitycounty.org%2FCOVID-19&src=embed#async_embed"

In [5]:
page = requests.get(url)

## Parse

In [6]:
soup = BeautifulSoup(page.content, "html.parser")

Find all script tags

In [7]:
scripts = soup.find_all("script")

In [8]:
len(scripts)

3

Find the one that starts with `window.infographicData` (it was the sixth one at last check, but will loop over to check all)

In [9]:
# data = scripts[6]
jsondata = ""
for script in scripts:
    if script.string is not None:
        if re.match("window.infographicData=", script.string):
            jsondata = script.string
            break

Convert to python dict

Take out var definition and ending character

In [10]:
data = json.loads(re.sub("window.infographicData=", "", jsondata.string).rstrip(";"))

AttributeError: 'str' object has no attribute 'string'

Get localities

In [19]:
cities = data["elements"]["content"]["content"]["entities"][
    "d4de15cb-01c6-4b10-8b28-a225f606413f"
]["props"]["chartData"]["data"][0]

In [32]:
not_empty = [ele for ele in cities if ele[0] != ""]
not_empty

[['East County', '573'],
 ['North County', '33'],
 ['South County', '224'],
 ['West County', '85'],
 ['Under Investigation', '']]

In [33]:
df = pd.DataFrame(not_empty, columns=["area", "confirmed_cases"])

Get timestamp

In [34]:
timestamp = data["updatedAt"]

In [35]:
timestamp

'2021-10-15T15:30:26.000Z'

In [36]:
latest_date = pd.to_datetime(timestamp).date()

In [37]:
df["county_date"] = latest_date

In [38]:
df.insert(0, "county", "Trinity")

In [39]:
for index, row in df.iterrows():
    if row["confirmed_cases"] == "":
        df.loc[index, "confirmed_cases"] = 0

## Vet

In [41]:
try:
    assert not len(df) > 5
except AssertionError:
    raise AssertionError("Trinity County's scraper has extra rows")

In [42]:
try:
    assert not len(df) < 5
except AssertionError:
    raise AssertionError("Trinity County's scraper is missing rows")

In [43]:
try:
    assert df.confirmed_cases.isnull().sum() == 0
except AssertionError:
    raise AssertionError("Trinity County's scraper has registered null case values")

## Export

Set date

In [44]:
tz = pytz.timezone("America/Los_Angeles")

In [45]:
today = datetime.now(tz).date()

In [46]:
slug = "trinity"

In [47]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [48]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [49]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [50]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [51]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)