## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

Build paths to the notebook's working directory and its `data` subdirectory, where the scraped files are stored.

In [2]:
import os
import pathlib

In [3]:
# Absolute path of the directory the notebook is being run from.
this_dir = pathlib.Path(os.path.abspath(os.curdir))

In [4]:
# All CSV inputs and outputs live under the "data" folder inside `this_dir`.
data_dir = this_dir.joinpath("data")

In [5]:
import re
import json
import pytz
import glob
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

## Download

Retrieve the page

In [6]:
# Infogram page whose embedded `window.infographicData` script is parsed below.
url = "https://infogram.com/1pe66wmyjnmvkrhm66x9362kp3al60r57ex"

In [7]:
# Bound the request so a stalled connection cannot hang the notebook forever
# (requests applies no timeout unless one is given).
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
page.raise_for_status()

## Parse

In [8]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

Find script tag with the Infogram data

In [9]:
# `string=` is the current name of the deprecated `text=` filter, and the dot
# is escaped so the pattern matches the literal "window.infographicData" only.
matches = soup.find_all(string=re.compile(r"window\.infographicData"))
if not matches:
    # Without this guard a missing script would only surface later as a
    # NameError on `script`, far from the real cause.
    raise ValueError("Could not find window.infographicData in the page")
# If several text nodes match, use the last one.
script = matches[-1]

In [10]:
# Drop the JavaScript assignment wrapper so only the JSON literal remains.
# Whitespace is trimmed first so a newline after the closing ";" cannot leave
# the semicolon behind, and only the first occurrence of the prefix is removed
# so identical text inside the payload is left untouched.
stripped_script = script.strip().replace("window.infographicData=", "", 1)
stripped_script = stripped_script.rstrip(";").rstrip()

In [11]:
# Decode the remaining JSON text into Python objects.
data = json.loads(stripped_script)

In [12]:
# The chart is addressed by a hard-coded entity id; presumably this id changes
# if the chart is rebuilt on Infogram — verify when the lookup starts failing.
chart_entity = data["elements"]["content"]["content"]["entities"][
    "b26b9acd-b036-40bc-bbbe-68667dd338e4"
]
# First sheet of the chart's data.
cities = chart_entity["props"]["chartData"]["data"][0]

In [13]:
# Discard placeholder rows that consist of a single empty string.
not_empty = [row for row in cities if row != [""]]

Convert to dataframe

In [14]:
# One record per remaining chart row, with the two cells named explicitly.
df = pd.DataFrame(data=not_empty, columns=["area", "confirmed_cases"])

Get timestamp

In [15]:
# The payload's own "updatedAt" value is used as the data's date.
timestamp = data["updatedAt"]

In [16]:
# NOTE(review): the calendar date is taken from the parsed timestamp as-is,
# with no time zone conversion — confirm that is the intended date.
updated_at = pd.to_datetime(timestamp)
latest_date = updated_at.date()

In [17]:
# `DataFrame.insert` raises ValueError when the column already exists, so guard
# it to keep this cell safe to re-run without rebuilding `df` first.
if "county" not in df.columns:
    df.insert(0, "county", "Butte")

In [18]:
# Stamp every row with the date taken from the source's update timestamp.
df["county_date"] = latest_date

## Vet

In [19]:
# The table is expected to hold exactly six rows; more than that means the
# source changed and the scraper needs a manual look.
# An explicit raise is used instead of `assert` so the check still runs when
# Python is started with -O, and so the traceback shows a single error rather
# than a chained pair.
if len(df) > 6:
    raise AssertionError("Butte County's city scraper has additional rows")

AssertionError: Butte County's city scraper has additional rows

In [20]:
# The table is expected to hold exactly six rows; fewer than that means rows
# were lost between the source and the dataframe.
# An explicit raise is used instead of `assert` so the check still runs when
# Python is started with -O, and so the traceback shows a single error rather
# than a chained pair.
if len(df) < 6:
    raise AssertionError("Butte's city scraper is missing rows")

## Export

Mark the current date

In [21]:
# File names are dated in Pacific time rather than the machine's local zone.
tz = pytz.timezone("America/Los_Angeles")

In [22]:
# Calendar date "now" in the configured time zone.
today = datetime.now(tz=tz).date()

In [23]:
# Name of this scraper's subfolder under `data_dir`.
slug = "butte"

In [24]:
# One snapshot file per run, named after the run date.
daily_csv_path = data_dir / slug / f"{today}.csv"
df.to_csv(daily_csv_path, index=False)

## Combine

In [25]:
# Every CSV in this scraper's folder except the combined timeseries output,
# which must not be fed back into itself.
csv_list = [
    path
    for path in glob.glob(str(data_dir / slug / "*.csv"))
    if not path.endswith("timeseries.csv")
]

In [26]:
# Load every snapshot into its own frame. Separate loop names are used so the
# scraped `df` is not overwritten while reading files.
df_list = []
for csv_path in csv_list:
    csv_file = pathlib.Path(csv_path)
    # Decide from the file name alone, so a parent directory that happens to
    # contain "manual" cannot send every file down the wrong branch.
    if "manual" in csv_file.name:
        # These files already carry their own `date` column.
        frame = pd.read_csv(csv_file, parse_dates=["date"])
    else:
        frame = pd.read_csv(csv_file, parse_dates=["county_date"])
        # Scraped files are named "<date>.csv"; `stem` recovers that date on
        # any operating system (splitting on "/" breaks with Windows paths).
        # It is parsed so `date` has the same type in both branches; mixing
        # strings and timestamps would break the later sort on this column.
        frame["date"] = pd.to_datetime(csv_file.stem)
    df_list.append(frame)

In [27]:
# Stack all snapshots and order them chronologically, then by area.
df = pd.concat(objs=df_list).sort_values(by=["date", "area"])

In [28]:
# Write the combined history next to the per-day snapshot files.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)