## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

Add our `utils` directory to the system's `$PATH` so we can import Python files from sibling directories.

In [2]:
import os
import pathlib

In [3]:
this_dir = pathlib.Path(os.path.abspath(""))

In [4]:
data_dir = this_dir / "data"

In [5]:
import pytz
import glob
import json
import urllib
import requests
import regex as re
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

## Download

Retrieve the page

In [6]:
url = "https://services6.arcgis.com/rNuo8nvF17v2dPFX/arcgis/rest/services/COVID19_DashboardData/FeatureServer/0/query"

In [7]:
params = dict(f="json", where="0=0", outFields="*")
session = requests.Session()
r = session.get(url, headers={"User-Agent": "Mozilla/5.0"}, params=params)

In [8]:
data = r.json()

## Parse

In [9]:
zips = data["features"]

KeyError: 'features'

In [10]:
dict_list = []

In [11]:
for item in zips:
    d = dict(item["attributes"])
    dict_list.append(d)

In [12]:
df = pd.DataFrame(dict_list)

In [13]:
df = df[df["Aggregate_Area"] == "No"]

In [14]:
df["Date_Reported"] = pd.to_datetime(df["Date_Reported"], unit="ms").dt.strftime(
    "%Y-%m-%d"
)

Eliminate unneeded rows

In [15]:
df = df[["Name", "Date_Reported", "Positive"]]

In [16]:
df.insert(0, "county", "Calaveras")

Rename fields to standardized column headers

In [17]:
df = df.rename(
    columns={
        "Name": "area",
        "Positive": "confirmed_cases",
        "Date_Reported": "county_date",
    }
)

In [18]:
df = df.astype({"confirmed_cases": int})

## Vet

In [19]:
default_cc_len = 13

In [20]:
try:
    assert not len(df) < default_cc_len
except AssertionError:
    raise AssertionError("Calaveras County scraper: latest entry is missing row(s)")

In [21]:
try:
    assert not len(df) > default_cc_len
except AssertionError:
    raise AssertionError(
        "Calaveras County scraper: latest entry has more area(s) than previously reported"
    )

## Export

In [22]:
tz = pytz.timezone("America/Los_Angeles")

In [23]:
today = datetime.now(tz).date()

In [24]:
slug = "calaveras"

In [25]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [26]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [27]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [28]:
df = pd.concat(df_list).sort_values(["date", "area"])

Fix for Calaveras zip code change 1/7/22

In [30]:
newZips = "95233, 95224, 95247, 95251, 95229"

In [33]:
oldZips = "95224, 95247, 95251, 95229"

In [37]:
df.loc[df.area == oldZips, "area"] = newZips

In [38]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)