## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
import regex as re
import urllib, json
from datetime import datetime
from bs4 import BeautifulSoup

## Download

Retrieve the page

In [4]:
url = "https://gis2.stancounty.com/arcgis/rest/services/COVID19_Cases_City_Zip_Layer/FeatureServer/0/query"

In [5]:
params = dict(f="json", where="0=0", outFields="*")
session = requests.Session()
r = session.get(url, headers={"User-Agent": "Mozilla/5.0"}, params=params)

In [6]:
data = r.json()

In [7]:
data

{'objectIdFieldName': 'OBJECTID',
 'globalIdFieldName': '',
 'geometryType': 'esriGeometryPoint',
 'spatialReference': {'wkid': 4326, 'latestWkid': 4326},
 'features': []}

## Parse

In [8]:
cities = data["features"]

In [9]:
dict_list = []

In [10]:
for item in cities:
    d = dict(item["attributes"])
    dict_list.append(d)

In [11]:
df = pd.DataFrame(dict_list)

Rename fields to standardized column headers

In [12]:
df = df.rename(columns={"city_district": "area", "confirmed": "confirmed_cases"})

Eliminate unneeded rows

In [13]:
df = df.filter(["area", "confirmed_cases"], axis=1).sort_values(
    by="area", ascending=True
)

KeyError: 'area'

Drop where confirmed cases is NaN

In [None]:
df = df.dropna(subset=["confirmed_cases"])

Convert `confirmed_cases` column to int from float

In [None]:
df = df.astype({"confirmed_cases": int})

Sum districts 1-5 into unincorporated / districts

In [None]:
total = df[
    df["area"].isin(
        ["District 1", "District 2", "District 3", "District 4", "District 5"]
    )
]

Append districts sum row to the dataframe

In [None]:
new_row = {
    "area": "Unincorporated districts",
    "confirmed_cases": total["confirmed_cases"].sum(),
}
df = df.append(new_row, ignore_index=True)

Delete District rows

In [None]:
df.drop(
    df[
        df["area"].isin(
            ["District 1", "District 2", "District 3", "District 4", "District 5"]
        )
    ].index,
    inplace=True,
)

In [None]:
df = df.reset_index(drop=True)

### Adding county column

In [None]:
df.insert(0, "county", "Stanislaus")

Get county timestamp (using separate feed to extract unix timestamp)

In [None]:
url_date = "https://gis2.stancounty.com/arcgis/rest/services/COVID19_Cases_City_Zip_Layer/FeatureServer/0/metadata"
r_date = requests.get(url_date)
data_date_bad = BeautifulSoup(r_date.text).find("creadate").text
data_date = datetime.strptime(data_date_bad, "%Y%m%d").strftime("%Y-%m-%d")
# dropping last three digits of unix timestamp, converting to python date
feed_date_obj = data_date

If separate feed or modified date unavailable, use now for date

In [None]:
field_date = data_date if data_date else now

In [None]:
df["county_date"] = pd.to_datetime(field_date).date()

## Vet

In [None]:
default_stan_len = 10

In [None]:
try:
    assert not len(df) < default_stan_len
except AssertionError:
    raise AssertionError(
        "Stanislaus County scraper: latest spreadsheet entry is missing row(s)"
    )

In [None]:
try:
    assert not len(df) > default_stan_len
except AssertionError:
    raise AssertionError(
        "Stanislaus County scraper: latest spreadsheet entry has more area(s) than previously reported"
    )

## Export

Set date

In [None]:
tz = pytz.timezone("America/Los_Angeles")

In [None]:
today = datetime.now(tz).date()

In [None]:
slug = "stanislaus"

In [None]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [None]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [None]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [None]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [None]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)