## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [1]:
import os
import pytz
import glob
import pathlib

# Absolute path of the directory the notebook runs from, and its data folder
this_dir = pathlib.Path(os.path.abspath(os.curdir))
data_dir = this_dir.joinpath("data")

In [2]:
import requests
import pandas as pd
import json
from datetime import datetime
import regex as re

ModuleNotFoundError: No module named 'regex'

## Download

Retrieve the city-level records from the ArcGIS feature service

In [60]:
# Query endpoint of the City_Demographics_Summary feature layer: every row
# (where=1=1), every field (outFields=*), returned as pretty-printed JSON.
url = (
    "https://services2.arcgis.com/SCn6czzcqKAFwdGU/ArcGIS/rest/services/"
    "City_Demographics_Summary/FeatureServer/0/query"
    "?where=1%3D1&objectIds=&time=&resultType=none&outFields=*"
    "&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false"
    "&returnDistinctValues=false&cacheHint=false&orderByFields="
    "&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset="
    "&resultRecordCount=&sqlFormat=none&f=pjson&token="
)

In [61]:
# Fetch the feature records. The timeout keeps an unresponsive server from
# hanging the run, and raise_for_status() surfaces an HTTP error here instead
# of as a confusing JSON or KeyError failure in a later cell.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [62]:
# Decode the JSON response body into Python objects
data = r.json()

## Parse

In [63]:
# Empty container for the per-area records
dict_list = list()

In [64]:
# Build one record per feature returned by the service. Assigning the whole
# list in a single expression (rather than appending to a list created in a
# different cell) keeps this cell idempotent: re-running it cannot duplicate
# rows.
dict_list = [
    dict(
        county="Solano",
        area=item["attributes"]["city"],
        confirmed_cases=item["attributes"]["cases_number"],
    )
    for item in data["features"]
]

In [65]:
# Pin the column names so that an empty feed still yields a frame with the
# expected schema; the row-count checks in the Vet section can then report the
# problem instead of a column lookup failing first.
df = pd.DataFrame(dict_list, columns=["county", "area", "confirmed_cases"])

Get timestamp

In [66]:
# Layer metadata endpoint; its editingInfo block is read below for the
# last-edit time of the data.
date_url = "https://services2.arcgis.com/SCn6czzcqKAFwdGU/ArcGIS/rest/services/City_Demographics_Summary/FeatureServer/0/?f=json"
# The timeout prevents an indefinite hang; raise_for_status() stops on HTTP
# errors before the body is parsed.
date_r = requests.get(date_url, timeout=30)
date_r.raise_for_status()
date_data = date_r.json()

In [67]:
# Pull the layer's lastEditDate value out of the service metadata
timestamp = date_data["editingInfo"]["lastEditDate"]

In [68]:
# Dividing by 1000 converts the millisecond value into the seconds that
# fromtimestamp() expects. The explicit Pacific timezone makes the resulting
# calendar date independent of the clock settings of the machine running the
# notebook, and matches the timezone used for the export date.
timestamp = datetime.fromtimestamp(
    timestamp / 1000, tz=pytz.timezone("America/Los_Angeles")
)

In [69]:
# Keep only the calendar date of the last edit
latest_date = pd.Timestamp(timestamp).date()

In [70]:
# Stamp every row with the date the county last edited the layer
df["county_date"] = latest_date

In [71]:
# Missing case counts are treated as zero
df["confirmed_cases"] = df["confirmed_cases"].fillna(value=0)

In [72]:
# Shorten the county's label for the unincorporated area
is_unincorporated = df["area"] == "County Unincorporated"
df.loc[is_unincorporated, "area"] = "Unincorporated"

Get the latest data

In [74]:
# Keep only the rows carrying the most recent county_date
newest_date = df["county_date"].max()
latest_df = df[df["county_date"] == newest_date]

Clean up city names for export

In [75]:
# Swap underscores for spaces wherever they appear in string values
export_df = latest_df.replace(to_replace="_", value=" ", regex=True)

In [76]:
# Case counts are exported as whole numbers
export_df = export_df.assign(
    confirmed_cases=export_df["confirmed_cases"].astype(int)
)

In [77]:
# Discard rows that have no area name
export_df = export_df.dropna(subset=["area"])

In [78]:
# One row per area per date; the first occurrence wins
dedupe_keys = ["county_date", "area"]
export_df = export_df.drop_duplicates(subset=dedupe_keys)

## Vet

In [80]:
# The export is expected to hold exactly 8 areas; stop if the feed returned
# more. The surplus in the message is measured against the same threshold as
# the check, and an explicit raise is used so the check still runs when
# assertions are disabled.
if len(export_df) > 8:
    raise AssertionError(
        f"Solano County's scraper has extra {len(export_df) - 8} rows: {list(export_df.area)}"
    )

In [81]:
# The export is expected to hold exactly 8 areas; stop if the feed returned
# fewer. The shortfall in the message is measured against the same threshold
# as the check, and an explicit raise is used so the check still runs when
# assertions are disabled.
if len(export_df) < 8:
    raise AssertionError(
        f"Solano County's scraper is missing {8 - len(export_df)} rows: {list(export_df.area)}"
    )

## Export

Set date

In [82]:
# Timezone used to decide what "today" is for the export file name
tz = pytz.timezone("America/Los_Angeles")

In [83]:
# Current calendar date in the configured timezone
now_local = datetime.now(tz=tz)
today = now_local.date()

In [84]:
# Name of this county's subfolder under data_dir
slug = "solano"

In [85]:
# Write today's snapshot to <data_dir>/<slug>/<today>.csv without the index
daily_path = data_dir / slug / f"{today}.csv"
export_df.to_csv(daily_path, index=False)

## Combine

In [86]:
# Collect every CSV in the county folder except the combined timeseries file
csv_list = []
for csv_path in glob.glob(str(data_dir / slug / "*.csv")):
    if str(csv_path).endswith("timeseries.csv"):
        continue
    csv_list.append(csv_path)

In [87]:
# Load every snapshot into its own frame, each with a "date" column.
df_list = []
for csv_path in csv_list:
    # Look at the file name only: a parent directory that happens to contain
    # "manual" must not change how files are classified. pathlib also copes
    # with either path separator.
    file_name = pathlib.Path(csv_path).name
    if "manual" in file_name:
        # Manually entered files already carry their own "date" column
        frame = pd.read_csv(csv_path, parse_dates=["date"])
    else:
        # Scraped files are named after the day they were written; parse that
        # name into a datetime so "date" has the same type as in the manual
        # files and the combined frame can be sorted on it.
        frame = pd.read_csv(csv_path, parse_dates=["county_date"])
        frame["date"] = pd.to_datetime(file_name.replace(".csv", ""))
    df_list.append(frame)

In [88]:
# Stack all snapshots, then order them by date and area
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"])

In [89]:
# Write the combined history alongside the daily files, without the index
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)