## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import json
import requests
import pandas as pd
from slugify import slugify
from datetime import datetime

## Download

Retrieve the page

In [4]:
url = "https://services.arcgis.com/NkcnS0qk4w2wasOJ/ArcGIS/rest/services/COVIDCasesByCities/FeatureServer/3/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*&returnGeometry=true&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token="

In [5]:
r = requests.get(url)

In [6]:
r

<Response [200]>

In [7]:
data = r.json()

In [8]:
len(data["features"])

KeyError: 'features'

In [25]:
data["features"][1]

{'attributes': {'OBJECTID': 2,
  'Join_Count': 0,
  'NAME': 'SANTA CRUZ COUNTY',
  'geo_id': None,
  'City': None,
  'Cases': None,
  'Population': None,
  'Rate': None,
  'Shape__Area': 668618.977539063,
  'Shape__Length': 3239.93986958669},
 'geometry': {'rings': [[[6728599.38847928, 1831279.63274589],
    [6728661.3818836, 1831246.34379115],
    [6728727.83590517, 1831206.90638985],
    [6728787.99895279, 1831165.22262471],
    [6728840.78297063, 1831124.74675339],
    [6728881.19021294, 1831089.88006416],
    [6728904.37384411, 1831069.28459403],
    [6728929.38563007, 1831044.64445106],
    [6728969.36490496, 1831003.63899634],
    [6728989.17426228, 1830981.68266236],
    [6728747.67184793, 1830592.55763525],
    [6728492.34435281, 1830599.32319532],
    [6727813.64050935, 1830930.97292692],
    [6727902.19354749, 1831278.29853885],
    [6728035.39037799, 1831543.18449159],
    [6728599.38847928, 1831279.63274589]]]}}

## Parse

In [14]:
dict_list = []

In [28]:
for item in data["features"]:
    d = dict(
        county="Santa Clara",
        area=item["attributes"]["City"],
        confirmed_cases=item["attributes"]["Cases"],
    )
    if d["area"] != "UNINCORPORATED" and d["confirmed_cases"]:
        dict_list.append(d)

In [29]:
df = pd.DataFrame(dict_list)

In [30]:
df["area"] = df["area"].str.lower().str.title()

Get timestamp

In [34]:
date_url = "https://services.arcgis.com/NkcnS0qk4w2wasOJ/arcgis/rest/services/COVIDCasesByCities/FeatureServer/3?f=json"
date_r = requests.get(date_url)
date_data = date_r.json()

In [35]:
timestamp = date_data["editingInfo"]["lastEditDate"]

In [36]:
timestamp = datetime.fromtimestamp((timestamp / 1000))

In [37]:
latest_date = pd.to_datetime(timestamp).date()

In [38]:
df["county_date"] = latest_date

## Vet

In [43]:
len(df)

15

In [44]:
try:
    assert not len(df) > 15
except AssertionError:
    raise AssertionError(
        f"Santa Clara County's scraper has extra {len(export_df) - 9} rows: {list(export_df.area)}"
    )

In [45]:
try:
    assert not len(df) < 15
except AssertionError:
    raise AssertionError(
        f"Santa Clara County's scraper is missing {9 - len(export_df)} rows: {list(export_df.area)}"
    )

## Export

Set date

In [46]:
tz = pytz.timezone("America/Los_Angeles")

In [47]:
today = datetime.now(tz).date()

In [48]:
slug = "santa-clara"

In [49]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [50]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [51]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [52]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [53]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)