## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
from slugify import slugify
from datetime import datetime

## Download

Retrieve the page

In [4]:
url = "https://data.sfgov.org/resource/tpyr-dvnc.json"

In [5]:
r = requests.get(url)

In [6]:
data = r.json()

## Parse

Filter it down to ZIP Code level data only

In [7]:
zip_codes = [x for x in data if x["area_type"] == "ZCTA"]

Parse the result

In [8]:
dict_list = [
    dict(
        county="San Francisco",
        area=z["id"],
        confirmed_cases=z.get("count", 0),
        county_date=z["last_updated_at"],
    )
    for z in zip_codes
]

KeyError: 'last_updated_at'

Convert to a dataframe

In [9]:
df = pd.DataFrame(dict_list)

Set the date

In [10]:
df["county_date"] = pd.to_datetime(df["county_date"])

In [11]:
df["county_date"] = df["county_date"].dt.date

## Vet

In [12]:
try:
    assert not len(df) > 27
except AssertionError:
    raise AssertionError("San Francisco's scraper has more rows than before")

In [13]:
try:
    assert not len(df) < 27
except AssertionError:
    raise AssertionError("San Francisco's scraper is missing rows")

## Export

Set date

In [14]:
tz = pytz.timezone("America/Los_Angeles")

In [15]:
today = datetime.now(tz).date()

In [16]:
slug = "san-francisco"

In [17]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [18]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [19]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [20]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [21]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)