## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

### NOTE
Per SD [website](https://www.arcgis.com/apps/dashboards/96feda77f12f46638b984fcb1d17bd24): "Beginning 7/1/2021, data will be updated weekly on Wednesdays. Last updated 7/7/2021"


In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
from datetime import datetime, date, timedelta
from slugify import slugify
import numpy as np
import pytz

## Download

Get the county's latest update time to set the query

In [4]:
sd_metadata_url = "https://services1.arcgis.com/1vIhDJwtG5eNmiqX/arcgis/rest/services/Covid19_San_Diego_County_Statistics_PUBLIC_VIEW/FeatureServer/0/?f=json"

In [5]:
sd_metadata_r = requests.get(sd_metadata_url)

In [6]:
sd_metadata = sd_metadata_r.json()

In [7]:
last_updated_str = sd_metadata["editingInfo"]["lastEditDate"]

In [8]:
tz = pytz.timezone("America/Los_Angeles")
last_updated = datetime.fromtimestamp(last_updated_str / 1000, tz).date()

In [9]:
last_updated

datetime.date(2022, 9, 22)

In [10]:
day_before = last_updated - timedelta(days=5)

In [11]:
day_before

datetime.date(2022, 9, 17)

Retrieve the page

In [12]:
url = f"https://services1.arcgis.com/1vIhDJwtG5eNmiqX/arcgis/rest/services/Covid19_San_Diego_County_Statistics_PUBLIC_VIEW/FeatureServer/0/query?f=json&where=(Case_Count%20%3E%3D%201)%20AND%20(UpdateDate%20BETWEEN%20timestamp%20%27{day_before}%2007%3A00%3A00%27%20AND%20timestamp%20%27{last_updated}%2006%3A59%3A59%27)&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&outSR=102100&resultOffset=0&resultRecordCount=125&resultType=standard&cacheHint=true"

In [13]:
r = requests.get(url)

In [14]:
data = r.json()

## Parse

Parse the result

In [15]:
features = data["features"]

In [16]:
values = []

In [17]:
for f in features:
    row = f["attributes"].values()
    values.append(row)

In [18]:
cols = list(data["features"][0]["attributes"].keys())

IndexError: list index out of range

Convert to a dataframe

In [19]:
df = pd.DataFrame(values, columns=cols)

Clean

In [20]:
clean_df = df[["ZipText", "Case_Count", "UpdateDate"]]

In [21]:
clean_df = clean_df.rename(
    columns={
        "ZipText": "zip",
        "Case_Count": "confirmed_cases",
        "UpdateDate": "update_date",
    }
)

Reformat and set the date

In [22]:
clean_df["timestamp"] = clean_df["update_date"] / (1000)

In [23]:
clean_df["timestamp_int"] = clean_df["timestamp"].astype(int)

In [24]:
timestamps = list(clean_df["timestamp_int"])

In [25]:
dates = [datetime.fromtimestamp(t) for t in timestamps]

In [26]:
clean_df["county_date"] = dates

In [27]:
clean_df["county_date"] = pd.to_datetime(clean_df["county_date"]).dt.date

Filter to just the latest date

In [28]:
latest_df = clean_df[clean_df.county_date == clean_df.county_date.max()]

Match zips with community names

In [29]:
matchup = {
    "91901": "Alpine",
    "91902": "Bonita",
    "91905": "Boulevard",
    "91906": "Campo",
    "91910": "Chula Vista",
    "91911": "Chula Vista",
    "91913": "Chula Vista",
    "91914": "Chula Vista",
    "91915": "Chula Vista",
    "91916": "Descanso",
    "91917": "Dulzura",
    "91931": "Guatay",
    "91932": "Imperial Beach",
    "91934": "Jacumba",
    "91935": "Jamul",
    "91941": "La Mesa",
    "91942": "La Mesa",
    "91945": "Lemon Grove",
    "91948": "Mount Laguna",
    "91950": "National City",
    "91962": "Pine Valley",
    "91963": "Potrero",
    "91977": "Spring Valley",
    "91978": "Spring Valley",
    "91980": "Tecate",
    "92003": "Bonsall",
    "92004": "Borrego Springs",
    "92007": "Cardiff by the Sea",
    "92008": "Carlsbad",
    "92009": "Carlsbad",
    "92010": "Carlsbad",
    "92011": "Carlsbad",
    "92014": "Del Mar",
    "92019": "El Cajon",
    "92020": "El Cajon",
    "92021": "El Cajon",
    "92024": "Encinitas",
    "92025": "Escondido",
    "92026": "Escondido",
    "92027": "Escondido",
    "92028": "Fallbrook",
    "92029": "Escondido",
    "92036": "Julian",
    "92037": "La Jolla",
    "92040": "Lakeside",
    "92054": "Oceanside",
    "92055": "Camp Pendleton",
    "92056": "Oceanside",
    "92057": "Oceanside",
    "92058": "Oceanside",
    "92059": "Pala",
    "92060": "Palomar Mountain",
    "92061": "Pauma Valley",
    "92064": "Poway",
    "92065": "Ramona",
    "92066": "Ranchita",
    "92067": "Rancho Santa Fe",
    "92069": "San Marcos",
    "92070": "Santa Ysabel",
    "92071": "Santee",
    "92075": "Solana Beach",
    "92078": "San Marcos",
    "92081": "Vista",
    "92082": "Valley Center",
    "92083": "Vista",
    "92084": "Vista",
    "92086": "Warner Springs",
    "92091": "Rancho Santa Fe",
    "92093": "La Jolla",
    "92096": "San Marcos",
    "92101": "San Diego",
    "92102": "San Diego",
    "92103": "San Diego",
    "92104": "San Diego",
    "92105": "San Diego",
    "92106": "San Diego",
    "92107": "San Diego",
    "92108": "San Diego",
    "92109": "San Diego",
    "92110": "San Diego",
    "92111": "San Diego",
    "92113": "San Diego",
    "92114": "San Diego",
    "92115": "San Diego",
    "92116": "San Diego",
    "92117": "San Diego",
    "92118": "Coronado",
    "92119": "San Diego",
    "92120": "San Diego",
    "92121": "San Diego",
    "92122": "San Diego",
    "92123": "San Diego",
    "92124": "San Diego",
    "92126": "San Diego",
    "92127": "San Diego",
    "92128": "San Diego",
    "92129": "San Diego",
    "92130": "San Diego",
    "92131": "San Diego",
    "92134": "San Diego",
    "92135": "Coronado",
    "92136": "San Diego",
    "92139": "San Diego",
    "92140": "San Diego",
    "92145": "San Diego",
    "92154": "San Diego",
    "92155": "Coronado",
    "92161": "San Diego",
    "92173": "San Ysidro",
    "92182": "San Diego",
    "92259": "Ocotillo",
    "92536": "Aguanga",
    "92672": "San Clemente",
}

In [30]:
latest_df["city"] = latest_df.zip.map(matchup)

In [31]:
latest_df["area"] = latest_df["zip"] + ": " + latest_df["city"]

In [32]:
latest_df["county"] = "San Diego"

In [33]:
export_df = latest_df[["county", "area", "confirmed_cases", "county_date", "zip"]]

In [34]:
export_df = export_df.rename(columns={"area": "city"}).drop_duplicates()

## Vet

Make sure everyone is here

In [35]:
dupes = export_df[export_df.duplicated()]

In [36]:
try:
    assert len(dupes) == 0
except AssertionError:
    print(dupes.set_index("county")[["county_date", "city"]])
    raise AssertionError(
        f"There are {len(dupes)} duplicate zip codes in the San Diego scraper"
    )

In [37]:
try:
    assert not len(export_df) > 110
except AssertionError:
    raise AssertionError("San Diego County's scraper has more rows than before")

In [38]:
try:
    assert not len(export_df) < 110
except AssertionError:
    raise AssertionError("San Diego County's scraper is missing rows")

## Export

Set date

In [39]:
tz = pytz.timezone("America/Los_Angeles")

In [40]:
today = datetime.now(tz).date()

In [41]:
slug = "san-diego"

In [42]:
export_df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [43]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [44]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"]).rename(
            columns={"city": "area"}
        )
        df["date"] = file_date
    df_list.append(df)

In [45]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [47]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)