## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [1]:
import os
import pathlib

In [2]:
this_dir = pathlib.Path(os.path.abspath(""))

In [3]:
data_dir = this_dir / "data"

In [4]:
import pytz
import glob
import requests
import pandas as pd
import json
from datetime import datetime, date
from bs4 import BeautifulSoup
import regex as re

ModuleNotFoundError: No module named 'regex'

## Download

Retrieve the page

In [27]:
url = "https://utility.arcgis.com/usrsvcs/servers/9ccc4670c77442f7b12b198a904f4a51/rest/services/HHS/Covid/MapServer/0/query?f=json&returnGeometry=false&outFields=*&where=1=1"

In [28]:
r = requests.get(url)

In [29]:
data = r.json()

## Parse

In [30]:
dict_list = []

In [31]:
for item in data["features"]:
    d = dict(
        county="Marin",
        area=item["attributes"]["Name"],
        confirmed_cases=item["attributes"]["CumulativePositives"],
    )
    dict_list.append(d)

In [32]:
df = pd.DataFrame(dict_list)

Get timestamp

In [33]:
headers = {"User-Agent": "Mozilla/5.0"}

In [34]:
url = "https://coronavirus.marinhhs.org/surveillance"

In [35]:
page = requests.get(url, headers=headers)

In [36]:
soup = BeautifulSoup(page.content, "html.parser")

In [37]:
last_updated_sentence = soup.find("div", {"class": "last-updated"}).text

In [38]:
last_updated_sentence

'Last Updated: 10.02.21 - 3:30 PM PST.\n*Residents Vaccinated represents the number of Marin County residents who have received at least one dose of vaccine.'

In [39]:
date = re.search("[0-9]{2}.[0-9]{2}.2[0-9]{1}", last_updated_sentence).group()

In [40]:
df["county_date"] = pd.to_datetime(date).date()

## Vet

Ensure we're getting all 54 areas of Marin County

In [41]:
try:
    assert not len(df) > 54
except AssertionError:
    raise AssertionError("Marin County's scraper has more rows than before")

In [42]:
try:
    assert not len(df) < 54
except AssertionError:
    raise AssertionError("Marin's scraper is missing rows")

## Export

Set date

In [43]:
tz = pytz.timezone("America/Los_Angeles")

In [44]:
today = datetime.now(tz).date()

In [45]:
slug = "marin"

In [46]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [47]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [48]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [49]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [50]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)