## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime, date, timedelta
from slugify import slugify
from playwright.async_api import async_playwright

## Download

Retrieve the page

In [4]:
url = "https://e.infogram.com/1c8458d3-1302-4c47-bd47-24366f127b72?parent_url=https%3A%2F%2Fwww.emergencyslo.org%2Fen%2Fpositive-case-details.aspx&src=embed#"

In [5]:
page = requests.get(url)

In [6]:
soup = BeautifulSoup(page.content, "html.parser")

## Parse

Find script tag with the Infogram data

In [7]:
for elem in soup(text=re.compile(r"window.infographicData")):
    script = elem

In [8]:
stripped_script = re.sub("window.infographicData=", "", script)

NameError: name 'script' is not defined

In [9]:
stripped_script = stripped_script.rstrip(";")

In [10]:
data = json.loads(stripped_script)

In [11]:
cities = [
    x
    for x in data["elements"]
    if x["object_id"] == "3695dc89-8a1c-4aa2-951b-11eb68e147ac"
]

In [12]:
city_list = cities[0]["data"][0]

In [13]:
not_empty = [ele for ele in city_list if ele[0] != ""]

In [14]:
df = pd.DataFrame(not_empty)

In [15]:
df.columns = df.iloc[0]
df = df[1:]

Clean up the dataframe

In [16]:
melt_df = pd.melt(
    df, id_vars=["Date"], var_name="area", value_name="confirmed_cases"
).rename(columns={"Date": "date"})

In [17]:
melt_df["date"] = pd.to_datetime(melt_df["date"])

In [18]:
melt_df["confirmed_cases"] = melt_df["confirmed_cases"].replace(",", "", regex=True)

In [19]:
filter_df = melt_df[(melt_df["confirmed_cases"] != "") & (melt_df["area"] != "Bradley")]

In [20]:
sort_df = filter_df.sort_values(["date", "area"])

In [21]:
assert not sort_df.area.isnull().any()

In [22]:
assert len(sort_df[sort_df.area == ""]) == 0

Get timestamp

In [23]:
text = data["elements"]

In [24]:
grafs = []
for ele in data["elements"]:
    keys = ele.keys()
    if "text" in keys:
        text = ele["text"]
        grafs.append(text)

In [25]:
regexDateGraf = re.compile("As of \d{1,2}\/\d{1,2}\/\d{1,2}")

In [26]:
parsed_text = []
for graf in grafs:
    soup = BeautifulSoup(graf, "html.parser")
    text = soup.text
    parsed_text.append(text)

In [27]:
date_graf = [
    regexDateGraf.search(t)[0] for t in parsed_text if regexDateGraf.search(t) != None
][0]

In [28]:
date_graf_trim = date_graf.replace("As of ", "")

In [29]:
latest_date = pd.to_datetime(date_graf_trim, format="%m/%d/%y").date()

In [30]:
sort_df["county_date"] = latest_date

In [31]:
sort_df.insert(0, "county", "San Luis Obispo")

In [32]:
latest_df = sort_df[sort_df.date == sort_df.date.max()]

## Vet

Make sure everyone is here

In [33]:
try:
    assert not len(latest_df) > 22
except AssertionError:
    raise AssertionError("San Luis Obispo County's scraper has more rows than before")

In [34]:
try:
    assert not len(latest_df) < 22
except AssertionError:
    raise AssertionError("San Luis Obispo County's scraper is missing rows")

## Export

Set date

In [35]:
tz = pytz.timezone("America/Los_Angeles")

In [36]:
today = datetime.now(tz).date()

In [37]:
slug = "san-luis-obispo"

In [38]:
latest_df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [39]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [40]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [41]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [42]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)