## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [2]:
import os
import pytz
import glob
import pathlib

# Anchor all paths at the notebook's working directory; scraped files live in
# its "data" subfolder.
this_dir = pathlib.Path(os.path.abspath(os.curdir))
data_dir = this_dir.joinpath("data")

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime, date, timedelta
from slugify import slugify
from playwright.async_api import async_playwright



## Download

Retrieve the page

In [4]:
# NOTE(review): the next cell assigns `url` again before anything reads this
# value, so this embed address is effectively unused.
url = (
    "https://e.infogram.com/1c8458d3-1302-4c47-bd47-24366f127b72"
    "?parent_url=https%3A%2F%2Fwww.emergencyslo.org%2Fen%2Fpositive-case-details.aspx"
    "&src=embed#"
)

In [5]:
# Infogram embed that is actually downloaded below.
url = (
    "https://e.infogram.com/6d3c0cf1-4960-4340-9872-78dd22cb991a"
    "?parent_url=https%3A%2F%2Fwww.slocounty.ca.gov%2FCOVID-19%2FData.aspx"
    "&src=embed#"
)

In [6]:
# Fetch the embed page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops the run on an HTTP error response
# instead of handing an error page to the parser.
page = requests.get(url, timeout=60)
page.raise_for_status()

In [7]:
# Parse the raw response body with the standard-library HTML parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

## Parse

Find script tag with the Infogram data

In [8]:
# Locate the inline script that assigns the Infogram payload. The dot is escaped
# so it only matches a literal ".", and `string=` is the current spelling of
# Beautiful Soup's text filter.
script_matches = soup.find_all(string=re.compile(r"window\.infographicData"))
if not script_matches:
    # Fail here with a clear message rather than with a NameError further down.
    raise ValueError("Could not find window.infographicData in the downloaded page")

# If several nodes match, the last one wins.
script = script_matches[-1]

In [9]:
# Drop the JavaScript assignment prefix so only the object literal is left.
assignment_prefix = re.compile("window.infographicData=")
stripped_script = assignment_prefix.sub("", script)

In [10]:
# Trim the statement terminator(s) from the end of the script text.
statement_terminator = ";"
stripped_script = stripped_script.rstrip(statement_terminator)

In [11]:
# Decode the remaining text as JSON; later cells index the result by "elements".
data = json.loads(stripped_script)

In [12]:
# Infogram object id of the element that holds the per-area case table.
# Other ids that appeared alongside it in this cell (presumably ids the chart
# used at other times — verify before switching):
#   "3695dc89-8a1c-4aa2-951b-11eb68e147ac"
#   "e024d6eb-623f-48fc-8345-19a937721f7d"
CITY_TABLE_OBJECT_ID = "ebb96e22-3766-4a73-a776-90c533b379b5"

cities = [x for x in data["elements"] if x["object_id"] == CITY_TABLE_OBJECT_ID]

# Stop here with a clear message if the chart no longer contains that element.
if not cities:
    raise ValueError(
        f"No Infogram element with object_id {CITY_TABLE_OBJECT_ID!r} was found"
    )

In [13]:
# Take the first sheet of rows from the first matching element.
city_element = cities[0]
city_list = city_element["data"][0]

In [14]:
# Keep only the rows whose first cell is not an empty string.
not_empty = list(filter(lambda row: row[0] != "", city_list))

In [15]:
# Load the remaining rows into a frame; the header is still the first data row.
df = pd.DataFrame(data=not_empty)

In [16]:
# Promote the first row to column labels and keep everything after it.
header_row = df.iloc[0]
df = df.iloc[1:]
df.columns = header_row

In [17]:
# Show the parsed table: one row per date, one column per area.
df

Unnamed: 0,Date,Arroyo Grande,Atascadero,Grover Beach,Nipomo,Paso Robles,San Luis Obispo,Templeton,CMC (Inmates),Pismo Beach,...,Shandon,Cal Poly (campus residents),Cayucos,Avila,Santa Margarita,Creston,San Simeon,Bradley,ASH (patients),NaN
1,3/14/2020,,,,,,,,,,...,,,,,,,,,,
2,3/15/2020,,,,,,,,,,...,,,,,,,,,,
3,3/16/2020,,,,,,,,,,...,,,,,,,,,,
4,3/17/2020,,,,,,,,,,...,,,,,,,,,,
5,3/18/2020,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,1/24/2022,3734,4611,2008,3406,8190,8450,1527,2571,895,...,319,857,242,91,324,179,66,37,258,362
684,1/25/2022,3831,4710,2063,3518,8365,8676,1552,2575,919,...,323,868,246,94,332,182,66,38,266,374
685,1/26/2022,3955,4824,2137,3636,8597,8862,1597,2588,945,...,336,884,257,97,336,185,69,40,266,
686,1/27/2022,4071,4918,2193,3723,8761,9018,1618,2590,972,...,341,885,261,98,343,190,69,43,266,


Clean up the dataframe

In [18]:
# Reshape from one column per area to one row per (date, area) pair.
long_df = df.melt(
    id_vars=["Date"],
    var_name="area",
    value_name="confirmed_cases",
)
melt_df = long_df.rename(columns={"Date": "date"})

In [19]:
# Convert the date strings into pandas datetimes.
melt_df["date"] = melt_df["date"].pipe(pd.to_datetime)

In [20]:
# Remove thousands separators from the case counts.
raw_counts = melt_df["confirmed_cases"]
melt_df["confirmed_cases"] = raw_counts.replace(to_replace=",", value="", regex=True)

In [21]:
# Drop cells with no reported count, and the "Bradley" area.
has_count = melt_df["confirmed_cases"] != ""
not_bradley = melt_df["area"] != "Bradley"
filter_df = melt_df.loc[has_count & not_bradley]

In [22]:
# Order chronologically, then alphabetically by area within each date.
sort_df = filter_df.sort_values(by=["date", "area"])

In [23]:
# Discard rows that came from a column with a missing header.
area_present = sort_df["area"].notna()
sort_df = sort_df.loc[area_present]

In [24]:
# Every remaining row must carry an area label.
assert sort_df["area"].notna().all()

In [25]:
# ...and that label must not be an empty string.
assert not (sort_df["area"] == "").any()

Get timestamp

In [26]:
# NOTE(review): nothing later in the notebook reads this value from `text`;
# it looks like a leftover from exploration and can probably be removed.
text = data["elements"]

In [27]:
# Collect the "text" payload of every element that has one.
grafs = [element["text"] for element in data["elements"] if "text" in element]

In [28]:
# Matches an "As of M/D/YY" stamp. Written as a raw string so the regex escapes
# are not treated as (invalid) Python string escapes.
# NOTE(review): only two year digits are captured, so a four-digit year would
# be cut to its first two digits — confirm the dashboard always uses YY.
regexDateGraf = re.compile(r"As of \d{1,2}/\d{1,2}/\d{1,2}")

In [29]:
# Strip the HTML markup from each text element, keeping only its visible text.
# Parsing inline avoids rebinding `soup`, which still holds the downloaded page,
# so earlier cells remain safe to re-run.
parsed_text = [BeautifulSoup(graf, "html.parser").text for graf in grafs]

In [30]:
# Search each paragraph once and keep the first "As of M/D/YY" stamp found.
date_match = next(
    (found for found in map(regexDateGraf.search, parsed_text) if found is not None),
    None,
)
if date_match is None:
    # Name the actual problem instead of failing with a bare IndexError.
    raise ValueError('No "As of M/D/YY" date was found in the chart text')

date_graf = date_match[0]

In [31]:
# Remove the leading label so only the M/D/YY portion remains.
date_label = "As of "
date_graf_trim = date_graf.replace(date_label, "")

In [32]:
# Parse the two-digit-year stamp and reduce it to a plain calendar date.
latest_timestamp = pd.to_datetime(date_graf_trim, format="%m/%d/%y")
latest_date = latest_timestamp.date()

In [33]:
# Stamp every row with the date the county says the data is current as of.
sort_df = sort_df.assign(county_date=latest_date)

In [34]:
# Add the county name as the first column. The guard makes the cell safe to
# re-run: DataFrame.insert raises ValueError when the column already exists.
if "county" not in sort_df.columns:
    sort_df.insert(0, "county", "San Luis Obispo")

In [35]:
# Keep only the rows for the most recent date in the table.
most_recent = sort_df["date"].max()
latest_df = sort_df.loc[sort_df["date"] == most_recent]

## Vet

Make sure every expected area is present: the latest snapshot should contain exactly 22 rows.

In [36]:
# The latest snapshot is expected to hold 22 areas; more than that means the
# county added something. Raising directly keeps the check active under
# `python -O` and avoids a chained traceback.
if len(latest_df) > 22:
    raise AssertionError("San Luis Obispo County's scraper has more rows than before")

In [37]:
# The latest snapshot is expected to hold 22 areas; fewer than that means
# something dropped out. Raising directly keeps the check active under
# `python -O` and avoids a chained traceback.
if len(latest_df) < 22:
    raise AssertionError("San Luis Obispo County's scraper is missing rows")

In [38]:
# Show the rows that are about to be exported.
latest_df

Unnamed: 0,county,date,area,confirmed_cases,county_date
15800,San Luis Obispo,2022-01-28,ASH (patients),270,2022-01-28
686,San Luis Obispo,2022-01-28,Arroyo Grande,4148,2022-01-28
1373,San Luis Obispo,2022-01-28,Atascadero,4984,2022-01-28
12365,San Luis Obispo,2022-01-28,Avila,99,2022-01-28
5495,San Luis Obispo,2022-01-28,CMC (Inmates),2629,2022-01-28
10991,San Luis Obispo,2022-01-28,Cal Poly (campus residents),889,2022-01-28
9617,San Luis Obispo,2022-01-28,Cambria,477,2022-01-28
11678,San Luis Obispo,2022-01-28,Cayucos,265,2022-01-28
13739,San Luis Obispo,2022-01-28,Creston,192,2022-01-28
2060,San Luis Obispo,2022-01-28,Grover Beach,2240,2022-01-28


## Export

Set date

In [39]:
# File names are dated in Pacific time.
timezone_name = "America/Los_Angeles"
tz = pytz.timezone(timezone_name)

In [40]:
# Current calendar date in that timezone.
now_local = datetime.now(tz)
today = now_local.date()

In [41]:
# Name of this county's subfolder under the data directory.
slug: str = "san-luis-obispo"

In [42]:
# Write today's snapshot to <data_dir>/<slug>/<today>.csv without the index.
daily_csv_path = data_dir / slug / f"{today}.csv"
latest_df.to_csv(daily_csv_path, index=False)

## Combine

In [43]:
# Every CSV in the county folder except the combined timeseries itself.
csv_pattern = str(data_dir / slug / "*.csv")
csv_list = [
    path for path in glob.glob(csv_pattern) if not str(path).endswith("timeseries.csv")
]

In [44]:
def _load_snapshot(csv_path):
    """Read one CSV from the county folder into a DataFrame.

    Files whose *name* contains "manual" keep the ``date`` column they were
    saved with. Every other file is treated as a daily snapshot, and its
    ``date`` column is replaced with the file name minus the extension.

    Parameters
    ----------
    csv_path : str
        Path to the CSV file, as returned by ``glob.glob``.

    Returns
    -------
    pandas.DataFrame
    """
    # Work from the file name only, so a parent directory that happens to
    # contain "manual", or a platform that uses "\\" as separator, cannot
    # change the outcome.
    file_path = pathlib.Path(csv_path)
    if "manual" in file_path.name:
        return pd.read_csv(csv_path, parse_dates=["date"])

    snapshot = pd.read_csv(csv_path, parse_dates=["county_date"])
    # NOTE(review): this is a string while manual files carry parsed datetimes;
    # confirm the mix sorts as intended when both kinds of file are present.
    snapshot["date"] = file_path.stem
    return snapshot


df_list = [_load_snapshot(csv_path) for csv_path in csv_list]

In [45]:
# Stack all snapshots and order them by date, then by area.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"])

In [46]:
# Write the combined history next to the daily files.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)