## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime, date, timedelta
from slugify import slugify
from playwright.async_api import async_playwright

## Download

Retrieve the page

In [4]:
url = "https://e.infogram.com/1c8458d3-1302-4c47-bd47-24366f127b72?parent_url=https%3A%2F%2Fwww.emergencyslo.org%2Fen%2Fpositive-case-details.aspx&src=embed#"

In [5]:
url = "https://e.infogram.com/6d3c0cf1-4960-4340-9872-78dd22cb991a?parent_url=https%3A%2F%2Fwww.slocounty.ca.gov%2FCOVID-19%2FData.aspx&src=embed#"

In [6]:
page = requests.get(url)

In [7]:
soup = BeautifulSoup(page.content, "html.parser")

## Parse

Find script tag with the Infogram data

In [8]:
for elem in soup(text=re.compile(r"window.infographicData")):
    script = elem

In [9]:
stripped_script = re.sub("window.infographicData=", "", script)

In [10]:
stripped_script = stripped_script.rstrip(";")

In [11]:
data = json.loads(stripped_script)

In [12]:
cities = [
    x
    for x in data["elements"]
    ##if x["object_id"] == "3695dc89-8a1c-4aa2-951b-11eb68e147ac"
    if x["object_id"] == "ebb96e22-3766-4a73-a776-90c533b379b5"
]

# "ebb96e22-3766-4a73-a776-90c533b379b5"
# "e024d6eb-623f-48fc-8345-19a937721f7d"

In [13]:
city_list = cities[0]["data"][0]

In [14]:
not_empty = [ele for ele in city_list if ele[0] != ""]

In [15]:
df = pd.DataFrame(not_empty)

In [16]:
df.columns = df.iloc[0]
df = df[1:]

In [17]:
df

Unnamed: 0,Date,Arroyo Grande,Atascadero,Grover Beach,Nipomo,Paso Robles,San Luis Obispo,Templeton,CMC (Inmates),Pismo Beach,...,Shandon,Cal Poly (campus residents),Cayucos,Avila,Santa Margarita,Creston,San Simeon,Bradley,ASH (patients),NaN
1,3/14/2020,,,,,,,,,,...,,,,,,,,,,
2,3/15/2020,,,,,,,,,,...,,,,,,,,,,
3,3/16/2020,,,,,,,,,,...,,,,,,,,,,
4,3/17/2020,,,,,,,,,,...,,,,,,,,,,
5,3/18/2020,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680,1/21/2022,3649,4496,1953,3315,7923,8235,1494,2549,863,...,312,849,236,85,317,175,64,36,249,
681,1/22/2022,3661,4511,1959,3325,7983,8273,1501,2554,864,...,313,854,236,87,318,177,65,37,253,357
682,1/23/2022,3669,4528,1964,3338,8033,8290,1507,2559,866,...,314,855,233,88,319,179,65,37,253,353
683,1/24/2022,3734,4611,2008,3406,8190,8450,1527,2571,895,...,319,857,242,91,324,179,66,37,258,362


Clean up the dataframe

In [18]:
melt_df = pd.melt(
    df, id_vars=["Date"], var_name="area", value_name="confirmed_cases"
).rename(columns={"Date": "date"})

In [19]:
melt_df["date"] = pd.to_datetime(melt_df["date"])

In [20]:
melt_df["confirmed_cases"] = melt_df["confirmed_cases"].replace(",", "", regex=True)

In [21]:
filter_df = melt_df[(melt_df["confirmed_cases"] != "") & (melt_df["area"] != "Bradley")]

In [22]:
sort_df = filter_df.sort_values(["date", "area"])

In [23]:
assert not sort_df.area.isnull().any()

AssertionError: 

In [60]:
assert len(sort_df[sort_df.area == ""]) == 0

Get timestamp

In [61]:
text = data["elements"]

In [62]:
grafs = []
for ele in data["elements"]:
    keys = ele.keys()
    if "text" in keys:
        text = ele["text"]
        grafs.append(text)

In [63]:
regexDateGraf = re.compile("As of \d{1,2}\/\d{1,2}\/\d{1,2}")

In [64]:
parsed_text = []
for graf in grafs:
    soup = BeautifulSoup(graf, "html.parser")
    text = soup.text
    parsed_text.append(text)

In [65]:
date_graf = [
    regexDateGraf.search(t)[0] for t in parsed_text if regexDateGraf.search(t) != None
][0]

In [66]:
date_graf_trim = date_graf.replace("As of ", "")

In [67]:
latest_date = pd.to_datetime(date_graf_trim, format="%m/%d/%y").date()

In [68]:
sort_df["county_date"] = latest_date

In [69]:
sort_df.insert(0, "county", "San Luis Obispo")

In [70]:
latest_df = sort_df[sort_df.date == sort_df.date.max()]

## Vet

Make sure everyone is here

In [71]:
try:
    assert not len(latest_df) > 22
except AssertionError:
    raise AssertionError("San Luis Obispo County's scraper has more rows than before")

In [72]:
try:
    assert not len(latest_df) < 22
except AssertionError:
    raise AssertionError("San Luis Obispo County's scraper is missing rows")

In [74]:
latest_df

Unnamed: 0,county,date,area,confirmed_cases,county_date
15570,San Luis Obispo,2022-01-18,ASH (patients),240,2022-01-18
676,San Luis Obispo,2022-01-18,Arroyo Grande,3366,2022-01-18
1353,San Luis Obispo,2022-01-18,Atascadero,4259,2022-01-18
12185,San Luis Obispo,2022-01-18,Avila,78,2022-01-18
5415,San Luis Obispo,2022-01-18,CMC (Inmates),2501,2022-01-18
10831,San Luis Obispo,2022-01-18,Cal Poly (campus residents),817,2022-01-18
9477,San Luis Obispo,2022-01-18,Cambria,380,2022-01-18
11508,San Luis Obispo,2022-01-18,Cayucos,209,2022-01-18
13539,San Luis Obispo,2022-01-18,Creston,167,2022-01-18
2030,San Luis Obispo,2022-01-18,Grover Beach,1787,2022-01-18


## Export

Set date

In [75]:
tz = pytz.timezone("America/Los_Angeles")

In [76]:
today = datetime.now(tz).date()

In [77]:
slug = "san-luis-obispo"

In [78]:
latest_df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [79]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [80]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [81]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [82]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)