## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
import os
import pathlib

In [2]:
# Absolute path of the current working directory.
this_dir = pathlib.Path.cwd()

In [3]:
# Folder under the working directory where scraped CSVs are stored.
data_dir = this_dir.joinpath("data")

In [4]:
import re
import pytz
import glob
import requests
import unicodedata
import pandas as pd
from slugify import slugify
from bs4 import BeautifulSoup
from datetime import datetime, date



## Download

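Retrieve the county's location data. Of the two `url` assignments below, the second (the JSON feed) is the one that is actually requested.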

In [64]:
# HTML locations page. This value is overwritten by the JSON endpoint assigned
# in the next cell before any request is made, so it is effectively unused.
url = "http://publichealth.lacounty.gov/media/Coronavirus/locations.htm"

In [106]:
# JSON endpoint that is actually requested below. The county's casecounter.js
# fetches a file of this name and injects its fields (e.g. `json.comms`) into
# the page as HTML.
url = "http://publichealth.lacounty.gov/media/Coronavirus/json/covid19_location_casecounter.json"

In [107]:
# Download the feed. A timeout keeps a stalled connection from hanging the
# notebook forever, and raise_for_status() stops the run on an HTTP error
# instead of letting an error page be parsed further down.
page = requests.get(url, timeout=30)
page.raise_for_status()

## Parse

In [108]:
# The requested URL is a .json feed, but its raw bytes are handed to the HTML
# parser as-is so the markup embedded in it can be searched for the table.
# NOTE(review): JSON escape sequences inside the strings are not decoded on
# this path — consider parsing the relevant page.json() field instead; confirm
# against the live feed.
soup = BeautifulSoup(page.content, "html.parser")

Get table

In [126]:
# Locate the table that contains the "CITY/COMMUNITY" label. If several text
# nodes match, the table of the last match is kept.
# `string=` and `find_parent` are the current spellings of the deprecated
# `text=` argument and `findParent` method.
table = None
for tag in soup.find_all(string=re.compile("CITY/COMMUNITY")):
    table = tag.find_parent("table")

# Stop here with a clear message rather than hitting a NameError/AttributeError
# in a later cell when the label is not present in the response.
if table is None:
    raise ValueError("Could not find the CITY/COMMUNITY table in the L.A. County feed")

In [153]:
# Every <tr> of the matched table; the first row is skipped when parsing below.
row_list = table.find_all("tr")

In [156]:
# Number of rows that will be parsed (everything after the first row).
len(row_list[1:])

342

In [132]:
# Collects one dict per table row. The parsing loop appends to this list, so
# re-run this cell before re-running the loop to avoid duplicated rows.
dict_list = []

In [133]:
def safetxt(element):
    """Return an element's text without zero-width spaces or outer whitespace.

    Args:
        element: Any object exposing a ``text`` string attribute (for example
            a parsed table cell).

    Returns:
        str: The cleaned text.
    """
    # Remove zero-width spaces BEFORE stripping. str.strip() does not treat
    # U+200B as whitespace, so stripping first leaves any spaces that sit
    # next to a leading or trailing zero-width space in place.
    return element.text.replace("\u200b", "").strip()

In [134]:
def safenumber(element):
    """Return an element's cleaned text with commas and spaces removed.

    The result is still a string (e.g. "25,445" becomes "25445"); no numeric
    conversion is performed here.
    """
    cleaned = safetxt(element)
    for unwanted in (",", " "):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [157]:
# Build one record per row, skipping the first row of the table.
for tr in row_list[1:]:
    cells = tr.find_all("td")
    record = {
        "county": "Los Angeles",
        "area": safetxt(cells[0]),
        "confirmed_cases": safenumber(cells[1]),
        # Deaths are read from the fourth cell; the third cell is not used.
        "confirmed_deaths": safenumber(cells[3]),
    }
    dict_list.append(record)

In [158]:
# One row per area. The count columns hold strings at this point, because
# safenumber() returns cleaned text rather than a numeric type.
df = pd.DataFrame(dict_list)

In [159]:
# Preview the parsed table.
df

Unnamed: 0,county,area,confirmed_cases,confirmed_deaths
0,Los Angeles,City of Agoura Hills,6334,27
1,Los Angeles,City of Alhambra,25445,301
2,Los Angeles,City of Arcadia,13667,213
3,Los Angeles,City of Artesia,5997,95
4,Los Angeles,City of Avalon,288,3
...,...,...,...,...
337,Los Angeles,Unincorporated - Whittier,999,11
338,Los Angeles,Unincorporated - Whittier Narrows,76,1
339,Los Angeles,Unincorporated - Willowbrook,16744,141
340,Los Angeles,Unincorporated - Wiseburn,1909,20


Get the timestamp that the county's figures run through

In [160]:
# JavaScript file that fills the county page from the JSON feed.
# NOTE(review): the active date extraction further down reads `page.text`, not
# this script, so this download looks like a leftover — confirm before removing.
date_url = "http://publichealth.lacounty.gov/media/Coronavirus/js/casecounter.js"

In [161]:
# Download the script text. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors instead of storing an error page.
response = requests.get(date_url, timeout=30)
response.raise_for_status()
date_page = response.text

In [162]:
# Inspect the downloaded script source.
date_page

"$(document).ready(function() {\t\r\n\tfetch('../json/covid19_location_casecounter.json')\r\n\t\t.then((response) => response.json())\r\n\t\t.then((json) => {\r\n\t\t\t$('#ctn').html(json.count);\r\n\t\t\t$('#det').html(json.death);\r\n\t\t\t$('#dctn').html(json.dailycount);\r\n\t\t\t$('#ddet').html(json.dailydeath);\r\n\t\t\t$('#hosp_1').html(json.hospitalizationsever);\r\n\t\t\t$('#dte').html(json.date);\r\n\t\t\t$('#dte_1').html(json.date);\r\n\t\t\t$('#dte_2').html(json.date);\r\n\t\t\t$('#dte_3').html(json.datehospitalizations);\r\n\t\t\t$('#cse').html(json.info);\t\r\n\t\t\t\r\n\t\t\t$('#tstavg').html(json.testingaverage);\t\r\n\t\t\t$('#ppltst').html(json.peopletested);\t\r\n\t\t\t$('#pos').html(json.positivity);\t\r\n\t\t\t$('#hosp').html(json.hospitalizations);\r\n\r\n\t\t\t/** Tables **/\r\n\t\t\t$('#case_summary').html(json.comms);\r\n\t\t\t$('#res_settings').html(json.res);\r\n\t\t\t$('#non_res_settings').html(json.non_res);\r\n\t\t\t$('#peh_settings').html(json.peh);\r\n\t

In [96]:
# date_text = re.search(r"([0-9][0-9]/[0-9][0-9])", date_page).group(1)
# date_text = date_text + "/" + str(date.today().year)

In [174]:
# Capture whatever sits between "through " and the next double quote in the
# raw feed text (e.g. '12:00pm 5/8/2023').
date_match = re.search(r"through (.+?)\"", page.text)

# Without this check a missing stamp surfaces as an opaque
# "'NoneType' object has no attribute 'group'" error.
if date_match is None:
    raise ValueError("Could not find a 'through <date>' stamp in the L.A. County feed")

date_text = date_match.group(1)

In [175]:
# Inspect the captured stamp before it is trimmed.
date_text

'12:00pm 5/8/2023'

In [176]:
# The stamp looks like "12:00pm 5/8/2023". Splitting on "m " gives
# ["12:00p", "5/8/2023"], so the date is the LAST piece. Taking the first
# piece kept only the time of day and dropped the published date (time-only
# strings are typically resolved against the current day when parsed).
# A stamp with no time part contains no "m " and passes through unchanged.
date_text = date_text.split("m ")[-1]

In [177]:
# Parse the trimmed stamp and keep only its calendar date.
parsed_stamp = pd.to_datetime(date_text)
latest_date = parsed_stamp.date()

In [178]:
# Stamp every row with the date parsed from the feed.
df["county_date"] = latest_date

In [180]:
# Normalise the label variant written with TWO spaces after the dash.
is_under_investigation = df["area"] == "-  Under Investigation"
df.loc[is_under_investigation, "area"] = "Under Investigation"

In [181]:
# Normalise the label variant written with ONE space after the dash.
is_under_investigation = df["area"] == "- Under Investigation"
df.loc[is_under_investigation, "area"] = "Under Investigation"

## Vet

In [182]:
# Row count of the scraped table, shown for a manual sanity check.
len(df)

342

In [183]:
# try:
#     assert not len(df) > 340
# except AssertionError:
#     raise AssertionError("L.A. County's scraper has extra rows")

In [184]:
# try:
#     assert not len(df) < 340
# except AssertionError:
#     raise AssertionError("L.A. County's scraper is missing rows")

## Export

Set the date

In [185]:
# Time zone used to decide what "today" is for the export file name.
tz = pytz.timezone("America/Los_Angeles")

In [186]:
# Current calendar date in the Los Angeles time zone.
today = datetime.now(tz).date()

In [187]:
# Name of the sub-folder of data_dir that holds this county's CSV files.
slug = "los-angeles"

In [188]:
# Write today's snapshot to <data_dir>/<slug>/<today>.csv without the index.
daily_csv_path = data_dir / slug / f"{today}.csv"
df.to_csv(daily_csv_path, index=False)

## Combine

In [189]:
# Collect every CSV in the county folder except the combined timeseries file.
csv_list = []
for found_path in glob.glob(str(data_dir / slug / "*.csv")):
    if found_path.endswith("timeseries.csv"):
        continue
    csv_list.append(found_path)

In [190]:
# Load every snapshot file and tag non-manual ones with the date taken from
# their file name. The loop uses its own variable names so that it neither
# shadows the name `csv` nor overwrites the scraped `df` while iterating.
df_list = []
for csv_path in csv_list:
    # Take the file name with pathlib, which applies the platform's separator
    # rules, instead of splitting the path on a hard-coded "/".
    file_name = pathlib.Path(csv_path).name

    # Test the file name only: matching against the whole path would treat
    # every file as manual if a parent directory contained "manual".
    if "manual" in file_name:
        snapshot = pd.read_csv(csv_path, parse_dates=["date"])
    else:
        file_date = file_name.replace(".csv", "")
        snapshot = pd.read_csv(csv_path, parse_dates=["county_date"])
        # NOTE(review): this is a plain string, while manual files carry a
        # parsed "date" column — confirm the combined column sorts as intended.
        snapshot["date"] = file_date
    df_list.append(snapshot)

In [191]:
# Stack all snapshots, then order the result by date and area.
stacked = pd.concat(df_list)
df = stacked.sort_values(["date", "area"])

In [192]:
# Write the combined history to <data_dir>/<slug>/timeseries.csv without the index.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)