## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pathlib

In [3]:
this_dir = pathlib.Path(os.path.abspath(""))

In [4]:
data_dir = this_dir / "data"

In [5]:
import re
import pytz
import glob
import requests
import unicodedata
import pandas as pd
from slugify import slugify
from bs4 import BeautifulSoup
from datetime import datetime, date

## Download

Retrieve the page

In [6]:
url = "http://publichealth.lacounty.gov/media/Coronavirus/locations.htm"

In [7]:
page = requests.get(url)

## Parse

In [8]:
soup = BeautifulSoup(page.content, "html.parser")

Get content well

In [9]:
content = soup.find("div", {"id": "content"})

Get table

In [10]:
for tag in content.find_all(text=re.compile("CITY/COMMUNITY")):
    table = tag.findParent("table")

AttributeError: 'NoneType' object has no attribute 'find_all'

In [48]:
tbody = soup.tbody

In [49]:
row_list = tbody.find_all("tr")

In [50]:
dict_list = []

In [51]:
def safetxt(element):
    v = element.text.strip()
    v = v.replace("\u200b", "")
    return v

In [52]:
def safenumber(element):
    v = safetxt(element)
    v = v.replace(",", "")
    v = v.replace(" ", "")
    return v

In [53]:
for row in row_list:
    cell_content = row.find_all("td")
    d = dict(
        county="Los Angeles",
        area=safetxt(cell_content[0]),
        confirmed_cases=safenumber(cell_content[1]),
        confirmed_deaths=safenumber(cell_content[3]),
    )
    dict_list.append(d)

In [54]:
df = pd.DataFrame(dict_list)

Get timestamp

In [55]:
date_url = "http://publichealth.lacounty.gov/media/Coronavirus/js/casecounter.js"

In [56]:
response = requests.get(date_url)
date_page = response.text

In [57]:
date_page

'$( document ).ready(function() {\r\n    var data = {\r\n\t\t\t\t    "content": { \r\n\t\t\r\n\t\t\t"count": "3,622,250",\r\n\r\n\r\n\t\t\t"death": "34,599",\r\n\r\n\t\t\t"dailycount": "668",\r\n\r\n\r\n\t\t\t"dailydeath": "6",\r\n\t\t\t"hospitalizationsever": "165,483",\r\n\r\n\r\n\t\t\t"date": "12/27",\r\n\t\t\t"info": "through 12:00pm 12/26/2022",\r\n\r\n\t\t\t"testingaverage": "25,409",\r\n\r\n\t\t\t"peopletested": "12,814,416",\r\n\r\n\t\t\t"positivity": "10.77%",\r\n\r\n\t\t\t"hospitalizations": "1,220",\r\n\t\t\t"datehospitalizations": "12/23"\r\n\r\n\t\t\t\t}       \r\n\t\t\t\t};\t\r\n\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\tvar content = data.content;\r\n\t\t\t\t\r\n\t\t\t\t$(\'#ctn\').html(content.count);\r\n\t\t\t\t$(\'#det\').html(content.death);\r\n\t\t\t\t$(\'#dctn\').html(content.dailycount);\r\n\t\t\t\t$(\'#ddet\').html(content.dailydeath);\r\n\t\t\t\t$(\'#hosp_1\').html(content.hospitalizationsever);\r\n\t\t\t\t$(\'#dte\').html(content.date);\r\n\t\t\t\t$(\'#dte_1\').html(conte

In [58]:
# date_text = re.search(r"([0-9][0-9]/[0-9][0-9])", date_page).group(1)
# date_text = date_text + "/" + str(date.today().year)

In [59]:
date_text = re.search(r"through (.+?)\"", date_page).group(1)

In [60]:
date_text = date_text.split("m ")[0]

In [61]:
latest_date = pd.to_datetime(date_text).date()

In [62]:
df["county_date"] = latest_date

In [63]:
df.loc[df.area == "-  Under Investigation", "area"] = "Under Investigation"

In [64]:
df.loc[df.area == "- Under Investigation", "area"] = "Under Investigation"

## Vet

In [65]:
try:
    assert not len(df) > 342
except AssertionError:
    raise AssertionError("L.A. County's scraper has extra rows")

In [66]:
try:
    assert not len(df) < 342
except AssertionError:
    raise AssertionError("L.A. County's scraper is missing rows")

## Export

Set the date

In [67]:
tz = pytz.timezone("America/Los_Angeles")

In [68]:
today = datetime.now(tz).date()

In [69]:
slug = "los-angeles"

In [70]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [71]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [72]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [73]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [74]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)