## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [228]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [229]:
import os
import pathlib

In [230]:
# Directory the notebook is executed from (the current working directory).
this_dir = pathlib.Path.cwd()

In [231]:
# All scraper output lives under ./data next to the notebook.
data_dir = this_dir.joinpath("data")

In [232]:
import re
import pytz
import glob
import requests
import unicodedata
import json
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

### Download / retrieve the page

Lake County's [dashboard](http://health.co.lake.ca.us/Coronavirus/COVID-19_Data.htm) uses Datawrapper to display cases by ZIP Code. To make sure we get the latest, we'll hit the chart page on Datawrapper's website, then get the iframe URL.

In [233]:
# Datawrapper chart page for the Lake County table; fetched below to discover the current dataset URL.
url = "https://www.datawrapper.de/_/D1RaA/"

In [234]:
# Fetch the chart page. The timeout keeps a stalled connection from hanging
# the scraper forever, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the chart.
page = requests.get(url, timeout=30)
page.raise_for_status()

### Parse with BeautifulSoup

In [235]:
# Parse the downloaded HTML so its <meta> tags can be queried.
soup = BeautifulSoup(page.content, "html.parser")

In [236]:
# Look up the <meta property="og:url"> tag on the chart page.
meta = soup.find("meta", attrs={"property": "og:url"})

In [237]:
# soup.find() returns None when the tag is missing; fail with a readable
# message instead of "'NoneType' object is not subscriptable".
if meta is None:
    raise ValueError("Lake County scraper: og:url meta tag not found on chart page")
dw_url = meta["content"]

### Parse Datawrapper

In [238]:
# Normalize the trailing slash so the dataset path is correct whether or not
# the og:url value ends with "/".
dataset_url = dw_url.rstrip("/") + "/dataset.csv"

In [239]:
# Download the chart's CSV. Bound the wait with a timeout and stop on an
# HTTP error so an error body is never parsed as data.
dw_r = requests.get(dataset_url, timeout=30)
dw_r.raise_for_status()

In [240]:
# Response body decoded to text: the raw CSV payload.
string = dw_r.text

In [241]:
# splitlines() copes with "\r\n" and "\n" endings alike and does not emit an
# empty trailing element when the payload ends with a newline, so no phantom
# row reaches the row-count checks later on.
parsed = string.splitlines()

In [242]:
# One list of cell strings per CSV line (plain comma split, no quoting support).
data_list = [line.split(",") for line in parsed]

In [243]:
# Display the parsed rows; the first row is the header.
data_list

[['Supervisorial District', 'District Total', 'Zip Code', 'Zip Code Total'],
 ['District 1', '771', '95457', '261'],
 ['', '', '95467', '344'],
 ['', '', '95461', '166'],
 ['District 2', '1647', '95422', '1627'],
 ['', '', '95424', '20'],
 ['District 3', '1100', '95423', '256'],
 ['', '', '95458', '271'],
 ['', '', '95464', '261'],
 ['', '', '95485', '291'],
 ['', '', '95493', '11'],
 ['', '', '95443', '10'],
 ['District 4', '1434', '95453', '1388'],
 ['', '', '95435', '46'],
 ['District 5', '1150', '95426', '62'],
 ['', '', '95451', '1088'],
 ['Unknown or Other', '99', 'Unknown/other', '99']]

In [244]:
# First parsed row supplies the column names; everything after it is data.
body_rows = data_list[1:]
header_row = data_list[0]
df = pd.DataFrame(body_rows, columns=header_row)

### Get timestamp

In [245]:
# Look up the <meta property="og:title"> tag, which carries the chart title.
title_tag = soup.find("meta", attrs={"property": "og:title"})

In [246]:
# soup.find() returns None when the tag is missing; fail with a readable
# message instead of "'NoneType' object is not subscriptable".
if title_tag is None:
    raise ValueError("Lake County scraper: og:title meta tag not found on chart page")
title_str = title_tag["content"]

In [247]:
# Show the chart title; the date it mentions is extracted in the next cells.
title_str

'Lake County COVID-19 cases, by district and zip code, through 9/30/21 | Created with Datawrapper'

In [248]:
# Find the first slash-separated numeric date (e.g. "9/30/21") in the title.
date_regex = re.search(r"(\d+/\d+/\d+)", title_str)

In [249]:
# re.search() returns None when the title carries no date (e.g. the county
# rewords it); surface that clearly rather than as an AttributeError.
if date_regex is None:
    raise ValueError(
        f"Lake County scraper: no date found in chart title {title_str!r}"
    )
date_str = date_regex.group(1)

In [250]:
# Let pandas parse the date string, then keep only the calendar date.
latest_date = pd.to_datetime(date_str).date()

In [251]:
# Stamp every row with the date taken from the chart title.
df["county_date"] = latest_date

### Rename and drop columns

In [252]:
# Source header -> export column name.
cols = {
    "Zip Code": "area",
    "Zip Code Total": "confirmed_cases",
}

In [253]:
# Rename to the export schema, then keep only the columns that get written out.
export_columns = ["area", "confirmed_cases", "county_date"]
renamed_df = df.rename(columns=cols)
export_df = renamed_df[export_columns]

In [254]:
# Row count of the export, for comparison with the expected total below.
len(export_df)

16

### Vet

Lake County's table should have exactly 16 rows: 15 ZIP codes plus the "Unknown/other" row.

In [255]:
# Expected number of rows in the export; the checks that follow compare against it.
lake_len = 16

In [256]:
# Raise directly instead of assert-then-re-raise: same exception type and
# message, but the check also runs when Python is started with -O (which
# strips assert statements) and the traceback is not doubled.
if len(export_df) > lake_len:
    raise AssertionError(
        "Lake County scraper: spreadsheet has additional zip code(s) than previous"
    )

In [257]:
# Raise directly instead of assert-then-re-raise: same exception type and
# message, but the check also runs when Python is started with -O (which
# strips assert statements) and the traceback is not doubled.
if len(export_df) < lake_len:
    raise AssertionError("Lake County scraper: spreadsheet is missing row(s)")

### Export csv file

Write a CSV file named with today's date.

In [258]:
# Name of the subdirectory under data_dir that holds this county's CSV files.
slug = "lake"

In [259]:
# `today` was never defined in this notebook, so this cell raised NameError on
# a fresh kernel. Use the current date in California local time so the file
# name does not roll over early or late relative to the county's day.
tz = pytz.timezone("America/Los_Angeles")
today = datetime.now(tz).date()
export_df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [260]:
# Every daily CSV for this county, excluding the combined timeseries file itself.
csv_list = [
    csv_file
    for csv_file in glob.glob(str(data_dir / slug / "*.csv"))
    if not csv_file.endswith("timeseries.csv")
]

In [261]:
# Load each daily file into a frame and tag it with its collection date.
# - pathlib is used for the file name so this works with any path separator
#   (splitting on "/" returned the whole path on Windows).
# - "manual" is matched against the file name only; matching the full path
#   would misclassify every file if a parent directory contained "manual".
# - The loop variable is no longer called `csv`, which shadowed the stdlib
#   module name.
df_list = []
for csv_path in map(pathlib.Path, csv_list):
    if "manual" in csv_path.name:
        # Manually compiled files already carry their own "date" column.
        frame = pd.read_csv(csv_path, parse_dates=["date"])
    else:
        # Scraped files are named "<date>.csv"; the stem is the scrape date.
        frame = pd.read_csv(csv_path, parse_dates=["county_date"])
        frame["date"] = csv_path.stem
    df_list.append(frame)

In [262]:
# Stack all daily frames, then order rows by collection date and area.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"])

In [263]:
# Write the combined history next to the daily files, without the index column.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)