## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pathlib

In [3]:
this_dir = pathlib.Path(os.path.abspath(""))

In [4]:
data_dir = this_dir / "data"

In [5]:
import re
import pytz
import glob
import requests
import unicodedata
import json
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

### Download / retrieve the page

Lake County's [dashboard](http://health.co.lake.ca.us/Coronavirus/COVID-19_Data.htm) uses Datawrapper to display cases by ZIP Code. To make sure we get the latest, we'll hit the chart page on Datawrapper's website, then get the iframe URL.

In [6]:
url = "https://www.datawrapper.de/_/D1RaA/"

In [7]:
page = requests.get(url)

### Parse with BeautifulSoup

In [8]:
soup = BeautifulSoup(page.content, "html.parser")

In [9]:
meta = soup.find("meta", {"property": "og:url"})

In [10]:
dw_url = meta["content"]

### Parse Datawrapper

In [11]:
dataset_url = dw_url + "dataset.csv"

In [12]:
dw_r = requests.get(dataset_url)

In [13]:
string = dw_r.text

In [14]:
parsed = string.split("\r\n")

In [15]:
data_list = []
for row in parsed:
    cell = row.split(",")
    data_list.append(cell)

In [16]:
data_list

[['Supervisorial District', 'District Total', 'Zip Code', 'Zip Code Total'],
 ['District 1', '863', '95457', '290'],
 ['', '', '95467', '387'],
 ['', '', '95461', '186'],
 ['District 2', '1844', '95422', '1819'],
 ['', '', '95424', '25'],
 ['District 3', '1183', '95423', '274'],
 ['', '', '95458', '290'],
 ['', '', '95464', '278'],
 ['', '', '95485', '315'],
 ['', '', '95493', '12'],
 ['', '', '95443', '14'],
 ['District 4', '1500', '95453', '1452'],
 ['', '', '95435', '48'],
 ['District 5', '1240', '95426', '72'],
 ['', '', '95451', '1168'],
 ['Unknown or Other', '97', 'Unknown/other', '97']]

In [17]:
df = pd.DataFrame(data_list[1:], columns=data_list[0])

### Get timestamp

In [18]:
title_tag = soup.find("meta", {"property": "og:title"})

In [19]:
title_str = title_tag["content"]

In [20]:
title_str

'Lake County COVID-19 cases, by district and zip code | Created with Datawrapper'

In [21]:
date_regex = re.search(r"(\d+/\d+/\d+)", title_str)

In [22]:
date_str = date_regex.group(1)

AttributeError: 'NoneType' object has no attribute 'group'

In [64]:
latest_date = pd.to_datetime(date_str).date()

In [65]:
df["county_date"] = latest_date

In [66]:
df["county"] = "Lake"

### Rename and drop columns

In [67]:
cols = {"Zip Code": "area", "Zip Code Total": "confirmed_cases"}

In [68]:
export_df = df.rename(columns=cols)[
    ["county", "area", "confirmed_cases", "county_date"]
]

In [69]:
len(export_df)

16

### Vet

Lake County has 16 ZIP code areas, including "Unknown"

In [70]:
lake_len = 16

In [71]:
try:
    assert not len(export_df) > lake_len
except AssertionError:
    raise AssertionError(
        "Lake County scraper: spreadsheet has additional zip code(s) than previous"
    )

In [72]:
try:
    assert not len(export_df) < lake_len
except AssertionError:
    raise AssertionError("Lake County scraper: spreadsheet is missing row(s)")

### Export csv file

Write csv file timestamped with today's date

In [73]:
slug = "lake"

In [74]:
tz = pytz.timezone("America/Los_Angeles")

In [75]:
today = datetime.now(tz).date()

In [76]:
export_df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [77]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [78]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [79]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [80]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)