## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pathlib

In [3]:
this_dir = pathlib.Path(os.path.abspath(""))

In [4]:
data_dir = this_dir / "data"

In [5]:
import pytz
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import unicodedata
from datetime import datetime

## Download

Retrieve the page

In [6]:
url = "https://www.mendocinocounty.org/community/novel-coronavirus/covid-19-case-data/covid-dashboard-draft/cases-and-deaths-demographics"

In [7]:
page = requests.get(url)

##  Parse

In [8]:
soup = BeautifulSoup(page.content, "html.parser")

Get content well

In [9]:
content = soup.find("div", {"id": "widget_4_7164_4508"})

In [10]:
content = soup.find_all("iframe")

In [11]:
source = content[1]["src"]

IndexError: list index out of range

In [13]:
newPage = requests.get(source)

In [14]:
newSoup = BeautifulSoup(newPage.content, "html.parser")

In [15]:
newSoup


<html><head>
<script>window.location.href='https://datawrapper.dwcdn.net/zj83H/9/'+window.location.search;</script>
<meta content="0; url=https://datawrapper.dwcdn.net/zj83H/9/" http-equiv="REFRESH"/>
</head></html>

In [16]:
src = newSoup.find("meta")["content"].split("=")[1]

In [17]:
page2 = requests.get(src)

In [18]:
soup2 = BeautifulSoup(page2.text, "html.parser")

In [19]:
src

'https://datawrapper.dwcdn.net/zj83H/9/'

Get timestamp

In [20]:
date_text = soup2.find("div", {"class": "dw-above-footer"}).text

In [21]:
date_text

'\nData as of 3/30/22 /Datos al 30/03/22. \nUkiah area / Ã\x81rea de Ukiah: zip codes 95418, 95449, 95469, 95470, 95481, 95482.\nNorth County / Condado Norte: zip codes 95417, 95428, 95429, 95454, 95490, 95585, 95587. \nNorth Coast / Costa Norte: zip codes 95420, 95437, 95488. South County / Condado Sur: zip codes 95414, 95463, 95466, 95494. South Coast / Costa Sur: 95410, 95427, 95432, 95445, 95456, 95459, 95460, 95468. \n'

In [22]:
assert "Data as of" in date_text

In [23]:
date = date_text.split("/Datos")[0].replace("\nData as of ", "")

In [24]:
date

'3/30/22 '

In [25]:
latest_date = pd.to_datetime(date).date()

Get table

In [26]:
table = requests.get(f"{src}dataset.csv").text

In [27]:
table

'Place of residence,# of cases / # de casos,% of total cases / % del total de casos,% of county population / % de la población del condado\r\nUkiah area / Área de Ukiah,8178,63%,43%\r\nNorth County / Condado Norte,2908,22%,22%\r\nNorth Coast / Costa Norte,1171,9%,18%\r\nSouth County / Condado Sur,287,2%,8%\r\nSouth Coast / Costa Sur,387,3%,9%'

In [28]:
table.split("\r\n")[1:]

['Ukiah area / Área de Ukiah,8178,63%,43%',
 'North County / Condado Norte,2908,22%,22%',
 'North Coast / Costa Norte,1171,9%,18%',
 'South County / Condado Sur,287,2%,8%',
 'South Coast / Costa Sur,387,3%,9%']

In [29]:
dict_list = []

In [30]:
for t in table.split("\r\n")[1:]:
    d = dict(
        county="Mendocino",
        area=t.split(" /")[0].title(),
        confirmed_cases=t.split(",")[1],
    )
    dict_list.append(d)

In [31]:
df = pd.DataFrame(dict_list)

In [32]:
df

Unnamed: 0,county,area,confirmed_cases
0,Mendocino,Ukiah Area,8178
1,Mendocino,North County,2908
2,Mendocino,North Coast,1171
3,Mendocino,South County,287
4,Mendocino,South Coast,387


In [33]:
df["county_date"] = latest_date

## Vet

In [34]:
try:
    assert not len(df) > 5
except AssertionError:
    raise AssertionError("Mendocino's area scraper has extra rows")

In [35]:
try:
    assert not len(df) < 5
except AssertionError:
    raise AssertionError("Mendocino's area scraper is missing rows")

## Export

Set date

In [36]:
tz = pytz.timezone("America/Los_Angeles")

In [37]:
today = datetime.now(tz).date()

In [38]:
slug = "mendocino"

In [39]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [40]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [41]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [42]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [43]:
df.to_csv(data_dir / slug / "timeseries.csv", index=False)