In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import pathlib
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
import glob

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
r = requests.get("http://sierracounty.ca.gov/595/Sierra-County-Case-Info")

In [4]:
soup = BeautifulSoup(r.text)

In [5]:
content = soup.find("div", id="bodyWrapper")

In [6]:
table = soup.find("table")

In [7]:
tbody = table.tbody

In [8]:
tbody

<tbody>
<tr>
<td>
<br/>
<table border="0" cellpadding="0" cellspacing="5" summary="Item no longer available">
<tbody>
<tr>
<td align="left" colspan="2">
<div class="fr-view"><font style="font-size: 18pt;">We're sorry, but there is not a web page matching your entry.</font></div>
<br/>
<br/>
</td>
</tr>
<tr>
<td colspan="2" style="word-break: break-all">
<div style="text-align: left;">
											You entered: <span style="font-weight: bold;">http://sierracounty.ca.gov/595/Sierra-County-Case-Info</span><br/>
<br/>
</div>
</td>
</tr>
<tr>
<td colspan="2">
<a href="/" id="redirectToHome" style="text-align: center; font-size: 12pt; color: #336699;">Return to the home page</a>
<br/>
</td>
</tr>
</tbody>
</table>
<br/>
<br/>
</td>
</tr>
</tbody>

In [9]:
row_list = tbody.find_all("tr")[1:3]

In [10]:
row_list

[<tr>
 <td align="left" colspan="2">
 <div class="fr-view"><font style="font-size: 18pt;">We're sorry, but there is not a web page matching your entry.</font></div>
 <br/>
 <br/>
 </td>
 </tr>,
 <tr>
 <td colspan="2" style="word-break: break-all">
 <div style="text-align: left;">
 											You entered: <span style="font-weight: bold;">http://sierracounty.ca.gov/595/Sierra-County-Case-Info</span><br/>
 <br/>
 </div>
 </td>
 </tr>]

In [11]:
dict_list = []

In [12]:
def safetxt(element):
    # v = element.text.strip()
    # v = v.replace("\u200b", "")
    v = element.text.split("Side")[0]
    v = v.replace("Located on the ", "")
    return v

In [13]:
def safenumber(element):
    v = safetxt(element)
    v = v.replace(",", "")
    v = v.replace(" ", "")
    return v

In [14]:
for row in row_list:
    cell_list = row.find_all("td")
    print(cell_list)
    d = dict(
        county="Sierra",
        area=safetxt(cell_list[0]),
        confirmed_cases=safenumber(cell_list[1]),
    )
    dict_list.append(d)

[<td align="left" colspan="2">
<div class="fr-view"><font style="font-size: 18pt;">We're sorry, but there is not a web page matching your entry.</font></div>
<br/>
<br/>
</td>]


IndexError: list index out of range

In [162]:
df = pd.DataFrame(dict_list)

In [163]:
df

Unnamed: 0,county,area,confirmed_cases
0,Sierra,East,471
1,Sierra,West,133


In [164]:
rows_to_remove = ["Recovered Cases", "Currently Hospitalized", "Deaths"]

In [165]:
df_filtered = df[~df.area.isin(rows_to_remove)].copy()

In [166]:
df_filtered.loc[
    df_filtered.area == "Located on the East Side of the County", "area"
] = "East"
df_filtered.loc[
    df_filtered.area == "Located on the West Side of the County", "area"
] = "West"

Get date

In [167]:
raw_date_string = tbody.find("td")["data-th"]

In [168]:
latest_date = pd.to_datetime(raw_date_string.replace("Total Cases as of ", "")).date()

Set the date

In [169]:
df_filtered["county_date"] = latest_date

Mark the current date

In [170]:
tz = pytz.timezone("America/Los_Angeles")

In [171]:
today = datetime.now(tz).date()

In [172]:
slug = "sierra"

## Vet

In [173]:
try:
    assert not len(df_filtered) > 2
except AssertionError:
    raise AssertionError("Sierra's area scraper has extra rows")

In [174]:
try:
    assert not len(df_filtered) < 2
except AssertionError:
    raise AssertionError("Sierra's area scraper is missing rows")

## Export

In [175]:
df_filtered

Unnamed: 0,county,area,confirmed_cases,county_date
0,Sierra,East,471,2022-04-13
1,Sierra,West,133,2022-04-13


In [176]:
df_filtered.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [177]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [178]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [179]:
df = pd.concat(df_list).sort_values(["date", "area"])

In [34]:
df.to_csv(os.path.join(data_dir / slug / "timeseries.csv"), index=False)