In [16]:
# HIDDEN
# This useful nonsense should just go at the top of your notebook.
from datascience import *
%matplotlib inline
#%matplotlib notebook
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=2, color='r')
from ipywidgets import interact
import ipywidgets as widgets
# datascience version number of last run of this notebook
print(version.__version__)

import requests
import sys
sys.path.append(".")
from timetable import TimeTable

import locale
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) 

import os
import datetime
from bs4 import BeautifulSoup

0.15.0


In [17]:
def peek_tables(url):
    """Help see what tables are in the document.

    Downloads the page at `url`, parses it with BeautifulSoup's
    'html.parser', and returns a list holding the attribute dictionary
    (`.attrs`) of every <table> element found.
    """
    # Fetch the page the caller asked for.  The previous version requested
    # `ct_url`, a name that is not defined in this function, and so ignored
    # the `url` argument entirely.
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    return [x.attrs for x in soup.find_all('table')]

def peek_table_rows(url, find_args):
    """See how the rows are constructed.

    Downloads `url`, locates the first <table> whose class matches
    `find_args`, and returns all of its <tr> elements.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.text, 'html.parser')
    # Chain the lookup: first the matching table, then every row inside it.
    return parsed.find("table", class_=find_args).find_all('tr')

def clean_val(v):
    """Return the text of `v` with trailing whitespace removed.

    `v` is expected to expose a string `.text` attribute (as the parsed
    HTML cells passed in by get_html_table do).  When `v` has no `.text`,
    or `.text` has no `rstrip` (e.g. it is None), None is returned instead
    of raising.
    """
    try:
        return v.text.rstrip()
    except AttributeError:
        # Only the "not a text-bearing element" case is tolerated; a bare
        # `except:` would also swallow KeyboardInterrupt and real bugs.
        return None

def get_html_table(url, find_args, head_row=0, data_row=1, end_skip=None):
    """Download a page and extract one HTML table as labels plus rows.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    find_args : str or list of str
        Value passed to BeautifulSoup's ``find`` as ``class_`` to select
        the table.
    head_row : int
        Index of the <tr> whose <th> cells provide the column labels.
    data_row : int
        Index of the first <tr> treated as a data row.
    end_skip : int or None
        Number of trailing <tr> elements to discard; None or 0 keeps all.

    Returns
    -------
    (headers, rows)
        `headers` is the list of cleaned header texts; `rows` is a list
        with one list per data row, holding that row's <th> texts
        followed by its <td> texts.

    Raises
    ------
    ValueError
        If the page contains no table matching `find_args`.
    """
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    table = soup.find("table", class_=find_args)
    if table is None:
        # Fail with a message naming the page and selector rather than an
        # opaque "'NoneType' object has no attribute 'find_all'".
        raise ValueError(
            "No table with class {!r} found at {}".format(find_args, url))
    rows = table.find_all('tr')
    headers = [clean_val(h) for h in rows[head_row].find_all('th')]
    # A falsy end_skip must not be used as a slice bound: rows[n:-0] is empty.
    data = rows[data_row:-end_skip] if end_skip else rows[data_row:]
    records = []
    for row in data:
        # Row-header cells (<th>) come first, then the ordinary cells (<td>).
        row_heads = [clean_val(cell) for cell in row.find_all('th')]
        row_vals = [clean_val(cell) for cell in row.find_all('td')]
        records.append(row_heads + row_vals)
    return headers, records

def remove_footnote(s):
    """Return `s` truncated just before its first '[' character.

    Strings without a '[' are returned unchanged.
    """
    if '[' not in s:
        return s
    # Keep only the text that precedes the opening bracket.
    kept, _bracket, _rest = s.partition('[')
    return kept

def get_CA_county_covid19():
    """Scrape the per-county COVID-19 table for California from Wikipedia.

    Returns a Table sorted ascending by 'County', with footnote markers
    ("[...]" suffixes) stripped from both the column labels and every
    cell, and the 'Recov.' column relabeled to 'Recoveries'.
    """
    CA_county_URL = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_California'
    # head_row=1, data_row=3, end_skip=2: labels come from the second <tr>,
    # data starts at the fourth <tr>, and the last two <tr> are discarded.
    # NOTE(review): these offsets are tied to the page layout at the time
    # the notebook was written -- confirm if the page changes.
    heads, r = get_html_table(CA_county_URL,
                              "wikitable plainrowheaders sortable", 1, 3, 2)
    # drop(4) removes the column at index 4 -- presumably a non-data
    # column of the source table; verify against the live page.
    raw_table = Table([remove_footnote(h) for h in heads]).with_rows(r).drop(4)
    covid_table = raw_table.copy()
    for lbl in covid_table.labels :
        # Strip footnote markers from every value in this column.
        covid_table[lbl] = covid_table.apply(remove_footnote, lbl)
    covid_table.relabel('Recov.', 'Recoveries')
    return covid_table.sort('County', descending=False)

def get_CA_counties():
    """Scrape Wikipedia's list of California counties.

    Returns a Table with columns 'County', 'Population' and 'Area',
    taken from the first table on the page matching the classes
    'wikitable' and 'sortable'.
    """
    ct_url = "https://en.wikipedia.org/wiki/List_of_counties_in_California"
    h, r = get_html_table(ct_url, ['wikitable', 'sortable'])
    # Strip "[n]" footnote markers from the labels so the lookups below do
    # not depend on Wikipedia's current reference numbering (previously
    # hard-coded as 'Population[9]' and 'Area[6]').
    raw_table = Table([remove_footnote(label) for label in h]).with_rows(r)
    # x[:-7] drops the last seven characters of each name -- presumably the
    # " County" suffix; confirm every entry on the page ends that way.
    county_table = Table().with_columns('County', [x[:-7] for x in raw_table['County']],
                                        'Population', raw_table['Population'],
                                        'Area', raw_table['Area'])
    return county_table

In [18]:
# Scrape the current per-county COVID-19 table and display all of its rows.
# get_CA_county_covid19 already returns the table sorted by 'County', so the
# sort below does not change the order.
cv_tbl = get_CA_county_covid19()
cv_tbl.sort('County').show()

County,Cases,Deaths,Recoveries
Alameda,220,4,–
Amador,2,0,–
Butte,5,0,–
Calaveras,3,0,–
Contra Costa,147,1,–
El Dorado,12,0,–
Fresno,31,0,–
Humboldt,14,0,1
Imperial,21,0,–
Inyo,5,0,–


In [19]:
def load_timeseries(t):
    """Read the saved time series named `t` and return it sorted by County.

    The series is loaded from './data/CA_county_<t>.csv'.
    """
    csv_path = ''.join(['./data/CA_county_', t, '.csv'])
    series = Table.read_table(csv_path)
    return series.sort('County', descending=False)

def load_all_timeseries(current_table):
    """Load one saved time series per non-'County' column of `current_table`.

    Returns the series as a list, in the order of `current_table.labels`.
    """
    loaded = []
    for label in current_table.labels:
        # 'County' is the key column, not a measurement with its own file.
        if label != 'County':
            loaded.append(load_timeseries(label))
    return loaded

def save_timeseries(ts, ts_name):
    """Write `ts` to './data/CA_county_<ts_name>.csv', backing up any old file.

    If the destination file already exists, its contents are first copied
    to './old/CA_county_<ts_name>.csv' (creating './old' when needed), then
    `ts.to_csv` overwrites the destination.

    Raises OSError if the backup copy cannot be made.
    """
    # Local import keeps this block self-contained; the notebook's import
    # cell does not bring in shutil.
    import shutil

    name = './data/CA_county_' + ts_name + '.csv'
    oname = './old/CA_county_' + ts_name + '.csv'
    if os.path.exists(name):
        # Copy in-process instead of shelling out to `cp`: this works with
        # names containing spaces or shell metacharacters, is portable, and
        # reports failure with an exception rather than a strippable assert.
        os.makedirs(os.path.dirname(oname), exist_ok=True)
        shutil.copyfile(name, oname)
    ts.to_csv(name)

def add_to_timeseries(county, ts):
    """Return `ts` extended with a row for `county` whose counts are all 0.

    `county` must not already appear in ts['County']; this is checked with
    an assertion.  The new row holds the county name followed by one zero
    for each remaining column.
    """
    assert county not in ts['County']
    # Use the `county` argument (the old code inserted the literal string
    # 'County') and size the row from the table's column count
    # (`num_columns`; the old code called a non-existent `ts.len()`).
    return ts.with_row([county] + (ts.num_columns - 1) * [0])

def new_counties(current_table, time_series):
    """List the counties in `current_table` that `time_series` lacks.

    Both arguments are indexed by 'County'; the result keeps the order in
    which the names appear in current_table['County'].
    """
    unseen = []
    for name in current_table['County']:
        if name in time_series['County']:
            continue
        unseen.append(name)
    return unseen

In [20]:
# Load the saved time series, one per non-'County' column of cv_tbl.
# The three-way unpacking relies on those columns being, in order,
# Cases, Deaths, Recoveries.
cases, deaths, recovers = load_all_timeseries(cv_tbl)

In [21]:
# Label for today's snapshot column, formatted as zero-padded MM/DD/YY.
today = datetime.date.today().strftime("%m/%d/%y")

In [23]:
# Counties that appear in the freshly scraped table but not in the saved
# cases time series.
additions = new_counties(cv_tbl, cases)
additions

[]

In [24]:
# We are not going to try to handle this automatically.
# Stop the run here if the scraped table contains counties that the saved
# time series do not; the series must then be adjusted by hand.
assert not additions, "Counties have change in html table.  Adjust timeseries"

In [25]:
# Extend the cases series with today's counts: take County + Cases from the
# scraped table, relabel 'Cases' to today's date, and join on 'County'.
new_cases = cases.join('County', cv_tbl.select(['County', 'Cases']).relabel('Cases', today))
new_cases.show()

County,3/24/20,03/25/20,03/26/20,03/27/20
Alameda,135,135,178,220
Amador,1,1,1,2
Butte,4,4,5,5
Calaveras,2,2,2,3
Contra Costa,86,108,131,147
El Dorado,2,2,2,12
Fresno,18,19,19,31
Humboldt,5,10,10,14
Imperial,6,9,9,21
Inyo,0,1,1,5


In [26]:
# Extend the deaths series with today's counts: take County + Deaths from
# the scraped table, relabel 'Deaths' to today's date, and join on 'County'.
new_deaths = deaths.join('County', cv_tbl.select(['County', 'Deaths']).relabel('Deaths', today))
new_deaths.show()

County,3/24/20,03/25/20,03/26/20,03/27/20
Alameda,2,2,4,4
Amador,0,0,0,0
Butte,0,0,0,0
Calaveras,0,0,0,0
Contra Costa,1,1,1,1
El Dorado,0,0,0,0
Fresno,0,0,0,0
Humboldt,0,0,0,0
Imperial,0,0,0,0
Inyo,0,0,0,0


In [27]:
# Extend the recoveries series with today's values: take County + Recoveries
# from the scraped table, relabel 'Recoveries' to today's date, and join on
# 'County'.
new_recovers = recovers.join('County', cv_tbl.select(['County', 'Recoveries']).relabel('Recoveries', today))
new_recovers

County,3/24/20,03/25/20,03/26/20,03/27/20
Alameda,–,–,–,–
Amador,–,–,–,–
Butte,–,–,–,–
Calaveras,–,–,–,–
Contra Costa,–,–,–,–
El Dorado,1,1,1,–
Fresno,–,–,–,–
Humboldt,1,1,1,1
Imperial,–,–,–,–
Inyo,–,0,0,–


In [28]:
# Save the augmented timeseries, saving an old copy.
# Each call writes ./data/CA_county_<name>.csv after first copying any
# existing file of that name to ./old/.
save_timeseries(new_cases, 'Cases')
save_timeseries(new_deaths, 'Deaths')
save_timeseries(new_recovers, 'Recoveries')

In [29]:
def save_initial_table(date):
    """Seed one time-series CSV per measurement column of the global cv_tbl.

    For every column after the first, writes a two-column table
    ('County' plus that column relabeled to `date`) to
    './data/CA_county_<column>.csv'.
    """
    for column in cv_tbl.labels[1:]:
        # relabel returns the table it renamed, so the calls can be chained.
        snapshot = cv_tbl.select(['County', column]).relabel(column, date)
        snapshot.to_csv('./data/CA_county_' + column + '.csv')

## Table of all the counties

In [30]:
# Scrape the reference list of all California counties (name, population,
# area) and show it ordered by county name.
c_tbl = get_CA_counties()
c_tbl.sort('County')

County,Population,Area
Alameda,1663190,"738 sq mi(1,911 km2)"
Alpine,1120,"739 sq mi(1,914 km2)"
Amador,38626,"606 sq mi(1,570 km2)"
Butte,229294,"1,640 sq mi(4,248 km2)"
Calaveras,45670,"1,020 sq mi(2,642 km2)"
Colusa,21805,"1,151 sq mi(2,981 km2)"
Contra Costa,1147439,"720 sq mi(1,865 km2)"
Del Norte,27470,"1,008 sq mi(2,611 km2)"
El Dorado,188987,"1,712 sq mi(4,434 km2)"
Fresno,989255,"5,963 sq mi(15,444 km2)"


In [15]:
# Compare row counts: the full county list versus the counties present in
# the scraped COVID-19 table.
c_tbl.num_rows, cv_tbl.num_rows

(58, 44)