## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

Set up paths to this notebook's directory and its `data` subdirectory so files can be read and written relative to the notebook.

In [5]:
import os
import sys
import pathlib

In [6]:
# Absolute path of the current working directory.
this_dir = pathlib.Path.cwd()

In [8]:
# Folder holding the CSV files this notebook reads and writes.
data_dir = this_dir.joinpath("data")

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from datetime import datetime
import calendar, time

## Scrape

Open the population tracking page in Chrome

In [11]:
# Fetch (or reuse from the local cache) a chromedriver matching the installed
# Chrome, then start a browser session with it.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path)
# Load the population tracking page.
driver.get("https://www.cdcr.ca.gov/covid19/population-status-tracking/")



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
There is no [linux64] chromedriver for browser 92.0.4515 in cache
Get LATEST driver version for 92.0.4515
Trying to download new driver from https://chromedriver.storage.googleapis.com/92.0.4515.107/chromedriver_linux64.zip
Driver has been saved in cache [/home/palewire/.wdm/drivers/chromedriver/linux64/92.0.4515.107]


Switch to iframe

In [12]:
# Shared explicit wait: each condition gets up to 300 seconds before timing out.
wait = WebDriverWait(driver, 300)
# Block until an <iframe> is available, then move the driver's context into it.
iframe_locator = (By.XPATH, "//iframe")
wait.until(EC.frame_to_be_available_and_switch_to_it(iframe_locator))

True

Move to "Confirmed Table View"

In [13]:
# Hold until the "Confirmed Table View" label is present in the DOM.
table_view_label = (By.XPATH, '//span[text()="Confirmed Table View"]')
wait.until(EC.presence_of_element_located(table_view_label))

<selenium.webdriver.remote.webelement.WebElement (session="b1d6e6a42dfa09d1af30af135f641283", element="f6261e3f-c4f1-4fa5-aad6-81cc0854bdb9")>

In [14]:
# Locate the "Confirmed Table View" label, then step up to its parent node,
# which is the element that gets clicked.
# find_element(By.XPATH, ...) is used because the find_element_by_* helpers are
# deprecated in Selenium 4 and removed in later 4.x releases; this form works
# in both Selenium 3 and 4.
buttons_css = driver.find_element(By.XPATH, '//span[text()="Confirmed Table View"]')
button_xpath = buttons_css.find_element(By.XPATH, "..")

In [15]:
# Fixed 10-second pause before clicking; presumably lets the embedded view
# finish rendering so the click registers — TODO confirm.
time.sleep(10)
button_xpath.click()

Little wait for it to switch over and then find element with table

In [16]:
# Fixed 15-second pause so the table view can finish switching over.
time.sleep(15)
# Every element carrying the "innerContainer" class, returned as a list.
# find_elements(By.CLASS_NAME, ...) replaces the deprecated
# find_elements_by_class_name helper and works in Selenium 3 and 4.
table = driver.find_elements(By.CLASS_NAME, "innerContainer")

Save text from table

In [17]:
# Visible text of the first matched container (IndexError if nothing matched).
table_texts = table[0].text

Find update date and save text

In [18]:
# First element with the "value" class; its text is read as the update date.
# find_element(By.CLASS_NAME, ...) replaces the deprecated
# find_element_by_class_name helper and works in Selenium 3 and 4.
update_box = driver.find_element(By.CLASS_NAME, "value")

In [19]:
# Raw text of the update-date element; parsed into a date string further down.
update_string = update_box.text

Scrape employee status page

In [20]:
# Employee status page; loaded in the browser and later passed to pd.read_html.
e_url = "https://www.cdcr.ca.gov/covid19/cdcr-cchcs-covid-19-status/"

In [21]:
# Reuse the same browser session to load the employee page.
driver.get(e_url)

Get page update date text

In [22]:
# Third <p> inside the main content block. The XPath is position-based, so it
# will stop matching if the page layout changes.
# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath
# helper and works in Selenium 3 and 4.
e_update_date = driver.find_element(
    By.XPATH, '//*[@id="main"]/div/div/div/div[2]/div/p[3]'
)

In [23]:
# Raw text of the update-date paragraph; parsed into a date string further down.
e_update_date_string = e_update_date.text

Get list of deaths

In [24]:
# Ordered list (<ol>) inside the main content block, located by position.
# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath
# helper and works in Selenium 3 and 4.
e_death_list = driver.find_element(By.XPATH, '//*[@id="main"]/div/div/div/div[2]/div/ol')

In [25]:
# Visible text of the whole list; split into lines further down.
e_death_list_text = e_death_list.text

In [26]:
# End the browser session; no later cell uses the driver.
driver.quit()

Get table

In [27]:
# Fetch the employee page directly (no browser) and parse its HTML tables into
# a list of DataFrames, using the first row of each table as the header.
employee_list = pd.read_html(e_url, header=0)

In [28]:
# Keep the first table found on the page.
employee_df = employee_list[0]

## Make dataframe

### 1. Institution Overview

In [29]:
# The first 8 lines of the scraped text are used as the column headers.
headers = table_texts.splitlines()[:8]

After headers, the table content is listed by what is in view - first 20 rows, then next 15 rows

Get first 20 rows

In [30]:
# Lines 8-167: 8 columns x 20 rows = 160 values; every 20 values form one column.
# The slice bounds are hard-coded to this layout.
first_half = table_texts.splitlines()[8:168]

Split first half into columns

In [31]:
# Cut the flat list into consecutive runs of 20 values, one run per column.
first_half_lines = [
    first_half[start : start + 20] for start in range(0, len(first_half), 20)
]

Get second 15 rows

In [32]:
# Lines 168-287: 8 columns x 15 rows = 120 values; every 15 values form one column.
# The slice bounds are hard-coded to this layout.
second_half = table_texts.splitlines()[168:288]

Split into columns

In [33]:
# Cut the flat list into consecutive runs of 15 values, one run per column.
second_half_lines = [
    second_half[start : start + 15] for start in range(0, len(second_half), 15)
]

Combine - each list is a column

In [34]:
# For each of the 8 columns, append its lower 15 values to its upper 20 values.
combined = [
    first_half_lines[column] + second_half_lines[column] for column in range(8)
]

Create dict for dataframe

In [35]:
# Pair each header with its column of values, in order.
data = {header: values for header, values in zip(headers, combined)}

In [36]:
# One DataFrame column per header; all values are still strings at this point.
df = pd.DataFrame(data)

In [37]:
def extractUpdate(text):
    """Build a "month-day-year" string from the dashboard's update label.

    The label is split on single spaces. The fourth piece must be an
    abbreviated month name (e.g. "Jul"); the fifth and sixth pieces are used
    verbatim as the day and the year.

    Raises KeyError when the fourth piece is not a known month abbreviation
    and IndexError when the label has fewer than six pieces.
    """
    pieces = text.split(" ")
    # calendar.month_abbr[0] is an empty string, hence the `if index` filter.
    month_numbers = {
        abbr: index for index, abbr in enumerate(calendar.month_abbr) if index
    }
    month_number = month_numbers[pieces[3]]
    return f"{month_number}-{pieces[4]}-{pieces[5]}"

In [38]:
# "month-day-year" string taken from the dashboard's update label.
file_date = extractUpdate(update_string)

### Append to institution-timeseries

In [40]:
# Load the existing timeseries, parsing update_date as datetimes so it can be
# compared with the freshly scraped date.
institution_timeseries = pd.read_csv(
    data_dir / "institution-timeseries.csv", parse_dates=["update_date"]
)

In [41]:
# Work on a copy so the scraped frame itself is left untouched.
df_timeseries = df.copy()

In [42]:
# Stamp every row with the same update date string.
df_timeseries["update_date"] = file_date

In [43]:
# Convert the date strings to datetimes, matching the parsed CSV column.
df_timeseries["update_date"] = pd.to_datetime(df_timeseries["update_date"])

In [46]:
# Append the freshly scraped rows only when their update date is not already
# in the timeseries. Series.isin states the membership test explicitly; the
# earlier `array in array` form leaned on NumPy broadcasting and is only well
# defined when the left-hand array holds exactly one value.
already_recorded = (
    df_timeseries["update_date"].isin(institution_timeseries["update_date"]).any()
)
if not already_recorded:
    institution_timeseries = pd.concat([institution_timeseries, df_timeseries])

In [47]:
# Order rows chronologically; reassigning keeps the cell free of in-place mutation.
institution_timeseries = institution_timeseries.sort_values("update_date")

Remove commas

In [48]:
# regex=True makes this a substring replacement: every comma inside any string
# cell of the frame is removed (e.g. thousands separators).
institution_timeseries = institution_timeseries.replace(",", "", regex=True)

In [49]:
# Re-apply the datetime conversion to update_date.
# NOTE(review): both inputs were already datetimes before the concat, so this
# looks like a safeguard rather than a required step — confirm.
institution_timeseries["update_date"] = pd.to_datetime(
    institution_timeseries["update_date"]
)

In [50]:
# Convert the comma-stripped "Confirmed" strings to numbers.
institution_timeseries["Confirmed"] = pd.to_numeric(
    institution_timeseries["Confirmed"]
)

### 3. Employees

In [51]:
def extractEmployeeUpdate(text):
    """Build a "month-day-year" string from the employee page's update line.

    The line is split on single spaces. The fourth piece is the month, the
    fifth the day (commas are stripped) and the sixth the year, used verbatim.

    The month may be a full name ("July"), a three-letter abbreviation with or
    without a trailing period ("Jul", "Jul."), or a longer abbreviation such as
    "Sept.", which is accepted when it is an unambiguous prefix of a month name.

    Raises KeyError if the month cannot be recognised and IndexError if the
    line has fewer than six pieces.
    """
    pieces = text.split(" ")
    # Index 0 of month_name / month_abbr is an empty string, hence `if num`.
    month_to_num = {name: num for num, name in enumerate(calendar.month_name) if num}
    abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}

    raw_month = pieces[3]
    bare_month = raw_month.replace(".", "")
    if raw_month in month_to_num:
        month = month_to_num[raw_month]
    elif bare_month in abbr_to_num:
        month = abbr_to_num[bare_month]
    else:
        # Longer abbreviations ("Sept.") are not in calendar.month_abbr. Accept
        # them only when they are a prefix (3+ letters) of exactly one month
        # name, so unrelated words still fail loudly.
        candidates = [
            num
            for name, num in month_to_num.items()
            if len(bare_month) >= 3 and name.startswith(bare_month)
        ]
        if len(candidates) != 1:
            raise KeyError(bare_month)
        month = candidates[0]

    day = pieces[4].replace(",", "")
    return f"{month}-{day}-{pieces[5]}"

In [52]:
# "month-day-year" string taken from the employee page's update line.
e_file_date = extractEmployeeUpdate(e_update_date_string)

### Append to employee timeseries list

In [53]:
# Load the existing employee timeseries; update_date is not parsed as a date here.
employees_timeseries = pd.read_csv(data_dir / "employees-timeseries.csv")

In [54]:
# Work on a copy so the scraped table itself is left untouched.
e_timeseries = employee_df.copy()

In [55]:
# Stamp every row with the same update date string (kept as text).
e_timeseries["update_date"] = e_file_date

In [56]:
# Append the freshly scraped rows only when their update date is not already
# in the timeseries. Series.isin states the membership test explicitly; the
# earlier `array in array` form leaned on NumPy broadcasting and is only well
# defined when the left-hand array holds exactly one value.
already_recorded = (
    e_timeseries["update_date"].isin(employees_timeseries["update_date"]).any()
)
if not already_recorded:
    employees_timeseries = pd.concat([employees_timeseries, e_timeseries])

In [57]:
# One list entry per line of the scraped text.
e_death_final = e_death_list_text.splitlines()

In [58]:
# Single-column frame (column label 0), one row per line.
e_deaths_df = pd.DataFrame(e_death_final)

## Export

In [59]:
# index=False: the row index is meaningless after concat + sort, and this file
# is read back without index_col, so writing the index would add another
# "Unnamed: N" column on every run.
institution_timeseries.to_csv(data_dir / "institution-timeseries.csv", index=False)

Employee list

In [60]:
# index=False: this file is read back without index_col, so writing the row
# index would add another "Unnamed: N" column on every run.
employees_timeseries.to_csv(data_dir / "employees-timeseries.csv", index=False)

Employee death list 

In [61]:
# Overwrites the file with the current list; the default integer index is
# written as the first column.
e_deaths_df.to_csv(data_dir / "employees-deaths.csv")