## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

Set up paths: locate the directory the notebook runs in and its `data` subdirectory, where the timeseries CSV files are read and written.

In [2]:
import os
import pathlib

In [3]:
# Absolute path of the directory the notebook kernel is currently running in.
this_dir = pathlib.Path.cwd()

In [4]:
# Directory holding the CSV files this notebook reads and rewrites.
data_dir = this_dir.joinpath("data")

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from datetime import datetime
import calendar, time

ModuleNotFoundError: No module named 'selenium'

## Scrape

Open the CDCR population status tracking page in a Chrome browser session.

In [None]:
# Start a Chrome session with the chromedriver path returned by
# webdriver_manager, then load the population status tracking page.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; recent Selenium 4 releases expect `service=Service(path)` instead.
# Confirm which Selenium version this notebook is meant to run against.
driver = webdriver.Chrome(
    ChromeDriverManager().install(),
)
driver.get("https://www.cdcr.ca.gov/covid19/population-status-tracking/")

Switch to iframe

In [None]:
# Shared explicit wait (up to 300 seconds) reused by later cells.
wait = WebDriverWait(driver, timeout=300)
# Wait for the first <iframe> on the page and move the driver's context into it.
iframe_ready = EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe"))
wait.until(iframe_ready)

Move to "Confirmed Table View"

In [None]:
# Block (up to the `wait` timeout) until a <span> whose text is exactly
# "Confirmed Table View" is present in the DOM.
wait.until(
    EC.presence_of_element_located((By.XPATH, '//span[text()="Confirmed Table View"]'))
)

In [None]:
# Locate the "Confirmed Table View" label, then step up to its parent node,
# which is the element that gets clicked to switch views.
# `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`, which was
# deprecated in Selenium 4.0 and removed in 4.3; the `By` form works on both
# Selenium 3 and 4.
buttons_css = driver.find_element(By.XPATH, '//span[text()="Confirmed Table View"]')
button_xpath = buttons_css.find_element(By.XPATH, "..")

In [None]:
# Fixed 10-second pause before clicking the parent of the
# "Confirmed Table View" label (presumably to let the dashboard finish
# rendering; the delay is not tied to any page condition).
time.sleep(10)
button_xpath.click()

Wait briefly for the view to switch over, then find the element that contains the table.

In [None]:
# Fixed 15-second pause to let the table view load, then collect every element
# with the "innerContainer" class.
# `find_elements(By.CLASS_NAME, ...)` replaces `find_elements_by_class_name`,
# which was removed in Selenium 4.3; the `By` form works on Selenium 3 and 4.
time.sleep(15)
table = driver.find_elements(By.CLASS_NAME, "innerContainer")

Save text from table

In [None]:
# Only the first "innerContainer" match is used; its visible text is one value
# per line and is sliced into headers and columns further down.
table_texts = table[0].text

Find update date and save text

In [None]:
# First element with the "value" class; its text is later parsed as the
# dashboard's update date.
# `find_element(By.CLASS_NAME, ...)` replaces `find_element_by_class_name`,
# which was removed in Selenium 4.3; the `By` form works on Selenium 3 and 4.
update_box = driver.find_element(By.CLASS_NAME, "value")

In [None]:
# Visible text of the update-date element; parsed by extractUpdate() below.
update_string = update_box.text

Scrape employee status page

In [None]:
# Employee status page; used both by the Selenium session and by pd.read_html.
e_url = "https://www.cdcr.ca.gov/covid19/cdcr-cchcs-covid-19-status/"

In [None]:
# Load the employee page in the same browser session.
driver.get(e_url)

Get page update date text

In [None]:
# Third <p> inside the main content block; its text is parsed as the employee
# page's update date. The absolute XPath is tied to the page's current layout.
# `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`, which was
# removed in Selenium 4.3; the `By` form works on Selenium 3 and 4.
e_update_date = driver.find_element(
    By.XPATH, '//*[@id="main"]/div/div/div/div[2]/div/p[3]'
)

In [None]:
# Visible text of the update paragraph; parsed by extractEmployeeUpdate() below.
e_update_date_string = e_update_date.text

Get list of deaths

In [None]:
# Ordered list (<ol>) inside the main content block that holds the deaths list.
# The absolute XPath is tied to the page's current layout.
# `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`, which was
# removed in Selenium 4.3; the `By` form works on Selenium 3 and 4.
e_death_list = driver.find_element(
    By.XPATH, '//*[@id="main"]/div/div/div/div[2]/div/ol'
)

In [None]:
# Visible text of the list; split into one entry per line further down.
e_death_list_text = e_death_list.text

In [None]:
# Close the browser session; no later cell uses `driver`.
driver.quit()

Get table

In [None]:
# Parse the HTML tables on the employee page into a list of DataFrames, using
# the first row of each as its header. This fetches the URL directly through
# pandas rather than through the (already closed) Selenium session.
employee_list = pd.read_html(e_url, header=0)

In [None]:
# Keep only the first table found on the page.
employee_df = employee_list[0]

## Make dataframe

### 1. Institution Overview

In [None]:
# The first 8 lines of the scraped table text are taken as the column headers.
headers = table_texts.splitlines()[:8]

After the headers, the table content is listed by what is in view: the first 20 rows, then the next 15 rows. Within each chunk, the values are laid out one column after another.

Get first 20 rows

In [None]:
# Lines 8-167 of the text: 160 values = 8 columns x 20 rows.
first_half = table_texts.splitlines()[8:168]  # every 20 lines is a new column

Split first half into columns

In [None]:
# Chunk into consecutive runs of 20 values, giving one list per column.
first_half_lines = [first_half[i : i + 20] for i in range(0, len(first_half), 20)]

Get second 15 rows

In [None]:
# Lines 168-287 of the text: 120 values = 8 columns x 15 rows.
second_half = table_texts.splitlines()[168:288]  # every 15 lines is a new column

Split into columns

In [None]:
# Chunk into consecutive runs of 15 values, giving one list per column.
second_half_lines = [second_half[i : i + 15] for i in range(0, len(second_half), 15)]

Combine - each list is a column

In [None]:
# For each of the 8 columns, append the 15-row chunk to the 20-row chunk so
# every column ends up with all 35 values. Raises IndexError if either chunk
# list has fewer than 8 columns (i.e. the page layout changed).
combined = [first_half_lines[i] + second_half_lines[i] for i in range(8)]

Create dict for dataframe

In [None]:
# Pair each header with its list of column values.
data = dict(zip(headers, combined))

In [None]:
# One DataFrame column per header; all values are still strings at this point.
df = pd.DataFrame(data)

In [None]:
def extractUpdate(text):
    """Turn the dashboard's update text into a "month-day-year" string.

    The text is split on single spaces; the fourth, fifth and sixth tokens are
    read as an abbreviated month name (e.g. "Jan"), the day and the year.

    Args:
        text: Update string scraped from the page.

    Returns:
        A string of the form "<month number>-<day>-<year>"; the day and year
        tokens are copied from the text unchanged.

    Raises:
        KeyError: If the month token is not an entry of `calendar.month_abbr`.
        IndexError: If the text has fewer than six space-separated tokens.
    """
    # calendar.month_abbr[0] is an empty placeholder, so numbering starts at 1.
    month_numbers = dict(zip(list(calendar.month_abbr)[1:], range(1, 13)))
    tokens = text.split(" ")
    month = month_numbers[tokens[3]]
    day = tokens[4]
    year = tokens[5]
    return f"{month}-{day}-{year}"

In [None]:
# "month-day-year" string derived from the dashboard's update text.
file_date = extractUpdate(update_string)

### Append to institution-timeseries

In [None]:
# Load the accumulated history, parsing update_date as datetimes so it can be
# compared with the freshly scraped date.
institution_timeseries = pd.read_csv(
    data_dir / "institution-timeseries.csv", parse_dates=["update_date"]
)

In [None]:
# Work on a copy so the scraped frame `df` keeps its original columns.
df_timeseries = df.copy()

In [None]:
# Stamp every row with the scrape's update date (still a string here).
df_timeseries["update_date"] = file_date

In [None]:
# Convert the "month-day-year" strings to datetimes to match the history frame.
df_timeseries["update_date"] = pd.to_datetime(df_timeseries["update_date"])

In [None]:
# Append this scrape only when its update date is not already in the history,
# so re-running the notebook for the same update does not duplicate rows.
# `Series.isin(...).any()` is an explicit membership test. The previous
# `unique() in unique()` form relied on element-wise broadcasting inside
# `__contains__`; on pandas >= 2.0 `unique()` returns a DatetimeArray for
# datetime columns, and comparing arrays of different lengths there raises
# "Lengths must match".
date_already_recorded = (
    institution_timeseries["update_date"].isin(df_timeseries["update_date"]).any()
)
if not date_already_recorded:
    institution_timeseries = pd.concat([institution_timeseries, df_timeseries])

In [None]:
# Keep the history in chronological order; rebinding the name (instead of
# sorting in place) makes the cell's effect explicit.
institution_timeseries = institution_timeseries.sort_values(by="update_date")

Remove commas

In [None]:
# Regex-replace every comma inside string values across the whole frame
# (presumably thousands separators such as "1,234"; confirm against the data).
institution_timeseries = institution_timeseries.replace(",", "", regex=True)

In [None]:
# Make sure update_date is datetime-typed after the concat and replace steps.
institution_timeseries["update_date"] = pd.to_datetime(
    institution_timeseries["update_date"]
)

In [None]:
# With commas stripped, the "Confirmed" counts can be converted from strings
# to numbers. Only this column is converted.
institution_timeseries["Confirmed"] = pd.to_numeric(
    institution_timeseries["Confirmed"]
)

### 3. Employees

In [None]:
def extractEmployeeUpdate(text):
    """Turn the employee page's update text into a "month-day-year" string.

    The text is split on single spaces; the fourth, fifth and sixth tokens are
    read as the month, day and year. The month may be a full name ("January")
    or an abbreviation with or without a period ("Jan." / "Jan"). Commas are
    stripped from the day token.

    Args:
        text: Update string scraped from the employee page.

    Returns:
        A string of the form "<month number>-<day>-<year>"; the year token is
        copied from the text unchanged.

    Raises:
        KeyError: If the month token matches neither `calendar.month_name`
            nor (after removing periods) `calendar.month_abbr`.
        IndexError: If the text has fewer than six space-separated tokens.
    """
    # Index 0 of both calendar sequences is an empty placeholder; skip it.
    full_names = dict(zip(list(calendar.month_name)[1:], range(1, 13)))
    short_names = dict(zip(list(calendar.month_abbr)[1:], range(1, 13)))

    tokens = text.split(" ")
    raw_month = tokens[3]
    if raw_month in full_names:
        month = full_names[raw_month]
    else:
        # Fall back to abbreviations such as "Jan." by dropping the period.
        month = short_names[raw_month.replace(".", "")]

    day = tokens[4].replace(",", "")
    year = tokens[5]
    return f"{month}-{day}-{year}"

In [None]:
# "month-day-year" string derived from the employee page's update text.
e_file_date = extractEmployeeUpdate(e_update_date_string)

### Append to employee timeseries list

In [None]:
# Load the accumulated employee history. update_date is not parsed here, so it
# stays in whatever form the CSV stores (compared as-is with e_file_date below).
employees_timeseries = pd.read_csv(data_dir / "employees-timeseries.csv")

In [None]:
# Work on a copy so the scraped table `employee_df` is left untouched.
e_timeseries = employee_df.copy()

In [None]:
# Stamp every row with the page's update date, kept as a "month-day-year" string.
e_timeseries["update_date"] = e_file_date

In [None]:
# Append this scrape only when its update date is not already in the history,
# so re-running the notebook for the same update does not duplicate rows.
# `Series.isin(...).any()` is an explicit membership test; the previous
# `unique() in unique()` form relied on element-wise broadcasting inside
# `__contains__`, which is fragile and depends on the array type returned.
date_already_recorded = (
    employees_timeseries["update_date"].isin(e_timeseries["update_date"]).any()
)
if not date_already_recorded:
    employees_timeseries = pd.concat([employees_timeseries, e_timeseries])

In [None]:
# One list entry per line of the scraped deaths list.
e_death_final = e_death_list_text.splitlines()

In [None]:
# Single-column frame, one row per entry; the column gets pandas' default name (0).
e_deaths_df = pd.DataFrame(e_death_final)

## Export

In [None]:
# Overwrite the institution history file in place; the index is not written.
institution_timeseries.to_csv(data_dir / "institution-timeseries.csv", index=False)

Employee list

In [None]:
# Overwrite the employee history file in place; the index is not written.
employees_timeseries.to_csv(data_dir / "employees-timeseries.csv", index=False)

Employee death list 

In [None]:
# Replace the deaths file with the list as scraped in this run (not appended).
e_deaths_df.to_csv(data_dir / "employees-deaths.csv", index=False)