# ETL

ETL is comprised of 3 steps:
- Extract: Open the Excel files, read the data into Python memory
- Transform: Convert the data to the format I will use in my database
- Load: Put the data in my local SQL database

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option('display.max_rows', 100)

## Extract

In [2]:
def collect_name_and_excel_paths(num_files= 0):
    count = 0
    base_path = Path("chargemaster-cdm-2021")
    hospitals = sorted([path for path in base_path.iterdir() if not path.name.startswith('.')])
    hospital_sheets = []
    allowed_extensions = set([".xls", ".xlsx", ".csv"])
    
    for hospital in hospitals:
        # Skip hospitals with multiple excel files for now. 
        # TODO: Implement handling multiple excels and validate data from all hospitals.
        files = [(hospital.name, file) for file in hospital.iterdir() if file.suffix in allowed_extensions and not file.name.startswith("~")]
        if len(files) == 1:
            hospital_sheets.extend(files)
            count+=1
        if num_files >= 1 and count >= num_files:
            break

    return hospital_sheets

names_and_files = collect_name_and_excel_paths()
names_and_files[:5]

[('AHMC Anaheim Regional Medical Center',
  PosixPath('chargemaster-cdm-2021/AHMC Anaheim Regional Medical Center/106301098_CDM_All_2021.xlsx')),
 ('AHMC Seton Medical Center',
  PosixPath('chargemaster-cdm-2021/AHMC Seton Medical Center/106410817_CDM_All_2021.xlsx')),
 ('Alameda Hospital',
  PosixPath('chargemaster-cdm-2021/Alameda Hospital/106010735_CDM_All_2021.xlsx')),
 ('Alta Bates Summit Medical Center',
  PosixPath('chargemaster-cdm-2021/Alta Bates Summit Medical Center/106010937_CDM_All_2021.xlsx')),
 ('Alta Bates-Summit Medical Center - Alta Bates Campus',
  PosixPath('chargemaster-cdm-2021/Alta Bates-Summit Medical Center - Alta Bates Campus/106010739_CDM_All_2021.xlsx'))]

In [7]:
def data_from_excel(filepath):
    # Handle all extensions, and all sheet names. 
    if filepath.suffix not in ['.xlsx', '.xls']:
        raise Exception("file extension not supported")
        
    sheet_name = "AB 1045 Form"
    sheet_names = pd.ExcelFile(filepath).sheet_names
    for name in sheet_names:
        if "1045" in name:
            sheet_name = name
    
    item = pd.read_excel(filepath, sheet_name=sheet_name)
    return item

name, path = names_and_files[0]
data = data_from_excel(path)
data.head(100)

Unnamed: 0,Hospital Name: Anaheim Regional Medical Center,Unnamed: 1,Unnamed: 2
0,OSHPD Facility No: 106301098,,
1,Effective Date of Charges: 6/1/2021,,
2,,,
3,In response to requests from hospitals and the...,,
4,Evaluation & Management Services (CPT Codes 99...,2021 CPT Code,Average Charge
5,"Emergency Room Visit, Level 2 (low to moderate...",99282,790
6,"Emergency Room Visit, Level 3 (moderate severity)",99283,1229
7,"Emergency Room Visit, Level 4 (high severity w...",99284,2060
8,"Emergency Room Visit, Level 4 (high severity w...",99285,2911
9,"Outpatient Visit, established patient, 15 minutes",99213,160


## Transform

For now, we limit to extracting the rows with numeric CPT and charge values. In the future we can extract the number of reported procedures to validate out data extraction process. We can also extract the hospital ID and the date of the report. 

In [11]:
def transform_data(data, hospital_name):
    mask = data.astype('str').iloc[:, 2].str.contains("\d+")
    # for now, ignore procedures without a fully numeric CPT field
    # TODO: implement validation. Collected procedures count in file must be same as extracted. 
    mask2 = data.astype('str').iloc[:, 1].str.contains("^\d+$")
    reported_procedures = data[(mask & mask2)]
    idx = pd.Index(range(len(reported_procedures)))
    reported_procedures.set_index(idx, inplace=True)

    reported_procedures.columns = ["Procedure", "CPT Code", "Average Charge"]

    hospital_column = [hospital_name] * len(reported_procedures)
    hospital_series = pd.Series(np.array(hospital_column))
    reported_procedures["Hospital"] = hospital_series

    reported_procedures = reported_procedures[["Hospital", "CPT Code", "Procedure", "Average Charge"]]
    reported_procedures.set_index("Hospital", inplace=True)

    reported_procedures.Procedure.apply(str)
    reported_procedures["CPT Code"].apply(np.int32, inplace)
    reported_procedures["Average Charge"].apply(np.int32)

    return reported_procedures

reported_procedures = transform_data(data, name)
reported_procedures

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


TypeError: 'inplace' is an invalid keyword argument for str()

In [13]:
reported_procedures.dtypes

CPT Code          object
Procedure         object
Average Charge    object
dtype: object

## Load

Convert to a database readable format and load into a database. 

In [9]:
def write_data(reported_procedures, dst_folder, hospital_name):
    output_dir = Path(dst_folder)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{hospital_name}_reported_procedures.csv"
    print(f"Writing reported procedures to {output_path}")
    reported_procedures.to_csv(output_path)
    
write_data(reported_procedures, "reported_procedures", name)

Writing reported procedures to reported_procedures/AHMC Anaheim Regional Medical Center_reported_procedures.csv


## Execute

In [6]:
NUM_FILES = 20
DST_FOLDER = "reported_procedures"

names_and_files = collect_name_and_excel_paths(NUM_FILES)
for name, file in names_and_files:
    print("Parsing ", file)
    try: 
        data = data_from_excel(file)
    except:
        print(f"Failed to reat data from hospital {name}")
        continue
    reported_procedures = transform_data(data, name)
    if len(reported_procedures) == 0:
        print(f"Hospital {name} seems to contain no reported procedures")
        display(reported_procedures)
        continue
    write_data(reported_procedures, DST_FOLDER, name)

Parsing  chargemaster-cdm-2021/AHMC Anaheim Regional Medical Center/106301098_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/AHMC Anaheim Regional Medical Center_reported_procedures.csv
Parsing  chargemaster-cdm-2021/AHMC Seton Medical Center/106410817_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/AHMC Seton Medical Center_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Alameda Hospital/106010735_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/Alameda Hospital_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Alta Bates Summit Medical Center/106010937_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/Alta Bates Summit Medical Center_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Alta Bates-Summit Medical Center - Alta Bates Campus/106010739_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/Alta Bates-Summit Medical Center - Alta Bates Campus_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Alvarado Hospital/106370652_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/Alvarado Hospital_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Atascadero State Hospital/106400683_CDM_All_2021.xlsx
Writing reported procedures to reported_procedures/Atascadero State Hospital_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Aurora Behavioral Healthcare - Santa Rosa/106494048_CDM_All_2021.xls
Hospital Aurora Behavioral Healthcare - Santa Rosa seems to contain no reported procedures


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Unnamed: 0_level_0,CPT Code,Procedure,Average Charge
Hospital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Parsing  chargemaster-cdm-2021/Aurora Charter Oak/106190163_CDM_All_2021.xlsx
Hospital Aurora Charter Oak seems to contain no reported procedures


Unnamed: 0_level_0,CPT Code,Procedure,Average Charge
Hospital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Parsing  chargemaster-cdm-2021/Aurora Las Encinas Hospital/106190462_CDM_All_2021.xls
Hospital Aurora Las Encinas Hospital seems to contain no reported procedures


Unnamed: 0_level_0,CPT Code,Procedure,Average Charge
Hospital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Parsing  chargemaster-cdm-2021/Aurora San Diego Hospital/106374024_CDM_All_2021.xls
Hospital Aurora San Diego Hospital seems to contain no reported procedures


Unnamed: 0_level_0,CPT Code,Procedure,Average Charge
Hospital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Parsing  chargemaster-cdm-2021/Aurora Vista del Mar Hospital/106560203_CDM_All_2021.xls
Hospital Aurora Vista del Mar Hospital seems to contain no reported procedures


Unnamed: 0_level_0,CPT Code,Procedure,Average Charge
Hospital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Parsing  chargemaster-cdm-2021/BHC Alhambra Hospital/106190020_CDM_All_2021.xlsx
Failed to reat data from hospital BHC Alhambra Hospital
Parsing  chargemaster-cdm-2021/BHC Fremont Hospital/106014034_CDM_All_2021.xlsx
Failed to reat data from hospital BHC Fremont Hospital
Parsing  chargemaster-cdm-2021/Bakersfield Behavioral Healthcare Hospital/106154044_CDM_All_2021.xlsx
Hospital Bakersfield Behavioral Healthcare Hospital seems to contain no reported procedures


Unnamed: 0_level_0,CPT Code,Procedure,Average Charge
Hospital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Parsing  chargemaster-cdm-2021/Bakersfield Memorial Hospital/106150722_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/Bakersfield Memorial Hospital_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Ballard Rehabilitation Hospital/106364121_CDM_All_2021.xls


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/Ballard Rehabilitation Hospital_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Banner Lassen Medical Center/106184008_CDM_All_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series


Writing reported procedures to reported_procedures/Banner Lassen Medical Center_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Bear Valley Community Hospital/106361110_Common25_2021.xlsx
Writing reported procedures to reported_procedures/Bear Valley Community Hospital_reported_procedures.csv
Parsing  chargemaster-cdm-2021/Beverly Hospital/106190081_CDM_All_2021.xlsx
Writing reported procedures to reported_procedures/Beverly Hospital_reported_procedures.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reported_procedures["Hospital"] = hospital_series
