## Data Preparation of the Yale New Haven Dataset - Balanced Dataset

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from pickle import dump, load

Load data

In [2]:
filepath = "~/scratch/datasets/yale_new_haven/yale_new_haven.csv"
df = pd.read_csv(filepath)

### Preprocess Data

Feature types

In [3]:
disposition_var = {'disposition'}

demographic_vars = {'age', 'gender', 'ethnicity', 'race', 'lang',
       'religion', 'maritalstatus', 'employstatus', 'insurance_status'}

# department name, ESI score, arrival info, and triage vital info
triage_evaluation_vars = {'dep_name', 'esi', 'arrivalmode', 'arrivalmonth', 'arrivalday', 'arrivalhour_bin'}.union({col for col in df.columns if 'triage_vital' in col})

# chief complaint info (only top 200 were included; represents >90% of the complaints)
chief_complaint_vars = {col for col in df.columns if "cc_" in col}

# medication info
medication_vars = {col for col in df.columns if 'meds_' in col}

hospital_usage_stats_vars = {'previousdispo', 'n_edvisits', 'n_admissions', 'n_surgeries'}

# prior imaging and EKG counts
# chest x-ray, echocardiogram, electrocardiogram (EKG), other x-ray, other ultra-sound, head CT, other CT, MRI, and all other imaging
imaging_ekg_vars = {'cxr_count','echo_count','ekg_count','otherxr_count', 'otherus_count', 'headct_count', 'otherct_count', 'mri_count','otherimg_count'}

# historic vitals include: systolic blood pressure, diastolic blood pressure, pulse, respiratory rate, oxygen saturation, presence of oxygen device, and temperature
historical_vital_vars = {'dbp_last',
 'dbp_max',
 'dbp_median',
 'dbp_min',
 'o2_device_last',
 'o2_device_max',
 'o2_device_median',
 'o2_device_min',
 'pulse_last',
 'pulse_max',
 'pulse_median',
 'pulse_min',
 'resp_last',
 'resp_max',
 'resp_median',
 'resp_min',
 'sbp_last',
 'sbp_max',
 'sbp_median',
 'sbp_min',
 'spo2_last',
 'spo2_max',
 'spo2_median',
 'spo2_min',
 'temp_last',
 'temp_max',
 'temp_median',
 'temp_min'}

curr = disposition_var.union(demographic_vars.union(triage_evaluation_vars.union(chief_complaint_vars.union(medication_vars.union(hospital_usage_stats_vars.union(imaging_ekg_vars.union(historical_vital_vars)))))))

# past medical history
past_medical_hist_vars = {col for col in df.columns if col not in curr and "_" not in col and col not in ['ID', 'previousdispo']}

curr = curr.union(past_medical_hist_vars)

# historical labs ordered by ED (only top 150 comprising of 94% of all orders)
historical_lab_vars = {col for col in df.columns if col not in curr and col not in 'ID'}

print(f"Response: {len(disposition_var)}")
print(f"Demographics: {len(demographic_vars)}")
print(f"Triage evaluation: {len(triage_evaluation_vars)}")
print(f"Chief Complaint: {len(chief_complaint_vars)}")
print(f"Hospital Usage Statistics: {len(hospital_usage_stats_vars)}")
print(f"Past Medical History: {len(past_medical_hist_vars)}")
print(f"Medications: {len(medication_vars)}")
print(f"Historical Vitals: {len(historical_vital_vars)}")
print(f"Historical Labs: {len(historical_lab_vars)}")
print(f"Imaging/EKG counts: {len(imaging_ekg_vars)}")

Response: 1
Demographics: 9
Triage evaluation: 13
Chief Complaint: 200
Hospital Usage Statistics: 4
Past Medical History: 281
Medications: 48
Historical Vitals: 28
Historical Labs: 379
Imaging/EKG counts: 9


Fix some issues with the features

In [8]:
# N/A and other value fixes
# - race column has both nan and unknown -> change to just unknown
# - some patients have no entered chief complaints (all the chief complaints are N/A), give them all 0's
fillna_values = {'race': 'Unknown'}
chief_complaint_vars = {col for col in df.columns if "cc_" in col}
fillna_values.update({cc: 0 for cc in list(chief_complaint_vars)})
df = df.fillna(fillna_values)

# change ESI feature to string so it becomes categorical
# same with the O2 device features since these indicate the presence or absence of a O2 device which can be null (not known)
cate_float_feats = ['esi', 'triage_vital_o2_device', 'o2_device_last', 'o2_device_min', 'o2_device_max', 'o2_device_median']
df = df.astype({feat: str for feat in cate_float_feats})

# any chief complaint with number higher than 1, make as 1 (complaints are supposed to be binary)
for cc in chief_complaint_vars:
    df.loc[df[cc] > 1, cc] = 1.0

Update the datatypes for downstream processing

In [9]:
# convert all int features to floats for easier processing
df = df.astype({col: 'float64' for col in df[df.select_dtypes(include=['int64']).columns]})
# chief complaint and past medical history features should be ints, since they are already encoded 
# and don't need to be processed
df = df.astype({col: 'int64' for col in list(chief_complaint_vars.union(past_medical_hist_vars))})

Create an ID feature

In [11]:
# get the original index as the ID; store as an int, so it can pass through the ColumnTransformer
df.rename_axis('ID', inplace=True)
df = df.reset_index()

### Create balanced dataset

Get all the senior and adults indices

In [None]:
senior_idxs = df[df['age'] >= 65].index
adult_idxs = df[df['age'] < 65].index

Get a random sample of the adult DataFrame

In [None]:
df_a = df.loc[adult_idxs].sample(n=len(senior_idxs))
# sort the dataframe by its index
df_a = df_a.sort_index()

Create the balanced dataset

In [28]:
df_balanced = pd.concat([df_a, df.loc[senior_idxs]]).sort_index()
df = df_balanced

### Feature and Label Sets

Set features and label columns

In [23]:
label_col = 'disposition'

# ignore these features since they contain only NaN values, or because of the provided note
useless_feats = [
    'phencyclidine(pcp)screen,urine,noconf._last',
    'phencyclidine(pcp)screen,urine,noconf._min', 
    'phencyclidine(pcp)screen,urine,noconf._max', 
    'phencyclidine(pcp)screen,urine,noconf._median',
    'benzodiazepinesscreen,urine,noconf._last',       # only one person in the dataset has a non-nan value
    'ecodesmachinery'                                 # only 0 values
]
# other columns we may want to ignore
other_ignore_cols = []    

ignore_cols = useless_feats + [label_col] + other_ignore_cols

features = [col for col in df.columns if col not in ignore_cols]

In [30]:
X = df[features]
y = LabelEncoder().fit_transform(df[label_col])

Ensure that the labels correspond to 0 - discharge, 1 - admittance

In [31]:
if (df[label_col].iloc[0] == 'Discharge' and y[0] == 1) or (df[label_col].iloc[0] == 'Admit' and y[0] == 0):
    y = np.array([0 if y_i == 1 else 1 for y_i in y])

### Split into training and test sets

10% split as per Hong *et al.*

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=2)

## Feature Transform

Overview:
- Floats need normalizing and imputing to median
- Objects need one-hot encoding and imputing to missing

- Chief complaints and Past Medical History are already one-hot encoded (leave as-is)

Details:
- Impute
    - Missing numerics are equated to median of values
    - Missing categories are equated to 'Missing'
    - Done before standardization to not mess with the scaling in standardization
- One Hot Encode 
    - only for categorical
    - binary categoricals are encoded as one feature
    - all other categories are encoded as n features
- Normalize
    - only for floats

Define pipelines for feature processing

(Impute, one hot encode, and normalize)

In [38]:
# pipeline transformation for numerical features
pipe_floats = Pipeline(
    steps=[
        ('impute (floats)', SimpleImputer(strategy='median', copy=False)),
        ('normalize (floats)', MaxMinScaler())
    ], 
    verbose=True
)

# pipeline transformation for categorical features
pipe_category = Pipeline(
    steps=[
        ('impute (categoric)', SimpleImputer(strategy='constant', fill_value = 'Missing')),
        ('one hot encode', OneHotEncoder(drop='if_binary')),
    ], 
    verbose=True
)

In [39]:
column_transformer = ColumnTransformer(
    [
        ('categoric', pipe_category, make_column_selector(dtype_include='object')),
        ('floats', pipe_floats, make_column_selector(dtype_include='float')),
    ],
    remainder='passthrough', verbose_feature_names_out=False)

In [40]:
# fit the transformer and transform the training set
X_t = column_transformer.fit_transform(X_train)

[Pipeline]  (step 1 of 3) Processing impute (categoric), total=   0.5s
[Pipeline] .... (step 2 of 3) Processing one hot encode, total=   2.5s
[Pipeline]  (step 3 of 3) Processing standardize (categoric), total=   0.4s
[Pipeline] ... (step 1 of 2) Processing impute (floats), total=  25.1s
[Pipeline]  (step 2 of 2) Processing standardize (floats), total=   3.4s


Save the transformed training set

In [60]:
k = pd.DataFrame(X_t, columns=column_transformer.get_feature_names_out()).astype('float32')

In [50]:
output_filename = "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_split/features/yale_new_haven_balanced_training_features.csv"

In [62]:
k.to_csv(output_filename, index=False)

Transform half of the test set

In [45]:
X_t_test = column_transformer.transform(X_test)

In [63]:
k2 = pd.DataFrame(X_t_test, columns=column_transformer.get_feature_names_out()).astype('float32')

In [23]:
output_filename = "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_split/features/yale_new_haven_balanced_test_features.csv"

In [65]:
k2.to_csv(output_filename, index=False)

### Save the labels

In [70]:
training_labels = pd.DataFrame(y_train)

In [10]:
output_filename = "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_split/labels/yale_new_haven_balanced_training_labels.csv"

In [72]:
training_labels.to_csv(output_filename, index=False)

Save the test labels

In [73]:
test_labels = pd.DataFrame(y_test)

In [28]:
output_filename = "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_split/labels/yale_new_haven_balanced_test_labels.csv"

In [75]:
test_labels.to_csv(output_filename, index=False)