In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
%matplotlib inline

#plt.style.use('fivethirtyeight')
#%run ./two-histograms.ipynb

pd.set_option('display.max_columns', 200)

### Create column lists for the different feature categories

In [2]:
# Demographic / admission columns (note: the target `hospital_death` is listed here too)
demographic = [
    'hospital_death', 'age', 'bmi', 'weight', 'height', 'elective_surgery',
    'ethnicity', 'gender', 'hospital_id', 'hospital_admit_source',
    'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days',
]

# ---------------------------------------------------------------------------
# APACHE related measurements
# ---------------------------------------------------------------------------
# 18 measurements
apache_cov = [
    'albumin_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache',
    'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'resprate_apache',
    'sodium_apache', 'temp_apache', 'urineoutput_apache', 'wbc_apache',
    'fio2_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache',
    'pao2_apache', 'ph_apache',
]
# 4 indicators
apache_indicators = [
    'apache_post_operative', 'arf_apache', 'intubated_apache', 'ventilated_apache',
]
# 8 chronic health status flags
apache_chronic = [
    'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
    'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis',
]
# 6 others
apache_others = [
    'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_3j_bodysystem',
    'apache_2_bodysystem', 'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob',
]
# 4 Glasgow Coma Score columns
gcs = ['gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache']

# Every APACHE column, in the order: covariates, indicators, chronic, others, GCS
apache = [*apache_cov, *apache_indicators, *apache_chronic, *apache_others, *gcs]

# ---------------------------------------------------------------------------
# Vitals and lab tests
# ---------------------------------------------------------------------------
# Base measurement names (without the _max / _min suffix)
vitals_names = [
    'diasbp_invasive', 'diasbp', 'diasbp_noninvasive',
    'sysbp_invasive', 'sysbp', 'sysbp_noninvasive',
    'mbp_invasive', 'mbp', 'mbp_noninvasive',
    'heartrate', 'resprate', 'spo2', 'temp',
]
labs_names = [
    'albumin', 'bilirubin', 'bun', 'calcium', 'creatinine', 'glucose', 'hco3',
    'hemaglobin', 'hematocrit', 'inr', 'lactate', 'platelets', 'potassium',
    'sodium', 'wbc',
]

# Each base name expands to a (<name>_max, <name>_min) pair, keeping the base-name order
vitals = [f'{base}_{stat}' for base in vitals_names for stat in ('max', 'min')]
labs = [f'{base}_{stat}' for base in labs_names for stat in ('max', 'min')]

# Arterial blood gas test
lab_blood_gas = [
    f'{base}_{stat}'
    for base in ('arterial_pco2', 'arterial_ph', 'arterial_po2', 'pao2fio2ratio')
    for stat in ('max', 'min')
]

# Report the size of every group
for _label, _cols in [
    ("The number of columns for demographic: ", demographic),
    ("The number of columns for vitals: ", vitals),
    ("The number of columns for lab tests: ", labs),
    ("The number of columns for vitals_names: ", vitals_names),
    ("The number of columns for labsnames: ", labs_names),
    ("The number of columns for arterial blood gas test: ", lab_blood_gas),
    ("The number of columns for Glasgow Coma Score: ", gcs),
    ("The number of columns for APACHE 3 measurements: ", apache_cov),
    ("The number of columns for APACHE Chronic Health Status: ", apache_chronic),
    ("The number of columns for other APACHE measurements: ", apache_others),
    ("Total number of columns for APACHE: ", apache),
]:
    print(_label, len(_cols))

# Prefixed variants: 'h1_' and 'd1_' versions of each vitals / labs / blood-gas column
h1_vitals = [f'h1_{col}' for col in vitals]
d1_vitals = [f'd1_{col}' for col in vitals]
h1_labs = [f'h1_{col}' for col in labs]
d1_labs = [f'd1_{col}' for col in labs]
h1_lbg = [f'h1_{col}' for col in lab_blood_gas]
d1_lbg = [f'd1_{col}' for col in lab_blood_gas]

The number of columns for demographic:  15
The number of columns for vitals:  26
The number of columns for lab tests:  30
The number of columns for vitals_names:  13
The number of columns for labsnames:  15
The number of columns for arterial blood gas test:  8
The number of columns for Glasgow Coma Score:  4
The number of columns for APACHE 3 measurements:  18
The number of columns for APACHE Chronic Health Status:  8
The number of columns for other APACHE measurements:  6
Total number of columns for APACHE:  40


## 1. Load the dataset

In [15]:
# Read the training CSV into a DataFrame and print its (rows, columns) shape
train = pd.read_csv('../data/train_ph2_newfeatures.csv')
print(train.shape)

(91713, 196)


In [16]:
# Preview the first rows of the loaded frame
train.head()

Unnamed: 0,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_mi
n,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,apache_3j_diag_int,critically_ill,bun_to_creatinine,h1_ttl_tests,d1_ttl_tests,h1_num_labs,d1_num_labs,h1_lbg,d1_lbg,total_tests_increase,total_labs_increase,added_lbg,chronic_diagnosis
0,118,0,68.0,22.73,0,Caucasian,M,180.3,Floor,Floor,92,admit,CTICU,0.541667,73.9,2.3,113.0,502.01,0,0.0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0.0,40.0,,,,,36.0,134.0,39.3,,0.0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1,0.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,502,0,12.35,20,54,0,28,0,0,34,28,0,1.0
1,81,0,77.0,27.42,0,Caucasian,F,160.0,Floor,Floor,90,admit,Med-Surg ICU,0.927778,70.2,,108.0,203.01,0,0.0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0.0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,,1.0,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,,,,,9.0,9.0,8.6,8.6,0.56,0.56,145.0,143.0,27.0,27.0,11.3,11.3,36.9,36.9,1.3,1.3,3.5,3.5,557.0,557.0,4.2,4.2,145.0,145.0,12.7,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,51.0,0.47,0.29,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,203,0,15.49,54,58,26,30,1,1,4,4,0,1.0
2,118,0,25.0,31.95,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,93,admit,Med-Surg ICU,0.000694,95.3,,122.0,703.03,0,0.0,,,,,3.0,6.0,0.0,5.0,,102.0,,0.0,68.0,,,,,37.0,,36.7,,0.0,,,,88.0,48.0,88.0,48.0,96.0,68.0,,,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,148.0,105.0,37.0,36.7,,,88.0,58.0,88.0,58.0,96.0,78.0,,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,703,0,,20,20,0,0,0,0,0,0,0,0.0
3,118,0,81.0,22.64,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,92,admit,CTICU,0.000694,61.7,,203.0,1206.03,1,0.0,,,,0.6,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1.0,60.0,30.0,30.0,142.0,7.39,4.0,,34.8,,1.0,8.0,62.0,30.0,48.0,42.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,62.0,44.0,,,100.0,96.0,92.0,71.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,,35.6,34.8,,,,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.6,1.1,,,198.0,43.0,5.0,3.5,,,9.0,8.0,,,,,,,,,,,,,,,11.6,11.6,34.0,34.0,1.6,1.1,,,43.0,43.0,,,,,8.8,8.8,37.0,27.0,7.44,7.34,337.0,102.0,342.5,236.666667,36.0,33.0,7.37,7.34,337.0,265.0,337.0,337.0,0.04,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,1206,1,,38,48,10,14,1,1,10,4,0,0.0
4,33,0,19.0,,0,Caucasian,M,188.0,Emergency Department,Accident & Emergency,91,admit,Med-Surg ICU,0.073611,,,119.0,601.01,0,0.0,,,,,,,1.0,,,60.0,,0.0,103.0,,,,,16.0,,36.7,,0.0,,,,99.0,57.0,99.0,57.0,89.0,60.0,,,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,147.0,120.0,37.2,36.7,,,99.0,68.0,99.0,68.0,89.0,76.0,,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,601,0,,16,20,0,0,0,0,4,0,0,0.0


## Create independent features X and dependent (target) variable y

In [28]:
# Start from every column: the target goes to y, everything else to X
y = train['hospital_death']
X = train.drop(columns='hospital_death')

In [29]:
# Columns that are kept out of the RFE search
cols_to_excl = [
    'apache_2_diagnosis',
    'apache_3j_diagnosis',
    'apache_2_bodysystem',
    'hospital_id',
    'icu_id',
]

In [30]:
# Remove the excluded columns from the predictor frame
X = X.drop(columns=cols_to_excl)

# Keep only the columns whose dtype is not 'object' for feature selection
non_object_mask = X.dtypes != 'object'
num_cols = X.columns[non_object_mask].tolist()
X = X[num_cols]

# Show the resulting (rows, columns) shape
X.shape

(91713, 183)

In [20]:
# Preview the predictor frame after the column filtering above
X.head()

Unnamed: 0,age,bmi,elective_surgery,height,pre_icu_los_days,weight,albumin_apache,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_mi
n,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_diag_int,critically_ill,bun_to_creatinine,h1_ttl_tests,d1_ttl_tests,h1_num_labs,d1_num_labs,h1_lbg,d1_lbg,total_tests_increase,total_labs_increase,added_lbg,chronic_diagnosis
0,68.0,22.73,0,180.3,0.541667,73.9,2.3,0,0.0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0.0,40.0,,,,,36.0,134.0,39.3,,0.0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1,0.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,502,0,12.35,20,54,0,28,0,0,34,28,0,1.0
1,77.0,27.42,0,160.0,0.927778,70.2,,0,0.0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0.0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,,1.0,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,,,,,9.0,9.0,8.6,8.6,0.56,0.56,145.0,143.0,27.0,27.0,11.3,11.3,36.9,36.9,1.3,1.3,3.5,3.5,557.0,557.0,4.2,4.2,145.0,145.0,12.7,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,51.0,0.47,0.29,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,203,0,15.49,54,58,26,30,1,1,4,4,0,1.0
2,25.0,31.95,0,172.7,0.000694,95.3,,0,0.0,,,,,3.0,6.0,0.0,5.0,,102.0,,0.0,68.0,,,,,37.0,,36.7,,0.0,,,,88.0,48.0,88.0,48.0,96.0,68.0,,,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,148.0,105.0,37.0,36.7,,,88.0,58.0,88.0,58.0,96.0,78.0,,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,703,0,,20,20,0,0,0,0,0,0,0,0.0
3,81.0,22.64,1,165.1,0.000694,61.7,,1,0.0,,,,0.6,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1.0,60.0,30.0,30.0,142.0,7.39,4.0,,34.8,,1.0,8.0,62.0,30.0,48.0,42.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,62.0,44.0,,,100.0,96.0,92.0,71.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,,35.6,34.8,,,,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.6,1.1,,,198.0,43.0,5.0,3.5,,,9.0,8.0,,,,,,,,,,,,,,,11.6,11.6,34.0,34.0,1.6,1.1,,,43.0,43.0,,,,,8.8,8.8,37.0,27.0,7.44,7.34,337.0,102.0,342.5,236.666667,36.0,33.0,7.37,7.34,337.0,265.0,337.0,337.0,0.04,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1206,1,,38,48,10,14,1,1,10,4,0,0.0
4,19.0,,0,188.0,0.073611,,,0,0.0,,,,,,,1.0,,,60.0,,0.0,103.0,,,,,16.0,,36.7,,0.0,,,,99.0,57.0,99.0,57.0,89.0,60.0,,,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,147.0,120.0,37.2,36.7,,,99.0,68.0,99.0,68.0,89.0,76.0,,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,601,0,,16,20,0,0,0,0,4,0,0,0.0


## Impute missing data with SimpleImputer

In [9]:
from sklearn.impute import SimpleImputer

# Fill missing values using the 'mean' strategy; fit_transform returns the imputed
# data as X_imputed while X itself is left unchanged.
# NOTE(review): the imputer is fitted on all rows of X, i.e. before any
# train/validation split or cross-validation — confirm that is intended.
imputer = SimpleImputer(strategy = 'mean')
X_imputed = imputer.fit_transform(X)

## StandardScaler

In [10]:
# Instantiate the StandardScaler and standardise the imputed feature matrix.

from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_ss = ss.fit_transform(X_imputed)  # fit on, and scale, the full imputed X (no train/test split here)

## OVERSAMPLING THE MINORITY CLASS

In [11]:
## Oversampling on minority class
from imblearn.over_sampling import SMOTE

# SMOTE synthesises new minority-class rows (5 nearest neighbours, fixed seed)
# so that the resampled data passed to RFE is class-balanced.
# NOTE(review): resampling happens before any cross-validation split, so synthetic
# rows can end up in both training and validation folds later on — confirm this
# is acceptable for the RFECV step.
sm = SMOTE(k_neighbors=5,sampling_strategy = "minority",random_state=42)
# `fit_resample` is the supported API; `fit_sample` was a deprecated alias that
# newer imbalanced-learn releases no longer provide.
X_sm, y_sm = sm.fit_resample(X_ss, y)

Using TensorFlow backend.


## Feature Selection with RFE

In [12]:
# L1-penalised logistic regression is the model whose coefficients drive the elimination
estimator = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=200,
    random_state=42,
)
# Drop two features per iteration; n_features_to_select is left at its default
selector = RFE(estimator=estimator, step=2, verbose=10)

In [13]:
# Run the recursive elimination on the oversampled, scaled data (verbose=10 logs each round)
selector.fit(X_sm,y_sm)

Fitting estimator with 184 features.
Fitting estimator with 182 features.
Fitting estimator with 180 features.
Fitting estimator with 178 features.
Fitting estimator with 176 features.
Fitting estimator with 174 features.
Fitting estimator with 172 features.
Fitting estimator with 170 features.
Fitting estimator with 168 features.
Fitting estimator with 166 features.
Fitting estimator with 164 features.
Fitting estimator with 162 features.
Fitting estimator with 160 features.
Fitting estimator with 158 features.
Fitting estimator with 156 features.
Fitting estimator with 154 features.
Fitting estimator with 152 features.
Fitting estimator with 150 features.
Fitting estimator with 148 features.
Fitting estimator with 146 features.
Fitting estimator with 144 features.
Fitting estimator with 142 features.
Fitting estimator with 140 features.
Fitting estimator with 138 features.
Fitting estimator with 136 features.
Fitting estimator with 134 features.
Fitting estimator with 132 features.
F

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=200,
                                 multi_class='auto', n_jobs=None, penalty='l1',
                                 random_state=42, solver='liblinear',
                                 tol=0.0001, verbose=0, warm_start=False),
    n_features_to_select=None, step=2, verbose=10)

In [16]:
# Report how many features RFE kept, then display the boolean keep/drop mask
print('The number of selected features:',selector.n_features_)
selector.support_

The number of selected features: 92


array([ True, False,  True,  True,  True,  True, False,  True,  True,
       False, False, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False, False, False, False, False,  True,
       False,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True, False,  True,  True,  True, False, False,  True,  True,
        True, False, False,  True, False, False,  True, False,  True,
       False,  True, False, False, False, False,  True,  True,  True,
       False,  True, False,  True, False, False,  True, False,  True,
       False, False,  True,  True, False,  True, False,  True,  True,
       False, False, False,  True,  True,  True,  True, False,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
       False, False, False,  True,  True,  True, False, False, False,
       False, False, False,  True,  True, False, False, False, False,
       False, False,

In [18]:
# Display the elimination ranking of every input feature (1 = kept)
selector.ranking_

array([ 1,  9,  1,  1,  1,  1, 16,  1,  1, 43,  9, 30,  1,  1,  1,  1,  1,
        1, 38,  1,  1,  8, 45, 12, 10, 25,  1, 17,  1,  1,  1,  3, 23, 42,
        1,  1,  1,  1,  1,  1, 21,  8,  1,  1,  1,  1, 31,  1,  1,  1, 46,
       38,  1,  1,  1, 26, 20,  1, 17, 19,  1, 22,  1, 37,  1, 21, 33, 13,
       32,  1,  1,  1, 11,  1, 36,  1, 18, 18,  1,  3,  1,  2, 16,  1,  1,
       27,  1, 47,  1,  1,  2, 25, 35,  1,  1,  1,  1, 13,  1,  1,  1,  1,
       30, 11, 15,  1,  1,  1, 40, 28, 14,  1,  1,  1, 20, 39, 35, 40, 33,
       32,  1,  1, 41,  6, 22, 23,  7, 39,  1, 28,  1,  1, 45, 42, 43, 24,
       29, 41,  5,  4, 24, 36,  1,  1,  1,  1, 15, 14, 37, 27,  1, 10, 29,
       26,  1,  1,  1,  1,  1,  1,  1,  1, 34, 34, 44,  6,  5, 31, 19,  1,
        1, 47,  1,  7,  1,  1,  1,  1, 44,  1, 12, 46,  1,  4])

In [26]:
# Tabulate the RFE outcome: feature name, whether it was kept, and its ranking.
# NOTE(review): this pairs X.columns with the selector outputs positionally, so
# len(X.columns) must equal the number of features the selector was fitted on.
# The fit log in this notebook starts at 184 features while X.shape is shown with
# 183 columns — re-run top to bottom and confirm they line up.
rfe_df=pd.DataFrame({'name':X.columns, 'selected': selector.support_,'ranking': selector.ranking_})
rfe_df

Unnamed: 0,name,selected,ranking
0,age,True,1
1,bmi,False,9
2,elective_surgery,True,1
3,height,True,1
4,pre_icu_los_days,True,1
...,...,...,...
179,d1_lbg,True,1
180,total_tests_increase,False,12
181,total_labs_increase,False,46
182,added_lbg,True,1


In [35]:
# Order by ranking, largest number first (the kept features, ranking 1, end up at the bottom)
rfe_df = rfe_df.sort_values(by='ranking', ascending=False)

In [37]:
# NOTE(review): the "Feature Selection" section further down reads '../data/rfe_df.csv'.
# This export is disabled, so that file has to exist on disk from an earlier run;
# re-enable the line below to regenerate it from the current rfe_df.
#rfe_df.to_csv('../data/rfe_df.csv')

## Feature Selection with RFECV

In [38]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV


In [40]:
# 5 stratified folds (reduced because of the large dataset).
# `random_state` only takes effect when `shuffle=True`; passing it without shuffling
# raises a ValueError in current scikit-learn, so shuffling is enabled explicitly.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Same L1 logistic regression as in the plain RFE run above
estimator = LogisticRegression(penalty='l1',solver='liblinear',max_iter=200,random_state=42)
# Cross-validated RFE: removes 2 features per step, scores each subset by ROC AUC,
# and uses all available cores.
selector_cv = RFECV(estimator, step=2, cv=cv, scoring='roc_auc', verbose=10, n_jobs=-1)

In [None]:
# Fit the cross-validated selector on the oversampled data (this cell has no recorded output)
selector_cv.fit(X_sm,y_sm)

In [None]:
# Report how many features RFECV kept, then display the boolean keep/drop mask
print('The number of selected features:',selector_cv.n_features_)
selector_cv.support_


In [None]:
# Number of entries in the RFECV ranking array (one per input feature)
len(selector_cv.ranking_)

## Feature Selection

In [7]:
# Load the saved RFE table from disk
results = pd.read_csv('../data/rfe_df.csv')
# Display it without the saved index column; the result is not assigned,
# so `results` itself still contains 'Unnamed: 0'
results.drop(columns=['Unnamed: 0'])

Unnamed: 0,name,selected,ranking
0,apache_3j_diag_type,False,47
1,d1_bilirubin_min,False,47
2,d1_sysbp_invasive_max,False,46
3,total_labs_increase,False,46
4,h1_inr_max,False,45
...,...,...,...
179,h1_heartrate_max,True,1
180,h1_calcium_min,True,1
181,h1_calcium_max,True,1
182,h1_mbp_min,True,1


In [31]:
# Features that RFE did not select are the ones to remove from the training frame.
# 'apache_3j_diag_type' is kept out of the drop list. Filtering it out (instead of
# calling list.remove) avoids a ValueError when that name is not present in
# rfe_df.csv, e.g. after the table has been regenerated from the current X.
cols_to_drop = [
    name
    for name in results.loc[results.selected==False, 'name'].tolist()
    if name != 'apache_3j_diag_type'
]

In [27]:
# Preview the predictor frame again before dropping the unselected columns from `train`
X.head()

Unnamed: 0,age,bmi,elective_surgery,height,pre_icu_los_days,weight,albumin_apache,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_mi
n,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_diag_int,critically_ill,bun_to_creatinine,h1_ttl_tests,d1_ttl_tests,h1_num_labs,d1_num_labs,h1_lbg,d1_lbg,total_tests_increase,total_labs_increase,added_lbg,chronic_diagnosis
0,68.0,22.73,0,180.3,0.541667,73.9,2.3,0,0.0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0.0,40.0,,,,,36.0,134.0,39.3,,0.0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1,0.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,502,0,12.35,20,54,0,28,0,0,34,28,0,1.0
1,77.0,27.42,0,160.0,0.927778,70.2,,0,0.0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0.0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,,1.0,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,,,,,9.0,9.0,8.6,8.6,0.56,0.56,145.0,143.0,27.0,27.0,11.3,11.3,36.9,36.9,1.3,1.3,3.5,3.5,557.0,557.0,4.2,4.2,145.0,145.0,12.7,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,51.0,0.47,0.29,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,203,0,15.49,54,58,26,30,1,1,4,4,0,1.0
2,25.0,31.95,0,172.7,0.000694,95.3,,0,0.0,,,,,3.0,6.0,0.0,5.0,,102.0,,0.0,68.0,,,,,37.0,,36.7,,0.0,,,,88.0,48.0,88.0,48.0,96.0,68.0,,,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,148.0,105.0,37.0,36.7,,,88.0,58.0,88.0,58.0,96.0,78.0,,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,703,0,,20,20,0,0,0,0,0,0,0,0.0
3,81.0,22.64,1,165.1,0.000694,61.7,,1,0.0,,,,0.6,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1.0,60.0,30.0,30.0,142.0,7.39,4.0,,34.8,,1.0,8.0,62.0,30.0,48.0,42.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,62.0,44.0,,,100.0,96.0,92.0,71.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,,35.6,34.8,,,,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.6,1.1,,,198.0,43.0,5.0,3.5,,,9.0,8.0,,,,,,,,,,,,,,,11.6,11.6,34.0,34.0,1.6,1.1,,,43.0,43.0,,,,,8.8,8.8,37.0,27.0,7.44,7.34,337.0,102.0,342.5,236.666667,36.0,33.0,7.37,7.34,337.0,265.0,337.0,337.0,0.04,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1206,1,,38,48,10,14,1,1,10,4,0,0.0
4,19.0,,0,188.0,0.073611,,,0,0.0,,,,,,,1.0,,,60.0,,0.0,103.0,,,,,16.0,,36.7,,0.0,,,,99.0,57.0,99.0,57.0,89.0,60.0,,,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,147.0,120.0,37.2,36.7,,,99.0,68.0,99.0,68.0,89.0,76.0,,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,601,0,,16,20,0,0,0,0,4,0,0,0.0


In [36]:
# Shape before and after removing the columns RFE rejected
print(train.shape)
train_rfe = train.drop(columns=cols_to_drop)  # `columns=` already implies the column axis
print(train_rfe.shape)

(91713, 196)
(91713, 105)


In [37]:
# Persist the reduced training frame (row index is not written)
train_rfe.to_csv('../data/train_ph2_rfe.csv',index=False)

### Save the dataframe

In [None]:
# NOTE(review): `df` and `new_df2` are not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh "Restart & Run All" — it looks like a leftover
# from another notebook; confirm and remove or move it.
df.shape

# NOTE(review): this writes to the same path the notebook loads `train` from at the top;
# running it would overwrite the input file.
df.to_csv('../data/train_ph2_newfeatures.csv', index=False)

new_df2.to_csv('../data/train_ph3_newfeatures.csv', index=False)