In [1]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp
import missingno as msno

#visualizing results
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context('poster', rc={'font.size':35,
                              'axes.titlesize':50,
                              'axes.labelsize':35})

#machine learning
import category_encoders as ce
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [2]:
# Folder that holds the competition CSVs. Set the GOSSIS_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's local folder, so existing behaviour is unchanged when it is unset.
DATA_DIR = os.environ.get('GOSSIS_DATA_DIR',
                          'C:/Users/Schindler/Documents/ProgrammingFun/GOSSIS_kaggle')

train_data_path = DATA_DIR + '/training_v2.csv'
unlabeled_data_path = DATA_DIR + '/unlabeled.csv'

In [3]:
# Load the labelled training set (read_csv already returns a DataFrame).
train_supplied = pd.read_csv(train_data_path)

print('Original data shape:\n', train_supplied.shape, '\n')
print('Group value counts:\n', train_supplied['hospital_death'].value_counts(), '\n')

# Drop identifier / metadata columns (the original author also describes
# some of these as having zero variance).
meta_cols = ['encounter_id', 'patient_id', 'readmission_status', 'hospital_id', 'icu_id']
train_supplied = train_supplied.drop(columns=meta_cols)

# In these columns every value that is not strictly positive (i.e. zeros as
# well as negatives) is replaced by NaN.
cols_with_zeros = ['pre_icu_los_days', 'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']
for col in cols_with_zeros:
    train_supplied[col] = train_supplied[col].where(train_supplied[col] > 0)

train_supplied.head()

Original data shape:
 (91713, 186) 

Group value counts:
 0    83798
1     7915
Name: hospital_death, dtype: int64 



Unnamed: 0,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,pre_icu_los_days,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1
_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,0,68.0,22.73,0,Caucasian,M,180.3,Floor,Floor,admit,CTICU,0.541667,73.9,2.3,113.0,502.01,0,0.0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0.0,40.0,,,,,36.0,134.0,39.3,,0.0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1,0.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,0,77.0,27.42,0,Caucasian,F,160.0,Floor,Floor,admit,Med-Surg ICU,0.927778,70.2,,108.0,203.01,0,0.0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0.0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,,1.0,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,,,,,9.0,9.0,8.6,8.6,0.56,0.56,145.0,143.0,27.0,27.0,11.3,11.3,36.9,36.9,1.3,1.3,3.5,3.5,557.0,557.0,4.2,4.2,145.0,145.0,12.7,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,51.0,0.47,0.29,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,0,25.0,31.95,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,admit,Med-Surg ICU,0.000694,95.3,,122.0,703.03,0,0.0,,,,,3.0,6.0,0.0,5.0,,102.0,,0.0,68.0,,,,,37.0,,36.7,,0.0,,,,88.0,48.0,88.0,48.0,96.0,68.0,,,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,148.0,105.0,37.0,36.7,,,88.0,58.0,88.0,58.0,96.0,78.0,,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,0,81.0,22.64,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,admit,CTICU,0.000694,61.7,,203.0,1206.03,1,0.0,,,,0.6,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1.0,60.0,30.0,30.0,142.0,7.39,4.0,,34.8,,1.0,8.0,62.0,30.0,48.0,42.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,62.0,44.0,,,100.0,96.0,92.0,71.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,,35.6,34.8,,,,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.6,1.1,,,198.0,43.0,5.0,3.5,,,9.0,8.0,,,,,,,,,,,,,,,11.6,11.6,34.0,34.0,1.6,1.1,,,43.0,43.0,,,,,8.8,8.8,37.0,27.0,7.44,7.34,337.0,102.0,342.5,236.666667,36.0,33.0,7.37,7.34,337.0,265.0,337.0,337.0,0.04,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,0,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,admit,Med-Surg ICU,0.073611,,,119.0,601.01,0,0.0,,,,,,,,,,60.0,,0.0,103.0,,,,,16.0,,36.7,,0.0,,,,99.0,57.0,99.0,57.0,89.0,60.0,,,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,147.0,120.0,37.2,36.7,,,99.0,68.0,99.0,68.0,89.0,76.0,,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


### explore nans and create related features 

In [4]:
#explore missing data
data = train_supplied


def _mean_pct_missing(frame):
    """Average, over all columns, of the percentage of missing values in `frame`."""
    return (frame.isna().sum() / frame.shape[0] * 100).mean()


# one summary number per outcome class (0 = survived, 1 = died)
na_0_mean = _mean_pct_missing(data[data['hospital_death'] == 0])
na_1_mean = _mean_pct_missing(data[data['hospital_death'] == 1])
print(na_0_mean, na_1_mean)

# Survivors have somewhat more missing data (not surprising: the poorer the
# health, the more tests get ordered), so keep the per-row count of missing
# values as a feature.
train_supplied['na_count'] = train_supplied.isna().sum(axis=1)

35.17367929903519 30.002338381211977


In [5]:
#are there parameters that have more missing values for yes vs no death
data = train_supplied

# split once by outcome instead of re-filtering for every statistic
survived_rows = data[data['hospital_death'] == 0]
died_rows = data[data['hospital_death'] == 1]

# percentage of missing values per column, for each outcome
na_0 = (survived_rows.isna().sum() / survived_rows.shape[0]) * 100
na_1 = (died_rows.isna().sum() / died_rows.shape[0]) * 100
# despite the "_diff" suffix these hold the number of NON-missing values per column
na_0_diff = survived_rows.shape[0] - survived_rows.isna().sum()
na_1_diff = died_rows.shape[0] - died_rows.isna().sum()

# one row per column of `data`, one column per statistic
data_na_perc = pd.DataFrame(data=[na_0, na_1, na_0_diff, na_1_diff]).T
data_na_perc = data_na_perc.sort_values(by=0, ascending=False)
data_na_perc.columns = ['na_0', 'na_1', 'na_0_diff', 'na_1_diff']
data_na_perc['diff'] = data_na_perc['na_0'] - data_na_perc['na_1']
data_na_perc = data_na_perc.sort_values(by='diff', ascending=False)

# Per-row count of missing values, restricted to the parameters whose
# missing-rate is more than 25 (resp. 10) percentage points higher for
# survivors than for deaths.
# NOTE(review): the parameter lists are chosen using the label on the whole
# training frame; consider re-deriving them inside each CV fold.
param_25_diff = data_na_perc.index[data_na_perc['diff'] > 25]
param_10_diff = data_na_perc.index[data_na_perc['diff'] > 10]
train_supplied['param_25_diff_count'] = train_supplied[param_25_diff].isna().sum(axis=1)
train_supplied['param_10_diff_count'] = train_supplied[param_10_diff].isna().sum(axis=1)

In [6]:
#remove features that have more than 50% missing data
print(train_supplied.shape)

# fraction of missing values per column, most-missing first
missing_fraction = (train_supplied.isnull().sum() / len(train_supplied)).sort_values(ascending=False)
train_missing = missing_fraction.index[missing_fraction > 0.50]
print('There are %d columns with more than 50%% missing values' % len(train_missing))

# drop() returns a new frame, so train_supplied itself is left untouched
train_supplied_missing = train_supplied.drop(columns=train_missing)
print(train_supplied_missing.shape)

(91713, 184)
There are 74 columns with more than 50% missing values
(91713, 110)


### feature mapping

In [7]:
def age_mapping(age):
    """Convert an age in years into the notebook's age points.

    Brackets (author's table, described as based on APACHE III):
    <=44 -> 0, 45-59 -> 5, 60-64 -> 11, 65-69 -> 13, 70-74 -> 16,
    75-84 -> 17, >=85 -> 24. A fractional age that lies between two
    published bounds (e.g. 44.5) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `age == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(age):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((45, 0), (60, 5), (65, 11), (70, 13), (75, 16), (85, 17)):
        if age < upper:
            return points
    return 24
    
# Score the single input column directly; a row-wise DataFrame.apply would
# materialise every column of every row just to read one value.
train_supplied_missing['age_score'] = train_supplied_missing['age'].apply(age_mapping)

In [36]:
def bmi_mapping(bmi):
    """Convert a BMI value into the notebook's BMI points.

    Brackets (author's table, described as based on APACHE III):
    <=20 -> 5, 20.1-25 -> 0, 25.1-30 -> 1, 30.1-35 -> 3, 35.1-40 -> 7,
    >=40.1 -> 15. A value lying between two published bounds (e.g. 20.05)
    is placed in the lower bracket instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `bmi == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(bmi):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((20.1, 5), (25.1, 0), (30.1, 1), (35.1, 3), (40.1, 7)):
        if bmi < upper:
            return points
    return 15
    
# Score the single input column directly; a row-wise DataFrame.apply would
# materialise every column of every row just to read one value.
train_supplied_missing['bmi_score'] = train_supplied_missing['bmi'].apply(bmi_mapping)

In [8]:
def chronic_health_mapping(row, health_conds):
    """Sum the comorbidity points of one patient row.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain every column named in `health_conds`.
    health_conds : list of str
        Names of the comorbidity flag columns. All eight flags scored below
        must be present, otherwise indexing raises KeyError.

    Returns
    -------
    int or float
        Sum of the points of every flag that is > 0 (a missing flag
        contributes nothing), or np.nan when every flag is missing.
    """
    df_int = row[health_conds]

    # The all-missing case used to compare the NaN count against 9 although
    # only eight flags exist, so it never triggered and such rows scored 0.
    if df_int.isna().all():
        return np.nan

    # points per condition (author's table, described as based on APACHE II)
    weights = {
        'aids': 23,
        'cirrhosis': 4,
        'diabetes_mellitus': 2,
        'hepatic_failure': 16,
        'immunosuppression': 10,
        'leukemia': 10,
        'lymphoma': 13,
        'solid_tumor_with_metastasis': 11,
    }
    # NaN > 0 is False, so individual missing flags are simply skipped
    return sum(points for name, points in weights.items() if df_int[name] > 0)

health_conds = ['aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
       'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis']

# The scorer only reads the comorbidity flags, so iterate row-wise over that
# narrow numeric slice instead of the full mixed-dtype frame.
train_supplied_missing['chronic_health_score'] = train_supplied_missing[health_conds].apply(
    lambda row: chronic_health_mapping(row, health_conds), axis=1)

In [9]:
#create feature that is count of disease diagnosis
diseases = [
    'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
    'immunosuppression', 'leukemia', 'lymphoma',
    'solid_tumor_with_metastasis',
]

# skipna=False leaves the count as NaN for any row with a missing flag,
# i.e. only rows with all flags recorded get a count.
train_supplied_missing['disease_count'] = train_supplied_missing[diseases].sum(axis=1, skipna=False)

def elective_surgery_mapping(elective_surgery, disease_count):
    """Score the admission type for patients with chronic disease.

    Rules (author's table, described as based on APACHE II):
    no recorded disease -> 0; at least one disease and elective surgery
    (flag == 1) -> 2; at least one disease and no elective surgery
    (flag == 0) -> 5.

    Returns np.nan when `disease_count` is missing (NaN/None), negative, or
    when `elective_surgery` is neither 0 nor 1.
    """
    # `disease_count == np.nan` is always False, so use pd.isna to detect
    # missing counts (this also makes None safe instead of raising).
    if pd.isna(disease_count):
        return np.nan
    if disease_count == 0:
        return 0
    if disease_count > 0:
        if elective_surgery == 1:
            return 2
        if elective_surgery == 0:
            return 5
    return np.nan
    
# Only two columns are read, so iterate row-wise over that narrow slice
# instead of the full mixed-dtype frame.
train_supplied_missing['elective_surgery_score'] = train_supplied_missing[['elective_surgery', 'disease_count']].apply(
    lambda row: elective_surgery_mapping(row['elective_surgery'], row['disease_count']), axis=1)

In [10]:
def GCS_mapping(row, gcs_params):
    """Compute a Glasgow Coma Scale total for one patient row.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain every column named in `gcs_params`.
    gcs_params : list of str
        Must include 'gcs_eyes_apache', 'gcs_motor_apache',
        'gcs_verbal_apache' and 'gcs_unable_apache'.

    Returns
    -------
    float
        np.nan when every GCS field is missing; 3 when
        `gcs_unable_apache` == 1 (the original author's convention for
        patients who cannot respond, e.g. under sedation); otherwise the sum
        of the recorded eye/motor/verbal components (np.nan if none of the
        three is recorded).
    """
    df_int = row[gcs_params]

    # based off of info from APACHE III scoring system and google search
    if df_int.isna().all():
        return np.nan

    if df_int['gcs_unable_apache'] == 1:
        return 3

    # Previously a missing `gcs_unable_apache` flag fell through every branch
    # and produced 0, which is not a valid GCS total; sum whatever components
    # are recorded instead. min_count=1 keeps the result NaN (rather than 0)
    # when no component is available.
    components = df_int[['gcs_eyes_apache', 'gcs_motor_apache', 'gcs_verbal_apache']]
    return components.sum(min_count=1)

gcs_params = ['gcs_eyes_apache',
       'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache']

# The scorer only reads the four GCS columns, so iterate row-wise over that
# narrow numeric slice instead of the full mixed-dtype frame.
train_supplied_missing['GCS_score'] = train_supplied_missing[gcs_params].apply(
    lambda row: GCS_mapping(row, gcs_params), axis=1)

In [11]:
def bun_mapping(bun_apache):
    """Convert a blood-urea-nitrogen value into the notebook's BUN points.

    Brackets (author's table, described as based on APACHE III):
    <=6.1 -> 0, 6.2-7.1 -> 2, 7.2-14.3 -> 7, 14.4-28.5 -> 11, >=28.6 -> 12.
    A value lying between two published bounds (e.g. 6.15) is placed in the
    lower bracket instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # NOTE(review): confirm the unit of the column passed in matches the unit
    # these cut-offs were written for.
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(bun_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((6.2, 0), (7.2, 2), (14.4, 7), (28.6, 11)):
        if bun_apache < upper:
            return points
    return 12
    
# Scored from the day-1 maximum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['bun_score'] = train_supplied_missing['d1_bun_max'].apply(bun_mapping)

In [12]:
def creatinine_mapping(creatinine_apache, arf_apache):
    """Convert a creatinine value into the notebook's creatinine points.

    When `arf_apache` == 1: <=0.40 -> 3, 0.41-1.40 -> 0, 1.41-1.94 -> 4,
    >=1.95 -> 7. Otherwise (flag 0 or missing): <=1.40 -> 0, >=1.41 -> 10.
    A value lying between two published bounds (e.g. 1.945) is placed in the
    lower bracket instead of silently scoring NaN.

    Returns np.nan when `creatinine_apache` is missing (NaN/None).
    """
    # NOTE(review): confirm against the APACHE III table which renal-failure
    # status each of the two scales belongs to; the branches may be swapped.
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(creatinine_apache):
        return np.nan

    if arf_apache == 1:
        # (exclusive upper bound, points)
        for upper, points in ((0.41, 3), (1.41, 0), (1.95, 4)):
            if creatinine_apache < upper:
                return points
        return 7

    # a missing arf flag is treated the same as "no arf"
    return 0 if creatinine_apache < 1.41 else 10
    
# Scored from the day-1 maximum plus the arf flag; only these two columns
# are read, so iterate over that narrow slice instead of the full frame.
train_supplied_missing['creatinine_score'] = train_supplied_missing[['d1_creatinine_max', 'arf_apache']].apply(
    lambda row: creatinine_mapping(row['d1_creatinine_max'], row['arf_apache']), axis=1)

In [13]:
def glucose_mapping(glucose_apache):
    """Convert a glucose value into the notebook's glucose points.

    Brackets (author's table, described as based on APACHE III):
    <=39 -> 8, 39.1-59 -> 7, 59.1-199 -> 0, 199.1-349 -> 3, >=349.1 -> 5.
    A value lying between two published bounds (e.g. 39.05) is placed in the
    lower bracket instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(glucose_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((39.1, 8), (59.1, 7), (199.1, 0), (349.1, 3)):
        if glucose_apache < upper:
            return points
    return 5
    
# Scored from the day-1 maximum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['glucose_score'] = train_supplied_missing['d1_glucose_max'].apply(glucose_mapping)

In [14]:
def heart_rate_mapping(heart_rate_apache):
    """Convert a heart rate into the notebook's heart-rate points.

    Brackets (author's table, described as based on APACHE III):
    <=39 -> 8, 40-49 -> 5, 50-99 -> 0, 100-109 -> 1, 110-119 -> 5,
    120-139 -> 7, 140-154 -> 13, >=155 -> 17. A fractional value lying
    between two published bounds (e.g. 39.5) is placed in the lower bracket
    instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(heart_rate_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    brackets = ((40, 8), (50, 5), (100, 0), (110, 1), (120, 5), (140, 7), (155, 13))
    for upper, points in brackets:
        if heart_rate_apache < upper:
            return points
    return 17
    
# Scored from the day-1 maximum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['heart_rate_score'] = train_supplied_missing['d1_heartrate_max'].apply(heart_rate_mapping)

In [15]:
def hematocrit_mapping(hematocrit_apache):
    """Convert a hematocrit value into the notebook's hematocrit points.

    Brackets (author's table, described as based on APACHE III):
    <=40.9 -> 3, 41-49 -> 0, >=49.1 -> 3. A value lying between two
    published bounds (e.g. 40.95) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(hematocrit_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((41, 3), (49.1, 0)):
        if hematocrit_apache < upper:
            return points
    return 3
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['hematocrit_score'] = train_supplied_missing['d1_hematocrit_min'].apply(hematocrit_mapping)

In [16]:
def map_mapping(map_apache):
    """Convert a mean arterial pressure into the notebook's MAP points.

    Brackets (author's table, described as based on APACHE III):
    <=39 -> 8, 40-49 -> 5, 50-99 -> 0, 100-109 -> 1, 110-119 -> 5,
    120-139 -> 7, 140-154 -> 13, >=155 -> 17. A fractional value lying
    between two published bounds (e.g. 39.5) is placed in the lower bracket
    instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # NOTE(review): these cut-offs and points are identical to the ones in
    # heart_rate_mapping; confirm they are the intended table for MAP.
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(map_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    brackets = ((40, 8), (50, 5), (100, 0), (110, 1), (120, 5), (140, 7), (155, 13))
    for upper, points in brackets:
        if map_apache < upper:
            return points
    return 17
    
# Scored from the day-1 minimum mean blood pressure; applying on the single
# column avoids materialising every full row.
train_supplied_missing['map_score'] = train_supplied_missing['d1_mbp_min'].apply(map_mapping)

In [17]:
def resprate_mapping(resprate_apache, ventilated_apache):
    """Convert a respiratory rate into the notebook's respiratory points.

    Ventilated (`ventilated_apache` == 1): <=6 -> 17, 6.1-24 -> 0,
    24.1-34 -> 6, 34.1-39 -> 9, 39.1-49 -> 11, >=49.1 -> 18.
    Otherwise (flag 0 or missing): <=5 -> 17, 5.1-11 -> 8, 11.1-13 -> 7,
    13.1-24 -> 0, 24.1-34 -> 6, 34.1-39 -> 9, 39.1-49 -> 11, >=49.1 -> 18.
    (Author's table, described as based on APACHE III.) A value lying
    between two published bounds (e.g. 24.05) is placed in the lower bracket
    instead of silently scoring NaN.

    Returns np.nan when `resprate_apache` is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(resprate_apache):
        return np.nan

    # (exclusive upper bound, points): each bracket ends where the next begins.
    if ventilated_apache == 1:  # as per scoring instructions
        brackets = ((6.1, 17), (24.1, 0), (34.1, 6), (39.1, 9), (49.1, 11))
    else:
        brackets = ((5.1, 17), (11.1, 8), (13.1, 7), (24.1, 0),
                    (34.1, 6), (39.1, 9), (49.1, 11))

    for upper, points in brackets:
        if resprate_apache < upper:
            return points
    return 18
    
# Scored from the day-1 maximum plus the ventilation flag; only these two
# columns are read, so iterate over that narrow slice instead of the full frame.
train_supplied_missing['resprate_score'] = train_supplied_missing[['d1_resprate_max', 'ventilated_apache']].apply(
    lambda row: resprate_mapping(row['d1_resprate_max'], row['ventilated_apache']), axis=1)

In [18]:
def sodium_mapping(sodium_apache):
    """Convert a sodium value into the notebook's sodium points.

    Brackets (author's table, described as based on APACHE III):
    <=134 -> 1, 134.1-154 -> 0, >=154.1 -> 1. A value lying between two
    published bounds (e.g. 134.05) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(sodium_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((134.1, 1), (154.1, 0)):
        if sodium_apache < upper:
            return points
    return 1
    
# Scored from the day-1 maximum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['sodium_score'] = train_supplied_missing['d1_sodium_max'].apply(sodium_mapping)

In [19]:
def temp_mapping(temp_apache):
    """Convert a body temperature into the notebook's temperature points.

    Brackets (author's table, described as based on APACHE III):
    <=32.99 -> 28, 33-33.49 -> 16, 33.5-33.99 -> 13, 34-34.99 -> 8,
    35-35.99 -> 2, 36-39.99 -> 0, >=40 -> 4. A value lying between two
    published bounds (e.g. 32.995) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(temp_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((33, 28), (33.5, 16), (34, 13), (35, 8), (36, 2), (40, 0)):
        if temp_apache < upper:
            return points
    return 4
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['temp_score'] = train_supplied_missing['d1_temp_min'].apply(temp_mapping)

In [20]:
def wbc_mapping(wbc_apache):
    """Convert a white-blood-cell count into the notebook's WBC points.

    Brackets (author's table, described as based on APACHE III):
    <=0.99 -> 19, 1-2.99 -> 5, 3-19.99 -> 0, 20-24.99 -> 1, >=25 -> 5.
    A value lying between two published bounds (e.g. 0.995) is placed in the
    lower bracket instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(wbc_apache):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((1, 19), (3, 5), (20, 0), (25, 1)):
        if wbc_apache < upper:
            return points
    return 5
    
# Scored from the day-1 maximum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['wbc_score'] = train_supplied_missing['d1_wbc_max'].apply(wbc_mapping)

In [21]:
def d1_calcium_mapping(d1_calcium_min):
    """Convert a calcium value into the notebook's calcium points.

    Brackets (author's table, described as based on APACHE III):
    <=8.2 -> 3, 8.3-10.3 -> 0, >=10.4 -> 3. A value lying between two
    published bounds (e.g. 8.25) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_calcium_min):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((8.3, 3), (10.4, 0)):
        if d1_calcium_min < upper:
            return points
    return 3
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_calcium_score'] = train_supplied_missing['d1_calcium_min'].apply(d1_calcium_mapping)

In [22]:
def d1_hco3_mapping(d1_hco3_min):
    """Convert a bicarbonate (HCO3) value into the notebook's HCO3 points.

    Brackets (author's table, described as based on APACHE III):
    <=22 -> 3, 22.1-30 -> 0, >=30.1 -> 3. A value lying between two
    published bounds (e.g. 22.05) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_hco3_min):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((22.1, 3), (30.1, 0)):
        if d1_hco3_min < upper:
            return points
    return 3
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_hco3_score'] = train_supplied_missing['d1_hco3_min'].apply(d1_hco3_mapping)

In [23]:
def d1_hemaglobin_mapping(d1_hemaglobin_min):
    """Convert a hemoglobin value into the notebook's hemoglobin points.

    Brackets (author's table, described as based on APACHE III):
    <=12 -> 3, 12.1-18 -> 0, >=18.1 -> 3. A value lying between two
    published bounds (e.g. 12.05) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_hemaglobin_min):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((12.1, 3), (18.1, 0)):
        if d1_hemaglobin_min < upper:
            return points
    return 3
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_hemaglobin_score'] = train_supplied_missing['d1_hemaglobin_min'].apply(d1_hemaglobin_mapping)

In [24]:
def d1_platlet_mapping(d1_platelets_min):
    """Convert a platelet count into the notebook's platelet points.

    Brackets (author's table, described as based on APACHE III):
    <=20 -> 15, 20.1-50 -> 7, 50.1-100 -> 3, 100.1-150 -> 1,
    150.1-400 -> 0, 400.1-450 -> 1, >=450.1 -> 3. A value lying between two
    published bounds (e.g. 20.05) is placed in the lower bracket instead of
    silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_platelets_min):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    brackets = ((20.1, 15), (50.1, 7), (100.1, 3), (150.1, 1), (400.1, 0), (450.1, 1))
    for upper, points in brackets:
        if d1_platelets_min < upper:
            return points
    return 3
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_platlet_score'] = train_supplied_missing['d1_platelets_min'].apply(d1_platlet_mapping)

In [25]:
def d1_potassium_mapping(d1_potassium_max):
    """Convert a potassium value into the notebook's potassium points.

    Brackets (author's table, described as based on APACHE III):
    <=2.4 -> 5, 2.5-3 -> 3, 3.1-3.4 -> 1, 3.5-5.1 -> 0, 5.2-6 -> 1,
    6.1-7 -> 3, >=7.1 -> 5. A value lying between two published bounds
    (e.g. 2.45) is placed in the lower bracket instead of silently scoring
    NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_potassium_max):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    brackets = ((2.5, 5), (3.1, 3), (3.5, 1), (5.2, 0), (6.1, 1), (7.1, 3))
    for upper, points in brackets:
        if d1_potassium_max < upper:
            return points
    return 5
    
# Scored from the day-1 maximum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_potassium_score'] = train_supplied_missing['d1_potassium_max'].apply(d1_potassium_mapping)

In [26]:
def d1_diasbp_mapping(d1_diasbp_min):
    """Convert a diastolic blood pressure into the notebook's points.

    Brackets (author's table, described as based on APACHE III):
    <=80 -> 0, 81-90 -> 1, 91-120 -> 2, >=121 -> 5. A fractional value
    lying between two published bounds (e.g. 80.5) is placed in the lower
    bracket instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_diasbp_min):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((81, 0), (91, 1), (121, 2)):
        if d1_diasbp_min < upper:
            return points
    return 5
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_diasbp_score'] = train_supplied_missing['d1_diasbp_min'].apply(d1_diasbp_mapping)

In [27]:
def d1_sysbp_mapping(d1_sysbp_min):
    """Convert a systolic blood pressure into the notebook's points.

    Brackets (author's table, described as based on APACHE III):
    <=120 -> 0, above 120 up to 129 -> 1, 130-139 -> 2, 140-180 -> 5,
    >=181 -> 10. A fractional value lying between two published bounds
    (e.g. 129.5) is placed in the lower bracket instead of silently scoring
    NaN.

    Returns np.nan when the input is missing (NaN/None).
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_sysbp_min):
        return np.nan
    # 120 itself belongs to the first bracket (the original's second bracket
    # also listed 120 but could never match it).
    if d1_sysbp_min <= 120:
        return 0
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((130, 1), (140, 2), (181, 5)):
        if d1_sysbp_min < upper:
            return points
    return 10
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_sysbp_score'] = train_supplied_missing['d1_sysbp_min'].apply(d1_sysbp_mapping)

In [28]:
def d1_spo2_mapping(d1_spo2_min):
    """Convert an oxygen saturation (SpO2) into the notebook's points.

    Brackets (author's table, described as based on APACHE III):
    <=85 -> 10, 86-90 -> 3, 91-94 -> 1, 95-100 -> 0. A fractional value
    lying between two published bounds (e.g. 85.5) is placed in the lower
    bracket instead of silently scoring NaN.

    Returns np.nan when the input is missing (NaN/None) or above 100.
    """
    # `x == np.nan` is always False, so missing input is detected with pd.isna.
    if pd.isna(d1_spo2_min):
        return np.nan
    # (exclusive upper bound, points): each bracket ends where the next begins.
    for upper, points in ((86, 10), (91, 3), (95, 1)):
        if d1_spo2_min < upper:
            return points
    # values above 100 keep scoring NaN, as in the original table
    return 0 if d1_spo2_min <= 100 else np.nan
    
# Scored from the day-1 minimum; applying on the single column avoids
# materialising every full row.
train_supplied_missing['d1_spo2_score'] = train_supplied_missing['d1_spo2_min'].apply(d1_spo2_mapping)

In [29]:
#create difference scores for each
# new column -> stem of the day-1 "<stem>_max" / "<stem>_min" pair whose
# difference (max - min) it stores
diff_sources = {
    'bun_diff': 'd1_bun',
    'creatinine_diff': 'd1_creatinine',
    'glucose_diff': 'd1_glucose',
    'heart_rate_diff': 'd1_heartrate',
    'hematocrit_diff': 'd1_hematocrit',
    'map_diff': 'd1_mbp',
    'resprate_diff': 'd1_resprate',
    'sodium_diff': 'd1_sodium',
    'temp_diff': 'd1_temp',
    'wbc_diff': 'd1_wbc',
    'd1_calcium_diff': 'd1_calcium',
    'd1_hco3_diff': 'd1_hco3',
    'd1_hemaglobin_diff': 'd1_hemaglobin',
    'd1_platlet_diff': 'd1_platelets',
    'd1_potassium_diff': 'd1_potassium',
    'd1_diasbp_diff': 'd1_diasbp',
    'd1_sysbp_diff': 'd1_sysbp',
    'd1_spo2_diff': 'd1_spo2',
}

# dict order is insertion order, so columns are appended in the same sequence
for diff_col, stem in diff_sources.items():
    train_supplied_missing[diff_col] = (
        train_supplied_missing[stem + '_max'] - train_supplied_missing[stem + '_min']
    )


### encode categorical variables

In [39]:
#encode categorical variables
cat_cols = [
    'ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source',
    'icu_stay_type', 'icu_type', 'apache_2_diagnosis', 'apache_3j_diagnosis',
    'apache_3j_bodysystem', 'apache_2_bodysystem',
]

# Target-encode the columns above on a copy of the engineered frame.
# The encoder is fitted and applied on the same rows here, so it will need
# to be refitted on the final feature set.
data_cat_train = train_supplied_missing.copy()

# X_train is the whole frame (it still contains the 'hospital_death' column);
# only the columns listed in cat_cols are passed to the encoder via `cols`.
X_train = data_cat_train
Y_train = data_cat_train['hospital_death']

enc = ce.TargetEncoder(cols=cat_cols,
                       handle_missing='return_nan',
                       handle_unknown='return_nan')
enc = enc.fit(X_train, Y_train)

train_supplied_missing_catenc = enc.transform(X_train)
print(train_supplied_missing_catenc.shape)
train_supplied_missing_catenc.head()

(91713, 152)


Unnamed: 0,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,pre_icu_los_days,weight,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bun_apache,creatinine_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,resprate_apache,sodium_apache,temp_apache,ventilated_apache,wbc_apache,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,na_count,param_25_diff_count,param_10_diff_count,age_score,chronic_health_score,disease_count,elective_surgery_score,GCS_score,bun_score,creatinine_score,glucose_score,heart_rate_score,hematocrit_score,map_score,resprate_score,sodium_score,temp_score,wbc_score,d1_calcium_score,d1_hco3_score,d1_hemaglo
bin_score,d1_platlet_score,d1_potassium_score,d1_diasbp_score,d1_sysbp_score,d1_spo2_score,bun_diff,creatinine_diff,glucose_diff,heart_rate_diff,hematocrit_diff,map_diff,resprate_diff,sodium_diff,temp_diff,wbc_diff,d1_calcium_diff,d1_hco3_diff,d1_hemaglobin_diff,d1_platlet_diff,d1_potassium_diff,d1_diasbp_diff,d1_sysbp_diff,d1_spo2_diff,bmi_score
0,0,68.0,22.73,0,0.087262,0.084417,180.3,0.138796,0.134136,0.085168,0.060205,0.541667,73.9,0.157922,0.104856,0,0.0,31.0,2.51,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0.0,40.0,36.0,134.0,39.3,0.0,14.1,68.0,37.0,68.0,37.0,119.0,72.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,131.0,73.0,131.0,73.0,39.9,37.2,68.0,63.0,68.0,63.0,119.0,108.0,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,131.0,115.0,131.0,115.0,39.5,37.5,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,0.1,0.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.157922,0.108383,60,13,23,13.0,2,1.0,5.0,13.0,12.0,10.0,0.0,5.0,3.0,5.0,6.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,10.0,1.0,0.28,59.0,47.0,0.0,43.0,24.0,2.0,2.7,0.0,1.1,4.0,0.0,0.0,0.6,31.0,58.0,26.0,0.0
1,0,77.0,27.42,0,0.087262,0.088373,160.0,0.138796,0.134136,0.085168,0.087495,0.927778,70.2,0.197111,0.197111,0,0.0,9.0,0.56,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0.0,46.0,33.0,145.0,35.1,1.0,12.7,95.0,31.0,95.0,31.0,118.0,72.0,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,159.0,67.0,159.0,67.0,36.3,35.1,61.0,48.0,61.0,48.0,114.0,100.0,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,95.0,71.0,95.0,71.0,36.3,36.3,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,0.47,0.29,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.112068,0.112068,19,0,8,17.0,2,1.0,5.0,5.0,7.0,0.0,0.0,5.0,3.0,8.0,6.0,0.0,2.0,1.0,3.0,0.0,3.0,3.0,0.0,0.0,0.0,10.0,2.0,0.15,17.0,46.0,0.8,82.0,20.0,0.0,1.2,10.6,0.6,1.0,0.2,70.0,0.4,64.0,92.0,30.0,1.0
2,0,25.0,31.95,0,0.087262,0.088373,172.7,0.087306,0.086385,0.085168,0.087495,0.000694,95.3,0.008493,0.007727,0,0.0,,,3.0,6.0,0.0,5.0,,102.0,,0.0,68.0,37.0,,36.7,0.0,,88.0,48.0,88.0,48.0,96.0,68.0,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,148.0,105.0,148.0,105.0,37.0,36.7,88.0,58.0,88.0,58.0,96.0,78.0,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015163,0.015163,104,15,38,0.0,0,0.0,0.0,14.0,,,,0.0,,0.0,0.0,,0.0,,,,,,,0.0,0.0,1.0,,,,28.0,,34.0,13.0,,0.3,,,,,,,40.0,43.0,7.0,3.0
3,0,81.0,22.64,1,0.087262,0.088373,165.1,0.035251,0.0373,0.085168,0.060205,0.000694,61.7,0.028886,0.019481,1,0.0,,,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1.0,60.0,4.0,,34.8,1.0,8.0,48.0,42.0,48.0,42.0,116.0,92.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,,,100.0,96.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,,,35.6,34.8,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,198.0,43.0,5.0,3.5,,,9.0,8.0,0.04,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079669,0.108383,48,2,8,17.0,0,0.0,0.0,15.0,,,0.0,5.0,3.0,0.0,0.0,,8.0,0.0,,,3.0,7.0,0.0,0.0,0.0,0.0,,,97.0,24.0,8.1,0.0,16.0,,3.2,1.0,,,2.7,155.0,1.5,6.0,74.0,5.0,0.0
4,0,19.0,,0,0.087262,0.084417,188.0,,0.086385,0.085168,0.087495,0.073611,,0.08774,0.091518,0,0.0,,,,,,,,60.0,,0.0,103.0,16.0,,36.7,0.0,,99.0,57.0,99.0,57.0,89.0,60.0,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,147.0,120.0,147.0,120.0,37.2,36.7,99.0,68.0,99.0,68.0,89.0,76.0,104.0,92.0,104.0,92.0,,,100.0,100.0,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067413,0.067413,115,15,38,0.0,0,0.0,0.0,,,,,0.0,,0.0,0.0,,0.0,,,,,,,0.0,0.0,0.0,,,,29.0,,14.0,2.0,,0.5,,,,,,,42.0,27.0,4.0,


In [40]:
#persist the target-encoded, not yet imputed training table (the row index is written too, pandas default)
train_noimpute_csv = 'train_data_mine_noimpute.csv'
train_supplied_missing_catenc.to_csv(train_noimpute_csv)

In [41]:
#correlation of every column with the outcome: build the full matrix, then show the hospital_death column
corr_matrix = train_supplied_missing_catenc.corr()
corr_matrix['hospital_death']

hospital_death                   1.000000
age                              0.111017
bmi                             -0.031247
elective_surgery                -0.093574
ethnicity                        0.014065
gender                           0.007024
height                          -0.019526
hospital_admit_source            0.117329
icu_admit_source                 0.111024
icu_stay_type                    0.016518
icu_type                         0.051856
pre_icu_los_days                 0.065221
weight                          -0.038362
apache_2_diagnosis               0.299261
apache_3j_diagnosis              0.320165
apache_post_operative           -0.083674
arf_apache                       0.027309
bun_apache                       0.181435
creatinine_apache                0.114699
gcs_eyes_apache                 -0.260373
gcs_motor_apache                -0.282449
gcs_unable_apache                0.051774
gcs_verbal_apache               -0.241044
glucose_apache                   0

### impute missing data

In [None]:
#features only: the outcome column is left out of the imputation model
data = train_supplied_missing_catenc.drop(columns='hospital_death')

#fit the iterative imputer on the training features, then fill those same features with it
imp = IterativeImputer(max_iter=999, random_state=39, verbose=2)
imp.fit(data)
train_supplied_missing_catenc_imp = imp.transform(data)

#create final data set (the imputer returns a bare array, so the column names are restored here)
final_data_train = pd.DataFrame(train_supplied_missing_catenc_imp, columns=data.columns.values)
#put back correct hospital death
final_data_train['hospital_death'] = train_supplied_missing_catenc['hospital_death']

final_data_train.head()

In [None]:
#persist the imputed training table (the row index is written too, pandas default)
train_withimpute_csv = 'train_data_mine_withimpute.csv'
final_data_train.to_csv(train_withimpute_csv)

In [None]:
#save model
#pickle is not among the imports in the notebook's first cell, so it is imported here;
#without it this cell fails with a NameError on a fresh kernel
import pickle

#serialize the fitted imputer so the test data can be imputed later without refitting
pkl_filename = "impute_model_999iter.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(imp, file)

### prepare test data

In [42]:
#load the unlabeled set and run it through the same preparation steps as the training data
unlabeled_supplied = pd.read_csv(unlabeled_data_path)

print('Original data shape:\n', unlabeled_supplied.shape, '\n')

#remove columns with meta data (e.g. unique identifiers)
unlabeled_supplied = unlabeled_supplied.drop(columns = meta_cols)

#convert non-positive values (zeros as well as negatives) to nans; values that are already missing stay missing
for col in ['pre_icu_los_days', 'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']:
    unlabeled_supplied.loc[~(unlabeled_supplied[col] > 0), col] = np.nan

#get count of missing data per entry (computed before any helper column is added)
unlabeled_supplied['na_count'] = unlabeled_supplied.isna().sum(axis=1)

#get count for each entry of how many missing for params that have at least 25% difference in missing between yes and no death
unlabeled_supplied['param_25_diff_count'] = unlabeled_supplied[param_25_diff].isna().sum(axis=1)
unlabeled_supplied['param_10_diff_count'] = unlabeled_supplied[param_10_diff].isna().sum(axis=1)

#remove features that have more than 50% missing data (drop() already returns a new frame)
print(unlabeled_supplied.shape)
unlabeled_supplied_missing = unlabeled_supplied.drop(columns = train_missing)
print(unlabeled_supplied_missing.shape)

#feature mapping
#the rules are applied row by row, in the listed order; disease_count sits between the two groups
#because elective_surgery_score reads it
score_rules_before_disease_count = [
    ('age_score', lambda row : age_mapping(row['age'])),
    ('bmi_score', lambda row : bmi_mapping(row['bmi'])),
    ('chronic_health_score', lambda row : chronic_health_mapping(row, health_conds)),
]
for score_name, rule in score_rules_before_disease_count:
    unlabeled_supplied_missing[score_name] = unlabeled_supplied_missing.apply(rule, axis = 1)

#rows with any missing disease flag are dropped before summing, so they end up with NaN after index alignment
unlabeled_supplied_missing['disease_count'] = unlabeled_supplied_missing[diseases].dropna().sum(axis=1)

score_rules_after_disease_count = [
    ('elective_surgery_score', lambda row : elective_surgery_mapping(row['elective_surgery'], row['disease_count'])),
    ('GCS_score', lambda row : GCS_mapping(row, gcs_params)),
    ('bun_score', lambda row : bun_mapping(row['d1_bun_max'])),
    ('creatinine_score', lambda row : creatinine_mapping(row['d1_creatinine_max'], row['arf_apache'])),
    ('glucose_score', lambda row : glucose_mapping(row['d1_glucose_max'])),
    ('heart_rate_score', lambda row : heart_rate_mapping(row['d1_heartrate_max'])),
    ('hematocrit_score', lambda row : hematocrit_mapping(row['d1_hematocrit_min'])),
    ('map_score', lambda row : map_mapping(row['d1_mbp_min'])),
    ('resprate_score', lambda row : resprate_mapping(row['d1_resprate_max'], row['ventilated_apache'])),
    ('sodium_score', lambda row : sodium_mapping(row['d1_sodium_max'])),
    ('temp_score', lambda row : temp_mapping(row['d1_temp_min'])),
    ('wbc_score', lambda row : wbc_mapping(row['d1_wbc_max'])),
    ('d1_calcium_score', lambda row : d1_calcium_mapping(row['d1_calcium_min'])),
    ('d1_hco3_score', lambda row : d1_hco3_mapping(row['d1_hco3_min'])),
    ('d1_hemaglobin_score', lambda row : d1_hemaglobin_mapping(row['d1_hemaglobin_min'])),
    ('d1_platlet_score', lambda row : d1_platlet_mapping(row['d1_platelets_min'])),
    ('d1_potassium_score', lambda row : d1_potassium_mapping(row['d1_potassium_max'])),
    ('d1_diasbp_score', lambda row : d1_diasbp_mapping(row['d1_diasbp_min'])),
    ('d1_sysbp_score', lambda row : d1_sysbp_mapping(row['d1_sysbp_min'])),
    ('d1_spo2_score', lambda row : d1_spo2_mapping(row['d1_spo2_min'])),
]
for score_name, rule in score_rules_after_disease_count:
    unlabeled_supplied_missing[score_name] = unlabeled_supplied_missing.apply(rule, axis = 1)

#create difference scores for each: day-1 maximum minus day-1 minimum
for diff_name, d1_prefix in [
    ('bun_diff', 'd1_bun'),
    ('creatinine_diff', 'd1_creatinine'),
    ('glucose_diff', 'd1_glucose'),
    ('heart_rate_diff', 'd1_heartrate'),
    ('hematocrit_diff', 'd1_hematocrit'),
    ('map_diff', 'd1_mbp'),
    ('resprate_diff', 'd1_resprate'),
    ('sodium_diff', 'd1_sodium'),
    ('temp_diff', 'd1_temp'),
    ('wbc_diff', 'd1_wbc'),
    ('d1_calcium_diff', 'd1_calcium'),
    ('d1_hco3_diff', 'd1_hco3'),
    ('d1_hemaglobin_diff', 'd1_hemaglobin'),
    ('d1_platlet_diff', 'd1_platelets'),
    ('d1_potassium_diff', 'd1_potassium'),
    ('d1_diasbp_diff', 'd1_diasbp'),
    ('d1_sysbp_diff', 'd1_sysbp'),
    ('d1_spo2_diff', 'd1_spo2'),
]:
    unlabeled_supplied_missing[diff_name] = (
        unlabeled_supplied_missing[d1_prefix + '_max'] - unlabeled_supplied_missing[d1_prefix + '_min']
    )

#put the columns in the training order before encoding: the displayed training table lists bmi_score
#last while this cell creates it second, and anything that consumes the features by position
#(e.g. an imputer or model fitted on the training frame) would otherwise mix features up.
#A column missing on either side raises a KeyError here instead of passing silently.
unlabeled_supplied_missing = unlabeled_supplied_missing[train_supplied_missing_catenc.columns]

#encode cat variables with the encoder fitted on the training data
unlabeled_supplied_missing_catenc = enc.transform(unlabeled_supplied_missing)

Original data shape:
 (39308, 186) 

(39308, 184)
(39308, 110)


In [43]:
#encounter_id was dropped together with the other meta columns, so re-read the raw file to get it back
unlabeled_supplied = pd.DataFrame(data=pd.read_csv(unlabeled_data_path))

#attach the id as a row key and persist the encoded, not yet imputed test table
unlabeled_supplied_missing_catenc['encounter_id'] = unlabeled_supplied['encounter_id']
test_noimpute_csv = 'test_data_mine_noimpute.csv'
unlabeled_supplied_missing_catenc.to_csv(test_noimpute_csv)

In [None]:
#impute missing
#select exactly the columns the imputer was fitted on, in the same order. This leaves out
#hospital_death (the previous drop() used the misspelled label 'hosptial_death' and raised a KeyError)
#and the encounter_id key added after encoding, which is not one of the fitted features.
data_test = unlabeled_supplied_missing_catenc[data.columns]
unlabeled_supplied_missing_catenc_imp = imp.transform(data_test)

#create final data set (keep the source index so the column assignments below align row by row)
final_data_test = pd.DataFrame(data=unlabeled_supplied_missing_catenc_imp,
                               columns=data_test.columns.values,
                               index=data_test.index)
#put back correct hospital death
final_data_test['hospital_death'] = unlabeled_supplied_missing_catenc['hospital_death']
#carry the row key along when it has been attached, so predictions can be matched to encounters
if 'encounter_id' in unlabeled_supplied_missing_catenc.columns:
    final_data_test['encounter_id'] = unlabeled_supplied_missing_catenc['encounter_id']

final_data_test.head()