In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from pycaret.classification import *
import shap

In [3]:
# Load the pre-joined dataset; the first CSV column is used as the row index.
raw_data_joined = pd.read_csv('./output/raw_data_joined.csv', index_col=0)

In [4]:
# Column groups used later to assemble the feature list.
_all_columns = raw_data_joined.columns

# Comorbidities: every column whose name contains "DISEASE", plus three named flags.
comorb_lst = [name for name in _all_columns if "DISEASE" in name] + ["HTN", "IMMUNOCOMPROMISED", "OTHER"]

# Demographics: every column whose name contains "AGE_", plus GENDER.
demo_lst = [name for name in _all_columns if "AGE_" in name] + ["GENDER"]

# Lab and vital-sign columns are selected by position, so these slices silently
# depend on the column order of the CSV: positions 13..192 are taken as labs,
# positions 193 up to (but excluding) the last two columns as vital signs.
lab_lst = _all_columns[13:193].tolist()
vitalSigns_lst = _all_columns[193:-2].tolist()

The dataset authors warn: "Beware NOT to use the data when the target variable is present, as it is unknown the order of the event (maybe the target event happened before the results were obtained)". In other words, within a window where ICU = 1, the patient's lab results may have been obtained after the patient entered the ICU, which makes that data unusable for prediction.

We will focus on creating a model that works with the data from the 0-2 window only, since an early prediction is the most clinically relevant. Therefore, the rows of patients who were already admitted to the ICU within the 0-2 window must be dropped.

In [5]:
# Row counts for each (time window, ICU status) combination
pd.crosstab(index=raw_data_joined['WINDOW'], columns=raw_data_joined['ICU'])

ICU,0,1
WINDOW,Unnamed: 1_level_1,Unnamed: 2_level_1
0-2,353,32
2-4,326,59
4-6,286,99
6-12,255,130
ABOVE_12,190,195


In [6]:
# Drop only the 0-2 rows in which the patient is already in the ICU (ICU == 1).
# Every other row is kept, including the later-window rows of those same patients.
in_first_window = raw_data_joined['WINDOW'] == '0-2'
already_in_icu = raw_data_joined['ICU'] == 1
train_data = raw_data_joined.loc[~in_first_window | ~already_in_icu]
train_data

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
0,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0-2,0
1,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,2-4,0
2,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,4-6,0
3,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,6-12,0
4,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-0.238095,-0.818182,-0.389967,0.407558,-0.230462,0.096774,-0.242282,-0.814433,ABOVE_12,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1920,384,0,50,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0-2,0
1921,384,0,50,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,2-4,0
1922,384,0,50,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,4-6,0
1923,384,0,50,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,6-12,0


In [7]:
# Per-visit ground truth: ICU_NEW is the maximum of ICU over all rows kept in
# train_data for that PATIENT_VISIT_IDENTIFIER, i.e. 1 if the visit has ICU == 1
# in any remaining window and 0 otherwise.
# The aggregation is named by the string 'max' instead of passing the builtin
# `max`: newer pandas versions emit a FutureWarning for the builtin and will
# eventually call it directly instead of mapping it to the groupby max.
icu_above_2 = (
    train_data.groupby('PATIENT_VISIT_IDENTIFIER')
    .agg({'ICU': 'max'})
    .reset_index()
    .rename(columns={'ICU': 'ICU_NEW'})
)

# Attach the per-visit label to every row of that visit (left join keeps all rows
# of train_data and appends ICU_NEW as the last column).
training_data = train_data.merge(icu_above_2, on=['PATIENT_VISIT_IDENTIFIER'], how='left')

Note the new column `ICU_NEW` added at the end of the dataset: it tells us whether the patient went into the ICU at any point after the window we are studying, and it is the ground-truth value we will predict.

In [8]:
# Preview the first rows to check that ICU_NEW was appended by the merge.
training_data.head()

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU,ICU_NEW
0,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0,1
1,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2-4,0,1
2,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4-6,0,1
3,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,6-12,0,1
4,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-0.818182,-0.389967,0.407558,-0.230462,0.096774,-0.242282,-0.814433,ABOVE_12,1,1


In [9]:
# Remaining rows per (time window, ICU status) after dropping first-window ICU rows
pd.crosstab(index=training_data['WINDOW'], columns=training_data['ICU'])

ICU,0,1
WINDOW,Unnamed: 1_level_1,Unnamed: 2_level_1
0-2,353,0
2-4,326,59
4-6,286,99
6-12,255,130
ABOVE_12,190,195


In [10]:
# Restrict the frame to the first (0-2) window so that only measurements from
# that window are used as features; the label is the per-visit ICU_NEW.
mask_02 = training_data['WINDOW'].eq('0-2')
training_data = training_data[mask_02]

# Class balance of the target among the rows kept for training
pd.crosstab(index=training_data['WINDOW'], columns=training_data['ICU_NEW'])

ICU_NEW,0,1
WINDOW,Unnamed: 1_level_1,Unnamed: 2_level_1
0-2,190,163


We are left with only 353 rows in total, but this is expected given the requirement for clinically relevant results.

In [12]:
# Inspect the filtered frame: only 0-2 rows remain, with ICU_NEW as the target column.
training_data

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU,ICU_NEW
0,0,1,60,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0-2,0,1
9,2,0,10,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.959596,-0.515528,-0.351328,-0.747001,-0.756272,-1.000000,-0.961262,0-2,0,1
14,3,0,40,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0-2,0,0
19,4,0,10,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.979798,-1.000000,-0.883669,-0.956805,-0.870968,-0.953536,-0.980333,0-2,0,0
24,5,0,10,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.979798,-0.860870,-0.714460,-0.986481,-1.000000,-0.975891,-0.980129,0-2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868,380,0,40,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0-2,0,1
1873,381,1,90,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-0.612627,-1.000000,0-2,0,0
1878,382,0,50,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0-2,0,1
1883,383,0,40,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0-2,0,0


In [13]:
# Explicit feature list: age percentile and gender, then the comorbidity,
# lab and vital-sign column groups (in that order).
features = ['AGE_PERCENTIL', 'GENDER', *comorb_lst, *lab_lst, *vitalSigns_lst]

# Feature matrix and target vector taken from the first-window rows
X = training_data.loc[:, features]
y = training_data.loc[:, 'ICU_NEW']

In [14]:
# Initialise the PyCaret classification experiment on the first-window frame.
# The whole `training_data` frame is passed in, so every column that is neither the
# target nor listed in `ignore_features` is used as a feature; the `features` list
# built in the previous cell is not applied here.
SEED = 440
experiment = setup(
    training_data, 
    target='ICU_NEW',
    # identifier, per-window ICU flag and window label must not be used as predictors
    ignore_features=['PATIENT_VISIT_IDENTIFIER', 'ICU', 'WINDOW'],
    #+lab_columns_to_ignore,
    #fix_imbalance=True, # fixing train-test split imbalances
    #feature_selection=True, feature_selection_threshold=0.95, # conservative important feature selection
    #remove_perfect_collinearity=True, # in case we missed any perfectly collinear features
    session_id=SEED, # seed for reproducibility
    #silent=True # for kaggle compatibility
    )


 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,440
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(353, 232)"
4,Missing Values,False
5,Numeric Features,217
6,Categorical Features,14
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [15]:
# Train the library's available classifiers and rank them in a single score grid.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7165,0.7817,0.6235,0.7269,0.6684,0.4241
1,Extra Trees Classifier,0.7003,0.7732,0.597,0.708,0.6443,0.3895
2,Extreme Gradient Boosting,0.696,0.7539,0.603,0.6942,0.6412,0.3804
3,Gradient Boosting Classifier,0.6917,0.7649,0.5962,0.7012,0.6369,0.3729
4,Light Gradient Boosting Machine,0.6878,0.7565,0.6242,0.6832,0.6464,0.3694
5,Logistic Regression,0.684,0.7284,0.6053,0.6857,0.6372,0.3586
6,Ada Boost Classifier,0.6635,0.7206,0.6227,0.6426,0.6265,0.3214
7,Ridge Classifier,0.6598,0.0,0.5606,0.6609,0.6014,0.3075
8,Random Forest Classifier,0.6598,0.7412,0.4894,0.684,0.5622,0.2982
9,Decision Tree Classifier,0.6507,0.6493,0.6311,0.6208,0.6181,0.2975


PyCaret's recommended experiment workflow is to use compare_models() right after setup to evaluate top performing models and finalize a few candidates for continued experimentation. As such, the function that actually allows you to create a model is unimaginatively called create_model(). This function creates a model and scores it using stratified cross-validation. Similar to compare_models(), the output prints a score grid that shows Accuracy, AUC, Recall, Precision, F1 and Kappa by fold.

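For the remaining part of this project, we will work with the following five models as our candidate models. The selections are for illustration purposes only: they are not exactly the five top-ranked models in the comparison above (Random Forest, for example, ranks lower), and they are not necessarily the top performing or ideal models for this type of data.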

- Extra Trees Classifier('et')
- CatBoost Classifier('catboost')
- Random Forest Classifier('rf')
- Logistic Regression('lr')
- Extreme Gradient Boosting('xgboost')

##### Extra Trees Classifier

In [35]:
# Fit an Extra Trees classifier and show its cross-validation score grid (one row per fold).
et = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.68,0.7276,0.5,0.75,0.6,0.3506
1,0.8,0.9423,0.8333,0.7692,0.8,0.6006
2,0.72,0.8397,0.5833,0.7778,0.6667,0.4337
3,0.76,0.8077,0.8333,0.7143,0.7692,0.5223
4,0.72,0.7435,0.5455,0.75,0.6316,0.4147
5,0.8,0.8117,0.7273,0.8,0.7619,0.5902
6,0.72,0.8312,0.7273,0.6667,0.6957,0.4373
7,0.625,0.6399,0.6364,0.5833,0.6087,0.25
8,0.625,0.7517,0.4545,0.625,0.5263,0.2286
9,0.5417,0.6154,0.5455,0.5,0.5217,0.0833


In [36]:
# Tune an Extra Trees classifier, selected by its 'et' id string (the `et` object
# created above is not passed in), and show the per-fold scores of the tuned model.
tuned_et = tune_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.72,0.6827,0.5833,0.7778,0.6667,0.4337
1,0.88,0.9487,0.9167,0.8462,0.88,0.7604
2,0.72,0.8333,0.5833,0.7778,0.6667,0.4337
3,0.76,0.8141,0.75,0.75,0.75,0.5192
4,0.8,0.7597,0.7273,0.8,0.7619,0.5902
5,0.76,0.7922,0.7273,0.7273,0.7273,0.513
6,0.72,0.8052,0.6364,0.7,0.6667,0.4262
7,0.5833,0.6224,0.5455,0.5455,0.5455,0.1608
8,0.625,0.7902,0.4545,0.625,0.5263,0.2286
9,0.5,0.6224,0.5455,0.4615,0.5,0.0069


In [41]:
# Score the tuned model (presumably on PyCaret's hold-out split -- confirm);
# the trailing ';' hides the returned frame so only the metric row is displayed.
predict_model(tuned_et);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extra Trees Classifier,0.7453,0.7476,0.7551,0.7115,0.7327,0.4898


##### CatBoost Classifier

In [43]:
# Fit a CatBoost classifier and show its cross-validation score grid (one row per fold).
catboost = create_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.68,0.8077,0.5,0.75,0.6,0.3506
1,0.84,0.8654,0.9167,0.7857,0.8462,0.6815
2,0.72,0.7949,0.6667,0.7273,0.6957,0.4373
3,0.8,0.8013,0.8333,0.7692,0.8,0.6006
4,0.76,0.8377,0.6364,0.7778,0.7,0.5033
5,0.76,0.8442,0.6364,0.7778,0.7,0.5033
6,0.8,0.8312,0.7273,0.8,0.7619,0.5902
7,0.5833,0.6713,0.5455,0.5455,0.5455,0.1608
8,0.7083,0.8112,0.6364,0.7,0.6667,0.4085
9,0.5833,0.6503,0.5455,0.5455,0.5455,0.1608


In [44]:
# Tune a CatBoost classifier, selected by its 'catboost' id string (the `catboost`
# object created above is not passed in), and show the per-fold scores.
tuned_catboost = tune_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.72,0.8205,0.5,0.8571,0.6316,0.43
1,0.88,0.9038,1.0,0.8,0.8889,0.7619
2,0.76,0.7949,0.6667,0.8,0.7273,0.5161
3,0.84,0.8397,0.8333,0.8333,0.8333,0.6795
4,0.72,0.8247,0.5455,0.75,0.6316,0.4147
5,0.76,0.8247,0.6364,0.7778,0.7,0.5033
6,0.68,0.8117,0.6364,0.6364,0.6364,0.3506
7,0.5833,0.6503,0.5455,0.5455,0.5455,0.1608
8,0.7083,0.8462,0.6364,0.7,0.6667,0.4085
9,0.5417,0.6084,0.5455,0.5,0.5217,0.0833


In [45]:
# Score the tuned model (presumably on PyCaret's hold-out split -- confirm);
# the trailing ';' hides the returned frame so only the metric row is displayed.
predict_model(tuned_catboost);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.6981,0.7479,0.6735,0.6735,0.6735,0.3928


##### Random Forest Classifier

In [46]:
# Fit a Random Forest classifier and show its cross-validation score grid (one row per fold).
rf = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.6,0.6571,0.3333,0.6667,0.4444,0.183
1,0.72,0.766,0.8333,0.6667,0.7407,0.4444
2,0.68,0.7404,0.5,0.75,0.6,0.3506
3,0.8,0.7788,0.75,0.8182,0.7826,0.5981
4,0.72,0.763,0.5455,0.75,0.6316,0.4147
5,0.72,0.7955,0.6364,0.7,0.6667,0.4262
6,0.64,0.7305,0.5455,0.6,0.5714,0.2623
7,0.625,0.6783,0.5455,0.6,0.5714,0.2394
8,0.5833,0.6993,0.3636,0.5714,0.4444,0.1367
9,0.5833,0.6119,0.5455,0.5455,0.5455,0.1608


In [47]:
# Tune a Random Forest classifier, selected by its 'rf' id string (the `rf` object
# created above is not passed in), and show the per-fold scores.
tuned_rf = tune_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.76,0.7885,0.5833,0.875,0.7,0.513
1,0.8,0.9038,0.8333,0.7692,0.8,0.6006
2,0.68,0.8077,0.75,0.6429,0.6923,0.3631
3,0.72,0.8269,0.75,0.6923,0.72,0.4409
4,0.72,0.7143,0.5455,0.75,0.6316,0.4147
5,0.64,0.7468,0.4545,0.625,0.5263,0.2475
6,0.76,0.8896,0.7273,0.7273,0.7273,0.513
7,0.5833,0.6364,0.5455,0.5455,0.5455,0.1608
8,0.625,0.7972,0.4545,0.625,0.5263,0.2286
9,0.5833,0.6294,0.6364,0.5385,0.5833,0.1724


In [48]:
# Score the tuned model (presumably on PyCaret's hold-out split -- confirm);
# the trailing ';' hides the returned frame so only the metric row is displayed.
predict_model(tuned_rf);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Random Forest Classifier,0.717,0.7476,0.7347,0.6792,0.7059,0.434


##### Logistic Regression

In [49]:
# Fit a Logistic Regression model and show its cross-validation score grid (one row per fold).
lr = create_model('lr')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.68,0.7821,0.6667,0.6667,0.6667,0.359
1,0.8,0.8205,0.75,0.8182,0.7826,0.5981
2,0.72,0.8462,0.5833,0.7778,0.6667,0.4337
3,0.6,0.7244,0.4167,0.625,0.5,0.1883
4,0.76,0.7987,0.6364,0.7778,0.7,0.5033
5,0.68,0.7338,0.5455,0.6667,0.6,0.3377
6,0.64,0.7662,0.5455,0.6,0.5714,0.2623
7,0.6667,0.7063,0.7273,0.6154,0.6667,0.3379
8,0.6667,0.8042,0.6364,0.6364,0.6364,0.3287
9,0.625,0.6923,0.4545,0.625,0.5263,0.2286


In [50]:
# Tune a Logistic Regression model, selected by its 'lr' id string (the `lr` object
# created above is not passed in), and show the per-fold scores.
tuned_lr = tune_model('lr')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.68,0.7756,0.6667,0.6667,0.6667,0.359
1,0.76,0.8205,0.75,0.75,0.75,0.5192
2,0.72,0.8526,0.5833,0.7778,0.6667,0.4337
3,0.6,0.7308,0.5,0.6,0.5455,0.1935
4,0.8,0.7987,0.7273,0.8,0.7619,0.5902
5,0.72,0.7403,0.5455,0.75,0.6316,0.4147
6,0.68,0.7727,0.5455,0.6667,0.6,0.3377
7,0.6667,0.6853,0.8182,0.6,0.6923,0.3469
8,0.6667,0.8042,0.6364,0.6364,0.6364,0.3287
9,0.625,0.6993,0.4545,0.625,0.5263,0.2286


In [51]:
# Score the tuned model (presumably on PyCaret's hold-out split -- confirm);
# the trailing ';' hides the returned frame so only the metric row is displayed.
predict_model(tuned_lr);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Logistic Regression,0.7075,0.7698,0.6327,0.7045,0.6667,0.4075


##### Extreme Gradient Boosting

In [52]:
# Fit an Extreme Gradient Boosting classifier and show its cross-validation score grid.
# NOTE(review): the variable name `xgboost` would shadow the xgboost package if that
# were ever imported in this notebook.
xgboost = create_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.64,0.75,0.4167,0.7143,0.5263,0.2671
1,0.76,0.859,0.8333,0.7143,0.7692,0.5223
2,0.72,0.8077,0.6667,0.7273,0.6957,0.4373
3,0.8,0.7885,0.8333,0.7692,0.8,0.6006
4,0.76,0.7597,0.6364,0.7778,0.7,0.5033
5,0.72,0.8052,0.5455,0.75,0.6316,0.4147
6,0.72,0.8247,0.7273,0.6667,0.6957,0.4373
7,0.5833,0.6643,0.5455,0.5455,0.5455,0.1608
8,0.75,0.7552,0.7273,0.7273,0.7273,0.4965
9,0.6667,0.7063,0.5455,0.6667,0.6,0.3191


In [53]:
# Tune an Extreme Gradient Boosting classifier, selected by its 'xgboost' id string
# (the `xgboost` object created above is not passed in), and show the per-fold scores.
tuned_xgboost = tune_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.68,0.7372,0.5,0.75,0.6,0.3506
1,0.84,0.8718,0.9167,0.7857,0.8462,0.6815
2,0.76,0.8462,0.75,0.75,0.75,0.5192
3,0.72,0.7628,0.75,0.6923,0.72,0.4409
4,0.76,0.7727,0.7273,0.7273,0.7273,0.513
5,0.6,0.7662,0.3636,0.5714,0.4444,0.1554
6,0.8,0.7922,0.8182,0.75,0.7826,0.5981
7,0.625,0.6643,0.5455,0.6,0.5714,0.2394
8,0.75,0.7692,0.7273,0.7273,0.7273,0.4965
9,0.5833,0.6084,0.5455,0.5455,0.5455,0.1608


In [54]:
# Score the tuned model (presumably on PyCaret's hold-out split -- confirm);
# the trailing ';' hides the returned frame so only the metric row is displayed.
predict_model(tuned_xgboost);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extreme Gradient Boosting,0.7075,0.7351,0.7347,0.6667,0.699,0.4159


## PyCaret with the dataframe Standardized and Normalized

In [26]:
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

# Preprocessing steps, applied in this order by the pipeline built below.
# NOTE(review): StandardScaler works per column, but Normalizer rescales each ROW
# (sample) to unit norm, so after this step a feature's value depends on the other
# features of the same row. If per-feature scaling was intended, confirm that
# Normalizer is really wanted here.
pipeline = [
    StandardScaler(),
    Normalizer()
]

# Chain the steps into a single scikit-learn transformer.
tr = make_pipeline(*pipeline)

In [27]:
# Columns 4..228 are the ones that get scaled; columns 0..3 (visit identifier, age
# and gender columns) and the last two (WINDOW, ICU) are carried over unchanged.
data_prueba = raw_data_joined.iloc[:,4:229]

# NOTE(review): the transformer is fitted on every row of every window, before any
# filtering or train/test split, so statistics from rows that are later dropped or
# held out leak into the scaled features. Consider scaling inside the modelling
# pipeline instead.
X_ready = tr.fit_transform(data_prueba)
# Keep the original row index: the joins below align on the index, so rebuilding the
# frame with a fresh 0..n-1 index would only line up if the CSV index happened to be
# exactly 0..n-1 as well.
X_ready = pd.DataFrame(X_ready, columns=data_prueba.columns, index=data_prueba.index)

data_prueba_dem = raw_data_joined.iloc[:,0:4]

# Reassemble: unscaled leading columns + scaled block ...
data_prueba_pycaret = data_prueba_dem.join(X_ready)

aux_df_pycaret = raw_data_joined.iloc[:,229:231]

# ... + the untouched WINDOW and ICU columns at the end.
data_prueba_pycaret = data_prueba_pycaret.join(aux_df_pycaret)

In [28]:
# Inspect the reassembled frame with the scaled feature block.
data_prueba_pycaret

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
0,0,1,60,0,-0.041853,-0.020429,-0.039560,-0.017064,0.314148,0.542968,...,-0.070432,-0.039602,-0.066468,-0.067127,-0.066671,-0.061639,-0.070358,-0.039569,0-2,0
1,0,1,60,0,-0.037514,-0.018311,-0.035459,-0.015295,0.281580,0.486677,...,-0.063130,-0.035497,-0.059577,-0.060168,-0.059759,-0.055248,-0.063064,-0.035467,2-4,0
2,0,1,60,0,-0.037514,-0.018311,-0.035459,-0.015295,0.281580,0.486677,...,-0.063130,-0.035497,-0.059577,-0.060168,-0.059759,-0.055248,-0.063064,-0.035467,4-6,0
3,0,1,60,0,-0.041088,-0.020055,-0.038837,-0.016752,0.308409,0.533048,...,-0.069145,-0.038879,-0.065254,-0.065901,-0.065453,-0.060512,-0.069072,-0.038846,6-12,0
4,0,1,60,0,-0.018762,-0.009158,-0.017734,-0.007649,0.140825,0.243399,...,0.111539,0.022440,0.086509,0.178283,0.146013,0.129847,0.111372,0.023194,ABOVE_12,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1920,384,0,50,1,-0.036946,-0.018033,-0.034922,-0.015063,-0.040631,-0.023508,...,-0.062174,-0.034959,-0.058675,-0.059257,-0.058854,-0.054412,-0.062109,-0.034930,0-2,0
1921,384,0,50,1,-0.038498,-0.018791,-0.036389,-0.015696,-0.042339,-0.024496,...,-0.064787,-0.036428,-0.061141,-0.061747,-0.061327,-0.056698,-0.064719,-0.036397,2-4,0
1922,384,0,50,1,-0.039982,-0.019516,-0.037792,-0.016302,-0.043971,-0.025440,...,-0.067284,-0.037833,-0.063498,-0.064127,-0.063692,-0.058884,-0.067214,-0.037801,4-6,0
1923,384,0,50,1,-0.035692,-0.017422,-0.033737,-0.014552,-0.039253,-0.022711,...,-0.060065,-0.033773,-0.056685,-0.057247,-0.056858,-0.052566,-0.060002,-0.033745,6-12,0


In [29]:
# Same first-window preparation as for the unscaled data, applied to the scaled frame.

# 1) Drop the 0-2 rows in which the patient is already in the ICU.
train_data_normalized = data_prueba_pycaret.loc[~((data_prueba_pycaret['WINDOW'] == '0-2') & (data_prueba_pycaret['ICU'] == 1))]

# 2) Per-visit target: maximum of ICU over the remaining rows of each visit.
#    The aggregation is named by the string 'max' instead of passing the builtin
#    `max`, which newer pandas versions deprecate (FutureWarning).
icu_above_2_normalized = (
    train_data_normalized.groupby('PATIENT_VISIT_IDENTIFIER')
    .agg({'ICU': 'max'})
    .reset_index()
    .rename(columns={'ICU': 'ICU_NEW'})
)

# 3) Attach the target to every row of the visit.
training_data_normalized = train_data_normalized.merge(icu_above_2_normalized, on=['PATIENT_VISIT_IDENTIFIER'], how='left')

# 4) Keep only the first-window rows as model input.
mask_02 = training_data_normalized.WINDOW == '0-2'

training_data_normalized = training_data_normalized.loc[mask_02]

In [30]:
# Inspect the scaled first-window frame, with ICU_NEW as the target column.
training_data_normalized

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU,ICU_NEW
0,0,1,60,0,-0.041853,-0.020429,-0.039560,-0.017064,0.314148,0.542968,...,-0.039602,-0.066468,-0.067127,-0.066671,-0.061639,-0.070358,-0.039569,0-2,0,1
9,2,0,10,0,-0.030023,-0.014654,-0.028378,-0.012241,-0.033018,-0.019103,...,-0.014116,0.100127,0.105517,0.044716,0.011784,-0.050471,-0.014711,0-2,0,1
14,3,0,40,1,-0.021905,-0.010692,-0.020705,-0.008931,-0.024090,-0.013938,...,-0.020727,-0.034788,-0.035133,-0.034895,-0.032261,-0.036824,-0.020710,0-2,0,0
19,4,0,10,0,-0.039387,-0.019225,-0.037230,-0.016059,-0.043316,-0.025062,...,-0.027894,-0.062553,-0.027018,-0.042016,-0.019113,-0.047816,-0.028131,0-2,0,0
24,5,0,10,0,-0.038380,-0.018733,-0.036277,-0.015648,-0.042209,-0.024421,...,-0.027181,-0.006690,0.024916,-0.054818,-0.056524,-0.055218,-0.027320,0-2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868,380,0,40,1,-0.009744,-0.004756,-0.009211,-0.003973,-0.010716,-0.006200,...,-0.009220,-0.015476,-0.015629,-0.015523,-0.014351,-0.016381,-0.009213,0-2,0,1
1873,381,1,90,0,-0.047333,-0.023104,-0.044740,-0.019299,-0.052055,-0.030118,...,-0.044788,-0.075172,-0.075918,-0.075402,-0.069710,0.104755,-0.044751,0-2,0,0
1878,382,0,50,0,-0.040780,-0.019905,-0.038546,-0.016627,-0.044848,-0.025948,...,-0.038588,-0.064765,-0.065407,-0.064963,-0.060059,-0.068555,-0.038555,0-2,0,1
1883,383,0,40,1,-0.059346,-0.028967,-0.056095,-0.024197,-0.065266,-0.037762,...,-0.056155,-0.094251,-0.095185,-0.094538,-0.087402,-0.099766,-0.056108,0-2,0,0


In [31]:
# Explicit feature list: age percentile and gender, then the comorbidity,
# lab and vital-sign column groups (in that order).
features = ['AGE_PERCENTIL', 'GENDER', *comorb_lst, *lab_lst, *vitalSigns_lst]

# Feature matrix and target vector taken from the scaled first-window rows
X_normalized = training_data_normalized.loc[:, features]
y_normalized = training_data_normalized.loc[:, 'ICU_NEW']

# 80/20 hold-out split with a fixed random_state so the split is repeatable
(X_train_n, X_test_n,
 y_train_n, y_test_n) = train_test_split(
    X_normalized,
    y_normalized,
    test_size=0.2,
    random_state=0,
)

In [32]:
# Initialise a second PyCaret classification experiment, this time on the scaled frame.
# The whole `training_data_normalized` frame is passed in, so every column that is
# neither the target nor listed in `ignore_features` is used as a feature; the
# `features` list and the train/test split from the previous cell are not applied here.
SEED = 440
experiment = setup(
    training_data_normalized, 
    target='ICU_NEW',
    # identifier, per-window ICU flag and window label must not be used as predictors
    ignore_features=['PATIENT_VISIT_IDENTIFIER', 'ICU', 'WINDOW'],
    #+lab_columns_to_ignore,
    #fix_imbalance=True, # fixing train-test split imbalances
    #feature_selection=True, feature_selection_threshold=0.95, # conservative important feature selection
    #remove_perfect_collinearity=True, # in case we missed any perfectly collinear features
    session_id=SEED, # seed for reproducibility
    #silent=True # for kaggle compatibility
    )

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,440
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(353, 232)"
4,Missing Values,False
5,Numeric Features,226
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [33]:
# Train the library's available classifiers on the scaled data and rank them in a score grid.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7235,0.7915,0.6644,0.7179,0.6861,0.4397
1,Extreme Gradient Boosting,0.712,0.7721,0.6477,0.7059,0.6691,0.4159
2,Light Gradient Boosting Machine,0.6995,0.7627,0.6742,0.6844,0.6706,0.3948
3,Extra Trees Classifier,0.6992,0.7711,0.6386,0.6936,0.6582,0.3911
4,Naive Bayes,0.6917,0.7705,0.5083,0.7607,0.5971,0.3656
5,Logistic Regression,0.6838,0.7675,0.5962,0.6809,0.6317,0.3578
6,Gradient Boosting Classifier,0.6833,0.7713,0.6311,0.6691,0.6438,0.3602
7,Ridge Classifier,0.6755,0.0,0.6045,0.6699,0.6323,0.3425
8,Random Forest Classifier,0.6672,0.7221,0.5598,0.6668,0.5999,0.3216
9,Linear Discriminant Analysis,0.6638,0.6997,0.6242,0.6467,0.6323,0.3232


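For the remaining part of this project, we will work with the following five models as our candidate models. The selections are for illustration purposes only: they are not exactly the five top-ranked models in the comparison above (Random Forest, for example, ranks lower), and they are not necessarily the top performing or ideal models for this type of data.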

- Extra Trees Classifier('et')
- CatBoost Classifier('catboost')
- Random Forest Classifier('rf')
- Logistic Regression('lr')
- Extreme Gradient Boosting('xgboost')