In [1]:
import time, matplotlib, sklearn

# visualizatoin 
import matplotlib.pyplot as plt 

# data wrangling
import pandas as pd
import numpy as np 

# data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer

# learning
from sklearn.linear_model import LogisticRegression

# the data intro has been tested with the following versions
print("pandas        Tested version: 2.0.3   Your version: %s" % pd.__version__)
print("numpy         Tested version: 1.21.5  Your version: %s" % np.__version__)
print("matplotlib    Tested version: 3.5.3   Your version: %s" % matplotlib.__version__)
print("scikit-learn  Tested version: 1.2.2   Your version: %s" % sklearn.__version__)

pandas        Tested version: 2.0.3   Your version: 2.2.3
numpy         Tested version: 1.21.5  Your version: 2.2.3
matplotlib    Tested version: 3.5.3   Your version: 3.8.4
scikit-learn  Tested version: 1.2.2   Your version: 1.6.1


In [2]:
# description
description = pd.read_csv('./physionet.org/files/widsdatathon2020/1.0.0/data/WiDS_Datathon_2020_Dictionary.csv')
description_dict = description.set_index('Variable Name').to_dict(orient='index')
# data
df = pd.read_csv('./physionet.org/files/widsdatathon2020/1.0.0/data/training_v2.csv')

df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [3]:
test_size = 0.2 # proportion for train versus test+val split
val_size = 0.5 # proportion for test versus val split
random_state = 42  # random state is used to set a seed for randomness, which is only relevant for reproducibility purposes
max_missing = 0.7  # maximum percentage of missing values for a column to be dropped

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
class DropHighlyCorrelatedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.features_to_drop = None

    def fit(self, X, y=None):
        # Calculate the correlation matrix
        X = pd.DataFrame(X)
        corr_matrix = X.corr()

        # Find pairs of features that are highly correlated
        highly_correlated = [(i, j) for i in corr_matrix.columns for j in corr_matrix.columns if (corr_matrix.loc[i, j] > self.threshold) and (i != j)]

        # Remove one feature from each highly correlated pair
        self.features_to_drop = set()
        for i, j in highly_correlated:
            if i not in self.features_to_drop and j not in self.features_to_drop:
                self.features_to_drop.add(j)

        return self

    def transform(self, X):
        # Drop the highly correlated features
        X = pd.DataFrame(X)
        X_reduced = X.drop(columns=self.features_to_drop)
        return X_reduced

    def get_feature_names_out(self, input_features=None):
        new_features = []
        for i, feature in enumerate(input_features):
            if i not in self.features_to_drop:
                new_features.append(feature)
        return np.array(new_features)

In [6]:
description_dict

{'encounter_id': {'Category': 'identifier',
  'Unit of Measure': nan,
  'Data Type': 'integer',
  'Description': 'Unique identifier associated with a patient unit stay',
  'Example': nan},
 'hospital_id': {'Category': 'identifier',
  'Unit of Measure': nan,
  'Data Type': 'integer',
  'Description': 'Unique identifier associated with a hospital',
  'Example': nan},
 'patient_id': {'Category': 'identifier',
  'Unit of Measure': nan,
  'Data Type': 'integer',
  'Description': 'Unique identifier associated with a patient',
  'Example': nan},
 'hospital_death': {'Category': 'demographic',
  'Unit of Measure': nan,
  'Data Type': 'binary',
  'Description': 'Whether the patient died during this hospitalization',
  'Example': '0'},
 'age': {'Category': 'demographic',
  'Unit of Measure': 'Years',
  'Data Type': 'numeric',
  'Description': 'The age of the patient on unit admission',
  'Example': nan},
 'bmi': {'Category': 'demographic',
  'Unit of Measure': 'kilograms/metres^2',
  'Data Type':

In [16]:
start_time = time.time()

# save features
X = df.copy().drop(['hospital_death', 'patient_id', 'encounter_id', 'hospital_id', 'icu_id', # drop identifiers
                    'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob', # drop APACHE scores
                    'apache_2_bodysystem', # drop because of similarity with apache_3j_bodysystem
                    # "ethnicity" # Check if can be dropped
                    ],
                   axis=1)
# save target variable
y = df['hospital_death'].copy()
# save APACHE scores for later evaluation on train / test / validation data
y_apache = df['apache_4a_hospital_death_prob'].copy()

""" SPLIT DATA SET """
# split the dataset into train and test+validation set
(
    X_train,
    X_test,
    y_train,
    y_test,
    y_apache_train,
    y_apache_test,
    ) = train_test_split(X, y, y_apache, 
                         test_size=test_size, # used for testing and validation
                         random_state=random_state # for reproducibility
                        ) 
# split the test set into test + validation set
(
    X_val,
    X_test,
    y_val,
    y_test,
    y_apache_val,
    y_apache_test,
    ) = train_test_split(X_test, y_test, y_apache_test, 
                         test_size=val_size, # used for testing and validation
                         random_state=random_state # for reproducibility
                        ) 

"""MISSING VALUES"""
# drop columns with many missing values
missing = X_train.isna().sum() > max_missing * len(X_train)
missing = missing[missing].index
X_train = X_train.drop(missing, axis=1)
X_val = X_val.drop(missing, axis=1)
X_test = X_test.drop(missing, axis=1)

"""FURTHER PROCESSING PIPELINE"""
# define pre-processing steps for numerical features
num_transformer = Pipeline(steps=[("constant", VarianceThreshold()), # remove constant features
                                  ("drop_correlated", DropHighlyCorrelatedFeatures(threshold=0.8)),
                                  ("imputer", SimpleImputer(strategy="mean")),
                                  ("scaling", RobustScaler(quantile_range=(10, 90))),
                                 ])
# define preprocessing steps for categorical features
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
    ("encoder", OneHotEncoder(drop='first', sparse_output=False, handle_unknown="ignore")),
    ("constant", VarianceThreshold())
    ])
# create preprocessing pipeline
prep_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_transformer, make_column_selector(dtype_exclude=object)), # apply to columns NOT of type object (int or float)
        ('cat', cat_transformer, make_column_selector(dtype_include=object)) # apply to columns of type object
    ])

prep_pipeline.fit(X_train, y_train)
display(prep_pipeline) # display preprocessing pipeline

# transform data sets
X_train = pd.DataFrame(prep_pipeline.transform(X_train), columns=prep_pipeline.get_feature_names_out())
X_val = pd.DataFrame(prep_pipeline.transform(X_val), columns=prep_pipeline.get_feature_names_out())
X_test = pd.DataFrame(prep_pipeline.transform(X_test), columns=prep_pipeline.get_feature_names_out())
        
"""PRINT STATS"""
print("Time: %.2fs" % (time.time() - start_time))
print("Train set: %s rows, %s columns" % X_train.shape)
print("Validation set: %s rows, %s columns" % X_val.shape)
print("Test set: %s rows, %s columns" % X_test.shape)

Time: 2.26s
Train set: 73370 rows, 117 columns
Validation set: 9171 rows, 117 columns
Test set: 9172 rows, 117 columns


In [8]:
X_train.columns

Index(['num__age', 'num__bmi', 'num__elective_surgery', 'num__height',
       'num__pre_icu_los_days', 'num__albumin_apache',
       'num__apache_2_diagnosis', 'num__apache_3j_diagnosis',
       'num__arf_apache', 'num__bilirubin_apache',
       ...
       'cat__apache_3j_bodysystem_Genitourinary',
       'cat__apache_3j_bodysystem_Gynecological',
       'cat__apache_3j_bodysystem_Hematological',
       'cat__apache_3j_bodysystem_MISSING',
       'cat__apache_3j_bodysystem_Metabolic',
       'cat__apache_3j_bodysystem_Musculoskeletal/Skin',
       'cat__apache_3j_bodysystem_Neurological',
       'cat__apache_3j_bodysystem_Respiratory',
       'cat__apache_3j_bodysystem_Sepsis', 'cat__apache_3j_bodysystem_Trauma'],
      dtype='object', length=111)

In [9]:
X_train

Unnamed: 0,num__age,num__bmi,num__elective_surgery,num__height,num__pre_icu_los_days,num__albumin_apache,num__apache_2_diagnosis,num__apache_3j_diagnosis,num__arf_apache,num__bilirubin_apache,...,cat__apache_3j_bodysystem_Genitourinary,cat__apache_3j_bodysystem_Gynecological,cat__apache_3j_bodysystem_Hematological,cat__apache_3j_bodysystem_MISSING,cat__apache_3j_bodysystem_Metabolic,cat__apache_3j_bodysystem_Musculoskeletal/Skin,cat__apache_3j_bodysystem_Neurological,cat__apache_3j_bodysystem_Respiratory,cat__apache_3j_bodysystem_Sepsis,cat__apache_3j_bodysystem_Trauma
0,0.522727,0.007552,0.0,0.007168,-0.061876,0.00000,0.917526,-0.001664,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.204545,0.465032,0.0,0.003584,-0.022076,0.00000,0.005155,-0.087792,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.613636,-0.030776,0.0,-0.265233,-0.041976,0.00000,-0.005155,0.244998,0.00000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.068182,-0.327022,0.0,0.394265,0.033581,0.00000,0.938144,0.410548,0.00000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.818182,-0.436020,0.0,0.279570,-0.038556,-0.40288,-0.051546,0.076111,0.00000,-1.133078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73365,-1.068182,-0.305333,1.0,-0.175627,-0.060011,0.00000,0.494845,0.914733,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
73366,0.159091,0.078142,1.0,0.358423,0.189049,0.00000,0.321833,-0.342606,0.00000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
73367,0.204545,0.151199,1.0,0.609319,0.044153,0.00000,0.407216,0.664710,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73368,-0.386364,1.799951,0.0,-0.086022,-0.061876,0.49712,-0.108247,-0.170567,0.00000,-1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
print(list(X_test.columns))

['num__age', 'num__bmi', 'num__elective_surgery', 'num__height', 'num__pre_icu_los_days', 'num__albumin_apache', 'num__apache_2_diagnosis', 'num__apache_3j_diagnosis', 'num__arf_apache', 'num__bilirubin_apache', 'num__bun_apache', 'num__creatinine_apache', 'num__gcs_eyes_apache', 'num__gcs_motor_apache', 'num__gcs_unable_apache', 'num__gcs_verbal_apache', 'num__glucose_apache', 'num__heart_rate_apache', 'num__hematocrit_apache', 'num__intubated_apache', 'num__map_apache', 'num__resprate_apache', 'num__sodium_apache', 'num__temp_apache', 'num__urineoutput_apache', 'num__ventilated_apache', 'num__wbc_apache', 'num__d1_diasbp_max', 'num__d1_diasbp_min', 'num__d1_heartrate_min', 'num__d1_resprate_max', 'num__d1_resprate_min', 'num__d1_spo2_max', 'num__d1_spo2_min', 'num__d1_sysbp_max', 'num__d1_sysbp_min', 'num__d1_temp_max', 'num__d1_temp_min', 'num__h1_diasbp_max', 'num__h1_diasbp_min', 'num__h1_heartrate_max', 'num__h1_resprate_max', 'num__h1_resprate_min', 'num__h1_spo2_max', 'num__h1_

In [11]:
X_train.columns

Index(['num__age', 'num__bmi', 'num__elective_surgery', 'num__height',
       'num__pre_icu_los_days', 'num__albumin_apache',
       'num__apache_2_diagnosis', 'num__apache_3j_diagnosis',
       'num__arf_apache', 'num__bilirubin_apache',
       ...
       'cat__apache_3j_bodysystem_Genitourinary',
       'cat__apache_3j_bodysystem_Gynecological',
       'cat__apache_3j_bodysystem_Hematological',
       'cat__apache_3j_bodysystem_MISSING',
       'cat__apache_3j_bodysystem_Metabolic',
       'cat__apache_3j_bodysystem_Musculoskeletal/Skin',
       'cat__apache_3j_bodysystem_Neurological',
       'cat__apache_3j_bodysystem_Respiratory',
       'cat__apache_3j_bodysystem_Sepsis', 'cat__apache_3j_bodysystem_Trauma'],
      dtype='object', length=111)

In [14]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty="l1", solver="saga")
lr.fit(X_train, y_train)





In [15]:
from sklearn.metrics import accuracy_score
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.9223724378543393

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [18]:
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9289140863497601