In [1]:
import shap
import numpy  as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import sklearn

from sklearn.impute   import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics  import accuracy_score, auc, roc_curve, precision_recall_curve, roc_auc_score, precision_score, recall_score, average_precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost  import XGBClassifier

# Select matplotlib's 'ggplot' style sheet for matplotlib figures created from here on.
plt.style.use('ggplot')

pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.


In [None]:
def evaluate(model, testing_set_x, testing_set_y, threshold = 0.5):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``. Column index 1 of its
        output is used as the score of the positive class.
    testing_set_x : array-like
        Features handed to ``model.predict_proba``.
    testing_set_y : array-like
        True labels matching ``testing_set_x`` row for row.
    threshold : float, default 0.5
        Scores greater than or equal to this value are counted as positive
        predictions for the label-based metrics (accuracy, precision, recall).

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Accuracy``, ``Precision``, ``Recall``,
        ``ROC_auc`` and ``PR_auc``.
    """
    positive_scores = model.predict_proba(testing_set_x)[:, 1]
    # Binarise once and reuse the mask for every label-based metric.
    predicted_labels = positive_scores >= threshold

    metrics = {
        # Label-based metrics depend on the chosen threshold...
        'Accuracy':  accuracy_score(testing_set_y, predicted_labels),
        'Precision': precision_score(testing_set_y, predicted_labels),
        'Recall':    recall_score(testing_set_y, predicted_labels),
        # ...while the ranking metrics are computed from the raw scores.
        'ROC_auc':   roc_auc_score(testing_set_y, positive_scores),
        'PR_auc':    average_precision_score(testing_set_y, positive_scores),
    }

    return pd.DataFrame([metrics], columns=['Accuracy', 'Precision', 'Recall', 'ROC_auc','PR_auc'])

In [None]:
def run_experiment(df, model_class, n = 100, **kwargs):
    """Train and evaluate ``model_class`` on ``n`` different stratified splits.

    For every ``i`` in ``range(n)`` the data is split 70/30 (stratified on the
    ``ICU`` column, ``random_state=i``), a fresh ``model_class(**kwargs)`` is
    fitted on the training part and scored on the test part with ``evaluate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``PATIENT_VISIT_IDENTIFIER`` (dropped before
        splitting) and ``ICU`` (used as the target and for stratification).
    model_class : callable
        Estimator class; instances must provide ``fit`` and whatever
        ``evaluate`` calls on them.
    n : int, default 100
        Number of split/train/evaluate repetitions.
    **kwargs
        Forwarded unchanged to ``model_class``.

    Returns
    -------
    pandas.DataFrame
        One row per repetition (index ``0..n-1``) with the columns
        ``Accuracy``, ``Precision``, ``Recall``, ``ROC_auc`` and ``PR_auc``.
        Empty, with those columns, when ``n`` is 0.
    """
    metric_columns = ['Accuracy', 'Precision', 'Recall', 'ROC_auc','PR_auc']

    # The identifier column is dropped the same way on every run, so do it once.
    features_and_target = df.drop('PATIENT_VISIT_IDENTIFIER', axis=1)

    # Collect the per-run frames in a list and concatenate once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # row by row copies it on every iteration.
    per_run_results = []
    for i in range(n):
        # Compose dataset
        train_x, test_x = train_test_split(features_and_target,
                                           test_size = 0.3,
                                           stratify  = df['ICU'],
                                           random_state = i
                                          )

        train_y = train_x.pop('ICU')
        test_y  = test_x.pop('ICU')

        # Train Model
        model = model_class(**kwargs)
        model.fit(train_x, train_y)

        # Evaluate results
        per_run_results.append(evaluate(model, test_x, test_y))

    if not per_run_results:
        # pd.concat rejects an empty list; keep the original "empty frame" result.
        return pd.DataFrame(columns=metric_columns)

    return pd.concat(per_run_results, ignore_index=True)

In [None]:
def print_results(df, plot = True, extras = False, color='dodgerblue'):
    """Print a summary of experiment metrics and optionally plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per run with the columns ``Accuracy``, ``Precision``,
        ``Recall``, ``ROC_auc`` and ``PR_auc``.
    plot : bool, default True
        When true, show a horizontal plotly box plot with one box per metric.
    extras : bool, default False
        When true, additionally print the maximum of every metric.
    color : str, default 'dodgerblue'
        Colour used for the boxes.

    Returns
    -------
    None
    """
    banner = '|||||||||||||||||||||||||||||||||||||||||||||||||||||||'
    # (line template, column holding the metric) in display order.
    summary_lines = (
        ('Accuracy:   {}', 'Accuracy'),
        ('Precision:  {}', 'Precision'),
        ('Recall:     {}', 'Recall'),
        ('ROC Auc:    {}', 'ROC_auc'),
        ('PR Auc:     {}', 'PR_auc'),
    )

    print(banner)
    print('[ Experiment Results ]')
    for template, column in summary_lines:
        print(template.format(df[column].mean()))
    print(banner)

    if plot:
        fig = px.box(df.melt(var_name='metric'),
                     y = 'metric',
                     x = 'value',
                     # Report the actual number of runs instead of assuming 100.
                     title = 'Distribution of Metric Values Across {} Runs'.format(len(df)),
                     color_discrete_sequence=[color]
                    )

        # The values are on the x axis and the metric names on the y axis,
        # so the axis titles have to follow that orientation.
        fig.update_xaxes(title='Value')
        fig.update_yaxes(title='Metric')

        fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 00)',
                           'paper_bgcolor': 'rgba(240, 240, 240, 100)'})
        fig.show()

    if extras:
        print('Also, the maximum results were:')
        for template, column in summary_lines:
            print('    ' + template.format(df[column].max()))

In [None]:
# Read data
raw_data = pd.read_excel('Kaggle_Sirio_Libanes_ICU_Prediction.xlsx')

# Data Preparation
# AGE_PERCENTIL: drop the 'Above ' prefix, keep what precedes 'th', and turn it
# into a number explicitly rather than leaving numeric strings for the imputer
# to coerce. `expand=False` makes extract return a Series, not a 1-column frame.
raw_data['AGE_PERCENTIL'] = pd.to_numeric(
    raw_data['AGE_PERCENTIL']
    .str.replace('Above ', '', regex=False)
    .str.extract(r'(.+?)th', expand=False)
)

# WINDOW: rewrite 'ABOVE_12' as '12-more' so that every value has a '-' and the
# same pattern (text before the first '-') applies to all rows.
raw_data['WINDOW'] = pd.to_numeric(
    raw_data['WINDOW']
    .str.replace('ABOVE_12', '12-more', regex=False)
    .str.extract(r'(.+?)-', expand=False)
)

# Missingness as features
raw_data['row_missingness'] = raw_data.isnull().sum(axis=1)

# Mean imputation
# NOTE(review): the imputer is fitted on every row, i.e. before run_experiment
# splits the data, so rows that later land in a test split contribute to the
# column means — confirm this is acceptable for the reported metrics.
mean_impute  = SimpleImputer(strategy='mean')
imputed_data = mean_impute.fit_transform(raw_data)
imputed_data = pd.DataFrame(imputed_data, columns = raw_data.columns)

In [None]:
# Number of rows per value of the ICU target column
raw_data.ICU.value_counts()

In [None]:
# Keyword arguments handed to RandomForestClassifier by the experiment run.
rf_optimal = dict(
    n_estimators=2100,
    max_depth=27,
    max_features=0.15,
    max_samples=0.5363991145732665,
    min_samples_split=2,
    min_samples_leaf=4,
    n_jobs=-1,
    random_state=451,
)

In [None]:
# Show which scikit-learn release this kernel is running
print('{}'.format(sklearn.__version__))

In [None]:
# Repeat the random-forest experiment with the tuned hyper-parameters,
# then summarise and plot the per-run metrics.
rf_experiment = run_experiment(imputed_data, RandomForestClassifier, **rf_optimal)
print_results(df=rf_experiment, color='#3F3F3F')