In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from xgboost import XGBClassifier
import datetime
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from collections import defaultdict


In [2]:
# --- Experiment configuration -------------------------------------------------
# Directory prefix that is prepended to every CSV file name read below.
PATH = 'data/'

# One train/validation/test CSV triple is read per seed value.
SEEDS = list(range(1, 6))

# Sentinel substituted for NaN in the feature matrices; the same value is handed
# to XGBClassifier through its `missing` argument.
MISSING_VALUE = -999999

# Hyper-parameter grid: every combination of these four lists is evaluated.
PARAMS = dict(
    etas=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depths=[3, 4, 5],
    subsamples=[0.5, 0.8, 1.0],
    colsample_bytrees=[0.5, 0.8, 1.0],
)

In [3]:
# Base names of the vital-sign measurements.
VITAL_SIGN_COLS = [
    'Body_temperature',
    'Pre_peripheral_O2_saturation',
    'Post_peripheral_O2_saturation',
    'Pre_pulse_rate',
    'Post_pulse_rate',
    'Pre_dyspnea',
    'Post_dyspnea',
]

# Base names of the categorical symptom columns.
SYMPTOM_COLS = [
    'Fever',
    'Cough',
    'Runny_nose',
    'Sore_throat',
    'Smell',
    'Diarrhea',
]

# NOTE(review): this list is lower-case and is not referenced by the cells below,
# which select 'Age', 'BMI' and 'Gender' directly -- confirm which spelling the
# CSV files actually use.
DEMOGRAPHIC_COLS = ['age', 'gender', 'bmi']

TIMES = ['morning', 'evening']

LABEL_COLS = [
    'Admitted_within_1_day',
    'Admitted_within_2_days',
    'Admitted_within_3_days',
]


def _per_day_and_time(columns):
    """Expand base names into '<col>_day<d>_<time>' column names.

    Days 1..3 vary slowest, then the entries of TIMES, then the base columns,
    so the resulting order is day-major.
    """
    return [
        "{}_day{}_{}".format(col, day, period)
        for day in range(1, 4)
        for period in TIMES
        for col in columns
    ]


vital_sign_feats = _per_day_and_time(VITAL_SIGN_COLS)
symptom_feats = _per_day_and_time(SYMPTOM_COLS)


In [4]:
def _build_one_hot_table(categories):
    """Return {category: one-hot vector} plus an np.nan entry filled with MISSING_VALUE.

    The i-th category gets a vector of len(categories) with a 1 at position i.
    The np.nan key maps to a vector of the same width in which every slot is
    MISSING_VALUE, so a missing category is marked as missing in all of its
    one-hot columns.
    """
    width = len(categories)
    table = {
        name: np.array([1 if slot == position else 0 for slot in range(width)])
        for position, name in enumerate(categories)
    }
    table[np.nan] = np.array([MISSING_VALUE] * width)
    return table


# Lookup tables consumed by encode_one_hot_features.
symptom_one_hots = _build_one_hot_table(['no', 'stay', 'up', 'down'])
gender_one_hots = _build_one_hot_table(['male', 'female', 'other'])

In [5]:
def encode_one_hot_features(features, one_hot_dict):
    """One-hot encode a 2-D collection of categorical values.

    Parameters
    ----------
    features : iterable of iterables
        One inner iterable per sample; each element is a category value that
        is looked up in ``one_hot_dict``.
    one_hot_dict : dict
        Maps each category value to a 1-D array. A NaN key (e.g. ``np.nan``),
        if present, supplies the encoding used for missing values.

    Returns
    -------
    numpy.ndarray
        One row per sample, formed by concatenating the encodings of that
        sample's values in order.

    Raises
    ------
    KeyError
        If a value is not a key of ``one_hot_dict`` (for a NaN value: only
        when the table has no NaN key).
    """
    def _is_nan(value):
        # NaN is the only float that is not equal to itself.
        return isinstance(value, (float, np.floating)) and value != value

    # A plain dict lookup with a NaN key only succeeds when the *same object*
    # is used (NaN != NaN), so a NaN that is not literally `np.nan` would raise
    # KeyError. Resolve the missing-value encoding once and match NaN by value.
    missing_encoding = next(
        (encoding for key, encoding in one_hot_dict.items() if _is_nan(key)),
        None,
    )

    def _lookup(value):
        if missing_encoding is not None and _is_nan(value):
            return missing_encoding
        return one_hot_dict[value]

    return np.array([
        np.concatenate([_lookup(value) for value in row])
        for row in features
    ])

In [6]:
def prauc_score(label, pred):
    """Return the area under the precision-recall curve.

    `label` and `pred` are forwarded unchanged to `precision_recall_curve`;
    the resulting (recall, precision) points are integrated with `auc`.
    The thresholds returned alongside the curve are not used.
    """
    precisions, recalls, _ = precision_recall_curve(label, pred)
    return auc(recalls, precisions)

## Train model

In [7]:
# Validation PRAUCs gathered during the grid search, one dict per label column.
# Key: (eta, max_depth, subsample, colsample_bytree); value: list of PRAUCs,
# one entry appended per seed.
val_admitted_within_1_day_praucs = defaultdict(lambda: [])
val_admitted_within_2_days_praucs = defaultdict(lambda: [])
val_admitted_within_3_days_praucs = defaultdict(lambda: [])

# For every seed: read that seed's training/validation CSVs, build the feature
# matrices once, then fit and score every hyper-parameter combination.
for seed in SEEDS:
    train_df = pd.read_csv(PATH + 'training_SEED_{}.csv'.format(seed))
    val_df = pd.read_csv(PATH + 'validation_SEED_{}.csv'.format(seed))
    
    
    # Numeric block (vital signs + Age + BMI): replace NaN with MISSING_VALUE (-999999).
    train_df1 = train_df[vital_sign_feats + ['Age', 'BMI']].fillna(MISSING_VALUE)
    # Categorical blocks are left as-is here; NaN is handled by the one-hot
    # lookup tables, whose np.nan entry is filled with MISSING_VALUE.
    train_df2 = train_df[symptom_feats]
    train_df3 = train_df[['Gender']]
    
    # Same three-way split for the validation frame.
    val_df1 = val_df[vital_sign_feats + ['Age', 'BMI']].fillna(MISSING_VALUE)
    val_df2 = val_df[symptom_feats]
    val_df3 = val_df[['Gender']]
    
    
    train_feats1 = train_df1.values
    train_feats2 = train_df2.values
    train_feats3 = train_df3.values
    
    # Expand every symptom value / the gender value into its one-hot vector.
    train_feats2 = encode_one_hot_features(train_feats2, symptom_one_hots)
    train_feats3 = encode_one_hot_features(train_feats3, gender_one_hots)
    
    # Final column order: numeric block, symptom one-hots, gender one-hot.
    train_feats = np.concatenate([train_feats1, train_feats2, train_feats3], axis=1)  
    # One label column per prediction horizon, in LABEL_COLS order.
    train_labels = train_df[LABEL_COLS].values
    
    val_feats1 = val_df1.values
    val_feats2 = val_df2.values
    val_feats3 = val_df3.values
    
    val_feats2 = encode_one_hot_features(val_feats2, symptom_one_hots)
    val_feats3 = encode_one_hot_features(val_feats3, gender_one_hots)
    
    val_feats = np.concatenate([val_feats1, val_feats2, val_feats3], axis=1)  
    val_labels = val_df[LABEL_COLS].values
    
    
    # Exhaustive grid search over the four lists in PARAMS.
    for eta in PARAMS['etas']:
        for max_depth in PARAMS['max_depths']:
            for subsample in PARAMS['subsamples']:
                for colsample_bytree in PARAMS['colsample_bytrees']:
                    # Three independent classifiers with identical hyper-parameters,
                    # one per label column (0: 1 day, 1: 2 days, 2: 3 days).
                    # MISSING_VALUE is passed as XGBoost's `missing` marker.
                    admitted_within_1_day_model = XGBClassifier(colsample_bytree=colsample_bytree, learning_rate=eta, 
                                          max_depth=max_depth,subsample=subsample, missing=MISSING_VALUE, n_jobs=4)
                    admitted_within_1_day_model.fit(train_feats, train_labels[:, 0])
                    
                    admitted_within_2_days_model = XGBClassifier(colsample_bytree=colsample_bytree, learning_rate=eta, 
                                          max_depth=max_depth,subsample=subsample, missing=MISSING_VALUE, n_jobs=4)
                    admitted_within_2_days_model.fit(train_feats, train_labels[:, 1])
                    
                    admitted_within_3_days_model = XGBClassifier(colsample_bytree=colsample_bytree, learning_rate=eta, 
                                          max_depth=max_depth,subsample=subsample, missing=MISSING_VALUE, n_jobs=4)
                    admitted_within_3_days_model.fit(train_feats, train_labels[:, 2])
                    
                    
                    # Class-probability predictions on the validation features.
                    y_pred_admitted_within_1_day_val = admitted_within_1_day_model.predict_proba(val_feats)
                    y_pred_admitted_within_2_days_val = admitted_within_2_days_model.predict_proba(val_feats)
                    y_pred_admitted_within_3_days_val = admitted_within_3_days_model.predict_proba(val_feats)
                    
                    
                    # Score with column 1 of predict_proba -- presumably the
                    # positive class, assuming 0/1 labels (TODO confirm).
                    val_admitted_within_1_day_prauc = prauc_score(val_labels[:, 0], y_pred_admitted_within_1_day_val[:, 1])
                    val_admitted_within_2_days_prauc = prauc_score(val_labels[:, 1], y_pred_admitted_within_2_days_val[:, 1])
                    val_admitted_within_3_days_prauc = prauc_score(val_labels[:, 2], y_pred_admitted_within_3_days_val[:, 1])
                    
                    
                    # Key under which this combination's per-seed scores accumulate.
                    param = (eta, max_depth, subsample, colsample_bytree)
                    
                    val_admitted_within_1_day_praucs[param].append(val_admitted_within_1_day_prauc)
                    val_admitted_within_2_days_praucs[param].append(val_admitted_within_2_days_prauc)
                    val_admitted_within_3_days_praucs[param].append(val_admitted_within_3_days_prauc)
                    

## Select the best parameters based on the validation set

In [8]:
def _best_param_by_mean_prauc(praucs_by_param):
    """Return (best_param, best_mean) for the highest mean validation PRAUC.

    Parameters
    ----------
    praucs_by_param : dict
        Maps a hyper-parameter tuple to the list of PRAUCs recorded for it.

    Returns
    -------
    tuple
        The parameter tuple with the largest mean PRAUC and that mean. Ties keep
        the first candidate in dict order. If there is no candidate with a
        valid (non-NaN) mean, ``((), 0)`` is returned.
    """
    best_param, best_mean = (), None
    for candidate, praucs in praucs_by_param.items():
        mean_prauc = np.mean(praucs)
        if np.isnan(mean_prauc):
            # NaN compares False against everything; skip it explicitly.
            continue
        # Accept the first valid candidate unconditionally: starting the running
        # best at 0 with a strict '>' would reject every candidate whose mean is
        # exactly 0 and leave the selection empty.
        if best_mean is None or mean_prauc > best_mean:
            best_param, best_mean = candidate, mean_prauc
    return best_param, (0 if best_mean is None else best_mean)


# Each horizon is selected from its own score dict, so no key is looked up in
# (and accidentally inserted into) another horizon's defaultdict.
admitted_within_1_day_best_param, admitted_within_1_day_best_val_prauc = \
    _best_param_by_mean_prauc(val_admitted_within_1_day_praucs)
admitted_within_2_days_best_param, admitted_within_2_days_best_val_prauc = \
    _best_param_by_mean_prauc(val_admitted_within_2_days_praucs)
admitted_within_3_days_best_param, admitted_within_3_days_best_val_prauc = \
    _best_param_by_mean_prauc(val_admitted_within_3_days_praucs)

In [9]:
# Split each horizon's winning (eta, max_depth, subsample, colsample_bytree)
# tuple into individual names; the numeric suffix matches the horizon in days.
(
    (eta1, max_depth1, subsample1, colsample_bytree1),
    (eta2, max_depth2, subsample2, colsample_bytree2),
    (eta3, max_depth3, subsample3, colsample_bytree3),
) = (
    admitted_within_1_day_best_param,
    admitted_within_2_days_best_param,
    admitted_within_3_days_best_param,
)

## Evaluate on the test set

In [10]:
# Test-set PRAUC per seed, one list per label column.
test_admitted_within_1_day_praucs = []
test_admitted_within_2_days_praucs = []
test_admitted_within_3_days_praucs = []

# For every seed: refit on that seed's training CSV with the selected
# hyper-parameters and score on that seed's testing CSV.
for seed in SEEDS:
    train_df = pd.read_csv(PATH + 'training_SEED_{}.csv'.format(seed))
    test_df = pd.read_csv(PATH + 'testing_SEED_{}.csv'.format(seed))
    
    # Numeric block (vital signs + Age + BMI): replace NaN with MISSING_VALUE (-999999).
    train_df1 = train_df[vital_sign_feats + ['Age', 'BMI']].fillna(MISSING_VALUE)
    # Categorical blocks are left as-is here; NaN is handled by the one-hot
    # lookup tables, whose np.nan entry is filled with MISSING_VALUE.
    train_df2 = train_df[symptom_feats]
    train_df3 = train_df[['Gender']]
    
    
    # Same three-way split for the test frame.
    test_df1 = test_df[vital_sign_feats + ['Age', 'BMI']].fillna(MISSING_VALUE)
    test_df2 = test_df[symptom_feats]
    test_df3 = test_df[['Gender']]
    
    train_feats1 = train_df1.values
    train_feats2 = train_df2.values
    train_feats3 = train_df3.values
    
    # Expand every symptom value / the gender value into its one-hot vector.
    train_feats2 = encode_one_hot_features(train_feats2, symptom_one_hots)
    train_feats3 = encode_one_hot_features(train_feats3, gender_one_hots)
    
    # Final column order: numeric block, symptom one-hots, gender one-hot.
    train_feats = np.concatenate([train_feats1, train_feats2, train_feats3], axis=1)  
    # One label column per prediction horizon, in LABEL_COLS order.
    train_labels = train_df[LABEL_COLS].values
    
    test_feats1 = test_df1.values
    test_feats2 = test_df2.values
    test_feats3 = test_df3.values
    
    test_feats2 = encode_one_hot_features(test_feats2, symptom_one_hots)
    test_feats3 = encode_one_hot_features(test_feats3, gender_one_hots)
    
    test_feats = np.concatenate([test_feats1, test_feats2, test_feats3], axis=1) 
    test_labels = test_df[LABEL_COLS].values
    
    
    # Each horizon uses its own selected hyper-parameters (suffix 1/2/3).
    # MISSING_VALUE is passed as XGBoost's `missing` marker.
    admitted_within_1_day_model = XGBClassifier(colsample_bytree=colsample_bytree1, learning_rate=eta1, 
                                          max_depth=max_depth1,subsample=subsample1, missing=MISSING_VALUE, n_jobs=4)
    admitted_within_1_day_model.fit(train_feats, train_labels[:, 0])
    
    
    admitted_within_2_days_model = XGBClassifier(colsample_bytree=colsample_bytree2, learning_rate=eta2, 
                                          max_depth=max_depth2,subsample=subsample2, missing=MISSING_VALUE, n_jobs=4)
    admitted_within_2_days_model.fit(train_feats, train_labels[:, 1])
    
    
    admitted_within_3_days_model = XGBClassifier(colsample_bytree=colsample_bytree3, learning_rate=eta3, 
                                          max_depth=max_depth3,subsample=subsample3, missing=MISSING_VALUE, n_jobs=4)
    admitted_within_3_days_model.fit(train_feats, train_labels[:, 2])
    
    
    # Class-probability predictions on the test features.
    y_pred_admitted_within_1_day_test = admitted_within_1_day_model.predict_proba(test_feats)
    y_pred_admitted_within_2_days_test = admitted_within_2_days_model.predict_proba(test_feats)
    y_pred_admitted_within_3_days_test = admitted_within_3_days_model.predict_proba(test_feats)
    
    # Score with column 1 of predict_proba -- presumably the positive class,
    # assuming 0/1 labels (TODO confirm).
    test_admitted_within_1_day_prauc = prauc_score(test_labels[:, 0], y_pred_admitted_within_1_day_test[:, 1])
    test_admitted_within_2_days_prauc = prauc_score(test_labels[:, 1], y_pred_admitted_within_2_days_test[:, 1])
    test_admitted_within_3_days_prauc = prauc_score(test_labels[:, 2], y_pred_admitted_within_3_days_test[:, 1])
    
    
    test_admitted_within_1_day_praucs.append(test_admitted_within_1_day_prauc)
    test_admitted_within_2_days_praucs.append(test_admitted_within_2_days_prauc)
    test_admitted_within_3_days_praucs.append(test_admitted_within_3_days_prauc)

# Report mean +- standard deviation of the per-seed test PRAUCs
# (np.std with its default arguments).
print('Admitted within 1 day PRAUC: {:.4f} +- {:.4f}'.format(np.mean(test_admitted_within_1_day_praucs), np.std(test_admitted_within_1_day_praucs)))
print('Admitted within 2 days PRAUC: {:.4f} +- {:.4f}'.format(np.mean(test_admitted_within_2_days_praucs), np.std(test_admitted_within_2_days_praucs)))
print('Admitted within 3 days PRAUC: {:.4f} +- {:.4f}'.format(np.mean(test_admitted_within_3_days_praucs), np.std(test_admitted_within_3_days_praucs)))

Admitted within 1 day PRAUC: 0.7700 +- 0.0693
Admitted within 2 days PRAUC: 0.8359 +- 0.0372
Admitted within 3 days PRAUC: 0.9451 +- 0.0185
