In [61]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
import tensorflow.keras as keras
from keras import Model
from keras import Sequential
from keras import regularizers
from keras.layers import Dense, Concatenate, Input, BatchNormalization
from keras.optimizers import Adagrad, Adam
from keras.callbacks import EarlyStopping
from keras.metrics import Precision

import numpy as np

import itertools as it
from statistics import mean

ImportError: cannot import name 'Precision'

In [2]:
# 1. Import data
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapping
# is needed; `df` / `df2` are the working names used by the cells below.
train_data = pd.read_csv('train_features.csv')
df = train_data

# Last 4 columns are real-valued, everything else is boolean
train_labels = pd.read_csv('train_labels.csv')
df2 = train_labels

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227940 entries, 0 to 227939
Data columns (total 37 columns):
pid                 227940 non-null int64
Time                227940 non-null int64
Age                 227940 non-null float64
EtCO2               9783 non-null float64
PTT                 10299 non-null float64
BUN                 20105 non-null float64
Lactate             10756 non-null float64
Temp                81115 non-null float64
Hgb                 22295 non-null float64
HCO3                12559 non-null float64
BaseExcess          19887 non-null float64
RRate               187785 non-null float64
Fibrinogen          2493 non-null float64
Phosphate           11590 non-null float64
WBC                 19083 non-null float64
Creatinine          17792 non-null float64
PaCO2               21043 non-null float64
AST                 5761 non-null float64
FiO2                26602 non-null float64
Platelets           18035 non-null float64
SaO2                13014 non-nu

In [3]:
# 2.  Pre-processing
ROWS_PER_PATIENT = 12  # number of consecutive rows that belong to one patient

# 2a. Handle missing data: every NaN is replaced by its column mean,
#     computed over all rows of the table.
df = df.fillna(df.mean())

# 2b. Concatenate rows from a single patient, do not duplicate age
df = df.drop("Time", axis=1)
num_patients = df.shape[0] // ROWS_PER_PATIENT

df_np = df.to_numpy()

# View the table as (patient, row-within-patient, column). Any trailing rows
# that do not fill a complete patient block are ignored.
# NOTE(review): assumes the rows of each patient are contiguous and already in
# time order in the CSV -- confirm against the data source.
patient_blocks = df_np[:num_patients * ROWS_PER_PATIENT].reshape(
    num_patients, ROWS_PER_PATIENT, df_np.shape[1])

# After dropping "Time", column 0 is pid and column 1 is Age. Age is taken once
# (from the patient's first row); the remaining columns of all rows are laid
# out side by side in row order.
age = patient_blocks[:, 0, 1:2]
measurements = patient_blocks[:, :, 2:].reshape(num_patients, -1)

# One row per patient. The previous loop wrote every patient into row 0 and
# read rows patient_idx..patient_idx+11 instead of the patient's own block,
# which left all other rows of the design matrix at zero.
df_np_new = np.hstack((age, measurements))
num_feats = df_np_new.shape[1]  # == df.shape[1]*12 - 12 - 11 (pid and extra ages removed)

# 2c. Process training labels
# NOTE(review): the label rows are presumably in the same patient order as the
# feature rows; pid is dropped here without being checked -- verify.
df2 = df2.drop("pid", axis=1)
df2_np = df2.to_numpy()

# Separate labels into classification and regression tasks:
# the last `num_regress_labels` columns are regression targets.
num_regress_labels = 4
num_labels = df2.shape[1]
num_class_labels = num_labels - num_regress_labels

class_labels = df2_np[:, :num_class_labels]
regress_labels = df2_np[:, num_class_labels:]

x_train = df_np_new
y_train_cls = class_labels
y_train_reg = regress_labels

In [64]:
# 2d. Normalize, balance the data with cost-sensitive loss
# The scaler is fitted on the training matrix and applied to it in one call;
# `scaler` stays available (already fitted) for transforming other data later.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

# # Optional while debugging: shrink the data set
# x_train = x_train[:2000,:]
# y_train_cls = y_train_cls[:2000,:]
# y_train_reg = y_train_reg[:2000,:]

from keras.callbacks import Callback

class RocCallback(Callback):
    """Keras callback that prints ROC-AUC on two data sets after every epoch.

    At the end of each epoch the attached model predicts on the stored training
    and validation inputs, and both ROC-AUC scores are printed on one line.
    Predicting on the full training set every epoch costs an extra forward
    pass over that data.

    Args:
        training_data: ``(x, y)`` pair; ``x`` is passed to ``model.predict`` and
            ``y`` is the matching ground truth handed to ``roc_auc_score``.
        validation_data: ``(x_val, y_val)`` pair, used the same way.

    Only ``on_epoch_end`` is overridden. The other hooks (train/epoch/batch
    begin and end) are inherited from ``Callback``, whose implementations are
    already no-ops, so redefining them here added nothing.
    """

    def __init__(self, training_data, validation_data):
        # Let the base class initialise its own bookkeeping attributes first.
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print train / validation ROC-AUC for the finished epoch.

        Args:
            epoch: index of the epoch that just ended (unused).
            logs: metric dict supplied by Keras (unused). Defaults to ``None``
                rather than a shared mutable ``{}``.
        """
        y_pred_train = self.model.predict(self.x)
        roc_train = roc_auc_score(self.y, y_pred_train)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        # '\r' plus the padded line ending overwrite the Keras progress bar.
        print('\rroc-auc_train: %s - roc-auc_val: %s' % (str(round(roc_train,4)),str(round(roc_val,4))),end=100*' '+'\n')
        return

In [65]:
# 3.  Model architecture
def binary_classifier(lr, decay, alpha=None):
    """Build and compile a small fully connected binary classifier.

    Architecture: input of width ``num_feats`` -> Dense(30, relu) -> BatchNorm
    -> Dense(30, relu) -> BatchNorm -> Dense(10, relu) -> BatchNorm
    -> Dense(1, sigmoid).

    Args:
        lr: learning rate passed to the Adagrad optimizer.
        decay: learning-rate decay passed to the Adagrad optimizer.
        alpha: currently unused. Kept (now optional) so that both the
            three-argument and the two-argument call sites in this notebook
            work.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss and the
        ``accuracy`` metric.
    """
    x = Input(shape=(num_feats,))
    h1 = Dense(30, activation='relu')(x)
    h1n = BatchNormalization()(h1)
    # Each Dense layer consumes the *normalized* output of the previous one.
    # Previously h2 and h3 were fed h1 / h2 directly, which left the first two
    # BatchNormalization layers disconnected from the model.
    h2 = Dense(30, activation='relu')(h1n)
    h2n = BatchNormalization()(h2)
    h3 = Dense(10, activation='relu')(h2n)
    h3n = BatchNormalization()(h3)
    y = Dense(1, activation='sigmoid')(h3n)

    # To-do: Insert outputs for regression outputs
    model = Model(inputs=x, outputs=y)
    ada_grad = Adagrad(lr=lr, epsilon=1e-08, decay=decay)
    #adam_grad = Adam(lr=0.1, beta_1=0.9, beta_2=0.9)

    model.compile(optimizer=ada_grad, loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

# 3.  Model architecture (regression network)
def regressor(lr, decay):
    """Build and compile the multi-output regression network.

    Four identical hidden stages (Dense(10, relu, he_uniform) followed by
    BatchNormalization) are stacked on an input of width ``num_feats``; a
    linear Dense layer with ``num_regress_labels`` units produces the outputs.

    Args:
        lr: learning rate passed to the Adagrad optimizer.
        decay: learning-rate decay passed to the Adagrad optimizer.

    Returns:
        A Keras ``Model`` compiled with mean-squared-error loss and the
        ``mse`` metric.
    """
    inputs = Input(shape=(num_feats,))

    # Hidden stack: Dense -> BatchNorm, repeated four times.
    hidden = inputs
    for _ in range(4):
        hidden = Dense(10, activation='relu', kernel_initializer='he_uniform')(hidden)
        hidden = BatchNormalization()(hidden)

    outputs = Dense(num_regress_labels, activation='linear',
                    kernel_initializer='he_normal')(hidden)

    # To-do: Insert outputs for regression outputs
    model = Model(inputs=inputs, outputs=outputs)
    #adam_grad = Adam(lr=0.1, beta_1=0.9, beta_2=0.9)
    model.compile(optimizer=Adagrad(lr=lr, epsilon=1e-08, decay=decay),
                  loss='mean_squared_error',
                  metrics=['mse'])

    return model

In [66]:
# Extra multiplicative emphasis applied to the largest weight of each label
# column. With 'balanced' weights the largest weight belongs to the least
# frequent class.
MAX_WEIGHT_BOOST = 1.1

# One {class_index: weight} dict per classification label column, in the form
# expected by the `class_weight` argument of `Model.fit`.
class_weights = []
for i in range(num_class_labels):
    labels = y_train_cls[:, i]
    # Keyword arguments: newer scikit-learn releases no longer accept these
    # parameters positionally, and the names also work on older releases.
    weights = compute_class_weight(class_weight='balanced',
                                   classes=np.unique(labels),
                                   y=labels)
    weights[weights.argmax()] *= MAX_WEIGHT_BOOST
    # np.unique returns the classes sorted, so position k corresponds to the
    # k-th smallest label value (0 and 1 for boolean labels).
    class_weights.append(dict(enumerate(weights)))

In [70]:
# Try to overfit the model to a subset of the data
# Trains one classifier for label column `cls_idx` on the first stratified
# fold only, printing ROC-AUC after every epoch through RocCallback.
cls_idx = 0
n_splits=10
skf = StratifiedKFold(n_splits=n_splits)
folds = skf.split(x_train, y_train_cls[:,cls_idx])

lr=5e-3
decay=0.1
alpha=1e2
cls_model = binary_classifier(lr, decay, alpha)

batch_size = 20
epochs = 5

# `cls_idx` is the name defined above; the previous `cls_index` was never
# assigned in this notebook and raised NameError on a fresh kernel.
print(class_weights[cls_idx])

# Only the first train/validation split is used.
train_idx, test_idx = next(folds)

sample_x = x_train[train_idx,:]
sample_y = y_train_cls[train_idx,cls_idx]

test_x = x_train[test_idx,:]
test_y = y_train_cls[test_idx,cls_idx]

roc = RocCallback(training_data=(sample_x, sample_y),
                  validation_data=(test_x, test_y))

cls_model.fit(sample_x, sample_y,
              validation_data=(test_x, test_y),
              batch_size=batch_size, epochs=epochs,
              class_weight=class_weights[cls_idx],
              callbacks=[roc],
              shuffle=True)

{0: 0.6833225411900137, 1: 2.0500883045525904}
Train on 17095 samples, validate on 1900 samples
Epoch 1/5


AttributeError: 'Model' object has no attribute 'predict_proba'

In [None]:
# Parameters to CV over
# Exhaustive grid: 8 learning rates x 5 decays x 3 batch sizes, each evaluated
# with `n_splits`-fold cross-validation, so this cell is expensive to run.
params = {}
params['lr'] = np.logspace(-4,1,8)
params['decay'] = np.logspace(-2,1,5)
params['batch_size'] = np.linspace(500,3500,3)

# Label columns whose classifier is tuned. Only the first one for now; the
# final report below iterates over the same indices so it never prints
# untouched placeholder entries.
cls_indices = range(1)

best_reg = {'err':99e6,'params':None, 'train_losses':[], 'val_losses':[]}
best_cls = [{'roc':0,'params':None,'acc':0, 'train_losses':[], 'val_losses':[]} for i in range(num_class_labels)]

epochs = 25
es = EarlyStopping(monitor='val_loss', patience=10, mode='min')

# Initialize splitter
n_splits=6
skf = StratifiedKFold(n_splits=n_splits)
kf = KFold(n_splits=n_splits)

# Get parameter combinations
keys = list(params.keys())
values = (params[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in it.product(*values)]

for param_config in combinations:
    print(param_config)
    lr = param_config['lr']
    decay = param_config['decay']
    batch_size = int(param_config['batch_size'])  # linspace yields floats

    # Generate folds (lazy generators; the regression folds are only consumed
    # by the commented-out regressor section below)
    reg_folds = kf.split(x_train, y_train_reg)

    # Classifiers
    for i in cls_indices:

        rocs, accs, train_losses, val_losses = [], [], [], []
        for train_idx, test_idx in skf.split(x_train, y_train_cls[:,i]):
            # A fresh model per fold. binary_classifier takes a third
            # argument (alpha) that it does not use; None is passed explicitly
            # so the call matches its signature.
            model = binary_classifier(lr, decay, None)
            x, y = x_train[train_idx], y_train_cls[train_idx,i]
            x_val, y_val = x_train[test_idx] , y_train_cls[test_idx,i]
            history = model.fit(x, y,
                               batch_size=batch_size, epochs=epochs,
                               validation_data=(x_val,y_val),
                               class_weight=class_weights[i],
                               callbacks=[es],
                               shuffle=True)

            # The accuracy metric is logged as 'acc' or 'accuracy' depending
            # on the Keras version.
            acc_key = 'acc' if 'acc' in history.history else 'accuracy'
            accs.append(float(history.history[acc_key][-1]))
            # The model is compiled with accuracy only, so the history has no
            # 'auroc' entry; score the held-out fold directly instead.
            rocs.append(float(roc_auc_score(y_val, model.predict(x_val))))
            train_losses.append(history.history['loss'])
            val_losses.append(history.history['val_loss'])

            del model

        if mean(rocs) > best_cls[i]['roc']:
            best_cls[i]['params'] = param_config
            best_cls[i]['roc'], best_cls[i]['acc'] = mean(rocs), mean(accs)
            best_cls[i]['train_losses'] = train_losses
            best_cls[i]['val_losses'] = val_losses
            print(f"New best roc for class #{i} : {best_cls[i]['roc']}")

#     # Regressor
#     errs = []
#     for train_idx, test_idx in reg_folds:
#         model = regressor(lr, decay)
#         x, y = x_train[train_idx], y_train_reg[train_idx,:]
#         x_val, y_val = x_train[test_idx] , y_train_reg[test_idx,:]
#         history = model.fit(x, y,
#                            batch_size=batch_size, epochs=epochs,
#                            validation_data=(x_val,y_val))

#         errs.append(history.history['mean_squared_error'][-1])
#         del model

#     if mean(errs) < best_reg['err']:
#         best_reg['params'] = param_config
#         best_reg['err'] = mean(errs)

#         print(f"New best err for regressor : {best_reg['err']}")

print('<<<FINAL BEST PARAMS>>>')
for i in cls_indices:
    print("----------")
    print(f"Class {i}")
    print("----------")
    print(best_cls[i]['params'])
    print(f"ROC = {best_cls[i]['roc']}")

# While the regressor section above stays commented out, these lines print the
# initial placeholder values of `best_reg`.
print("----------")
print("Regressor")
print("----------")
print(best_reg['params'])
print(f"MSE = {best_reg['err']}")