In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score
import tensorflow as tf
import tensorflow.keras as keras
from keras import Model
from keras import Sequential
from keras.layers import Dense, Concatenate, Input
from keras.optimizers import Adagrad, Adam
import numpy as np

Using TensorFlow backend.


In [2]:
# 1. Import data
train_data = pd.read_csv('train_features.csv')
df = pd.DataFrame(train_data)

train_labels = pd.read_csv('train_labels.csv') # Last 4 columns are real-valued, everything else is boolean
df2 = pd.DataFrame(train_labels)

In [3]:
# 2.  Pre-processing
# 2a. Handle missing data
df = df.fillna(df.mean())

# 2b. Concatenate rows from a single patient, do not duplicate age
df = df.drop("Time", axis=1)
num_patients = int(df.shape[0]/12)
num_feats = int(df.shape[1]*12 - 12 - 11) # Remove patient id, extra age entries

df_np = df.to_numpy()
df_np_new = np.zeros((num_patients, num_feats))

for patient_idx in range(num_patients):
    df_np_new[0,:] = np.hstack((df_np[patient_idx,1:], df_np[patient_idx+1,2:], df_np[patient_idx+2,2:], 
                                df_np[patient_idx+3,2::], df_np[patient_idx+4,2:], df_np[patient_idx+5,2:], 
                                df_np[patient_idx+6,2::], df_np[patient_idx+7,2:], df_np[patient_idx+8,2:], 
                                df_np[patient_idx+9,2::], df_np[patient_idx+10,2:], df_np[patient_idx+11,2:]))

# 2c. Process training labels
df2 = df2.drop("pid", axis=1)
df2_np = df2.to_numpy()

# Separate labels into classification and regression tasks
num_labels = df2.shape[1]
num_class_labels = num_labels - 4
num_regress_labels = 4

class_labels = df2_np[:,0:num_labels-4]
regress_labels = df2_np[:,num_labels-4:]

In [4]:
# 2c. Normalize, balance the data with cost-sensitive loss
scaler = StandardScaler()
scaler.fit(df_np_new)
df_np_new = scaler.transform(df_np_new)

# 3.  Balance the data using cost-sensitive loss
from sklearn.utils import class_weight

class_weights = [class_weight.compute_class_weight('balanced',
                                                  np.unique(class_labels[:,idx]),
                                                  class_labels[:,idx]) for idx in range(num_class_labels)]

In [5]:
# 3.  Model architecture
x = Input(shape=(num_feats,))
h1 = Dense(64, activation='relu')(x)
h2 = Dense(64, activation='relu')(h1)
y = [Dense(1, activation='sigmoid')(h2) for idx in range(num_class_labels)]

models = [Model(inputs=x, outputs=y[i]) for i in range(num_class_labels)] 

# 4.  Optimizer
def auroc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

ada_grad = Adagrad(lr=0.1, epsilon=1e-08, decay=0.5)
adam_grad = Adam(lr=0.1, beta_1=0.9, beta_2=0.9)

In [10]:
# 5.  Compile the models
for i in range(num_class_labels):
    models[i].compile(optimizer=adam_grad, loss='binary_crossentropy',
                      metrics=['accuracy', auroc])

In [14]:
# 6.  Cross-validate : learning rate, momentum, 
for i in range(1):
    models[i].fit(
        df_np_new,
        class_labels[:,i],
        epochs=1000,
        batch_size=1000,       class_weight=class_weights[i]
    )

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
 1100/18995 [>.............................] - ETA: 3s - loss: 0.5775 - acc: 0.7364 - auroc: 0.4996

KeyboardInterrupt: 