In [1]:
import numpy as np
from utils import process_data, multilabel_confusion_matrix

from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import Word2Vec

from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU, CuDNNLSTM, GRU, LSTM, Reshape, TimeDistributed
from keras.models import Model
from keras.callbacks import Callback, EarlyStopping

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, hamming_loss

Using TensorFlow backend.


In [2]:
# Defining some constants: 
window_size   = 7   # Window size for word2vec
embed_size    = 30   # Length of the vector that we willl get from the embedding layer
latent_dim    = 1024  # Hidden layers dimension 
dropout_rate  = 0.2   # Rate of the dropout layers
batch_size    = 2    # Batch size
epochs        = 30    # Number of epochs
max_features  = 60000
#maxlen        = 1000

In [3]:
notes_train_1, labels_train_1, gold_labels_train_1 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set1', up=3) 
notes_train_2, labels_train_2, gold_labels_train_2 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set2', up=3) 
notes_train = notes_train_1 + notes_train_2
labels_train = labels_train_1 + labels_train_2
gold_labels_train = gold_labels_train_1 + gold_labels_train_2
notes_test, labels_test, gold_labels_test = process_data('/host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete') 
notes = notes_train + notes_test
labels = labels_train + labels_test
gold_labels = gold_labels_train + gold_labels_test
notes_train = np.array(notes_train)
notes_test = np.array(notes_test)

In [4]:
print(len(labels_train_1), 
      len(gold_labels_train_1), 
      len(notes_train_1),
      
      len(labels_train_2),
      len(gold_labels_train_2), 
      len(notes_train_2),
      
      len(labels_train), 
      len(gold_labels_train),
      len(notes_train),
      
      len(labels_test), 
      len(gold_labels_test), 
      len(notes_test),
      
      len(labels),
      len(gold_labels),
      len(notes))

521 521 521 269 269 269 790 790 790 514 514 514 1304 1304 1304


In [5]:
maxlen = max([len(i) for i in notes])

X_txt = [' '.join(i) for i in notes]
X_train_txt = [' '.join(i) for i in notes_train]
X_test_txt = [' '.join(i) for i in notes_test]

tokenizer = Tokenizer(num_words=max_features, filters='')
tokenizer.fit_on_texts(X_txt)

X_seq = tokenizer.texts_to_sequences(X_txt) 
X_seq = pad_sequences(X_seq, maxlen=maxlen, padding='post')

X_train_seq = tokenizer.texts_to_sequences(X_train_txt) 
X_train_seq = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')

X_test_seq = tokenizer.texts_to_sequences(X_test_txt) 
X_test_seq = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

In [6]:
# helper functions
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

def get_embedding_matrix(embedding_index, word_index, max_features, embed_size):
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: 
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i-1] = embedding_vector
    return embedding_matrix

# prepare embedding matrix
w2v = Word2Vec(notes, size=embed_size, window=window_size, min_count=1, workers=4)
embedding_index = dict(zip(w2v.wv.index2word, w2v.wv.vectors))
embedding_matrix = get_embedding_matrix(embedding_index=embedding_index, word_index=word_index, max_features=max_features, embed_size=embed_size)

In [7]:
all_labels = [label for notes_label in labels for label in notes_label]

mlb = MultiLabelBinarizer()
mlb.fit(all_labels)
l_train = []
l_test = []
for i in labels_train:
    l = mlb.transform(i)
    l_train.append(l)
for i in labels_test:
    l = mlb.transform(i)
    l_test.append(l)
    
Y_train = []
Y_test = []
num_labels = len(mlb.classes_)
for i in l_train:
    pad_i = np.concatenate((np.zeros((maxlen-i.shape[0],num_labels)), i))
    Y_train.append(pad_i)
for i in l_test:
    pad_i = np.concatenate((np.zeros((maxlen-i.shape[0],num_labels)), i))
    Y_test.append(pad_i)    
    
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

all_gold_labels = [label for notes_label in gold_labels for label in notes_label]

In [8]:
def get_cat_labels(label):
    c = '.'
    positions = [pos for pos, char in enumerate(label) if char == c]
    if label != 'O':
        sl = slice(positions[0]+1,positions[1])
        cat_label = label[sl]
    else:
        cat_label = label
    return cat_label    

In [9]:
cat_labels = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels]
cat_labels_train = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels_train]
cat_labels_test = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels_test]

all_cat_labels = [label for notes_label in cat_labels for label in notes_label]

cat_mlb = MultiLabelBinarizer()
cat_mlb.fit(all_cat_labels)
l_cat_train = []
l_cat_test = []
for i in cat_labels_train:
    l = cat_mlb.transform(i)
    l_cat_train.append(l)
for i in cat_labels_test:
    l = cat_mlb.transform(i)
    l_cat_test.append(l)
    
Y_cat_train = []
Y_cat_test = []
num_cat_labels = len(cat_mlb.classes_)
for i in l_cat_train:
    pad_i = np.concatenate((np.zeros((maxlen-i.shape[0],num_cat_labels)), i))
    Y_cat_train.append(pad_i)
for i in l_cat_test:
    pad_i = np.concatenate((np.zeros((maxlen-i.shape[0],num_cat_labels)), i))
    Y_cat_test.append(pad_i)    
    
Y_cat_train = np.array(Y_cat_train)
Y_cat_test = np.array(Y_cat_test)

In [10]:
gmlb = MultiLabelBinarizer()
gmlb.fit(gold_labels)
num_gold_labels = len(gmlb.classes_)
Y_gold_train = gmlb.transform(gold_labels_train)
Y_gold_test = gmlb.transform(gold_labels_test)

In [11]:
print(X_seq.shape, X_train_seq.shape, X_test_seq.shape, Y_train.shape, Y_test.shape, num_labels, Y_cat_train.shape, Y_cat_test.shape, num_cat_labels, Y_gold_train.shape, Y_gold_test.shape, num_gold_labels)

(1304, 3674) (790, 3674) (514, 3674) (790, 3674, 97) (514, 3674, 97) 97 (790, 3674, 9) (514, 3674, 9) 9 (790, 96) (514, 96) 96


In [12]:
# model function with pretrained embedding matrix
def get_model_1(nb_words, num_labels, model_type='CuDNNLSTM'):
    inp = Input(shape=(maxlen, ))
    x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.5)(x)
    if model_type=='CuDNNGRU':
        x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    elif model_type=='GRU':
        x = Bidirectional(GRU(128, return_sequences=True))(x)
    elif model_type=='CuDNNLSTM':
        x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    elif model_type=='LSTM':
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
    else:
        raise ValueError('Please specify model_type as one of the following:n\CuDNNGRU, CuDNNLSTM, GRU, LSTM')
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    conc = Dense((maxlen * num_labels), activation="sigmoid")(conc)
    outp = Reshape((maxlen, num_labels))(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [13]:
# prepare model metrics
class CustomEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            y_pred_roc = y_pred.flatten()
            y_pred_ham = (y_pred > 0.5).reshape((-1, y_pred.shape[2]))
            y_val_roc = self.y_val.flatten()
            y_val_ham = self.y_val.reshape((-1, self.y_val.shape[2]))
            #print(y_val.sum(), y_pred.sum())
            roc = roc_auc_score(y_val_roc, y_pred_roc)
            ham = hamming_loss(y_val_ham, y_pred_ham)
            sub = accuracy_score(y_val_ham, y_pred_ham)
            print("Adiitional val metrics: - ROC-AUC: %.6f - Hamming-Loss: %.6f - Subset-Accuracy: %.6f" % (roc, ham, sub))

In [14]:
batch_size=2
# train the model
model = get_model_1(nb_words=nb_words,num_labels=num_labels)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 3674)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 3674, 30)     1349940     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 3674, 30)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidi

In [15]:
custevl = CustomEvaluation(validation_data=(X_test_seq, Y_test), interval=1)
earlystop = EarlyStopping(monitor='val_loss', min_delta=3e-4, patience=3, verbose=0, mode='auto')
hist = model.fit(X_train_seq,Y_train, 
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_data=(X_test_seq, Y_test),
                 callbacks=[custevl, earlystop],
                 verbose=1)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 790 samples, validate on 514 samples
Epoch 1/30
Adiitional val metrics: - ROC-AUC: 0.987455 - Hamming-Loss: 0.000787 - Subset-Accuracy: 0.935685
Epoch 2/30
Adiitional val metrics: - ROC-AUC: 0.988610 - Hamming-Loss: 0.000651 - Subset-Accuracy: 0.950401
Epoch 3/30
Adiitional val metrics: - ROC-AUC: 0.988504 - Hamming-Loss: 0.000532 - Subset-Accuracy: 0.961680
Epoch 4/30
Adiitional val metrics: - ROC-AUC: 0.988641 - Hamming-Loss: 0.000501 - Subset-Accuracy: 0.965343
Epoch 5/30
Adiitional val metrics: - ROC-AUC: 0.988586 - Hamming-Loss: 0.000453 - Subset-Accuracy: 0.969710
Epoch 6/30
Adiitional val metrics: - ROC-AUC: 0.988758 - Hamming-Loss: 0.000473 - Subset-Accuracy: 0.967786
Epoch 7/30
Adiitional val metrics: - ROC-AUC: 0.988825 - Hamming-Loss: 0.000486 - Subset-Accuracy: 0.966502


In [16]:
Y_pred = model.predict(X_test_seq)

multilabel_confusion_matrix(Y_test.reshape((Y_test.shape[0]*Y_test.shape[1],-1)), np.where(Y_pred.reshape((Y_pred.shape[0]*Y_pred.shape[1],-1)) > 0.5, 1, 0))

array([[[1888432,       0],
        [      4,       0]],

       [[1887909,       0],
        [    527,       0]],

       [[1888431,       0],
        [      5,       0]],

       [[1887168,       0],
        [   1268,       0]],

       [[1887904,       0],
        [    532,       0]],

       [[1888219,       0],
        [    217,       0]],

       [[1886768,       0],
        [   1668,       0]],

       [[1888362,       0],
        [     74,       0]],

       [[1887876,       0],
        [    560,       0]],

       [[1888296,       0],
        [    140,       0]],

       [[1888254,       0],
        [    182,       0]],

       [[1887288,       0],
        [   1148,       0]],

       [[1888040,       0],
        [    396,       0]],

       [[1888271,       0],
        [    165,       0]],

       [[1887288,       0],
        [   1148,       0]],

       [[1888396,       0],
        [     40,       0]],

       [[1888372,       0],
        [     64,       0]],

       [[18872

In [17]:
batch_size=2
# train the model
cat_model = get_model_1(nb_words=nb_words,num_labels=num_cat_labels)
cat_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 3674)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 3674, 30)     1349940     input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 3674, 30)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 3674, 256)    163840      spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
global_ave

In [18]:
custevl = CustomEvaluation(validation_data=(X_test_seq, Y_cat_test), interval=1)
earlystop = EarlyStopping(monitor='val_loss', min_delta=3e-4, patience=3, verbose=0, mode='auto')
hist = cat_model.fit(X_train_seq,Y_cat_train, 
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_data=(X_test_seq, Y_cat_test),
                 callbacks=[custevl, earlystop],
                 verbose=1)

Train on 790 samples, validate on 514 samples
Epoch 1/30
Adiitional val metrics: - ROC-AUC: 0.988439 - Hamming-Loss: 0.009499 - Subset-Accuracy: 0.920057
Epoch 2/30
Adiitional val metrics: - ROC-AUC: 0.988487 - Hamming-Loss: 0.011072 - Subset-Accuracy: 0.906654
Epoch 3/30
Adiitional val metrics: - ROC-AUC: 0.989647 - Hamming-Loss: 0.009544 - Subset-Accuracy: 0.919907
Epoch 4/30
Adiitional val metrics: - ROC-AUC: 0.989043 - Hamming-Loss: 0.011908 - Subset-Accuracy: 0.899534
Epoch 5/30
Adiitional val metrics: - ROC-AUC: 0.989058 - Hamming-Loss: 0.013589 - Subset-Accuracy: 0.884658
Epoch 6/30
Adiitional val metrics: - ROC-AUC: 0.990281 - Hamming-Loss: 0.009002 - Subset-Accuracy: 0.924854
Epoch 7/30
Adiitional val metrics: - ROC-AUC: 0.992128 - Hamming-Loss: 0.006139 - Subset-Accuracy: 0.951179
Epoch 8/30
Adiitional val metrics: - ROC-AUC: 0.992610 - Hamming-Loss: 0.006449 - Subset-Accuracy: 0.948426
Epoch 9/30
Adiitional val metrics: - ROC-AUC: 0.993459 - Hamming-Loss: 0.005202 - Subset-A

In [28]:
Y_cat_pred = cat_model.predict(X_test_seq)

multilabel_confusion_matrix(Y_cat_test.reshape((Y_cat_test.shape[0]*Y_cat_test.shape[1],-1)), np.where(Y_cat_pred.reshape((Y_cat_pred.shape[0]*Y_cat_pred.shape[1],-1)) > 0.5, 1, 0))

array([[[1884398,       0],
        [   4038,       0]],

       [[1886629,       0],
        [   1807,       0]],

       [[1888222,       0],
        [    214,       0]],

       [[1887919,       0],
        [    517,       0]],

       [[1887020,       0],
        [   1416,       0]],

       [[1883990,       0],
        [   4446,       0]],

       [[1543537,   43678],
        [  26137,  275084]],

       [[1888261,       0],
        [    175,       0]],

       [[1886804,       0],
        [   1632,       0]]])