In [1]:
import numpy as np
from gensim.models import Word2Vec
from keras.callbacks import Callback, EarlyStopping
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU, CuDNNLSTM, GRU, LSTM, Reshape, TimeDistributed
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, hamming_loss, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tnrange, tqdm_notebook
from utils import process_data, multilabel_confusion_matrix

Using TensorFlow backend.


In [2]:
# Defining some constants: 
window_size   = 5   # Window size for word2vec
embed_size    = 10   # Length of the vector that we willl get from the embedding layer
latent_dim    = 1024  # Hidden layers dimension 
dropout_rate  = 0.5   # Rate of the dropout layers
#batch_size    =     # Batch size
epochs        = 30    # Number of epochs
max_features  = 60000

In [3]:
notes_train_1, labels_train_1, gold_labels_train_1 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set1', up=3) 
notes_train_2, labels_train_2, gold_labels_train_2 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set2', up=3) 
notes_train = notes_train_1 + notes_train_2
labels_train = labels_train_1 + labels_train_2
gold_labels_train = gold_labels_train_1 + gold_labels_train_2
notes_test, labels_test, gold_labels_test = process_data('/host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete') 
notes = notes_train + notes_test
labels = labels_train + labels_test
gold_labels = gold_labels_train + gold_labels_test
#notes_train = np.array(notes_train)
#notes_test = np.array(notes_test)

Loading files with 3 times upsampling for tagged lines in /host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set1


HBox(children=(IntProgress(value=0, max=521), HTML(value='')))


Loading files with 3 times upsampling for tagged lines in /host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set2


HBox(children=(IntProgress(value=0, max=269), HTML(value='')))


Loading files in /host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete


HBox(children=(IntProgress(value=0, max=514), HTML(value='')))




In [4]:
print(len(labels_train_1), 
      len(gold_labels_train_1), 
      len(notes_train_1),
      
      len(labels_train_2),
      len(gold_labels_train_2), 
      len(notes_train_2),
      
      len(labels_train), 
      len(gold_labels_train),
      len(notes_train),
      
      len(labels_test), 
      len(gold_labels_test), 
      len(notes_test),
      
      len(labels),
      len(gold_labels),
      len(notes))

521 521 521 269 269 269 790 790 790 514 514 514 1304 1304 1304


In [5]:
# prepare features
X_txt = [' '.join(i) for i in notes]
X_train_txt = [' '.join(i) for i in notes_train]
X_test_txt = [' '.join(i) for i in notes_test]

tokenizer = Tokenizer(num_words=max_features, filters='')
tokenizer.fit_on_texts(X_txt)

X_seq = tokenizer.texts_to_sequences(X_txt) 
X_train_seq = tokenizer.texts_to_sequences(X_train_txt) 
X_test_seq = tokenizer.texts_to_sequences(X_test_txt) 

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

In [6]:
# helper functions
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

def get_embedding_matrix(embedding_index, word_index, max_features, embed_size):
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: 
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i-1] = embedding_vector
    return embedding_matrix

# prepare embedding matrix
w2v = Word2Vec(notes, size=embed_size, window=window_size, min_count=1, workers=4)
embedding_index = dict(zip(w2v.wv.index2word, w2v.wv.vectors))
embedding_matrix = get_embedding_matrix(embedding_index=embedding_index, word_index=word_index, max_features=max_features, embed_size=embed_size)

In [7]:
# prepare targets
all_labels = [label for notes_label in labels for label in notes_label]

mlb = MultiLabelBinarizer()
mlb.fit(all_labels)
num_labels = len(mlb.classes_)
Y_train = []
Y_test = []
for i in labels_train:
    l = mlb.transform(i)
    Y_train.append(l)
for i in labels_test:
    l = mlb.transform(i)
    Y_test.append(l)

In [8]:
# function help convert labels to category labels
def get_cat_labels(label):
    c = '.'
    positions = [pos for pos, char in enumerate(label) if char == c]
    if label != 'O':
        sl = slice(positions[0]+1,positions[1])
        cat_label = label[sl]
    else:
        cat_label = label
    return cat_label    

In [9]:
# prepare cagtegory label targets
cat_labels = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels]
cat_labels_train = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels_train]
cat_labels_test = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels_test]
all_cat_labels = [label for notes_label in cat_labels for label in notes_label]

cat_mlb = MultiLabelBinarizer()
cat_mlb.fit(all_cat_labels)
num_cat_labels = len(cat_mlb.classes_)
Y_cat_train = []
Y_cat_test = []
for i in cat_labels_train:
    l = cat_mlb.transform(i)
    Y_cat_train.append(l)
for i in cat_labels_test:
    l = cat_mlb.transform(i)
    Y_cat_test.append(l)

In [10]:
# model function with pretrained embedding matrix and Timedistributed
def get_model_2(nb_words, num_labels, model_type='CuDNNLSTM'):
    inp = Input(shape=(None, ))
    x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(dropout_rate)(x)
    if model_type=='CuDNNGRU':
        x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    elif model_type=='GRU':
        x = Bidirectional(GRU(128, return_sequences=True))(x)
    elif model_type=='CuDNNLSTM':
        x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    elif model_type=='LSTM':
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
    else:
        raise ValueError('Please specify model_type as one of the following:n\CuDNNGRU, CuDNNLSTM, GRU, LSTM')
#     outp = TimeDistributed(Dense((num_labels), activation="sigmoid"))(x)
    outp = Dense((num_labels), activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# model summary
model = get_model_2(nb_words=nb_words,num_labels=num_labels)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 10)          449840    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 10)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 256)         143360    
_________________________________________________________________
dense_1 (Dense)              (None, None, 97)          24929     
Total params: 618,129
Trainable params: 618,129
Non-trainable params: 0
_______________

In [12]:
# prepare model metrics
class CustomEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = list(validation_data)

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = []
            for x in self.X_val:
                y = np.squeeze(self.model.predict_on_batch(x))
                y_pred.append(y)
            y_pred = np.concatenate(y_pred)
            y_pred_ham = y_pred > 0.5
            y_val = np.concatenate(self.y_val)
            roc = roc_auc_score(y_val, y_pred, average='micro')
            loss = log_loss(y_val, y_pred)
            ham = hamming_loss(y_val, y_pred_ham)
            sub = accuracy_score(y_val, y_pred_ham)
            f1 = f1_score(y_val, y_pred_ham, average='micro')
            print("Adiitional val metrics: - ROC-AUC: %.6f - Log-Loss: %.6f - Hamming-Loss: %.6f - Subset-Accuracy: %.6f - F1-Score: %.6f" % (roc, loss, ham, sub, f1))

In [13]:
# data generator function
def generator(X_seq, Y):
    while True:
        for x, y in zip(X_seq, Y):
            x = np.array(x).reshape((1,-1))
            y = np.array(y).reshape((1,-1, y.shape[1]))
            yield x, y

In [15]:
# model training
custevl = CustomEvaluation(validation_data=(X_test_seq, Y_test), interval=1)
earlystop = EarlyStopping(monitor='val_loss', min_delta=3e-4, patience=3, verbose=0, mode='auto')
train_gen = generator(X_train_seq,Y_train)
test_gen = generator(X_test_seq,Y_test)
hist = model.fit_generator(train_gen,
                 steps_per_epoch=len(Y_train),
                 epochs=epochs,
                 validation_data=test_gen,
                 validation_steps=len(Y_test),
                 callbacks=[custevl, earlystop],
                 verbose=1)

Epoch 1/30
  1/790 [..............................] - ETA: 25:06 - loss: 0.6931 - acc: 0.9865

InternalError: Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, seq_length, batch_size]: [1, 10, 128, 1, 788, 1] 
	 [[{{node training/Adam/gradients/bidirectional_1/CudnnRNN_grad/CudnnRNNBackprop}}]]

In [None]:
# prediction of test data
Y_pred = []
for x in X_test_seq:
    x = np.array(x).reshape((1,-1))
    y_pred = np.squeeze(model.predict_on_batch(x))
    Y_pred.append(y_pred)
Y_pred_concat = np.concatenate(Y_pred)
Y_val = np.concatenate(Y_test)

In [None]:
# confusion matrix 
cm = multilabel_confusion_matrix(Y_val, np.where(Y_pred_concat > 0.5, 1, 0))
for i, j in zip(cm, mlb.classes_):
    print(j+':\n', i,'\n')

In [18]:
# model summary for category label model
cat_model = get_model_2(nb_words=nb_words,num_labels=num_cat_labels)
cat_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 10)          449840    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, None, 10)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 256)         143360    
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 9)           2313      
Total params: 595,513
Trainable params: 595,513
Non-trainable params: 0
_________________________________________________________________


In [19]:
# model training for category label model
cat_custevl = CustomEvaluation(validation_data=(X_test_seq, Y_cat_test), interval=1)
earlystop = EarlyStopping(monitor='val_loss', min_delta=3e-4, patience=3, verbose=0, mode='auto')
cat_train_gen = generator(X_train_seq,Y_cat_train)
cat_test_gen = generator(X_test_seq,Y_cat_test)
cat_hist = cat_model.fit_generator(cat_train_gen,
                 steps_per_epoch=len(Y_cat_train),
                 epochs=epochs,
                 validation_data=cat_test_gen,
                 validation_steps=len(Y_cat_test),
                 callbacks=[cat_custevl, earlystop],
                 verbose=1)

Epoch 1/30
Adiitional val metrics: - ROC-AUC: 0.967911 - Log-Loss: 1.801534 - Hamming-Loss: 0.032248 - Subset-Accuracy: 0.834074 - F1-Score: 0.857611
Epoch 2/30
Adiitional val metrics: - ROC-AUC: 0.965380 - Log-Loss: 1.662158 - Hamming-Loss: 0.033159 - Subset-Accuracy: 0.840902 - F1-Score: 0.854452
Epoch 3/30
Adiitional val metrics: - ROC-AUC: 0.965910 - Log-Loss: 1.559360 - Hamming-Loss: 0.035290 - Subset-Accuracy: 0.837675 - F1-Score: 0.847795
Epoch 4/30
Adiitional val metrics: - ROC-AUC: 0.973688 - Log-Loss: 1.412469 - Hamming-Loss: 0.030469 - Subset-Accuracy: 0.856325 - F1-Score: 0.868445
Epoch 5/30
Adiitional val metrics: - ROC-AUC: 0.976233 - Log-Loss: 1.284806 - Hamming-Loss: 0.028703 - Subset-Accuracy: 0.868432 - F1-Score: 0.876495
Epoch 6/30
Adiitional val metrics: - ROC-AUC: 0.980319 - Log-Loss: 1.132684 - Hamming-Loss: 0.024508 - Subset-Accuracy: 0.893918 - F1-Score: 0.894738
Epoch 7/30
Adiitional val metrics: - ROC-AUC: 0.984009 - Log-Loss: 0.990020 - Hamming-Loss: 0.020303

In [20]:
# prediction of test data for category label model
Y_cat_pred = []
for x in X_test_seq:
    x = np.array(x).reshape((1,-1))
    y_pred = np.squeeze(cat_model.predict_on_batch(x))
    Y_cat_pred.append(y_pred)
Y_cat_pred_concat = np.concatenate(Y_cat_pred)
Y_cat_val = np.concatenate(Y_cat_test)

In [21]:
# confusion matrix for category label model
cat_cm = multilabel_confusion_matrix(Y_cat_val, np.where(Y_cat_pred_concat > 0.5, 1, 0))
for i, j in zip(cat_cm, cat_mlb.classes_):
    print(j+':\n', i,'\n')

CAD:
 [[310707    690]
 [  2877   1161]] 

DIABETES:
 [[313290    338]
 [   845    962]] 

FAMILY_HIST:
 [[315221      0]
 [   214      0]] 

HYPERLIPIDEMIA:
 [[314874     44]
 [   313    204]] 

HYPERTENSION:
 [[313585    434]
 [   629    787]] 

MEDICATION:
 [[310537    452]
 [  2229   2217]] 

O:
 [[  6391   7823]
 [  2746 298475]] 

OBESE:
 [[315233     27]
 [    74    101]] 

SMOKER:
 [[313364    439]
 [   860    772]] 



In [22]:
# prepare gold lebels for predicted data
gold_labels_pred = [{i for s in mlb.inverse_transform(y_pred>0.5) for i in s if i != 'O'} for y_pred in Y_pred]
gold_cat_labels_pred = [{i for s in cat_mlb.inverse_transform(y_pred>0.5) for i in s if i != 'O'} for y_pred in Y_cat_pred]

In [23]:
# prepare gold label targets
gmlb = MultiLabelBinarizer()
gmlb.fit(gold_labels)
num_gold_labels = len(gmlb.classes_)
Y_gold_test = gmlb.transform(gold_labels_test)
Y_gold_pred = gmlb.transform(gold_labels_pred)

In [24]:
# prepare cagtegory gold label targets
gold_cat_labels = [{get_cat_labels(i) for i in k} for k in gold_labels]
gold_cat_labels_test = [{get_cat_labels(i) for i in k} for k in gold_labels_test]

cat_gmlb = MultiLabelBinarizer()
cat_gmlb.fit(gold_cat_labels)
num_cat_gold_labels = len(cat_gmlb.classes_)
Y_gold_cat_test = cat_gmlb.transform(gold_cat_labels_test)
Y_gold_cat_pred = cat_gmlb.transform(gold_cat_labels_pred)

In [31]:
# confusion matrix for gold label
gcm = multilabel_confusion_matrix(Y_gold_test, Y_gold_pred)
for i, j in zip(gcm, gmlb.classes_):
    print(j+':\n', i,'\n')

I.CAD.after_DCT.event:
 [[513   0]
 [  1   0]] 

I.CAD.after_DCT.mention:
 [[335  12]
 [ 56 111]] 

I.CAD.after_DCT.symptom:
 [[511   0]
 [  3   0]] 

I.CAD.before_DCT.event:
 [[387   0]
 [127   0]] 

I.CAD.before_DCT.mention:
 [[337  11]
 [ 67  99]] 

I.CAD.before_DCT.symptom:
 [[474   0]
 [ 40   0]] 

I.CAD.before_DCT.test:
 [[461   0]
 [ 53   0]] 

I.CAD.during_DCT.event:
 [[505   0]
 [  9   0]] 

I.CAD.during_DCT.mention:
 [[325  18]
 [ 48 123]] 

I.CAD.during_DCT.symptom:
 [[488   0]
 [ 26   0]] 

I.CAD.during_DCT.test:
 [[508   0]
 [  6   0]] 

I.DIABETES.after_DCT.mention:
 [[152   9]
 [ 79 274]] 

I.DIABETES.before_DCT.A1C:
 [[443   0]
 [ 71   0]] 

I.DIABETES.before_DCT.glucose:
 [[496   0]
 [ 18   0]] 

I.DIABETES.before_DCT.mention:
 [[151  11]
 [ 78 274]] 

I.DIABETES.during_DCT.A1C:
 [[503   0]
 [ 11   0]] 

I.DIABETES.during_DCT.glucose:
 [[499   0]
 [ 15   0]] 

I.DIABETES.during_DCT.mention:
 [[151   9]
 [ 76 278]] 

I.FAMILY_HIST.present:
 [[495   0]
 [ 19   0]] 

I.HY

In [32]:
# confusion matrix for cagtegory gold label
cat_gcm = multilabel_confusion_matrix(Y_gold_cat_test, Y_gold_cat_pred)
for i, j in zip(cat_gcm, cat_gmlb.classes_):
    print(j+':\n', i,'\n')

CAD:
 [[274  16]
 [ 73 151]] 

DIABETES:
 [[134  18]
 [ 28 334]] 

FAMILY_HIST:
 [[495   0]
 [ 19   0]] 

HYPERLIPIDEMIA:
 [[261   7]
 [ 80 166]] 

HYPERTENSION:
 [[ 85  38]
 [ 34 357]] 

MEDICATION:
 [[ 62   2]
 [ 36 414]] 

OBESE:
 [[422   6]
 [ 12  74]] 

SMOKER:
 [[ 10  40]
 [ 76 388]] 



In [38]:
roc_auc_score(Y_gold_test, Y_gold_pred, average='micro')

0.7739841877228507

In [39]:
f1_score(Y_gold_test, Y_gold_pred, average='micro')

0.6985262902541088

In [43]:
f1_score(Y_gold_test[:, 1], Y_gold_pred[:, 1])

0.7655172413793104

In [48]:
f1_score(Y_gold_test, Y_gold_pred, average=None)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


array([0.        , 0.76551724, 0.        , 0.        , 0.7173913 ,
       0.        , 0.        , 0.        , 0.78846154, 0.        ,
       0.        , 0.86163522, 0.        , 0.        , 0.86028257,
       0.        , 0.        , 0.8673947 , 0.        , 0.75      ,
       0.        , 0.        , 0.76726343, 0.        , 0.        ,
       0.77664975, 0.94617564, 0.        , 0.94632768, 0.49456522,
       0.94334278, 0.80440771, 0.        , 0.        , 0.        ,
       0.91348089, 0.73137698, 0.41772152, 0.25      , 0.        ,
       0.        , 0.72463768, 0.68449198, 0.        , 0.        ,
       0.75784753, 0.33043478, 0.0952381 , 0.83435583, 0.81521739,
       0.        , 0.        , 0.        , 0.91417166, 0.76521739,
       0.32894737, 0.075     , 0.        , 0.        , 0.68316832,
       0.2       , 0.        , 0.        , 0.73515982, 0.41935484,
       0.08695652, 0.82424242, 0.79329609, 0.        , 0.        ,
       0.        , 0.92248062, 0.71011236, 0.39240506, 0.05714

In [57]:
for i, j in zip(f1_score(Y_gold_cat_test, Y_gold_cat_pred, average=None), cat_gmlb.classes_):
    print(j+': '+str(i))

CAD: 0.772378516624041
DIABETES: 0.9355742296918768
FAMILY_HIST: 0.0
HYPERLIPIDEMIA: 0.7923627684964201
HYPERTENSION: 0.9083969465648855
MEDICATION: 0.956120092378753
OBESE: 0.891566265060241
SMOKER: 0.8699551569506727


  'precision', 'predicted', average, warn_for)


In [5]:
# prepare gold label targets
gmlb = MultiLabelBinarizer()
gmlb.fit(gold_labels)
num_gold_labels = len(gmlb.classes_)
Y_gold_test = gmlb.transform(gold_labels_test)

In [12]:
f1_score(Y_gold_test, Y_gold_test, average=None)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [26]:
# model training batch by batch
epochs = 10
patience = 3
threshhold = 0.001
loss_n = []
for epoch in tnrange(epochs, desc='total progress'):
    for x, y in tqdm_notebook(zip(X_train_seq, Y_train), desc='epoch '+str(epoch+1)+' training', total=len(Y_train)):
        ly = len(y)
        x = np.array(x).reshape((1,-1))
        y = np.array(y).reshape((1,ly,-1))
        model.train_on_batch(x, y)
    Y_pred = []
    for x in tqdm_notebook(X_test_seq, desc='epoch '+str(epoch+1)+' validating'):
        x = np.array(x).reshape((1,-1))
        y_pred = np.squeeze(model.predict_on_batch(x))
        Y_pred.append(y_pred)
    Y_pred = np.concatenate(Y_pred)
    Y_pred_ham = Y_pred > 0.5
    Y_val = np.concatenate(Y_test)
    loss = log_loss(Y_val, Y_pred)
    ham = hamming_loss(Y_val, Y_pred_ham)
    sub = accuracy_score(Y_val, Y_pred_ham)
    f1 = f1_score(Y_val, Y_pred_ham, average='micro')
    print("Epoch %d val metrics: - Log-Loss: %.6f - Hamming-Loss: %.6f - Subset-Accuracy: %.6f - F1-Score: %.6f" % (epoch, loss, ham, sub, f1))
    loss_n.append(loss)
    if len(loss_n) > patience:
        for i in loss_n[-4:]:
            if loss_n[-1] >= i + threshhold:
                break

HBox(children=(IntProgress(value=0, description='total progress', max=10, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='epoch 1 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 1 validating', max=514, style=ProgressStyle(description…

Epoch 0 val metrics: - Log-Loss: 0.351766 - Hamming-Loss: 0.001083 - Subset-Accuracy: 0.957040 - F1-Score: 0.949087


HBox(children=(IntProgress(value=0, description='epoch 2 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 2 validating', max=514, style=ProgressStyle(description…

Epoch 1 val metrics: - Log-Loss: 0.347610 - Hamming-Loss: 0.001072 - Subset-Accuracy: 0.957779 - F1-Score: 0.949651


HBox(children=(IntProgress(value=0, description='epoch 3 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 3 validating', max=514, style=ProgressStyle(description…

Epoch 2 val metrics: - Log-Loss: 0.337398 - Hamming-Loss: 0.001058 - Subset-Accuracy: 0.957161 - F1-Score: 0.950335


HBox(children=(IntProgress(value=0, description='epoch 4 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 4 validating', max=514, style=ProgressStyle(description…

Epoch 3 val metrics: - Log-Loss: 0.333103 - Hamming-Loss: 0.001052 - Subset-Accuracy: 0.956803 - F1-Score: 0.950598


HBox(children=(IntProgress(value=0, description='epoch 5 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 5 validating', max=514, style=ProgressStyle(description…

Epoch 4 val metrics: - Log-Loss: 0.322978 - Hamming-Loss: 0.001027 - Subset-Accuracy: 0.956368 - F1-Score: 0.951716


HBox(children=(IntProgress(value=0, description='epoch 6 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 6 validating', max=514, style=ProgressStyle(description…

Epoch 5 val metrics: - Log-Loss: 0.321765 - Hamming-Loss: 0.001019 - Subset-Accuracy: 0.956749 - F1-Score: 0.952139


HBox(children=(IntProgress(value=0, description='epoch 7 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 7 validating', max=514, style=ProgressStyle(description…

Epoch 6 val metrics: - Log-Loss: 0.319472 - Hamming-Loss: 0.001020 - Subset-Accuracy: 0.957300 - F1-Score: 0.952150


HBox(children=(IntProgress(value=0, description='epoch 8 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 8 validating', max=514, style=ProgressStyle(description…

Epoch 7 val metrics: - Log-Loss: 0.311171 - Hamming-Loss: 0.001006 - Subset-Accuracy: 0.956390 - F1-Score: 0.952800


HBox(children=(IntProgress(value=0, description='epoch 9 training', max=790, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='epoch 9 validating', max=514, style=ProgressStyle(description…

Epoch 8 val metrics: - Log-Loss: 0.312243 - Hamming-Loss: 0.001003 - Subset-Accuracy: 0.956964 - F1-Score: 0.952942


HBox(children=(IntProgress(value=0, description='epoch 10 training', max=790, style=ProgressStyle(description_…

HBox(children=(IntProgress(value=0, description='epoch 10 validating', max=514, style=ProgressStyle(descriptio…

Epoch 9 val metrics: - Log-Loss: 0.304512 - Hamming-Loss: 0.000993 - Subset-Accuracy: 0.957218 - F1-Score: 0.953429


In [54]:
from tqdm import tnrange, tqdm_notebook
epochs = 10
for epoch in tnrange(epochs):
    for x, y in tqdm_notebook(zip(X_train_seq, Y_cat_train), desc='training'):
        ly = len(y)
        x = np.array(x).reshape((1,-1))
        y = np.array(y).reshape((1,ly,-1))
        #if x.shape[1] != y.shape[1]:
        #    print(x.shape,y.shape)
        cat_model.train_on_batch(x, y)
    Y_cat_pred = []
    for x in tqdm_notebook(X_test_seq, desc='predicting'):
        x = np.array(x).reshape((1,-1))
        #if x.shape[1] != y.shape[1]:
        #    print(x.shape,y.shape)
        y_pred = np.squeeze(cat_model.predict_on_batch(x))
        Y_cat_pred.append(y_pred)
    Y_cat_pred = np.concatenate(Y_cat_pred)
    Y_cat_pred_ham = Y_cat_pred > 0.5
    Y_cat_val = np.concatenate(Y_cat_test)
    #print(y_val.sum(), y_pred.sum())
    #roc = roc_auc_score(Y_val, Y_pred)
    loss = log_loss(Y_cat_val, Y_cat_pred)
    ham = hamming_loss(Y_cat_val, Y_cat_pred_ham)
    sub = accuracy_score(Y_cat_val, Y_cat_pred_ham)
    f1 = f1_score(Y_val, Y_pred_ham, average='micro')
    print("Epoch %d val metrics: - Log-Loss: %.6f - Hamming-Loss: %.6f - Subset-Accuracy: %.6f - F1_Score: %.6f" % (epoch, loss, ham, sub, f1))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 0 val metrics: - Log-Loss: 0.239218 - Hamming-Loss: 0.010005 - Subset-Accuracy: 0.954910


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 1 val metrics: - Log-Loss: 0.212364 - Hamming-Loss: 0.009780 - Subset-Accuracy: 0.955373


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 2 val metrics: - Log-Loss: 0.196264 - Hamming-Loss: 0.009550 - Subset-Accuracy: 0.955572


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 3 val metrics: - Log-Loss: 0.179833 - Hamming-Loss: 0.009293 - Subset-Accuracy: 0.955243


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 4 val metrics: - Log-Loss: 0.165139 - Hamming-Loss: 0.008795 - Subset-Accuracy: 0.956336


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 5 val metrics: - Log-Loss: 0.154288 - Hamming-Loss: 0.008518 - Subset-Accuracy: 0.956742


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 6 val metrics: - Log-Loss: 0.148316 - Hamming-Loss: 0.008245 - Subset-Accuracy: 0.957681


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 7 val metrics: - Log-Loss: 0.141742 - Hamming-Loss: 0.008039 - Subset-Accuracy: 0.958841


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 8 val metrics: - Log-Loss: 0.137761 - Hamming-Loss: 0.007891 - Subset-Accuracy: 0.959345


HBox(children=(IntProgress(value=1, bar_style='info', description='training', max=1, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='predicting', max=514, style=ProgressStyle(description_width='…

Epoch 9 val metrics: - Log-Loss: 0.135815 - Hamming-Loss: 0.007833 - Subset-Accuracy: 0.959215
