In [None]:
from keras.models import Model, load_model
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Reshape, Bidirectional, concatenate, Flatten, Concatenate, Add, Average, Lambda
from keras_contrib.layers import CRF
from future.utils import iteritems
import keras as k
import os, sys, pickle, glob
import numpy as np

Using TensorFlow backend.


In [None]:
class LSTM_CRF():
    """Two-input BiLSTM sequence tagger topped with a CRF layer.

    The constructor only records size hyper-parameters. The Keras graph is
    built and compiled by `define_model`, which stores it on `self.model`.
    """

    def __init__(self, max_words, vocab_size, tag_size, output_dim):
        """Store the model dimensions; no Keras objects are created here.

        max_words  -- length of every (padded) input sequence
        vocab_size -- `input_dim` of the word embedding
        tag_size   -- `input_dim` of the POS-tag embedding
        output_dim -- number of units of the Dense and CRF output layers
        """
        self.max_words, self.vocab_size, self.tag_size = max_words, vocab_size, tag_size
        self.model = None  # populated by define_model()
        self.output_dim = output_dim

    def define_model(self, word_embedding_dim=50, lstm_cell=20):
        """Build and compile the network, then keep it in `self.model`.

        word_embedding_dim -- width shared by the word and POS-tag embeddings
                              (they must match because the two are averaged)
        lstm_cell          -- number of units in each direction of the BiLSTM
        """
        sequence_shape = (self.max_words,)

        # Word branch: integer ids -> dense vectors.
        word_input = Input(shape=sequence_shape, name="word_input")
        word_vectors = Embedding(
            input_dim=self.vocab_size, output_dim=word_embedding_dim
        )(word_input)

        # POS-tag branch: same width, scaled by a constant factor of 5 before merging.
        pos_tag_input = Input(shape=sequence_shape, name="pos_tag_input")
        pos_vectors = Embedding(
            input_dim=self.tag_size, output_dim=word_embedding_dim
        )(pos_tag_input)
        pos_vectors = Lambda(lambda x: x * 5)(pos_vectors)

        # The two branches are merged by element-wise averaging, not concatenation.
        merged = Average()([word_vectors, pos_vectors])

        recurrent = Bidirectional(LSTM(lstm_cell, return_sequences=True), merge_mode='concat')
        hidden = recurrent(merged)

        # Per-timestep softmax scores are what the CRF layer receives.
        per_token = TimeDistributed(Dense(self.output_dim, activation='softmax'))
        scores = per_token(hidden)

        crf = CRF(self.output_dim, name="output")
        output = crf(scores)

        network = Model(inputs=[word_input, pos_tag_input], outputs=output)
        optimizer = k.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999)
        network.compile(
            optimizer=optimizer,
            loss=crf.loss_function,
            metrics=[crf.accuracy, 'accuracy'],
        )

        self.model = network

In [None]:
# Training split: BIO annotations, tokenised sentences and POS tags.
# NOTE: allow_pickle=True unpickles object arrays, which can execute arbitrary
# code -- only load .npy files that come from a trusted source.
annotation = np.load('data/train_augmented_bio_annotations.npy', allow_pickle=True)
text = np.load('data/train_augmented_bio_sentences.npy', allow_pickle=True)
pos_tags = np.load('data/train_pos_tag.npy', allow_pickle=True)

In [None]:
# Give every distinct training token the next free integer id,
# in order of first appearance.
word_to_ix = {}
for sentence in text:
    for word in sentence:
        # setdefault leaves an already-known token untouched.
        word_to_ix.setdefault(word, len(word_to_ix))

In [None]:
# Same id-assignment scheme as for words, applied to the POS tags.
pos_tag_to_ix = {}
for pos in pos_tags:
    for tag in pos:
        # setdefault leaves an already-known tag untouched.
        pos_tag_to_ix.setdefault(tag, len(pos_tag_to_ix))

In [None]:
# Fixed BIO label inventory: B_/I_ variants of the three entity types plus "O".
tag_to_ix = {
    "B_Task": 0,
    "I_Task": 1,
    "B_Process": 2,
    "I_Process": 3,
    "B_Material": 4,
    "I_Material": 5,
    "O": 6,
}
# Reverse lookup (class index -> label string) used when decoding predictions.
idx_to_tag = {index: label for label, index in iteritems(tag_to_ix)}
idx_to_tag

{0: 'B_Task',
 1: 'I_Task',
 2: 'B_Process',
 3: 'I_Process',
 4: 'B_Material',
 5: 'I_Material',
 6: 'O'}

In [None]:
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Every sequence is right-padded to the length of the longest training sentence.
maxlen = max(len(s) for s in text)
n_words = len(word_to_ix)
n_tags = len(tag_to_ix)

# Word ids; padding uses n_words, one past the largest id in word_to_ix.
X_word = pad_sequences(
    sequences=[[word_to_ix[w] for w in s] for s in text],
    maxlen=maxlen,
    padding="post",
    value=n_words,
)

# POS-tag ids; padding uses len(pos_tag_to_ix), one past the largest tag id.
X_tag = pad_sequences(
    sequences=[[pos_tag_to_ix[w] for w in s] for s in pos_tags],
    maxlen=maxlen,
    padding="post",
    value=len(pos_tag_to_ix),
)

# Gold labels: padded positions are labelled "O", then each row is one-hot encoded.
y = pad_sequences(
    sequences=[[tag_to_ix[w] for w in s] for s in annotation],
    maxlen=maxlen,
    padding="post",
    value=tag_to_ix["O"],
)
y = [to_categorical(i, num_classes=n_tags) for i in y]


In [None]:
# Both embedding tables get one extra row for the padding index used above.
lstm = LSTM_CRF(
    maxlen,                  # max_words
    n_words + 1,             # vocab_size
    len(pos_tag_to_ix) + 1,  # tag_size
    n_tags,                  # output_dim
)
lstm.define_model()
lstm.model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pos_tag_input (InputLayer)      (None, 297)          0                                            
__________________________________________________________________________________________________
word_input (InputLayer)         (None, 297)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 297, 50)      2150        pos_tag_input[0][0]              
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 297, 50)      412050      word_input[0][0]                

In [None]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

# Fit on the full training set; no validation data is passed to fit().
history = lstm.model.fit(
    x=[X_word, X_tag],
    y=np.array(y),
    batch_size=16,
    epochs=7,
    verbose=1,
)

# Plot styling for the figures drawn in the following cells.
plt.style.use('ggplot')


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [None]:
def plot_history(history):
    """Plot per-epoch accuracy and loss curves side by side.

    Parameters
    ----------
    history : object exposing a ``history`` dict (e.g. the value returned by
        ``Model.fit``). The dict must contain the ``'accuracy'`` and ``'loss'``
        series. If ``'val_accuracy'`` / ``'val_loss'`` are present they are
        drawn as well; otherwise only the training curves are shown.

    Raises
    ------
    KeyError
        If ``'accuracy'`` or ``'loss'`` is missing from ``history.history``.
    """
    series = history.history

    def draw_panel(position, train_values, val_values, short_name, long_name):
        # One subplot: training curve in blue, optional validation curve in red.
        plt.subplot(1, 2, position)
        plt.plot(range(1, len(train_values) + 1), train_values, 'b',
                 label='Training ' + short_name)
        if val_values is not None:
            plt.plot(range(1, len(val_values) + 1), val_values, 'r',
                     label='Validation ' + short_name)
            plt.title('Training and validation ' + long_name)
        else:
            # Do not advertise a validation curve that is not drawn.
            plt.title('Training ' + long_name)
        plt.legend()

    plt.figure(figsize=(12, 5))
    draw_panel(1, series['accuracy'], series.get('val_accuracy'), 'acc', 'accuracy')
    draw_panel(2, series['loss'], series.get('val_loss'), 'loss', 'loss')

# Draw the curves recorded during the fit() call above.
plot_history(history=history)

## Testing ##

In [None]:
# Held-out split (stored under dev_* names): annotations, sentences, POS tags.
# NOTE: allow_pickle=True unpickles object arrays, which can execute arbitrary
# code -- only load .npy files that come from a trusted source.
dev_annotation = np.load('data/test_bio_annotations.npy', allow_pickle=True)
dev_text = np.load('data/test_bio_sentences.npy', allow_pickle=True)
dev_pos_tag = np.load('data/test_pos_tag.npy', allow_pickle=True)

In [None]:
# Tokens unseen in training fall back to n_words, the same index used for padding.
X_dev = pad_sequences(
    sequences=[[word_to_ix.get(w, n_words) for w in s] for s in dev_text],
    maxlen=maxlen,
    padding="post",
    value=n_words,
)

# Unknown POS tags likewise fall back to the padding index len(pos_tag_to_ix).
X_dev_tag = pad_sequences(
    sequences=[[pos_tag_to_ix.get(w, len(pos_tag_to_ix)) for w in s] for s in dev_pos_tag],
    maxlen=maxlen,
    padding="post",
    value=len(pos_tag_to_ix),
)

# Gold labels: padded positions are labelled "O", then each row is one-hot encoded.
y_dev = pad_sequences(
    sequences=[[tag_to_ix[w] for w in s] for s in dev_annotation],
    maxlen=maxlen,
    padding="post",
    value=tag_to_ix["O"],
)
y_dev = [to_categorical(i, num_classes=n_tags) for i in y_dev]


In [None]:
# Run the trained model on the held-out inputs and show the output shape.
y_pred = lstm.model.predict(x=[X_dev, X_dev_tag])
y_pred.shape

(100, 297, 7)

In [None]:
def pred2label(pred, text=None):
    """Flatten per-token class scores into a list of entity-type labels.

    For every position the highest-scoring class index is looked up in the
    module-level ``idx_to_tag`` mapping. Labels containing 'Task', 'Material'
    or 'Process' are collapsed to that entity type (dropping the B_/I_
    prefix); any other label (e.g. 'O') is kept unchanged.

    Parameters
    ----------
    pred : sequence of sequences of score vectors, one row per sentence
        (model output or one-hot gold labels).
    text : optional sequence of tokenised sentences aligned with ``pred``.
        When given, only the first ``len(text[i])`` positions of row ``i`` are
        decoded, so post-padding positions do not end up in the result (and
        therefore do not inflate the evaluation metrics). When omitted, every
        position of every row is decoded.

    Returns
    -------
    list of str
        Labels of all kept positions, sentence after sentence.

    Raises
    ------
    ValueError
        If ``text`` is given but has a different number of rows than ``pred``.
    """
    if text is not None and len(text) != len(pred):
        raise ValueError(
            "pred has %d rows but text has %d sentences" % (len(pred), len(text))
        )

    # Checked in this order; the first entity type found in the label wins.
    entity_types = ('Task', 'Material', 'Process')

    out = []
    for i, pred_i in enumerate(pred):
        keep = len(pred_i)
        if text is not None:
            # A sentence longer than the padded row has no padding to drop.
            keep = min(keep, len(text[i]))
        for p in pred_i[:keep]:
            label = idx_to_tag[np.argmax(p)]
            for entity in entity_types:
                if entity in label:
                    label = entity
                    break
            out.append(label)
    return out

In [None]:
# Decode model output and gold one-hot labels with the same routine so the
# two flat label lists line up position by position.
pred_labels, test_labels = (
    pred2label(scores, dev_text) for scores in (y_pred, y_dev)
)

In [None]:
from sklearn.metrics import classification_report, f1_score
# Per-class precision / recall / F1 table, followed by the macro-averaged F1.
report = classification_report(y_true=test_labels, y_pred=pred_labels)
print(report)
print(
    f1_score(
        y_true=test_labels,
        y_pred=pred_labels,
        average="macro",
    )
)