In [1]:
import numpy as np
import random
import time
import gzip
import sys
import pickle as pkl

#from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import *

Using TensorFlow backend.


# Create, train and evaluate Bi-LSTM on test set

In [2]:
tf.__version__

'2.1.0'

In [3]:
keras.__version__

'2.3.1'

In [4]:
np.random.seed(1337)  # for reproducibility
DATA_PATH = "bi-lstm-data/"
EMBEDDINGS_PATH = DATA_PATH+'embeddings.pkl.gz'
TRAIN_SET_PATH = DATA_PATH+'train_set.pkl.gz'
TEST_SET_PATH = DATA_PATH+'test_set.csv'
MODEL_PATH = DATA_PATH+'bi-lstm-model.h5'

In [5]:
# Load train set embeddings
f = gzip.open(EMBEDDINGS_PATH, 'rb')
embeddings = pkl.load(f)
f.close()

label2Idx = embeddings['label2Idx']
wordEmbeddings = embeddings['wordEmbeddings']
caseEmbeddings = embeddings['caseEmbeddings']

In [6]:
# Inverse label mapping
idx2Label = {v: k for k, v in label2Idx.items()}

In [7]:
print(label2Idx,'\n---\n', idx2Label)

{'B-MISC': 0, 'B-LOC': 1, 'I-PER': 2, 'I-ORG': 3, 'O': 4, 'I-MISC': 5, 'B-PER': 6, 'I-LOC': 7, 'B-ORG': 8} 
---
 {0: 'B-MISC', 1: 'B-LOC', 2: 'I-PER', 3: 'I-ORG', 4: 'O', 5: 'I-MISC', 6: 'B-PER', 7: 'I-LOC', 8: 'B-ORG'}


In [8]:
# Load all train set
f = gzip.open(TRAIN_SET_PATH, 'rb')
data = pkl.load(f)
f.close()

In [9]:
data[0] # word2Idx, case2Idx, label2Idx

[[12558, 1, 1445, 1, 1, 1, 18468, 1, 6566, 1, 1],
 [3, 4, 3, 4, 4, 0, 1, 4, 2, 4, 4],
 [1, 4, 1, 4, 4, 4, 4, 4, 8, 4, 4]]

In [10]:
wordEmbeddings.shape, caseEmbeddings.shape

((75272, 300), (8, 8))

# Bi-LSTM model

In [11]:
n_out = len(label2Idx)

words_input = Input(shape=(None,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  
                  weights=[wordEmbeddings], trainable=False)(words_input)

casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], 
                   weights=[caseEmbeddings], trainable=False)(casing_input)

output = concatenate([words, casing])
output = Bidirectional(LSTM(50, return_sequences=True, dropout=0.25, 
                            recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(n_out, activation='softmax'))(output)

#Create our model and compile it using Nadam optimizer with categorical cross-entropy for sparse y-labels
model = Model(inputs=[words_input, casing_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words_input (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    22581600    words_input[0][0]                
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 8)      64          casing_input[0][0]               
____________________________________________________________________________________________

In [12]:
def compute_f1(predictions, correct, idx2Label): 
    """Compute F1 score = (precision * recall) / (precision + recall). 
    """
    label_pred = []    
    for sentence in predictions:
        label_pred.append([idx2Label[element] for element in sentence])
        
    label_correct = []    
    for sentence in correct:
        label_correct.append([idx2Label[element] for element in sentence])
                
    prec = compute_precision(label_pred, label_correct)
    rec = compute_precision(label_correct, label_pred)
    
    f1 = 0
    if (rec+prec) > 0:
        f1 = 2.0 * prec * rec / (prec + rec);
        
    return prec, rec, f1

In [13]:
def compute_precision(guessed_sentences, correct_sentences):
    """Compute precision"""
    
    assert(len(guessed_sentences) == len(correct_sentences))
    correctCount = 0
    count = 0

    for sentenceIdx in range(len(guessed_sentences)):
        guessed = guessed_sentences[sentenceIdx]
        correct = correct_sentences[sentenceIdx]
        assert(len(guessed) == len(correct))
        idx = 0
        while idx < len(guessed):
            if guessed[idx][0] == 'B': # A new chunk starts
                count += 1
                
                if guessed[idx] == correct[idx]:
                    idx += 1
                    correctlyFound = True
                    
                    # Scan until it no longer starts with I
                    while idx < len(guessed) and guessed[idx][0] == 'I': 
                        if guessed[idx] != correct[idx]:
                            correctlyFound = False
                        
                        idx += 1
                    
                    # The chunk in correct was longer
                    if idx < len(guessed):
                        if correct[idx][0] == 'I': 
                            correctlyFound = False
                        
                    
                    if correctlyFound:
                        correctCount += 1
                else:
                    idx += 1
            else:  
                idx += 1
    
    precision = 0
    if count > 0:    
        precision = float(correctCount) / count
        
    return precision

In [14]:
def iterate_minibatches(dataset): 
    endIdx = len(dataset)   
    for idx in range(endIdx):
        tokens, casing, labels = dataset[idx]        
            
        labels = np.expand_dims([labels], -1)     
        yield labels, np.asarray([tokens]), np.asarray([casing])

In [15]:
def predict(dataset):
    """Compute predictions. 
    Return tuple: predicted labels, correct labels"""
    
    correctLabels = []
    predLabels = []
    for tokens, casing, labels in dataset:    
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        #print('Tokens: ', tokens)
        #print('Casing: ', casing)
        #print('---')
        pred = model.predict([tokens, casing], verbose=False)[0]  
        #print("Pred1: ", pred)
        pred = pred.argmax(axis=-1) #Predict the classes            
        #print("Pred2: ", pred)
        correctLabels.append(labels)
        predLabels.append(pred)
        
        
    return predLabels, correctLabels

In [16]:
def train_test(train_data, test_data, number_of_epochs):
    """Train model and compute precision, recall and F1 score on test data
    """
    for epoch in range(number_of_epochs):    
        print("--------- Epoch %d -----------" % epoch)
        random.shuffle(train_data)
        start_time = time.time()    
    
        # Train one sentence at a time (i.e. online training) to avoid padding of sentences
        cnt = 0
        for batch in iterate_minibatches(train_data):
            labels, tokens, casing = batch     
            # Single gradient update over one batch of samples.   
            model.train_on_batch([tokens, casing], labels) 
            cnt += 1
        
            if cnt % 100 == 0:
                print('Sentence: %d / %d' % (cnt, len(train_data)), end='\r')
        print("%.2f sec for training                 " % (time.time() - start_time))
    
    
    # Performance on test dataset       
    predLabels, correctLabels = predict(test_data)        
    pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels, idx2Label)
    print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_test, rec_test, f1_test))
    
    print("%.2f sec for evaluation" % (time.time() - start_time))
    print("")


# Train model and compute precision, recall and F1 score on test data

In [17]:
data_len = len(data)
train_pcnt = round(data_len / 100 * 70)
test_pcnt = round(data_len / 100 * 30)
train_pcnt, test_pcnt, train_pcnt + test_pcnt, data_len

(8228, 3526, 11754, 11755)

In [18]:
%%time
# k-fold cross  validation
folds = 1
number_of_epochs = 10
for i in range(folds):
    print("*** Fold: ", i)
    random.shuffle(data)
    train_data = data[:train_pcnt]
    test_data = data[train_pcnt:]
    train_test(train_data, test_data, number_of_epochs)

*** Fold:  0
--------- Epoch 0 -----------
166.30 sec for training                 
--------- Epoch 1 -----------
161.83 sec for training                 
--------- Epoch 2 -----------
165.99 sec for training                 
--------- Epoch 3 -----------
162.41 sec for training                 
--------- Epoch 4 -----------
161.68 sec for training                 
--------- Epoch 5 -----------
161.58 sec for training                 
--------- Epoch 6 -----------
161.65 sec for training                 
--------- Epoch 7 -----------
323.54 sec for training                 
--------- Epoch 8 -----------
161.75 sec for training                 
--------- Epoch 9 -----------
161.68 sec for training                 
Test-Data: Prec: 0.837, Rec: 0.827, F1: 0.832
177.82 sec for evaluation

CPU times: user 1h 5min 24s, sys: 12min 47s, total: 1h 18min 11s
Wall time: 30min 4s


In [19]:
# Save model in a HDF5 file 
model.save(MODEL_PATH)  