In [1]:
import json
import itertools
import numpy as np
from feature_utils import TensorMaker
import time

Using TensorFlow backend.


In [2]:
def convert_keys(data):
    """``object_hook`` for ``json.load`` that restores integer keys.

    JSON object keys are always strings. When ``data`` is a dict that has a
    ``'data'`` entry, the keys of that entry are converted with ``int`` and the
    entry is replaced in place. Anything else is handed back untouched.

    Args:
        data: the value produced by the JSON decoder for one object.

    Returns:
        The same ``data`` object (mutated when it carried a ``'data'`` entry).
    """
    # Guard clause: nothing to do for non-dicts or dicts without 'data'.
    if not isinstance(data, dict) or 'data' not in data:
        return data

    data['data'] = {int(key): value for key, value in data['data'].items()}
    return data

# load CoNLL2003
# Each split is parsed inside a `with` block so its file handle is closed as
# soon as json.load returns (or raises), instead of being left open.
# convert_keys is passed as the object_hook so the keys of each 'data'
# mapping come back as ints rather than JSON strings.
with open('data/conll2003/en/train.json') as split_file:
    train = json.load(split_file, object_hook = convert_keys)

with open('data/conll2003/en/valid.json') as split_file:
    valid = json.load(split_file, object_hook = convert_keys)

with open('data/conll2003/en/test.json') as split_file:
    test = json.load(split_file, object_hook = convert_keys)

In [3]:
print("Creating vocabularies...")

# WORDS and TAGS stay lists in first-seen order, because that is what is
# handed to TensorMaker below. Membership for words is tracked in a set:
# testing `in WORDS` on the list re-scans the whole vocabulary for every token.
WORDS, TAGS = list(), list()
_seen_words = set()
for sample in itertools.chain(train['data'].values(), valid['data'].values(), test['data'].values()):
    for w in sample['sentence']:
        w_lower = w.lower()   # the vocabulary is case-insensitive
        if w_lower not in _seen_words:
            _seen_words.add(w_lower)
            WORDS.append(w_lower)

    # The tag list is scanned directly, exactly as before.
    for t in sample['tags']:
        if t not in TAGS: TAGS.append(t)


##### initialize TensorMaker ######

print("Initializing TensorMaker...")

MAX_LEN_SENT = 125   # maximum sentence length
MAX_LEN_WORD = 25    # maximum word length

TM = TensorMaker(WORDS, TAGS, max_len_word=MAX_LEN_WORD, word_padding='post', word_truncating='post')

Creating vocabularies...
Initializing TensorMaker...


In [4]:
print("Loading word embeddings...")

d = 50   # embedding size; also selects which glove.6B.<d>d.txt file is read
emb_dir = 'embeddings/glove.6B/glove.6B.{}d.txt'.format(d)

# Parse the text file into {word: float32 vector}. Each line is split on
# whitespace; the first field is the word, the rest are the coefficients.
# The `with` block closes the file even if a line fails to parse.
embeddings = dict()
with open(emb_dir, encoding='UTF-8') as e:
    for line in e:
        values = line.split()
        if not values:
            # blank line (e.g. trailing newline at end of file): nothing to parse
            continue
        word = values[0]
        coef = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coef

# One row per entry of TM.word2idx, one column per embedding dimension.
embedding_dim = (len(TM.word2idx), d)
E = np.zeros(embedding_dim)

# Rows for words without a pre-trained vector stay all-zero.
# NOTE(review): the row is chosen by enumeration position, not by the value
# stored in TM.word2idx[w]. That is only correct if iterating TM.word2idx
# yields words in index order starting at 0 -- confirm against TensorMaker.
for i, w in enumerate(TM.word2idx):
    emb_vec = embeddings.get(w)
    if emb_vec is not None:
        E[i,:] = emb_vec



Loading word embeddings...


In [5]:
# Position in model.layers of the layer that receives the matrix E below.
# NOTE(review): the value 8 depends on the layer order inside
# blstm_cnn_wd_ft_ner -- re-check it whenever the model definition changes.
WORD_EMBEDDING_LAYER = 8

print("Defining model...")

from models.blstm_cnn_crf_word_features_model import blstm_cnn_wd_ft_ner
from models.blstm_crf_model import blstm_ner
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy

# Alternative model, kept for reference:
# model = blstm_ner(max_len_sent=MAX_LEN_SENT,
#                   embedding_dims=embedding_dim,
#                   num_tags=len(TM.tag2idx))

model = blstm_cnn_wd_ft_ner(
    max_len_sent=MAX_LEN_SENT,
    max_len_word=MAX_LEN_WORD,
    num_tags=len(TM.tag2idx),
    word_embedding_dims=embedding_dim,
    char_embedding_dims=(len(TM.char2idx), 25),
    word_feature_embedding_dims=(6,10),
    main_dropout=0.25,
    char_dropout=0.25,
    recurrent_dropout=0.25,
)

# Install the pre-built matrix E as the layer's weights and freeze the layer
# so training does not update it.
word_embedding_layer = model.layers[WORD_EMBEDDING_LAYER]
word_embedding_layer.set_weights([E])
word_embedding_layer.trainable = False

Defining model...


In [None]:
BATCH_SIZE = 32


import os

from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Callback, ReduceLROnPlateau
from keras.optimizers import RMSprop
from generators import DataGenerator


from validation import sentence_metrics
from generators import TestDataGenerator
#self, data, batch_size, tensor_maker, sentences, characters, word_features, tags):
TG = TestDataGenerator(data=test['data'], batch_size=BATCH_SIZE, tensor_maker=TM, sentences=True, characters=True, word_features=True, tags=True)

model.compile(optimizer=RMSprop(lr=0.005), loss=crf_loss, metrics=[crf_accuracy])

# Both plateau-based callbacks watch validation accuracy, where higher is
# better; the direction is stated explicitly on both so they cannot disagree.
early_stopping = EarlyStopping(monitor='val_crf_accuracy', min_delta=0.001, patience=9, mode='max')
reduce_on_plateau = ReduceLROnPlateau(monitor='val_crf_accuracy', 
                                      factor=0.2, 
                                      patience=5,
                                      mode='max',
                                      min_lr=0.0001)

# A single timestamp identifies this run: the checkpoint file and the
# TensorBoard log directory share it, so the two can be matched afterwards.
MODEL_NAME = time.time()

# Create the checkpoint directory up front; otherwise the first save at the
# end of an epoch can fail because the target folder does not exist.
os.makedirs('./trained_models', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='./trained_models/model_{}'.format(MODEL_NAME), verbose=True, save_best_only=True)

# NOTE(review): with histogram_freq=0 the write_grads flag is expected to have
# no effect (Keras documents it as requiring histogram_freq > 0) -- confirm.
tb = TensorBoard(log_dir='./tf_logs/blstm_cnn_crf_{}'.format(MODEL_NAME),
                 batch_size=BATCH_SIZE,
                 write_grads=True,
                 write_graph=False,
                 histogram_freq=0)

# Training and validation batches, both built by the same generator class.
DG = DataGenerator(data=train['data'], batch_size=BATCH_SIZE, tensor_maker=TM, shuffle=True, sentences=True, characters=True, word_features=True, tags=True)
VG = DataGenerator(data=valid['data'], batch_size=BATCH_SIZE, tensor_maker=TM, shuffle=True, sentences=True, characters=True, word_features=True, tags=True)


In [None]:
# Train for at most 75 epochs: one pass over DG per epoch, followed by one
# pass over VG for validation. The callbacks are the ones configured above.
training_callbacks = [early_stopping, tb, reduce_on_plateau, checkpointer]

model.fit_generator(
    generator=DG,
    steps_per_epoch=len(DG),
    validation_data=VG,
    validation_steps=len(VG),
    epochs=75,
    shuffle=True,
    callbacks=training_callbacks,
)

In [11]:
print("Validating on test data...")

#from validation import sentence_metrics
from generators import TestDataGenerator
from seqeval.metrics import classification_report

TG = TestDataGenerator(test['data'], BATCH_SIZE, TM,
                       sentences=True, characters=True, word_features=True, tags=True)

# Collect one tag sequence per sentence for the gold labels and the predictions.
actual, pred = list(), list()
for X_data, Y_onehot in TG:
    Y_scores = model.predict_on_batch(X_data)

    # Reduce the last axis to a single id per position.
    # Assumes both arrays are (batch, position, tag) -- TODO confirm.
    Y_pred = np.argmax(Y_scores, axis = 2)
    Y_test = np.argmax(Y_onehot, axis = 2)

    # NOTE(review): every position of each row is converted; whether padded
    # positions are dropped depends on TM.convert2tags -- confirm.
    for i, pred_ids in enumerate(Y_pred):
        pred.append(TM.convert2tags(pred_ids))
        actual.append(TM.convert2tags(Y_test[i]))

print(classification_report(actual, pred))

Validating on test data...
             precision    recall  f1-score   support

        LOC       0.79      0.82      0.81      1667
        ORG       0.73      0.73      0.73      1661
       MISC       0.66      0.66      0.66       702
        PER       0.81      0.82      0.81      1616

avg / total       0.76      0.77      0.77      5646

