In [1]:
import time
import re

import tensorflow as tf
import numpy as np

from tageval import evaluate_tagging_file

In [66]:
class MyConfig(object):
    '''Hyper-parameters for the BiLSTM tagger and for its training loop.'''
    # -- optimisation --
    learning_rate = 0.0001
    # -- input features --
    embed_dim = 300  # width of one word-embedding row
    tags_dim = 54  # width of the POS-tag one-hot vector
    cap_dim = 2  # width of the capitalisation one-hot vector (0 = lower, 1 = upper)
    # Per-token feature width: embedding + POS one-hot + capitalisation one-hot.
    # The capitalisation flag is one-hot encoded with depth 2, so it adds 2 (not 1).
    fea_dim = embed_dim + tags_dim + cap_dim
    # -- network --
    hidden_dim = 256
    step_n = 52 # max sentence length
    layers_n = 2
    class_n = 2
    random_scale = 0.1
    keep_prob = 1.0 # used for dropout
    forget_bias = 1.0
    grad_clip = 10
    # below are related to training
    epoch_n = 5
    batch_size = 10
    display_iter = 1000

config = MyConfig()

In [24]:
# Files that receive the predicted tag sequences for the dev and test sets.
dev_out_path, test_out_path = './dev_result.txt', './test_result.txt'

In [97]:
# def find_max_len(data_path):
#     max_len = 0
#     for sentence in open(data_path, encoding='utf-8').read().strip().split('\n\n'):
#         max_len = max(max_len, len(sentence.strip().split('\n')))
#     return max_len

# train_max_len = max(find_max_len('./data/train/train.txt'), find_max_len('./conll_data/conll2003_train.txt'))
# dev_max_len = find_max_len('./data/dev/dev.txt')
# test_max_len = find_max_len('./data/test/test.nolabels.txt')
# print(train_max_len, dev_max_len, test_max_len)

52 41 52


In [28]:
def mycompile(pat):
    '''Compile `pat` as a Unicode-aware regular expression.'''
    return re.compile(pat, re.UNICODE)

# Placeholder token -> pattern of the raw tokens it replaces.
# Raw strings keep `\.` and `\S` intact for the regex engine instead of relying
# on Python passing unknown string escapes through (a deprecated behaviour).
re_patten = {'<NUM>': mycompile(r'^[0-9\.,/-]+$'),
             '<URL>': mycompile(r'https?://\S+')}

def norm_word(word):
    '''Map a raw token to its normalised vocabulary form.

    Tokens starting with '@' become '<@>'. Otherwise the first pattern in
    `re_patten` that matches at the start of the token decides the
    placeholder ('<NUM>' or '<URL>'). Any other token, including the empty
    string, is returned unchanged.

    Note: the '<NUM>' pattern also accepts tokens made only of '.', ',',
    '/' or '-', so bare punctuation such as ',' is normalised to '<NUM>'.
    '''
    if word.startswith('@'):
        return '<@>'
    for key, patten in re_patten.items():
        if patten.match(word):
            return key
    return word

def get_words(data_path):
    '''Return every normalised token of the file at `data_path`, in file order.

    The file is read as UTF-8. Sentences are separated by a blank line, each
    line holds tab-separated columns, and the first column is the token that
    gets passed through `norm_word`.
    '''
    # Read through a context manager so the file handle is closed
    # deterministically rather than whenever the object is garbage-collected.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read()
    words_list = []
    for sentence in content.strip().split('\n\n'):
        for line in sentence.strip().split('\n'):
            words_list.append(norm_word(line.strip().split('\t')[0]))
    return words_list

# def get_words_dict(word_list):
#     '''get words_dict'''
#     words_set = set(['<PAD>', '<@>', '<UKN>'])
#     for word in word_list:
#         words_set.add(word)
#     words_dict = dict(zip(words_set, range(len(words_set))))
#     return words_dict

def get_words_dict(data_path):
    '''Build the word -> index vocabulary from the file at `data_path`.

    The vocabulary always contains '<PAD>', '<@>' and '<UKN>' plus every
    normalised first-column token of the file. Indices are 0..len-1 assigned
    in sorted word order, so the same file yields the same mapping on every
    run. Set iteration order for strings changes between interpreter
    sessions, which would silently misalign any array indexed by these ids
    that was saved in an earlier session.

    NOTE(review): arrays cached with the previous, order-dependent indexing
    must be regenerated once after this change.
    '''
    words_set = set(['<PAD>', '<@>', '<UKN>'])
    # Context manager closes the file handle as soon as the text is read.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read()
    for sentence in content.strip().split('\n\n'):
        for line in sentence.strip().split('\n'):
            words_set.add(norm_word(line.strip().split('\t')[0]))
    return {word: idx for idx, word in enumerate(sorted(words_set))}

In [29]:
# words_list = get_words('./data/train/train.txt')
# words_list.extend(get_words('./conll_data/conll2003_train.txt'))
# words_dict = get_words_dict(words_list)
# Build the word -> index vocabulary from ./data/train/combined_data.txt.
words_dict = get_words_dict('./data/train/combined_data.txt')
# Number of distinct vocabulary entries, including the special tokens.
vocab_size = len(words_dict)
print('vocab_size:', vocab_size)

vocab_size: 31109


In [30]:
# Gold tag -> class id. 'B' and 'I' share class 1, so the model only learns
# inside-entity vs. outside-entity.
label_dict = dict(O=0, B=1, I=1)

# words_dict = get_words_dict('./data/train/train.txt')
# vocab_size = len(words_dict)
# print('vocab_size:', vocab_size)

In [32]:
# load word2vec model pretrained on GoogleNews
# Set to True to rebuild the embedding matrix from the full word2vec model;
# otherwise the matrix cached by a previous rebuild is loaded.
load_big_model = False
if load_big_model:
    # Imported lazily: gensim is only needed when the cache is rebuilt.
    import gensim
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
    # Row i holds the vector of the word whose words_dict index is i; words
    # missing from the word2vec model keep an all-zero row.
    embed_np = np.zeros((vocab_size, config.embed_dim), dtype=np.float32)
    for key, val in words_dict.items():
        if key in w2v_model:
            embed_np[val] = w2v_model[key]
    embed_np.dump('./my_embedding')
    del w2v_model
else:
    # ndarray.dump() writes a pickle, which np.load() only accepts with
    # allow_pickle=True on current NumPy versions.
    # SECURITY: unpickling executes arbitrary code, so only load a cache
    # file that this notebook produced itself.
    # The cache is only valid while words_dict assigns the same indices as
    # when the cache was written.
    embed_np = np.load('./my_embedding', allow_pickle=True)
print('embed_np.shape:', embed_np.shape)

embed_np.shape: (31109, 300)


In [47]:
# POS tag -> integer id. The 54 entries use the ids 0..53, and id 0 is
# reserved for '<PAD>'.
tags_dict = {'URL': 40, 'WP$': 2, 'VBG': 3, 'VBZ': 1, 'RBR': 4, 'IN': 6, 'RB': 7, 'CD': 9, 'VBD': 10, 'NONE': 11, 'JJR': 29, 'FW': 12, 'PDT': 13, 'VB': 15, ')': 16, 'NNS': 17, 'USR': 18, 'MD': 19, 'RT': 38, 'EX': 20, 'O': 21, 'NNPS': 22, 'RBS': 23, 'CC': 50, 'WDT': 14, '``': 24, 'VPP': 34, 'SYM': 26, 'NNP': 51, 'TO': 27, 'HT': 28, 'WP': 30, 'LS': 31, 'JJS': 32, 'DT': 33, 'POS': 35, 'WRB': 36, 'VBN': 37, "''": 39, 'UH': 41, 'PRP': 42, 'RP': 43, ',': 8, 'NN': 44, 'PRP$': 5, 'JJ': 45, '$': 46, '#': 47, '(': 49, 'VBP': 25, '<PAD>': 0, ':': 53, 'TD': 52, '.': 48}
print(tags_dict)
# Reverse lookup: integer id -> POS tag.
inv_tags_dict = {v: k for k, v in tags_dict.items()}

# Number of distinct POS tag ids.
tags_n = len(tags_dict)

{'DT': 33, 'NNS': 17, 'URL': 40, ')': 16, 'WP': 30, 'UH': 41, 'FW': 12, 'VBN': 37, 'LS': 31, 'RP': 43, 'RT': 38, 'CD': 9, 'VBG': 3, ',': 8, 'IN': 6, '(': 49, '$': 46, '#': 47, 'PRP$': 5, '<PAD>': 0, 'PRP': 42, 'JJS': 32, 'MD': 19, 'VBD': 10, '.': 48, ':': 53, 'VB': 15, 'NN': 44, 'RB': 7, 'SYM': 26, 'JJ': 45, 'O': 21, 'PDT': 13, 'WP$': 2, 'NNPS': 22, 'VBP': 25, 'JJR': 29, 'POS': 35, 'EX': 20, 'VPP': 34, 'NNP': 51, 'RBR': 4, "''": 39, '``': 24, 'TO': 27, 'VBZ': 1, 'WRB': 36, 'USR': 18, 'RBS': 23, 'WDT': 14, 'HT': 28, 'NONE': 11, 'CC': 50, 'TD': 52}


In [51]:
def norm_pos_tag(tag):
    '''Return the integer id of POS tag `tag`.

    Two spellings are folded onto tags present in `tags_dict` first: a
    double quote is treated as "''" and 'NN|SYM' as 'NN'. Any other tag is
    looked up as-is; an unknown tag raises KeyError.
    '''
    aliases = {'\"': "''", 'NN|SYM': 'NN'}
    return tags_dict[aliases.get(tag, tag)]

In [96]:
def word_to_idx(word):
    '''Return the vocabulary index of `word` after normalisation.

    Tokens whose normalised form is not in `words_dict` share the index of
    the '<UKN>' entry.
    '''
    key = norm_word(word)
    if key not in words_dict:
        key = '<UKN>'
    return words_dict[key]

def load_test_data(data_path):
    '''
    Load an unlabelled, POS-annotated data file.

    Each line holds `word<TAB>pos_tag`; sentences are separated by a blank
    line. Every per-token sequence is right-padded to `config.step_n`
    (word ids with the '<PAD>' index, POS ids and capitalisation flags
    with 0).

    Return
    data: list [list [word]]
    data_idx: np.array [num_sentence, num_words]
    data_pos: np.array [num_sentence, num_words]
    data_cap: np.array [num_sentence, num_words]
    data_len: np.array [num_sentence]

    Raise
    ValueError: a sentence has more than `config.step_n` tokens.
    '''
    # Context manager closes the file handle as soon as the text is read.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read()
    data, data_idx, data_pos, data_cap, data_len = [], [], [], [], []
    for sentence in content.strip().split('\n\n'):
        sent_data, sent_data_idx, sent_data_cap, sent_data_pos = [], [], [], []
        for line in sentence.strip().split('\n'):
            line = line.strip().split('\t')
            sent_data.append(line[0])
            sent_data_idx.append(word_to_idx(line[0]))
            sent_data_pos.append(norm_pos_tag(line[1]))
            # Capitalisation flag: 1 when the first character is upper-case.
            sent_data_cap.append(1 if line[0][0].isupper() else 0)
        # Padding cannot shorten a row, so an over-long sentence would yield
        # ragged (or wrongly sized) arrays; fail early with a clear message.
        if len(sent_data) > config.step_n:
            raise ValueError('sentence with %d tokens exceeds config.step_n = %d'
                             % (len(sent_data), config.step_n))
        pad_n = config.step_n - len(sent_data)
        sent_data_idx.extend([words_dict['<PAD>']] * pad_n)
        sent_data_pos.extend([0] * pad_n)
        sent_data_cap.extend([0] * pad_n)
        data.append(sent_data)
        data_idx.append(sent_data_idx)
        data_pos.append(sent_data_pos)
        data_cap.append(sent_data_cap)
        data_len.append(len(sent_data))
    return data, np.asarray(data_idx, dtype=np.int32), np.asarray(data_pos, dtype=np.int32), np.asarray(data_cap, dtype=np.int32), np.asarray(data_len, dtype=np.int32)

def load_train_data(data_path):
    '''
    Load a labelled, POS-annotated data file.

    Each line holds `word<TAB>label<TAB>pos_tag`; sentences are separated by
    a blank line. Every per-token sequence is right-padded to
    `config.step_n` (word ids with the '<PAD>' index, POS ids and
    capitalisation flags with 0, labels with the all-zero row [0, 0]).

    Return
    data: list [list [word]]
    data_idx: np.array [num_sentence, num_words]
    data_pos: np.array [num_sentence, num_words]
    data_cap: np.array [num_sentence, num_words]
    data_len: np.array [num_sentence]
    label: np.array [num_sentence, num_words, 2]

    Raise
    ValueError: a sentence has more than `config.step_n` tokens.
    '''
    # Context manager closes the file handle as soon as the text is read.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read()
    data, data_idx, data_pos, data_cap, data_len, label = [], [], [], [], [], []
    for sentence in content.strip().split('\n\n'):
        sent_data, sent_data_idx, sent_data_cap, sent_data_pos, sent_label = [], [], [], [], []
        for line in sentence.strip().split('\n'):
            line = line.strip().split('\t')
            sent_data.append(line[0])
            sent_data_idx.append(word_to_idx(line[0]))
            # One-hot label row over the two classes defined by label_dict.
            one_hot = [0, 0]
            one_hot[label_dict[line[1]]] = 1
            sent_label.append(one_hot)
            sent_data_pos.append(norm_pos_tag(line[2]))
            # Capitalisation flag: 1 when the first character is upper-case.
            sent_data_cap.append(1 if line[0][0].isupper() else 0)
        # Padding cannot shorten a row, so an over-long sentence would yield
        # ragged (or wrongly sized) arrays; fail early with a clear message.
        if len(sent_data) > config.step_n:
            raise ValueError('sentence with %d tokens exceeds config.step_n = %d'
                             % (len(sent_data), config.step_n))
        pad_n = config.step_n - len(sent_data)
        sent_data_idx.extend([words_dict['<PAD>']] * pad_n)
        sent_data_pos.extend([0] * pad_n)
        sent_data_cap.extend([0] * pad_n)
        # Fresh list per padded position so no two rows alias each other.
        sent_label.extend([0, 0] for _ in range(pad_n))
        data.append(sent_data)
        data_idx.append(sent_data_idx)
        data_pos.append(sent_data_pos)
        data_cap.append(sent_data_cap)
        data_len.append(len(sent_data))
        label.append(sent_label)
    return data, np.asarray(data_idx, dtype=np.int32), np.asarray(data_pos, dtype=np.int32), np.asarray(data_cap, dtype=np.int32), np.asarray(data_len, dtype=np.int32), np.asarray(label, dtype=np.int32)

In [44]:
# train_data, train_data_idx, train_data_len = load_test_data('./data/train/train.txt')

In [84]:
# Load the labelled training and dev sets: raw tokens, padded word / POS /
# capitalisation id arrays, true sentence lengths and one-hot labels.
train_data, train_data_idx, train_data_pos, train_data_cap, train_data_len, train_label = load_train_data('./data/train/combined_data.txt')
dev_data, dev_data_idx, dev_data_pos, dev_data_cap, dev_data_len, dev_label = load_train_data('./data/dev/dev_pos.txt')
# train_data_cap.shape = train_data_cap.shape + (1,)
# dev_data_cap.shape = dev_data_cap.shape + (1,)

In [97]:
# Load the unlabelled test set (same outputs as the labelled loader, minus labels).
test_data, test_data_idx, test_data_pos, test_data_cap, test_data_len = load_test_data('./data/test/test_pos.txt')

In [85]:
# Sanity check: shapes and dtypes of the padded training arrays.
print(train_data_idx.shape, train_data_idx.dtype)
print(train_label.shape, train_label.dtype)
print(train_data_cap.shape, train_data_cap.dtype)

(23072, 52) int32
(23072, 52, 2) int32
(23072, 52) int32


In [14]:
# load_test_data returns five values and reads a POS tag from the second
# column of every line, so the former 3-name unpacking raised ValueError on a
# fresh run. Load the POS-annotated test file and unpack all five results.
# NOTE(review): './data/test/test.nolabels.txt' presumably has no POS column,
# which the current loader requires - confirm before switching back to it.
test_data, test_data_idx, test_data_pos, test_data_cap, test_data_len = load_test_data('./data/test/test_pos.txt')

In [62]:
def get_lstm_cell(size):
    '''Build one LSTM cell with `size` units, wrapped in output dropout.

    The forget-gate bias and the dropout keep probability are read from the
    global `config`.
    '''
    base_cell = tf.contrib.rnn.LSTMCell(
        size, forget_bias=config.forget_bias, state_is_tuple=True)
    return tf.contrib.rnn.DropoutWrapper(
        base_cell, output_keep_prob=config.keep_prob)

In [82]:
tf.reset_default_graph() # clear old graph

# ---- inputs: one padded row of length config.step_n per sentence ----
input_data = tf.placeholder(tf.int32, [None, config.step_n])
input_pos = tf.placeholder(tf.int32, [None, config.step_n])
input_cap = tf.placeholder(tf.int32, [None, config.step_n])
input_len = tf.placeholder(tf.int32, [None])

input_label = tf.placeholder(tf.float32, [None, config.step_n, config.class_n])

# ---- trainable parameters ----
embedding = tf.Variable(embed_np)
softmax_w = tf.Variable(tf.random_normal(shape=[2 * config.hidden_dim, config.class_n], stddev=config.random_scale))
softmax_b = tf.Variable(tf.random_normal(shape=[config.class_n], stddev=config.random_scale))

# ---- per-token features: word embedding + POS one-hot + capitalisation one-hot ----
input_embed = tf.nn.embedding_lookup(embedding, input_data)
input_pos_one_hot = tf.one_hot(input_pos, depth=tags_n, dtype=tf.float32)
input_cap_one_hot = tf.one_hot(input_cap, depth=2, dtype=tf.float32)
print(input_embed.get_shape(), input_pos_one_hot.get_shape(), input_cap_one_hot.get_shape())
input_fea = tf.concat([input_embed, input_pos_one_hot, input_cap_one_hot], axis=2)
# should be [None, config.step_n, embedding width + tags_n + 2]
print(input_fea.get_shape())

# ---- stacked bidirectional LSTM ----
# Build a separate cell object per layer. `[cell] * n` puts the SAME object
# in every layer, which newer TF 1.x releases reject and which cannot work
# anyway when the layers see inputs of different widths.
fw_cell = tf.contrib.rnn.MultiRNNCell(
    [get_lstm_cell(config.hidden_dim) for _ in range(config.layers_n)], state_is_tuple=True)
bw_cell = tf.contrib.rnn.MultiRNNCell(
    [get_lstm_cell(config.hidden_dim) for _ in range(config.layers_n)], state_is_tuple=True)
# The static RNN wants a list of step_n tensors of shape [batch, features].
output, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell,
                                       tf.unstack(tf.transpose(input_fea, perm=[1, 0, 2])),
                                       dtype=tf.float32, sequence_length=input_len)
# Back to batch-major and flatten to [batch * step_n, 2 * hidden_dim].
output = tf.reshape(tf.transpose(tf.stack(output), perm=[1, 0, 2]), [-1, 2 * config.hidden_dim])
# print('output:', output.get_shape())

# ---- classification layer ----
scores = tf.matmul(output, softmax_w) + softmax_b  # unnormalised class scores
# `logits` keeps its original meaning in this notebook: softmax probabilities.
logits = tf.reshape(tf.nn.softmax(scores), [-1, config.step_n, config.class_n])
# Take the log inside log_softmax rather than tf.log(softmax(...)): a
# probability that underflows to 0 would give 0 * -inf = NaN in the loss.
log_probs = tf.reshape(tf.nn.log_softmax(scores), [-1, config.step_n, config.class_n])

pred_y = tf.to_int32(tf.argmax(logits, 2))

# print(logits.get_shape())
# print(input_label.get_shape())

# ---- loss: cross-entropy averaged over the real tokens of each sentence ----
cross_entropy = -tf.reduce_sum(input_label * log_probs, axis=2)
# Padded positions carry an all-zero label row, so their mask value is 0.
mask = tf.sign(tf.reduce_max(tf.abs(input_label), axis=2))
cross_entropy *= mask
cross_entropy = tf.reduce_sum(cross_entropy, axis=1)
cross_entropy /= tf.cast(input_len, tf.float32)
loss = tf.reduce_mean(cross_entropy)

# ---- optimiser with global-norm gradient clipping ----
optimizer = tf.train.AdamOptimizer(config.learning_rate)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), config.grad_clip)
train_op = optimizer.apply_gradients(zip(grads, tvars))

init = tf.global_variables_initializer()

(?, 52, 300) (?, 52, 54) (?, 52, 2)
(?, 52, 356)


In [87]:
# Sanity check: the three id arrays should all match the int32 placeholders.
print(train_data_idx.dtype, train_data_pos.dtype, train_data_cap.dtype)

int32 int32 int32


In [89]:
'''Training is here'''
# Number of training sentences; one epoch walks through all of them in order.
training_iters = train_data_idx.shape[0]

sess = tf.Session()
sess.run(init)

start_t = time.time()
for _ in range(config.epoch_n):
    iter_i = 0
    while iter_i < training_iters:
        # Consecutive mini-batch [iter_i, batch_end); the last one may be short.
        batch_end = min(len(train_data_idx), iter_i + config.batch_size)
        batch_x = train_data_idx[iter_i:batch_end]
        batch_pos = train_data_pos[iter_i:batch_end]
        batch_cap = train_data_cap[iter_i:batch_end]
        batch_len = train_data_len[iter_i:batch_end]
        batch_y = train_label[iter_i:batch_end]
        batch_feed = {
            input_data: batch_x, input_pos: batch_pos, input_cap: batch_cap,
            input_len: batch_len, input_label: batch_y}
        sess.run(train_op, feed_dict=batch_feed)
        # Periodically report the loss on the batch that was just trained on.
        if iter_i % config.display_iter == 0:
            tloss = sess.run(loss, feed_dict=batch_feed)
            print('Iter %d, current loss: %.5f' % (iter_i, tloss))
        iter_i += config.batch_size
print('Training complete, time used:', time.time() - start_t)

Iter 0, current loss: 0.80071
Iter 1000, current loss: 0.25278
Iter 2000, current loss: 0.19532
Iter 3000, current loss: 0.51025
Iter 4000, current loss: 0.11887
Iter 5000, current loss: 0.14929
Iter 6000, current loss: 0.18773
Iter 7000, current loss: 0.16350
Iter 8000, current loss: 0.04540
Iter 9000, current loss: 0.01795
Iter 10000, current loss: 0.02717
Iter 11000, current loss: 0.00900
Iter 12000, current loss: 0.04182
Iter 13000, current loss: 0.01675
Iter 14000, current loss: 0.01248
Iter 15000, current loss: 0.06180
Iter 16000, current loss: 0.00891
Iter 17000, current loss: 0.01173
Iter 18000, current loss: 0.01015
Iter 19000, current loss: 0.03032
Iter 20000, current loss: 0.36667
Iter 21000, current loss: 0.01943
Iter 22000, current loss: 0.07754
Iter 23000, current loss: 0.03351
Iter 0, current loss: 0.32233
Iter 1000, current loss: 0.12696
Iter 2000, current loss: 0.04874
Iter 3000, current loss: 0.02487
Iter 4000, current loss: 0.03145
Iter 5000, current loss: 0.06958
It

In [27]:
# Checkpoint the variables of the current session under ./models/ner_model1.
# NOTE(review): presumably the ./models directory must already exist - confirm.
saver = tf.train.Saver()
saver.save(sess, './models/ner_model1')

'./models/ner_model1'

In [90]:
def label_to_file(filename, label, data_len):
    '''Write binary per-token predictions to `filename` as B/I/O tags.

    `label` holds one sequence of 0/1 predictions per sentence and
    `data_len` the number of leading positions of each sequence to write.
    A 1 becomes 'B' when it starts a run of 1s and 'I' when it continues
    one; anything else becomes 'O'. One tag is written per line and every
    sentence is followed by an empty line.
    '''
    with open(filename, 'w') as out:
        for tags, length in zip(label, data_len):
            prev_in_span = False
            for pos in range(length):
                in_span = tags[pos] == 1
                if not in_span:
                    mark = 'O'
                elif prev_in_span:
                    mark = 'I'
                else:
                    mark = 'B'
                out.write(mark + '\n')
                prev_in_span = in_span
            out.write('\n')

In [93]:
# Predict a class id for every dev-set token in a single forward pass (no
# labels are fed), then write the predictions as B/I/O tags to dev_out_path.
dev_pred = sess.run(pred_y, feed_dict = {
        input_data: dev_data_idx, input_pos: dev_data_pos, input_cap: dev_data_cap,
        input_len: dev_data_len})
label_to_file(dev_out_path, dev_pred, dev_data_len)

In [95]:
# Score the dev predictions against the gold file and time the evaluation.
start_t = time.time()
evaluate_tagging_file('./data/dev/dev.txt', dev_out_path)
print('Test complete, time used:', time.time() - start_t)

Span-level NER evaluation
F = 0.4562,  Prec = 0.4841 (198/409),  Rec = 0.4314 (198/459)
(959 sentences, 13360 tokens, 459 gold spans, 409 predicted spans)
Test complete, time used: 0.0693213939666748


In [98]:
# Predict a class id for every test-set token and write the B/I/O tags to
# test_out_path. The result is stored in `test_pred`: it was previously
# assigned to `dev_pred`, which overwrote the dev predictions and left
# `test_pred` undefined for the label_to_file call below.
test_pred = sess.run(pred_y, feed_dict = {
        input_data: test_data_idx, input_pos: test_data_pos, input_cap: test_data_cap,
        input_len: test_data_len})
label_to_file(test_out_path, test_pred, test_data_len)