In [79]:
import time
import re

import tensorflow as tf
import numpy as np

from tageval import evaluate_tagging_file

In [120]:
class MyConfig(object):
    '''Hyper-parameters of the tagger and of its training loop.'''
    # --- model shape ---
    step_n = 52           # max sentence length; inputs are padded to this many steps
    embed_dim = 300       # width of one word-embedding row
    hidden_dim = 256      # LSTM size per direction
    layers_n = 2          # stacked LSTM layers per direction
    class_n = 2           # number of output classes
    # --- initialisation / regularisation ---
    random_scale = 0.1    # stddev of the random-normal init of the output layer
    keep_prob = 1.0       # used for dropout
    forget_bias = 1.0
    # --- optimisation ---
    learning_rate = 0.0001
    grad_clip = 10        # global-norm threshold for gradient clipping
    # --- training schedule ---
    epoch_n = 5
    batch_size = 10
    display_iter = 1000   # print the loss when the example offset is a multiple of this

config = MyConfig()

In [45]:
# File the dev-set predictions are written to and that the evaluation cell reads back.
dev_out_path = './t_result'

In [97]:
# def find_max_len(data_path):
#     max_len = 0
#     for sentence in open(data_path, encoding='utf-8').read().strip().split('\n\n'):
#         max_len = max(max_len, len(sentence.strip().split('\n')))
#     return max_len

# train_max_len = max(find_max_len('./data/train/train.txt'), find_max_len('./conll_data/conll2003_train.txt'))
# dev_max_len = find_max_len('./data/dev/dev.txt')
# test_max_len = find_max_len('./data/test/test.nolabels.txt')
# print(train_max_len, dev_max_len, test_max_len)

52 41 52


In [98]:
def mycompile(pat):
    '''Compile `pat` with the re.UNICODE flag.'''
    return re.compile(pat, re.UNICODE)


# Placeholder token -> pattern recognising the raw words it replaces.
# Raw strings are required: `\.` and `\S` are regex escapes, not valid Python
# string escapes, and non-raw literals trigger invalid-escape warnings.
# NOTE(review): '<NUM>' also matches punctuation-only tokens such as '.' or '-'
# because no digit is required -- confirm that this is intended.
re_patten = {'<NUM>': mycompile(r'^[0-9\.,/-]+$'),
             '<URL>': mycompile(r'https?://\S+')}


def norm_word(word):
    '''Normalize a word.

    Words starting with '@' become '<@>'; words matching one of the
    `re_patten` patterns (tested from the start of the word) become that
    pattern's placeholder key; every other word is returned unchanged.
    '''
    if word.startswith('@'):
        return '<@>'
    for key, patten in re_patten.items():
        if patten.match(word):
            return key
    return word

def get_words(data_path):
    '''Collect the normalized word of every token line in `data_path`.

    The file is read as UTF-8. Sentences are separated by blank lines and
    each token line is tab-separated with the word in the first field.

    Return
    words_list: list [word], in file order, duplicates kept
    '''
    # `with` closes the handle as soon as the content is read; the previous
    # bare open() call left that to the garbage collector.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read()

    words_list = []
    for sentence in content.strip().split('\n\n'):
        for line in sentence.strip().split('\n'):
            line = line.strip()
            if not line:
                # An empty file or a run of 3+ newlines yields an empty
                # "line"; it carries no token and must not enter the vocabulary.
                continue
            words_list.append(norm_word(line.split('\t')[0]))
    return words_list

def get_words_dict(word_list):
    '''Build a deterministic word -> index vocabulary.

    The special tokens '<PAD>', '<@>' and '<UKN>' always get indices 0, 1 and
    2; the remaining distinct words follow in sorted order.

    Indices used to come from set iteration order, which for strings changes
    between interpreter runs (hash randomisation). Any matrix cached on disk
    and indexed by these ids (e.g. './my_embedding') then no longer matched
    the vocabulary after a kernel restart. Caches written with the old
    arbitrary order must be regenerated once.

    Return
    words_dict: dict {word: index}, indices are 0 .. len(words_dict) - 1
    '''
    special_tokens = ['<PAD>', '<@>', '<UKN>']
    words_dict = {token: idx for idx, token in enumerate(special_tokens)}
    for word in sorted(set(word_list) - set(special_tokens)):
        words_dict[word] = len(words_dict)
    return words_dict

# def get_words_dict(data_path):
#     '''get words_dict'''
#     words_set = set(['<PAD>', '<@>', '<UKN>'])
#     for sentence in open(data_path, encoding='utf-8').read().strip().split('\n\n'):
#         for line in sentence.strip().split('\n'):
#             words_set.add(norm_word(line.strip().split('\t')[0]))
#     words_dict = dict(zip(words_set, range(len(words_set))))
#     return words_dict

In [99]:
# The vocabulary is built from both training files.
train_paths = ['./data/train/train.txt', './conll_data/conll2003_train.txt']
words_list = []
for train_path in train_paths:
    words_list.extend(get_words(train_path))
words_dict = get_words_dict(words_list)
vocab_size = len(words_dict)
print('vocab_size:', vocab_size)

vocab_size: 31525


In [6]:
# Tag -> class index. 'B' and 'I' share class 1, so only two classes are distinguished.
label_dict = dict(O=0, B=1, I=1)

# words_dict = get_words_dict('./data/train/train.txt')
# vocab_size = len(words_dict)
# print('vocab_size:', vocab_size)

vocab_size: 9456


In [113]:
# load word2vec model pretrained on GoogleNews
# Set load_big_model = True to rebuild the cached matrix from the full word2vec
# file; otherwise the matrix previously dumped to './my_embedding' is reused.
load_big_model = False
if load_big_model:
    import gensim
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
    # Row i holds the vector of the word whose words_dict index is i;
    # words missing from word2vec keep an all-zero row.
    embed_np = np.zeros((vocab_size, config.embed_dim), dtype=np.float32)
    for key, val in words_dict.items():
        if key in w2v_model:
            embed_np[val] = w2v_model[key]
    embed_np.dump('./my_embedding')
    del w2v_model
else:
    # ndarray.dump() writes a pickle, and recent NumPy refuses to unpickle
    # unless allow_pickle=True is passed explicitly.
    # SECURITY: unpickling executes arbitrary code -- only load a cache file
    # that this notebook produced itself.
    embed_np = np.load('./my_embedding', allow_pickle=True)
# A cache built for a different vocabulary would silently give wrong vectors.
if embed_np.shape != (vocab_size, config.embed_dim):
    raise ValueError('embedding shape %r does not match (vocab_size, embed_dim) = %r; '
                     'rebuild the cache with load_big_model = True'
                     % (embed_np.shape, (vocab_size, config.embed_dim)))
print('embed_np.shape:', embed_np.shape)

embed_np.shape: (31525, 300)


In [17]:
def word_to_idx(word):
    '''Return the vocabulary index of `word` after normalization.

    Words that are not in `words_dict` map to the index of '<UKN>'.
    '''
    key = norm_word(word)
    if key not in words_dict:
        key = '<UKN>'
    return words_dict[key]

def load_data_label(data_path):
    '''
    Read a tagging file (one "word<TAB>label" line per token, blank line
    between sentences, UTF-8) and pad every sentence to config.step_n tokens.

    Return
    data: list [list [word]]
    data_idx: np.array [num_sentence, config.step_n], padded with the '<PAD>' index
    data_len: np.array [num_sentence], unpadded sentence lengths
    label: np.array [num_sentence, config.step_n, config.class_n], one-hot per
           token; rows for padding positions are all zero

    Raises
    ValueError: if a sentence has more than config.step_n tokens
    '''
    # Close the file deterministically instead of leaking the handle.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read()

    pad_idx = words_dict['<PAD>']
    data, data_idx, data_len, label = [], [], [], []
    for sentence in content.strip().split('\n\n'):
        sent_data, sent_data_idx, sent_label = [], [], []
        for line in sentence.strip().split('\n'):
            if not line.strip():
                # produced by an empty file or by 3+ consecutive newlines
                continue
            line = line.strip().split('\t')
            sent_data.append(line[0])
            sent_data_idx.append(word_to_idx(line[0]))
            one_hot = [0] * config.class_n
            one_hot[label_dict[line[1]]] = 1
            sent_label.append(one_hot)
        if not sent_data:
            continue
        if len(sent_data) > config.step_n:
            # Without this check the rows become ragged and np.asarray fails
            # with a far less helpful ValueError.
            raise ValueError('sentence with %d tokens exceeds config.step_n = %d in %s'
                             % (len(sent_data), config.step_n, data_path))
        pad_n = config.step_n - len(sent_data)
        sent_data_idx.extend([pad_idx] * pad_n)
        # fresh list per padding row (no aliasing between rows)
        sent_label.extend([0] * config.class_n for _ in range(pad_n))
        data.append(sent_data)
        data_idx.append(sent_data_idx)
        data_len.append(len(sent_data))
        label.append(sent_label)
    # reshape only matters for an empty file, where it keeps the documented rank
    return (data,
            np.asarray(data_idx, dtype=np.int32).reshape(-1, config.step_n),
            np.asarray(data_len, dtype=np.int32),
            np.asarray(label, dtype=np.int32).reshape(-1, config.step_n, config.class_n))

# def load_test_data(data_path):
#     def load_data_label(data_path):
#     '''
#     Return
#     data: list [list [word]]
#     data_idx: np.array [num_sentence, num_words]
#     data_len: np.array [num_sentence]
#     '''

In [131]:
# Exploratory spaCy check: within the visible notebook, `nlp` and `doc` are
# only consumed by the print loop in the next cell.
# NOTE(review): the captured output says only the 'en' tokenizer was loaded,
# so lemma/tag/pos attributes come back empty -- confirm the model install.
import spacy
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')



    Only loading the 'en' tokenizer.



In [132]:
# Print, per token: text, then the id / string pair for lemma, tag and pos.
for word in doc:
    attrs = (word.text,
             word.lemma, word.lemma_,
             word.tag, word.tag_,
             word.pos, word.pos_)
    print(*attrs)

This 0  0  0 
is 0  0  0 
a 0  0  0 
sentence 0  0  0 
. 0  0  0 


In [107]:
# Load the train and dev splits: raw words, padded index matrix, sentence
# lengths and one-hot labels (see load_data_label for the exact shapes).
train_data, train_data_idx, train_data_len, train_label = load_data_label('./data/train/train.txt')
dev_data, dev_data_idx, dev_data_len, dev_label = load_data_label('./data/dev/dev.txt')

In [108]:
# Second training corpus, loaded with the same loader and the same vocabulary.
train_data_conll, train_data_idx_conll, train_data_len_conll, train_label_conll = load_data_label('./conll_data/conll2003_train.txt')

In [109]:
# Append the CoNLL examples to the training arrays.
# `train_data` (the raw word lists from train.txt) is never extended, so its
# length is the number of rows that came from train.txt. Slicing to that
# length first makes this cell idempotent: re-running it no longer appends
# the CoNLL rows a second time.
n_base = len(train_data)
train_data_idx = np.concatenate((train_data_idx[:n_base], train_data_idx_conll), axis=0)
train_data_len = np.concatenate((train_data_len[:n_base], train_data_len_conll), axis=0)
train_label = np.concatenate((train_label[:n_base], train_label_conll), axis=0)

In [110]:
# Sanity check: shape and dtype of the combined training inputs and labels.
for train_arr in (train_data_idx, train_label):
    print(train_arr.shape, train_arr.dtype)

(23072, 52) int32
(23072, 52, 2) int32


In [4]:
def get_lstm_cell(size):
    '''Create one LSTM cell of `size` units with dropout applied to its outputs.

    forget_bias and the output keep probability are taken from `config`.
    '''
    lstm = tf.contrib.rnn.LSTMCell(size, forget_bias=config.forget_bias, state_is_tuple=True)
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=config.keep_prob)

In [121]:
tf.reset_default_graph() # clear old graph

# ---- inputs ----
input_data = tf.placeholder(tf.int32, [None, config.step_n])    # padded word indices
input_len = tf.placeholder(tf.int32, [None])                    # unpadded sentence lengths
input_label = tf.placeholder(tf.float32, [None, config.step_n, config.class_n])  # one-hot, all-zero on padding

# ---- parameters ----
embedding = tf.Variable(embed_np)
# the output layer consumes the concatenated forward + backward states, hence 2 * hidden_dim
softmax_w = tf.Variable(tf.random_normal(shape=[2 * config.hidden_dim, config.class_n], stddev=config.random_scale))
softmax_b = tf.Variable(tf.random_normal(shape=[config.class_n], stddev=config.random_scale))

input_embed = tf.nn.embedding_lookup(embedding, input_data)
# should be [None, config.step_n, config.embed_dim]
# print(input_embed.get_shape())

# ---- stacked bidirectional LSTM ----
# Every layer needs its OWN cell object. `[cell] * layers_n` repeats a single
# instance, which later TensorFlow 1.x releases either reject or treat as one
# shared set of weights for all layers.
fw_cell = tf.contrib.rnn.MultiRNNCell(
    [get_lstm_cell(config.hidden_dim) for _ in range(config.layers_n)], state_is_tuple=True)
bw_cell = tf.contrib.rnn.MultiRNNCell(
    [get_lstm_cell(config.hidden_dim) for _ in range(config.layers_n)], state_is_tuple=True)
# the static RNN wants a list of step_n tensors, each [batch, embed_dim]
output, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell,
                                       tf.unstack(tf.transpose(input_embed, perm=[1, 0, 2])),
                                       dtype=tf.float32, sequence_length=input_len)
# back to batch-major, then flatten to [batch * step_n, 2 * hidden_dim]
output = tf.reshape(tf.transpose(tf.stack(output), perm=[1, 0, 2]), [-1, 2 * config.hidden_dim])
# print('output:', output.get_shape())

# ---- output layer ----
scores = tf.matmul(output, softmax_w) + softmax_b
# NOTE: despite its name, `logits` holds class PROBABILITIES (softmax output),
# exactly as before; the name is kept so other cells keep working.
logits = tf.nn.softmax(scores)
logits = tf.reshape(logits, [-1, config.step_n, config.class_n])
# log-probabilities computed directly from the scores: log(softmax(x)) turns
# into -inf when a probability underflows, and 0 * -inf = NaN poisons the loss.
log_probs = tf.reshape(tf.nn.log_softmax(scores), [-1, config.step_n, config.class_n])

pred_y = tf.to_int32(tf.argmax(logits, 2))

# print(logits.get_shape())
# print(input_label.get_shape())

# ---- loss: per-token cross entropy, averaged per sentence, then per batch ----
cross_entropy = input_label * log_probs
cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
# 1.0 for real tokens, 0.0 for padding (padding label rows are all zero)
mask = tf.sign(tf.reduce_max(tf.abs(input_label), reduction_indices=2))
cross_entropy *= mask
cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
cross_entropy /= tf.cast(input_len, tf.float32)
loss = tf.reduce_mean(cross_entropy)

# ---- optimisation: Adam on gradients clipped by global norm ----
optimizer = tf.train.AdamOptimizer(config.learning_rate)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), config.grad_clip)
train_op = optimizer.apply_gradients(zip(grads, tvars))

init = tf.global_variables_initializer()

In [122]:
'''Training is here'''
training_iters = train_data_idx.shape[0]

sess = tf.Session()
sess.run(init)

start_t = time.time()
for _ in range(config.epoch_n):
    # Walk the training arrays front to back in fixed-size batches; slicing
    # clamps at the end of the array, so the last batch may be shorter.
    for iter_i in range(0, training_iters, config.batch_size):
        batch_end = iter_i + config.batch_size
        batch_x = train_data_idx[iter_i:batch_end]
        batch_y = train_label[iter_i:batch_end]
        batch_len = train_data_len[iter_i:batch_end]
        feed = {input_data: batch_x, input_len: batch_len, input_label: batch_y}
        sess.run(train_op, feed_dict=feed)
        if iter_i % config.display_iter == 0:
            # loss of the current batch, evaluated after the update step
            tloss = sess.run(loss, feed_dict=feed)
            print('Iter %d, current loss: %.5f' % (iter_i, tloss))
print('Training complete, time used:', time.time() - start_t)

Iter 0, current loss: 0.78438
Iter 1000, current loss: 0.27375
Iter 2000, current loss: 0.20644
Iter 3000, current loss: 0.59745
Iter 4000, current loss: 0.22995
Iter 5000, current loss: 0.22609
Iter 6000, current loss: 0.29554
Iter 7000, current loss: 0.26306
Iter 8000, current loss: 0.11595
Iter 9000, current loss: 0.08131
Iter 10000, current loss: 0.08091
Iter 11000, current loss: 0.01852
Iter 12000, current loss: 0.09810
Iter 13000, current loss: 0.07720
Iter 14000, current loss: 0.02811
Iter 15000, current loss: 0.05858
Iter 16000, current loss: 0.01446
Iter 17000, current loss: 0.01262
Iter 18000, current loss: 0.01033
Iter 19000, current loss: 0.03960
Iter 20000, current loss: 0.36965
Iter 21000, current loss: 0.02823
Iter 22000, current loss: 0.08570
Iter 23000, current loss: 0.03997
Iter 0, current loss: 0.26396
Iter 1000, current loss: 0.15037
Iter 2000, current loss: 0.06800
Iter 3000, current loss: 0.03756
Iter 4000, current loss: 0.01938
Iter 5000, current loss: 0.08270
It

In [74]:
def label_to_file(filename, label, data_len):
    '''Write predicted class ids to `filename` as B/I/O tags.

    For every sentence only the first `sent_len` entries are used. Class 1
    becomes 'B' when the previous token was not class 1 (or at the sentence
    start) and 'I' otherwise; everything else becomes 'O'. One tag per line,
    with an empty line after each sentence.
    '''
    with open(filename, 'w') as ofile:
        for sent_label, sent_len in zip(label, data_len):
            tags = []
            prev_in_span = False
            for pos in range(sent_len):
                in_span = sent_label[pos] == 1
                if not in_span:
                    tags.append('O')
                elif prev_in_span:
                    tags.append('I')
                else:
                    tags.append('B')
                prev_in_span = in_span
            # every tag is newline-terminated, then one blank separator line
            ofile.write(''.join(tag + '\n' for tag in tags) + '\n')

In [123]:
# Predict class ids for the whole dev set in one run; input_label is not fed
# because pred_y is computed from input_data and input_len only.
dev_pred = sess.run(pred_y, feed_dict={input_data: dev_data_idx, input_len: dev_data_len})

In [124]:
# Write the dev predictions as B/I/O tags, limited to each sentence's true length.
label_to_file(dev_out_path, dev_pred, dev_data_len)

In [125]:
# Score the predicted tag file against the gold dev file (the captured output
# reports span-level precision / recall / F).
evaluate_tagging_file('./data/dev/dev.txt', dev_out_path)

Span-level NER evaluation
F = 0.3323,  Prec = 0.3193 (159/498),  Rec = 0.3464 (159/459)
(959 sentences, 13360 tokens, 459 gold spans, 498 predicted spans)
