In [1]:
import time
import re

import tensorflow as tf
import numpy as np

from tageval import evaluate_tagging_file

In [23]:
class MyConfig(object):
    '''My configuration'''
    learning_rate = 0.0001
    embed_dim = 300
    tags_dim = 54
    fea_dim = embed_dim + tags_dim + 2
    hidden_dim = 256
    class_n = 2
    random_scale = 0.1
    # below are related to training
    epoch_n = 3
    batch_size = 200
    display_iter = 5000

config = MyConfig()

In [3]:
dev_out_path = './mlp_dev_result.txt'

In [4]:
mycompile = lambda pat:  re.compile(pat,  re.UNICODE)
re_patten = {'<NUM>': mycompile('^[0-9\.,/-]+$'),
             '<URL>': mycompile('https?://\S+')}

def norm_word(word):
    '''normalize word'''
    if len(word) > 0 and word[0] == '@':
        return'<@>'
    for key, patten in re_patten.items():
        if patten.match(word):
            return key
    return word

def get_words(data_path):
    words_list = []
    for sentence in open(data_path, encoding='utf-8').read().strip().split('\n\n'):
        for line in sentence.strip().split('\n'):
            words_list.append(norm_word(line.strip().split('\t')[0]))
    return words_list

# def get_words_dict(word_list):
#     '''get words_dict'''
#     words_set = set(['<PAD>', '<@>', '<UKN>'])
#     for word in word_list:
#         words_set.add(word)
#     words_dict = dict(zip(words_set, range(len(words_set))))
#     return words_dict

def get_words_dict(data_path):
    '''get words_dict'''
    words_set = set(['<PAD>', '<@>', '<UKN>'])
    for sentence in open(data_path, encoding='utf-8').read().strip().split('\n\n'):
        for line in sentence.strip().split('\n'):
            words_set.add(norm_word(line.strip().split('\t')[0]))
    words_dict = dict(zip(words_set, range(len(words_set))))
    return words_dict

In [5]:
words_dict = get_words_dict('./data/train/combined_data.txt')
vocab_size = len(words_dict)
print('vocab_size:', vocab_size)

vocab_size: 31109


In [6]:
label_dict = {'O': 0, 'B': 1, 'I':1}

In [7]:
# load word2vec model pretrained on GoogleNews
load_big_model = False
if load_big_model:
    import gensim
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
    embed_np = np.zeros((vocab_size, config.embed_dim), dtype=np.float32)
    for key, val in words_dict.items():
        if key in w2v_model:
            embed_np[val] = w2v_model[key]
    embed_np.dump('./my_embedding')
    del w2v_model
else:
    embed_np = np.load('./my_embedding')
print('embed_np.shape:', embed_np.shape)

embed_np.shape: (31109, 300)


In [8]:
tags_dict = {'URL': 40, 'WP$': 2, 'VBG': 3, 'VBZ': 1, 'RBR': 4, 'IN': 6, 'RB': 7, 'CD': 9, 'VBD': 10, 'NONE': 11, 'JJR': 29, 'FW': 12, 'PDT': 13, 'VB': 15, ')': 16, 'NNS': 17, 'USR': 18, 'MD': 19, 'RT': 38, 'EX': 20, 'O': 21, 'NNPS': 22, 'RBS': 23, 'CC': 50, 'WDT': 14, '``': 24, 'VPP': 34, 'SYM': 26, 'NNP': 51, 'TO': 27, 'HT': 28, 'WP': 30, 'LS': 31, 'JJS': 32, 'DT': 33, 'POS': 35, 'WRB': 36, 'VBN': 37, "''": 39, 'UH': 41, 'PRP': 42, 'RP': 43, ',': 8, 'NN': 44, 'PRP$': 5, 'JJ': 45, '$': 46, '#': 47, '(': 49, 'VBP': 25, '<PAD>': 0, ':': 53, 'TD': 52, '.': 48}
print(tags_dict)
inv_tags_dict = {v: k for k, v in tags_dict.items()}

tags_n = len(tags_dict)

{'POS': 35, 'URL': 40, 'CC': 50, 'MD': 19, 'NNPS': 22, 'VBD': 10, 'WP': 30, ',': 8, '$': 46, 'NNP': 51, '(': 49, 'RBS': 23, 'VBG': 3, 'JJ': 45, 'NONE': 11, 'RB': 7, ':': 53, ')': 16, 'O': 21, "''": 39, 'VBN': 37, 'UH': 41, 'WRB': 36, 'TO': 27, 'FW': 12, 'WDT': 14, 'NNS': 17, 'JJS': 32, 'JJR': 29, 'PRP$': 5, 'VPP': 34, 'PRP': 42, 'TD': 52, 'IN': 6, 'HT': 28, 'EX': 20, 'VB': 15, 'VBP': 25, 'CD': 9, 'WP$': 2, '<PAD>': 0, 'USR': 18, 'DT': 33, 'RP': 43, '.': 48, 'VBZ': 1, '``': 24, 'RBR': 4, 'PDT': 13, 'LS': 31, 'RT': 38, 'NN': 44, 'SYM': 26, '#': 47}


In [9]:
def norm_pos_tag(tag):
    if tag == '\"':
        return tags_dict["''"]
    elif tag == 'NN|SYM':
        return tags_dict['NN']
    return tags_dict[tag]

In [16]:
def word_to_idx(word):
    word = norm_word(word)
    if word in words_dict:
        return words_dict[word]
    return words_dict['<UKN>']

def load_train_data(data_path):
    '''
    Return
    data: list [list [word]]
    data_idx: np.array [num_words]
    data_pos: np.array [num_words]
    data_cap: np.array [num_words]
    data_len: np.array [num_sentence]
    label: np.array [num_words, 2]
    '''
    data, data_idx, data_pos, data_cap, data_len, label = [], [], [], [], [], []
    for sentence in open(data_path, encoding='utf-8').read().strip().split('\n\n'):
        sentence = sentence.strip().split('\n')
        for line in sentence:
            line = line.strip().split('\t')        
            data.append(line[0])
            data_idx.append(word_to_idx(line[0]))
            data_pos.append(norm_pos_tag(line[2]))
            data_cap.append(1 if line[0][0].isupper() else 0)
            label.append([0, 0])
            label[-1][label_dict[line[1]]] = 1
        data_len.append(len(sentence))
    return data, np.asarray(data_idx, dtype=np.int32), np.asarray(data_pos, dtype=np.int32), np.asarray(data_cap, dtype=np.int32), np.asarray(data_len, dtype=np.int32), np.asarray(label, dtype=np.int32)

In [17]:
train_data, train_data_idx, train_data_pos, train_data_cap, train_data_len, train_label = load_train_data('./data/train/combined_data.txt')
dev_data, dev_data_idx, dev_data_pos, dev_data_cap, dev_data_len, dev_label = load_train_data('./data/dev/dev_pos.txt')

In [18]:
print(train_data_idx.shape, train_data_idx.dtype)
print(train_label.shape, train_label.dtype)
print(train_data_cap.shape, train_data_cap.dtype)
print(train_data_len.shape, train_data_len.dtype)

(343685,) int32
(343685, 2) int32
(343685,) int32
(23072,) int32


In [32]:
tf.reset_default_graph() # clear old graph

input_data = tf.placeholder(tf.int32, [None])
input_pos = tf.placeholder(tf.int32, [None])
input_cap = tf.placeholder(tf.int32, [None])

input_label = tf.placeholder(tf.int32, [None, config.class_n])

embedding = tf.Variable(embed_np)
softmax_w = tf.Variable(tf.random_normal(shape=[2 * config.hidden_dim, config.class_n], stddev=config.random_scale))
softmax_b = tf.Variable(tf.random_normal(shape=[config.class_n], stddev=config.random_scale))

weights = {
    'layer_1': tf.Variable(tf.random_normal([config.fea_dim, config.hidden_dim])),
    'layer_out': tf.Variable(tf.random_normal([config.hidden_dim, config.class_n]))
}

biases = {
    'layer_1': tf.Variable(tf.random_normal([config.hidden_dim])),
    'layer_out': tf.Variable(tf.random_normal([config.class_n]))
}

input_embed = tf.nn.embedding_lookup(embedding, input_data)
input_pos_one_hot = tf.one_hot(input_pos, depth=tags_n, dtype=tf.float32)
input_cap_one_hot = tf.one_hot(input_cap, depth=2, dtype=tf.float32)
print(input_embed.get_shape(), input_pos_one_hot.get_shape(), input_cap_one_hot.get_shape())
input_fea = tf.concat([input_embed, input_pos_one_hot, input_cap_one_hot], axis=1)
# should be [None, config.fea_dim]
print(input_fea.get_shape())

hidden_fea = tf.matmul(input_fea, weights['layer_1']) + biases['layer_1']
hidden_fea = tf.nn.relu(hidden_fea)
print(hidden_fea.get_shape())

output = tf.matmul(hidden_fea, weights['layer_out']) + biases['layer_out']
pred_y = tf.to_int32(tf.argmax(output, 1))
correct_pred = tf.equal(pred_y, input_label)
# print(correct_pred.get_shape())
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Compute loss
# y_one_hot = tf.one_hot(y, depth=output_dim)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=input_label)) / config.batch_size

train_op = tf.train.AdamOptimizer(learning_rate=config.learning_rate).minimize(loss)

init = tf.global_variables_initializer()

(?, 300) (?, 54) (?, 2)
(?, 356)
(?, 256)


In [33]:
'''Training is here'''
training_iters = train_data_idx.shape[0]

sess = tf.Session()
sess.run(init)

start_t = time.time()
for _ in range(config.epoch_n):
# for _ in range(1):
    iter_i = 0
    while iter_i < training_iters:
        batch_x = train_data_idx[iter_i : min(len(train_data_idx), iter_i + config.batch_size)]
        batch_pos = train_data_pos[iter_i : min(len(train_data_idx), iter_i + config.batch_size)]
        batch_cap = train_data_cap[iter_i : min(len(train_data_idx), iter_i + config.batch_size)]
        batch_y = train_label[iter_i : min(len(train_data_idx), iter_i + config.batch_size)]
        sess.run(train_op, feed_dict = {
                input_data: batch_x, input_pos: batch_pos, input_cap: batch_cap, input_label: batch_y})
        if iter_i % config.display_iter == 0:
            tloss = sess.run(loss, feed_dict = {
                    input_data: batch_x, input_pos: batch_pos, input_cap: batch_cap, input_label: batch_y})
            print('Iter %d, current loss: %.5f' % (iter_i, tloss))
        iter_i += config.batch_size
print('Training complete, time used:', time.time() - start_t)

Iter 0, current loss: 0.25650
Iter 5000, current loss: 0.18363
Iter 10000, current loss: 0.11161
Iter 15000, current loss: 0.09278
Iter 20000, current loss: 0.07222
Iter 25000, current loss: 0.07948
Iter 30000, current loss: 0.01840
Iter 35000, current loss: 0.03125
Iter 40000, current loss: 0.02693
Iter 45000, current loss: 0.04486
Iter 50000, current loss: 0.03579
Iter 55000, current loss: 0.05911
Iter 60000, current loss: 0.03992
Iter 65000, current loss: 0.03166
Iter 70000, current loss: 0.01197
Iter 75000, current loss: 0.06403
Iter 80000, current loss: 0.03870
Iter 85000, current loss: 0.01569
Iter 90000, current loss: 0.01334
Iter 95000, current loss: 0.00899
Iter 100000, current loss: 0.01066
Iter 105000, current loss: 0.01798
Iter 110000, current loss: 0.01014
Iter 115000, current loss: 0.01636
Iter 120000, current loss: 0.01190
Iter 125000, current loss: 0.02199
Iter 130000, current loss: 0.00401
Iter 135000, current loss: 0.01573
Iter 140000, current loss: 0.02417
Iter 14500

KeyboardInterrupt: 

In [42]:
def label_to_file(filename, label, data_len):
    with open(filename, 'w') as ofile:
        count = 0
        for sent_len in data_len:
            for i in range(sent_len):
                tlabel = 'O'
                if label[count] == 1:
                    if i > 0 and count > 0 and label[count-1] == 1:
                        tlabel = 'I'
                    else:
                        tlabel = 'B'                    
                ofile.write(tlabel+'\n')
                count += 1
            ofile.write('\n')

In [43]:
start_t = time.time()
dev_pred = sess.run(pred_y, feed_dict = {
        input_data: dev_data_idx, input_pos: dev_data_pos, input_cap: dev_data_cap})
print('Dev complete, time used:', time.time() - start_t)
label_to_file(dev_out_path, dev_pred, dev_data_len)

Dev complete, time used: 0.043956756591796875


In [44]:
evaluate_tagging_file('./data/dev/dev.txt', dev_out_path)

Span-level NER evaluation
F = 0.2662,  Prec = 0.2645 (123/465),  Rec = 0.2680 (123/459)
(959 sentences, 13360 tokens, 459 gold spans, 465 predicted spans)
