In [1]:
import time

import tensorflow as tf
import numpy as np

In [128]:
class MyConfig(object):
    '''Hyper-parameters for the bidirectional LSTM tagger and its training loop.'''
    # --- model / optimisation ---
    learning_rate = 0.0003  # step size handed to the Adam optimizer
    embed_dim = 300         # width of each word-embedding vector
    hidden_dim = 256        # LSTM units per direction
    step_n = 52             # max sentence length; inputs are padded to this many tokens
    layers_n = 2            # stacked LSTM layers per direction
    class_n = 2             # number of output classes per token
    random_scale = 0.1      # stddev of the random-normal init of the softmax weights/bias
    keep_prob = 0.5         # used for dropout (probability of keeping an output)
    forget_bias = 1.0       # forget-gate bias read by get_lstm_cell; was missing -> AttributeError
    grad_clip = 10          # global-norm threshold for gradient clipping
    # --- below are related to training ---
    epoch_n = 1             # passes over the training set
    batch_size = 20         # sentences per mini-batch
    display_iter = 200      # print the loss when the sample offset is a multiple of this

config = MyConfig()

In [10]:
def find_max_len(data_path):
    '''Return the length (in lines) of the longest sentence in a data file.

    The file is read as UTF-8. Sentences are separated by a blank line and
    hold one token per line.

    Args:
        data_path: path of the file to scan.

    Returns:
        The largest number of lines found in any sentence, or 0 when the file
        contains nothing but whitespace.
    '''
    # Use a context manager so the file handle is closed deterministically.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read().strip()
    if not content:
        # ''.split('\n\n') yields [''], which would otherwise count as length 1.
        return 0
    return max(
        len(sentence.strip().split('\n'))
        for sentence in content.split('\n\n')
    )

# Longest sentence (in lines) of the train, dev and test files, in that order.
train_max_len, dev_max_len, test_max_len = [
    find_max_len(split_path)
    for split_path in (
        './data/train/train.txt',
        './data/dev/dev.txt',
        './data/test/test.nolabels.txt',
    )
]
print(train_max_len, dev_max_len, test_max_len)

In [2]:
def get_lstm_cell(size):
    '''Get a lstm cell with size and wrapped with dropout.

    Args:
        size: number of units of the BasicLSTMCell.

    Returns:
        A DropoutWrapper around a fresh BasicLSTMCell whose outputs are kept
        with probability config.keep_prob.
    '''
    # config may not define forget_bias; reading it directly raised
    # AttributeError. Fall back to 1.0 (BasicLSTMCell's documented default).
    forget_bias = getattr(config, 'forget_bias', 1.0)
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(
        size, forget_bias=forget_bias, state_is_tuple=True
    )
    return tf.contrib.rnn.DropoutWrapper(
        lstm_cell, output_keep_prob=config.keep_prob
    )

In [35]:
def norm_word(word):
    '''Map any word starting with '@' to the shared '<@>' token; return other words unchanged.'''
    return '<@>' if word and word[0] == '@' else word

def get_words_dict(data_path):
    '''Build the word -> index vocabulary from a data file.

    The file is read as UTF-8; sentences are separated by a blank line and each
    line is tab-separated with the word in the first column. Words are passed
    through norm_word before being added. The special tokens '<PAD>', '<@>'
    and '<UKN>' are always part of the vocabulary.

    Args:
        data_path: path of the file to read.

    Returns:
        dict mapping each distinct word to a unique int in [0, vocab_size).
    '''
    # Context manager so the file handle does not leak.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read().strip()

    words_set = {'<PAD>', '<@>', '<UKN>'}
    for sentence in content.split('\n\n'):
        for line in sentence.strip().split('\n'):
            words_set.add(norm_word(line.strip().split('\t')[0]))

    # Iteration order of a set of str changes between interpreter runs (hash
    # randomisation), so ids taken from it would not match a model trained in
    # a previous kernel session. Sorting makes the mapping reproducible.
    return {word: idx for idx, word in enumerate(sorted(words_set))}

In [36]:
# Tag -> class id: 'O' is class 0, both 'B' and 'I' share class 1.
label_dict = dict(O=0, B=1, I=1)

# The vocabulary is built from the training file only.
words_dict = get_words_dict('./data/train/train.txt')
vocab_size = len(words_dict)
print('vocab_size:', vocab_size)

vocab_size: 9456


In [108]:
def word_to_idx(word):
    '''Return the words_dict index of the normalized word, or the '<UKN>' index when it is not in the vocabulary.'''
    key = norm_word(word)
    return words_dict[key] if key in words_dict else words_dict['<UKN>']

def load_data_label(data_path):
    '''Load a labelled data file into padded index / one-hot label arrays.

    The file is read as UTF-8; sentences are separated by a blank line and each
    line is "word<TAB>tag". Sentences longer than config.step_n are truncated
    to config.step_n tokens in the numeric outputs so every row has the same
    width; shorter ones are padded with the '<PAD>' index and all-zero label
    rows.

    Return
    data: list [list [word]] (raw words, not truncated)
    data_idx: np.array int32 [num_sentence, config.step_n]
    data_len: np.array int32 [num_sentence] (clamped to config.step_n)
    label: np.array int32 [num_sentence, config.step_n, config.class_n]
    '''
    # Context manager so the file handle does not leak.
    with open(data_path, encoding='utf-8') as data_file:
        content = data_file.read().strip()

    step_n, class_n = config.step_n, config.class_n
    pad_idx = words_dict['<PAD>']

    data, data_idx, data_len, label = [], [], [], []
    for sentence in content.split('\n\n'):
        sentence = sentence.strip()
        if not sentence:
            # Empty file or extra blank lines: nothing to parse in this chunk.
            continue
        sent_data, sent_data_idx, sent_label = [], [], []
        for line in sentence.split('\n'):
            fields = line.strip().split('\t')
            sent_data.append(fields[0])
            sent_data_idx.append(word_to_idx(fields[0]))
            one_hot = [0] * class_n
            one_hot[label_dict[fields[1]]] = 1
            sent_label.append(one_hot)

        # Truncate over-long sentences: without this the rows are ragged and
        # cannot form a rectangular array, and the length exceeds step_n.
        sent_data_idx = sent_data_idx[:step_n]
        sent_label = sent_label[:step_n]
        pad_n = step_n - len(sent_data_idx)
        sent_data_idx.extend([pad_idx] * pad_n)
        # Fresh list per padded row (no aliasing of one shared [0, 0] object).
        sent_label.extend([0] * class_n for _ in range(pad_n))

        data.append(sent_data)
        data_idx.append(sent_data_idx)
        data_len.append(min(len(sent_data), step_n))
        label.append(sent_label)

    # Explicit dtype keeps the result identical across platforms; reshape keeps
    # the documented rank even when no sentence was read.
    return (
        data,
        np.asarray(data_idx, dtype=np.int32).reshape(-1, step_n),
        np.asarray(data_len, dtype=np.int32),
        np.asarray(label, dtype=np.int32).reshape(-1, step_n, class_n),
    )

In [109]:
# Load the training and development splits: raw words, padded indices, lengths, labels.
(train_data, train_data_idx,
 train_data_len, train_label) = load_data_label('./data/train/train.txt')
(dev_data, dev_data_idx,
 dev_data_len, dev_label) = load_data_label('./data/dev/dev.txt')

In [91]:
# Sanity check: shape and dtype of the padded training inputs and labels.
for arr in (train_data_idx, train_label):
    print(arr.shape, arr.dtype)

(2394, 52) int32
(2394, 52, 2) int32


In [129]:
tf.reset_default_graph()  # clear old graph so this cell can be re-run

# ---- Inputs ---------------------------------------------------------------
input_data = tf.placeholder(tf.int32, [None, config.step_n])    # word indices
input_len = tf.placeholder(tf.int32, [None])                    # sentence lengths
input_label = tf.placeholder(tf.float32, [None, config.step_n, config.class_n])
# Dropout keep probability. It defaults to the training value, so feed_dicts
# that do not mention it behave as before; feed 1.0 to disable dropout when
# evaluating or predicting.
keep_prob = tf.placeholder_with_default(
    tf.constant(config.keep_prob, dtype=tf.float32), shape=[])

# ---- Parameters -----------------------------------------------------------
embedding = tf.Variable(tf.random_normal([vocab_size, config.embed_dim]))
softmax_w = tf.Variable(tf.random_normal(shape=[2 * config.hidden_dim, config.class_n], stddev=config.random_scale))
softmax_b = tf.Variable(tf.random_normal(shape=[config.class_n], stddev=config.random_scale))

# [batch, config.step_n, config.embed_dim]
input_embed = tf.nn.embedding_lookup(embedding, input_data)


def _dropout_lstm_cell():
    '''Build one fresh LSTM layer whose outputs go through dropout.'''
    lstm_cell = tf.contrib.rnn.LSTMCell(config.hidden_dim, state_is_tuple=True)
    return tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)


# Every layer needs its own cell object: `[cell] * n` puts the SAME object in
# each layer, which newer TF 1.x releases reject (the layers would have to
# share weights although their input widths differ).
fw_cell = tf.contrib.rnn.MultiRNNCell(
    [_dropout_lstm_cell() for _ in range(config.layers_n)], state_is_tuple=True)
bw_cell = tf.contrib.rnn.MultiRNNCell(
    [_dropout_lstm_cell() for _ in range(config.layers_n)], state_is_tuple=True)

# static_bidirectional_rnn takes a list of step_n tensors of [batch, embed_dim],
# hence the transpose to time-major followed by unstack.
output, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
    fw_cell, bw_cell,
    tf.unstack(tf.transpose(input_embed, perm=[1, 0, 2])),
    dtype=tf.float32, sequence_length=input_len)
# Back to batch-major and flattened to [batch * step_n, 2 * hidden_dim].
output = tf.reshape(tf.transpose(tf.stack(output), perm=[1, 0, 2]), [-1, 2 * config.hidden_dim])

# Unnormalised class scores, [batch * step_n, class_n].
scores = tf.matmul(output, softmax_w) + softmax_b
# NOTE: despite its name, `logits` holds softmax probabilities (name kept so
# code that already reads it keeps working).
logits = tf.reshape(tf.nn.softmax(scores), [-1, config.step_n, config.class_n])
# log_softmax is evaluated on the raw scores instead of log(softmax(.)): the
# latter yields -inf when a probability underflows to 0, and 0 * -inf = NaN
# would then poison the loss.
log_probs = tf.reshape(tf.nn.log_softmax(scores), [-1, config.step_n, config.class_n])

pred_y = tf.cast(tf.argmax(logits, 2), tf.int32)

# ---- Loss: per-token cross entropy averaged over each sentence's length -----
cross_entropy = -tf.reduce_sum(input_label * log_probs, axis=2)
# 1.0 where a label row has any non-zero entry, 0.0 for all-zero (padded) rows.
mask = tf.sign(tf.reduce_max(tf.abs(input_label), axis=2))
cross_entropy *= mask
cross_entropy = tf.reduce_sum(cross_entropy, axis=1)
# Guard against division by zero for a zero-length sentence.
cross_entropy /= tf.cast(tf.maximum(input_len, 1), tf.float32)
loss = tf.reduce_mean(cross_entropy)

# ---- Optimisation: Adam with global-norm gradient clipping ----------------
optimizer = tf.train.AdamOptimizer(config.learning_rate)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), config.grad_clip)
train_op = optimizer.apply_gradients(zip(grads, tvars))

init = tf.global_variables_initializer()

output: (?, 512)
(?, 52, 2)
(?, 52, 2)


In [130]:
# Training: each epoch walks the training set once, in order, in mini-batches
# of config.batch_size sentences (the last batch may be smaller).
training_iters = train_data_idx.shape[0]

sess = tf.Session()
sess.run(init)

start_t = time.time()
for _ in range(config.epoch_n):
    for iter_i in range(0, training_iters, config.batch_size):
        # Slicing clamps at the end of the arrays, so no explicit upper bound is needed.
        batch = slice(iter_i, iter_i + config.batch_size)
        batch_x = train_data_idx[batch]
        batch_y = train_label[batch]
        batch_len = train_data_len[batch]
        feed = {input_data: batch_x, input_len: batch_len, input_label: batch_y}

        sess.run(train_op, feed_dict=feed)
        if iter_i % config.display_iter == 0:
            # Loss on the batch that was just trained on, after the update.
            tloss = sess.run(loss, feed_dict=feed)
            print('Iter %d, current loss: %.5f' % (iter_i, tloss))
print('Training complete, time used:', time.time() - start_t)

Iter 0, current loss: 0.55993
Iter 200, current loss: 0.17122
Iter 400, current loss: 0.24986
Iter 600, current loss: 0.28127
Iter 800, current loss: 0.22982
Iter 1000, current loss: 0.21043
Iter 1200, current loss: 0.30745
Iter 1400, current loss: 0.29178
Iter 1600, current loss: 0.23876
Iter 1800, current loss: 0.21266
Iter 2000, current loss: 0.24083
Iter 2200, current loss: 0.27678
Training complete, time used: 180.01592302322388
