In [1]:
import time
import re

import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import pandas as pd

In [22]:
class MyConfig(object):
    '''Hyper-parameters read by the model graph and the main training loop.'''

    # --- model ---
    use_pretrain = True   # start the embedding matrix from the cached pretrained vectors
    embed_size = 300      # if using pretrained, most have the same dimension; also the LSTM size
    layers_n = 2          # number of stacked LSTM layers
    steps_n = 20          # rnn steps number (tokens per row fed to the network)
    forget_bias = 1.0     # forget-gate bias handed to each BasicLSTMCell
    keep_prob = 1.0       # used for dropout (1.0 keeps everything)

    # --- optimisation ---
    learning_rate = 0.001 # learning rate of the Adam optimizer
    grad_max = 2.0        # used for gradient clipping: bound on each gradient element
    batch_size = 20       # rows per training batch
    epoch_n = 2           # one epoch means training through the whole dataset
    loss_min = 0.5        # used to early stop training
    display_iter = 5000   # progress is printed when the row offset is a multiple of this


config = MyConfig()

In [19]:
def mycompile(pat):
    '''Compile the pattern string `pat` with the re.UNICODE flag.'''
    return re.compile(pat, re.UNICODE)


# Placeholder token -> pattern of the raw tokens it replaces.
# The patterns are raw string literals so that the regex escapes (\. and \S)
# are not parsed as (invalid) Python string escapes.
re_patten = {'<NUM>': mycompile(r'^[0-9\.,/-]+$'),
             '<URL>': mycompile(r'https?://\S+')}


def norm_word(word):
    '''Normalize a token so that open-ended classes share one vocabulary entry.

    Parameters
    word: str, a single raw token

    Return
    '<@>' if the token starts with '@',
    '<NUM>' if it consists only of digits and the characters . , / -
    '<URL>' if it starts with an http:// or https:// URL,
    otherwise the token itself, unchanged (this includes the empty string).
    '''
    if word.startswith('@'):
        return '<@>'
    # Patterns are tried with match(), i.e. anchored at the start of the token.
    for placeholder, pattern in re_patten.items():
        if pattern.match(word):
            return placeholder
    return word

In [21]:
def load_data_label(data_path):
    '''
    Read a UTF-8 file with one whitespace-separated "word tag" pair per line.
    Blank lines (the separators between sentences) are skipped, so sentence
    boundaries are not kept in the result.

    Return
    data: list [word], first field of every non-blank line
    label: list [tag], second field of every non-blank line

    Raises
    IndexError if a non-blank line has fewer than two fields
    '''
    data, label = [], []
    # The context manager closes the file even if a malformed line raises.
    with open(data_path, encoding='utf-8') as infile:
        for raw_line in infile:
            fields = raw_line.split()
            if not fields:
                # sentence separator or whitespace-only line
                continue
            data.append(fields[0])
            label.append(fields[1])
    return data, label

In [41]:
# Words and tags of ./data/pos.txt as two flat, parallel lists; they are used
# further down to extend the vocabulary and for the fine-tuning pass.
finetune_words, finetune_tags = load_data_label('./data/pos.txt')

In [32]:
# Directory of the CSV files. File names are appended by plain string
# concatenation, so the trailing slash is required.
data_path = './data/'

# Tokens come from the 'word' column of the *_x.csv files and their tags from
# the 'tag' column of the matching *_y.csv files.
train_words, train_tags, dev_words, dev_tags = (
    pd.read_csv(data_path + csv_name)[column]
    for csv_name, column in (
        ('train_x.csv', 'word'),
        ('train_y.csv', 'tag'),
        ('dev_x.csv', 'word'),
        ('dev_y.csv', 'tag'),
    )
)

In [33]:
# Tag vocabulary. Index 0 is the padding tag '<PAD>', matching the value that
# is used to pad both data and label arrays.
# The mapping was first generated by enumerating set(train_tags) plus '<PAD>';
# it is kept here as a literal, presumably so that the tag indices no longer
# depend on set ordering.
tags_dict = {
    'URL': 40, 'WP$': 2, 'IN': 6, 'VBG': 3, 'VBZ': 1, 'RBR': 4,
    'PRP$': 5, 'TD': 52, 'RB': 7, 'CD': 9, 'VBD': 10, 'NONE': 11,
    'WP': 30, 'FW': 12, 'PDT': 13, 'VB': 15, ')': 16, 'NNS': 17,
    'USR': 18, 'MD': 19, 'EX': 20, 'O': 21, 'NNPS': 22, 'RBS': 23,
    ':': 53, 'WDT': 14, '``': 24, 'POS': 35, "''": 39, 'NNP': 51,
    'TO': 27, 'HT': 28, 'JJR': 29, 'LS': 31, 'JJS': 32, 'DT': 33,
    'VPP': 34, 'WRB': 36, 'VBN': 37, '<PAD>': 0, 'UH': 41, 'PRP': 42,
    'RP': 43, ',': 8, 'NN': 44, 'JJ': 45, '$': 46, '#': 47,
    '(': 49, 'VBP': 25, 'RT': 38, 'CC': 50, 'SYM': 26, '.': 48,
}
print(tags_dict)

# Width of the output layer: one class per tag, padding included.
n_classes = len(tags_dict)
print('n_classes', n_classes)

# Reverse lookup (index -> tag string) used when predictions are written out.
inv_tags_dict = dict(zip(tags_dict.values(), tags_dict.keys()))

# Word vocabulary: every normalized token of the training and fine-tuning
# corpora plus '<UKN>', the entry used for words that are not in the vocabulary.
unique_words = {norm_word(word) for word in train_words}
unique_words.update(norm_word(word) for word in finetune_words)
unique_words.add('<UKN>')
# '<PAD>' always gets index 0 below. Dropping a literal '<PAD>' token here keeps
# the indices contiguous, so that vocab_size is always max index + 1.
unique_words.discard('<PAD>')
# Number the words in sorted order. Iteration order of a set of strings changes
# from one interpreter run to the next (hash randomization), which would make
# the rows of the cached embedding matrix point at different words after a
# kernel restart.
# NOTE: an embedding cache written with a different word -> index mapping has to
# be regenerated.
words_dict = {word: idx for idx, word in enumerate(sorted(unique_words), start=1)}
words_dict['<PAD>'] = 0

vocab_size = len(words_dict)
print('vocab_size', vocab_size)

{'``': 24, 'URL': 40, 'TO': 27, 'WP': 30, '.': 48, 'LS': 31, 'HT': 28, 'WDT': 14, 'RT': 38, 'NONE': 11, '$': 46, ')': 16, 'VPP': 34, 'NNP': 51, 'POS': 35, 'VB': 15, 'NN': 44, 'FW': 12, 'USR': 18, 'IN': 6, 'RP': 43, 'VBP': 25, 'NNPS': 22, 'SYM': 26, 'RBS': 23, 'JJ': 45, 'WRB': 36, 'VBZ': 1, 'UH': 41, 'PDT': 13, 'RBR': 4, '#': 47, 'CC': 50, 'VBD': 10, 'PRP$': 5, "''": 39, 'MD': 19, 'CD': 9, 'PRP': 42, 'JJS': 32, 'NNS': 17, ':': 53, 'WP$': 2, '<PAD>': 0, 'VBN': 37, 'TD': 52, 'VBG': 3, 'EX': 20, ',': 8, 'DT': 33, 'O': 21, '(': 49, 'RB': 7, 'JJR': 29}
n_classes 54
vocab_size 35781


In [24]:
def word_to_idx(word):
    '''Return the vocabulary index of a raw token.

    The token is normalized with norm_word first; tokens that are not in
    words_dict get the index of '<UKN>'.
    '''
    token = norm_word(word)
    key = token if token in words_dict else '<UKN>'
    return words_dict[key]

In [34]:
# Encode every split as two parallel 1-D integer arrays: word indices (unknown
# words fall back to '<UKN>') and tag indices (an unknown tag raises KeyError).
train_data = np.asarray(list(map(word_to_idx, train_words)))
train_label = np.asarray(list(map(tags_dict.__getitem__, train_tags)))

dev_data = np.asarray(list(map(word_to_idx, dev_words)))
dev_label = np.asarray(list(map(tags_dict.__getitem__, dev_tags)))
dev_len = len(dev_data)
print('dev_len:', dev_len)

# NOTE: finetune_words is rebound here from the list of tokens to the array of
# their indices. Cells that expect the tokens (the vocabulary cell) cannot be
# re-run after this one without reloading the fine-tuning file.
finetune_words = np.asarray(list(map(word_to_idx, finetune_words)))
finetune_label = np.asarray(list(map(tags_dict.__getitem__, finetune_tags)))

dev_len: 243021


In [26]:
def pad_reshape(data, steps_n=None):
    '''Zero-pad a flat sequence and reshape it to [-1, steps_n].

    Parameters
    data: 1-D array (or array-like) of word or tag indices; may be data or label
    steps_n: row length; defaults to config.steps_n when None

    Return
    2-D np.array with ceil(len(data) / steps_n) rows. The tail of the last row
    is filled with 0 (the '<PAD>' index of both vocabularies) and keeps the
    dtype of `data`.
    '''
    data = np.asarray(data)
    if steps_n is None:
        steps_n = config.steps_n
    # Number of zeros needed to reach the next multiple of steps_n (0 if aligned).
    pad_n = -data.shape[0] % steps_n
    if pad_n:
        data = np.concatenate((data, np.zeros(pad_n, dtype=data.dtype)), 0)
    return np.reshape(data, (data.shape[0] // steps_n, steps_n))

In [35]:
# Cut the flat index arrays into zero-padded rows of config.steps_n positions.
# The names are rebound in place, so this cell is meant to be run once after
# the encoding cell.
train_data, train_label = pad_reshape(train_data), pad_reshape(train_label)
dev_data, dev_label = pad_reshape(dev_data), pad_reshape(dev_label)
print('Train shape', train_data.shape, train_label.shape)
print('Dev shape', dev_data.shape, dev_label.shape)

# finetune_words holds word indices (not tokens) at this point.
finetune_data, finetune_label = pad_reshape(finetune_words), pad_reshape(finetune_label)

Train shape (34824, 20) (34824, 20)
Dev shape (12152, 20) (12152, 20)


In [36]:
if config.use_pretrain:
    # Build the [vocab_size, embed_size] embedding matrix, where row i is the
    # vector of the word with index i in words_dict. Words without a pretrained
    # vector (and '<PAD>') keep an all-zero row.
    # Set load_big_model = True to rebuild the matrix from the word2vec model
    # pretrained on GoogleNews and refresh the cache; otherwise the cached
    # matrix is read back from ./my_embedding.
    load_big_model = False
    if load_big_model:
        import gensim
        w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
        embed_np = np.zeros((vocab_size, config.embed_size), dtype=np.float32)
        for key, val in words_dict.items():
            if key in w2v_model:
                embed_np[val] = w2v_model[key]
        embed_np.dump('./my_embedding')
    else:
        # ndarray.dump() writes a pickle, which np.load only reads when
        # allow_pickle=True (the default is False since NumPy 1.16.3).
        # SECURITY: unpickling can execute arbitrary code, so only load a cache
        # file that this notebook wrote itself.
        embed_np = np.load('./my_embedding', allow_pickle=True)
    # The cache is only meaningful for the vocabulary it was built with. A shape
    # mismatch is the part of that which can be detected here.
    if embed_np.shape != (vocab_size, config.embed_size):
        raise ValueError(
            'cached embedding has shape %s, expected %s; rebuild it with load_big_model = True'
            % (embed_np.shape, (vocab_size, config.embed_size))
        )
    print('embed_np.shape:', embed_np.shape)

embed_np.shape: (35781, 300)


In [37]:
# Start from an empty default graph so that re-running this cell does not add a
# second copy of the model to the graph built by an earlier run.
tf.reset_default_graph() # clear old graph

def get_lstm_cell(size):
    '''Build one LSTM layer with `size` units and dropout on its outputs.

    The forget-gate bias and the output keep probability are read from config.
    '''
    base_cell = tf.contrib.rnn.BasicLSTMCell(
        size,
        forget_bias=config.forget_bias,
        state_is_tuple=True,
    )
    return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=config.keep_prob)

def RNN(x, lstm_cell, embedding, softmax_w, softmax_b):
    '''
    Unrolled RNN tagger: word indices in, one row of class logits per position out.

    Shapes along the way (the reshape below expects the cell output size to be
    config.embed_size):
    x                                 [batch_size, steps_n]
    => embedding_lookup + dropout  => [batch_size, steps_n, embed_size]
    => unstack on axis 1           => steps_n * [batch_size, embed_size]
    => static_rnn                  => steps_n * [batch_size, embed_size]
    => concat on axis 1            => [batch_size, steps_n * embed_size]
    => reshape                     => [batch_size * steps_n, embed_size]
    => outputs * softmax_w + softmax_b => logits[batch_size * steps_n, n_classes]

    Rows of the result are ordered sequence by sequence, then step by step,
    i.e. in the same order as tf.reshape(y, [-1]).
    '''
    embedded = tf.nn.dropout(tf.nn.embedding_lookup(embedding, x), config.keep_prob)
    step_inputs = tf.unstack(embedded, num=config.steps_n, axis=1)

    # The final cell state is not needed for tagging.
    step_outputs, _ = rnn.static_rnn(lstm_cell, step_inputs, dtype=tf.float32)
    flat_outputs = tf.reshape(
        tf.concat(axis=1, values=step_outputs),
        [-1, config.embed_size],
    )

    return tf.matmul(flat_outputs, softmax_w) + softmax_b

# tf input
# x holds word indices and y the gold tag indices; both are fed as
# [rows, steps_n] with a free number of rows.
x = tf.placeholder('int32', [None, config.steps_n])
y = tf.placeholder('int32', [None, config.steps_n])

# tf parameters
# embedding[vocab_size, embed_size]
# softmax_w[embed_size, n_classes]
# softmax_b[n_classes]
# The embedding starts from the pretrained matrix when config.use_pretrain is
# set and from random normal values otherwise; either way it is a tf.Variable.
if config.use_pretrain:
    embedding = tf.Variable(embed_np)
else:
    embedding = tf.Variable(tf.random_normal([vocab_size, config.embed_size]))
softmax_w = tf.Variable(tf.random_normal([config.embed_size, n_classes]))
softmax_b = tf.Variable(tf.random_normal([n_classes]))

# layers_n stacked LSTM layers, each with embed_size units (this is why
# softmax_w takes embed_size inputs).
lstm_cell = rnn.MultiRNNCell(
    [get_lstm_cell(config.embed_size) for _ in range(config.layers_n)],
    state_is_tuple=True
)

# Run RNN
logits = RNN(x, lstm_cell, embedding, softmax_w, softmax_b)

# Compute loss
# One loss value per position of the flattened batch. The weights are all ones,
# so padded positions (tag 0) count like any other position.
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
    [logits],
    [tf.reshape(y, [-1])],
    [tf.ones([tf.size(y)])]
)
# NOTE(review): the divisor is the configured batch size, not the number of rows
# actually fed, so the value is scaled differently for a shorter final batch or
# for a whole-set evaluation.
cost = tf.reduce_sum(loss) / config.batch_size

# Gradient clipping and train
# Each gradient element is clipped to [-grad_max, grad_max] (by value, not by norm).
optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
grads = optimizer.compute_gradients(cost)
clipped_grads = [(tf.clip_by_value(grad, -config.grad_max, config.grad_max), var) for grad, var in grads]
train_op = optimizer.apply_gradients(clipped_grads)

# Evaluate model
# pred_y is flat: one predicted tag index per row of logits. The accuracy is the
# mean over all fed positions, padding included.
pred_y = tf.to_int32(tf.argmax(logits, 1))
correct_pred = tf.equal(pred_y, tf.reshape(y, [-1]))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

In [38]:
# Main training pass: mini-batch training over the reshaped training rows,
# followed by one evaluation on the whole dev set.
training_iters = train_data.shape[0]

# The session is kept open on purpose: the fine-tuning and prediction cells
# below reuse it together with the trained weights.
sess = tf.Session()
sess.run(init)

start_t = time.time()
stop_early = False
for _ in range(config.epoch_n):
    # iter_i is the row offset of the current batch. Slicing stops at the end of
    # the array, so the last batch of an epoch may be shorter than batch_size.
    for iter_i in range(0, training_iters, config.batch_size):
        batch_x = train_data[iter_i:iter_i + config.batch_size]
        batch_y = train_label[iter_i:iter_i + config.batch_size]
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y})
        # Offsets advance in steps of batch_size, so this only fires regularly
        # when display_iter is a multiple of batch_size.
        if iter_i % config.display_iter == 0:
            # Stored as loss_val so that the graph tensor `loss` is not overwritten.
            acc, loss_val = sess.run([accuracy, cost], feed_dict={x: batch_x, y: batch_y})
            print('Iter %d, current loss: %.5f, training accuracy: %.5f' % (iter_i, loss_val, acc))
            if loss_val < config.loss_min:
                stop_early = True
                break
    # Early stopping ends the whole training run, not just the current epoch.
    if stop_early:
        break
print('Training complete, time used:', time.time() - start_t)

print('Dev Accuracy:', sess.run(accuracy, feed_dict={x: dev_data, y: dev_label}))

Iter 0, current loss: 86.23028, training accuracy: 0.04500
Iter 5000, current loss: 6.67756, training accuracy: 0.88000
Iter 10000, current loss: 7.87178, training accuracy: 0.87000
Iter 15000, current loss: 3.65875, training accuracy: 0.92750
Iter 20000, current loss: 3.87113, training accuracy: 0.91250
Iter 25000, current loss: 2.97841, training accuracy: 0.94000
Iter 30000, current loss: 4.14982, training accuracy: 0.93000
Iter 0, current loss: 4.73613, training accuracy: 0.92500
Iter 5000, current loss: 4.02439, training accuracy: 0.91750
Iter 10000, current loss: 6.48993, training accuracy: 0.89000
Iter 15000, current loss: 3.20460, training accuracy: 0.92500
Iter 20000, current loss: 3.22262, training accuracy: 0.91500
Iter 25000, current loss: 2.13039, training accuracy: 0.95000
Iter 30000, current loss: 3.00373, training accuracy: 0.94750
Training complete, time used: 410.21717596054077
Dev Accuracy: 0.905365


In [45]:
class FinetuneConfig(object):
    '''Loop settings of the fine-tuning pass; the model itself still uses MyConfig.'''
    display_iter = 300  # progress is printed when the row offset is a multiple of this
    epoch_n = 10        # one epoch means training through the whole fine-tuning set
    batch_size = 20     # rows per fine-tuning batch


finetune_config = FinetuneConfig()

In [46]:
# Fine-tuning pass: the same train_op is run on the fine-tuning rows.
# The session of the main training cell is reused as it is (no new tf.Session
# and no sess.run(init)), so training continues from the weights learned there.
training_iters = finetune_data.shape[0]

start_t = time.time()
for _ in range(finetune_config.epoch_n):
    iter_i = 0
    while iter_i < training_iters:
        # Slicing stops at the end of the array, so the last batch may be shorter.
        batch_end = iter_i + finetune_config.batch_size
        batch_x = finetune_data[iter_i:batch_end]
        batch_y = finetune_label[iter_i:batch_end]
        feed = {x: batch_x, y: batch_y}
        sess.run(train_op, feed_dict=feed)
        if iter_i % finetune_config.display_iter == 0:
            # Metrics are measured on the batch that was just trained on.
            acc, loss = sess.run([accuracy, cost], feed_dict=feed)
            print('Iter %d, current loss: %.5f, training accuracy: %.5f' % (iter_i, loss, acc))
        iter_i = batch_end
print('Training complete, time used:', time.time() - start_t)

Iter 0, current loss: 30.93182, training accuracy: 0.69750
Iter 300, current loss: 14.61668, training accuracy: 0.80250
Iter 600, current loss: 7.64191, training accuracy: 0.86250
Iter 0, current loss: 6.32613, training accuracy: 0.91000
Iter 300, current loss: 5.17655, training accuracy: 0.92250
Iter 600, current loss: 3.59878, training accuracy: 0.93750
Iter 0, current loss: 3.14150, training accuracy: 0.94750
Iter 300, current loss: 2.73355, training accuracy: 0.95750
Iter 600, current loss: 1.82287, training accuracy: 0.97000
Iter 0, current loss: 1.88675, training accuracy: 0.97500
Iter 300, current loss: 1.56020, training accuracy: 0.97750
Iter 600, current loss: 1.19642, training accuracy: 0.98000
Iter 0, current loss: 0.89757, training accuracy: 0.98500
Iter 300, current loss: 0.84321, training accuracy: 0.98750
Iter 600, current loss: 0.69333, training accuracy: 0.98750
Iter 0, current loss: 0.68134, training accuracy: 0.99750
Iter 300, current loss: 0.61351, training accuracy

In [88]:
def load_pos_train(data_path):
    '''
    Read a labeled UTF-8 file: one "word<TAB>label" line per token, sentences
    separated by an empty line. Whitespace-only lines and empty sentences
    (for example from extra blank lines, or an empty file) are skipped.

    Return
    data: list [word], all sentences concatenated
    data_len: list [int], number of tokens of each sentence, in file order
    label: list [label], parallel to data

    Raises
    IndexError if a non-blank line has no tab-separated second field
    '''
    data, data_len, label = [], [], []
    # Read inside a context manager so that the file handle is always closed.
    with open(data_path, encoding='utf-8') as infile:
        text = infile.read()
    for sentence in text.strip().split('\n\n'):
        rows = [row.strip().split('\t') for row in sentence.split('\n') if row.strip()]
        if not rows:
            continue
        data_len.append(len(rows))
        for fields in rows:
            data.append(fields[0])
            label.append(fields[1])
    return data, data_len, label

def load_pos_test(data_path):
    '''
    Read an unlabeled UTF-8 file: one token per line (only the first
    tab-separated field is used), sentences separated by an empty line.
    Whitespace-only lines and empty sentences (for example from extra blank
    lines, or an empty file) are skipped instead of becoming empty words.

    Return
    data: list [word], all sentences concatenated
    data_len: list [int], number of tokens of each sentence, in file order
    '''
    data, data_len = [], []
    # Read inside a context manager so that the file handle is always closed.
    with open(data_path, encoding='utf-8') as infile:
        text = infile.read()
    for sentence in text.strip().split('\n\n'):
        words = [row.strip().split('\t')[0] for row in sentence.split('\n') if row.strip()]
        if not words:
            continue
        data_len.append(len(words))
        data.extend(words)
    return data, data_len

In [93]:
# Tag the unlabeled test file with the current weights. The commented-out line
# is the labeled variant (dev file), which also yields test_label.
# test_words, test_data_len, test_label = load_pos_train('./data/dev/dev.txt')
test_words, test_data_len = load_pos_test('./data/test/test.nolabels.txt')
test_data = pad_reshape(np.asarray(list(map(word_to_idx, test_words))))
# pred_y is flat: one tag index per position of test_data, including the
# padded tail of the last row.
test_pred = sess.run(pred_y, feed_dict={x: test_data})
print(test_pred.shape)

(33360,)


In [98]:
def write_to_train(data_path, test_words, test_label, test_pred, test_data_len, tag_names=None):
    '''Write "word<TAB>gold label<TAB>predicted tag" lines, one sentence per block.

    Parameters
    data_path: output file, written as UTF-8 (the same encoding the loaders read)
    test_words: flat list of words
    test_label: flat list of gold labels, parallel to test_words
    test_pred: flat sequence of predicted tag indices; entries past the last
        word (padding) are ignored
    test_data_len: number of tokens of each sentence, in order
    tag_names: mapping tag index -> tag string; defaults to inv_tags_dict

    Every sentence is followed by one empty line.
    '''
    if tag_names is None:
        tag_names = inv_tags_dict
    count = 0  # position in the flat word/label/prediction sequences
    with open(data_path, 'w', encoding='utf-8') as ofile:
        for t_len in test_data_len:
            for _ in range(t_len):
                fields = (test_words[count], test_label[count], tag_names[test_pred[count]])
                ofile.write('\t'.join(fields) + '\n')
                count += 1
            ofile.write('\n')

def write_to_test(data_path, test_words, test_pred, test_data_len, tag_names=None):
    '''Write "word<TAB>predicted tag" lines, one sentence per block.

    Parameters
    data_path: output file, written as UTF-8 (the same encoding the loaders read)
    test_words: flat list of words
    test_pred: flat sequence of predicted tag indices; entries past the last
        word (padding) are ignored
    test_data_len: number of tokens of each sentence, in order
    tag_names: mapping tag index -> tag string; defaults to inv_tags_dict

    Every sentence is followed by one empty line.
    '''
    if tag_names is None:
        tag_names = inv_tags_dict
    count = 0  # position in the flat word/prediction sequences
    with open(data_path, 'w', encoding='utf-8') as ofile:
        for t_len in test_data_len:
            for _ in range(t_len):
                ofile.write(test_words[count] + '\t' + tag_names[test_pred[count]] + '\n')
                count += 1
            ofile.write('\n')

In [99]:
# Labeled variant (needs test_label from load_pos_train); writes word, gold label and prediction:
# write_to_train('./data/dev/dev_pos.txt', test_words, test_label, test_pred, test_data_len)
# Unlabeled test set: one "word<TAB>predicted tag" line per token.
write_to_test('./data/test/test_pos.txt', test_words, test_pred, test_data_len)