## Brown Corpus from nltk

In [1]:
from nltk.corpus import brown
import collections
import re

words = brown.words()  # only use 'news' for quick development purpose
sents = brown.sents()  # only use 'news' for quick development purpose
counter = collections.Counter()
for word in words:
    counter[word] += 1
print("unique word:", len(counter))
print("# of words:", len(words), "# of sents:", len(sents))
print("max len(sents[i]):", max([len(s) for s in sents]))
print("# of sents with length < 30:", len([len(s) for s in sents if len(s) < 30]))
print("sample sent:", sents[0])


preprocess_sents = []
corpus_counter = collections.Counter()
for sent in sents:
    tmp_sent = []
    for word in sent:
        if re.search('[a-zA-Z]', word):
            tmp_sent.append(word.lower())
            corpus_counter[word.lower()] += 1
    if len(tmp_sent) <= 28:
        preprocess_sents.append(tmp_sent)

print("===============after preprocessing===============")

print("unique word:", len(corpus_counter))
print("# of words:", sum([len(s) for s in preprocess_sents]), "# of sents:", len(preprocess_sents))
print("max len(sents[i]):", max([len(s) for s in preprocess_sents]))
print("sample sent:", preprocess_sents[0])

unique word: 56057
# of words: 1161192 # of sents: 57340
max len(sents[i]): 180
# of sents with length < 30: 45692
sample sent: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
unique word: 48052
# of words: 653927 # of sents: 48129
max len(sents[i]): 28
sample sent: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']


## Use Brown Corpus to train an seq2seq autoencoder 

In [2]:
import tensorflow as tf
import numpy as np
import random

random.seed(1337)

class EncoderDecoder:
    def __init__(self, vocabulary={}, state_size=64, n_max_length=30):     
        self.state_size = state_size
        self.n_max_length = n_max_length
        self.vocabulary = vocabulary
        self.reverse_vocabulary = {k: v for k, v in vocabulary.items()}
        
        ######################
        # Graph Construction #
        ######################
        self.graph = tf.Graph()
        with self.graph.as_default():
            #self.sen_en = tf.placeholder(tf.int32, shape=(None, self.n_max_length), name="sen_en")
            #self.sen_de = tf.placeholder(tf.int32, shape=(None, self.n_max_length), name="sen_de")
            self.sen_en = tf.placeholder(tf.int32, shape=(None, None), name="sen_en")
            self.sen_de = tf.placeholder(tf.int32, shape=(None, None), name="sen_de")
            self.sen_en_length = tf.placeholder(tf.int32, shape=(None,), name="sen_en_length")
            self.sen_de_length = tf.placeholder(tf.int32, shape=(None,), name="sen_de_length")
            
            batch_size_en = tf.shape(self.sen_en)[0]
            batch_size_de = tf.shape(self.sen_de)[0]
            batch_max_length_de = tf.shape(self.sen_de)[1]
            
            # TODO sen_en_embedding could also be self-trained embedding: embedding_lookup
            self.embedding = tf.Variable(tf.random_uniform([len(self.vocabulary), self.state_size], -1.0, 1.0), dtype=tf.float32)
            #self.sen_en_embedding = tf.one_hot(self.sen_en, len(self.vocabulary))
            #self.sen_de_embedding = tf.one_hot(self.sen_de, len(self.vocabulary))
            self.sen_en_embedding = tf.nn.embedding_lookup(self.embedding, self.sen_en)
            self.sen_de_embedding = tf.nn.embedding_lookup(self.embedding, self.sen_de)
            
            # build encoder decoder structure
            with tf.variable_scope("encoder") as scope:
                self.cell_en_fw = tf.contrib.rnn.BasicLSTMCell(self.state_size)
                self.cell_en_bw = tf.contrib.rnn.BasicLSTMCell(self.state_size)
            with tf.variable_scope("decoder") as scope:
                self.cell_de = tf.contrib.rnn.BasicLSTMCell(self.state_size*2)
            with tf.variable_scope("encoder") as scope:
                self.cell_en_fw_init = self.cell_en_fw.zero_state(batch_size_en, tf.float32)
                self.cell_en_bw_init = self.cell_en_bw.zero_state(batch_size_en, tf.float32)
                self.h_state_en, self.final_state_en = tf.nn.bidirectional_dynamic_rnn(
                    self.cell_en_fw,
                    self.cell_en_bw,
                    self.sen_en_embedding,
                    sequence_length=self.sen_en_length,
                    initial_state_fw=self.cell_en_fw_init,
                    initial_state_bw=self.cell_en_bw_init,      
                )
            with tf.variable_scope("decoder") as scope:
                self.cell_de_init = tf.contrib.rnn.LSTMStateTuple(
                    c=tf.concat([self.final_state_en[0].c, self.final_state_en[1].c], 1),
                    h=tf.concat([self.final_state_en[0].h, self.final_state_en[1].h], 1),
                )             
                self.h_state_de, self.final_state_de = tf.nn.dynamic_rnn(
                    self.cell_de,
                    self.sen_en_embedding,
                    sequence_length=self.sen_de_length,
                    initial_state=self.cell_de_init,
                )

            with tf.variable_scope("softmax") as scope:
                W = tf.get_variable("W", [self.state_size*2, len(self.vocabulary)], initializer=tf.random_normal_initializer(seed=None))
                b = tf.get_variable("b", [len(self.vocabulary)], initializer=tf.random_normal_initializer(seed=None))               
            self.logits = tf.reshape(
                tf.add(tf.matmul(tf.reshape(self.h_state_de, (-1, self.state_size*2)), W), b),
                shape=(-1, batch_max_length_de, len(self.vocabulary))
            )
            self.prediction = tf.nn.softmax(self.logits)
                
            # construct loss and train op
            self.cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.sen_de,
                logits=self.logits
            )        
            #self.mask = tf.sign(tf.reduce_max(tf.abs(self.sen_de_embedding), 2))
            self.mask = tf.sequence_mask(self.sen_de_length, maxlen=batch_max_length_de)
            self.loss = tf.reduce_mean(
                #tf.reduce_sum(tf.multiply(self.cross_ent, self.mask), 1) / tf.reduce_sum(self.mask, 1)
                tf.divide(
                    tf.reduce_sum(
                        tf.where(
                            self.mask,
                            self.cross_ent,
                            tf.zeros_like(self.cross_ent)
                        ), 1
                    ),
                    tf.to_float(self.sen_de_length)
                )
            )
            
            """
            optimizer = tf.train.AdamOptimizer()
            self.op_train = optimizer.minimize(self.loss)
            """
            # Calculate and clip gradients
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            self.clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1)
            # Optimization
            optimizer = tf.train.AdamOptimizer()
            self.op_train = optimizer.apply_gradients(zip(self.clipped_gradients, params))
            
            # initializer
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
            self.sess = tf.Session(
                graph=self.graph,
                config=tf.ConfigProto(gpu_options=gpu_options)
            )           
            self.init = tf.global_variables_initializer()
            self.sess.run(self.init)
            
    def train(self, batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length):
        """
        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_de: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        batch_sen_de_length: numpy, shape=(n,), dtype=int
        """
        assert batch_sen_en.shape[0] == batch_sen_de.shape[0]
        assert batch_sen_en.shape[1] == self.n_max_length  # training always input same length as self.n_max_length
        _, loss, prediction, sen_en_embedding, mask, cross_ent, clipped_gradients = self.sess.run(
            [self.op_train, self.loss, self.prediction, self.sen_en_embedding, self.mask, self.cross_ent, self.clipped_gradients],
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_de: batch_sen_de,
                self.sen_en_length: batch_sen_en_length,
                self.sen_de_length: batch_sen_de_length,
            }
        )
        return loss, prediction, sen_en_embedding, mask, cross_ent, clipped_gradients
        
    def predict(self, batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length):
        """
        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_de: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        batch_sen_de_length: numpy, shape=(n,), dtype=int
        """
        assert batch_sen_en.shape[0] == batch_sen_de.shape[0]
        loss, prediction = self.sess.run(
            [self.loss, self.prediction],
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_de: batch_sen_de,
                self.sen_en_length: batch_sen_en_length,
                self.sen_de_length: batch_sen_de_length,
            }
        )
        return loss, prediction
    
    def get_dqn_state(self, batch_sen_en, batch_sen_en_length, batch_cursor):
        """
        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        batch_cursor: numpy, shape=(n,), dtype=int
        Returns
        -------
        dqn_state: numpy, shape=(n, state_size*3)
        """
        batch_size = batch_sen_en.shape[0]
        batch_dqn_state = np.empty((batch_size, self.state_size*3), dtype=np.float32)
        batch_h_state_en, batch_sen_en_embedding = self.sess.run(
            [self.h_state_en, self.sen_en_embedding],
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_en_length: batch_sen_en_length
            }
        )
        for i in range(batch_size):
            batch_dqn_state[i,:self.state_size] = batch_h_state_en[0][i,batch_cursor[i]-1]
            batch_dqn_state[i,self.state_size:self.state_size*2] = batch_sen_en_embedding[i,batch_cursor[i]]
            batch_dqn_state[i,self.state_size*2:] = batch_h_state_en[1][i,batch_cursor[i]+1]
        return batch_dqn_state
    
    def encode(self, batch_sen_en, batch_sen_en_length):  # is wrong in nltk_brown_ae_bilstm version
        """
        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        Returns
        -------
        batch_state_en: LSTMStateTuple
        """
        batch_state_en = self.sess.run(
            self.cell_de_init,
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_en_length: batch_sen_en_length,
            }
        )
        return batch_state_en
    
    def decode(self, batch_state_en):  # is wrong in nltk_brown_ae_bilstm version
        """
        Parameters
        ----------
        batch_state_en: LSTMStateTuple
        Returns
        -------
        batch_sen_de: numpy, shape=(n, max_length), dtype=int
        """
        batch_size = batch_state_en.c.shape[0]
        batch_sen_de = np.empty([batch_size, self.n_max_length], dtype=np.int32)
        
        tmp_sen_en = np.empty([batch_size, 1], dtype=np.int32)
        tmp_sen_en_length = np.ones([batch_size], dtype=np.int32)
        tmp_sen_en[:] = self.vocabulary["<s>"]
        tmp_last_state = batch_state_en
        for i in range(self.n_max_length):
            tmp_predict, tmp_last_state = self.sess.run(
                [self.prediction, self.final_state_de],
                feed_dict={
                    self.cell_de_init: tmp_last_state,
                    self.sen_en: tmp_sen_en,
                    self.sen_de: tmp_sen_en,  # only need its batch_size
                    self.sen_de_length: tmp_sen_en_length,  # only need its length
                }
            )
            tmp_sen_en = np.argmax(tmp_predict, axis=2)
            batch_sen_de[:,i] = tmp_sen_en[:,0]
           
        return batch_sen_de

    
def evaluate(batch_sen_de, batch_sen_de_length, batch_prediction, vocabulary):
    """
    Parameters
    ----------
    batch_sen_de: numpy, shape=(n, max_length), dtype=int
    batch_sen_de_length: numpy, shape=(n,), dtype=int
    batch_prediction: numpy, shape=(n, max_length, len(vocabulary))
    """
    assert batch_sen_de.shape[0] == batch_prediction.shape[0]
    acc_word = 0
    acc_sen_end = 0
    for i in range(batch_sen_de.shape[0]):
        is_first_end = False
        for j in range(batch_sen_de_length[i]):
            cur_pred_word = np.argmax(batch_prediction[i, j])
            if cur_pred_word == batch_sen_de[i, j]:
                acc_word += 1
                if not is_first_end and cur_pred_word == vocabulary["</s>"]:
                    acc_sen_end += 1
            if cur_pred_word == vocabulary["</s>"]:
                is_first_end = True
    return 1. * acc_word / np.sum(batch_sen_de_length), 1. * acc_sen_end / batch_sen_de.shape[0]

In [3]:
def generate_data(corpus_sents, max_length, extend_vocabulary):
    sen_en = np.full((len(corpus_sents), max_length), extend_vocabulary["<pad>"], dtype=np.int32)
    sen_de = np.full((len(corpus_sents), max_length), extend_vocabulary["<pad>"], dtype=np.int32)
    sen_en_length = np.zeros((len(corpus_sents),), dtype=np.int32)
    sen_de_length = np.zeros((len(corpus_sents),), dtype=np.int32)

    def get_random_sequence(sent, max_length):
        x = np.full((max_length), extend_vocabulary["<pad>"], dtype=np.int32)
        for i, word in enumerate(sent):
            if word in extend_vocabulary:
                x[i] = extend_vocabulary[word]
            else:
                x[i] = extend_vocabulary["<unk>"]
        return x

    for i in range(len(corpus_sents)):
        l = len(corpus_sents[i])
        sen_en[i, :] = get_random_sequence(corpus_sents[i], max_length)
        sen_en[i,1:l+1] = sen_en[i,:l]
        sen_en[i,0] = extend_vocabulary["<s>"]
        sen_en[i, l+1] = extend_vocabulary["</s>"]
        sen_de[i, :max_length-1] = sen_en[i, 1:]
        sen_en_length[i] = l + 2
        sen_de_length[i] = l + 1
    
    return sen_en, sen_de, sen_en_length, sen_de_length

def get_total_accuracy(data_sen_en, data_sen_de, data_sen_en_length, data_sen_de_length, extend_vocabulary, pretrained_lstm):
    n_hit_word, n_hit_length = 0, 0
    n_total_word, n_total_length = 0, 0
    cur_idx = 0
    while cur_idx < data_sen_en.shape[0]:
        batch_sen_en = data_sen_en[cur_idx: cur_idx + n_batch_size]
        batch_sen_de = data_sen_de[cur_idx: cur_idx + n_batch_size]
        batch_sen_en_length = data_sen_en_length[cur_idx: cur_idx + n_batch_size]
        batch_sen_de_length = data_sen_de_length[cur_idx: cur_idx + n_batch_size]
        
        _, predictions = pretrained_lstm.predict(
            batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length
        )
        cur_idx += n_batch_size
        cur_acc_word, cur_acc_length = evaluate(batch_sen_de, batch_sen_de_length, predictions, extend_vocabulary)
        n_hit_word += cur_acc_word * np.sum(batch_sen_de_length)
        n_total_word += np.sum(batch_sen_de_length)
        n_hit_length += cur_acc_length * batch_sen_de.shape[0]
        n_total_length += batch_sen_de.shape[0]
    return 1. * n_hit_word / n_total_word, 1. * n_hit_length / n_total_length
    
# hyperparameter
vocabulary_size = 200
origin_vocabulary = {}
for word, n in corpus_counter.most_common(vocabulary_size):
    origin_vocabulary["{}".format(word)] = len(origin_vocabulary)
extend_vocabulary = dict(origin_vocabulary)
for w in ["<pad>", "<unk>", "<s>", "</s>"]:
    extend_vocabulary[w] = len(extend_vocabulary)
#vocabulary = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "a": 4, "b": 5}
state_size=64
n_max_length=30
n_batch_size=100

# generate training/testing data
n_train = int(len(preprocess_sents) * 0.8 * 0.8)
n_valid = int(len(preprocess_sents) * 0.8 * 0.2)
n_test = len(preprocess_sents) - n_train - n_valid
print("n_train", n_train, "n_valid", n_valid, "n_test", n_test)
train_sen_en, train_sen_de, train_sen_en_length, train_sen_de_length = generate_data(preprocess_sents[:n_train],
                                                                                     n_max_length, extend_vocabulary)
valid_sen_en, valid_sen_de, valid_sen_en_length, valid_sen_de_length = generate_data(preprocess_sents[n_train:n_train+n_valid],
                                                                                     n_max_length, extend_vocabulary)
test_sen_en, test_sen_de, test_sen_en_length, test_sen_de_length = generate_data(preprocess_sents[n_train+n_valid:],
                                                                                 n_max_length, extend_vocabulary)
reverse_extend_vocabulary = {v: k for k, v in extend_vocabulary.items()}
print(train_sen_en[0])
print(preprocess_sents[0])
print([reverse_extend_vocabulary[i] for i in train_sen_en[0]])
print(test_sen_en[0])
print(preprocess_sents[n_train+n_valid])
print([reverse_extend_vocabulary[i] for i in test_sen_en[0]])

pretrained_lstm = EncoderDecoder(vocabulary=extend_vocabulary, state_size=state_size, n_max_length=n_max_length)

for epoch in range(20):
    cur_idx = 0
    while cur_idx < train_sen_en.shape[0]:
        batch_sen_en = train_sen_en[cur_idx: cur_idx + n_batch_size]
        batch_sen_de = train_sen_de[cur_idx: cur_idx + n_batch_size]
        batch_sen_en_length = train_sen_en_length[cur_idx: cur_idx + n_batch_size]
        batch_sen_de_length = train_sen_de_length[cur_idx: cur_idx + n_batch_size]
        
        loss, predictions, sen_en_embedding, mask, cross_ent, clipped_gradients = pretrained_lstm.train(
            batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length
        )
        cur_idx += n_batch_size
    print("epoch", epoch, "valid", get_total_accuracy(
        valid_sen_en, valid_sen_de, valid_sen_en_length, valid_sen_de_length, extend_vocabulary, pretrained_lstm
    )) 
    print("last loss", loss)
print("train", get_total_accuracy(
    train_sen_en, train_sen_de, train_sen_en_length, train_sen_de_length, extend_vocabulary, pretrained_lstm
))
print("test", get_total_accuracy(
    test_sen_en, test_sen_de, test_sen_en_length, test_sen_de_length, extend_vocabulary, pretrained_lstm
)) 

n_train 30802 n_valid 7700 n_test 9627
[202   0 201 201 201 201  52 201  28 201   1 201 201 201 201 201  49 201
   6  73 201 201 159 203 200 200 200 200 200 200]
['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']
['<s>', 'the', '<unk>', '<unk>', '<unk>', '<unk>', 'said', '<unk>', 'an', '<unk>', 'of', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'no', '<unk>', 'that', 'any', '<unk>', '<unk>', 'place', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[202 201 129 201 203 200 200 200 200 200 200 200 200 200 200 200 200 200
 200 200 200 200 200 200 200 200 200 200 200 200]
['ekstrohm', 'never', 'slept']
['<s>', '<unk>', 'never', '<unk>', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',

In [4]:
my_sen_en = np.full((1,30), extend_vocabulary["<pad>"], dtype=np.int32)
my_sen_en[0,0] = extend_vocabulary["<s>"]
my_sen_en[0,1] = extend_vocabulary["i"]
my_sen_en[0,2] = extend_vocabulary["like"]
my_sen_en[0,3] = extend_vocabulary["you"]
my_sen_en[0,4] = extend_vocabulary["</s>"]
my_sen_en_length = np.empty((1,), dtype=np.int32)
my_sen_de_length = np.empty((1,), dtype=np.int32)
my_sen_en_length[0] = 5
my_sen_de_length[0] = 4

my_state_en = pretrained_lstm.encode(my_sen_en, my_sen_en_length)
print(my_state_en.c.shape)
print("encode", my_sen_en)
print("decode", pretrained_lstm.decode(my_state_en))  # exposure bias
print("HAHA")
my_loss, my_prediction = pretrained_lstm.predict(my_sen_en, my_sen_en, my_sen_en_length, my_sen_de_length)
print("my_loss", my_loss, "my_prediction", my_prediction.shape)
print(np.argmax(my_prediction[0,0]))
print(np.argmax(my_prediction[0,1]))
print(np.argmax(my_prediction[0,2]))
print(np.argmax(my_prediction[0,3]))
print(np.argmax(my_prediction[0,4]))

(1, 128)
encode [[202  19  77  32 203 200 200 200 200 200 200 200 200 200 200 200 200 200
  200 200 200 200 200 200 200 200 200 200 200 200]]
decode [[ 19  77  32 203 203  20 203 192  20   7 201 170 203  12   4 201 203  20
  140  82 107  71 203  12  14 201   2 201 203  60]]
HAHA
my_loss 17.6518 my_prediction (1, 30, 204)
19
77
32
203
107


In [5]:
print("batch_sen_en.shape", batch_sen_en.shape)
print("batch_sen_en_length.shape", batch_sen_en_length.shape)
batch_cursor = np.array([2, 2], dtype=np.int32)
batch_dqn_state = pretrained_lstm.get_dqn_state(batch_sen_en, batch_sen_en_length, batch_cursor)
print("batch_dqn_state.shape", batch_dqn_state.shape)


batch_sen_en.shape (2, 30)
batch_sen_en_length.shape (2,)
batch_dqn_state.shape (2, 192)
