## Brown Corpus from nltk

In [4]:
from nltk.corpus import brown
import collections
import re

# Load the Brown corpus. No `categories=` filter is passed, so this is the
# whole corpus rather than a single category such as 'news' (the recorded
# output of this cell reports 1,161,192 tokens in 57,340 sentences).
words = brown.words()
sents = brown.sents()

# Raw, case-sensitive token frequencies; punctuation tokens are still present
# at this stage (see the sample sentence printed below).
counter = collections.Counter(words)

# Sentence lengths are needed twice, so compute them in a single pass.
sent_lengths = [len(s) for s in sents]

print("unique word:", len(counter))
print("# of words:", len(words), "# of sents:", len(sents))
print("max len(sents[i]):", max(sent_lengths))
print("# of sents with length < 30:", sum(1 for n_tokens in sent_lengths if n_tokens < 30))
print("sample sent:", sents[0])


# Preprocessing:
#   * drop every token that contains no ASCII letter (punctuation, numbers),
#   * lower-case what remains,
#   * keep a sentence only if it has at most 28 remaining tokens.
# Note that `corpus_counter` counts the tokens of *every* sentence, including
# the ones that are rejected for being too long.
has_ascii_letter = re.compile('[a-zA-Z]')
preprocess_sents = []
corpus_counter = collections.Counter()
for sent in sents:
    kept_tokens = [token.lower() for token in sent if has_ascii_letter.search(token)]
    corpus_counter.update(kept_tokens)
    if len(kept_tokens) <= 28:
        preprocess_sents.append(kept_tokens)

print("===============after preprocessing===============")

kept_lengths = [len(s) for s in preprocess_sents]
print("unique word:", len(corpus_counter))
print("# of words:", sum(kept_lengths), "# of sents:", len(preprocess_sents))
print("max len(sents[i]):", max(kept_lengths))
print("sample sent:", preprocess_sents[0])

unique word: 56057
# of words: 1161192 # of sents: 57340
max len(sents[i]): 180
# of sents with length < 30: 45692
sample sent: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
unique word: 48052
# of words: 653927 # of sents: 48129
max len(sents[i]): 28
sample sent: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']


## Use Brown Corpus to train a seq2seq autoencoder

In [5]:
import tensorflow as tf
import numpy as np
import random

# Seed Python's built-in `random` module for reproducibility.
# NOTE(review): nothing visible here seeds NumPy or TensorFlow (the softmax
# initializers in EncoderDecoder pass seed=None), so weight initialisation is
# presumably still non-deterministic -- confirm.
random.seed(1337)

class EncoderDecoder:
    """LSTM encoder-decoder (seq2seq) autoencoder on the TensorFlow 1.x graph API.

    The constructor builds a private ``tf.Graph``, opens a ``tf.Session`` on
    it and runs the variable initializer, so an instance can be passed
    straight to :meth:`train` / :meth:`predict`.

    Graph layout, as wired in ``__init__``:

    * one trainable embedding table of shape ``(len(vocabulary), state_size)``
      used for both the encoder input ids and the decoder input ids;
    * an encoder ``BasicLSTMCell`` unrolled with ``tf.nn.dynamic_rnn``;
    * a decoder ``BasicLSTMCell`` whose initial state is the encoder's final
      state and whose inputs are the embedded ``sen_de`` ids;
    * a dense projection ``W, b`` from decoder outputs to vocabulary logits.

    Loss: sparse softmax cross-entropy between the logits and ``sen_en``
    (the encoder ids double as the reconstruction target), masked with
    ``sen_de_length``, summed per sentence, divided by ``sen_de_length`` and
    averaged over the batch. Gradients are clipped to a global norm of 1 and
    applied with Adam.

    Parameters
    ----------
    vocabulary: dict or None
        Token-to-id mapping. Only its size is used by this class (it fixes
        the embedding and softmax dimensions). ``None`` means an empty dict.
    state_size: int
        Width of the embeddings and of both LSTM cells.
    n_max_length: int
        Fixed (padded) number of time steps of every input matrix.
    """

    def __init__(self, vocabulary=None, state_size=64, n_max_length=30):
        # Use None as the default instead of a literal `{}`: a mutable default
        # is created once and would be shared by every instance.
        if vocabulary is None:
            vocabulary = {}
        self.state_size = state_size
        self.n_max_length = n_max_length
        self.vocabulary = vocabulary

        ######################
        # Graph Construction #
        ######################
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Padded id matrices and the number of valid steps per row.
            self.sen_en = tf.placeholder(tf.int32, shape=(None, self.n_max_length), name="sen_en")
            self.sen_de = tf.placeholder(tf.int32, shape=(None, self.n_max_length), name="sen_de")
            self.sen_en_length = tf.placeholder(tf.int32, shape=(None,), name="sen_en_length")
            self.sen_de_length = tf.placeholder(tf.int32, shape=(None,), name="sen_de_length")

            # Dynamic batch size, needed to build the encoder's zero state.
            batch_size = tf.shape(self.sen_en)[0]

            # Trainable embedding table shared by encoder and decoder inputs.
            self.embedding = tf.Variable(tf.random_uniform([len(self.vocabulary), self.state_size], -1.0, 1.0), dtype=tf.float32)
            self.sen_en_embedding = tf.nn.embedding_lookup(self.embedding, self.sen_en)
            self.sen_de_embedding = tf.nn.embedding_lookup(self.embedding, self.sen_de)

            # build encoder decoder structure
            with tf.variable_scope("encoder"):
                self.cell_en = tf.contrib.rnn.BasicLSTMCell(self.state_size)
            with tf.variable_scope("decoder"):
                self.cell_de = tf.contrib.rnn.BasicLSTMCell(self.state_size)
            with tf.variable_scope("encoder"):
                self.cell_en_init = self.cell_en.zero_state(batch_size, tf.float32)
                self.h_state_en, self.final_state_en = tf.nn.dynamic_rnn(
                    self.cell_en,
                    self.sen_en_embedding,
                    sequence_length=self.sen_en_length,
                    initial_state=self.cell_en_init,
                )
            with tf.variable_scope("decoder"):
                # The decoder starts from the encoder's final state: that
                # state is the only path from the input to the reconstruction.
                self.cell_de_init = self.final_state_en
                self.h_state_de, self.final_state_de = tf.nn.dynamic_rnn(
                    self.cell_de,
                    self.sen_de_embedding,
                    sequence_length=self.sen_de_length,
                    initial_state=self.cell_de_init,
                )

            with tf.variable_scope("softmax"):
                W = tf.get_variable("W", [self.state_size, len(self.vocabulary)], initializer=tf.random_normal_initializer(seed=None))
                b = tf.get_variable("b", [len(self.vocabulary)], initializer=tf.random_normal_initializer(seed=None))
            # Flatten (batch, time) so one matmul projects every step, then
            # restore the (batch, n_max_length, vocabulary) layout.
            self.logits = tf.reshape(
                tf.add(tf.matmul(tf.reshape(self.h_state_de, (-1, self.state_size)), W), b),
                shape=(-1, self.n_max_length, len(self.vocabulary))
            )
            self.prediction = tf.nn.softmax(self.logits)

            # construct loss and train op
            # The target of the autoencoder is the encoder input itself.
            self.cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.sen_en,
                logits=self.logits
            )
            # True for the first sen_de_length[i] steps of row i, False on padding.
            self.mask = tf.sequence_mask(self.sen_de_length, maxlen=self.n_max_length)
            # Per-sentence mean cross-entropy over valid steps, averaged over the batch.
            self.loss = tf.reduce_mean(
                tf.divide(
                    tf.reduce_sum(
                        tf.where(
                            self.mask,
                            self.cross_ent,
                            tf.zeros_like(self.cross_ent)
                        ), 1
                    ),
                    tf.to_float(self.sen_de_length)
                )
            )

            # Calculate and clip gradients (global norm <= 1) instead of
            # calling optimizer.minimize() directly.
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            self.clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1)
            # Optimization
            optimizer = tf.train.AdamOptimizer()
            self.op_train = optimizer.apply_gradients(zip(self.clipped_gradients, params))

            # initializer
            # per_process_gpu_memory_fraction=0.1 limits the share of GPU
            # memory this session may allocate.
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
            self.sess = tf.Session(
                graph=self.graph,
                config=tf.ConfigProto(gpu_options=gpu_options)
            )
            self.init = tf.global_variables_initializer()
            self.sess.run(self.init)

    def train(self, batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length):
        """Run one optimisation step on a batch and return diagnostics.

        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_de: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        batch_sen_de_length: numpy, shape=(n,), dtype=int

        Returns
        -------
        tuple
            ``(loss, prediction, sen_en_embedding, mask, cross_ent,
            clipped_gradients)`` as evaluated by the session for this batch.
        """
        assert batch_sen_en.shape[0] == batch_sen_de.shape[0]
        _, loss, prediction, sen_en_embedding, mask, cross_ent, clipped_gradients = self.sess.run(
            [self.op_train, self.loss, self.prediction, self.sen_en_embedding, self.mask, self.cross_ent, self.clipped_gradients],
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_de: batch_sen_de,
                self.sen_en_length: batch_sen_en_length,
                self.sen_de_length: batch_sen_de_length,
            }
        )
        return loss, prediction, sen_en_embedding, mask, cross_ent, clipped_gradients

    def predict(self, batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length):
        """Evaluate loss and softmax output on a batch without updating weights.

        The decoder is still fed ``batch_sen_de`` as its input sequence, so
        the caller must supply it here as well.

        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_de: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        batch_sen_de_length: numpy, shape=(n,), dtype=int

        Returns
        -------
        tuple
            ``(loss, prediction)`` where ``prediction`` has shape
            ``(n, max_length, len(vocabulary))``.
        """
        assert batch_sen_en.shape[0] == batch_sen_de.shape[0]
        loss, prediction = self.sess.run(
            [self.loss, self.prediction],
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_de: batch_sen_de,
                self.sen_en_length: batch_sen_en_length,
                self.sen_de_length: batch_sen_de_length,
            }
        )
        return loss, prediction

    
def evaluate(batch_sen_en, batch_sen_en_length, batch_prediction, vocabulary):
    """Score arg-max predictions against the target ids of one batch.

    Only the first ``batch_sen_en_length[i]`` positions of row ``i`` are
    scored. A sentence end counts as correct when the *first* ``</s>`` the
    model predicts within those positions coincides with the target.

    Parameters
    ----------
    batch_sen_en: numpy, shape=(n, max_length), dtype=int
    batch_sen_en_length: numpy, shape=(n,), dtype=int
    batch_prediction: numpy, shape=(n, max_length, len(vocabulary))
    vocabulary: dict containing the key "</s>"

    Returns
    -------
    tuple
        ``(word_accuracy, sentence_end_accuracy)``: correct positions divided
        by the total number of scored positions, and correctly placed
        sentence ends divided by the number of rows.
    """
    assert batch_sen_en.shape[0] == batch_prediction.shape[0]
    n_rows = batch_sen_en.shape[0]
    eos_id = vocabulary["</s>"]
    word_hits = 0
    end_hits = 0
    for row in range(n_rows):
        eos_seen = False
        for pos in range(batch_sen_en_length[row]):
            guess = np.argmax(batch_prediction[row, pos])
            guessed_eos = guess == eos_id
            if guess == batch_sen_en[row, pos]:
                word_hits += 1
                # Only the first predicted </s> may earn the end-of-sentence point.
                if guessed_eos and not eos_seen:
                    end_hits += 1
            if guessed_eos:
                eos_seen = True
    word_accuracy = 1. * word_hits / np.sum(batch_sen_en_length)
    end_accuracy = 1. * end_hits / n_rows
    return word_accuracy, end_accuracy

In [6]:
def generate_data(corpus_sents, max_length, extend_vocabulary):
    """Encode tokenised sentences as padded id matrices for the autoencoder.

    For a sentence ``w_1 .. w_k`` the rows are::

        sen_en: w_1 .. w_k </s> <pad> ...
        sen_de: <s> w_1 .. w_k  <pad> ...

    Tokens missing from ``extend_vocabulary`` are mapped to ``<unk>``.

    Parameters
    ----------
    corpus_sents: sequence of token lists
    max_length: int, padded row width; every sentence must be strictly
        shorter than this so the ``</s>`` / ``<s>`` marker still fits
    extend_vocabulary: dict mapping tokens to ids; must contain "<pad>",
        "<s>" and "</s>", and "<unk>" if any token is out of vocabulary

    Returns
    -------
    tuple
        ``(sen_en, sen_de, sen_en_length, sen_de_length)``; the matrices have
        shape ``(len(corpus_sents), max_length)`` and dtype int32, and both
        length vectors hold ``len(sentence) + 1``.

    Raises
    ------
    IndexError
        If a sentence has ``max_length`` or more tokens.
    """
    pad_id = extend_vocabulary["<pad>"]
    n_sents = len(corpus_sents)
    sen_en = np.full((n_sents, max_length), pad_id, dtype=np.int32)
    sen_de = np.full((n_sents, max_length), pad_id, dtype=np.int32)
    sen_en_length = np.zeros((n_sents,), dtype=np.int32)
    sen_de_length = np.zeros((n_sents,), dtype=np.int32)

    for row, sent in enumerate(corpus_sents):
        n_tokens = len(sent)
        # Fail up front with an explanation instead of an opaque
        # out-of-bounds error half-way through filling the row.
        if n_tokens >= max_length:
            raise IndexError(
                "sentence {} has {} tokens; at most {} fit next to the "
                "boundary marker when max_length={}".format(
                    row, n_tokens, max_length - 1, max_length
                )
            )
        token_ids = np.array(
            [
                extend_vocabulary[word] if word in extend_vocabulary
                else extend_vocabulary["<unk>"]
                for word in sent
            ],
            dtype=np.int32,
        )
        sen_en[row, :n_tokens] = token_ids
        sen_en[row, n_tokens] = extend_vocabulary["</s>"]
        sen_de[row, 0] = extend_vocabulary["<s>"]
        sen_de[row, 1:n_tokens + 1] = token_ids
        # +1 accounts for the </s> (encoder) and <s> (decoder) marker.
        sen_en_length[row] = n_tokens + 1
        sen_de_length[row] = n_tokens + 1

    return sen_en, sen_de, sen_en_length, sen_de_length

def get_total_accuracy(data_sen_en, data_sen_de, data_sen_en_length, data_sen_de_length, extend_vocabulary, pretrained_lstm, batch_size=None):
    """Compute word and sentence-end accuracy over a whole data set.

    The data is fed to ``pretrained_lstm.predict`` in consecutive batches and
    every batch is scored with ``evaluate``; the per-batch accuracies are
    turned back into hit counts so the final ratios are weighted by the
    number of words / sentences in each batch.

    Parameters
    ----------
    data_sen_en: numpy, shape=(n, max_length), dtype=int
    data_sen_de: numpy, shape=(n, max_length), dtype=int
    data_sen_en_length: numpy, shape=(n,), dtype=int
    data_sen_de_length: numpy, shape=(n,), dtype=int
    extend_vocabulary: dict, passed on to ``evaluate``
    pretrained_lstm: object exposing ``predict(sen_en, sen_de, en_len, de_len)``
        that returns ``(loss, predictions)``
    batch_size: int or None
        Rows per ``predict`` call. ``None`` (the default) falls back to the
        module-level ``n_batch_size`` so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(word_accuracy, sentence_end_accuracy)`` over all rows.

    Raises
    ------
    ValueError
        If the batch size is not positive.
    ZeroDivisionError
        If the data set has no rows.
    """
    if batch_size is None:
        batch_size = n_batch_size
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be positive, got {}".format(batch_size))

    n_hit_word, n_hit_length = 0, 0
    n_total_word, n_total_length = 0, 0
    for start in range(0, data_sen_en.shape[0], batch_size):
        stop = start + batch_size
        batch_sen_en = data_sen_en[start:stop]
        batch_sen_de = data_sen_de[start:stop]
        batch_sen_en_length = data_sen_en_length[start:stop]
        batch_sen_de_length = data_sen_de_length[start:stop]

        _, predictions = pretrained_lstm.predict(
            batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length
        )
        cur_acc_word, cur_acc_length = evaluate(batch_sen_en, batch_sen_en_length, predictions, extend_vocabulary)

        # Convert the batch ratios back to counts before accumulating; the
        # last batch may be smaller than the others.
        n_words = np.sum(batch_sen_en_length)
        n_rows = batch_sen_en.shape[0]
        n_hit_word += cur_acc_word * n_words
        n_total_word += n_words
        n_hit_length += cur_acc_length * n_rows
        n_total_length += n_rows
    return 1. * n_hit_word / n_total_word, 1. * n_hit_length / n_total_length
    
# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
vocabulary_size = 200
state_size = 64
n_max_length = 30
n_batch_size = 100

# Vocabulary: the `vocabulary_size` most frequent corpus words receive ids in
# frequency order; the four special tokens are appended after them.
origin_vocabulary = {
    "{}".format(word): rank
    for rank, (word, n) in enumerate(corpus_counter.most_common(vocabulary_size))
}
extend_vocabulary = dict(origin_vocabulary)
for special_token in ("<pad>", "<unk>", "<s>", "</s>"):
    extend_vocabulary[special_token] = len(extend_vocabulary)
reverse_extend_vocabulary = {idx: token for token, idx in extend_vocabulary.items()}

# ---------------------------------------------------------------------------
# Train / validation / test split (contiguous slices, in corpus order)
# ---------------------------------------------------------------------------
n_train = int(len(preprocess_sents) * 0.8 * 0.8)
n_valid = int(len(preprocess_sents) * 0.8 * 0.2)
n_test = len(preprocess_sents) - n_train - n_valid
print("n_train", n_train, "n_valid", n_valid, "n_test", n_test)

train_sen_en, train_sen_de, train_sen_en_length, train_sen_de_length = generate_data(
    preprocess_sents[:n_train], n_max_length, extend_vocabulary
)
valid_sen_en, valid_sen_de, valid_sen_en_length, valid_sen_de_length = generate_data(
    preprocess_sents[n_train:n_train+n_valid], n_max_length, extend_vocabulary
)
test_sen_en, test_sen_de, test_sen_en_length, test_sen_de_length = generate_data(
    preprocess_sents[n_train+n_valid:], n_max_length, extend_vocabulary
)

# Sanity check: first training and first test sentence as ids, as raw tokens
# and decoded back through the vocabulary.
print(train_sen_en[0])
print(preprocess_sents[0])
print([reverse_extend_vocabulary[i] for i in train_sen_en[0]])
print(test_sen_en[0])
print(preprocess_sents[n_train+n_valid])
print([reverse_extend_vocabulary[i] for i in test_sen_en[0]])

# ---------------------------------------------------------------------------
# Training: 20 passes over the training set in fixed-order mini-batches,
# reporting validation accuracy after every pass.
# ---------------------------------------------------------------------------
pretrained_lstm = EncoderDecoder(vocabulary=extend_vocabulary, state_size=state_size, n_max_length=n_max_length)

for epoch in range(20):
    for cur_idx in range(0, train_sen_en.shape[0], n_batch_size):
        batch_rows = slice(cur_idx, cur_idx + n_batch_size)
        batch_sen_en = train_sen_en[batch_rows]
        batch_sen_de = train_sen_de[batch_rows]
        batch_sen_en_length = train_sen_en_length[batch_rows]
        batch_sen_de_length = train_sen_de_length[batch_rows]

        loss, predictions, sen_en_embedding, mask, cross_ent, clipped_gradients = pretrained_lstm.train(
            batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length
        )
    valid_accuracy = get_total_accuracy(
        valid_sen_en, valid_sen_de, valid_sen_en_length, valid_sen_de_length, extend_vocabulary, pretrained_lstm
    )
    print("epoch", epoch, "valid", valid_accuracy)

# Final accuracy on the training and held-out test sets.
train_accuracy = get_total_accuracy(
    train_sen_en, train_sen_de, train_sen_en_length, train_sen_de_length, extend_vocabulary, pretrained_lstm
)
print("train", train_accuracy)
test_accuracy = get_total_accuracy(
    test_sen_en, test_sen_de, test_sen_en_length, test_sen_de_length, extend_vocabulary, pretrained_lstm
)
print("test", test_accuracy)

n_train 30802 n_valid 7700 n_test 9627
[  0 201 201 201 201  52 201  28 201   1 201 201 201 201 201  49 201   6
  73 201 201 159 203 200 200 200 200 200 200 200]
['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']
['the', '<unk>', '<unk>', '<unk>', '<unk>', 'said', '<unk>', 'an', '<unk>', 'of', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'no', '<unk>', 'that', 'any', '<unk>', '<unk>', 'place', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[201 129 201 203 200 200 200 200 200 200 200 200 200 200 200 200 200 200
 200 200 200 200 200 200 200 200 200 200 200 200]
['ekstrohm', 'never', 'slept']
['<unk>', 'never', '<unk>', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa