## Autoencoder Encoder-Decoder

In [1]:
import tensorflow as tf
import numpy as np


class EncoderDecoder:
    """LSTM encoder-decoder (sequence autoencoder) built as a TensorFlow 1.x graph.

    The encoder LSTM reads the one-hot encoded input sentence; its final state
    initialises the decoder LSTM, which reads the one-hot encoded decoder input.
    Decoder outputs are projected onto the vocabulary and trained to reproduce
    the encoder input (``sen_en``) with a length-masked cross-entropy loss.

    Constructing an instance builds the graph, opens a ``tf.Session`` and runs
    the variable initialiser, so the object is ready for ``train``/``predict``.
    """

    def __init__(self, vocabulary=None, state_size=64, n_max_length=30, n_training_batch=100):
        """
        Parameters
        ----------
        vocabulary: dict, token -> integer id; its size is the one-hot depth
            and the width of the output projection. Defaults to an empty dict.
        state_size: int, number of units of both LSTM cells
        n_max_length: int, fixed (padded) sequence length of every batch
        n_training_batch: int, stored on the instance; not used by the graph
        """
        self.state_size = state_size
        self.n_max_length = n_max_length
        self.n_training_batch = n_training_batch
        # A fresh dict per instance instead of a shared mutable default argument.
        self.vocabulary = {} if vocabulary is None else vocabulary

        ######################
        # Graph Construction #
        ######################
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sen_en = tf.placeholder(tf.int32, shape=(None, self.n_max_length), name="sen_en")
            self.sen_de = tf.placeholder(tf.int32, shape=(None, self.n_max_length), name="sen_de")
            self.sen_en_length = tf.placeholder(tf.int32, shape=(None,), name="sen_en_length")
            self.sen_de_length = tf.placeholder(tf.int32, shape=(None,), name="sen_de_length")

            batch_size = tf.shape(self.sen_en)[0]

            # TODO sen_en_embedding could also be self-trained embedding: embedding_lookup
            self.sen_en_embedding = tf.one_hot(self.sen_en, len(self.vocabulary))
            self.sen_de_embedding = tf.one_hot(self.sen_de, len(self.vocabulary))

            # build encoder decoder structure
            with tf.variable_scope("encoder"):
                self.cell_en = tf.contrib.rnn.BasicLSTMCell(self.state_size)
            with tf.variable_scope("decoder"):
                self.cell_de = tf.contrib.rnn.BasicLSTMCell(self.state_size)
            with tf.variable_scope("encoder"):
                self.cell_en_init = self.cell_en.zero_state(batch_size, tf.float32)
                self.h_state_en, self.final_state_en = tf.nn.dynamic_rnn(
                    self.cell_en,
                    self.sen_en_embedding,
                    sequence_length=self.sen_en_length,
                    initial_state=self.cell_en_init,
                )
            with tf.variable_scope("decoder"):
                # The decoder starts from the encoder's final state.
                self.cell_de_init = self.final_state_en
                self.h_state_de, self.final_state_de = tf.nn.dynamic_rnn(
                    self.cell_de,
                    self.sen_de_embedding,
                    sequence_length=self.sen_de_length,
                    initial_state=self.cell_de_init,
                )

            with tf.variable_scope("softmax"):
                W = tf.get_variable("W", [self.state_size, len(self.vocabulary)], initializer=tf.random_normal_initializer(seed=None))
                b = tf.get_variable("b", [len(self.vocabulary)], initializer=tf.random_normal_initializer(seed=None))
            # Flatten (batch, time) so one matmul projects every decoder output,
            # then restore the (batch, time, vocabulary) layout.
            self.logits = tf.reshape(
                tf.add(tf.matmul(tf.reshape(self.h_state_de, (-1, self.state_size)), W), b),
                shape=(-1, self.n_max_length, len(self.vocabulary))
            )
            self.prediction = tf.nn.softmax(self.logits)

            # construct loss and train op
            # The target is the encoder input itself (autoencoding).
            self.cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.sen_en,
                logits=self.logits
            )
            self.mask = tf.sequence_mask(self.sen_de_length, maxlen=self.n_max_length)
            # Zero the loss at padded steps, average over each sentence's
            # decoder length, then average over the batch.
            self.loss = tf.reduce_mean(
                tf.divide(
                    tf.reduce_sum(
                        tf.where(
                            self.mask,
                            self.cross_ent,
                            tf.zeros_like(self.cross_ent)
                        ), 1
                    ),
                    tf.to_float(self.sen_de_length)
                )
            )

            # Calculate and clip gradients (global norm capped at 1)
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            self.clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1)
            # Optimization
            optimizer = tf.train.AdamOptimizer()
            self.op_train = optimizer.apply_gradients(zip(self.clipped_gradients, params))

            # initializer
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
            self.sess = tf.Session(
                graph=self.graph,
                config=tf.ConfigProto(gpu_options=gpu_options)
            )
            self.init = tf.global_variables_initializer()
            self.sess.run(self.init)

    def train(self, batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length):
        """Run one optimisation step on a batch and return diagnostics.

        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_de: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        batch_sen_de_length: numpy, shape=(n,), dtype=int

        Returns
        -------
        tuple: (loss, prediction, sen_en_embedding, mask, cross_ent,
            clipped_gradients) as evaluated by the session for this batch
        """
        assert batch_sen_en.shape[0] == batch_sen_de.shape[0]
        _, loss, prediction, sen_en_embedding, mask, cross_ent, clipped_gradients = self.sess.run(
            [self.op_train, self.loss, self.prediction, self.sen_en_embedding, self.mask, self.cross_ent, self.clipped_gradients],
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_de: batch_sen_de,
                self.sen_en_length: batch_sen_en_length,
                self.sen_de_length: batch_sen_de_length,
            }
        )
        return loss, prediction, sen_en_embedding, mask, cross_ent, clipped_gradients

    def predict(self, batch_sen_en, batch_sen_de, batch_sen_en_length, batch_sen_de_length):
        """Evaluate loss and predictions on a batch without updating weights.

        Parameters
        ----------
        batch_sen_en: numpy, shape=(n, max_length), dtype=int
        batch_sen_de: numpy, shape=(n, max_length), dtype=int
        batch_sen_en_length: numpy, shape=(n,), dtype=int
        batch_sen_de_length: numpy, shape=(n,), dtype=int

        Returns
        -------
        tuple: (loss, prediction) as evaluated by the session for this batch
        """
        # Both inputs must have the same batch size (axis 0), as in train().
        assert batch_sen_en.shape[0] == batch_sen_de.shape[0]
        loss, prediction = self.sess.run(
            [self.loss, self.prediction],
            feed_dict={
                self.sen_en: batch_sen_en,
                self.sen_de: batch_sen_de,
                self.sen_en_length: batch_sen_en_length,
                self.sen_de_length: batch_sen_de_length,
            }
        )
        return loss, prediction


def evaluate(batch_sen_en, batch_sen_en_length, batch_prediction, vocabulary):
    """Score predictions against the reference sentences.

    Parameters
    ----------
    batch_sen_en: numpy, shape=(n, max_length), dtype=int
    batch_sen_en_length: numpy, shape=(n,), dtype=int
    batch_prediction: numpy, shape=(n, max_length, len(vocabulary))
    vocabulary: dict, must contain the end-of-sentence token "</s>"

    Returns
    -------
    tuple: (word accuracy over all unpadded positions,
        fraction of sentences whose first predicted "</s>" sits on a
        position where the reference token equals that prediction)
    """
    assert batch_sen_en.shape[0] == batch_prediction.shape[0]
    n_sentences = batch_sen_en.shape[0]
    word_hits = 0
    end_hits = 0
    for row in range(n_sentences):
        # Only the unpadded prefix of each sentence is scored.
        positions = np.arange(batch_sen_en_length[row])
        if positions.size == 0:
            continue
        end_id = vocabulary["</s>"]
        predicted = np.argmax(batch_prediction[row, positions], axis=-1)
        correct = predicted == batch_sen_en[row, positions]
        word_hits += int(np.count_nonzero(correct))
        # Only the first predicted end marker can earn the sentence-end point.
        end_positions = np.flatnonzero(predicted == end_id)
        if end_positions.size > 0 and correct[end_positions[0]]:
            end_hits += 1
    word_accuracy = 1. * word_hits / np.sum(batch_sen_en_length)
    end_accuracy = 1. * end_hits / n_sentences
    return word_accuracy, end_accuracy

In [3]:
import random
import numpy as np


def get_sequence(vocabulary, n, max_length):
    """Return a zero-padded int32 vector whose first ``n`` ids are random "a"/"b".

    Parameters
    ----------
    vocabulary: dict, must map the tokens "a" and "b" to integer ids
    n: int, number of random symbols to draw; must be smaller than max_length
    max_length: int, length of the returned vector

    Returns
    -------
    numpy, shape=(max_length,), dtype=int32; positions from ``n`` on stay 0
    """
    assert n < max_length
    sequence = np.zeros((max_length,), dtype=np.int32)
    drawn = (random.choice(["a", "b"]) for _ in range(n))
    for position, symbol in enumerate(drawn):
        sequence[position] = vocabulary[symbol]
    return sequence


# Hyperparameters and the toy vocabulary.
vocabulary = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "a": 4, "b": 5}
state_size = 64
n_max_length = 30
n_training_batch = 100

pretrained_lstm = EncoderDecoder(
    vocabulary=vocabulary,
    state_size=state_size,
    n_max_length=n_max_length,
    n_training_batch=n_training_batch,
)

for epoch in range(20):
    for n_batch in range(100):
        # Generate a fresh random training batch.
        batch_sen_en = np.zeros((n_training_batch, n_max_length), dtype=np.int32)
        batch_sen_de = np.zeros((n_training_batch, n_max_length), dtype=np.int32)
        batch_sen_en_length = np.zeros((n_training_batch,), dtype=np.int32)
        batch_sen_de_length = np.zeros((n_training_batch,), dtype=np.int32)
        for row in range(n_training_batch):
            # Sentence length varies between half and (almost) the full width,
            # always leaving room for the closing "</s>".
            n_symbols = random.randint(n_max_length // 2, n_max_length - 1)
            source = get_sequence(vocabulary, n_symbols, n_max_length)
            source[n_symbols] = vocabulary["</s>"]
            batch_sen_en[row, :] = source
            # Decoder input is the source shifted right by one, led by "<s>".
            batch_sen_de[row, 0] = vocabulary["<s>"]
            batch_sen_de[row, 1:n_symbols + 1] = source[:n_symbols]
            batch_sen_en_length[row] = n_symbols + 1
            batch_sen_de_length[row] = n_symbols + 1

        loss, predictions, sen_en_embedding, mask, cross_ent, clipped_gradients = pretrained_lstm.train(
            batch_sen_en,
            batch_sen_de,
            batch_sen_en_length,
            batch_sen_de_length,
        )
    # Report accuracy on the last training batch of the epoch.
    print(evaluate(batch_sen_en, batch_sen_en_length, predictions, vocabulary))

(0.64509306260575294, 0.41)
(0.70984235193864509, 0.74)
(0.71656600517687663, 0.83)
(0.77243589743589747, 0.95)
(0.77742375967228039, 0.99)
(0.75624999999999998, 0.85)
(0.77514273166447079, 1.0)
(0.79238601150951748, 0.99)
(0.74386569091691779, 1.0)
(0.80498100464330946, 1.0)
(0.81107771575783594, 1.0)
(0.81731601731601733, 1.0)
(0.81360309944037879, 0.94)
(0.83004818221638199, 1.0)
(0.83113456464379942, 0.73)
(0.83822884699693112, 1.0)
(0.82586644125105668, 0.99)
(0.83851725607158079, 1.0)
(0.84145824501907585, 0.98)
(0.86910327241818952, 1.0)


In [11]:
import numpy as np
# Element-wise division of two equal-length integer ranges.
a = np.arange(3)
b = np.arange(1, 4)
a / b

array([ 0.        ,  0.5       ,  0.66666667])