In [1]:
from bunch import Bunch
from attn_gru_cell import AttentionGRUCell
from copy import deepcopy

import tensorflow as tf
import numpy as np
import json, pprint

In [2]:
# Hyper-parameters, read through the module-level `args` object by the data
# loader and the model-building functions in this file.
args = Bunch({
    'n_epochs': 10,       # epochs fed by input_fn when training
    'batch_size': 64,     # batch size used by input_fn
    'embed_dim': 80,      # width of the word-embedding table
    'hidden_size': 80,    # default GRU size and Dense projection width
    'dropout_rate': 0.1,  # rate passed to tf.layers.dropout
    'n_hops': 2,          # number of memory episodes in memory_module
    'clip_norm': 5.0,     # global-norm gradient clipping threshold in model_fn
})

In [3]:
class BaseDataLoader(object):
    """Holds encoded data, vocabulary and size parameters for the model.

    Every slot starts out as ``None``.  ``input_fn`` additionally reads
    ``self.is_training``, which this class does not set itself.
    """

    def __init__(self):
        value_keys = ('inputs', 'questions', 'answers')
        length_keys = ('inputs_len', 'inputs_sent_len',
                       'questions_len', 'answers_len')
        param_keys = ('vocab_size', '<start>', '<end>',
                      'max_input_len', 'max_sent_len',
                      'max_quest_len', 'max_answer_len')

        # dict.fromkeys fills every key with None, in the order given.
        self.data = {
            'size': None,
            'val': dict.fromkeys(value_keys),
            'len': dict.fromkeys(length_keys),
        }
        self.vocab = dict.fromkeys(('size', 'word2idx', 'idx2word'))
        self.params = dict.fromkeys(param_keys)

    def input_fn(self):
        """Return a numpy input function over the stored arrays.

        Labels, multiple epochs and shuffling are only enabled when
        ``self.is_training`` is truthy.
        """
        training = self.is_training
        values = self.data['val']
        lengths = self.data['len']

        feature_arrays = {
            'inputs': values['inputs'],
            'questions': values['questions'],
            'inputs_len': lengths['inputs_len'],
            'inputs_sent_len': lengths['inputs_sent_len'],
            'questions_len': lengths['questions_len'],
            'answers_len': lengths['answers_len'],
        }
        label_array = values['answers'] if training else None
        epochs = args.n_epochs if training else 1

        return tf.estimator.inputs.numpy_input_fn(
            x=feature_arrays,
            y=label_array,
            batch_size=args.batch_size,
            num_epochs=epochs,
            shuffle=training)


class DataLoader(BaseDataLoader):
    """bAbI data set encoded as zero-padded index arrays.

    In training mode the vocabulary is built from the file.  Otherwise the
    supplied ``vocab`` is reused, ``params`` is deep-copied and the raw
    (un-encoded) token data is kept in ``self.demo``.
    """

    def __init__(self, path, is_training, vocab=None, params=None):
        """Load ``path`` and populate ``data``, ``vocab`` and ``params``.

        Args:
            path: bAbI text file to read.
            is_training: build a fresh vocabulary when true; reuse ``vocab``
                and ``params`` when false.
            vocab: vocabulary dict of a training loader (non-training only).
            params: params dict of a training loader (non-training only).
        """
        super().__init__()
        data, lens = self.load_data(path)
        if is_training:
            self.build_vocab(data)
        else:
            self.demo = data
            self.vocab = vocab
            self.params = deepcopy(params)
        self.padding(data, lens)
        self.is_training = is_training

    def load_data(self, path):
        """Parse ``path`` with ``bAbI_data_load`` and record the sample count."""
        data, lens = bAbI_data_load(path)
        self.data['size'] = len(data[0])
        return data, lens

    def build_vocab(self, data):
        """Build word<->index tables from every token in ``data``.

        The four signal tokens take indices 0-3.  Remaining words are sorted
        so the mapping is identical from run to run; iterating a bare set of
        strings is subject to hash randomisation.
        """
        signals = ['<pad>', '<unk>', '<start>', '<end>']
        inputs, questions, answers = data

        seen = set()
        for facts in inputs:
            for fact in facts:
                seen.update(fact)
        for token_list in questions:
            seen.update(token_list)
        for token_list in answers:
            seen.update(token_list)

        # Drop signal tokens so a corpus word that collides with one cannot
        # inflate the vocabulary size beyond the number of distinct indices.
        words = sorted(seen.difference(signals))

        word2idx = {word: idx for idx, word in enumerate(signals + words)}
        self.vocab['word2idx'] = word2idx
        self.vocab['idx2word'] = {idx: word for word, idx in word2idx.items()}
        self.vocab['size'] = len(word2idx)

        self.params['vocab_size'] = len(word2idx)
        self.params['<start>'] = word2idx['<start>']
        self.params['<end>'] = word2idx['<end>']

    def padding(self, data, lens):
        """Encode tokens as indices and zero-pad everything to this file's maxima.

        Neither ``data`` nor ``lens`` is modified; new lists are built instead.

        NOTE(review): for a non-training loader this overwrites the ``max_*``
        values copied from the training params with the maxima of this file,
        while ``main`` builds the model from the training loader's params.
        Confirm both files share the same maxima.
        """
        inputs_len, inputs_sent_len, questions_len, answers_len = lens

        max_input = max(inputs_len)
        max_sent = max(n for story in inputs_sent_len for n in story)
        max_quest = max(questions_len)
        max_answer = max(answers_len)
        self.params['max_input_len'] = max_input
        self.params['max_sent_len'] = max_sent
        self.params['max_quest_len'] = max_quest
        self.params['max_answer_len'] = max_answer

        self.data['len']['inputs_len'] = np.array(inputs_len)
        self.data['len']['inputs_sent_len'] = np.array(
            [story + [0] * (max_input - len(story)) for story in inputs_sent_len])
        self.data['len']['questions_len'] = np.array(questions_len)
        self.data['len']['answers_len'] = np.array(answers_len)

        word2idx = self.vocab['word2idx']
        unk = word2idx['<unk>']

        def encode(tokens, width):
            # Unknown words map to <unk>; the tail is filled with index 0.
            ids = [word2idx.get(token, unk) for token in tokens]
            return ids + [0] * (width - len(ids))

        inputs, questions, answers = data
        encoded_inputs = []
        for facts in inputs:
            rows = [encode(sentence, max_sent) for sentence in facts]
            rows.extend([0] * max_sent for _ in range(max_input - len(rows)))
            encoded_inputs.append(rows)

        self.data['val']['inputs'] = np.array(encoded_inputs)
        self.data['val']['questions'] = np.array(
            [encode(question, max_quest) for question in questions])
        # Answers are padded too, so answers of differing length still form a
        # rectangular array (a no-op when every answer has the same length).
        self.data['val']['answers'] = np.array(
            [encode(answer, max_answer) for answer in answers])


def bAbI_data_load(path, END=('<end>',)):
    """Parse a bAbI task file into tokenised facts, questions and answers.

    Lines are space-separated and start with a running index; index ``1``
    starts a new story.  A line containing ``?`` is a question whose answer
    follows the first tab; any other line is a fact.  Every question is paired
    with a snapshot of the facts read so far in its story.

    Args:
        path: text file to read.
        END: tokens appended to every fact and every answer.

    Returns:
        ``[inputs, questions, answers]`` and
        ``[inputs_len, inputs_sent_len, questions_len, answers_len]``, where
        the second list holds the matching lengths.
    """
    end_tokens = list(END)

    inputs, questions, answers = [], [], []
    inputs_len, inputs_sent_len, questions_len, answers_len = [], [], [], []

    # Defined up front so a file whose first line is not numbered 1 still works.
    fact = []
    # The context manager closes the file handle deterministically.
    with open(path) as handle:
        for raw in handle:
            # Stripping drops the newline (and any '\r'); blank lines are
            # skipped instead of becoming a fact made only of END.
            line = raw.strip()
            if not line:
                continue

            if line.split(' ')[0] == '1':
                fact = []

            if '?' in line:
                fields = line.split('\t')
                q = fields[0].strip().replace('?', '').split(' ')[1:] + ['?']
                a = fields[1].split() + end_tokens
                # Snapshot: later facts of the story must not leak into it.
                fact_copied = deepcopy(fact)

                inputs.append(fact_copied)
                questions.append(q)
                answers.append(a)

                inputs_len.append(len(fact_copied))
                inputs_sent_len.append([len(s) for s in fact_copied])
                questions_len.append(len(q))
                answers_len.append(len(a))
            else:
                tokens = line.replace('.', '').split(' ')[1:] + end_tokens
                fact.append(tokens)

    return ([inputs, questions, answers],
            [inputs_len, inputs_sent_len, questions_len, answers_len])

In [4]:
def model_fn(features, labels, mode, params):
    """Estimator model function for the PREDICT and TRAIN modes.

    The graph is built twice over shared variables: a teacher-forced pass
    that yields logits, then a greedy-decoding pass that yields predicted ids.
    Any mode other than PREDICT or TRAIN gets ``None`` back.
    """
    mode_keys = tf.estimator.ModeKeys

    # Without labels a placeholder stands in, so the teacher-forced pass
    # (the one called with reuse=False) can still be constructed.
    if labels is None:
        labels = tf.placeholder(tf.int64, [None, params['max_answer_len']])

    decoder_inputs = shift_right(labels, params)
    logits = forward(
        features, params, is_training=True, reuse=False, seq_inputs=decoder_inputs)
    predicted_ids = forward(features, params, is_training=False, reuse=True)

    if mode == mode_keys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predicted_ids)

    if mode != mode_keys.TRAIN:
        return None

    uniform_weights = tf.ones_like(labels, tf.float32)
    seq_loss = tf.contrib.seq2seq.sequence_loss(
        logits=logits, targets=labels, weights=uniform_weights)
    loss_op = tf.reduce_mean(seq_loss)

    # Clip by global norm before handing the gradients to Adam.
    train_vars = tf.trainable_variables()
    raw_grads = tf.gradients(loss_op, train_vars)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, args.clip_norm)

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.apply_gradients(
        zip(clipped_grads, train_vars),
        global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)


def forward(features, params, is_training, reuse, seq_inputs=None):
    """Chain the five modules, each inside its own variable scope.

    Returns whatever ``answer_module`` returns for the given ``is_training``.
    """
    with tf.variable_scope('word_embedding', reuse=reuse):
        lookup = embedding_module(params, reuse)

    with tf.variable_scope('input_module', reuse=reuse):
        facts = input_module(features, params, lookup, is_training, reuse)

    with tf.variable_scope('question_module', reuse=reuse):
        question = question_module(features, lookup, reuse)

    with tf.variable_scope('memory_module', reuse=reuse):
        final_memory = memory_module(features, facts, question, is_training, reuse)

    with tf.variable_scope('answer_module', reuse=reuse):
        return answer_module(
            features, params, final_memory, question, lookup,
            is_training, reuse, seq_inputs)


def embedding_module(params, reuse):
    """Create (or fetch) the word lookup table and zero its index-0 row."""
    table_shape = [params['vocab_size'], args.embed_dim]
    lookup_table = tf.get_variable('lookup_table', table_shape, tf.float32)
    return zero_index_pad(lookup_table)


def input_module(features, params, embedding, is_training, reuse):
    """Encode each fact with position-weighted embeddings and a bi-GRU."""
    half_size = args.hidden_size // 2
    fw_cell = GRU('cell_fw', half_size)
    bw_cell = GRU('cell_bw', half_size)

    word_vecs = tf.nn.embedding_lookup(embedding, features['inputs'])  # (B, I, S, D)
    pos_weights = position_encoding(params['max_sent_len'], args.embed_dim)
    # Position-weighted sum over the words of each sentence.
    sent_vecs = tf.reduce_sum(word_vecs * pos_weights, 2)              # (B, I, D)

    (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
        fw_cell,
        bw_cell,
        sent_vecs,
        sequence_length=features['inputs_len'],
        dtype=np.float32)

    merged = tf.concat((fw_out, bw_out), -1)                           # (B, I, D)
    return tf.layers.dropout(merged, args.dropout_rate, training=is_training)


def question_module(features, embedding, reuse):
    """Encode the question as the final state of a GRU over its embeddings."""
    question_cell = GRU('question_rnn')
    embedded = tf.nn.embedding_lookup(embedding, features['questions'])

    _outputs, final_state = tf.nn.dynamic_rnn(
        question_cell,
        embedded,
        sequence_length=features['questions_len'],
        dtype=np.float32)
    return final_state


def memory_module(features, fact_vecs, q_vec, is_training, reuse):
    """Refine a memory vector over ``args.n_hops`` attention episodes.

    The memory starts as the question vector; every hop uses the same layers.
    """
    attn_hidden = tf.layers.Dense(args.embed_dim, tf.tanh, name='attn_proj_1')
    attn_score = tf.layers.Dense(1, name='attn_proj_2')
    episode_cell = AttentionGRUCell(args.hidden_size, name='attn_gru')
    update_memory = tf.layers.Dense(args.hidden_size, tf.nn.relu, name='memory_proj')

    memory = q_vec
    for hop in range(args.n_hops):
        print('==> Memory Episode', hop)
        episode = gen_episode(
            features, memory, q_vec, fact_vecs,
            attn_hidden, attn_score, episode_cell, is_training)
        stacked = tf.concat([memory, episode, q_vec], 1)
        memory = update_memory(stacked)

    return memory  # (B, D)


def gen_episode(features, memory, q_vec, fact_vecs, proj_1, proj_2, attn_gru, is_training):
    """Compute one episode: attention over the facts, then an attention GRU."""

    def score_fact(fact_vec):
        # Interaction features between one fact and the question / memory.
        interactions = tf.concat(
            [fact_vec * q_vec,
             fact_vec * memory,
             tf.abs(fact_vec - q_vec),
             tf.abs(fact_vec - memory)],
            1)
        score = proj_2(proj_1(interactions))
        return tf.squeeze(score, 1)

    # Score every fact: map over the fact axis, then put the batch axis first.
    facts_major = tf.transpose(fact_vecs, [1, 0, 2])
    scores = tf.transpose(tf.map_fn(score_fact, facts_major))   # (B, n_fact)
    gates = tf.expand_dims(tf.nn.softmax(scores), -1)           # (B, n_fact, 1)

    # Summarise the facts with the attention GRU; the gate rides along as an
    # extra input channel.
    gated_facts = tf.concat([fact_vecs, gates], 2)              # (B, n_fact, D+1)
    _outputs, episode = tf.nn.dynamic_rnn(
        attn_gru,
        gated_facts,
        sequence_length=features['inputs_len'],
        dtype=np.float32)
    return episode                                              # (B, D)


def answer_module(features, params, memory, q_vec, embedding, is_training, reuse,
                  seq_inputs=None):
    """Decode the answer with a GRU initialised from memory and question.

    Returns the decoder logits when ``is_training`` is true (teacher forcing
    on ``seq_inputs``), otherwise the greedily sampled token ids.
    """
    seq2seq = tf.contrib.seq2seq

    state_proj = tf.layers.Dense(args.hidden_size, name='state_proj')
    vocab_proj = tf.layers.Dense(params['vocab_size'], name='vocab_proj')
    cell = GRU('decoder_rnn')

    memory = tf.layers.dropout(memory, args.dropout_rate, training=is_training)
    init_state = state_proj(tf.concat((memory, q_vec), -1))

    # Only the helper and the step limit differ between the two modes.
    if is_training:
        helper = seq2seq.TrainingHelper(
            inputs=tf.nn.embedding_lookup(embedding, seq_inputs),
            sequence_length=tf.to_int32(features['answers_len']))
        step_limit = None
    else:
        batch_size = tf.shape(init_state)[0]
        helper = seq2seq.GreedyEmbeddingHelper(
            embedding=embedding,
            start_tokens=tf.tile(
                tf.constant([params['<start>']], dtype=tf.int32), [batch_size]),
            end_token=params['<end>'])
        step_limit = params['max_answer_len']

    decoder = seq2seq.BasicDecoder(
        cell=cell,
        helper=helper,
        initial_state=init_state,
        output_layer=vocab_proj)
    decoded, _, _ = seq2seq.dynamic_decode(
        decoder=decoder, maximum_iterations=step_limit)

    if is_training:
        return decoded.rnn_output
    return decoded.sample_id


def shift_right(x, params):
    """Prepend a ``<start>`` column to ``x`` and drop its last column."""
    n_rows = tf.shape(x)[0]
    start_column = tf.to_int64(tf.fill([n_rows, 1], params['<start>']))
    all_but_last = x[:, :-1]
    return tf.concat([start_column, all_but_last], 1)


def GRU(name, rnn_size=None):
    """Return a named GRU cell with an orthogonal kernel initializer.

    ``rnn_size`` falls back to ``args.hidden_size`` when it is ``None``.
    """
    if rnn_size is None:
        rnn_size = args.hidden_size
    initializer = tf.orthogonal_initializer()
    return tf.nn.rnn_cell.GRUCell(rnn_size, kernel_initializer=initializer, name=name)


def zero_index_pad(embedding):
    """Return ``embedding`` with its first row replaced by zeros."""
    zero_row = tf.zeros([1, args.embed_dim])
    remaining_rows = embedding[1:, :]
    return tf.concat((zero_row, remaining_rows), axis=0)


def position_encoding(sentence_size, embedding_size):
    """Return a float32 (sentence_size, embedding_size) position-weight matrix.

    With 1-based embedding index ``i`` and word index ``j`` the entry is
    ``1 + 4 * (i - embedding_size/2) * (j - sentence_size/2)
    / embedding_size / sentence_size``.
    """
    # Centred 1-based offsets along each axis, computed in float64.
    embed_offsets = np.arange(1, embedding_size + 1) - embedding_size / 2
    word_offsets = np.arange(1, sentence_size + 1) - sentence_size / 2

    # The outer product gives every (i, j) pair at once; it is cast to float32
    # before scaling, which is where the original stored each product.
    products = np.outer(embed_offsets, word_offsets).astype(np.float32)
    scaled = 1 + 4 * products / embedding_size / sentence_size
    return scaled.T

In [5]:
def main():
    """Train on the qa5 training file, then predict and report on the test file."""
    tf.logging.set_verbosity(tf.logging.INFO)
    print(json.dumps(args, indent=4))

    train_dl = DataLoader(
        path='../temp/qa5_three-arg-relations_train.txt',
        is_training=True)
    # The test loader reuses the training vocabulary and parameters.
    test_dl = DataLoader(
        path='../temp/qa5_three-arg-relations_test.txt',
        is_training=False, vocab=train_dl.vocab, params=train_dl.params)

    model = tf.estimator.Estimator(model_fn, params=train_dl.params)
    model.train(train_dl.input_fn())

    gen = model.predict(test_dl.input_fn())
    preds = np.concatenate(list(gen))
    # Row width is the decoder's step limit, taken from the params the model
    # was built with rather than a literal that only fits two-token answers.
    answer_len = train_dl.params['max_answer_len']
    preds = np.reshape(preds, [test_dl.data['size'], answer_len])

    # Accuracy is measured on the first answer token only.
    print('Testing Accuracy:', (test_dl.data['val']['answers'][:, 0] == preds[:, 0]).mean())
    demo(test_dl.demo, test_dl.vocab['idx2word'], preds)


def demo(demo, idx2word, ids, demo_idx=3):
    """Print the facts, question and decoded prediction of one sample.

    ``demo`` is an ``(inputs, questions, answers)`` triple of token lists and
    ``ids`` holds one row of predicted indices per sample.
    """
    stories, queries, _answers = demo
    predicted_words = [idx2word[token_id] for token_id in ids[demo_idx]]

    print()
    pprint.pprint(stories[demo_idx])
    # A leading newline in the first argument replaces the bare print() calls.
    print('\nQuestion:', queries[demo_idx])
    print('\nPrediction:', predicted_words)


# Run the train/predict pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

{
    "n_epochs": 10,
    "batch_size": 64,
    "embed_dim": 80,
    "hidden_size": 80,
    "dropout_rate": 0.1,
    "n_hops": 2,
    "clip_norm": 5.0
}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmps9znfnj3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11e235b38>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
==> Memory Episode 0
==> Memory Episode 1
==> Memory Episode 0
==> Memory Episode 1
INFO:tensorflow:Done calling