In [1]:
import re
import tarfile
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
from functools import reduce
import os
from urllib.request import urlretrieve

# Seed NumPy's global RNG and TensorFlow's graph-level RNG up front.
# NOTE(review): the TensorFlow seed is attached to the current default graph;
# confirm it is still in effect wherever the model is actually built.
np.random.seed(0)
tf.set_random_seed(1234)

# Report the TensorFlow version and whether TensorFlow can see a GPU.
print('tf version: ', tf.__version__)
print('GPU : ', tf.test.is_gpu_available())


tf version:  1.14.0
GPU :  True


In [2]:
base_dir = os.path.join('./', 'datasets')


def get_file(filename, url=None, datadir=None):
    """Return the local path of ``filename``, downloading it from ``url`` if absent.

    Args:
        filename: Name of the file inside ``datadir``.
        url: Location to download from. Required even when the file is
            already present locally.
        datadir: Directory that holds the file; defaults to ``base_dir``.
            The directory is created when it does not exist.

    Returns:
        ``os.path.join(datadir, filename)``.

    Raises:
        ValueError: If ``url`` is None.
        Exception: Whatever ``urlretrieve`` raises. A partially written
            file is removed before the error propagates.
    """
    if url is None:
        raise ValueError('url must be provided')
    if datadir is None:
        datadir = base_dir
    os.makedirs(datadir, exist_ok=True)

    fpath = os.path.join(datadir, filename)
    if os.path.exists(fpath):
        # Already cached: nothing to download.
        return fpath

    print('Downloading data from', url)
    try:
        urlretrieve(url, fpath)
    except (Exception, KeyboardInterrupt):
        # Never leave a truncated file behind; it would be mistaken for a
        # valid cached copy on the next call.
        if os.path.exists(fpath):
            os.remove(fpath)
        raise

    return fpath


In [3]:
def inference(x, q, n_batch,
              vocab_size=None,
              embedding_dim=None,
              story_maxlen=None,
              question_maxlen=None):
    """Build the graph that maps a (story, question) pair to answer probabilities.

    Args:
        x: Integer tensor of story word indices
            (assumes [batch, story_len] -- TODO confirm against the caller).
        q: Integer tensor of question word indices
            (assumes [batch, question_maxlen] -- TODO confirm against the caller).
        n_batch: Batch size used to create the LSTM zero state.
        vocab_size: Number of rows in every embedding table and width of the
            output layer.
        embedding_dim: Width of the A and B embeddings; the LSTM uses
            ``embedding_dim // 2`` units.
        story_maxlen: Accepted but not read anywhere in this function.
        question_maxlen: Width of the C embedding and number of LSTM steps.

    Returns:
        Softmax output of shape [batch, vocab_size] (given 2-D ``x``/``q``).

    Note:
        The ``None`` defaults are placeholders only; ``vocab_size``,
        ``embedding_dim`` and ``question_maxlen`` are used as dimensions below.
    """
    def weight_variable(shape, stddev=0.08):
        # Trainable variable initialised from a truncated normal distribution.
        initial = tf.truncated_normal(shape, stddev=stddev)
        return tf.Variable(initial)

    def bias_variable(shape):
        # Zero-initialised variable; not called anywhere in this function.
        initial = tf.zeros(shape, dtype=tf.float32)
        return tf.Variable(initial)

    # Three embedding tables: A and B share a width, C is question_maxlen wide
    # so that its lookup can be added element-wise to the match scores below.
    A = weight_variable([vocab_size, embedding_dim])
    B = weight_variable([vocab_size, embedding_dim])
    C = weight_variable([vocab_size, question_maxlen])
    m = tf.nn.embedding_lookup(A, x)  # story embedded with A
    u = tf.nn.embedding_lookup(B, q)  # question embedded with B
    c = tf.nn.embedding_lookup(C, x)  # story embedded with C
    # Dot product of every story position (j) with every question position (l),
    # contracted over the embedding axis (k); softmax uses its default last axis.
    p = tf.nn.softmax(tf.einsum('ijk,ilk->ijl', m, u))
    o = tf.add(p, c)
    # Swap the last two axes so the question axis comes second.
    o = tf.transpose(o, perm=[0, 2, 1])
    # Append the question embedding to each question position's features.
    ou = tf.concat([o, u], axis=-1)

    cell = tf.contrib.rnn.BasicLSTMCell(embedding_dim//2, forget_bias=1.0)
    initial_state = cell.zero_state(n_batch, tf.float32)
    state = initial_state
    outputs = []
    with tf.variable_scope('LSTM'):
        # Manually unrolled LSTM: one step per question position.
        for t in range(question_maxlen):
            if t > 0:
                # Share the cell weights created at the first step.
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(ou[:, t, :], state)
            outputs.append(cell_output)
    # Only the final step's output feeds the classifier.
    output = outputs[-1]
    W = weight_variable([embedding_dim//2, vocab_size], stddev=0.01)
    # Linear projection (no bias term) followed by softmax over the vocabulary.
    a = tf.nn.softmax(tf.matmul(output, W))

    return a



In [4]:

def loss(y, t):
    """Mean cross-entropy between predicted probabilities and targets.

    Args:
        y: Predicted probabilities, one row per example.
        t: Target weights with the same shape as ``y``
            (presumably one-hot rows -- verify against the caller).

    Returns:
        Scalar tensor: the per-example cross-entropy averaged over the batch.
    """
    # Clip before the log so a zero probability cannot yield -inf / NaN.
    log_y = tf.log(tf.clip_by_value(y, 1e-10, 1.0))
    # `axis` is the supported spelling; `reduction_indices` is a deprecated alias.
    per_example = -tf.reduce_sum(t * log_y, axis=1)
    return tf.reduce_mean(per_example)


def training(loss):
    """Return the op that performs one Adam update minimising ``loss``."""
    adam = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999)
    return adam.minimize(loss)


def accuracy(y, t):
    """Fraction of rows where the arg-max of ``y`` equals the arg-max of ``t``."""
    predicted = tf.argmax(y, 1)
    expected = tf.argmax(t, 1)
    hits = tf.cast(tf.equal(predicted, expected), tf.float32)
    return tf.reduce_mean(hits)


def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    Runs of non-word characters are kept as tokens of their own (the regex
    group makes ``re.split`` return the separators); surrounding whitespace
    is stripped and empty pieces are dropped.

    Args:
        sent: The text to split.

    Returns:
        List of non-empty token strings in their original order.
    """
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece for piece in pieces if piece]


def parse_stories(lines):
    """Parse numbered story lines into (story, question, answer) triples.

    Each line starts with an integer id followed by a space. An id of 1
    starts a new story. A line containing tabs is a question in the form
    ``question<TAB>answer<TAB>supporting``; any other line is a statement.

    Args:
        lines: Iterable of lines, either ``bytes`` (decoded as UTF-8) or
            ``str``. Blank lines are ignored.

    Returns:
        List of ``(substory, question_tokens, answer)`` where ``substory`` is
        the list of tokenised statements seen so far in the current story.
    """
    data = []
    story = []
    for raw in lines:
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        raw = raw.strip()
        if not raw:
            # A blank line carries no id; skip it instead of failing to unpack.
            continue
        nid, text = raw.split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' in text:
            q, a, _supporting = text.split('\t')
            q = tokenize(q)
            # Questions are stored as '' placeholders; filter them out so the
            # substory contains statements only.
            substory = [sentence for sentence in story if sentence]
            data.append((substory, q, a))
            story.append('')
        else:
            story.append(tokenize(text))
    return data


def get_stories(f, max_length=None):
    """Read stories from a file object and flatten each story to one token list.

    Args:
        f: Object with a ``readlines()`` method whose lines are passed to
            ``parse_stories``.
        max_length: When truthy, only stories with fewer than ``max_length``
            tokens are kept.

    Returns:
        List of ``(story_tokens, question_tokens, answer)`` triples.
    """
    def flatten(sentences):
        # A comprehension copes with an empty story (a question that comes
        # before any statement) and runs in linear time.
        return [token for sentence in sentences for token in sentence]

    stories = []
    for story, q, answer in parse_stories(f.readlines()):
        flat = flatten(story)
        if not max_length or len(flat) < max_length:
            stories.append((flat, q, answer))
    return stories


def vectorize_stories(data, word_indices, story_maxlen, question_maxlen):
    """Turn tokenised (story, question, answer) triples into index arrays.

    Args:
        data: Iterable of ``(story_tokens, question_tokens, answer)``.
        word_indices: Mapping from token to integer index.
        story_maxlen: Length passed to ``padding`` for the stories.
        question_maxlen: Length passed to ``padding`` for the questions.

    Returns:
        Tuple ``(stories, questions, answers)``: the two padded index
        sequences and an array of one-hot answer rows of width
        ``len(word_indices) + 1``.
    """
    n_classes = len(word_indices) + 1
    story_ids, question_ids, targets = [], [], []
    for story, question, answer in data:
        story_ids.append([word_indices[w] for w in story])
        question_ids.append([word_indices[w] for w in question])
        one_hot = np.zeros(n_classes)
        one_hot[word_indices[answer]] = 1
        targets.append(one_hot)

    return (padding(story_ids, maxlen=story_maxlen),
            padding(question_ids, maxlen=question_maxlen),
            np.array(targets))


def padding(words, maxlen):
    """Left-pad (and, if needed, left-truncate) index sequences to ``maxlen``.

    Args:
        words: Sequence of index lists. It is not modified.
        maxlen: Target length of every returned row.

    Returns:
        ``np.ndarray`` built from the fixed-length rows. Shorter sequences
        get leading zeros; longer ones keep only their last ``maxlen`` items
        so the result is always rectangular.
    """
    padded = []
    for word in words:
        word = list(word)
        overflow = len(word) - maxlen
        if overflow > 0:
            # Padding is added at the front, so drop from the front as well.
            word = word[overflow:]
        padded.append([0] * (maxlen - len(word)) + word)
    return np.array(padded)




In [8]:

# Download the bAbI archive if it is not cached locally, then parse the
# train/test splits of task 1 (single supporting fact, 10k examples).
print('Fetching data...')
path = get_file(
    'babi-tasks-v1-2.tar.gz',
    url='https://dikers-data.s3.cn-northwest-1.amazonaws.com.cn/dataset/babi_tasks_1-20_v1-2.tar.gz')

challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# The context manager closes the archive once both splits have been read.
with tarfile.open(path) as tar:
    train_stories = get_stories(tar.extractfile(challenge.format('train')))
    test_stories = get_stories(tar.extractfile(challenge.format('test')))



Fetching data...


In [9]:
# Show one training example as readable text.
content, question, answer = train_stories[100]

story_text = ' '.join(content)
question_text = ' '.join(question)
print('{}\n'.format(story_text))
print('Q: {}\n'.format(question_text))
print('A: {}'.format(answer))
 

Daniel moved to the garden . Mary went back to the bathroom .

Q: Where is Daniel ?

A: garden


In [10]:
# Raw (story_tokens, question_tokens, answer) triple of the first example.
print(train_stories[0])

(['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'John', 'went', 'to', 'the', 'hallway', '.'], ['Where', 'is', 'Mary', '?'], 'bathroom')


In [11]:
# Vocabulary and maximum lengths are computed over train and test together.
all_stories = train_stories + test_stories

vocab = set()
for story, q, answer in all_stories:
    vocab.update(story, q, [answer])
vocab = sorted(vocab)
# One extra slot on top of the vocabulary entries.
vocab_size = len(vocab) + 1

story_maxlen = max(len(tokens) for tokens, _, _ in all_stories)
question_maxlen = max(len(tokens) for _, tokens, _ in all_stories)

print("story_maxlen ", story_maxlen)
print("question_maxlen ", question_maxlen)

story_maxlen  68
question_maxlen  4


In [12]:
print('Vectorizing data...')
# Word indices start at 1, so 0 never refers to a vocabulary entry.
word_indices = {word: idx for idx, word in enumerate(vocab, start=1)}

inputs_train, questions_train, answers_train = vectorize_stories(
    train_stories, word_indices, story_maxlen, question_maxlen)

inputs_test, questions_test, answers_test = vectorize_stories(
    test_stories, word_indices, story_maxlen, question_maxlen)

Vectorizing data...


In [13]:
# Second training example, number of training examples, vocabulary size.
print(train_stories[1], len(train_stories), len(vocab), sep='\n')

(['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'John', 'went', 'to', 'the', 'hallway', '.', 'Daniel', 'went', 'back', 'to', 'the', 'hallway', '.', 'Sandra', 'moved', 'to', 'the', 'garden', '.'], ['Where', 'is', 'Daniel', '?'], 'hallway')
10000
21


In [14]:
tf.reset_default_graph()
# The graph-level seed belongs to the graph it was set on. Resetting the
# default graph starts from an unseeded one, so seed the new graph before any
# variable is created to keep weight initialisation reproducible.
tf.set_random_seed(1234)
print('Building model...')
# Placeholders: story indices, question indices, one-hot answers, batch size.
x = tf.placeholder(tf.int32, shape=[None, story_maxlen])
q = tf.placeholder(tf.int32, shape=[None, question_maxlen])
a = tf.placeholder(tf.float32, shape=[None, vocab_size])
n_batch = tf.placeholder(tf.int32, shape=[])

y = inference(x, q, n_batch,
              vocab_size=vocab_size,
              embedding_dim=64,
              story_maxlen=story_maxlen,
              question_maxlen=question_maxlen)
loss_ = loss(y, a)
train_step = training(loss_)
acc = accuracy(y, a)
# Per-epoch validation metrics, appended to by the training loop.
history = {
    'val_loss': [],
    'val_acc': []
}



Building model...
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [15]:
%%time
print('Training model...')
epochs = 500
batch_size = 128

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

n_batches = len(inputs_train) // batch_size

for epoch in range(epochs):
    inputs_train_, questions_train_, answers_train_ = \
        shuffle(inputs_train, questions_train, answers_train)

    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size

        sess.run(train_step, feed_dict={
            x: inputs_train_[start:end],
            q: questions_train_[start:end],
            a: answers_train_[start:end],
            n_batch: batch_size
        })

    val_loss = loss_.eval(session=sess, feed_dict={
        x: inputs_test,
        q: questions_test,
        a: answers_test,
        n_batch: len(inputs_test)
    })
    val_acc = acc.eval(session=sess, feed_dict={
        x: inputs_test,
        q: questions_test,
        a: answers_test,
        n_batch: len(inputs_test)
    })

    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    if epoch % 50 ==0: 
        print('epoch: {:5d}  val_loss: {:.5f}  val_acc: {:.5f}'
              .format(epoch, val_loss , val_acc))

Training model...
epoch:     0  val_loss: 1.83817  val_acc: 0.18200
epoch:    50  val_loss: 1.79596  val_acc: 0.15700
epoch:   100  val_loss: 0.36723  val_acc: 0.88200
epoch:   150  val_loss: 0.25522  val_acc: 0.92000
epoch:   200  val_loss: 0.30249  val_acc: 0.92400
epoch:   250  val_loss: 0.33845  val_acc: 0.92700
epoch:   300  val_loss: 0.38969  val_acc: 0.93100
epoch:   350  val_loss: 0.45012  val_acc: 0.93000
epoch:   400  val_loss: 0.49528  val_acc: 0.93100
epoch:   450  val_loss: 0.53495  val_acc: 0.93900
CPU times: user 4min 13s, sys: 6.96 s, total: 4min 20s
Wall time: 2min 15s
