In [97]:
import re
import tarfile
import functools

import numpy as np

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

In [98]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

In [99]:
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data

In [100]:
def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.
    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: functools.reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) 
            for story, q, answer in data if not max_length or len(flatten(story)) < max_length]
    return data

In [101]:
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
    return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqs, maxlen=query_maxlen), np.array(ys)

In [102]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 20
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


In [103]:
try:
    path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise
tar = tarfile.open(path)

In [118]:
# Default QA1 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
train = get_stories(tar.extractfile(challenge.format('train')))
test = get_stories(tar.extractfile(challenge.format('test')))

  return _compile(pattern, flags).split(string, maxsplit)


In [119]:
vocab = set()
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

In [120]:
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

In [121]:
x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

In [122]:
print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

vocab = ['.', '?', 'Daniel', 'John', 'Mary', 'Sandra', 'Where', 'apple', 'back', 'bathroom', 'bedroom', 'discarded', 'down', 'dropped', 'football', 'garden', 'got', 'grabbed', 'hallway', 'is', 'journeyed', 'kitchen', 'left', 'milk', 'moved', 'office', 'picked', 'put', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went']
x.shape = (10000, 552)
xq.shape = (10000, 5)
y.shape = (10000, 36)
story_maxlen, query_maxlen = 552, 5


In [123]:
print('Build model...')

sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)
encoded_sentence = layers.Dropout(0.15)(encoded_sentence)

question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = layers.Dropout(0.15)(encoded_question)
encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question)
encoded_question = layers.RepeatVector(story_maxlen)(encoded_question)

merged = layers.add([encoded_sentence, encoded_question])
merged = RNN(EMBED_HIDDEN_SIZE)(merged)
merged = layers.Dropout(0.15)(merged)
preds = layers.Dense(vocab_size, activation='softmax')(merged)

model = Model([sentence, question], preds)
model.compile(optimizer='adadelta',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Build model...


In [None]:
print('Training')
model.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)
loss, acc = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Training
Train on 9500 samples, validate on 500 samples
Epoch 1/20
 352/9500 [>.............................] - ETA: 811s - loss: 3.5688 - acc: 0.1790

In [109]:
predictions = model.predict([tx, txq])

In [111]:
def mindex(array, maxval):
    for index in range(len(array)):
        entry = array[index]
        if maxval == entry:
            return index

In [112]:
word_indices = [mindex(array, max(array)) for array in predictions_for]

In [115]:
words = []
for index in range(len(potential_words)):
    words.append(vocab[potential_words[index]])

In [116]:
print(words)

['bedroom', 'garden', 'moved', 'hallway', 'moved', 'is', 'hallway', 'is', 'is', 'the', 'moved', 'the', 'moved', 'hallway', 'garden', 'moved', 'hallway', 'moved', 'is', 'hallway', 'is', 'the', 'bedroom', 'garden', 'hallway', 'moved', 'garden', 'moved', 'hallway', 'moved', 'the', 'the', 'moved', 'moved', 'bedroom', 'the', 'is', 'moved', 'is', 'hallway', 'bedroom', 'hallway', 'hallway', 'moved', 'hallway', 'is', 'garden', 'hallway', 'the', 'garden', 'garden', 'garden', 'hallway', 'is', 'moved', 'bedroom', 'hallway', 'garden', 'moved', 'the', 'is', 'the', 'bedroom', 'hallway', 'bedroom', 'moved', 'moved', 'garden', 'the', 'is', 'hallway', 'the', 'hallway', 'garden', 'bedroom', 'moved', 'the', 'moved', 'garden', 'the', 'hallway', 'hallway', 'garden', 'the', 'the', 'hallway', 'hallway', 'garden', 'the', 'moved', 'bedroom', 'bedroom', 'is', 'bedroom', 'the', 'moved', 'moved', 'moved', 'garden', 'moved', 'garden', 'bedroom', 'is', 'hallway', 'garden', 'is', 'is', 'bedroom', 'hallway', 'moved',

In [91]:
vocab[indices_of_max]

TypeError: only integer scalar arrays can be converted to a scalar index

In [89]:
indices_of_max = np.argmax(predictions_for, axis=1)

In [95]:
vocab[indices_of_max[15]]

'moved'

In [90]:
words = [vocab[i] for i in indices_of_max]

bedroom
garden
moved
hallway
moved
is
hallway
is
is
the
moved
the
moved
hallway
garden
moved
hallway
moved
is
hallway
is
the
bedroom
garden
hallway
moved
garden
moved
hallway
moved
the
the
moved
moved
bedroom
the
is
moved
is
hallway
bedroom
hallway
hallway
moved
hallway
is
garden
hallway
the
garden
garden
garden
hallway
is
moved
bedroom
hallway
garden
moved
the
is
the
bedroom
hallway
bedroom
moved
moved
garden
the
is
hallway
the
hallway
garden
bedroom
moved
the
moved
garden
the
hallway
hallway
garden
the
the
hallway
hallway
garden
the
moved
bedroom
bedroom
is
bedroom
the
moved
moved
moved
garden
moved
garden
bedroom
is
hallway
garden
is
is
bedroom
hallway
moved
bedroom
bedroom
moved
moved
the
hallway
moved
bedroom
is
the
garden
bedroom
bedroom
bedroom
the
the
hallway
the
hallway
hallway
hallway
the
moved
garden
garden
bedroom
the
moved
bedroom
bedroom
the
is
the
bedroom
garden
moved
garden
moved
moved
hallway
hallway
hallway
the
the
garden
garden
garden
bedroom
the
is
is
bedroom
garden

In [96]:
def get_answer(story, question):
    
    return answer