In [1]:
%matplotlib inline

In [8]:
import collections.abc
import re

import numpy as np
import keras.backend as K

In [3]:
# Display NumPy arrays with 4 digits of precision.
np.set_printoptions(4)
# Create the TensorFlow session with the GPU option `allow_growth` enabled
# and register it as the session Keras uses.
cfg = K.tf.ConfigProto(gpu_options={'allow_growth': True})
K.set_session(K.tf.Session(config=cfg))

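Download the bAbI tasks archive from https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz and extract it next to this notebook, so that the `tasks_1-20_v1-2/` directory used by the paths below exists.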

In [4]:
# Path templates for the bAbI task files, relative to the working directory.
# The `{}` placeholder is filled with 'train' or 'test' when the files are loaded.
challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    # QA2 from the `en/` directory (the smaller "1k" variant, per the key name)
    'two_supporting_facts_1k': 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt',
}
# Task used for the first (single-hop) model.
challenge_type = 'single_supporting_fact_10k'
# challenge_type = 'two_supporting_facts_10k'
challenge = challenges[challenge_type]

In [9]:
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    Runs of non-word characters act as separators and are kept as tokens
    themselves (after stripping whitespace), so ``'Where is Daniel?'``
    becomes ``['Where', 'is', 'Daniel', '?']``. Whitespace-only pieces are
    dropped.

    Args:
        sent: The sentence to tokenize.

    Returns:
        A list of non-empty token strings in their original order.
    """
    # The separator pattern must not be able to match the empty string: since
    # Python 3.7 `re.split` also splits on empty matches, which would cut the
    # sentence into single characters and produce `None` groups.
    return [tok.strip() for tok in re.split(r'(\W+)', sent) if tok.strip()]
def parse_stories(lines):
    """Parse bAbI-format lines into ``(story, question, answer)`` samples.

    Each non-blank line starts with a line number followed by a space. A line
    numbered 1 starts a new story. Lines containing tab characters are
    questions of the form ``question<TAB>answer<TAB>supporting-ids``; all
    other lines are statements that are tokenized and added to the story.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, one per
        question line. ``substory`` is the list of statements seen so far in
        the current story, each prefixed with an ``"<index>:"`` token giving
        its position in the story.

    Raises:
        ValueError: If a non-blank line has no numeric prefix or a question
            line does not have exactly three tab-separated fields.
    """
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' in line:
            # The third field (supporting fact ids) is not used.
            q, a, _ = line.split('\t')
            q = tokenize(q)
            substory = [[str(i) + ":"] + sent for i, sent in enumerate(story) if sent]
            data.append((substory, q, a))
            # Empty placeholder: the question occupies a story position, so
            # the index labels of later statements skip over it.
            story.append('')
        else:
            story.append(tokenize(line))
    return data
def get_stories(f):
    """Read a bAbI task file and return its parsed samples.

    Args:
        f: Path of the task file to read.

    Returns:
        The list of ``(story, question, answer)`` tuples produced by
        ``parse_stories`` for the file's lines.
    """
    # Use a context manager so the file handle is closed once the lines are read.
    with open(f) as fh:
        return parse_stories(fh.readlines())

# Load the train and test splits of the selected task by filling the path template.
train_stories = get_stories(challenge.format('train'))
test_stories = get_stories(challenge.format('test'))

  return _compile(pattern, flags).split(string, maxsplit)


In [11]:
# Pool both splits so the limits below cover every sample.
stories = train_stories + test_stories
# Longest sentence, in tokens (the "<index>:" label counts as a token).
story_maxlen = max(len(sent) for story, _, _ in stories for sent in story)
# Largest number of sentences in a single story.
story_maxsents = max(map(len, (story for story, _, _ in stories)))
# Longest question, in tokens.
query_maxlen = max(map(len, (query for _, query, _ in stories)))

In [12]:
import collections

In [13]:
def do_flatten(el):
    """Return True when `el` is a container that should be descended into.

    Any iterable qualifies except `str` and `bytes`, which are treated as
    atomic values.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    return isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes))
def flatten(l):
    """Lazily yield the leaf values of an arbitrarily nested iterable.

    Elements for which `do_flatten` is true are expanded recursively, depth
    first and in order; every other element is yielded unchanged.
    """
    for item in l:
        if not do_flatten(item):
            yield item
            continue
        for leaf in flatten(item):
            yield leaf

In [14]:
# Sorted set of every token in the stories, questions and answers, with
# index 0 reserved for the padding token.
vocab = ['<PAD>'] + sorted(set(flatten(stories)))
vocab_size = len(vocab)

In [15]:
# Display the dataset dimensions computed above, plus the size of each split.
story_maxsents, vocab_size, story_maxlen, query_maxlen, len(train_stories), len(test_stories)

(10, 32, 8, 4, 10000, 1000)

In [16]:
# Inspect one parsed test sample: (index-labelled story sentences, question tokens, answer).
test_stories[534]

([['0:', 'Mary', 'moved', 'to', 'the', 'office', '.'],
  ['1:', 'John', 'moved', 'to', 'the', 'garden', '.'],
  ['3:', 'Sandra', 'moved', 'to', 'the', 'bedroom', '.'],
  ['4:', 'Sandra', 'went', 'back', 'to', 'the', 'office', '.'],
  ['6:', 'John', 'went', 'to', 'the', 'bedroom', '.'],
  ['7:', 'John', 'journeyed', 'to', 'the', 'garden', '.'],
  ['9:', 'Daniel', 'went', 'back', 'to', 'the', 'hallway', '.'],
  ['10:', 'John', 'journeyed', 'to', 'the', 'bedroom', '.'],
  ['12:', 'Daniel', 'journeyed', 'to', 'the', 'bathroom', '.'],
  ['13:', 'John', 'travelled', 'to', 'the', 'garden', '.']],
 ['Where', 'is', 'Daniel', '?'],
 'bathroom')

In [17]:
# Lookup table from token to its integer position in `vocab`.
word_idx = {word: idx for idx, word in enumerate(vocab)}

In [20]:
from keras.preprocessing.sequence import pad_sequences

def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    """Convert tokenized samples into padded integer arrays.

    Args:
        data: Iterable of ``(story, query, answer)`` where ``story`` is a list
            of token lists, ``query`` a token list and ``answer`` one token.
        word_idx: Mapping from token to integer index.
        story_maxlen: Length every story sentence is padded to.
        query_maxlen: Length every query is padded to.

    Returns:
        A tuple ``(stories, queries, answers)``: a list with one padded 2-D
        array per story (one row per sentence), a 2-D array of padded
        queries, and an array holding one ``[answer_index]`` row per sample.

    Raises:
        KeyError: If a token is missing from ``word_idx``.
    """
    def to_ids(tokens):
        return [word_idx[tok] for tok in tokens]

    story_ids, query_ids, answer_ids = [], [], []
    for story, query, answer in data:
        story_ids.append([to_ids(sent) for sent in story])
        query_ids.append(to_ids(query))
        answer_ids.append([word_idx[answer]])

    # Stories are padded one at a time because their sentence counts differ.
    padded_stories = [pad_sequences(sents, maxlen=story_maxlen) for sents in story_ids]
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return padded_stories, padded_queries, np.array(answer_ids)

In [21]:
# Encode both splits with the shared vocabulary and padding lengths.
inputs_train, queries_train, answers_train = vectorize_stories(
    train_stories, word_idx=word_idx,
    story_maxlen=story_maxlen, query_maxlen=query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(
    test_stories, word_idx=word_idx,
    story_maxlen=story_maxlen, query_maxlen=query_maxlen)

In [22]:
def stack_inputs(inputs, maxsents=None, maxlen=None):
    """Pad each story with all-zero sentences and stack them into one array.

    Args:
        inputs: Sequence of 2-D integer arrays, one per story, each of shape
            ``(n_sentences, maxlen)``.
        maxsents: Number of sentence rows every story is padded to. Defaults
            to the notebook-level ``story_maxsents``.
        maxlen: Width of the zero rows that are appended. Defaults to the
            notebook-level ``story_maxlen``.

    Returns:
        An array of shape ``(len(inputs), maxsents, maxlen)``. The zero rows
        are appended after the real sentences. ``inputs`` itself is left
        unmodified.
    """
    if maxsents is None:
        maxsents = story_maxsents
    if maxlen is None:
        maxlen = story_maxlen
    # Build a new list instead of overwriting the caller's list in place.
    padded = [
        np.concatenate([it, np.zeros((maxsents - it.shape[0], maxlen), 'int')])
        for it in inputs
    ]
    return np.stack(padded)
# Pad every story to `story_maxsents` sentences and turn each split into a single 3-D array.
# Note: the names are rebound from a list of arrays to one stacked array.
inputs_train = stack_inputs(inputs_train)
inputs_test = stack_inputs(inputs_test)

In [25]:
# Inspect one encoded story: one row of word indices per sentence; all-zero rows are padding.
inputs_train[2]

array([[ 0,  2, 15, 26, 29, 28, 19,  1],
       [ 0,  6, 14, 31, 29, 28, 22,  1],
       [ 7, 13, 31, 18, 29, 28, 22,  1],
       [ 0,  8, 16, 26, 29, 28, 21,  1],
       [ 0,  9, 14, 26, 29, 28, 27,  1],
       [ 0, 10, 16, 24, 29, 28, 19,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0]])

In [26]:
# Inputs in the order the models expect them: [stories, queries].
inps = [inputs_train, queries_train]
val_inps = [inputs_test, queries_test]

## model

In [27]:
# Dimensionality of the word embeddings used by the single-hop model.
emb_dim = 20

In [89]:
from keras.layers import TimeDistributed, Lambda, Input, Reshape, Activation, dot, Dense, add
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.engine.topology import Layer
from keras import initializers

In [30]:
def emb_sent_bow(inp):
    """Embed each sentence as the sum of its word embeddings (bag of words).

    A fresh `Embedding(vocab_size, emb_dim)` is applied to every sentence via
    `TimeDistributed`, and the resulting word vectors are summed over the
    word axis (axis 2), leaving one `emb_dim` vector per sentence.
    """
    word_vecs = TimeDistributed(Embedding(vocab_size, emb_dim))(inp)
    sum_over_words = Lambda(lambda t: K.sum(t, 2))
    return sum_over_words(word_vecs)

In [32]:
# Story input: `story_maxsents` sentences of `story_maxlen` word indices each.
inp_story = Input((story_maxsents, story_maxlen))
# One bag-of-words embedding vector per sentence.
emb_story = emb_sent_bow(inp_story)
inp_story.shape, emb_story.shape

(TensorShape([Dimension(None), Dimension(10), Dimension(8)]),
 TensorShape([Dimension(None), Dimension(10), Dimension(20)]))

In [35]:
# Query input: `query_maxlen` word indices.
inp_q = Input((query_maxlen,))
# The query gets its own Embedding layer (separate from the story embedding).
emb_q = Embedding(vocab_size, emb_dim)(inp_q)
# Sum the word vectors over the word axis to get a single query vector.
emb_q = Lambda(lambda x: K.sum(x, 1))(emb_q)
# Add a length-1 axis so the query can be dotted against the sentence embeddings.
emb_q = Reshape((1, emb_dim))(emb_q)
inp_q.shape, emb_q.shape

(TensorShape([Dimension(None), Dimension(4)]),
 TensorShape([Dimension(None), Dimension(1), Dimension(20)]))

In [38]:
# Score every sentence against the query with a dot product over the embedding axis.
x = dot([emb_story, emb_q], axes=2)
# Drop the trailing length-1 axis so softmax normalises across the sentences.
x = Reshape((story_maxsents,))(x)
x = Activation('softmax')(x)
# Restore the trailing axis; `match` holds one attention weight per sentence.
match = Reshape((story_maxsents,1))(x)
match.shape

TensorShape([Dimension(None), Dimension(10), Dimension(1)])

In [40]:
# Second, independently created sentence embedding of the same story input.
emb_c = emb_sent_bow(inp_story)
# Attention-weighted sum of the sentence embeddings (contracting the sentence axis).
x = dot([match, emb_c], axes=1)
response = Reshape((emb_dim,))(x)
# Predict a probability distribution over the vocabulary for the answer word.
res = Dense(vocab_size, activation='softmax')(response)

In [42]:
# End-to-end model: (story, query) -> answer word distribution.
answer = Model([inp_story, inp_q], res)

In [43]:
# Sparse categorical cross-entropy because the targets are integer word indices, not one-hot vectors.
answer.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])

In [45]:
# Override the optimizer's learning rate before training.
K.set_value(answer.optimizer.lr, 1e-2)
# Train for 4 epochs; the test split is used as validation data.
hist=answer.fit(inps, answers_train, verbose=1, epochs=4, batch_size=32,
           validation_data=(val_inps, answers_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


## test

In [46]:
# Model over the same inputs that outputs `match` (the per-sentence attention weights) for inspection.
f = Model([inp_story, inp_q], match)

In [47]:
# Index of the training sample inspected in the following cells.
qnum=6

In [48]:
# Number of sentences in the sample's story plus one; used below to trim the attention vector.
l_st = len(train_stories[qnum][0])+1
train_stories[qnum]

([['0:', 'Sandra', 'travelled', 'to', 'the', 'office', '.'],
  ['1:', 'Sandra', 'went', 'to', 'the', 'bathroom', '.'],
  ['3:', 'Mary', 'went', 'to', 'the', 'bedroom', '.'],
  ['4:', 'Daniel', 'moved', 'to', 'the', 'hallway', '.']],
 ['Where', 'is', 'Sandra', '?'],
 'bathroom')

In [49]:
# Attention weights for the first `l_st` sentence slots of the selected sample.
np.squeeze(f.predict([inputs_train[qnum:qnum+1], queries_train[qnum:qnum+1]]))[:l_st]

array([  6.7585e-03,   9.9301e-01,   2.8352e-06,   2.2931e-04,   3.6505e-12], dtype=float32)

In [50]:
# Ground-truth answer indices for ten consecutive training samples.
answers_train[qnum:qnum+10,0]

array([19, 19, 27, 22, 19, 20, 19, 19, 20, 20])

In [51]:
# Predicted answer indices (argmax over the vocabulary axis) for the same ten samples.
np.argmax(answer.predict([inputs_train[qnum:qnum+10], queries_train[qnum:qnum+10]]), 1)

array([19, 19, 27, 22, 19, 20, 19, 19, 20, 20])

In [52]:
# Full predicted distribution over the vocabulary for the selected sample.
answer.predict([inputs_train[qnum:qnum+1], queries_train[qnum:qnum+1]])

array([[  2.6171e-13,   2.7784e-13,   1.4070e-13,   4.5004e-13,
          2.8679e-13,   1.3773e-13,   8.2310e-14,   2.4749e-13,
          3.2073e-13,   4.5508e-13,   4.7181e-13,   1.6863e-13,
          1.9341e-13,   1.6086e-13,   1.2592e-13,   2.1884e-13,
          5.3411e-13,   1.6679e-13,   2.7770e-13,   1.0000e+00,
          3.7796e-10,   1.5543e-11,   6.2230e-13,   1.0592e-13,
          4.0973e-13,   1.0297e-10,   2.9755e-13,   1.0429e-13,
          6.2321e-13,   3.4418e-13,   9.5986e-13,   2.3595e-13]], dtype=float32)

In [53]:
# Map a vocabulary index back to its word.
vocab[19]

'bathroom'

## multihop

In [61]:
# Switch to the two-supporting-facts task (QA2) for the rest of the notebook.
challenge_type = 'two_supporting_facts_10k'
challenge = challenges[challenge_type]

In [62]:
# Reload both splits for the newly selected task (replaces the QA1 data loaded earlier).
train_stories = get_stories(challenge.format('train'))
test_stories = get_stories(challenge.format('test'))

  return _compile(pattern, flags).split(string, maxsplit)


In [63]:
# Inspect one parsed sample of the new task.
test_stories[534]

([['0:', 'Mary', 'went', 'to', 'the', 'hallway', '.'],
  ['1:', 'Daniel', 'went', 'back', 'to', 'the', 'bedroom', '.'],
  ['2:', 'Sandra', 'went', 'back', 'to', 'the', 'garden', '.'],
  ['3:', 'Mary', 'went', 'to', 'the', 'office', '.'],
  ['4:', 'Mary', 'journeyed', 'to', 'the', 'kitchen', '.'],
  ['5:', 'Sandra', 'moved', 'to', 'the', 'office', '.'],
  ['6:', 'Sandra', 'journeyed', 'to', 'the', 'hallway', '.'],
  ['7:', 'Daniel', 'journeyed', 'to', 'the', 'garden', '.'],
  ['8:', 'Mary', 'journeyed', 'to', 'the', 'bathroom', '.'],
  ['9:', 'John', 'went', 'back', 'to', 'the', 'bathroom', '.'],
  ['10:', 'Sandra', 'travelled', 'to', 'the', 'garden', '.'],
  ['11:', 'John', 'moved', 'to', 'the', 'office', '.'],
  ['12:', 'Daniel', 'went', 'back', 'to', 'the', 'kitchen', '.'],
  ['13:', 'Mary', 'moved', 'to', 'the', 'kitchen', '.'],
  ['14:', 'Mary', 'moved', 'to', 'the', 'hallway', '.'],
  ['15:', 'Mary', 'went', 'to', 'the', 'kitchen', '.'],
  ['16:', 'Sandra', 'went', 'back', 'to', '

In [66]:
# Recompute the dataset limits for the newly loaded task.
stories = train_stories + test_stories
story_maxlen = max(len(sent) for story, _, _ in stories for sent in story)
story_maxsents = max(map(len, (story for story, _, _ in stories)))
query_maxlen = max(map(len, (query for _, query, _ in stories)))

# Rebuild the vocabulary, keeping index 0 for the padding token.
vocab = ['<PAD>'] + sorted(set(flatten(stories)))
vocab_size = len(vocab)

word_idx = {word: idx for idx, word in enumerate(vocab)}

# Encode both splits with the new vocabulary and padding lengths.
inputs_train, queries_train, answers_train = vectorize_stories(
    train_stories, word_idx, story_maxlen, query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(
    test_stories, word_idx, story_maxlen, query_maxlen)

# Pad every story to the same number of sentences and stack each split.
inputs_train = stack_inputs(inputs_train)
inputs_test = stack_inputs(inputs_test)

inps = [inputs_train, queries_train]
val_inps = [inputs_test, queries_test]

inputs_train.shape, inputs_test.shape

((10000, 88, 8), (1000, 88, 8))

In [67]:
# Dimensionality of the word embeddings used by the multi-hop model.
emb_dim = 30

In [103]:
def emb_sent_bow(inp):
    """Bag-of-words sentence embedding with a per-sentence bias; also returns the embedding op.

    This redefinition replaces the earlier single-output `emb_sent_bow`.
    Word embeddings are summed over the word axis (axis 2) and passed through
    `Elemwise(0, False)`, which adds a trainable weight along the sentence
    axis.

    Returns:
        A tuple `(sentence_embeddings, emb_op)` where `emb_op` is the
        `TimeDistributed` wrapper, so callers can reuse its inner `Embedding`.
    """
    emb_op = TimeDistributed(Embedding(vocab_size, emb_dim))
    word_vecs = emb_op(inp)
    sent_vecs = Lambda(lambda t: K.sum(t, 2))(word_vecs)
    biased = Elemwise(0, False)(sent_vecs)
    return biased, emb_op

In [69]:
# Fresh input tensors sized for the new task's story and query dimensions.
inp_story = Input((story_maxsents, story_maxlen))
inp_q = Input((query_maxlen,))

In [108]:
# Embed the story and keep the TimeDistributed wrapper so its Embedding can be reused for the query.
emb_story, emb_story_op = emb_sent_bow(inp_story)

In [109]:
# Embed the query with the same Embedding layer that is wrapped by the story's TimeDistributed op.
emb_q = emb_story_op.layer(inp_q)
# Sum the word vectors over the word axis to get a single query vector.
emb_q = Lambda(lambda x: K.sum(x, 1))(emb_q)

In [110]:
# A single Dense layer instance; `one_hop` applies this same layer on every hop.
h = Dense(emb_dim)

In [111]:
def one_hop(u, A):
    """Run one memory hop: attend over the sentences with `u`, then read out a new state.

    Args:
        u: Current query/state tensor; it is reshaped to `(1, emb_dim)` per sample.
        A: Sentence embeddings that `u` is scored against (dot product over the
            embedding axis).

    Returns:
        A tuple `(x, C)`: the new state, and the freshly created sentence
        embedding `C` that was used for the read-out. The caller passes `C`
        back in as `A` for the next hop.
    """
    # New output embedding of the (global) story input for this hop.
    C, _ = emb_sent_bow(inp_story)
    x = Reshape((1, emb_dim))(u)
    # One score per sentence, normalised with softmax over the sentence axis.
    x = dot([A, x], axes=2)
    x = Reshape((story_maxsents,))(x)
    x = Activation('softmax')(x)
    match = Reshape((story_maxsents,1))(x)

    # Attention-weighted sum of the output embeddings, then the shared Dense layer `h`.
    x = dot([match, C], axes=1)
    x = Reshape((emb_dim,))(x)
    x = h(x)
    # NOTE(review): the residual adds the global `emb_q` (the original query
    # embedding), not this hop's input `u`; the two differ from the second
    # hop on. Confirm this is intended.
    x = add([x, emb_q])
    return x, C

In [112]:
# Two hops: the first starts from the query embedding; the second starts from the first
# hop's output and scores against the embedding the first hop returned.
response, emb_story = one_hop(emb_q, emb_story)
response, emb_story = one_hop(response, emb_story)

In [113]:
# Predict a probability distribution over the vocabulary for the answer word.
res = Dense(vocab_size, activation='softmax')(response)

In [114]:
# Build and compile the multi-hop model (rebinds `answer`, replacing the single-hop model).
# Sparse categorical cross-entropy because the targets are integer word indices.
answer = Model([inp_story, inp_q], res)
answer.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])

In [115]:
# Print the layer-by-layer structure and parameter counts of the model.
answer.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 88, 8)         0                                            
____________________________________________________________________________________________________
input_5 (InputLayer)             (None, 5)             0                                            
____________________________________________________________________________________________________
time_distributed_17 (TimeDistrib (None, 88, 8, 30)     3720        input_4[0][0]                    
____________________________________________________________________________________________________
embedding_19 (Embedding)         (None, 5, 30)         3720        input_5[0][0]                    
___________________________________________________________________________________________

In [116]:
# Override the optimizer's learning rate before training.
K.set_value(answer.optimizer.lr, 5e-3)
# Train for 8 epochs; the test split is used as validation data.
hist=answer.fit(inps, answers_train, verbose=1, epochs=8, batch_size=32,
           validation_data=(val_inps, answers_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [117]:
# Validation accuracy recorded after each epoch.
# NOTE(review): newer Keras releases store this under 'val_accuracy' — confirm against the installed version.
np.array(hist.history['val_acc'])

array([ 0.31 ,  0.678,  0.764,  0.794,  0.804,  0.826,  0.833,  0.825])

In [107]:
class Elemwise(Layer):
    """Layer that adds or multiplies its input by a trainable weight along one axis.

    The weight has size 1 on every non-batch axis except `axis`, where it
    matches the input, so it broadcasts over the remaining axes.

    NOTE: this class must be defined (its cell run) before any function that
    instantiates `Elemwise` is called.

    Args:
        axis: Index into the non-batch dimensions (0 is the first axis after
            the batch axis) along which the weight varies.
        is_mult: If true the input is multiplied by the weight; otherwise the
            weight is added to the input.
        init: Initializer identifier or config, resolved with `initializers.get`.
        **kwargs: Forwarded to the base `Layer` constructor.
    """

    def __init__(self, axis, is_mult, init='glorot_uniform', **kwargs):
        self.init = initializers.get(init)
        self.axis = axis
        self.is_mult = is_mult
        super(Elemwise, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight, shaped to broadcast against the non-batch dims."""
        input_dims = input_shape[1:]
        dims = [1] * len(input_dims)
        dims[self.axis] = input_dims[self.axis]
        self.b = self.add_weight(name='{}_bo'.format(self.name),
                                 shape=dims,
                                 initializer=self.init,)
        self.built = True

    def call(self, x, mask=None):
        """Apply the weight element-wise (multiply or add, per `is_mult`)."""
        return x * self.b if self.is_mult else x + self.b

    def compute_output_shape(self, input_shape):
        """The output has the same shape as the input."""
        return input_shape

    def get_config(self):
        """Return a config from which the layer can be re-created.

        Every constructor argument is included (`is_mult` is required by
        `__init__`), and the initializer is serialized rather than read via
        `__name__`, which initializer instances do not have.
        """
        config = {'init': initializers.serialize(self.init),
                  'axis': self.axis,
                  'is_mult': self.is_mult}
        # Must name this class: `super(Dense, self)` raises TypeError because
        # an Elemwise instance is not a Dense.
        base_config = super(Elemwise, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))