In [55]:
import pickle
import numpy as np

In [56]:
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [57]:
with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [58]:
type(test_data)

list

In [59]:
type(train_data)

list

In [60]:
len(train_data)

10000

In [61]:
len(test_data)

1000

In [63]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [68]:
all_data = train_data + test_data

In [72]:
vocab = set()

for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))
vocab.add('no')
vocab.add('yes')

In [78]:
vocab_len = len(vocab) + 1
vocab_len

38

In [75]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [79]:
# Longest Story
all_story_lens = [len(data[0]) for data in all_data]

In [81]:
max_story_len = max(all_story_lens)
max_story_len

156

In [83]:
all_questions_lens = [len(data[1]) for data in all_data]

In [84]:
max_question_len = max(all_questions_lens)
max_question_len

6

In [85]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [90]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [91]:
tokenizer.word_index

{'is': 1,
 'no': 2,
 'milk': 3,
 'dropped': 4,
 'there': 5,
 'journeyed': 6,
 'went': 7,
 '?': 8,
 'mary': 9,
 '.': 10,
 'moved': 11,
 'put': 12,
 'in': 13,
 'down': 14,
 'apple': 15,
 'grabbed': 16,
 'up': 17,
 'yes': 18,
 'sandra': 19,
 'bathroom': 20,
 'got': 21,
 'john': 22,
 'garden': 23,
 'travelled': 24,
 'discarded': 25,
 'kitchen': 26,
 'the': 27,
 'office': 28,
 'hallway': 29,
 'back': 30,
 'football': 31,
 'picked': 32,
 'took': 33,
 'bedroom': 34,
 'left': 35,
 'to': 36,
 'daniel': 37}

In [94]:
train_story_text = []
train_question_text = []
train_answer_text = []

In [99]:
for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answer_text.append(answer)

In [112]:
def vectorize_stories(data, word_index=tokenizer.word_index, 
                      max_story_len=max_story_len, max_question_len=max_question_len):
    # Stories = X
    X = []
    # Questions = Xq
    Xq = []
    # Answers = Y
    Y = []
    
    for story, query, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [113]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [114]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [115]:
inputs_test

array([[ 0,  0,  0, ..., 27, 34, 10],
       [ 0,  0,  0, ..., 27, 23, 10],
       [ 0,  0,  0, ..., 27, 23, 10],
       ...,
       [ 0,  0,  0, ..., 27, 15, 10],
       [ 0,  0,  0, ..., 27, 23, 10],
       [ 0,  0,  0, ..., 15,  5, 10]], dtype=int32)

In [116]:
answers_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [118]:
tokenizer.word_index['yes']

18

In [119]:
tokenizer.word_index['no']

2

In [117]:
sum(answers_test)

array([  0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [121]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [129]:
input_sequence = Input((max_story_len, ))
question = Input((max_question_len, ))

In [124]:
vocab_size = len(vocab) + 1

In [125]:
#Input Encoder M
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

#Output:
#(samples, story_maxlen, embedding_dim)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [126]:
#Input Encoder C
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

#Output:
#(samples, story_maxlen, max_qustion_len)

In [127]:
#Input Question Encoder
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

#Output:
#(samples, query_maxlen, embedding_dim)

In [132]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [133]:
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)