**Loading the Data**

In [1]:
import pickle
import numpy as np

In [2]:
with open("train_qa.txt", "rb") as fp: 
    train_data =  pickle.load(fp)

In [3]:
with open("test_qa.txt", "rb") as fp: 
    test_data =  pickle.load(fp)

----

**display data format**

In [4]:
type(test_data)

list

In [8]:
type(train_data)

list

In [9]:
len(test_data)

1000

In [10]:
len(train_data)

10000

In [14]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [7]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [8]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [5]:
[story, question , answer] = train_data[0]

In [19]:
story

['Mary',
 'moved',
 'to',
 'the',
 'bathroom',
 '.',
 'Sandra',
 'journeyed',
 'to',
 'the',
 'bedroom',
 '.']

In [6]:
question

['Is', 'Sandra', 'in', 'the', 'hallway', '?']

In [21]:
answer

'no'

** extract vocabulary from all train and test data **

In [4]:
# Create a set that holds the vocab words
vocab = set()

In [5]:
all_data = test_data + train_data

In [6]:
for story, question , answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [7]:
vocab.add('no')
vocab.add('yes')

In [8]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [9]:
vocab_len = len(vocab) + 1  # add an extra space to hold a 0 for Keras's pad_sequences

In [10]:
vocab_len

38

In [31]:
# [data[0] for data in all_data] # display all story 

In [33]:
# [len (data[0]) for data in all_data] # display length of all story 

In [11]:
max_story_len = max([len(data[0]) for data in all_data])

In [16]:
max_story_len

156

In [12]:
max_question_len = max([len(data[1]) for data in all_data])

In [18]:
max_question_len

6

**Vectorizing the Data**

In [26]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [13]:
# Reserve 0 for pad_sequences
vocab_size = len(vocab) + 1

In [14]:
vocab_size

38

-----------

In [20]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [21]:
# integer encode sequences of words
#tokenizer = Tokenizer(
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [22]:
tokenizer.word_index

{'no': 1,
 'mary': 2,
 'dropped': 3,
 'is': 4,
 'picked': 5,
 'back': 6,
 '?': 7,
 'football': 8,
 'daniel': 9,
 'got': 10,
 'journeyed': 11,
 'the': 12,
 'put': 13,
 'garden': 14,
 'left': 15,
 'hallway': 16,
 'down': 17,
 'moved': 18,
 'yes': 19,
 'grabbed': 20,
 'to': 21,
 'sandra': 22,
 'travelled': 23,
 'up': 24,
 'took': 25,
 'milk': 26,
 'went': 27,
 'kitchen': 28,
 'office': 29,
 'john': 30,
 '.': 31,
 'there': 32,
 'apple': 33,
 'bathroom': 34,
 'discarded': 35,
 'bedroom': 36,
 'in': 37}

In [23]:
tokenizer.word_index["put"]

13

In [24]:
train_story_text = []
train_question_text = []
train_answers = []

for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [25]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [26]:
train_question_seq = tokenizer.texts_to_sequences(train_question_text)

In [27]:
len(train_story_text)

10000

In [37]:
train_story_text[0]

['Mary',
 'moved',
 'to',
 'the',
 'bathroom',
 '.',
 'Sandra',
 'journeyed',
 'to',
 'the',
 'bedroom',
 '.']

In [38]:
train_story_seq[0]

[8, 5, 21, 35, 10, 36, 7, 32, 21, 35, 3, 36]

In [26]:
train_answers[0]

'no'

In [51]:
len(train_story_seq)

10000

In [28]:
word_index = tokenizer.word_index

In [29]:
word_index["no"]

1

In [30]:
train_story_seq[:2]

[[2, 18, 21, 12, 34, 31, 22, 11, 21, 12, 36, 31],
 [2,
  18,
  21,
  12,
  34,
  31,
  22,
  11,
  21,
  12,
  36,
  31,
  2,
  27,
  6,
  21,
  12,
  36,
  31,
  9,
  27,
  6,
  21,
  12,
  16,
  31]]

In [42]:
len(train_question_seq)

10000

In [43]:
train_question_seq[0]

[25, 7, 12, 35, 6, 29]

**vectorize story, question and answer**

In [31]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len):
   # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []
    
    
    for story, query, answer in data:
        
        # Grab the word index for every word in story
        x = [word_index[word.lower()] for word in story]
        # Grab the word index for every word in query
        xq = [word_index[word.lower()] for word in query]
        
        # Grab the Answers (either Yes/No so we don't need to use list comprehension here)
        # Index 0 is reserved so we're going to use + 1
        y = np.zeros(len(word_index) + 1)
        
        # Now that y is all zeros and we know its just Yes/No , we can use numpy logic to create this assignment
        #
        y[word_index[answer]] = 1
        
        # Append each set of story,query, and answer to their respective holding lists
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    # Finally, pad the sequences based on their max length so the RNN can be trained on uniformly long sequences.
        
    # RETURN TUPLE FOR UNPACKING
    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [32]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

**understand format of vectorized train and test data sets**

In [33]:
inputs_train.shape

(10000, 156)

In [34]:
inputs_train[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  2, 18, 21, 12, 34, 31, 22, 11, 21,
       12, 36, 31], dtype=int32)

In [50]:
train_story_seq[0]

[8, 5, 21, 35, 10, 36, 7, 32, 21, 35, 3, 36]

In [71]:
queries_train.shape

(10000, 6)

In [72]:
queries_train[0]

array([30, 36, 13, 31,  5, 28], dtype=int32)

In [77]:
train_question_seq[0]

[30, 36, 13, 31, 5, 28]

In [78]:
len(answers_train)

10000

In [79]:
answers_train[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.])

In [80]:
train_answers[0]

'no'

In [35]:
tokenizer.word_index["no"]

1

In [91]:
len(answers_train[0])

38

In [36]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [93]:
inputs_test[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  8, 20, 31, 17, 21,  6, 37, 11,  7,
       31, 32,  6], dtype=int32)

In [94]:
queries_test

array([[30, 37, 13, 31, 16, 28],
       [30, 37, 13, 31, 16, 28],
       [30, 37, 13, 31,  1, 28],
       ...,
       [30,  8, 13, 31, 32, 28],
       [30, 36, 13, 31,  1, 28],
       [30,  8, 13, 31,  1, 28]], dtype=int32)

In [95]:
answers_test[2]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0.])

In [52]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
       503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [37]:
tokenizer.word_index['yes']

19

In [38]:
tokenizer.word_index['no']

1

**Creating the Model**

In [39]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM
from keras.utils import plot_model

**Placeholders for Inputs**

In [40]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [57]:
question.shape

TensorShape([None, 6])

In [58]:
input_sequence.shape

TensorShape([None, 156])

**Building the Networks**

**Encoders**

**Input Encoder**

In [41]:
# Input gets embedded to a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

In [60]:
input_encoder_m.input

<tf.Tensor 'embedding_input:0' shape=(None, None) dtype=float32>

In [61]:
input_encoder_m.output

<tf.Tensor 'dropout/cond/Identity:0' shape=(None, None, 64) dtype=float32>

In [42]:
vocab_size

38

**Input Encoder**

In [43]:
# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

In [64]:
input_encoder_c.input

<tf.Tensor 'embedding_1_input:0' shape=(None, None) dtype=float32>

In [65]:
input_encoder_c.output

<tf.Tensor 'dropout_1/cond/Identity:0' shape=(None, None, 6) dtype=float32>

**Question Encoder**

In [44]:
# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=max_question_len))
question_encoder.add(Dropout(0.3))
# output: (samples, query_maxlen, embedding_dim)

In [67]:
question_encoder.input

<tf.Tensor 'embedding_2_input:0' shape=(None, 6) dtype=float32>

In [68]:
question_encoder.output

<tf.Tensor 'dropout_2/cond/Identity:0' shape=(None, 6, 64) dtype=float32>

**Encode the Sequences**

In [45]:
# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [70]:
input_encoded_m.shape

TensorShape([None, 156, 64])

In [71]:
input_encoded_c.shape

TensorShape([None, 156, 6])

In [72]:
question_encoded.shape

TensorShape([None, 6, 64])

**Use dot product to compute the match between first input vector seq and the query**

In [46]:
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
#match = Activation('softmax')(match)

In [47]:
# shape: `(samples, story_maxlen, query_maxlen)`
match.shape
#match = Activation('softmax')(match)

TensorShape([None, 156, 6])

In [48]:
# shape: `(samples, story_maxlen, query_maxlen)`
#match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [49]:
match.shape

TensorShape([None, 156, 6])

In [50]:
response = add([match, input_encoded_c]) 
response.shape

TensorShape([None, 156, 6])

**Add this match matrix with the second input vector sequence**

In [51]:
# add the match matrix with the second input vector sequence
#response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)
response.shape

TensorShape([None, 6, 156])

In [52]:
question_encoded.shape

TensorShape([None, 6, 64])

**Concatenate**

In [53]:
# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])

In [54]:
answer.shape

TensorShape([None, 6, 220])

In [55]:
# Reduce with RNN (LSTM)
answer = LSTM(32)(answer)  # (samples, 32)

In [136]:
answer.shape

TensorShape([None, 32])

In [56]:
# Regularization with Dropout
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)

In [138]:
answer.shape

TensorShape([None, 38])

In [57]:
answer = Activation('softmax')(answer)
answer.shape

TensorShape([None, 38])

In [60]:
# we output a probability distribution over the vocabulary
#answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [62]:
#plot_model(model, "my_first_model_with_shape_info.png", show_shapes=True)

In [63]:
#plot_model(model, "my_first_model.png")

In [64]:
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 156)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, None, 64)     2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
_______________________________________________________________________________________

In [65]:
# train, set epochs to 100  for a good accuracy 
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=3,validation_data=([inputs_test, queries_test], answers_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


**Saving the Model**

In [66]:
filename = 'epoch3.h5'
model.save(filename)

**Evaluating the Model**

**Plotting Out Training History**

In [68]:
#import matplotlib.pyplot as plt
#%matplotlib inline
#print(history.history.keys())
# summarize history for accuracy
#plt.plot(history.history['acc'])
#plt.plot(history.history['val_acc'])
#plt.title('model accuracy')
#plt.ylabel('accuracy')
#plt.xlabel('epoch')
#plt.legend(['train', 'test'], loc='upper left')
#plt.show()

**Evaluating on Given Test Set**

In [67]:
model.load_weights(filename)
pred_results = model.predict(([inputs_test, queries_test]))

In [68]:
test_data[0][0]

['Mary',
 'got',
 'the',
 'milk',
 'there',
 '.',
 'John',
 'moved',
 'to',
 'the',
 'bedroom',
 '.']

In [69]:
story =' '.join(word for word in test_data[0][0])
print(story)

Mary got the milk there . John moved to the bedroom .


In [70]:
query = ' '.join(word for word in test_data[0][1])
print(query)

Is John in the kitchen ?


In [71]:
print("True Test Answer from Data is:",test_data[0][2])

True Test Answer from Data is: no


In [72]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  no
Probability of certainty was:  0.9999975


**Writing Your Own Stories and Questions**
Remember you can only use words from the existing vocab

In [73]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [74]:
# Note the whitespace of the periods
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'the',
 'football',
 'in',
 'the',
 'garden',
 '.']

In [75]:
my_question = "Is the football in the garden ?"

In [76]:
my_question.split()

['Is', 'the', 'football', 'in', 'the', 'garden', '?']

In [77]:
mydata = [(my_story.split(),my_question.split(),'yes')]

In [78]:
my_story,my_ques,my_ans = vectorize_stories(mydata)

In [79]:
pred_results = model.predict(([ my_story, my_ques]))

In [80]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  yes
Probability of certainty was:  0.9993863
