In [1]:
import pickle
import numpy as np


In [2]:
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)
    

In [4]:
type(train_data),type(test_data)

(list, list)

In [6]:
len(train_data),len(test_data)

(10000, 1000)

In [10]:
train_data[0], len(train_data[0])

((['Mary',
   'moved',
   'to',
   'the',
   'bathroom',
   '.',
   'Sandra',
   'journeyed',
   'to',
   'the',
   'bedroom',
   '.'],
  ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
  'no'),
 3)

In [12]:
print(' '.join(train_data[0][0]))
print(' '.join(train_data[0][1]))
print(train_data[0][2])

Mary moved to the bathroom . Sandra journeyed to the bedroom .
Is Sandra in the hallway ?
no


In [13]:
all_data = test_data + train_data

In [14]:
len(all_data)

11000

In [15]:
set(train_data[0][0])

{'.',
 'Mary',
 'Sandra',
 'bathroom',
 'bedroom',
 'journeyed',
 'moved',
 'the',
 'to'}

In [17]:
vocab = set()
for story,question,answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [19]:
vocab.add('no')
vocab.add('yes')

In [20]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [21]:
vocab_length = len(vocab) + 1

In [22]:
vocab_length

38

In [25]:
all_data[0][0]

['Mary',
 'got',
 'the',
 'milk',
 'there',
 '.',
 'John',
 'moved',
 'to',
 'the',
 'bedroom',
 '.']

In [27]:
all_story_lens=[len(data[0]) for data in all_data]

In [31]:
max_story_len=max(all_story_lens)
max_story_len# 156

156

In [30]:
max_question_len = max([len(data[1])for data in all_data])
max_question_len

6

In [33]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [34]:
tokenizer = Tokenizer(filters=[])

In [35]:
tokenizer.fit_on_texts(vocab)

In [38]:
tokenizer.word_index

{'apple': 1,
 'is': 2,
 'bathroom': 3,
 'got': 4,
 'yes': 5,
 'bedroom': 6,
 'journeyed': 7,
 'there': 8,
 'in': 9,
 'john': 10,
 'milk': 11,
 'the': 12,
 'went': 13,
 'picked': 14,
 'left': 15,
 'to': 16,
 '?': 17,
 'daniel': 18,
 'office': 19,
 'kitchen': 20,
 'up': 21,
 'no': 22,
 'discarded': 23,
 'sandra': 24,
 'hallway': 25,
 'moved': 26,
 'travelled': 27,
 'took': 28,
 'mary': 29,
 'garden': 30,
 'down': 31,
 'grabbed': 32,
 'put': 33,
 '.': 34,
 'back': 35,
 'dropped': 36,
 'football': 37}

In [39]:
train_story_text = []
train_question_text = []
train_answers = []

In [40]:
for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [42]:
#train_story_text

In [43]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [50]:
len(train_story_seq),len(train_story_text)

(10000, 10000)

In [83]:
len(tokenizer.word_index)

37

In [84]:
def vectorize_stories(data,word_index=tokenizer.word_index,max_story_len = max_story_len,max_question_len= max_question_len):
    # STORIES X
    X = []
    # QUESTION Xq
    Xq = []
    # correct answer (yes/no)
    Y = []
    
    for story,query,answer in data:
        #  for each story [29,15,.....]
        x=[word_index[word.lower()] for word in story]
        # for each question
        xq = [word_index[word.lower()] for word in query]
        
        y = np.zeros(len(word_index)+1)
        
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    X = pad_sequences(X,maxlen=max_story_len)
    Xq = pad_sequences(Xq,maxlen=max_question_len)
    Y = np.array(Y)
    
    return (X,Xq,Y)
        

In [85]:
inputs_train,queries_train,answers_train = vectorize_stories(train_data)

In [86]:
inputs_test,queries_test,answers_test = vectorize_stories(test_data)

In [87]:
inputs_test

array([[ 0,  0,  0, ..., 12,  6, 34],
       [ 0,  0,  0, ..., 12, 30, 34],
       [ 0,  0,  0, ..., 12, 30, 34],
       ...,
       [ 0,  0,  0, ..., 12,  1, 34],
       [ 0,  0,  0, ..., 12, 30, 34],
       [ 0,  0,  0, ...,  1,  8, 34]])

In [88]:
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [89]:
tokenizer.word_index['yes'],tokenizer.word_index['no']

(5, 22)

In [90]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
       503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [91]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Input,Activation,Dense,Permute,Dropout,add,dot,concatenate,LSTM

In [92]:
#place holder shape (max_story_len,batch_size)
input_sequence = Input(shape=(max_story_len,))
question = Input(shape = (max_question_len,))

In [93]:
vocab_size = len(vocab) + 1

In [98]:
# INPUT ENCODER M

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))



# ( samples,stories_maxlen,embedding_dim)

In [95]:
# INPUT ENCODER C

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))


# ( samples,stories_maxlen,max_question_len)

In [97]:
# Question encoder

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,output_dim=64,input_length=max_question_len))
question_encoder.add(Dropout(0.3))


# ( samples,query_maxlen,embedding_dim)

In [99]:
# Encoded <--- encoder(input)

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [100]:
match = dot([input_encoded_m,question_encoded],axes=(2,2))
match = Activation('softmax')(match)

In [101]:
response = add([match,input_encoded_c])
response = Permute((2,1))(response)