In [0]:
import numpy as np

In [0]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# NOTE(review): no TensorFlow seed is set in the visible cells, so weight
# initialisation and dropout may still differ between runs - confirm if
# full reproducibility is required.
np.random.seed(42)

Download the data

In [0]:
!wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz --quiet

Parse bAbI stories

In [0]:
def parse_stories(lines):
    """Parse bAbI-formatted lines into parallel story/question/answer lists.

    Every line has the form ``"<id> <text>"``. An id of 1 starts a new story.
    A line whose text contains tab characters is a question line of the form
    ``"<question>\\t<answer>[\\t<supporting ids>]"``; all other lines are
    statements that are appended to the current story.

    One (story, question, answer) triple is emitted per question line. The
    story keeps growing after a question, so a later question in the same
    story sees every statement made so far.

    Args:
        lines: Iterable of lines, either ``bytes`` (as yielded by a file
            opened in binary mode, decoded as UTF-8) or ``str``.

    Returns:
        Tuple ``(stories, questions, answers)`` of equally long lists of
        strings. Question text is returned exactly as it appears before the
        first tab (after the whole line has been stripped).

    Raises:
        ValueError: If a non-blank line does not start with an integer id
            followed by a space.
    """
    stories = []
    questions = []
    answers = []

    # Statements of the story currently being read, joined on demand.
    sentences = []
    for raw_line in lines:
        # Accept both binary-mode and text-mode input.
        if isinstance(raw_line, (bytes, bytearray)):
            raw_line = raw_line.decode('utf-8')
        text = raw_line.strip()
        if not text:
            # Blank lines (e.g. a trailing newline at end of file) carry no data.
            continue

        # Get line number and rest of the line
        nid, text = text.split(' ', 1)
        if int(nid) == 1:
            # Start a new story
            sentences = []

        if '\t' in text:
            # Question line: snapshot the story so far. The supporting-fact
            # ids (third field) are not used, so tolerate their absence.
            question, answer = text.split('\t')[:2]
            stories.append(' '.join(sentences))
            questions.append(question)
            answers.append(answer)
        else:
            sentences.append(text)
    return stories, questions, answers

Extract the train and test files

In [0]:
import tarfile

Checking the content of the file

In [0]:
# Peek at the raw file format. Only the first few lines are printed:
# dumping every line of the file would flood the cell output without
# adding information.
with tarfile.open('tasks_1-20_v1-2.tar.gz') as tar:
    f = tar.extractfile('tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt')
    # readline() returns b'' once the file is exhausted, so this is safe
    # even for files shorter than the preview length.
    print([f.readline() for _ in range(10)])

In [0]:
# Parse the task-1 train and test files straight out of the archive.
# Both members are read while the archive is still open; each parse yields
# a (stories, questions, answers) triple.
with tarfile.open('tasks_1-20_v1-2.tar.gz') as tar:
    (
        (train_stories_txt, train_q_txt, train_a_txt),
        (test_stories_txt, test_q_txt, test_a_txt),
    ) = [
        parse_stories(tar.extractfile(member))
        for member in (
            'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt',
            'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt',
        )
    ]

In [0]:
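# Debugging aid: to list the archive members, call tar.getnames() inside one of
# the `with tarfile.open(...)` blocks above (the handle is closed once the block exits).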

In [0]:
# Sanity check: display the first parsed training story.
train_stories_txt[0]

In [0]:
# Sanity check: display the question that belongs to the first training story.
train_q_txt[0]

In [0]:
# Sanity check: display the answer to the first training question.
train_a_txt[0]

# Build Tokenizer

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [0]:
# Single tokenizer shared by stories, questions and answers, so all three
# are encoded with the same word -> index vocabulary.
t = Tokenizer()

In [0]:
# Build the vocabulary from every piece of training text (stories, then
# questions, then answers) in a single pass.
t.fit_on_texts(train_stories_txt + train_q_txt + train_a_txt)

In [0]:
# Extend the vocabulary with the test text as well (stories, then questions,
# then answers), so no test word is left without an index.
t.fit_on_texts(test_stories_txt + test_q_txt + test_a_txt)

In [0]:
# Size of the index space used by the embeddings and the output layer:
# one slot per known word plus one extra, because word indices start at 1
# and index 0 is therefore never assigned to a word.
vocab_size =  len(t.word_index) + 1 #Tokenizer starts with index 1

In [0]:
# Display the vocabulary size computed above.
vocab_size

In [0]:
# Encode the training stories, questions and answers as lists of word indices.
train_stories_seq, train_q_seq, train_a_seq = map(
    t.texts_to_sequences,
    (train_stories_txt, train_q_txt, train_a_txt),
)

In [0]:
# Encode the test stories, questions and answers as lists of word indices.
test_stories_seq, test_q_seq, test_a_seq = map(
    t.texts_to_sequences,
    (test_stories_txt, test_q_txt, test_a_txt),
)

In [0]:
# Longest story (in tokens) across the train and test splits.
story_maxlen = max(map(len, train_stories_seq + test_stories_seq))

In [0]:
# Longest question (in tokens) across the train and test splits.
question_maxlen = max(map(len, train_q_seq + test_q_seq))

In [0]:
# Longest answer (in tokens) across the train and test splits.
answer_maxlen = max(map(len, train_a_seq + test_a_seq))

In [0]:
# Display the maximum story length in tokens.
story_maxlen

In [0]:
# Display the maximum question length in tokens.
question_maxlen

In [0]:
# Display the maximum answer length in tokens.
answer_maxlen

Pad the sequences

In [0]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [0]:
# Pad every training sequence to the fixed length of its kind
# (story / question / answer) so each set forms a rectangular array.
train_stories_seq, train_q_seq, train_a_seq = (
    pad_sequences(sequences, maxlen=width)
    for sequences, width in (
        (train_stories_seq, story_maxlen),
        (train_q_seq, question_maxlen),
        (train_a_seq, answer_maxlen),
    )
)

In [0]:
# Pad every test sequence to the same fixed lengths used for the training data.
test_stories_seq, test_q_seq, test_a_seq = (
    pad_sequences(sequences, maxlen=width)
    for sequences, width in (
        (test_stories_seq, story_maxlen),
        (test_q_seq, question_maxlen),
        (test_a_seq, answer_maxlen),
    )
)

Integer-to-word converter (maps token indices back to words)

In [0]:
# Reverse lookup table: word index -> word (the inverse of t.word_index).
int_to_word = {index: word for word, index in t.word_index.items()}

In [0]:
# Spot check: display the word stored under index 11.
int_to_word[11]

# Define the model layers

In [0]:
from tensorflow.python.keras.models import Sequential, Model

In [0]:
from tensorflow.python.keras.layers import Embedding, Dense, LSTM, Activation, dot, Permute, add, concatenate, Dropout, Input

Define input for story and question

In [0]:
# Model input for a padded story: story_maxlen word indices per sample.
story = Input(shape=(story_maxlen,))

In [0]:
# Model input for a padded question: question_maxlen word indices per sample.
question = Input(shape=(question_maxlen,))

Build 3 encoders to provide 3 Embeddings
1. Input Memory - m_encoder
2. Controller embedding
3. Question embedding

Embedding A for Input memory

In [0]:
# Input-memory encoder (embedding A) applied to the story.
# The embedding width is story_maxlen, the same width the question encoder
# uses, so the two can be compared along the embedding axis in the attention step.
m_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=story_maxlen),
    Dropout(0.3),
])
m_embedded_output = m_encoder(story)
# Resulting shape: batch_size x story_maxlen x story_maxlen (embedding size)

Embedding C for use with Controller

In [0]:
# Controller encoder (embedding C) applied to the story.
# The embedding width is question_maxlen, which matches the last axis of the
# attention weights so the two tensors can be added element-wise later on.
c_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=question_maxlen),
    Dropout(0.3),
])
c_embedded_output = c_encoder(story)
# Resulting shape: batch_size x story_maxlen x question_maxlen (embedding size)

Embedding B for Question

In [0]:
# Question encoder (embedding B) applied to the question.
# It uses the same embedding width (story_maxlen) as the input-memory encoder.
question_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=story_maxlen, input_length=question_maxlen),
    Dropout(0.3),
])
question_embeddding_output = question_encoder(question)
# Resulting shape: batch_size x question_maxlen x story_maxlen (embedding size)

Attention

In [0]:
# Match every story position against every question position by taking the
# dot product of their embeddings along the embedding axis (axis 2 of both).
attention_weights = dot([m_embedded_output, question_embeddding_output], axes=(2, 2))
# Turn the raw match scores into weights with a softmax activation.
attention_weights = Activation('softmax')(attention_weights)
#output is batch_size x story_maxlen x question_maxlen

Calculate weighted_sum (here the attention weights are combined with embedding C using the `add` function)

In [0]:
# Combine the attention weights with the controller embedding (embedding C)
# by element-wise addition; both tensors share the same shape.
weighted_sum = add([attention_weights, c_embedded_output])  
#Output batch_size x story_maxlen x question_maxlen

# Swap the two non-batch axes so the result lines up with the question embedding.
permuted_weighted_sum = Permute((2, 1))(weighted_sum)  
#Output batch_size x question_maxlen x story_maxlen

Add permuted_weighted_sum to the question embedding (for the first hop)

In [0]:
# Element-wise sum of the permuted memory response and the question embedding.
output_1 = add([permuted_weighted_sum, question_embeddding_output])
#Output batch_size x question_maxlen x story_maxlen

Output using LSTM

In [0]:
# Reduce the combined sequence to a single 32-unit vector with an LSTM.
answer = LSTM(32)(output_1)
#Last hidden state - batch_size x 32

In [0]:
# Regularise the LSTM output with a dropout rate of 0.3.
answer = Dropout(0.3)(answer)

FC Layer to predict answer using SoftMax

In [0]:
# Project onto one score per vocabulary index ...
answer = Dense(vocab_size)(answer)
# ... and normalise the scores into a probability distribution over the vocabulary.
answer = Activation('softmax')(answer)
#Output batch_size x vocab_size

# Build the model

In [0]:
# Assemble the full network: two inputs (story, question) -> answer distribution.
model = Model([story, question], answer)

In [0]:
# The sparse categorical loss is used because the targets are integer word
# indices (the padded answer sequences), not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

Train the model

In [0]:
# Train on the padded (story, question) pairs against the answer indices.
# The test split is passed as validation data, so the reported validation
# metrics are computed on the test set after every epoch.
# NOTE(review): epochs=200 is a fixed value with no early stopping in the
# visible cells - confirm this is intended.
model.fit([train_stories_seq, train_q_seq], train_a_seq,
          batch_size=32,
          epochs=200,
          validation_data=([test_stories_seq, test_q_seq], test_a_seq))

In [0]:
import os

# Nothing in this notebook creates the 'models' directory, and saving to a
# single file can fail when the parent directory is missing, so create it
# up front (a no-op if it already exists).
os.makedirs('models', exist_ok=True)

# Persist the trained model to disk.
model.save('models/babi_memn2n_task_1.hd5')

# Model Prediction

In [0]:
# Index of the test example used for the prediction demo below.
test_num = 885

In [0]:
# Take the padded story and question sequences of the chosen test example and
# add a leading batch axis of size 1, since the model expects batched inputs.
story_seq_ex = np.expand_dims(test_stories_seq[test_num], axis=0)
question_seq_ex = np.expand_dims(test_q_seq[test_num], axis=0)

# Predict: one probability per vocabulary index for the single example.
answer_probs = model.predict([story_seq_ex, question_seq_ex])

# Index with the highest probability, as a plain Python int.
predicted_index = int(np.argmax(answer_probs))

# Convert index to word. The output layer has a slot for index 0, but word
# indices start at 1, so index 0 has no entry in int_to_word; fall back to a
# placeholder instead of raising KeyError if the model ever picks it.
result = int_to_word.get(predicted_index, '<unknown>')

In [0]:
# Show the example's story and question together with the predicted answer,
# each under its own heading line.
print('Story : ', test_stories_txt[test_num], sep='\n')
print('Question : ', test_q_txt[test_num], sep='\n')
print('Answer : ', result, sep='\n')

### Models Learned

1.   Linear Regression -> y = wX + b
2.   Linear Classification -> y= softmax(xW+b) - 92%
3.   Dense or Fully Connected Layers network -> 97%
4.   CNN -> 99%
5.   Word2Vec
6.   RNN/LSTM/GRU
7.   Char-RNN -> Language Modeling
8.   Seq2Seq Model 
9.   Seq2Seq using Attention
10. Memory Networks



### Techniques to improve Model

1.   Dropout
2.   Batching
3.   Number of iterations
4.   Learning Rate
5.   Number of hidden layers
6.   Neurons in each layer
7.   Normalize the data
8.   Optimizers - SGD, Adam, Adadelta
9.   Activation functions - ReLU
10. CNN - Filter size, stride, padding, pooling
11. RNN - Memory units

