# Load dataset from disk

In [1]:
import json

def data_from_json(filename):
    """Load and return the parsed contents of a JSON file.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's contents.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which can fail on non-ASCII text in the file.
    with open(filename, encoding='utf-8') as data_file:
        return json.load(data_file)

# Parse both dataset splits; the paths are relative to the working directory.
train_data, dev_data = map(
    data_from_json,
    ('datasets/train-v1.1.json', 'datasets/dev-v1.1.json'),
)

### Check an instance from the dataset to see what it actually looks like!

In [2]:
# Look at the first paragraph of the first training article.
val = train_data['data'][0]
para = val['paragraphs']
first_paragraph = para[0]
for label, key in (('context: ', 'context'), ('qas: ', 'qas')):
    print(label, first_paragraph[key])

context:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
qas:  [{'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}], 'id': '5733be284776f41900661182'}, {'question': 'What is in front of the Notre Dame Main Building?', 'answers': [{'answer_sta

### Calculate the number of samples present in training and dev data. Play around! 

In [3]:
def total_examples(data):
    """Count the questions in a dataset and print paragraph/article totals.

    Walks ``data['data']`` (articles), each article's ``'paragraphs'`` and
    each paragraph's ``'qas'``. Prints the number of paragraphs and articles
    and returns the number of questions.
    """
    articles = data['data']
    paragraphs = [paragraph for article in articles for paragraph in article['paragraphs']]
    question_count = sum(len(paragraph['qas']) for paragraph in paragraphs)
    print('num_paras:', len(paragraphs), 'num_articles:', len(articles))
    return question_count

# Report the question count of each split (total_examples also prints
# the paragraph and article counts before returning).
for label, dataset in (('train_data:', train_data), ('dev_data:', dev_data)):
    print(label, total_examples(dataset))

num_paras: 18896 num_articles: 442
train_data: 87599
num_paras: 2067 num_articles: 48
dev_data: 10570


In [4]:
def list_topics(data):
    """Return the ``'title'`` of every article in ``data['data']``, in order."""
    return [article['title'] for article in data['data']]

# Show the first 25 article titles of the training split.
first_topics = list_topics(train_data)[:25]
print(first_topics)

['University_of_Notre_Dame', 'Beyoncé', 'Montana', 'Genocide', 'Antibiotics', 'Frédéric_Chopin', 'Sino-Tibetan_relations_during_the_Ming_dynasty', 'IPod', 'The_Legend_of_Zelda:_Twilight_Princess', 'Spectre_(2015_film)', '2008_Sichuan_earthquake', 'New_York_City', 'To_Kill_a_Mockingbird', 'Solar_energy', 'Tajikistan', 'Anthropology', 'Portugal', 'Kanye_West', 'Buddhism', 'American_Idol', 'Dog', '2008_Summer_Olympics_torch_relay', 'Alfred_North_Whitehead', 'Financial_crisis_of_2007%E2%80%9308', 'Saint_Barth%C3%A9lemy']


#### We reduce the size of the dataset so that we can train our models easily, even with limited computational resources. This also gives us a lot of flexibility to try more models faster, though obviously at the cost of performance!



In [5]:
# Output containers for the training subset (filled in by segregate_lists).
train_context, train_questions, train_answer, train_answer_span = [], [], [], []

# Output containers for the dev subset.
dev_context, dev_questions, dev_answer, dev_answer_span = [], [], [], []

# Running total of answers whose recorded span does not reproduce the answer text.
incorrect_ans = 0


def segregate_lists(data, num, list_context, list_questions, list_answer, list_answer_span):
    """Flatten the first `num` articles of `data` into the supplied lists.

    Parameters
    ----------
    data : dict
        Dataset with a ``'data'`` list of articles; each article has
        ``'paragraphs'``, each paragraph a ``'context'`` string and a
        ``'qas'`` list.
    num : int
        Number of leading articles to process (``data['data'][:num]``).
    list_context : list
        Receives one context string per paragraph.
    list_questions, list_answer, list_answer_span : list
        Receive, per question, the question text, the text of its first
        answer, and ``[start, end]`` with ``end = start + len(answer text)``.

    Returns
    -------
    int
        Number of answers in this call whose text differs from
        ``context[start:end]``. The same amount is added to the module-level
        ``incorrect_ans`` counter. Each mismatch is also printed (answer text,
        then the text found at the span).

    Notes
    -----
    Contexts are appended once per paragraph while the other three lists get
    one entry per question, so ``list_context`` is not index-aligned with them.
    Only the first entry of ``qa['answers']`` is used.
    """
    # Without this declaration the `+=` below would make `incorrect_ans` a
    # local name and raise UnboundLocalError on the first mismatch.
    global incorrect_ans

    mismatches = 0
    for article in data['data'][:num]:
        for para in article['paragraphs']:
            context = para['context']
            list_context.append(context)
            for qa in para['qas']:
                list_questions.append(qa['question'])

                first_answer = qa['answers'][0]
                ans_text = first_answer['text']
                start = first_answer['answer_start']
                end = start + len(ans_text)

                list_answer.append(ans_text)
                list_answer_span.append([start, end])

                # Check that the span actually corresponds to the given answer.
                span_text = context[start:end]
                if ans_text != span_text:
                    mismatches += 1
                    print(ans_text)
                    print(span_text)

    incorrect_ans += mismatches
    return mismatches

# Training subset: first 50 articles.
segregate_lists(train_data, 50, train_context, train_questions, train_answer, train_answer_span)
print(*map(len, (train_context, train_questions, train_answer, train_answer_span)))
print(incorrect_ans)

# Dev subset: first 20 articles.
segregate_lists(dev_data, 20, dev_context, dev_questions, dev_answer, dev_answer_span)
print(*map(len, (dev_context, dev_questions, dev_answer, dev_answer_span)))

2694 12723 12723 12723
0
888 4913 4913 4913


#### Find the longest context and the longest question by number of words. These values will later be used in preparing input data for the Embedding layer.

In [32]:
import string

# Translation table that deletes every character in string.punctuation.
_punct_table = str.maketrans('', '', string.punctuation)


def _word_count(text):
    """Return the number of whitespace-separated words in `text` after removing punctuation."""
    return len(text.translate(_punct_table).split())


# Select the longest texts by word count. Using key=len would compare the
# number of characters instead, and the text with the most characters is not
# necessarily the one with the most words.
context_maxlen_str = max(train_context + dev_context, key=_word_count).translate(_punct_table)
ques_maxlen_str = max(train_questions + dev_questions, key=_word_count).translate(_punct_table)

# NOTE(review): the Keras Tokenizer used later may split text differently from
# this punctuation-strip + whitespace split, so tokenised sequences could still
# be longer than these values - confirm against the tokenised lengths.
context_maxlen = len(context_maxlen_str.split())
ques_maxlen = len(ques_maxlen_str.split())
print('max length of context: ', context_maxlen, 'words.\nmax length of question: ', ques_maxlen , 'words')


max length of context:  629 words.
max length of question:  29 words


### How to use pre-trained word embeddings in the embedding layer of our model? 
1) Convert all text samples in the dataset into sequences of word indices. Word index = integer ID for the word.
2) Construct an "embedding matrix" which will contain at index i the embedding vector for the word of index i in our word index.
3) Load this embedding matrix into a Keras Embedding layer

The Tokenizer class in Keras allows us to vectorize a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or a vector whose coefficient for each token can be binary, based on word count, based on tf-idf, and so on.

In [31]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
# The vocabulary is fitted on every context and question from both splits.
all_texts = train_context + dev_context + train_questions + dev_questions
tokenizer.fit_on_texts(all_texts)

# word_index: dictionary mapping words (str) to their rank/index (int).
# (tokenizer.word_counts maps words to the number of times they appeared during fit.)
word_index = tokenizer.word_index
print('%s unique tokens.' % len(word_index))

# Turn each text into a sequence of integers.
(train_context_sequences,
 train_ques_sequences,
 dev_context_sequences,
 dev_ques_sequences) = [
    tokenizer.texts_to_sequences(texts)
    for texts in (train_context, train_questions, dev_context, dev_questions)
]

# pad_sequences makes all sequences in a list the same length.
train_context_data, dev_context_data = [
    pad_sequences(sequences, maxlen=context_maxlen)
    for sequences in (train_context_sequences, dev_context_sequences)
]
train_ques_data, dev_ques_data = [
    pad_sequences(sequences, maxlen=ques_maxlen)
    for sequences in (train_ques_sequences, dev_ques_sequences)
]

print(dev_ques_data.shape)
print(dev_context_data.shape)

32061 unique tokens.
(4913, 29)
(888, 629)


### Preparing the embedding layer
We compute an index, mapping words to known embeddings, by parsing the data dump of pre-trained embeddings

In [41]:
# Length of each embedding vector (the original comment calls this the GloVe dimension).
EMBEDDING_DIM = 100

# NOTE(review): `np` and `embeddings_index` are not defined in any cell shown
# here; they must come from an earlier cell for this one to run - confirm.

# Row i holds the embedding of the word whose index is i; the extra row
# accounts for index 0, which word_index does not use here (indices come
# straight from word_index.items()).
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, row in word_index.items():
    vector = embeddings_index.get(word)
    if vector is None:
        # Words missing from the embedding index keep their all-zero row.
        continue
    embedding_matrix[row] = vector

#### Load this embedding matrix into an Embedding layer. We set trainable=False to prevent the weights from being updated during training.



In [43]:
from keras.layers import Embedding

def _pretrained_embedding(sequence_length):
    """Build a non-trainable Embedding layer initialised from `embedding_matrix`.

    `sequence_length` is passed as the layer's `input_length`.
    """
    return Embedding(
        len(word_index) + 1,          # input_dim: vocab_size
        EMBEDDING_DIM,                # size of the output vectors from this layer
        weights=[embedding_matrix],
        input_length=sequence_length,
        trainable=False,
    )


# One layer per input type; they differ only in the expected sequence length.
ques_embedding_layer = _pretrained_embedding(ques_maxlen)
context_embedding_layer = _pretrained_embedding(context_maxlen)

# Building the QA model

In [137]:
# Model inputs: the passage (P) and the question (Q). The shapes declare
# sequences of EMBEDDING_DIM-sized vectors, i.e. already-embedded tokens.
# NOTE(review): Input, Bidirectional, LSTM, TimeDistributed, Dense, Lambda,
# RepeatVector, Flatten, Activation, concatenate, multiply, keras, K and np
# are not imported in the cells shown here - confirm where they come from.
P = Input(shape=(context_maxlen, EMBEDDING_DIM), name='P')
Q = Input(shape=(ques_maxlen, EMBEDDING_DIM), name='Q')
# Width used for the LSTM units and for the Dense layers below.
W = 28
# NOTE(review): passage_input / question_input are not used again in the visible code.
passage_input = P
question_input = Q
# One encoder instance, applied to both the passage and the question below.
encoder = Bidirectional(LSTM(units=W,return_sequences=True))

# Passage branch: encode, then apply a bias-free Dense(W) at every position.
passage_encoding = P
passage_encoding = encoder(passage_encoding)
# NOTE(review): np.concatenate((np.eye(W), np.eye(W)), axis=1) has shape
# (W, 2*W) and is passed directly rather than inside a list; confirm that this
# initial value matches the Dense kernel's shape and is actually applied.
passage_encoding = TimeDistributed(Dense(W, use_bias=False, trainable=True, weights=np.concatenate((np.eye(W), np.eye(W)), axis=1)))(passage_encoding)

# Question branch: same encoder, followed by its own Dense(W) projection
# (same review note about `weights` applies).
question_encoding = Q
question_encoding = encoder(question_encoding)
question_encoding = TimeDistributed(Dense(W, use_bias=False, trainable=True, weights=np.concatenate((np.eye(W), np.eye(W)), axis=1)))(question_encoding)

# Attention over the question: one score per position, softmax along axis 1
# (presumably the question-position axis - confirm).
question_attention_vector = TimeDistributed(Dense(1))(question_encoding)
question_attention_vector = Lambda(lambda q: keras.activations.softmax(q, axis=1))(question_attention_vector)

# Weight the question encoding by the attention scores and sum over axis 1,
# then repeat the resulting vector context_maxlen times.
question_attention_vector = Lambda(lambda q: q[0] * q[1])([question_encoding, question_attention_vector])
question_attention_vector = Lambda(lambda q: K.sum(q, axis=1))(question_attention_vector)
question_attention_vector = RepeatVector(context_maxlen)(question_attention_vector)

# Features for the start prediction: passage encoding, repeated question
# vector, and the result of multiply() on the two.
answer_start = Lambda(lambda arg: concatenate([arg[0], arg[1], arg[2]]))([
            passage_encoding,
            question_attention_vector,
            multiply([passage_encoding, question_attention_vector])])

# Per-position scoring head, flattened and passed through a softmax.
answer_start = TimeDistributed(Dense(W, activation='relu'))(answer_start)
answer_start = TimeDistributed(Dense(1))(answer_start)
answer_start = Flatten()(answer_start)
answer_start = Activation('softmax')(answer_start)

# Answer end prediction depends on the start prediction
def s_answer_feature(x):
    """Return the result of ``K.argmax`` over axis 1 of `x`."""
    return K.argmax(x, axis=1)

# Index of the highest-scoring start position for each example, cast to int32.
# NOTE(review): `K.tf` assumes the Keras backend module exposes TensorFlow as
# an attribute - confirm for the installed Keras version.
x = Lambda(lambda x: K.tf.cast(s_answer_feature(x), dtype=K.tf.int32))(answer_start)
# Build (row number, start index) pairs and use gather_nd to pick, for each
# example, the passage_encoding entry at its predicted start index.
start_feature = Lambda(lambda arg: K.tf.gather_nd(arg[0], K.tf.stack(
            [K.tf.range(K.tf.shape(arg[1])[0]), K.tf.cast(arg[1], K.tf.int32)], axis=1)))([passage_encoding, x])
# Repeat the selected vector context_maxlen times.
start_feature = RepeatVector(context_maxlen)(start_feature)

# Answer end prediction
# Features: passage encoding, question vector, start feature, plus the
# multiply() results of the passage encoding with each of the other two.
answer_end = Lambda(lambda arg: concatenate([
            arg[0],
            arg[1],
            arg[2],
            multiply([arg[0], arg[1]]),
            multiply([arg[0], arg[2]])]))([passage_encoding, question_attention_vector, start_feature])

# Same scoring head layout as for the start prediction, with its own layers.
answer_end = TimeDistributed(Dense(W, activation='relu'))(answer_end)
answer_end = TimeDistributed(Dense(1))(answer_end)
answer_end = Flatten()(answer_end)
answer_end = Activation('softmax')(answer_end)

# Collect the model's input and output tensors.
input_placeholders = [P, Q]
inputs = input_placeholders
outputs = [answer_start, answer_end]
