In [0]:
import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import json

from keras import layers
from keras.layers import recurrent
from keras.layers.embeddings import Embedding
from keras.models import Model

import tensorflow as tf

In [0]:
def tokenize(sent):
  return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

In [0]:
with open('drive/My Drive/final_project/train-v2.0.json', 'r') as f:
  content = json.loads(f.read())
type(content)

dict

In [0]:
print(content.keys())

dict_keys(['version', 'data'])


In [0]:
data = content['data']
data[0].keys()

dict_keys(['title', 'paragraphs'])

In [0]:
data[320]['paragraphs'][0]

{'context': 'The modern English word green comes from the Middle English and Anglo-Saxon word grene, from the same Germanic root as the words "grass" and "grow". It is the color of living grass and leaves and as a result is the color most associated with springtime, growth and nature. By far the largest contributor to green in nature is chlorophyll, the chemical by which plants photosynthesize and convert sunlight into chemical energy. Many creatures have adapted to their green environments by taking on a green hue themselves as camouflage. Several minerals have a green color, including the emerald, which is colored green by its chromium content.',
 'qas': [{'answers': [{'answer_start': 326, 'text': 'chlorophyll'}],
   'id': '5729604faf94a219006aa341',
   'is_impossible': False,
   'question': 'What, in nature, is most likely to make things green?'},
  {'answers': [{'answer_start': 522, 'text': 'camouflage'}],
   'id': '5729604faf94a219006aa342',
   'is_impossible': False,
   'question

In [0]:
def parse_data(data):
  vocab_set = set()
  vocab = {}
  triplex_list = []
  context_list = []
  question_list = []
  answer_list = []

  # Context and questions extracting
  for topic in data:
    for part in topic['paragraphs']:
      blocks = part['qas']
      for block in blocks:
        context = part['context']
        vocab_set |= set(tokenize(context))
        context_list.append(context)
        vocab_set |= set(tokenize(block['question']))
        question_list.append(block['question'])

  # Making dictionary with shape {'token': number}, where numbers are in range 1..
  i = 1
  for token in vocab_set:
    vocab[token] = i
    i += 1
  
  # Context vectorization and finding of context_maxlen
  context_vectors = []
  context_maxlen = 0
  for context in context_list:
    vectorized_context = []
    tokens = tokenize(context)
    for token in tokens:
      vectorized_context.append(vocab[token])
    context_vectors.append(vectorized_context)
    if len(tokens) > context_maxlen:
      context_maxlen = len(tokens)
  context_vectors = pad_sequences(context_vectors, maxlen=context_maxlen, padding='post')

  # Answers extracting and vectorization
  for topic in data:
    for part in topic['paragraphs']:
      blocks = part['qas']
      for block in blocks:
        answer = np.zeros(context_maxlen + 1)
        if len(block['answers']) == 1:
          answer_start = block['answers'][0]['answer_start']
          text = block['answers'][0]['text']
          answer[answer_start:answer_start + len(text)] = 1
        answer_list.append(answer)
    answer_vectors = pad_sequences(answer_list, maxlen=context_maxlen, padding='post')
  
  # Question vectorization and question_maxlen finding 
  question_vectors = []
  question_maxlen = 0
  for question in question_list:
    vectorized_question = []
    tokens = tokenize(question)
    for token in tokens:
      vectorized_question.append(vocab[token])
    question_vectors.append(vectorized_question)
    if len(tokens) > question_maxlen:
      question_maxlen = len(tokens)
  question_vectors = pad_sequences(question_vectors, maxlen=question_maxlen, padding='post')


  return context_vectors, question_vectors, answer_vectors, vocab, context_maxlen, question_maxlen

In [0]:
context_vectors, question_vectors, answer_vectors, vocab, context_maxlen, question_maxlen = parse_data(data)

In [179]:
print('context ', type(context_vectors))
print('question ', type(question_vectors))
print('answer ', type(answer_vectors))
print('context ', context_vectors.shape)
print('question ', question_vectors.shape)
print('answer ', answer_vectors.shape)
print('context_maxlen= ', context_maxlen)
print('question_maxlen= ', question_maxlen)
print('Length of vocabulary= ', len(vocab))


context  <class 'numpy.ndarray'>
question  <class 'numpy.ndarray'>
answer  <class 'numpy.ndarray'>
context  (130319, 844)
question  (130319, 60)
answer  (130319, 844)
context_maxlen=  844
question_maxlen=  60
Length of vocabulary=  99372


In [182]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
CONTEXT_HIDDEN_SIZE = 100
QUESTION_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 20
vocab_size = len(vocab) + 1
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           CONTEXT_HIDDEN_SIZE,
                                                           QUESTION_HIDDEN_SIZE))

RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


In [189]:
print('Build model...')

context = layers.Input(shape=(context_maxlen,), dtype='int32')
encoded_context = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(context)
encoded_context = RNN(CONTEXT_HIDDEN_SIZE)(encoded_context)

question = layers.Input(shape=(question_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = RNN(QUESTION_HIDDEN_SIZE)(encoded_question)

merged = layers.concatenate([encoded_context, encoded_question])
preds = layers.Dense(vocab_size, activation='softmax')(merged)

model = Model([context, question], preds)
model.compile(optimizer='rmsprop',
              loss=tf.nn.sigmoid_cross_entropy_with_logits,
              metrics=['mse'])


model.summary()

Build model...
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 844)          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 844, 50)      4968650     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 60, 50)       4968650     input_6[0][0]                    
_____________________________________________________________________________

In [190]:
print('Training')
model.fit([context_vectors, question_vectors], answer_vectors,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)


Training


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 123803 samples, validate on 6516 samples
Epoch 1/20


InvalidArgumentError: ignored

In [0]:
print('Evaluation')
loss, mse = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, mse))