In [15]:
from google.colab import drive
drive.mount('/content/drive')

import pickle

import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import json

from keras import layers
from keras.layers import recurrent
from keras.layers.embeddings import Embedding
from keras.models import Model

import tensorflow as tf

from sklearn.metrics import hamming_loss, precision_score,\
                             recall_score, accuracy_score


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def tokenize(sent):
  return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

In [0]:
with open('drive/My Drive/final_project/train-v2.0.json', 'r') as f:
  content = json.loads(f.read())
type(content)

dict

In [0]:
print(content.keys())

dict_keys(['version', 'data'])


In [0]:
data = content['data']
data[0].keys()

dict_keys(['title', 'paragraphs'])

In [0]:
data[320]['paragraphs'][0]

{'context': 'The modern English word green comes from the Middle English and Anglo-Saxon word grene, from the same Germanic root as the words "grass" and "grow". It is the color of living grass and leaves and as a result is the color most associated with springtime, growth and nature. By far the largest contributor to green in nature is chlorophyll, the chemical by which plants photosynthesize and convert sunlight into chemical energy. Many creatures have adapted to their green environments by taking on a green hue themselves as camouflage. Several minerals have a green color, including the emerald, which is colored green by its chromium content.',
 'qas': [{'answers': [{'answer_start': 326, 'text': 'chlorophyll'}],
   'id': '5729604faf94a219006aa341',
   'is_impossible': False,
   'question': 'What, in nature, is most likely to make things green?'},
  {'answers': [{'answer_start': 522, 'text': 'camouflage'}],
   'id': '5729604faf94a219006aa342',
   'is_impossible': False,
   'question

In [0]:
def parse_data(data):
  vocab_set = set()
  vocab = {}
  triplex_list = []
  context_list = []
  question_list = []
  answer_list = []

  # Context and questions extracting
  for topic in data:
    for part in topic['paragraphs']:
      blocks = part['qas']
      for block in blocks:
        context = part['context']
        vocab_set |= set(tokenize(context))
        context_list.append(context)
        vocab_set |= set(tokenize(block['question']))
        question_list.append(block['question'])

  # Making dictionary with shape {'token': number}, where numbers are in range 1..
  i = 1
  for token in vocab_set:
    vocab[token] = i
    i += 1
  
  # Context vectorization and finding of context_maxlen
  context_vectors = []
  context_maxlen = 0
  for context in context_list:
    vectorized_context = []
    tokens = tokenize(context)
    for token in tokens:
      vectorized_context.append(vocab[token])
    context_vectors.append(vectorized_context)
    if len(tokens) > context_maxlen:
      context_maxlen = len(tokens)
  context_vectors = pad_sequences(context_vectors, maxlen=context_maxlen, padding='post')

  # Answers extracting and vectorization
  for topic in data:
    for part in topic['paragraphs']:
      blocks = part['qas']
      for block in blocks:
        answer = np.zeros(context_maxlen + 1)
        if len(block['answers']) == 1:
          answer_start = block['answers'][0]['answer_start']
          text = block['answers'][0]['text']
          answer[answer_start:answer_start + len(text)] = 1
        answer_list.append(answer)
    answer_vectors = pad_sequences(answer_list, maxlen=context_maxlen, padding='post')
  
  # Question vectorization and question_maxlen finding 
  question_vectors = []
  question_maxlen = 0
  for question in question_list:
    vectorized_question = []
    tokens = tokenize(question)
    for token in tokens:
      vectorized_question.append(vocab[token])
    question_vectors.append(vectorized_question)
    if len(tokens) > question_maxlen:
      question_maxlen = len(tokens)
  question_vectors = pad_sequences(question_vectors, maxlen=question_maxlen, padding='post')


  return context_vectors, question_vectors, answer_vectors, vocab, context_maxlen, question_maxlen

In [0]:
context_vectors, question_vectors, answer_vectors, vocab, context_maxlen, question_maxlen = parse_data(data)

In [0]:
with open('drive/My Drive/final_project/context_vectors.pickle', 'wb') as f:
  pickle.dump(context_vectors, f)
with open('drive/My Drive/final_project/question_vectors.pickle', 'wb') as f:
  pickle.dump(question_vectors, f)
with open('drive/My Drive/final_project/answer_vectors.pickle', 'wb') as f:
  pickle.dump(answer_vectors, f)
other = {'vocab': vocab, 'context_maxlen': context_maxlen, 'question_maxlen': question_maxlen}
with open('drive/My Drive/final_project/other.pickle', 'wb') as f:
  pickle.dump(other, f)

#with open(PIK, "wb") as f:
#    pickle.dump(len(data), f)
#    for value in data:
#        pickle.dump(value, f)
#data2 = []
#with open(PIK, "rb") as f:
#    for _ in range(pickle.load(f)):
#        data2.append(pickle.load(f))
#print data2

In [0]:
with open('drive/My Drive/final_project/context_vectors.pickle', 'rb') as f:
  context_vectors = pickle.load(f)
with open('drive/My Drive/final_project/question_vectors.pickle', 'rb') as f:
  question_vectors = pickle.load(f)
with open('drive/My Drive/final_project/answer_vectors.pickle', 'rb') as f:
  answer_vectors = pickle.load(f)
with open('drive/My Drive/final_project/other.pickle', 'rb') as f:
  other = pickle.load(f)
vocab = other['vocab']
context_maxlen = other['context_maxlen']
question_maxlen = other['question_maxlen']


In [0]:
print('context ', type(context_vectors))
print('question ', type(question_vectors))
print('answer ', type(answer_vectors))
print('context ', context_vectors.shape)
print('question ', question_vectors.shape)
print('answer ', answer_vectors.shape)
print('context_maxlen= ', context_maxlen)
print('question_maxlen= ', question_maxlen)
print('Length of vocabulary= ', len(vocab))


context  <class 'numpy.ndarray'>
question  <class 'numpy.ndarray'>
answer  <class 'numpy.ndarray'>
context  (130319, 844)
question  (130319, 60)
answer  (130319, 844)
context_maxlen=  844
question_maxlen=  60
Length of vocabulary=  99372


In [7]:
from sklearn.model_selection import train_test_split
context_train, context_test, question_train, question_test,\
  answer_train, answer_test = train_test_split(context_vectors,
  question_vectors, answer_vectors, test_size=0.2, random_state=42)

print('context_train', type(context_train), context_train.shape)
print('context_test', type(context_test), context_test.shape)
print('question_train', type(question_train), question_train.shape)
print('question_test', type(question_test), question_test.shape)
print('answer_train', type(answer_train), answer_train.shape)
print('answer_test', type(answer_test), answer_test.shape)


context_train <class 'numpy.ndarray'> (104255, 844)
context_test <class 'numpy.ndarray'> (26064, 844)
question_train <class 'numpy.ndarray'> (104255, 60)
question_test <class 'numpy.ndarray'> (26064, 60)
answer_train <class 'numpy.ndarray'> (104255, 844)
answer_test <class 'numpy.ndarray'> (26064, 844)


In [8]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 100
CONTEXT_HIDDEN_SIZE = 200
QUESTION_HIDDEN_SIZE = 200
BATCH_SIZE = 32
EPOCHS = 4
vocab_size = len(vocab) + 1
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           CONTEXT_HIDDEN_SIZE,
                                                           QUESTION_HIDDEN_SIZE))

RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 100, 200, 200


In [0]:
def label_based_metrics(y_true, y_pred):
  acc = []
  hamm = []
  prec = []
  rec = []
  for i in range(len(y_true)):
    acc_i = accuracy_score(y_true[i], y_pred[i])
    hamm_i = hamming_loss(y_true[i], y_pred[i])
    prec_i = precision_score(y_true[i], y_pred[i], average='weighted')
    rec_i = recall_score(y_true[i], y_pred[i], average='weighted')
    acc.append(acc_i)
    hamm.append(hamm_i)
    prec.append(prec_i)
    rec.append(rec)
  label_based_accuracy = np.mean(acc)
  label_based_hamming_loss = np.mean(hamm)
  label_based_precision = np.mean(prec)
  label_based_recall = np.mean(rec)
  return label_based_accuracy, label_based_hamming_loss, \
          label_based_precision, label_based_recall

In [25]:
print('Build model...')

context = layers.Input(shape=(context_maxlen,), dtype='int32')
encoded_context = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(context)
encoded_context = RNN(CONTEXT_HIDDEN_SIZE)(encoded_context)

question = layers.Input(shape=(question_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = RNN(QUESTION_HIDDEN_SIZE)(encoded_question)

merged = layers.concatenate([encoded_context, encoded_question])
preds = layers.Dense(context_maxlen, activation='sigmoid')(merged)

model = Model([context, question], preds)
model.compile(optimizer='rmsprop',
              loss=tf.nn.sigmoid_cross_entropy_with_logits,
              metrics=['accuracy', label_based_metrics])


model.summary()

Build model...


TypeError: ignored

In [0]:
from keras.callbacks import ModelCheckpoint

callback = ModelCheckpoint(filepath='/content/drive/My Drive/final_project/weights_file_1',
              monitor='val_loss',
              mode='auto',
              save_best_only=True)

print('Training')
history = model.fit([context_train, question_train], answer_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05,
          callbacks=[callback])


In [0]:
accuracy=history.history['accuracy']
val_accuracy=history.history['val_accuracy']
loss=history.history['loss']
val_loss=history.history['val_loss']
print('accuracy= ', mse)
print('val_accuracy= ', val_mse)
print('loss= ', loss)
print('val_loss= ', val_loss)

In [0]:
print('Evaluation')
loss, accuracy = model.evaluate([context_test, question_test], answer_test,
                           batch_size=BATCH_SIZE)
print('Test loss / test mse = {:.4f} / {:.4f}'.format(loss, accuracy))

In [0]:
from tensorflow.keras.models import model_from_json

json_file = '/content/drive/My Drive/final_project/model.json'
model_json_1 = model.to_json()

with open(json_file, 'w') as f:
  f.write(model_json_1)
