In [1]:
# Mount Google Drive at /content/drive; the other cells read their inputs and
# write their checkpoints under 'My Drive/final_project' on that mount.
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import numpy as np
import pickle
import json

#from keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Bidirectional, Dropout, Embedding, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import CosineSimilarity
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from keras.layers import recurrent

#from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.callbacks import ModelCheckpoint

#from sklearn.metrics import hamming_loss, precision_score,\
#                            recall_score, accuracy_score
#from sklearn.metrics.pairwise import cosine_distances
#from scipy.spatial.distance import cosine

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using TensorFlow backend.


In [0]:
def tokenize(sent):
  """Split `sent` into word tokens and punctuation tokens.

  The text is cut on runs of non-word characters. The pattern is a capturing
  group, so those runs are kept as pieces too. Every piece is stripped of
  surrounding whitespace and pieces that end up empty are discarded.
  """
  tokens = []
  for piece in re.split(r'(\W+)', sent):
    stripped = piece.strip()
    if stripped:
      tokens.append(stripped)
  return tokens

In [3]:
# Read the raw `train-v2.0.json` dataset file (presumably SQuAD v2.0 -- confirm).
# JSON is UTF-8 by specification, so the encoding is passed explicitly instead
# of relying on the locale default. The path is absolute, like every other
# Drive path in this notebook, so the cell does not depend on the working
# directory.
with open('/content/drive/My Drive/final_project/train-v2.0.json', 'r', encoding='utf-8') as f:
  content = json.load(f)
type(content)

dict

In [4]:
# Keep only the value stored under 'data' and display the keys of its first
# entry to see how one topic is structured.
data = content['data']
data[0].keys()

dict_keys(['title', 'paragraphs'])

In [0]:
def parse_data(data):
  """Convert the raw dataset into padded integer arrays plus lookup tables.

  Only questions with exactly one answer are used. Every kept question yields
  one example: its paragraph context, the question, and a 0/1 vector over the
  context tokens marking the answer span. All per-example outputs are built in
  the same pass, so index i refers to the same example in every returned
  sequence.

  Args:
    data: iterable of topics. Each topic has a 'paragraphs' list; each
      paragraph has a 'context' string and a 'qas' list whose items carry a
      'question' string and an 'answers' list of dicts with 'answer_start'
      (character offset into the context) and 'text'.

  Returns:
    A 10-tuple
      (context_vectors, question_vectors, answer_vectors, vocab,
       context_maxlen, question_maxlen, context_list, question_list,
       answer_list, original_answer_list)
    where the `*_vectors` are the `pad_sequences` results (post-padded with
    0), `vocab` maps token -> id starting at 1, the `*_maxlen` values are the
    longest token counts, `context_list` / `question_list` /
    `original_answer_list` hold the raw strings and `answer_list` holds the
    unpadded 0/1 numpy vectors.
  """
  context_list = []
  question_list = []
  answer_list = []
  original_answer_list = []
  context_tokens_list = []
  question_tokens_list = []
  vocab_set = set()

  # One pass collects contexts, questions and answers together. An example
  # that fails the answer check is skipped as a whole, so the lists can never
  # get out of step with each other.
  for topic in data:
    for part in topic['paragraphs']:
      context_tokens = None  # tokenized lazily, once per paragraph
      for block in part['qas']:
        if len(block['answers']) != 1:
          continue
        context = part['context']
        answer = block['answers'][0]
        answer_start = answer['answer_start']
        text = answer['text']

        answer_tokens = tokenize(context[answer_start:answer_start + len(text)])
        if answer_tokens != tokenize(text):
          # The recorded offset does not point at the answer text: drop just
          # this example and keep processing the paragraph's other questions.
          print('Mistake')
          continue

        if context_tokens is None:
          context_tokens = tokenize(context)
          vocab_set.update(context_tokens)

        # The answer's token offset is the number of tokens in the text that
        # precedes it.
        # NOTE(review): if an answer starts inside a run of punctuation the
        # marked span can be shifted, because `tokenize` keeps such a run as
        # one token -- confirm whether this matters for the data.
        tokens_before = tokenize(context[:answer_start])
        answer_vector = np.zeros(len(context_tokens))
        answer_vector[len(tokens_before): len(tokens_before) + len(answer_tokens)] = 1

        question = block['question']
        question_tokens = tokenize(question)
        vocab_set.update(question_tokens)

        context_list.append(context)
        context_tokens_list.append(context_tokens)
        question_list.append(question)
        question_tokens_list.append(question_tokens)
        answer_list.append(answer_vector)
        original_answer_list.append(text)

  # Token -> id, ids starting at 1 (0 is the padding value). The tokens are
  # sorted first: iterating a set of strings directly gives an order that can
  # change between interpreter sessions, which would make ids (and therefore
  # any saved embedding weights) irreproducible.
  vocab = {token: index for index, token in enumerate(sorted(vocab_set), start=1)}

  context_maxlen = max((len(tokens) for tokens in context_tokens_list), default=0)
  question_maxlen = max((len(tokens) for tokens in question_tokens_list), default=0)

  context_vectors = pad_sequences(
      [[vocab[token] for token in tokens] for tokens in context_tokens_list],
      maxlen=context_maxlen, padding='post')
  answer_vectors = pad_sequences(answer_list, maxlen=context_maxlen, padding='post')
  question_vectors = pad_sequences(
      [[vocab[token] for token in tokens] for tokens in question_tokens_list],
      maxlen=question_maxlen, padding='post')

  return context_vectors, question_vectors, answer_vectors,\
   vocab, context_maxlen, question_maxlen, context_list, question_list, answer_list, original_answer_list

In [0]:
# Parse the whole dataset once; every later cell works from these results.
(context_vectors, question_vectors, answer_vectors, vocab,
 context_maxlen, question_maxlen, context_list, question_list,
 answer_list, original_answer_list) = parse_data(data)

In [0]:
from sklearn.model_selection import train_test_split

# 80/20 split without shuffling: the training arrays are the leading rows of
# the parsed arrays, in their original order.
(context_train, context_test,
 question_train, question_test,
 answer_train, answer_test) = train_test_split(
    context_vectors, question_vectors, answer_vectors,
    shuffle=False, test_size=0.2, random_state=42)

In [0]:
# Recurrent cell used by both encoders. The `recurrent` module is never
# imported in this notebook (its import is commented out), so the LSTM class
# is taken from the already imported `tensorflow.keras.layers`.
RNN = layers.LSTM
# Width of the word embeddings; the embedding matrix built from the
# 'glove.6B.100d.txt' vectors uses this as its second dimension.
EMBED_HIDDEN_SIZE = 100
# Units per direction in the context and question encoders.
CONTEXT_HIDDEN_SIZE = 200
QUESTION_HIDDEN_SIZE = 200

# One extra row because token ids start at 1 and id 0 is the padding value.
vocab_size = len(vocab) + 1

In [9]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dict.
# Absolute path, like the other Drive paths in this notebook.
glove_dir = '/content/drive/My Drive/final_project'
embeddings_index = {}
# `with` closes the file even if a line fails to parse. The encoding is given
# explicitly because the vocabulary file contains non-ASCII tokens and the
# locale default is not guaranteed to be UTF-8.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
  for line in f:
    # Each line is "<word> <v1> <v2> ...", separated by whitespace.
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

len(embeddings_index)

400000

In [10]:
# Row i of the matrix is the GloVe vector of the token whose id is i.
# Row 0 (padding) and tokens without a GloVe entry stay all-zero.
embedding_matrix = np.zeros((vocab_size, EMBED_HIDDEN_SIZE))
for word, i in vocab.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is None:
    # The glove.6B vocabulary is lower-cased while the tokenizer keeps the
    # original casing, so capitalised tokens would otherwise never match.
    embedding_vector = embeddings_index.get(word.lower())
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

embedding_matrix.shape

(96570, 100)

In [11]:
# build model
#
# Two parallel encoders (context and question), each: token ids -> embedding
# -> bidirectional RNN (final state only) -> dropout. The two encodings are
# concatenated and a sigmoid Dense layer emits one score per context position.

context = layers.Input(shape=(context_maxlen,), dtype='int32')
encoded_context = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(context)
_context = layers.Bidirectional(RNN(CONTEXT_HIDDEN_SIZE))(encoded_context)
dropout_1 = layers.Dropout(0.33)(_context)

question = layers.Input(shape=(question_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
_question = layers.Bidirectional(RNN(QUESTION_HIDDEN_SIZE))(encoded_question)
dropout_2 = layers.Dropout(0.33)(_question)

merged = layers.concatenate([dropout_1, dropout_2])
dropout_ = layers.Dropout(0.33)(merged)
preds = layers.Dense(context_maxlen, activation='sigmoid')(dropout_)


# Use the tf.keras Model class (`keras` here is `tensorflow.keras`): every
# layer, loss, metric and optimizer in this notebook comes from tf.keras, and
# combining them with the standalone `keras.models.Model` is not supported.
model = keras.Model([context, question], preds)

Build model...


In [0]:
# Load the GloVe matrix into every embedding layer and freeze it.
# The layers are selected by type rather than by position in `model.layers`,
# so the cell keeps working if layers are added to or reordered in the model.
for layer in model.layers:
  if isinstance(layer, layers.Embedding):
    layer.set_weights([embedding_matrix])
    layer.trainable = False

In [14]:
# Compile with RMSprop and binary cross-entropy over the per-position outputs;
# accuracy and cosine similarity are tracked as metrics.
optimizer = keras.optimizers.RMSprop(learning_rate=0.005)

model.compile(
    optimizer=optimizer,
    loss=BinaryCrossentropy(),
    metrics=['accuracy', CosineSimilarity(axis=1)],
)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 844)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 844, 100)     9657000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 100)      9657000     input_2[0][0]                    
____________________________________________________________________________________________

In [0]:
# Optionally rebuild the architecture saved by an earlier session.
# The path is defined here because the cell that writes the file sits at the
# end of the notebook, so `json_file` does not exist yet on a fresh kernel.
# `model_from_json` is reached through the already imported tf.keras package
# because it is not imported by name anywhere in the notebook.
json_file = '/content/drive/My Drive/final_project/model.json'
if os.path.exists(json_file):
  with open(json_file, 'r') as f:
    loaded_model = keras.models.model_from_json(f.read())
else:
  # Nothing has been saved yet (e.g. first ever run): keep going.
  loaded_model = None
  print('No saved model architecture found at', json_file)

In [0]:
# Restore the weights stored in an existing checkpoint file.
weights_file = '/content/drive/My Drive/final_project/weights_file_10.h5'
model.load_weights(weights_file)

In [0]:
# Not meant to be run: alternative optimizer / learning-rate-schedule setup kept for reference.

#opt = tf.keras.optimizers.RMSprop(learning_rate=0.005)

#lr_schedule = keras.optimizers.schedules.ExponentialDecay(
#    initial_learning_rate=1e-2,
#    decay_steps=10000,
#    decay_rate=0.9)
#optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)

#model.compile(optimizer=opt,
#              loss=BinaryCrossentropy(),
#              metrics=['accuracy', \
#                       CosineSimilarity(axis=1)])

In [15]:
BATCH_SIZE = 512
EPOCHS = 3

# Save to Drive whenever the validation loss improves.
# ModelCheckpoint is taken from tf.keras (`keras` here is `tensorflow.keras`)
# so that it matches ReduceLROnPlateau and the rest of the tf.keras objects;
# callbacks from the standalone `keras` package are not interchangeable with
# tf.keras ones.
callback = keras.callbacks.ModelCheckpoint(
              filepath='/content/drive/My Drive/final_project/weights_file_11.h5',
              monitor='val_loss',
              mode='auto',
              save_best_only=True)

# Halve the learning rate after 3 epochs without val_loss improvement, never
# going below 0.001.
# NOTE(review): with EPOCHS = 3 this patience looks too long to ever trigger
# in a single run -- confirm it is intended for longer trainings.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=3, min_lr=0.001)


print('Training')
# validation_split holds out 5% of the training arrays for validation.
history = model.fit([context_train, question_train], answer_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05,
          callbacks=[callback, reduce_lr])

Training
Train on 65983 samples, validate on 3473 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [0]:
# Per-epoch values recorded by `fit`; show the learning-rate series stored
# under 'lr'.
history_dict = history.history
lr = history_dict['lr']
lr

In [0]:
# Pull the per-epoch series out of the training history.
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
cosine_similarity = history_dict['cosine_similarity']
val_cosine_similarity = history_dict['val_cosine_similarity']

# Epoch numbers 1..N for the x axis.
epochs = list(range(1, len(acc) + 1))

import plotly.graph_objs as go
from plotly.offline import iplot


def _line_trace(values, label):
  """Line trace of `values` against the epoch numbers, labelled `label`."""
  return go.Scatter(x=epochs, y=values, mode='lines', name=label)


def _layout(title, y_title):
  """Figure layout dict with the given title and y-axis label."""
  return {'title': title, 'xaxis': {'title': 'epochs'}, 'yaxis': {'title': y_title}}


trace0 = _line_trace(loss_values, 'training loss')
trace1 = _line_trace(val_loss_values, 'validation loss')
trace2 = _line_trace(cosine_similarity, 'cosine_similarity')
trace3 = _line_trace(val_cosine_similarity, 'val_cosine_similarity')

# Figure 1: training vs. validation loss.
data_1 = [trace0, trace1]
layout_1 = _layout('Config.1 Training and validation loss', 'loss')
fig_1 = go.Figure(data=data_1, layout=layout_1)
iplot(fig_1, show_link=False)

# Figure 2: training vs. validation cosine similarity.
data_2 = [trace2, trace3]
layout_2 = _layout('Config.1 Training and validation cosine_similarity', 'cosine_similarity')
fig_2 = go.Figure(data=data_2, layout=layout_2)
iplot(fig_2, show_link=False)

In [0]:
# Inverse lookup table: token id -> token.
reversed_vocab = {index: token for token, index in vocab.items()}


In [0]:
def get_words(number):
  """Print stored, decoded and predicted text for one training example.

  For example `number` this prints the raw context and question next to the
  versions decoded from the padded id vectors, then the stored answer text,
  the answer rebuilt from its 0/1 vector, and the context tokens the model
  scores above the rounding threshold.

  The raw strings come from the full parsed lists while the id vectors come
  from the training arrays, so `number` must address the same example in both.

  Args:
    number: index of the example.

  Returns:
    The `model.predict` output for this single example.
  """
  original_context = context_list[number]
  # Tokenize the context once instead of on every loop iteration.
  context_tokens = tokenize(original_context)
  context_vec = context_train[number]
  # Id 0 is padding and has no entry in `reversed_vocab`.
  restored_context = [reversed_vocab[i] for i in context_vec if i != 0]
  print('original_context: ', original_context)
  print('restored_context: ', restored_context)

  original_question = question_list[number]
  question_vec = question_train[number]
  restored_question = [reversed_vocab[i] for i in question_vec if i != 0]
  print('original_question: ', original_question)
  print('restored_question: ', restored_question)

  original_answer = original_answer_list[number]
  # Add a batch dimension of 1. `-1` keeps whatever padded length the vectors
  # have instead of hard-coding the lengths of one particular dataset.
  context = context_vec.reshape(1, -1)
  question = question_vec.reshape(1, -1)
  answer_predicted = model.predict([context, question])

  # Keep the tokens whose score rounds to 1; positions past the real context
  # length are padding and are ignored.
  predicted_answer = [
      context_tokens[num]
      for num, value in enumerate(answer_predicted[0])
      if num < len(context_tokens) and round(value) != 0
  ]
  restored_answer = [
      context_tokens[num]
      for num, value in enumerate(answer_list[number])
      if value != 0
  ]

  print('original_answer: ', original_answer)
  print('restored_answer: ', restored_answer)
  print('predicted_answer: ', predicted_answer)
  return answer_predicted


In [0]:
# Inspect one training example by index.
SAMPLE_INDEX = 52875
answer_predicted = get_words(SAMPLE_INDEX)

original_context:  By 1979, with the establishment of the Foundation for Ancient Research and Mormon Studies (FARMS) as a California non-profit research institution, an effort led by Robert F. Smith began to take full account of Larson’s work and to publish a Critical Text of the Book of Mormon. Thus was born the FARMS Critical Text Project which published the first volume of the 3-volume Book of Mormon Critical Text in 1984. The third volume of that first edition was published in 1987, but was already being superseded by a second, revised edition of the entire work, greatly aided through the advice and assistance of then Yale doctoral candidate Grant Hardy, Dr. Gordon C. Thomasson, Professor John W. Welch (the head of FARMS), Professor Royal Skousen, and others too numerous to mention here. However, these were merely preliminary steps to a far more exacting and all-encompassing project.
restored_context:  ['By', '1979', ',', 'with', 'the', 'establishment', 'of', 'the', 'Foundation', '

In [0]:
# Mean and maximum of the scores predicted for the inspected example.
scores = answer_predicted[0]
print(scores.mean())
scores.max()

0.022625238


0.83646905

In [0]:
# Serialize the model architecture to JSON and store it on Drive.
json_file = '/content/drive/My Drive/final_project/model.json'
# The JSON string is built before the file is opened for writing.
model_json = model.to_json()

with open(json_file, 'w') as f:
  f.write(model_json)