import gc
gc.collect()

In [None]:
!pip install tokenizers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 5.7MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.2


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import json
import collections
import tensorflow_hub as hub
import numpy as np
import tokenizers
from tqdm import tqdm
import time
import string 
import re

In [None]:
class Sample:
    """One SQuAD question/context pair and the model features derived from it.

    ``preprocess`` relies on two module-level names that must exist before it
    is called: ``tokenizer`` (its ``encode`` result must expose ``ids`` and
    ``offsets``) and ``max_seq_length``.

    Attributes set by ``__init__``:
        question, context: raw strings as passed in.
        q_ids: question identifier (may be None).
        start_char_idx: character offset of the answer (None if unlabeled).
        answer_text: first reference answer (None if unlabeled).
        all_answers: every reference answer text (None if unlabeled).
        skip: True once ``preprocess`` decides the sample is unusable.
        start_token_idx, end_token_idx: answer token span, -1 until computed.

    Attributes set by a successful ``preprocess``:
        input_word_ids, input_type_ids, input_mask: lists padded with zeros
            to ``max_seq_length``.
        context_token_to_char: the context token offsets from the tokenizer.
    """

    def __init__(self, question, context, q_ids=None, start_char_idx=None, answer_text=None, all_answers=None):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.q_ids = q_ids
        self.skip = False
        self.start_token_idx = -1
        self.end_token_idx = -1

    def preprocess(self):
        """Tokenize the sample and build padded model inputs.

        Sets ``self.skip = True`` and returns early (without setting the
        ``input_*`` attributes) when the answer span runs past the context,
        when no context token overlaps the answer, or when the combined
        sequence is longer than ``max_seq_length``.
        """
        # Collapse every run of whitespace into a single space.
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        tokenized_context = tokenizer.encode(context)
        tokenized_question = tokenizer.encode(question)

        # Labeled (training/validation) samples: locate the answer token span.
        if self.answer_text is not None:
            answer = " ".join(str(self.answer_text).split())
            # NOTE(review): start_char_idx is applied to the whitespace-normalized
            # context; presumably it was measured on the raw context - confirm.
            end_char_idx = self.start_char_idx + len(answer)
            # end_char_idx is exclusive, so an answer that ends exactly at the
            # end of the context (end_char_idx == len(context)) is still valid.
            if end_char_idx > len(context):
                self.skip = True
                return
            # A token belongs to the answer when its [start, end) character
            # range overlaps the answer's [start_char_idx, end_char_idx) range.
            ans_token_idx = [
                idx
                for idx, (start, end) in enumerate(tokenized_context.offsets)
                if max(start, self.start_char_idx) < min(end, end_char_idx)
            ]
            if not ans_token_idx:
                self.skip = True
                return
            self.start_token_idx = ans_token_idx[0]
            self.end_token_idx = ans_token_idx[-1]

        # Context tokens first, then the question without its leading token.
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        if padding_length < 0:
            # Too long for the model input; the sample is dropped, not truncated.
            self.skip = True
            return
        padding = [0] * padding_length
        self.input_word_ids = input_ids + padding
        self.input_type_ids = token_type_ids + padding
        self.input_mask = attention_mask + padding
        self.context_token_to_char = tokenized_context.offsets

In [None]:
def create_squad_examples(raw_data):
    """Build one preprocessed ``Sample`` per question of a SQuAD-format dict.

    ``raw_data["data"]`` is walked article by article and paragraph by
    paragraph. Questions that carry an ``"answers"`` key use the first answer
    for the span and keep every answer text; the others become unlabeled
    samples. ``preprocess()`` is called on each sample before it is collected.
    """
    examples = []
    paragraphs = (
        paragraph
        for article in raw_data["data"]
        for paragraph in article["paragraphs"]
    )
    for paragraph in paragraphs:
        passage = paragraph["context"]
        for qa in paragraph["qas"]:
            # Positional order matches Sample(question, context, q_ids, ...)
            sample_args = [qa["question"], passage, qa["id"]]
            if "answers" in qa:
                answers = qa["answers"]
                first_answer = answers[0]
                sample_args += [
                    first_answer["answer_start"],
                    first_answer["text"],
                    [answer["text"] for answer in answers],
                ]
            example = Sample(*sample_args)
            example.preprocess()
            examples.append(example)
    return examples


def create_inputs_targets(squad_examples):
    """Stack the features of all non-skipped samples into NumPy arrays.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_word_ids, input_mask,
        input_type_ids]`` and ``y`` is ``[start_token_idx, end_token_idx]``,
        each entry being an ``np.array`` over the kept samples.
    """
    feature_names = (
        "input_word_ids",
        "input_type_ids",
        "input_mask",
        "start_token_idx",
        "end_token_idx",
    )
    usable = [example for example in squad_examples if not example.skip]
    arrays = {
        name: np.array([getattr(example, name) for example in usable])
        for name in feature_names
    }
    inputs = [arrays["input_word_ids"], arrays["input_mask"], arrays["input_type_ids"]]
    targets = [arrays["start_token_idx"], arrays["end_token_idx"]]
    return inputs, targets

In [None]:
def train_eval(train_set, n=320):
  """Split a dataset dict sequentially into two dataset dicts.

  The split is deterministic: the first ``n`` entries of
  ``train_set['data']`` go to the first result, the remaining entries to the
  second. No shuffling is performed.

  Args:
    train_set: dict with a ``'data'`` sequence.
    n: number of leading entries to put in the first split.

  Returns:
    ``(first, second)``, each a dict of the form ``{'data': [...]}``.
  """
  entries = list(train_set['data'])
  return {'data': entries[:n]}, {'data': entries[n:]}

In [None]:
# Fetch the SQuAD v1.1 training file through keras.utils.get_file and parse the JSON
# into `train_data`.
train_path = keras.utils.get_file("train.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json")
with open(train_path) as f: train_data = json.load(f)

Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json


In [None]:
# Two sequential splits of train_data['data']: the first 310 entries go to `train0` and
# the rest to `test`; then the first 220 entries of `train0` go to `train` and the rest to `val`.
# NOTE(review): the name `train` is rebound later in this notebook by `def train(...)`,
# so this cell and the feature-building cell must run before that definition.
train0, test = train_eval(train_data, 310)
train, val = train_eval(train0, 220)
# Train set
#with open('train.json', 'w') as jobj:
#  json.dump(train, jobj)

# Validation data set (note: written to a file named 'testset.json')
with open('testset.json', 'w') as jobj:
  json.dump(val, jobj)

In [None]:
# Fixed input length; Sample.preprocess pads to this length and skips longer samples.
max_seq_length = 384

# Not used below; the encoder actually loaded is the ELECTRA handle passed to hub.KerasLayer:
# "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/2"
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
encoder_input = {'input_word_ids': input_word_ids, 'input_mask': input_mask, 'input_type_ids': input_type_ids}
bert_layer = hub.KerasLayer('https://tfhub.dev/google/electra_base/2', trainable=True)

outputs = bert_layer(encoder_input)
# The tokenizer is built from the vocabulary file shipped with the hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
# NOTE(review): do_lower_case is read but never used; the tokenizer below hard-codes
# lowercase=True - confirm this matches the loaded module.
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenizers.BertWordPieceTokenizer(vocab=vocab_file, lowercase=True)
# Build Sample objects and stacked arrays for the three splits. The printed counts are
# the number of Sample objects, including any that were marked skip.
train_squad_examples = create_squad_examples(train)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")
eval_squad_examples = create_squad_examples(val)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")
test_squad_examples = create_squad_examples(test)
x_test, y_test = create_inputs_targets(test_squad_examples)
print(f"{len(test_squad_examples)} test points created.")


# Span-prediction head: one bias-free Dense(1) per position for the start and for the
# end, flattened and passed through softmax over the sequence positions.
sequence_output = outputs['sequence_output']
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = layers.Flatten()(start_logits)
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = layers.Flatten()(end_logits)
start_probs = layers.Activation(keras.activations.softmax)(start_logits)
end_probs = layers.Activation(keras.activations.softmax)(end_logits)

# model.summary()

45819 training points created.
16375 evaluation points created.
25405 test points created.


In [None]:
def normalize_text(text):
  """Canonicalise an answer string for comparison.

  Steps, in order: lower-case, delete every character in
  ``string.punctuation``, replace the standalone words a/an/the with a space,
  then collapse all whitespace runs to single spaces (trimming the ends).
  """
  lowered = text.lower()
  # Deleting via a translation table removes exactly the string.punctuation characters.
  unpunctuated = lowered.translate(str.maketrans("", "", string.punctuation))
  without_articles = re.sub(r"\b(a|an|the)\b", " ", unpunctuated, flags=re.UNICODE)
  return " ".join(without_articles.split())


def get_tokens(s):
    """Return the whitespace-separated tokens of ``normalize_text(s)``; ``[]`` for a falsy ``s``."""
    return normalize_text(s).split() if s else []


def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both strings are tokenized with ``get_tokens``. If either side has no
    tokens the result is 1 when both are empty and 0 otherwise; if the token
    multisets share nothing the result is 0.
    """
    gold = get_tokens(a_gold)
    pred = get_tokens(a_pred)
    if not gold or not pred:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold == pred)
    # Multiset intersection counts each shared token at most min(count) times.
    shared = collections.Counter(gold) & collections.Counter(pred)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(pred)
    recall = 1.0 * overlap / len(gold)
    return (2 * precision * recall) / (precision + recall)

In [None]:
def accuracy(pred_start, pred_end, data):
  """Exact-match rate and mean best F1 of predicted answer spans.

  Prediction ``i`` is matched with the ``i``-th non-skipped sample of
  ``data``. The predicted span is the argmax of each score vector, mapped
  back to characters through the sample's ``context_token_to_char`` offsets.

  Args:
    pred_start: per-sample score vectors for the start token.
    pred_end: per-sample score vectors for the end token.
    data: samples exposing ``skip``, ``context``, ``context_token_to_char``
      and ``all_answers``.

  Returns:
    ``(exact_match, f1)``, both averaged over ``len(pred_end)``. Predictions
    whose start index is past the context tokens contribute 0 to both.
    ``(0.0, 0.0)`` when there are no predictions.
  """
  N = len(pred_end)
  if N == 0:
    # Nothing to score; avoid dividing by zero.
    return 0.0, 0.0
  count = 0
  f1 = 0
  eval_examples_no_skip = [example for example in data if not example.skip]
  for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
    # take the required Sample object with the ground-truth answers in it
    squad_eg = eval_examples_no_skip[idx]
    offsets = squad_eg.context_token_to_char
    # The offsets were computed by Sample.preprocess on the whitespace-normalized
    # context, so the same normalization is needed before slicing with them.
    context = " ".join(str(squad_eg.context).split())
    start = np.argmax(start)
    end = np.argmax(end)
    if start >= len(offsets):
      # Start falls in the question/padding part of the input: counted as wrong.
      continue
    pred_char_start = offsets[start][0]
    if end < len(offsets):
      pred_char_end = offsets[end][1]
      pred_ans = context[pred_char_start:pred_char_end]
    else:
      pred_ans = context[pred_char_start:]
    normalized_pred_ans = normalize_text(pred_ans)
    normalized_true_ans = [normalize_text(answer) for answer in squad_eg.all_answers]
    if normalized_pred_ans in normalized_true_ans:
      count += 1
    f1 += max(compute_f1(normalized_pred_ans, answer) for answer in normalized_true_ans)

  acc = count / N
  f1_score = f1 / N
  return acc, f1_score
      

In [None]:
# Module-level optimizer and loss used by train_step / test_step.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
# from_logits=False because the model outputs are already softmax probabilities.
loss_tracker = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
def train_step(data, model):
  """Run one optimisation step on a single batch.

  Uses the module-level ``loss_tracker`` and ``optimizer``.

  Args:
    data: ``(X, y)`` pair; ``X`` is fed to ``model`` and ``y`` to the loss.
    model: the model to call and whose ``trainable_weights`` are updated.

  Returns:
    ``{'loss': loss}`` where ``loss`` is the value returned by ``loss_tracker``.
  """
  X, y = data
  with tf.GradientTape() as tape:
    y_pred = model(X, training=True)
    loss = loss_tracker(y, y_pred)

  # Differentiate and update the weights of the model that was passed in,
  # rather than reaching for a notebook-level model variable.
  trainable_weights = model.trainable_weights
  gradients = tape.gradient(loss, trainable_weights)
  optimizer.apply_gradients(zip(gradients, trainable_weights))
  return {'loss': loss}

def test_step(data, model):
  X, y = data
  y_pred = model.predict(X)
  # loss
  val_loss = loss_tracker(y, y_pred)
  pred_start, pred_end = y_pred
  val_acc, f1 = accuracy(pred_start, pred_end, eval_squad_examples)
  return {'val_loss': val_loss, 'val_acc': val_acc, 'f1_score': f1}

In [None]:
def train(model, X_train, X_val, epochs=10, train_steps=None, val_steps=None):
  """Custom training loop with batch size 1 and per-epoch evaluation.

  Relies on the module-level ``train_step``, ``test_step``, ``accuracy`` and
  ``train_squad_examples``.

  Args:
    model: model to train; must support ``predict``.
    X_train: ``(x_train, y_train)`` as returned by ``create_inputs_targets``.
    X_val: ``(x_val, y_val)`` in the same layout.
    epochs: number of passes over the training prefix.
    train_steps: number of leading training samples used per epoch;
      ``None`` uses all of them.
    val_steps: number of leading validation samples evaluated per epoch;
      ``None`` uses all of them.

  Returns:
    dict of per-epoch lists with keys ``'loss'``, ``'exact_match'``,
    ``'f1_score'``, ``'val_loss'``, ``'val_EM'`` and ``'val_f1_score'``.
  """
  x_train, y_train = X_train
  x_val, y_val = X_val
  n_samples = len(x_train[0])
  # Never ask for more steps than there are samples.
  n_steps = n_samples if train_steps is None else min(train_steps, n_samples)

  loss = []
  exact = []
  f1_score = []
  val_loss = []
  val_exact = []
  val_f1_score = []

  # Training prefix that is re-scored after every epoch, and the validation prefix.
  data_x = [x[:n_steps] for x in x_train]
  x_val = [x[:val_steps] for x in x_val]
  y_val = [y[:val_steps] for y in y_val]
  print('start training...')
  for epoch in range(epochs):
    print(f"Epoch: {epoch+1}")
    temp_loss = 0
    start = time.time()
    for i in range(n_steps):
      # One sample per step; slicing keeps the leading batch dimension.
      batch = ([x[i:i+1] for x in x_train], [y[i:i+1] for y in y_train])
      results = train_step(batch, model)
      temp_loss += results['loss'].numpy()
    mean_loss = temp_loss / max(n_steps, 1)
    loss.append(mean_loss)

    # Exact match / F1 on the training prefix.
    y_pred = model.predict(data_x)
    pred_start, pred_end = y_pred[0], y_pred[1]
    acc = accuracy(pred_start, pred_end, train_squad_examples)
    exact.append(acc[0])
    f1_score.append(acc[1])

    print('prediction...')
    val_results = test_step((x_val, y_val), model)
    val_loss.append(val_results['val_loss'])
    val_exact.append(val_results['val_acc'])
    val_f1_score.append(val_results['f1_score'])
    end = time.time()

    # show performance after every epoch (the last field is the validation F1)
    print(f"Time used: {end-start}\tloss: {mean_loss:.4f}\texact: {exact[-1]:.4f}\tf1_score: {f1_score[-1]:.4f}\tval_loss: {val_loss[-1]:.4f}\tval_EM: {val_exact[-1]}\tval_f1_score: {val_f1_score[-1]:.4f}\n")

  return {'loss': loss, 'exact_match': exact, 'f1_score': f1_score, 'val_loss': [x.numpy() for x in val_loss], 'val_EM': val_exact, 'val_f1_score': val_f1_score}

In [None]:
# Assemble the trainable model: three token inputs in, start/end probabilities out.
electra_model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=[start_probs, end_probs])

In [None]:
# Train with the default number of epochs, limiting each epoch to 9000 training steps
# and 3500 validation samples.
history = train(electra_model, (x_train, y_train), (x_eval, y_eval), train_steps=9000, val_steps=3500)

start training...
Epoch: 1
prediction...
Time used: 1182.8075726032257	loss: 1.3359	exact: 0.7550	f1_score: 0.8643	val_loss: 1.4836	val_EM: 0.5748571428571428	f1_score: 0.8643

Epoch: 2
prediction...
Time used: 1171.6056807041168	loss: 0.6287	exact: 0.8371	f1_score: 0.9152	val_loss: 1.6709	val_EM: 0.5797142857142857	f1_score: 0.9152

Epoch: 3
prediction...
Time used: 1169.7826507091522	loss: 0.3796	exact: 0.8862	f1_score: 0.9396	val_loss: 2.0373	val_EM: 0.58	f1_score: 0.9396

Epoch: 4
prediction...
Time used: 1164.7095324993134	loss: 0.2522	exact: 0.9142	f1_score: 0.9513	val_loss: 2.2218	val_EM: 0.5751428571428572	f1_score: 0.9513

Epoch: 5
prediction...
Time used: 1163.3522696495056	loss: 0.1867	exact: 0.9260	f1_score: 0.9593	val_loss: 2.4106	val_EM: 0.5688571428571428	f1_score: 0.9593

Epoch: 6
prediction...
Time used: 1165.4316608905792	loss: 0.1468	exact: 0.9483	f1_score: 0.9704	val_loss: 2.5207	val_EM: 0.5817142857142857	f1_score: 0.9704

Epoch: 7
prediction...
Time used: 1162.693

In [None]:
def Save_performance(history, name='Electra'):
  """Write a training history to ``<name>.json``.

  Every value of every metric list is converted with ``float`` so the
  result is JSON-serialisable.
  """
  serialisable = {}
  for metric, values in history.items():
    serialisable[metric] = [float(value) for value in values]
  with open(name + '.json', 'w') as handle:
    json.dump(serialisable, handle)

In [None]:
# Persist the per-epoch metrics to 'electra_history.json'.
Save_performance(history, name='electra_history')

In [None]:
# Export the trained model to the 'electra_saved_model' directory, without optimizer state.
electra_model.save('electra_saved_model', include_optimizer=False)



INFO:tensorflow:Assets written to: electra_saved_model/assets


INFO:tensorflow:Assets written to: electra_saved_model/assets


In [None]:
# Colab-only: mount Google Drive at /content/drive so the saved model can be moved there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def generate_prediction(model, data):
    """Yield ``(question_id, predicted_answer_text)`` for a SQuAD-format dict.

    The dataset is converted with ``create_squad_examples`` and
    ``create_inputs_targets``, scored with ``model.predict`` and the argmax
    start/end tokens are mapped back to a character span of the context.
    Samples that were skipped during preprocessing, and predictions whose
    start token lies past the context tokens, produce no pair.

    Args:
        model: object whose ``predict`` returns ``(start_scores, end_scores)``.
        data: SQuAD-format dict accepted by ``create_squad_examples``.
    """
    print('Create samples...')
    sample_examples = create_squad_examples(data)
    print('Samples creation completed...')
    print('Create input data...')
    x_eval, _ = create_inputs_targets(sample_examples)
    st = time.time()
    pred_start, pred_end = model.predict(x_eval)
    ed = time.time()
    print(f"Time for prediction: {(ed - st)}s")
    eval_examples_no_skip = [example for example in sample_examples if not example.skip]
    for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
        squad_eg = eval_examples_no_skip[idx]
        offsets = squad_eg.context_token_to_char
        # The offsets index the whitespace-normalized context built in
        # Sample.preprocess, so apply the same normalization before slicing.
        context = " ".join(str(squad_eg.context).split())
        # Most likely start / end token
        start = np.argmax(start)
        end = np.argmax(end)
        if start >= len(offsets):
            continue
        pred_char_start = offsets[start][0]
        if end < len(offsets):
            pred_char_end = offsets[end][1]
            pred_ans = context[pred_char_start:pred_char_end]
        else:
            pred_ans = context[pred_char_start:]

        yield squad_eg.q_ids, pred_ans

       


def save(generator, path='electra_prediction.json'):
    """Dump ``(key, value)`` pairs from ``generator`` as a JSON object.

    Args:
        generator: iterable of 2-item pairs, e.g. ``generate_prediction(...)``.
        path: destination file; defaults to the original hard-coded name.
    """
    import json
    with open(path, 'w') as p:
        json.dump(dict(generator), p)


In [None]:
# save(generate_prediction(electra_model, raw_eval_data))

In [None]:
# Predict answers for the held-out `test` split and write them to 'electra_prediction.json'.
save(generate_prediction(electra_model, test))

Create samples...
Samples creation completed...
Create input data...
Time for prediction: 377.07872462272644s


In [None]:
# Move the exported model directory into a Drive folder, creating the folder if needed.
# NOTE(review): `folder` is a relative path, while Drive was mounted at '/content/drive';
# this presumably relies on the working directory being /content - confirm.
import shutil
import os
folder = os.path.join('drive', 'MyDrive', 'saved_nlp_models')
if not os.path.exists(folder):
  os.makedirs(folder)
# Last expression of the cell: its return value (the destination path) is displayed.
shutil.move('electra_saved_model', folder)

'drive/MyDrive/saved_nlp_models/electra_saved_model'

In [None]:
def generate_prediction_(model, data):
    """Collect the predictions that match none of the reference answers.

    The dataset is converted with ``create_squad_examples`` and
    ``create_inputs_targets``, scored with ``model.predict`` and each argmax
    span is compared, after ``normalize_text``, with the sample's normalized
    reference answers. Samples skipped during preprocessing, and predictions
    whose start token lies past the context tokens, are not reported.

    Args:
        model: object whose ``predict`` returns ``(start_scores, end_scores)``.
        data: SQuAD-format dict accepted by ``create_squad_examples``; its
            questions must carry answers.

    Returns:
        dict mapping question id to the normalized wrong prediction.
    """
    res = {}
    print('Create samples...')
    sample_examples = create_squad_examples(data)
    print('Samples creation completed...')
    print('Create input data...')
    x_eval, _ = create_inputs_targets(sample_examples)
    st = time.time()
    pred_start, pred_end = model.predict(x_eval)
    ed = time.time()
    print(f"Time for prediction: {(ed - st)}s")

    eval_examples_no_skip = [example for example in sample_examples if not example.skip]
    for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
        squad_eg = eval_examples_no_skip[idx]
        offsets = squad_eg.context_token_to_char
        q_id = squad_eg.q_ids
        # The offsets index the whitespace-normalized context built in
        # Sample.preprocess, so apply the same normalization before slicing.
        context = " ".join(str(squad_eg.context).split())
        # Most likely start / end token
        start = np.argmax(start)
        end = np.argmax(end)
        if start >= len(offsets):
            continue
        pred_char_start = offsets[start][0]
        if end < len(offsets):
            pred_char_end = offsets[end][1]
            pred_ans = context[pred_char_start:pred_char_end]
        else:
            pred_ans = context[pred_char_start:]
        normalized_pred_ans = normalize_text(pred_ans)
        normalized_true_ans = [normalize_text(answer) for answer in squad_eg.all_answers]
        if normalized_pred_ans not in normalized_true_ans:
            res[q_id] = normalized_pred_ans
    return res



In [None]:
# Reload a saved model from the Drive folder; model_name[-1] selects 'electra_saved_model'.
# NOTE(review): the loaded `model` is not what the next cell evaluates - that cell passes
# the in-memory `electra_model` to generate_prediction_. Confirm which one is intended.
import os
model_name = ['expert_saved_model', 'baseline_saved_model', 'electra_saved_model']
folder = os.path.join('drive', 'MyDrive', 'saved_nlp_models')

model = tf.keras.models.load_model(os.path.join(folder, model_name[-1]))

In [None]:
# Map question id -> normalized prediction for every test question answered incorrectly.
results = generate_prediction_(electra_model, test)

Create samples...
Samples creation completed...
Create input data...
Time for prediction: 377.0438332557678s


In [None]:
# Persist the incorrect predictions for later inspection.
with open('electra_wrongly_classified.json', 'w') as obj:
  json.dump(results, obj)

In [None]:
def get_raw_scores(dataset):
    """Map every question id to ``[question_text, gold_answer_texts]``.

    Despite its name the function computes no scores. Answers whose text is
    empty after ``normalize_text`` are left out of the gold list.
    """
    all_qas = (
        qa
        for article in dataset['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    )
    lookup = {}
    for qa in all_qas:
        kept_answers = [
            answer['text']
            for answer in qa['answers']
            if normalize_text(answer['text'])
        ]
        lookup[qa['id']] = [qa['question'], kept_answers]
    return lookup

In [None]:
# Question id -> [question text, gold answer texts] for the test split.
ground_truth = get_raw_scores(test)

In [None]:
# Number of distinct question ids in the test split.
len(ground_truth.keys())

In [None]:
# Number of test questions whose prediction matched no reference answer.
print(f"Number of misclassified: {len(results.keys())}")

In [None]:
# Print the wrong predictions found among the first 51 ground-truth entries: the loop
# stops at i == 50 regardless of how many entries were actually printed.
# v is [question text, gold answers]; v[1][0] is the first gold answer.
for i, (k,v) in enumerate(ground_truth.items()):
  if k in results:
    print(f"id: {k}\nQuestion: {v[0]}\nGround truth: {v[1][0]}\nPrediction: {results[k]}\n")
  if i==50:
    break


id: 5728027d3acd2414000df20d
Question: What year was the PlayStation 3 released?
Ground truth: 2006
Prediction: 2005

id: 5728027d3acd2414000df20f
Question: What was the thinner version of the PS3 called?
Ground truth: Slim
Prediction: 

id: 5728027d3acd2414000df210
Question: What year did the Super Slim model hit stores?
Ground truth: 2012
Prediction: 2009

id: 572805603acd2414000df27a
Question: What event did Sony take the PlayStation 3 to four months after E3?
Ground truth: Tokyo Game Show
Prediction: e3 2005

id: 5728066eff5b5019007d9b2b
Question: By the time the system appeared at E3 2006, how many Ethernet ports was it down to?
Ground truth: one
Prediction: one hdmi port one ethernet port and four usb ports

id: 5728066eff5b5019007d9b2d
Question: In addition to the 20 GB model, what larger model did Sony offer?
Ground truth: 60 GB
Prediction: 60 gb model

id: 572807bb3acd2414000df2b1
Question: Which region experience a setback that pushed back the release of the PlayStation 3?
Gr

In [None]:
# Share of test questions whose id appears in `results`.
# NOTE(review): the stored output for this cell (0.0) does not agree with the
# misclassified examples printed by the previous cell - possibly the cells were run out
# of order; rerun from a fresh kernel to confirm.
print(f"Proportion of wrongly classifier: {len(results.keys())/len(ground_truth.keys())}")

Proportion of wrongly classifier: 0.0
