<a href="https://colab.research.google.com/github/duncansamuelgeorgefreeman/colab/blob/master/bert_Squad_QA_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
#'/gdrive/My Drive/foo.txt'

In [4]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

In [5]:
# config
MAX_LEN = 384
CONFIG = BertConfig()

In [6]:
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "/gdrive/My Drive/bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

('/gdrive/My Drive/bert_base_uncased/tokenizer_config.json',
 '/gdrive/My Drive/bert_base_uncased/special_tokens_map.json',
 '/gdrive/My Drive/bert_base_uncased/vocab.txt',
 '/gdrive/My Drive/bert_base_uncased/added_tokens.json')

In [7]:
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

### Load Data

In [8]:
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)

### Preprocess Data

EXAMPLE

```
FeaturesDict({
    'answers': Sequence({
        'answer_start': tf.int32,
        'text': Text(shape=(), dtype=tf.string),
    }),
    'context': Text(shape=(), dtype=tf.string),
    'id': tf.string,
    'question': Text(shape=(), dtype=tf.string),
    'title': Text(shape=(), dtype=tf.string),
})
```
answers/answer_start:
56 (answer start index).

answers/answer_text:
mobile phone (the text of the answer)

context:
The difference in the above factors and cause is mobile phones for the case of θ=0 is the reason that most broadcasting (transmissions intended for the public) uses vertical polarization.

id:
hash id of item

question:
Test of question for which answer is answer_text (text of question).

title:
Radio_devices (title of item).

In [9]:
# 1. Go through json and store every example as a SquadExample object;
# 2. Go through each SquadExample object and create x_train, y_train, x_eval, y_eval

class SquadExample:
  def __init__(self, question, context, answer_start_idx, answer_text, all_answers):
    self.question = question
    self.context = context
    self.answer_start_idx = answer_start_idx
    self.answer_text = answer_text
    self.all_answers = all_answers
    self.skip = False

  def preprocess(self):
    # Clean context, question, answer text.
    context = " ".join(str(self.context).split())
    question = " ".join(str(self.question).split())
    answer_text = " ".join(str(self.answer_text).split())
    answer_start_idx = self.answer_start_idx

    #print("question: {}\nanswer_text: {}\nanswer_start_idx: {}".format(question, answer_text, answer_start_idx))

    # Find end_idx of answer in context.
    answer_end_idx = answer_start_idx + len(answer_text)
    if answer_end_idx >= len(context):
      self.skip = True
      return

    # Create array of 0's len(context) and add mask for indices of answer.
    answer_mask = [0] * len(context)
    for idx in range(answer_start_idx, answer_end_idx):
      answer_mask[idx] = 1

    # Tokenize context
    tokenized_context = tokenizer.encode(context)

    # Find and save indices of tokens of answer.
    answer_tokens_idx = []
    for idx, (start, end) in enumerate(tokenized_context.offsets):
      if sum(answer_mask[start:end]) > 0:
        answer_tokens_idx.append(idx)

    if len(answer_tokens_idx) == 0:
      self.skip = True
      return

    #print("answer_tokens_idx: ", answer_tokens_idx)

    # Get start and end token index for tokens from answer.
    start_token_idx = answer_tokens_idx[0]
    end_token_idx = answer_tokens_idx[-1]

    # Tokenize question.
    tokenized_question = tokenizer.encode(question)

    # Create model inputs.
    input_ids = tokenized_context.ids + tokenized_question.ids[1:]
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
    attention_mask = [1] * len(input_ids)

    # Pad and create attention masks
    padding_length = MAX_LEN - len(input_ids)
    if padding_length > 0:
      input_ids = input_ids + ([0] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)
      token_type_ids = token_type_ids + ([0] * padding_length)
    else:
      self.skip = True
      return

    self.input_ids = input_ids
    self.token_type_ids = token_type_ids
    self.attention_mask = attention_mask
    self.start_token_idx = start_token_idx
    self.end_token_idx = end_token_idx
    self.context_token_to_char = tokenized_context.offsets

with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)

def create_squad_examples(raw_data):
  squad_examples = []
  for item in raw_data["data"]:
    for paragraph in item["paragraphs"]:
      context = paragraph["context"]
      for qa in paragraph["qas"]:
        question = qa["question"]
        answer_text = qa["answers"][0]["text"]
        all_answers = [_["text"] for _ in qa["answers"]]
        start_char_idx = qa["answers"][0]["answer_start"]
        squad_ex = SquadExample(question, context, start_char_idx, answer_text, all_answers)
        squad_ex.preprocess()
        squad_examples.append(squad_ex)
  return squad_examples


def create_input_targets(squad_examples):
  dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
  }

  for item in squad_examples:
    if item.skip == False:
      for key in dataset_dict:
        dataset_dict[key].append(getattr(item, key))
  for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

  x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],]

  y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
  return x, y

train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_input_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")

eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_input_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")


87599 training points created.
10570 evaluation points created.


In [10]:
# model

def create_model():
  # BERT encoder
  encoder = TFBertModel.from_pretrained("bert-base-uncased")

  # Question/Answer model
  input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
  token_type_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
  attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)

  embedding = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

  start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
  start_logits = layers.Flatten()(start_logits)
  end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
  end_logits = layers.Flatten()(end_logits)

  start_probs = layers.Activation(keras.activations.softmax)(start_logits)
  end_probs = layers.Activation(keras.activations.softmax)(end_logits)

  model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
  
  loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
  optimizer = keras.optimizers.Adam(lr=5e-5)
  model.compile(optimizer=optimizer, loss=[loss, loss])
  return model

In [11]:
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.39.254.114:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.39.254.114:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f6a989be110> is not a module, class, method, function, traceback, frame, or code object


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f6a989be110> is not a module, class, method, function, traceback, frame, or code object


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f6a989be110> is not a module, class, method, function, traceback, frame, or code object





Cause: while/else statement not yet supported


The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
Cause: while/else statement not yet supported


Cause: while/else statement not yet supported
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_1[0][0]                    
                                                

In [12]:
 ## Create Evaluation Callback

 def normalize_text(text):
   text = text.lower()
   exclude = set(string.punctuation)
   text = "".join(ch for ch in text if ch not in exclude)

   regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
   text = re.sub(regex, " ", text)
   text = " ".join(text.split())
   return text

class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")


## Train and Evaluate

In [13]:
exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=3,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
    callbacks=[exact_match_callback],
)

Epoch 1/3


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.






1346/1346 - 322s - loss: 2.5664 - activation_4_loss: 1.3368 - activation_5_loss: 1.2295


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.



epoch=1, exact match score=0.77
Epoch 2/3
1346/1346 - 210s - loss: 1.5512 - activation_4_loss: 0.8248 - activation_5_loss: 0.7264

epoch=2, exact match score=0.79
Epoch 3/3
1346/1346 - 210s - loss: 1.0829 - activation_4_loss: 0.5811 - activation_5_loss: 0.5017

epoch=3, exact match score=0.79


<tensorflow.python.keras.callbacks.History at 0x7f6937cb6358>