<a href="https://colab.research.google.com/github/cyyeh/kaggle/blob/master/google-qa/google_qa_shortans_albert_tpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TPU Training for NQA Short Answers


---

# Import Libraries and Environment Setup

In [0]:
# make sure colab use tf2.x
try:
  %tensorflow_version 2.x
except Exception:
  pass

In [0]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os

In [21]:
# install huggingface transformers
!pip install transformers



In [0]:
from transformers import TFAlbertPreTrainedModel, TFAlbertModel, AlbertConfig
from transformers.modeling_tf_utils import get_initializer

### Setup TPU

In [23]:
# create tpu resolver and strategy
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(resolver)

KeyError: ignored

### Load Training Dataset

In [0]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Define Task Flag Here

In [0]:
SHORT_ANS_YESNO = 'short_ans_yesno'
SHORT_ANS_ENTITY = 'short_ans_entity'

In [0]:
def read_short_ans_train_dataset(task=SHORT_ANS_YESNO):
  assert task in {SHORT_ANS_YESNO, SHORT_ANS_ENTITY}, \
    f"task should be {SHORT_ANS_YESNO} or {SHORT_ANS_ENTITY}"

  SHORT_ANS_YESNO_DF = f"{SHORT_ANS_YESNO}.pkl"
  SHORT_ANS_ENTITY_DF = f"{SHORT_ANS_ENTITY}.pkl"
  if task == SHORT_ANS_YESNO:
    datapath = f"drive/My Drive/{SHORT_ANS_YESNO_DF}"
    if not os.path.exists(datapath):
      print("short answer yesno dataset is not found!")
      return
  elif task == SHORT_ANS_ENTITY:
    datapath = f"drive/My Drive/{SHORT_ANS_ENTITY_DF}"
    if not os.path.exists(f"drive/My Drive/{SHORT_ANS_ENTITY_DF}"):
      print("short answer entity dataset is not found!")
      return
  
  return pd.read_pickle(datapath)

In [0]:
train_df = read_short_ans_train_dataset(task=SHORT_ANS_ENTITY)

If training dataset is not found, please check this [Colab notebook for preparing training data](https://colab.research.google.com/drive/122bYIInseyFwrRFlTNEGLSNFP594i9OV)

In [0]:
print(len(train_df))
train_df.head()

152148


Unnamed: 0,token_id,segment_id,mask_id,label_start_token,label_end_token
0,"[2, 56, 25, 14, 127, 757, 275, 16, 17034, 8, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",28,39
1,"[2, 184, 31, 9, 5909, 154, 449, 72, 25, 14, 44...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",16,18
2,"[2, 98, 1001, 16, 4270, 8005, 4330, 1384, 209,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,0
3,"[2, 72, 41, 14, 127, 4041, 19, 14, 4101, 3, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",16,18
4,"[2, 72, 257, 169, 3409, 16931, 16, 14, 9358, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",15,20


Create distributed dataset

In [0]:
def short_ans_df_to_dataset(df, batch_size=16, task=SHORT_ANS_YESNO, dist_train=True):
  assert task in {SHORT_ANS_YESNO, SHORT_ANS_ENTITY}, \
    f"task should be {SHORT_ANS_YESNO} or {SHORT_ANS_ENTITY}"

  df = df.copy()

  if task == 'short_ans_yesno':
    label_yes_no = df.pop('label_yes_no')
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), label_yes_no))
  elif task == 'short_ans_entity':
    label_start_token = df.pop('label_start_token')
    label_end_token = df.pop('label_end_token')
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), label_start_token, label_end_token))
  
  dataset = (dataset
              .shuffle(buffer_size=len(df))
              .batch(batch_size, drop_remainder=True)
            )
  
  return (
    tpu_strategy.experimental_distribute_dataset(dataset) 
    if dist_train else dataset
  )

In [0]:
dist_train_ds = short_ans_df_to_dataset(train_df, task=SHORT_ANS_ENTITY)

### [Short Ans YESNO] Create TFAlbertForSequenceClassification Model

In [0]:
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
  def __init__(self, config, *inputs, **kwargs):
    super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
    self.num_labels = config.num_labels

    self.albert = TFAlbertModel(config, name="albert")
    self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
    self.classifier = tf.keras.layers.Dense(
      config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
    )

  def call(self, inputs, **kwargs):
    outputs = self.albert(inputs, **kwargs)

    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
    logits = self.classifier(pooled_output)

    outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
    return outputs  # logits, (hidden_states), (attentions)

### [Short Ans Entity]Create TFAlbertForQuestionAnswering

In [0]:
class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel):
  def __init__(self, config, *inputs, **kwargs):
    super().__init__(config, *inputs, **kwargs)
    self.num_labels = config.num_labels

    self.albert = TFAlbertModel(config, name="albert")
    self.qa_outputs = tf.keras.layers.Dense(
      config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
    )

  def call(self, inputs, **kwargs):  
    outputs = self.albert(inputs, **kwargs)

    sequence_output = outputs[0]

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = tf.split(logits, 2, axis=-1)
    start_logits = tf.squeeze(start_logits, axis=-1)
    end_logits = tf.squeeze(end_logits, axis=-1)

    outputs = (start_logits, end_logits,) + outputs[2:]

    return outputs  # start_logits, end_logits, (hidden_states), (attentions)

### Create Model

In [0]:
def create_short_ans_model(task=SHORT_ANS_YESNO):
  assert task in {SHORT_ANS_YESNO, SHORT_ANS_ENTITY}, \
    f"task should be {SHORT_ANS_YESNO} or {SHORT_ANS_ENTITY}"

  # input layers
  token_ids = keras.Input(shape=(512,), dtype='int32', name='token_ids')
  segment_ids = keras.Input(shape=(512,), dtype='int32', name='segment_ids')
  mask_ids = keras.Input(shape=(512,), dtype='int32', name='mask_ids')

  if task == SHORT_ANS_YESNO:
    config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=3)
    albert_qa_layer = TFAlbertForSequenceClassification(config)
  else:
    albert_qa_layer = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')

  # both tasks use the same input format
  albert_qa_outputs = albert_qa_layer([token_ids, mask_ids, segment_ids])

  if task == SHORT_ANS_YESNO:
    logits = albert_qa_outputs[0]

    # create model
    model = keras.Model(
      inputs=[token_ids, mask_ids, segment_ids], 
      outputs=[logits]
    )
  else:
    start_logits, end_logits = albert_qa_outputs[:2]

    # create model
    model = keras.Model(
      inputs=[token_ids, mask_ids, segment_ids], 
      outputs=[start_logits, end_logits]
    )

  return model

### TPU Training

In [0]:
def train_short_ans_using_tpu(
    dist_train_ds, 
    task=SHORT_ANS_YESNO, 
    learning_rate=2e-5, 
    epsilon=1e-8, 
    epochs=10,
    batch_size=16
):
  @tf.function
  def train_step(dist_inputs, task=SHORT_ANS_YESNO):
    # calculate loss and gradient for each replica
    def step_fn_yesno(inputs):
      features, label_yes_no = inputs
      one_hot_label = tf.one_hot(label_yes_no, 3)
      one_hot_label_index = tf.argmax(one_hot_label, axis=1)

      with tf.GradientTape() as tape:
        logits = model(features)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=one_hot_label_index, logits=logits)
        avg_loss = loss / batch_size

      gradients = tape.gradient(avg_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model.trainable_variables))

      train_loss(avg_loss)
      label_yes_no_train_accuracy(one_hot_label_index, logits)

      return avg_loss

    def step_fn_entity(inputs):
      features, start_tokens, end_tokens = inputs

      with tf.GradientTape() as tape:
        start_logits, end_logits = model(features)
          
        start_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=start_tokens, logits=start_logits)
        end_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=end_tokens, logits=end_logits)

        loss = (start_loss + end_loss) / 2.0
        avg_loss = loss / batch_size

      gradients = tape.gradient(avg_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model.trainable_variables))

      train_loss(avg_loss)
      start_train_accuracy(start_tokens, start_logits)
      end_train_accuracy(end_tokens, end_logits)

      return avg_loss
    
    # combine loss for all replicas
    if task == SHORT_ANS_YESNO:
      per_example_losses = tpu_strategy.experimental_run_v2(step_fn_yesno, args=(dist_inputs,))
    else:
      per_example_losses = tpu_strategy.experimental_run_v2(step_fn_entity, args=(dist_inputs,))
    sum_loss = tpu_strategy.reduce(tf.distribute.ReduceOp.SUM, per_example_losses, axis=0)
    return sum_loss  
  
  assert task in {SHORT_ANS_YESNO, SHORT_ANS_ENTITY}, \
    f"task should be {SHORT_ANS_YESNO} or {SHORT_ANS_ENTITY}"

  train_loss = tf.keras.metrics.Mean(name='train_loss')
  if task == SHORT_ANS_YESNO:
    label_yes_no_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
      name='label_yes_no_train_accuracy'
    )
  else:
    start_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
      name='start_train_accuracy'
    )
    end_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
      name='end_train_accuracy'
    )

  with tpu_strategy.scope():
    model = create_short_ans_model(task)
    optimizer = tf.keras.optimizers.Adam(
      learning_rate=learning_rate, 
      epsilon=epsilon
    )

    train_loss.reset_states()
    if task == SHORT_ANS_YESNO:
      label_yes_no_train_accuracy.reset_states()
    else:
      start_train_accuracy.reset_states()
      end_train_accuracy.reset_states()

    for epoch in range(epochs):
      i = 0
      for inputs in dist_train_ds:
        train_step(inputs, task)
        i = i + 1
        
        training_result = f"epoch: {epoch}, batch: {i}, loss: {train_loss.result()}, "
        if task == SHORT_ANS_YESNO:
          training_result += f"label_yes_no_train_accuracy: {label_yes_no_train_accuracy.result()*100}"
        else:
          training_result += f"start_accuracy: {start_train_accuracy.result()*100}, end_accuracy: {end_train_accuracy.result()*100}"
        print(training_result)

        train_loss.reset_states()
        if task == SHORT_ANS_YESNO:
          label_yes_no_train_accuracy.reset_states()
        else:
          start_train_accuracy.reset_states()
          end_train_accuracy.reset_states()

In [0]:
train_short_ans_using_tpu(dist_train_ds, task=SHORT_ANS_ENTITY)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Cell is empty


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Cell is empty


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Cell is empty








epoch: 0, batch: 1, loss: 0.38915392756462097, start_accuracy: 0.0, end_accuracy: 0.0
epoch: 0, batch: 2, loss: 0.3564722537994385, start_accuracy: 0.0, end_accuracy: 0.0
epoch: 0, batch: 3, loss: 0.36094802618026733, start_accuracy: 0.0, end_accuracy: 0.0
epoch: 0, batch: 4, loss: 0.35990041494369507, start_accuracy: 0.0, end_accuracy: 0.0
epoch: 0, batch: 5, loss: 0.24091550707817078, start_accuracy: 50.0, end_accuracy: 50.0
epoch: 0, batch: 6, loss: 0.2908211648464203, start_accuracy: 50.0, end_accuracy: 50.0
epoch: 0, batch: 7, loss: 0.3150749206542969, start_accuracy: 0.0, end_accuracy: 0.0
epoch: 0, batch: 8, loss: 0.2462438941001892, start_accuracy: 50.0, end_accuracy: 50.0
epoch: 0, batch: 9, loss: 0.2815420627593994, start_accuracy: 0.0, end_accuracy: 0.0
epoch: 0, batch: 10, loss: 0.1546507030725479, start_accuracy: 50.0, end_accuracy: 50.0
epoch: 0, batch: 11, loss: 0.181248277425766, start_accuracy: 50.0, end_accuracy: 50.0
epoch: 0, batch: 12, loss: 0.2030486911535263, sta

KeyboardInterrupt: ignored