# BERT Q&A Training

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from scripts import tf2_0_baseline_w_bert_translated_to_tf2_0 as tf2baseline # Oliviera's script
from scripts.tf2_0_baseline_w_bert_translated_to_tf2_0 import AnswerType
from scripts import bert_modeling as modeling
from scripts import bert_optimization as optimization
from scripts import bert_tokenization as tokenization
from scripts import albert
from scripts import albert_tokenization

import tqdm
import json
import absl
import sys
import os

TF 2.0 Baseline Loaded


## Define Flags

In [7]:
def del_all_flags(FLAGS):
    """Delete every flag currently registered on `FLAGS`.

    Called before the flag definitions in this cell, presumably so the cell
    can be re-run in the same kernel without the flags already existing.

    Args:
        FLAGS: flag container exposing `_flags()` (iterable of flag names)
            and `__delattr__(name)`.
    """
    # Snapshot the names first, in case deleting a flag mutates the mapping
    # returned by `_flags()`.
    for flag_name in list(FLAGS._flags()):
        FLAGS.__delattr__(flag_name)

# Remove any flags that are already registered before (re)defining them below.
del_all_flags(absl.flags.FLAGS)

flags = absl.flags

## Model / path flags

flags.DEFINE_string(
    "model", "albert",
    "The name of model to use. Choose from ['bert', 'albert'].")

flags.DEFINE_string(
    "config_file", "models/albert_xxl/config.json",
    "The config json file corresponding to the pre-trained BERT/ALBERT model. "
    "This specifies the model architecture.")

flags.DEFINE_string("vocab_file", "models/albert_xxl/vocab/modified-30k-clean.model",
                    "The vocabulary file that the ALBERT/BERT model was trained on.")

flags.DEFINE_string(
    "output_dir", "output/",
    "The output directory where the model checkpoints will be written.")

flags.DEFINE_string("train_precomputed_file", "data/train.tf_record",
                    "Precomputed tf records for training.")

flags.DEFINE_integer("train_num_precomputed", 272565,
                     "Number of precomputed tf records for training.")

flags.DEFINE_string(
    "output_checkpoint_file", "tf2_albert_finetuned.ckpt",
    "Where to save finetuned checkpoints to.")

# The two literals are concatenated, so the first one must end with a space.
flags.DEFINE_string(
    "output_predictions_file", "predictions.json",
    "Where to print predictions in NQ prediction format, to be passed to "
    "natural_questions.nq_eval.")

flags.DEFINE_string(
    "log_dir", "logs/",
    "Where logs, specifically Tensorboard logs, will be saved to.")

flags.DEFINE_string(
    "bert_init_checkpoint", "models/bert_joint_baseline/tf2_bert_joint.ckpt",
    "Initial checkpoint (usually from a pre-trained BERT model).")

## Tokenization / sequence flags

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

# This should be changed to 512 at some point,
# as training was done with that value, it may
# not make a big difference though
flags.DEFINE_integer(
    "max_seq_length", 384,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")

flags.DEFINE_integer(
    "doc_stride", 128,
    "When splitting up a long document into chunks, how much stride to "
    "take between chunks.")

flags.DEFINE_integer(
    "max_query_length", 64,
    "The maximum number of tokens for the question. Questions longer than "
    "this will be truncated to this length.")

## Training / prediction flags

flags.DEFINE_bool("do_train", True, "Whether to run training.")

flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.")

flags.DEFINE_integer("train_batch_size", 1, "Total batch size for training.")

flags.DEFINE_integer("predict_batch_size", 8,
                     "Total batch size for predictions.")

flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")

flags.DEFINE_integer("num_train_epochs", 3,
                     "Total number of training epochs to perform.")

flags.DEFINE_float(
    "warmup_proportion", 0.1,
    "Proportion of training to perform linear learning rate warmup for. "
    "E.g., 0.1 = 10% of training.")

flags.DEFINE_integer("save_checkpoints_steps", 1000,
                     "How often to save the model checkpoint.")

flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")

flags.DEFINE_integer(
    "n_best_size", 20,
    "The total number of n-best predictions to generate in the "
    "nbest_predictions.json output file.")

flags.DEFINE_integer(
    "verbosity", 1, "How verbose our error messages should be")

flags.DEFINE_integer(
    "max_answer_length", 30,
    "The maximum length of an answer that can be generated. This is needed "
    "because the start and end predictions are not conditioned on one another.")

flags.DEFINE_float(
    "include_unknowns", -1.0,
    "If positive, probability of including answers of type `UNKNOWN`.")

flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")

flags.DEFINE_bool("use_one_hot_embeddings", False, "Whether to use use_one_hot_embeddings")

flags.DEFINE_string(
    "gcp_project", None,
    "[Optional] Project name for the Cloud TPU-enabled project. If not "
    "specified, we will attempt to automatically detect the GCE project from "
    "metadata.")

flags.DEFINE_bool(
    "verbose_logging", False,
    "If true, all of the warnings related to data processing will be printed. "
    "A number of warnings are expected for a normal NQ evaluation.")

# TODO(Edan): Look at nested contents too at some point
# Around 5% of long answers are nested, and around 50% of questions have
# long answers
# This means that this setting alone restricts us from a correct answer
# around 2.5% of the time
flags.DEFINE_boolean(
    "skip_nested_contexts", True,
    "Completely ignore context that are not top level nodes in the page.")

flags.DEFINE_integer("task_id", 0,
                     "Train and dev shard to read from and write to.")

flags.DEFINE_integer("max_contexts", 48,
                     "Maximum number of contexts to output for an example.")

flags.DEFINE_integer(
    "max_position", 50,
    "Maximum context position for which to generate special tokens.")

## Custom flags

flags.DEFINE_integer(
    "n_examples", -1,
    "Number of examples to read from files. Only applicable during testing")

flags.DEFINE_string(
    "train_file", "data/simplified-nq-train.jsonl",
    "NQ json for training. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")

flags.DEFINE_string(
    "albert_pretrain_checkpoint", "models/albert_xxl/tf2_model.h5",
    "Pretrain checkpoint (for Albert only).")

## Special flags - do not change

flags.DEFINE_string(
    "predict_file", "data/simplified-nq-test.jsonl",
    "NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean("logtostderr", True, "Logs to stderr")
flags.DEFINE_boolean("undefok", True, "it's okay to be undefined")
# NOTE(review): 'f' and 'HistoryManager.hist_file' look like placeholders for
# arguments the Jupyter kernel launcher puts on sys.argv - confirm.
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_string('HistoryManager.hist_file', '', 'kernel')

FLAGS = flags.FLAGS
FLAGS(sys.argv) # Parse the flags

['/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py']

## Check Number of Training Examples

In [3]:
if False:
    # One-off sanity check, disabled by default: it reads both the raw JSONL
    # file and the precomputed TFRecord file from start to finish.

    # https://stackoverflow.com/questions/9629179/python-counting-lines-in-a-huge-10gb-file-as-fast-as-possible
    def blocks(f, size=65536):
        """Yield successive reads of at most `size` from `f` until it returns nothing."""
        while True:
            b = f.read(size)
            if not b:
                break
            yield b

    # tf.compat.v1.python_io.tf_record_iterator is deprecated; with eager
    # execution a TFRecordDataset can be iterated directly to count records.
    n_records = sum(1 for _ in tf.data.TFRecordDataset(FLAGS.train_precomputed_file))

    # One JSON document per line, so counting newlines counts examples.
    with open(FLAGS.train_file, 'r') as f:
        n_train_examples = sum(bl.count('\n') for bl in blocks(f))

    print('# Training Examples:', n_train_examples)
    print('# Training Records:', n_records)

    if FLAGS.do_train:
        # The step/warmup arithmetic later in the notebook trusts this flag.
        assert FLAGS.train_num_precomputed == n_records, \
            'Number of records does not match up with the given records file!'

## Build the Model

In [4]:
class TDense(tf.keras.layers.Layer):
    """Dense projection whose kernel is stored as `[output_size, input_dim]`.

    `call` computes `matmul(x, kernel, transpose_b=True) + bias`, so the last
    axis of the input is mapped to `output_size`.

    Args:
        output_size: size of the last axis of the output.
        kernel_initializer: initializer passed to `add_weight` for the kernel.
        bias_initializer: initializer passed to `add_weight` for the bias.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self,
                 output_size,
                 kernel_initializer=None,
                 bias_initializer="zeros",
                 **kwargs):
        super().__init__(**kwargs)
        self.output_size = output_size
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer

    def build(self, input_shape):
        """Validate dtype and input shape, then create `kernel` and `bias`.

        Raises:
            TypeError: if the layer dtype is neither floating nor complex.
            ValueError: if the last input dimension is undefined.
        """
        layer_dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx())
        if not layer_dtype.is_floating and not layer_dtype.is_complex:
            raise TypeError("Unable to build `TDense` layer with "
                            "non-floating point (and non-complex) "
                            "dtype %s" % (layer_dtype,))

        input_shape = tf.TensorShape(input_shape)
        last_dim = tf.compat.dimension_value(input_shape[-1])
        if last_dim is None:
            raise ValueError("The last dimension of the inputs to "
                             "`TDense` should be defined. "
                             "Found `None`.")

        ### tf 2.1 rc min_ndim=3 -> min_ndim=2
        self.input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: last_dim})

        # Kernel is laid out [out, in]; `call` transposes it inside the matmul.
        self.kernel = self.add_weight(
            name="kernel",
            shape=[self.output_size, last_dim],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)
        self.bias = self.add_weight(
            name="bias",
            shape=[self.output_size],
            initializer=self.bias_initializer,
            dtype=self.dtype,
            trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return `x @ kernel^T + bias`."""
        projected = tf.matmul(x, self.kernel, transpose_b=True)
        return projected + self.bias

In [5]:
def _build_qa_model(encoder_layer, model_name):
    """Wire `encoder_layer` to the three inputs and the question-answering heads.

    Shared by `get_bert_model` and `get_albert_model`, which differ only in
    the encoder layer and the name of the resulting model.

    Args:
        encoder_layer: layer called with `input_word_ids`, `input_mask` and
            `input_type_ids`, returning `(pooled_output, sequence_output)`.
        model_name: name given to the returned `tf.keras.Model`.
    Returns:
        A `tf.keras.Model` mapping `[input_ids, input_mask, segment_ids]` to
        `[start_logits, end_logits, ans_type_logits]`.
    """
    input_ids = tf.keras.Input(shape=(FLAGS.max_seq_length,), dtype=tf.int32, name='input_ids')
    input_mask = tf.keras.Input(shape=(FLAGS.max_seq_length,), dtype=tf.int32, name='input_mask')
    segment_ids = tf.keras.Input(shape=(FLAGS.max_seq_length,), dtype=tf.int32, name='segment_ids')

    pooled_output, sequence_output = encoder_layer(input_word_ids=input_ids,
                                                   input_mask=input_mask,
                                                   input_type_ids=segment_ids)

    # Maybe try sharing the start and end logits variables
    seq_layer = TDense(2, name='td_seq')
    # seq_layer = tf.keras.layers.TimeDistributed(seq_layer, name='td_seq')

    # A single 2-unit projection is split into the start and end logits.
    # The op names below matter: the loss/target dictionaries elsewhere in the
    # notebook are keyed by 'tf_op_layer_start_logits' / 'tf_op_layer_end_logits'.
    seq_logits = seq_layer(sequence_output)
    start_logits, end_logits = tf.split(seq_logits, axis=-1, num_or_size_splits=2, name='split')
    start_logits = tf.squeeze(start_logits, axis=-1, name='start_logits')
    end_logits = tf.squeeze(end_logits, axis=-1, name='end_logits')

    # The answer type is predicted from the pooled output, one logit per AnswerType.
    ans_type_layer = TDense(len(tf2baseline.AnswerType), name='ans_type_logits')
    ans_type_logits = ans_type_layer(pooled_output)

    return tf.keras.Model([input_ids, input_mask, segment_ids],
                          [start_logits, end_logits, ans_type_logits],
                          name=model_name)

def get_bert_model(config_file):
    """Builds and returns a BERT question-answering model.

    Args:
        config_file: path to the BERT config json file.
    Returns:
        A `tf.keras.Model` named 'bert_baseline' (see `_build_qa_model`).
    """
    config = modeling.BertConfig.from_json_file(config_file)
    bert_layer = modeling.BertModel(config=config, name='bert')
    return _build_qa_model(bert_layer, 'bert_baseline')

def get_albert_model(config_file):
    """Builds an ALBERT question-answering model with vocab_size set to 30522.

    No weights are loaded here; see `load_pretrain_weights` for that.

    Args:
        config_file: path to the ALBERT config json file.
    Returns:
        A `tf.keras.Model` named 'albert' (see `_build_qa_model`).
    """
    config = albert.AlbertConfig.from_json_file(config_file)
    config.vocab_size = 30522
    # Name the layer explicitly: `freeze_pretrain_weights` and
    # `load_pretrain_weights` look it up with get_layer('albert_model'), and an
    # auto-generated name can get a numeric suffix when the model is rebuilt.
    albert_layer = albert.AlbertModel(config=config, name='albert_model')
    return _build_qa_model(albert_layer, 'albert')

def build_model(model_name, config_file):
    """Build the model selected by `model_name`.

    Args:
        model_name: one of 'bert' or 'albert' (matched exactly)
        config_file: path to the model's config file
    Returns:
        the model built by `get_albert_model` or `get_bert_model`
    Raises:
        ValueError: if `model_name` is neither 'albert' nor 'bert'
    """
    if model_name == 'albert':
        return get_albert_model(config_file)
    if model_name == 'bert':
        return get_bert_model(config_file)
    raise ValueError('{} is not supported'.format(model_name))

In [6]:
# freeze all pretrain weights of albert
def freeze_pretrain_weights(model):
    """Mark the pretrained parts of the model's 'albert_model' layer as non-trainable.

    Sets `trainable = False` on the layer's `embedding_postprocessor`,
    `encoder` and `pooler_transform` attributes, in that order.

    Args:
        model: model exposing `get_layer('albert_model')`.
    """
    albert_layer = model.get_layer('albert_model')
    for part_name in ('embedding_postprocessor', 'encoder', 'pooler_transform'):
        getattr(albert_layer, part_name).trainable = False

# helper function to load the pretrain weights
def load_pretrain_weights(model, config_file, ckpt_file):
    """Copy pretrained ALBERT weights from a checkpoint into `model`.

    A second ALBERT model is rebuilt from `config_file` with its original
    settings, `ckpt_file` is loaded into it, and the weights of its
    `embedding_postprocessor`, `encoder` and `pooler_transform` are copied
    onto the same attributes of `model`'s 'albert_model' layer. Nothing else
    is copied (the original note says the embedding layer is skipped because
    the new model uses a vocab size of 30522 instead of 30000).

    Args:
        model: the same model architecture as the pre-trained model except for embedding
        config_file: path to the config file to re-create the pre-trained model
        ckpt_file: path to the checkpoint of the pre-trained model
    """
    # Rebuild the pretrained architecture.
    pretrain_config = albert.AlbertConfig.from_json_file(config_file)
    pretrain_layer = albert.AlbertModel(config=pretrain_config, name='albert_pretrain')

    seq_shape = (FLAGS.max_seq_length,)
    ids_in = tf.keras.Input(shape=seq_shape, dtype=tf.int32, name='input_ids_pretrain')
    mask_in = tf.keras.Input(shape=seq_shape, dtype=tf.int32, name='input_mask_pretrain')
    segments_in = tf.keras.Input(shape=seq_shape, dtype=tf.int32, name='segment_ids_pretrain')

    pooled_out, sequence_out = pretrain_layer(input_word_ids=ids_in,
                                              input_mask=mask_in,
                                              input_type_ids=segments_in)

    pretrain_model = tf.keras.Model(inputs=[ids_in, mask_in, segments_in],
                                    outputs=[pooled_out, sequence_out])

    # Fill the rebuilt model from the checkpoint.
    pretrain_model.load_weights(ckpt_file)

    # Transfer the sub-layer weights onto the new model, one part at a time.
    target_layer = model.get_layer('albert_model')
    for part_name in ('embedding_postprocessor', 'encoder', 'pooler_transform'):
        source_part = getattr(pretrain_layer, part_name)
        getattr(target_layer, part_name).set_weights(source_part.get_weights())

    del pretrain_model
    
def compile_model(model, model_type, learning_rate,
                  num_train_steps, num_warmup_steps,
                  init_checkpoint=None):
    """Optionally load initial weights, then compile `model` for QA training.

    Args:
        model: model whose outputs are named 'tf_op_layer_start_logits',
            'tf_op_layer_end_logits' and 'ans_type_logits'.
        model_type: 'bert' or 'albert' (case-insensitive).
        learning_rate: passed to `optimization.create_optimizer`.
        num_train_steps: passed to `optimization.create_optimizer`.
        num_warmup_steps: passed to `optimization.create_optimizer`.
        init_checkpoint: if truthy, weights are loaded first - directly with
            `model.load_weights` for BERT, through `load_pretrain_weights`
            (using `FLAGS.config_file`) for ALBERT.
    Raises:
        ValueError: if `model_type` is not 'bert' or 'albert'.
    """
    model_kind = model_type.lower()
    if model_kind not in ('bert', 'albert'):
        raise ValueError('`model_type` must be one of the following values: ["bert", "albert"]!')

    if init_checkpoint:
        if model_kind == 'bert':
            model.load_weights(init_checkpoint)
            print('Loaded model weights!')
        else:
            # Only 'albert' can reach this branch after the check above.
            load_pretrain_weights(model, FLAGS.config_file, init_checkpoint)

    def _mean_negative_log_likelihood(targets, logits, num_classes):
        """Mean over the batch of -log softmax(logits)[target]."""
        one_hot_targets = tf.one_hot(
            tf.cast(targets, tf.int32), depth=num_classes, dtype=tf.float32)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        per_example = tf.reduce_sum(input_tensor=one_hot_targets * log_probs, axis=-1)
        return -tf.reduce_mean(input_tensor=per_example)

    # TODO(Edan): Add a way to have no loss on this for when there is no answer
    # Computes the loss for positions.
    def compute_loss(positions, logits):
        return _mean_negative_log_likelihood(positions, logits, FLAGS.max_seq_length)

    # Computes the loss for labels.
    def compute_label_loss(labels, logits):
        return _mean_negative_log_likelihood(labels, logits, len(tf2baseline.AnswerType))

    # Keys are the model's output layer names; every output is weighted equally.
    losses = {
        output_name: compute_loss
        for output_name in ('tf_op_layer_start_logits', 'tf_op_layer_end_logits')
    }
    losses['ans_type_logits'] = compute_label_loss
    loss_weights = {output_name: 1.0 for output_name in losses}

    optimizer = optimization.create_optimizer(learning_rate,
                                              num_train_steps,
                                              num_warmup_steps)

    model.compile(optimizer=optimizer,
                  loss=losses,
                  loss_weights=loss_weights,
                  metrics=[tf.keras.metrics.sparse_categorical_accuracy])

In [7]:
tf.keras.backend.clear_session()

# Optimizer schedule: total optimizer steps over all epochs, and the number of
# those steps spent in learning-rate warmup.
num_train_features = FLAGS.train_num_precomputed
num_train_steps = int(num_train_features / FLAGS.train_batch_size *
                      FLAGS.num_train_epochs)
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

model = build_model(FLAGS.model, FLAGS.config_file)
# NOTE(review): hard-coded weights file, loaded for whichever model was built
# above - confirm it matches FLAGS.model / FLAGS.config_file.
model.load_weights('models/albert_xl/tmp_albert_weights.h5')

# Freeze BEFORE compiling: Keras documents that changes to `trainable` should
# be followed by compile(), so freezing first ensures the compiled training
# step sees the frozen layers.
if FLAGS.model.lower() == 'albert':
    freeze_pretrain_weights(model)

# To initialise from the pretrained ALBERT checkpoint instead of the weights
# file above, pass init_checkpoint=FLAGS.albert_pretrain_checkpoint.
compile_model(model=model,
              model_type=FLAGS.model,
              learning_rate=FLAGS.learning_rate,
              num_train_steps=num_train_steps,
              num_warmup_steps=num_warmup_steps,
              init_checkpoint=None)

model.summary()

Model: "albert"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 384)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 384)]        0                                            
__________________________________________________________________________________________________
albert_model (AlbertModel)      ((None, 2048), (None 58791680    input_ids[0][0]                  
                                                                 input_mask[0][0]            

## Convert Pretrained Model Weights to TensorFlow 2.0 (Only Once)

In [8]:
if False:
    # One-off conversion, disabled by default: initialises the variables of the
    # current `model` from the checkpoint at
    # '../input/bertjointbaseline/bert_joint.ckpt' and re-saves them as
    # '/kaggle/working/tf2_bert_joint.ckpt'.

    # Map var names to vars
    model_params = {v.name:v for v in model.trainable_variables}
    # Get all root name paths to weights
    model_roots = np.unique([v.name.split('/')[0] for v in model.trainable_variables])
    print(model_roots)
    # Get all saved var names
    saved_names = [k for k,v in tf.train.list_variables('../input/bertjointbaseline/bert_joint.ckpt')]
    # Checkpoint name -> the same name with a ':0' suffix (rewritten further below)
    a_map = {v:v+':0' for v in saved_names}
    # Same computation as `model_roots` above; the value is not used again in this cell.
    model_roots = np.unique([v.name.split('/')[0] for v in model.trainable_variables])
    def transform(x):
        """Rewrite a checkpoint variable name into this model's naming scheme.

        The replacements are applied in order and each one operates on the
        output of the previous ones, so the sequence must not be reordered.
        """
        x = x.replace('attention/self','attention')
        x = x.replace('attention','self_attention')
        x = x.replace('attention/output','attention_output')

        x = x.replace('/dense','')
        x = x.replace('/LayerNorm','_layer_norm')
        x = x.replace('embeddings_layer_norm','embeddings/layer_norm')

        x = x.replace('attention_output_layer_norm','attention_layer_norm')
        x = x.replace('embeddings/word_embeddings','word_embeddings/embeddings')

        x = x.replace('/embeddings/','/embedding_postprocessor/')
        x = x.replace('/token_type_embeddings','/type_embeddings')
        x = x.replace('/pooler/','/pooler_transform/')
        # The bias must be renamed before the generic 'answer_type_output_' prefix.
        x = x.replace('answer_type_output_bias','ans_type_logits/bias')
        x = x.replace('answer_type_output_','ans_type_logits/')
        x = x.replace('cls/nq/output_','td_seq/')
        x = x.replace('/weights','/kernel')

        return x
    # Maps saved name to the matching model variable (None when no variable
    # has the transformed name); 'global_step' is skipped.
    a_map = {k:model_params.get(transform(v),None) for k,v in a_map.items() if k!='global_step'}
    # Checkpoint variables with no counterpart in the model
    for saved_name, curr_var in a_map.items():
        if curr_var is None:
            print(saved_name)
    print('---------------')
    # Model variables that no checkpoint variable was mapped to
    print('Missaligned variables:', set([p for p in model_params.keys()]).difference(
        set([v.name if v is not None else None for v in a_map.values()])))
    tf.compat.v1.train.init_from_checkpoint(ckpt_dir_or_file='../input/bertjointbaseline/bert_joint.ckpt',
                                            assignment_map=a_map)
    
    model.save_weights('/kaggle/working/tf2_bert_joint.ckpt')

## Generate Formatted Training Data (TFRecord, Only Once)

In [9]:
if False:
    # One-off preprocessing, disabled by default: converts the raw NQ JSONL
    # training file into a TFRecord file under FLAGS.output_dir.

    # NOTE: shares its name with the training `data_generator` defined in the
    # "Create Generator for Training Data" cell, which replaces it once run.
    def data_generator(path, chunk_size=30000):
        """Yield lists of at most `chunk_size` parsed NQ entries read from `path`.

        Stops at end of file; a trailing empty chunk is never yielded.
        """
        with open(path, 'rt') as f:
            while True:
                examples = []
                for _ in range(chunk_size):
                    line = f.readline()
                    # readline() returns '' at end of file (never None).
                    if not line:
                        break
                    example = tf2baseline.create_example_from_jsonl(line)
                    examples.append(tf2baseline.read_nq_entry(example, FLAGS.do_train)[0])
                if examples:
                    yield examples
                # A short chunk means the file is exhausted.
                if len(examples) < chunk_size:
                    return

    chunk_size = 100
    example_gen = data_generator(FLAGS.train_file, chunk_size=chunk_size)

    train_writer = tf2baseline.FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
        is_training=FLAGS.do_train)
    # NOTE(review): every feature is kept in memory although only the first is
    # inspected below - consider keeping a single sample.
    train_features = []

    def append_feature(feature):
        """Record `feature` in memory and write it to the TFRecord file."""
        train_features.append(feature)
        train_writer.process_feature(feature)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # `n_train_examples` comes from the "Check Number of Training Examples"
    # cell; it is used only to size the progress bar. The loop itself runs
    # until the generator is exhausted.
    for examples in tqdm.tqdm(example_gen,
                              total=int(np.ceil(n_train_examples/chunk_size))):
        # If we want to change how the features are generated then this
        # is the function
        # Right now if an answer is split over 2 blocks, I think the
        # end pos of the first is the last token, and the first pos of
        # the second is the first token of the respective blocks
        num_spans_to_ids = tf2baseline.convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            is_training=FLAGS.do_train,
            output_fn=append_feature)

        # Flush after every chunk so an interrupted run keeps what was written.
        train_writer._writer.flush()

    train_writer.close()
    train_filename = train_writer.filename

    print(f'# Features written: {train_writer.num_features}\n')

    print('**Features**\n')

    # List the public attributes of one feature as a quick schema check.
    if train_features:
        for e in dir(train_features[0]):
            if not e.startswith('__'):
                print(e)

## Create Generator for Training Data

In [10]:
# Every TFRecord file matching the precomputed-file pattern.
train_filenames = tf.io.gfile.glob(FLAGS.train_precomputed_file)

# Parsing schema: one scalar id plus three fixed-length int64 sequences.
name_to_features = {"unique_ids": tf.io.FixedLenFeature([], tf.int64)}
name_to_features.update({
    key: tf.io.FixedLenFeature([FLAGS.max_seq_length], tf.int64)
    for key in ("input_ids", "input_mask", "segment_ids")
})
# The scalar training targets are only parsed when training.
if FLAGS.do_train:
    name_to_features.update({
        key: tf.io.FixedLenFeature([], tf.int64)
        for key in ("start_positions", "end_positions", "answer_types")
    })

def decode_record(record, name_to_features):
    """Parse one serialized record and down-cast its int64 fields to int32.

    Args:
        record: serialized example passed to `tf.io.parse_single_example`.
        name_to_features: feature spec mapping feature name to its parser.
    Returns:
        Dict of parsed tensors; every tf.int64 tensor is cast to tf.int32,
        other tensors are returned unchanged.
    """
    parsed = tf.io.parse_single_example(serialized=record, features=name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for key in list(parsed.keys()):
        tensor = parsed[key]
        parsed[key] = tf.cast(tensor, dtype=tf.int32) if tensor.dtype == tf.int64 else tensor

    return parsed

def data_generator(params):
    """Yield `(inputs, targets)` batches read from `train_filenames`.

    Args:
        params: dict with a required 'batch_size' and an optional 'seed'
            (defaults to 42) for shuffling. The dict is not modified.
    Yields:
        Tuples `(inputs, targets)`: `inputs` has keys 'input_ids',
        'input_mask' and 'segment_ids'; `targets` is keyed by the model's
        output layer names. The target fields are only part of the parsing
        schema when FLAGS.do_train is set.
    """
    batch_size = params["batch_size"]
    # Read the default without writing it back into the caller's dict.
    seed = params.get('seed', 42)

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    dataset = tf.data.TFRecordDataset(train_filenames)
    if FLAGS.do_train:
        # Repeats indefinitely; the caller bounds each epoch via steps_per_epoch.
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=5000, seed=seed)

    dataset = dataset.map(lambda r: decode_record(r, name_to_features))
    dataset = dataset.batch(batch_size=batch_size, drop_remainder=False)

    for examples in dataset:
        inputs = {
            # 'unique_id': examples['unique_ids'],
            'input_ids': examples['input_ids'],
            'input_mask': examples['input_mask'],
            'segment_ids': examples['segment_ids']
        }

        # Keys must match the model's output layer names so Keras can pair
        # each target with its loss.
        targets = {
            'tf_op_layer_start_logits': examples['start_positions'],
            'tf_op_layer_end_logits': examples['end_positions'],
            'ans_type_logits': examples['answer_types'],
        }

        yield inputs, targets

In [11]:
# Create training callbacks
# No validation data is passed to training below, so 'val_acc' is never logged
# and a best-only checkpoint keyed on it would never be written. Track the
# training loss instead (lower is better). The deprecated `period=1` is
# dropped; the default save_freq='epoch' is equivalent.
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    os.path.join(FLAGS.output_dir, FLAGS.output_checkpoint_file),
    monitor='loss', verbose=0, save_best_only=True,
    save_weights_only=True, mode='min')

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=FLAGS.log_dir, update_freq=128)

# Change this to True to actually train
if True:
    # NOTE(review): fit_generator is deprecated in later TF 2.x releases in
    # favour of fit(); kept here for the TF 2.0 environment this notebook ran in.
    H = model.fit_generator(data_generator({'batch_size': FLAGS.train_batch_size}),
                            steps_per_epoch=FLAGS.train_num_precomputed // FLAGS.train_batch_size,
                            epochs=FLAGS.num_train_epochs,
                            callbacks=[ckpt_callback, tensorboard_callback])

Epoch 1/3
 41349/272565 [===>..........................] - ETA: 54:40:46 - loss: 13.1110 - tf_op_layer_start_logits_loss: 5.9539 - tf_op_layer_end_logits_loss: 5.9546 - ans_type_logits_loss: 1.2058 - tf_op_layer_start_logits_sparse_categorical_accuracy: 0.0028 - tf_op_layer_end_logits_sparse_categorical_accuracy: 0.0023 - ans_type_logits_sparse_categorical_accuracy: 0.7939

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-068d6402ca80>", line 14, in <module>
    callbacks=[ckpt_callback, tensorboard_callback])
  File "/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 1297, in fit_generator
    steps_name='steps_per_epoch')
  File "/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_generator.py", line 265, in model_iteration
    batch_outs = batch_function(*batch_data)
  File "/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 973, in train_on_batch
    class_weight=class_weight, reset_metrics=reset_metrics)
  File "/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_co

KeyboardInterrupt: 

## Package TensorBoard Logs for Download

In [12]:
import tarfile

# Unzip and run using "tensorboard --logdir="./logs" --port=6006"
# Bundle the log directory into logs.tar.gz, stored under the top-level name
# "logs". The context manager closes the archive even if `add` raises.
with tarfile.open("logs.tar.gz", "w:gz") as tar:
    tar.add(FLAGS.log_dir, arcname="logs")