# BERT Q&A Training

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from scripts import tf2_0_baseline_w_bert_translated_to_tf2_0 as tf2baseline # Oliviera's script
from scripts.tf2_0_baseline_w_bert_translated_to_tf2_0 import AnswerType
from scripts import albert
from scripts import albert_tokenization

import tqdm
import json
import absl
import sys
import os

TF 2.0 Baseline Loaded


## Define Flags

In [2]:
def del_all_flags(FLAGS):
    """Delete every flag name reported by ``FLAGS._flags()`` from ``FLAGS``.

    The names are copied into a list before anything is removed, so the
    deletions cannot disturb the iteration over the underlying mapping.

    Args:
        FLAGS: an object exposing ``_flags()`` (a mapping keyed by flag name)
            and supporting attribute deletion by flag name.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        delattr(FLAGS, flag_name)

# Wipe whatever flags are currently registered before (re)defining them below.
# NOTE(review): presumably this is what lets the cell be re-run in a live
# kernel without duplicate-flag errors -- confirm.
del_all_flags(absl.flags.FLAGS)

flags = absl.flags

# --- Model selection and files ------------------------------------------------

flags.DEFINE_string(
    name="model",
    default="albert",
    help="The name of model to use. Choose from ['bert', 'albert'].")

flags.DEFINE_string(
    name="config_file",
    default="models/albert_xl/config.json",
    help=("The config json file corresponding to the pre-trained BERT/ALBERT model. "
          "This specifies the model architecture."))

# NOTE(review): the vocab path points at albert_xxl while config_file and
# init_checkpoint point at albert_xl -- confirm this is intentional.
flags.DEFINE_string(
    name="vocab_file",
    default="models/albert_xxl/vocab/modified-30k-clean.model",
    help="The vocabulary file that the ALBERT/BERT model was trained on.")

flags.DEFINE_string(
    name="output_dir",
    default="output/",
    help="The output directory where the model checkpoints will be written.")

flags.DEFINE_string(
    name="train_precomputed_file",
    default="data/train.tf_record",
    help="Precomputed tf records for training.")

flags.DEFINE_integer(
    name="train_num_precomputed",
    default=272565,
    help="Number of precomputed tf records for training.")

flags.DEFINE_string(
    name="output_checkpoint_file",
    default="tf2_albert_finetuned.ckpt",
    help="Where to save finetuned checkpoints to.")

flags.DEFINE_string(
    name="init_checkpoint",
    default="models/albert_xl/tf2_model.h5",
    help="Initial checkpoint (usually from a pre-trained BERT model).")

# --- Tokenization / input shaping ---------------------------------------------

flags.DEFINE_bool(
    name="do_lower_case",
    default=True,
    help=("Whether to lower case the input text. Should be True for uncased "
          "models and False for cased models."))

# TODO: switch this to 512 at some point, since training was done with that
# value; it may not make a big difference though.
flags.DEFINE_integer(
    name="max_seq_length",
    default=384,
    help=("The maximum total input sequence length after WordPiece tokenization. "
          "Sequences longer than this will be truncated, and sequences shorter "
          "than this will be padded."))

flags.DEFINE_integer(
    name="doc_stride",
    default=128,
    help=("When splitting up a long document into chunks, how much stride to "
          "take between chunks."))

flags.DEFINE_integer(
    name="max_query_length",
    default=64,
    help=("The maximum number of tokens for the question. Questions longer than "
          "this will be truncated to this length."))

# --- Run modes ----------------------------------------------------------------

flags.DEFINE_bool(
    name="do_train",
    default=True,
    help="Whether to run training.")

flags.DEFINE_bool(
    name="do_predict",
    default=False,
    help="Whether to run eval on the dev set.")

## Special flags - do not change
# NOTE(review): 'f' and 'HistoryManager.hist_file' look like arguments injected
# by the Jupyter kernel launcher; they are declared so parsing sys.argv does
# not fail on them -- confirm.

flags.DEFINE_string(
    name="predict_file",
    default="data/simplified-nq-test.jsonl",
    help="NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean(name="logtostderr", default=True, help="Logs to stderr")
flags.DEFINE_boolean(name="undefok", default=True, help="it's okay to be undefined")
flags.DEFINE_string(name='f', default='', help='kernel')
flags.DEFINE_string(name='HistoryManager.hist_file', default='', help='kernel')

FLAGS = flags.FLAGS
# Parse the flags; as the cell's last expression, the returned list is displayed.
FLAGS(sys.argv)

['/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py']

## Build the Model

In [3]:
class TDense(tf.keras.layers.Layer):
    """Dense projection over the last input axis using a transposed kernel.

    The kernel is created with shape ``[output_size, last_dim]`` and applied as
    ``tf.matmul(x, kernel, transpose_b=True) + bias``, so the last axis of the
    input is mapped from ``last_dim`` to ``output_size``.

    Args:
        output_size: number of output units (size of the last output axis).
        kernel_initializer: initializer handed to ``add_weight`` for the kernel.
        bias_initializer: initializer handed to ``add_weight`` for the bias.
        **kwargs: forwarded unchanged to ``tf.keras.layers.Layer``.
    """

    def __init__(self,
                 output_size,
                 kernel_initializer=None,
                 bias_initializer="zeros",
                 **kwargs):
        super().__init__(**kwargs)
        self.output_size = output_size
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer

    def build(self, input_shape):
        """Create the kernel and bias once the size of the last input axis is known.

        Raises:
            TypeError: if the layer dtype is neither floating point nor complex.
            ValueError: if the last dimension of ``input_shape`` is undefined.
        """
        dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx())
        if not (dtype.is_floating or dtype.is_complex):
            raise TypeError("Unable to build `TDense` layer with "
                            "non-floating point (and non-complex) "
                            "dtype %s" % (dtype,))
        input_shape = tf.TensorShape(input_shape)
        last_dim = tf.compat.dimension_value(input_shape[-1])
        if last_dim is None:
            raise ValueError("The last dimension of the inputs to "
                             "`TDense` should be defined. "
                             "Found `None`.")
        ### tf 2.1 rc min_ndim=3 -> min_ndim=2
        self.input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: last_dim})
        # The kernel is stored transposed relative to a regular Dense layer
        # ([out, in] rather than [in, out]); `call` compensates with transpose_b.
        self.kernel = self.add_weight(
            "kernel",
            shape=[self.output_size, last_dim],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)
        self.bias = self.add_weight(
            "bias",
            shape=[self.output_size],
            initializer=self.bias_initializer,
            dtype=self.dtype,
            trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Apply the projection: ``x @ kernel^T + bias``."""
        return tf.matmul(x, self.kernel, transpose_b=True) + self.bias

    @staticmethod
    def _serialize_initializer(initializer):
        """Return a config-friendly form of an initializer (``None`` stays ``None``)."""
        if initializer is None:
            return None
        return tf.keras.initializers.serialize(tf.keras.initializers.get(initializer))

    def get_config(self):
        """Return the constructor arguments so Keras can serialize/clone the layer."""
        config = super().get_config()
        config.update({
            "output_size": self.output_size,
            "kernel_initializer": self._serialize_initializer(self.kernel_initializer),
            "bias_initializer": self._serialize_initializer(self.bias_initializer),
        })
        return config

In [4]:
# Vocabulary size of the model built in this notebook. It is the default
# `vocab_size` of build_model and replaces the vocab size read from the config.
# NOTE(review): presumably the 30000 pretrained entries plus extra added
# tokens -- confirm against the tokenizer/vocab in use.
VOCAB_SIZE = 30209

# Helper that creates the ALBERT question-answering model.
# `config_file` defines the architecture; `pretrain_ckpt` (optional) supplies
# the pre-trained weights that are copied into the new model.
def get_albert_model(config_file, max_seq_length, vocab_size, pretrain_ckpt=None):
    """Create the ALBERT QA model from a config file with an overridden vocab size.

    The model takes ``input_ids``, ``input_mask`` and ``segment_ids`` (each an
    int32 input of length ``max_seq_length``) and returns
    ``[start_logits, end_logits, ans_type_logits]``. The start/end logits come
    from a 2-unit ``TDense`` over the sequence output; the answer-type logits
    come from a ``TDense`` over the pooled output with one unit per
    ``tf2baseline.AnswerType`` member.

    Args:
        config_file: path to the ALBERT config json.
        max_seq_length: length of each input sequence.
        vocab_size: vocabulary size to use instead of the one in the config.
        pretrain_ckpt: optional path to pre-trained weights; when truthy they
            are loaded through ``load_pretrain_weights``.

    Returns:
        The ``tf.keras.Model`` named ``'albert'``.
    """

    config = albert.AlbertConfig.from_json_file(config_file)
    config.vocab_size = vocab_size
    # Name the encoder explicitly: load_pretrain_weights retrieves it with
    # model.get_layer('albert_model'). Relying on Keras' automatic naming would
    # produce 'albert_model_1', 'albert_model_2', ... whenever this function is
    # called again in the same kernel (e.g. when the cell is re-run), and the
    # lookup would then fail.
    albert_layer = albert.AlbertModel(config=config, name='albert_model')

    input_ids = tf.keras.Input(shape=(max_seq_length,),dtype=tf.int32,name='input_ids')
    input_mask = tf.keras.Input(shape=(max_seq_length,),dtype=tf.int32,name='input_mask')
    segment_ids = tf.keras.Input(shape=(max_seq_length,),dtype=tf.int32,name='segment_ids')

    pooled_output, sequence_output = albert_layer(input_word_ids=input_ids,
                                                    input_mask=input_mask,
                                                    input_type_ids=segment_ids)

    # Maybe try sharing the start and end logits variables
    seq_layer = TDense(2, name='td_seq')
    # seq_layer = tf.keras.layers.TimeDistributed(seq_layer, name='td_seq')

    # One 2-unit projection per token, then split into separate start/end logits.
    seq_logits = seq_layer(sequence_output)
    start_logits, end_logits = tf.split(seq_logits, axis=-1, num_or_size_splits=2, name='split')
    start_logits = tf.squeeze(start_logits, axis=-1, name='start_logits')
    end_logits = tf.squeeze(end_logits, axis=-1, name='end_logits')

    ans_type_layer = TDense(len(tf2baseline.AnswerType), name='ans_type_logits')
    ans_type_logits = ans_type_layer(pooled_output)

    albert_model = tf.keras.Model([input_ids, input_mask, segment_ids],
                          [start_logits, end_logits, ans_type_logits],
                          name='albert')

    if pretrain_ckpt:
        load_pretrain_weights(albert_model, config_file, pretrain_ckpt, max_seq_length)

    return albert_model

def load_pretrain_weights(model, config_file, ckpt_file, max_seq_length):
    """Copy pre-trained ALBERT weights into ``model``.

    A copy of the pre-trained network is rebuilt from ``config_file`` (with its
    original vocabulary size) and ``ckpt_file`` is loaded into it. The embedding
    post-processor, encoder and pooler weights are then copied over verbatim.
    For the word embedding, the first ``N`` rows of the new model (``N`` being
    the number of rows in the pre-trained embedding) are replaced by the
    pre-trained rows, while any further rows keep the values the new model
    already had.

    Args:
        model: model containing a layer named ``'albert_model'`` with the same
            architecture as the pre-trained model, except that its word
            embedding may have additional rows.
        config_file: path to the config file to re-create the pre-trained model
        ckpt_file: path to the checkpoint of the pre-trained model
        max_seq_length: sequence length used for the inputs of the re-created
            pre-trained model.

    Raises:
        ValueError: if the pre-trained embedding has more rows than the
            embedding of ``model``.
    """

    # re-create the pre-trained model
    config = albert.AlbertConfig.from_json_file(config_file)
    albert_layer_pretrain = albert.AlbertModel(config=config, name='albert_pretrain')

    input_ids_pretrain = tf.keras.Input(shape=(max_seq_length,),dtype=tf.int32,name='input_ids_pretrain')
    input_mask_pretrain = tf.keras.Input(shape=(max_seq_length,),dtype=tf.int32,name='input_mask_pretrain')
    segment_ids_pretrain = tf.keras.Input(shape=(max_seq_length,),dtype=tf.int32,name='segment_ids_pretrain')

    pooled_output_pretrain, sequence_output_pretrain = albert_layer_pretrain(input_word_ids=input_ids_pretrain,
                                                    input_mask=input_mask_pretrain,
                                                    input_type_ids=segment_ids_pretrain)

    albert_model_pretrain = tf.keras.Model(inputs=[input_ids_pretrain,input_mask_pretrain,segment_ids_pretrain],
           outputs=[pooled_output_pretrain, sequence_output_pretrain])

    # load the weights into the pre-trained model
    albert_model_pretrain.load_weights(ckpt_file)

    # set the pre-train weights on the new model
    albert_layer = model.get_layer('albert_model')
    albert_layer.embedding_postprocessor.set_weights(albert_layer_pretrain.embedding_postprocessor.get_weights())
    albert_layer.encoder.set_weights(albert_layer_pretrain.encoder.get_weights())
    albert_layer.pooler_transform.set_weights(albert_layer_pretrain.pooler_transform.get_weights())

    # load the embedding
    # get_weights() returns a list holding a single matrix, hence the [0]
    embedding_new = albert_layer.embedding_lookup.get_weights()[0]
    embedding_pretrain = albert_layer_pretrain.embedding_lookup.get_weights()[0]

    # Take the pre-trained vocabulary size from the checkpointed matrix itself
    # rather than hard-coding it, so other pre-trained vocab sizes work as well.
    num_pretrained = embedding_pretrain.shape[0]
    if num_pretrained > embedding_new.shape[0]:
        raise ValueError(
            "Pre-trained embedding has %d rows but the new model only has %d; "
            "the new vocab_size must be at least the pre-trained one."
            % (num_pretrained, embedding_new.shape[0]))

    # pre-trained rows first, then the new model's own rows for the extra tokens
    merged_embedding = np.concatenate(
        [embedding_pretrain, embedding_new[num_pretrained:]], axis=0)
    # set_weights expects a list of arrays, mirroring what get_weights returns
    albert_layer.embedding_lookup.set_weights([merged_embedding])

# Notebook entry point: builds the ALBERT model from a config file and, when a
# checkpoint is supplied, initialises it from the pre-trained weights.
def build_model(config_file, max_seq_length, init_ckpt, vocab_size=VOCAB_SIZE):
    """Create the ALBERT model by delegating to ``get_albert_model``.

    Args:
        config_file: path to config file
        max_seq_length: the maximum length for each scan
        init_ckpt: path to the pre-trained checkpoint, forwarded as
            ``pretrain_ckpt``
        vocab_size: size of the new vocab (defaults to ``VOCAB_SIZE``)
    Returns:
        the model returned by ``get_albert_model``
    """
    return get_albert_model(config_file,
                            max_seq_length,
                            vocab_size,
                            pretrain_ckpt=init_ckpt)

In [5]:
# Build the model from the parsed flags (config, sequence length, initial
# checkpoint), then write its weights to an HDF5 file in the working directory.
model = build_model(
    FLAGS.config_file,
    FLAGS.max_seq_length,
    FLAGS.init_checkpoint,
)
model.save_weights('new_albert_model.h5')