# BERT Q&A Training

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from scripts import tf2_0_baseline_w_bert_translated_to_tf2_0 as tf2baseline # Oliviera's script
from scripts import bert_modeling as modeling
from scripts import bert_optimization as optimization
from scripts import bert_tokenization as tokenization
import tqdm
import json
import absl
import sys
import time
import zipfile

# Set `include_unknowns` to 1/8 on the FLAGS object exposed by the baseline
# script.  The same 1/8 value is the default used when the flag is redefined
# further down in this notebook.
tf2baseline.FLAGS.include_unknowns = 1./8.

TF 2.0 Baseline Loaded


## Define Flags

In [2]:
def del_all_flags(FLAGS):
    """Delete every flag currently registered on ``FLAGS``.

    The flag names returned by ``FLAGS._flags()`` are copied into a list
    before anything is removed, so the deletions cannot disturb the
    iteration.

    Args:
        FLAGS: object exposing ``_flags()`` (a mapping keyed by flag name)
            and ``__delattr__``.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        FLAGS.__delattr__(flag_name)

# Remove every flag already registered with absl before declaring the ones
# below (absl rejects a second definition of an existing flag name, which
# would otherwise happen when this cell is re-run).
del_all_flags(absl.flags.FLAGS)

flags = absl.flags

# --- Input / output locations -------------------------------------------

flags.DEFINE_string(
    "bert_config_file", "/kaggle/input/bertjointbaseline/bert_config.json",
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")

flags.DEFINE_string("vocab_file", "models/bert_joint_baseline/vocab-nq.txt",
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_string(
    "output_dir", "./output/",
    "The output directory where the model checkpoints will be written.")

flags.DEFINE_string("train_precomputed_file", "/kaggle/input/qa-train-record/train.tf_record",
                    "Precomputed tf records for training.")

flags.DEFINE_integer("train_num_precomputed", 6000,
                     "Number of precomputed tf records for training.")

# NOTE(review): the defaults and help texts of the next two flags look
# swapped relative to their names ("output_prediction_file" defaults to a
# .ckpt path, "output_ckeckpoint" -- sic -- defaults to predictions.json).
# Names and defaults are left untouched because code outside this cell may
# read them as they are; confirm before renaming or swapping.
flags.DEFINE_string(
    "output_prediction_file", "tf2_bert_finetuned.ckpt",
    "Where to save finetuned checkpoints to.")

# Fixed: the two help-string pieces were concatenated without a separating
# space ("...passed tonatural_questions.nq_eval.").
flags.DEFINE_string(
    "output_ckeckpoint", "predictions.json",
    "Where to print predictions in NQ prediction format, to be passed to "
    "natural_questions.nq_eval.")

flags.DEFINE_string(
    "log_dir", "/kaggle/working/logs/",
    "Where logs, specifically Tensorboard logs, will be saved to.")

flags.DEFINE_string(
    "init_checkpoint", "/kaggle/input/bert-q-a-convert-weights-to-2-0/tf2_bert_joint.ckpt",
    "Initial checkpoint (usually from a pre-trained BERT model).")

# --- Tokenization and sequence layout ------------------------------------

flags.DEFINE_bool(
    name="do_lower_case",
    default=True,
    help=("Whether to lower case the input text. Should be True for uncased "
          "models and False for cased models."))

# TODO: consider raising this to 512 -- per the original note, training was
# done with that value, although the difference may be small.
flags.DEFINE_integer(
    name="max_seq_length",
    default=384,
    help=("The maximum total input sequence length after WordPiece tokenization. "
          "Sequences longer than this will be truncated, and sequences shorter "
          "than this will be padded."))

flags.DEFINE_integer(
    name="doc_stride",
    default=128,
    help=("When splitting up a long document into chunks, how much stride to "
          "take between chunks."))

flags.DEFINE_integer(
    name="max_query_length",
    default=64,
    help=("The maximum number of tokens for the question. Questions longer than "
          "this will be truncated to this length."))

# --- Run mode and optimisation -------------------------------------------

flags.DEFINE_bool(
    name="do_train",
    default=True,
    help="Whether to run training.")

flags.DEFINE_bool(
    name="do_predict",
    default=False,
    help="Whether to run eval on the dev set.")

flags.DEFINE_integer(
    name="train_batch_size",
    default=2,
    help="Total batch size for training.")

flags.DEFINE_integer(
    name="predict_batch_size",
    default=8,
    help="Total batch size for predictions.")

flags.DEFINE_float(
    name="learning_rate",
    default=5e-5,
    help="The initial learning rate for Adam.")

flags.DEFINE_integer(
    name="num_train_epochs",
    default=3,
    help="Total number of training epochs to perform.")

flags.DEFINE_float(
    name="warmup_proportion",
    default=0.1,
    help=("Proportion of training to perform linear learning rate warmup for. "
          "E.g., 0.1 = 10% of training."))

flags.DEFINE_integer(
    name="save_checkpoints_steps",
    default=1000,
    help="How often to save the model checkpoint.")

flags.DEFINE_integer(
    name="iterations_per_loop",
    default=1000,
    help="How many steps to make in each estimator call.")

# --- Prediction / answer extraction --------------------------------------

flags.DEFINE_integer(
    name="n_best_size",
    default=20,
    help=("The total number of n-best predictions to generate in the "
          "nbest_predictions.json output file."))

flags.DEFINE_integer(
    name="verbosity",
    default=1,
    help="How verbose our error messages should be")

flags.DEFINE_integer(
    name="max_answer_length",
    default=30,
    help=("The maximum length of an answer that can be generated. This is needed "
          "because the start and end predictions are not conditioned on one another."))

flags.DEFINE_float(
    name="include_unknowns",
    default=1. / 8.,
    help="If positive, probability of including answers of type `UNKNOWN`.")

# --- Hardware ------------------------------------------------------------

flags.DEFINE_bool(
    name="use_tpu",
    default=False,
    help="Whether to use TPU or GPU/CPU.")

flags.DEFINE_bool(
    name="use_one_hot_embeddings",
    default=False,
    help="Whether to use use_one_hot_embeddings")

absl.flags.DEFINE_string(
    name="gcp_project",
    default=None,
    help=("[Optional] Project name for the Cloud TPU-enabled project. If not "
          "specified, we will attempt to automatically detect the GCE project from "
          "metadata."))

# --- Data processing -----------------------------------------------------

flags.DEFINE_bool(
    name="verbose_logging",
    default=False,
    help=("If true, all of the warnings related to data processing will be printed. "
          "A number of warnings are expected for a normal NQ evaluation."))

flags.DEFINE_boolean(
    name="skip_nested_contexts",
    default=True,
    help="Completely ignore context that are not top level nodes in the page.")

flags.DEFINE_integer(
    name="task_id",
    default=0,
    help="Train and dev shard to read from and write to.")

flags.DEFINE_integer(
    name="max_contexts",
    default=48,
    help="Maximum number of contexts to output for an example.")

flags.DEFINE_integer(
    name="max_position",
    default=50,
    help="Maximum context position for which to generate special tokens.")

# --- Custom flags (additions made for this notebook) ---------------------

flags.DEFINE_integer(
    name="n_examples",
    default=-1,
    help="Number of examples to read from files. Only applicable during testing")

flags.DEFINE_string(
    name="train_file",
    default="./data/simplified-nq-train.jsonl.zip",
    help="NQ json for training. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")

# --- Special flags: do not change ----------------------------------------

flags.DEFINE_string(
    name="predict_file",
    default="/home/ejmejm/MLProjects/nqa_kaggle/data/simplified-nq-test.jsonl",
    help="NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean(
    name="logtostderr",
    default=True,
    help="Logs to stderr")
flags.DEFINE_boolean(
    name="undefok",
    default=True,
    help="it's okay to be undefined")
# NOTE(review): the two flags below carry the help text 'kernel'; presumably
# they exist so that arguments supplied by the notebook kernel launcher parse
# without error -- confirm.
flags.DEFINE_string(
    name='f',
    default='',
    help='kernel')
flags.DEFINE_string(
    name='HistoryManager.hist_file',
    default='',
    help='kernel')

# Parse the process arguments against the flags declared above.
FLAGS = flags.FLAGS
FLAGS(sys.argv)

['/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py']

## Generate Formatted Training Data (TFRecord, Only Once)

In [3]:
def blocks(f, size=65536):
    """Yield successive chunks read from ``f`` until it is exhausted.

    Args:
        f: object with a ``read(size)`` method.
        size: maximum amount requested from ``f.read`` per chunk.

    Yields:
        Each non-empty value returned by ``f.read(size)``; iteration stops at
        the first empty (falsy) read.
    """
    chunk = f.read(size)
    while chunk:
        yield chunk
        chunk = f.read(size)
    
# Count the training examples (one per line of the JSONL member) by
# streaming the zipped file in fixed-size blocks.
#
# Newlines are counted on the raw bytes rather than on decoded text:
# decoding every block on its own can raise UnicodeDecodeError whenever a
# multi-byte UTF-8 character straddles a block boundary.  The byte 0x0A
# never appears inside a multi-byte UTF-8 sequence, so the count is the
# same as counting '\n' in the decoded text.
with zipfile.ZipFile(FLAGS.train_file) as zip_file:
    with zip_file.open('simplified-nq-train.jsonl', 'r') as f:
        n_train_examples = sum(bl.count(b'\n') for bl in blocks(f))

print('# Training Examples:', n_train_examples)

KeyboardInterrupt: 

In [4]:
def data_generator(chunk_size=1000):
    """Yield training examples from the zipped JSONL file in chunks.

    Lines of ``simplified-nq-train.jsonl`` inside ``FLAGS.train_file`` are
    read one at a time; each line is passed through
    ``tf2baseline.create_example_from_jsonl`` and the first element returned
    by ``tf2baseline.read_nq_entry`` is kept.

    Args:
        chunk_size: maximum number of examples per yielded list; must be
            at least 1.

    Yields:
        Lists of at most ``chunk_size`` examples.  The final list may be
        shorter; an empty list is never yielded.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')

    with zipfile.ZipFile(FLAGS.train_file) as zip_file:
        with zip_file.open('simplified-nq-train.jsonl', 'r') as f:
            reached_eof = False
            while not reached_eof:
                examples = []
                for _ in range(chunk_size):
                    raw_line = f.readline()
                    # readline() signals end of file with an empty bytes
                    # object, never with None, so test for emptiness.
                    if not raw_line:
                        reached_eof = True
                        break
                    example = tf2baseline.create_example_from_jsonl(
                        raw_line.decode('UTF-8'))
                    examples.append(
                        tf2baseline.read_nq_entry(example, FLAGS.do_train)[0])
                # Skip the empty trailing chunk that occurs when the file
                # length is an exact multiple of chunk_size.
                if examples:
                    yield examples

In [5]:
# gen = data_generator(500)
# start_time = time.time()
# for i in range(4):
#     next(gen)
# end_time = time.time()
# print(f'Time to read all: {(end_time - start_time) * (300000. / 2000.)}s')

In [11]:
# Convert the raw training examples to features chunk by chunk and append
# them to a TFRecord file inside FLAGS.output_dir.
chunk_size = 500
example_gen = data_generator(chunk_size=chunk_size)

# Create the output directory up front so neither the record file nor the
# progress marker fails on a missing directory.
os.makedirs(FLAGS.output_dir, exist_ok=True)

train_writer = tf2baseline.FeatureWriter(
    filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
    is_training=FLAGS.do_train)

def append_feature(feature):
    """Hand one generated feature to the TFRecord writer."""
    train_writer.process_feature(feature)

tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

# The progress marker is kept next to the record file instead of in a
# hard-coded 'output/' directory, so it follows FLAGS.output_dir.
progress_path = os.path.join(FLAGS.output_dir, 'finished_loop_idx.txt')
n_chunks = int(np.ceil(n_train_examples / chunk_size))

try:
    for i in tqdm.tqdm(range(n_chunks)):
        # This call is the place to change how features are generated.
        # Original author's note (unverified): when an answer is split over
        # two blocks, the end position in the first block is presumably its
        # last token and the start position in the second block its first
        # token.
        num_spans_to_ids = tf2baseline.convert_examples_to_features(
            examples=next(example_gen),
            tokenizer=tokenizer,
            is_training=FLAGS.do_train,
            output_fn=append_feature)

        # Flush after every chunk and record the index of the last finished
        # chunk so an interrupted run can be inspected.
        train_writer._writer.flush()
        with open(progress_path, 'w') as f:
            f.write(str(i))
finally:
    # Always close the writer, even when the loop is interrupted.
    train_writer.close()
    train_filename = train_writer.filename

    print(f'# Features written: {train_writer.num_features}\n')

  2%|▏         | 2/100 [00:02<02:21,  1.44s/it]


NameError: name 'train_writer' is not defined

In [53]:
# Debug view: pair every token of feature 30 with its segment id.
# NOTE(review): `fs` is not defined in the visible part of this notebook.
feature_tokens = tokenizer.convert_ids_to_tokens(fs[30].input_ids)
print(list(zip(feature_tokens, fs[30].segment_ids)))

[('[CLS]', 0), ('[Q]', 0), ('who', 0), ('did', 0), ('the', 0), ('voice', 0), ('of', 0), ('the', 0), ('magician', 0), ('in', 0), ('frost', 0), ('##y', 0), ('the', 0), ('snow', 0), ('##man', 0), ('[SEP]', 0), ('every', 1), ('year', 1), ('on', 1), ('cbs', 1), ('and', 1), ('was', 1), ('even', 1), ('included', 1), ('as', 1), ('a', 1), ('bonus', 1), ('on', 1), ('its', 1), ('dvd', 1), ('release', 1), ('.', 1), ('john', 1), ('goodman', 1), ('provides', 1), ('the', 1), ('voice', 1), ('of', 1), ('frost', 1), ('##y', 1), ('in', 1), ('this', 1), ('special', 1), (',', 1), ('and', 1), ('jonathan', 1), ('winters', 1), ('serves', 1), ('as', 1), ('narrator', 1), ('.', 1), ('frost', 1), ('##y', 1), ("'", 1), ('s', 1), ('appearance', 1), ('is', 1), ('physically', 1), ('different', 1), (',', 1), ('his', 1), ('personality', 1), ('and', 1), ('humor', 1), ('have', 1), ('changed', 1), (',', 1), ('and', 1), ('he', 1), ('has', 1), ('the', 1), ('ability', 1), ('to', 1), ('live', 1), ('without', 1), ('his', 1), (

In [52]:
# Debug view: dump input ids, segment ids and input mask of feature `i`,
# separated by blank lines.
# NOTE(review): `fs` is not defined in the visible part of this notebook.
i = 30
print(fs[i].input_ids, fs[i].segment_ids, fs[i].input_mask, sep='\n\n')

[101, 104, 2040, 2106, 1996, 2376, 1997, 1996, 16669, 1999, 10097, 2100, 1996, 4586, 2386, 102, 2296, 2095, 2006, 6568, 1998, 2001, 2130, 2443, 2004, 1037, 6781, 2006, 2049, 4966, 2713, 1012, 2198, 14514, 3640, 1996, 2376, 1997, 10097, 2100, 1999, 2023, 2569, 1010, 1998, 5655, 12214, 4240, 2004, 11185, 1012, 10097, 2100, 1005, 1055, 3311, 2003, 8186, 2367, 1010, 2010, 6180, 1998, 8562, 2031, 2904, 1010, 1998, 2002, 2038, 1996, 3754, 2000, 2444, 2302, 2010, 2327, 6045, 1010, 1999, 3622, 5688, 2007, 1996, 2434, 1998, 2049, 2060, 25815, 1012, 2036, 1999, 5688, 2000, 1996, 2434, 19247, 1010, 1996, 2569, 26777, 2035, 5254, 1997, 4234, 1006, 2750, 1996, 2569, 17274, 1996, 2927, 1997, 3467, 1007, 1998, 2038, 2019, 4483, 2923, 4323, 1010, 2004, 10097, 2100, 2573, 2000, 2644, 1037, 5971, 3237, 3005, 4031, 13387, 2015, 2041, 4586, 15173, 2007, 2028, 12509, 1012, 1996, 5722, 1997, 10097, 2100, 1996, 4586, 2386, 1011, 1011, 2023, 2384, 3442, 1011, 2000, 1011, 2678, 2143, 2001, 2550, 2011, 4438, 28