# ALBERT Q&A Training

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from scripts import tf2_0_baseline_w_bert_translated_to_tf2_0 as tf2baseline # Oliviera's script
from scripts.tf2_0_baseline_w_bert_translated_to_tf2_0 import AnswerType
from scripts import albert_tokenization as tokenization

import absl
import collections
import json
import sys
import threading
import time
import tqdm
import zipfile

tf2baseline.FLAGS.include_unknowns = -1

TF 2.0 Baseline Loaded


## Define Flags

In [2]:
def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()
    keys_list = [keys for keys in flags_dict]
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(absl.flags.FLAGS)

flags = absl.flags

flags.DEFINE_string("vocab_file", "models/albert_xxl/vocab/modified-30k-clean.model",
                    "The vocabulary file that the BERT/ALBERT model was trained on.")

flags.DEFINE_string(
    "output_dir", "output/",
    "The output directory where the model checkpoints will be written.")

flags.DEFINE_string("train_precomputed_file", "albert_train.tf_record",
                    "Precomputed tf records for training.")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

flags.DEFINE_integer(
    "max_seq_length", 256,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")

flags.DEFINE_integer(
    "doc_stride", 128,
    "When splitting up a long document into chunks, how much stride to "
    "take between chunks.")

flags.DEFINE_integer(
    "max_query_length", 64,
    "The maximum number of tokens for the question. Questions longer than "
    "this will be truncated to this length.")

flags.DEFINE_bool("do_train", True, "Whether to run training.")

flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.")

flags.DEFINE_integer(
    "max_answer_length", 30,
    "The maximum length of an answer that can be generated. This is needed "
    "because the start and end predictions are not conditioned on one another.")

flags.DEFINE_float(
    "include_unknowns", -1,
    "If positive, probability of including answers of type `UNKNOWN`.")

flags.DEFINE_boolean(
    "skip_nested_contexts", True,
    "Completely ignore context that are not top level nodes in the page.")

flags.DEFINE_integer("max_contexts", 48,
                     "Maximum number of contexts to output for an example.")

flags.DEFINE_integer(
    "max_position", 50,
    "Maximum context position for which to generate special tokens.")

## Custom flags

flags.DEFINE_integer(
    "n_examples", -1,
    "Number of examples to read from files. Only applicable during testing")

flags.DEFINE_string(
    "train_file", "data/simplified-nq-train.jsonl.zip",
    "NQ json for training. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")

## Special flags - do not change

flags.DEFINE_string(
    "predict_file", "/home/ejmejm/MLProjects/nqa_kaggle/data/simplified-nq-test.jsonl",
    "NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean("logtostderr", True, "Logs to stderr")
flags.DEFINE_boolean("undefok", True, "it's okay to be undefined")
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_string('HistoryManager.hist_file', '', 'kernel')

FLAGS = flags.FLAGS
FLAGS(sys.argv) # Parse the flags

['/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py']

## Generate Formatted Training Data (TFRecord, Only Once)

In [3]:
def blocks(f, size=65536):
    while True:
        b = f.read(size)
        if not b:
            break
        yield b
    
with zipfile.ZipFile(FLAGS.train_file) as zip_file:
    with zip_file.open('simplified-nq-train.jsonl', 'r') as f:
        n_train_examples = sum([bl.decode('UTF-8').count('\n') for bl in blocks(f)])

print('# Training Examples:', n_train_examples)

# Training Examples: 307373


In [4]:
def data_generator(chunk_size=1000):
    curr_pos = 0
    last_line = False
    with zipfile.ZipFile(FLAGS.train_file) as zip_file:
        with zip_file.open('simplified-nq-train.jsonl', 'r') as f:
            while not last_line:
                examples = []
                for i in range(curr_pos, curr_pos+chunk_size):
                    line = f.readline().decode('UTF-8')
                    if line is None:
                        last_line = True
                        break
                    examples.append(tf2baseline.create_example_from_jsonl(line, lowercase=True))
                    examples[-1] = tf2baseline.read_nq_entry(examples[-1], FLAGS.do_train)[0]
                curr_pos = i + 1
                yield examples

In [5]:
def example_to_verifier_feature(example, tokenizer):
    question_text = example.questions[0]
    answer_text = example.answer.text
    if answer_text == '[':
        answer_text = None
        
    question_tokens = tokenizer.tokenize(question_text)
    answer_tokens = tokenizer.tokenize(answer_text)
    
    if len(question_tokens) > FLAGS.max_query_length:
        question_tokens = question_tokens[-FLAGS.max_query_length:]
        
    max_answer_length = FLAGS.max_seq_length - (FLAGS.max_query_length + 4)
    if len(answer_tokens) > max_answer_length:
        answer_tokens = answer_tokens[:max_answer_length]
        
    n_q_tokens = len(question_tokens) + 3
    n_a_tokens = len(answer_tokens) + 1
    n_pad_tokens = FLAGS.max_seq_length - (n_q_tokens + n_a_tokens)

    input_tokens = ['[CLS]', '[Q]'] + \
                 question_tokens + \
                 ['[SEP]'] + \
                 answer_tokens + \
                 ['[SEP]'] + \
                 ['[PAD]'] * n_pad_tokens
    
    
    
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    segment_ids = np.concatenate([np.zeros(n_q_tokens, dtype=np.int32),
                                  np.ones(n_a_tokens, dtype=np.int32),
                                  np.zeros(n_pad_tokens, dtype=np.int32)])
    input_mask = np.concatenate([np.ones(FLAGS.max_seq_length - n_pad_tokens, dtype=np.int32),
                                 np.zeros(n_pad_tokens, dtype=np.int32)])
    
    assert len(input_ids) == FLAGS.max_seq_length
    assert len(segment_ids) == FLAGS.max_seq_length
    assert len(input_mask) == FLAGS.max_seq_length
    
    return {'input_ids': input_ids,
            'segment_ids': segment_ids,
            'input_mask': input_mask}

In [6]:
def encode_example(features):
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example.SerializeToString()

def create_int_feature(values):
    feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=list(values)))
    return feature

def dict_to_feature(d):
    od = collections.OrderedDict()
    for k, v in d.items():
        od[k] = create_int_feature(v)
    return encode_example(od)

In [7]:
chunk_size = 400
example_gen = data_generator(chunk_size=chunk_size)

writer = tf.io.TFRecordWriter(os.path.join(FLAGS.output_dir, 'verifier_train.tf_record'))
tokenizer = tokenization.FullTokenizer(
    None,
    spm_model_file=FLAGS.vocab_file)

try:
    n_batches = int(np.ceil(n_train_examples / chunk_size))
    for _ in tqdm.tqdm(range(n_batches)):
        examples = next(example_gen)
        examples = [example for example in examples if example.answer.type != AnswerType.UNKNOWN]
        for i in range(0, len(examples), 2):
            if i + 1 >= len(examples):
                feature_data = example_to_verifier_feature(examples[i], tokenizer)
                feature = dict_to_feature(feature_data)
                writer.write(feature)
                break

            for switch in range(2):
                if switch == 1:
                    tmp_answer = examples[i].answer
                    examples[i].answer = examples[i+1].answer
                    examples[i+1].answer = tmp_answer

                feature_data_1 = example_to_verifier_feature(examples[i], tokenizer)
                feature_data_2 = example_to_verifier_feature(examples[i+1], tokenizer)

                feature_1 = dict_to_feature(feature_data_1)
                feature_2 = dict_to_feature(feature_data_2)

                writer.write(feature_1)
                writer.write(feature_2)
finally:
    writer.close()

 13%|█▎        | 97/769 [13:41<1:26:40,  7.74s/it]

TypeError: not a string