# ALBERT Q&A Training

In [5]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from scripts import tf2_0_baseline_w_bert_translated_to_tf2_0 as tf2baseline # Oliviera's script
from scripts import albert_tokenization as tokenization

import absl
import collections
import json
import sys
import threading
import time
import tqdm
import zipfile

# Set include_unknowns to -1 on the FLAGS object exposed by the baseline script.
# NOTE(review): the help text of the same-named flag defined later in this
# notebook reads "If positive, probability of including answers of type
# `UNKNOWN`", so a negative value presumably turns that sampling off — confirm
# against the baseline script.
tf2baseline.FLAGS.include_unknowns = -1

## Define Flags

In [3]:
def del_all_flags(FLAGS):
    """Delete every flag attribute that ``FLAGS._flags()`` reports.

    Args:
        FLAGS: object exposing ``_flags()`` (a mapping keyed by flag name) and
            supporting attribute deletion for each of those names.
    """
    # Snapshot the names first: deleting while iterating the live mapping
    # would invalidate the iteration.
    for flag_name in list(FLAGS._flags()):
        delattr(FLAGS, flag_name)

# Remove every flag currently registered on the global absl FLAGS object before
# the definitions below are (re-)registered.
# NOTE(review): presumably needed so that re-running this cell, or flags already
# registered by the imported baseline script, do not trigger duplicate-flag
# errors — confirm.
del_all_flags(absl.flags.FLAGS)

flags = absl.flags

flags.DEFINE_string("vocab_file", "models/albert_xxl/vocab/modified-30k-clean.model",
                    "The vocabulary file that the BERT/ALBERT model was trained on.")

flags.DEFINE_string(
    "output_dir", "output/",
    "The output directory where the model checkpoints will be written.")

flags.DEFINE_string("train_precomputed_file", "train_full.tf_record",
                    "Precomputed tf records for training.")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

# NOTE(review): the help text says "WordPiece", but the tokenizer in this
# notebook is constructed with spm_model_file=FLAGS.vocab_file — the wording
# looks inherited from a BERT script; confirm before relying on it.
flags.DEFINE_integer(
    "max_seq_length", 384,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")

flags.DEFINE_integer(
    "doc_stride", 128,
    "When splitting up a long document into chunks, how much stride to "
    "take between chunks.")

flags.DEFINE_integer(
    "max_query_length", 64,
    "The maximum number of tokens for the question. Questions longer than "
    "this will be truncated to this length.")

flags.DEFINE_bool("do_train", True, "Whether to run training.")

flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.")

flags.DEFINE_integer(
    "max_answer_length", 30,
    "The maximum length of an answer that can be generated. This is needed "
    "because the start and end predictions are not conditioned on one another.")

flags.DEFINE_float(
    "include_unknowns", -1,
    "If positive, probability of including answers of type `UNKNOWN`.")

flags.DEFINE_boolean(
    "skip_nested_contexts", True,
    "Completely ignore context that are not top level nodes in the page.")

flags.DEFINE_integer("max_contexts", 48,
                     "Maximum number of contexts to output for an example.")

flags.DEFINE_integer(
    "max_position", 50,
    "Maximum context position for which to generate special tokens.")

## Custom flags

flags.DEFINE_integer(
    "n_examples", -1,
    "Number of examples to read from files. Only applicable during testing")

# NOTE(review): the default points at a .zip archive of the simplified training
# set, while the help text still cites dev/test .jsonl.gz names — the help text
# appears to be copied from the predict_file flag below.
flags.DEFINE_string(
    "train_file", "data/simplified-nq-train.jsonl.zip",
    "NQ json for training. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")

## Special flags - do not change

# NOTE(review): this default is an absolute path on one developer's machine.
flags.DEFINE_string(
    "predict_file", "/home/ejmejm/MLProjects/nqa_kaggle/data/simplified-nq-test.jsonl",
    "NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean("logtostderr", True, "Logs to stderr")
flags.DEFINE_boolean("undefok", True, "it's okay to be undefined")
# NOTE(review): 'f' and 'HistoryManager.hist_file' look like arguments that the
# Jupyter kernel launcher puts into sys.argv; they appear to be declared only so
# that parsing sys.argv below does not reject them — confirm.
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_string('HistoryManager.hist_file', '', 'kernel')

FLAGS = flags.FLAGS
# Calling FLAGS on sys.argv parses the command line; the notebook output shows
# the list it returns.
FLAGS(sys.argv) # Parse the flags

['/home/ejmejm/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py']

## Generate Formatted Training Data (TFRecord, Only Once)

In [3]:
def blocks(f, size=65536):
    """Yield successive chunks read from ``f`` until a read comes back empty.

    Args:
        f: object with a ``read(size)`` method.
        size: maximum amount requested from ``f.read`` per chunk.

    Yields:
        Each non-empty result of ``f.read(size)``, in order.
    """
    chunk = f.read(size)
    while chunk:
        yield chunk
        chunk = f.read(size)
    
# Count the training examples by counting newline bytes in the archived JSONL
# file, streaming it in fixed-size chunks so it never has to fit in memory.
with zipfile.ZipFile(FLAGS.train_file) as zip_file:
    with zip_file.open('simplified-nq-train.jsonl', 'r') as f:
        # Count b'\n' on the raw bytes: decoding an arbitrary chunk as UTF-8 can
        # fail when a multi-byte character is split across two chunks, and the
        # newline byte never occurs inside a multi-byte UTF-8 sequence anyway.
        n_train_examples = sum(bl.count(b'\n') for bl in blocks(f))

print('# Training Examples:', n_train_examples)

# Training Examples: 307373


In [4]:
# tokenizer = tokenization.FullTokenizer(
#     None,
#     spm_model_file=FLAGS.vocab_file)

# tokenizer.tokenize('This this is a test [UNK] [ UNK] [Q]')

In [None]:
# Collect the id of every training example so duplicates can be counted.
example_ids = []
with zipfile.ZipFile(FLAGS.train_file) as zip_file:
    with zip_file.open('simplified-nq-train.jsonl', 'r') as f:
        # Iterating the file stops cleanly at EOF. The previous
        # `readline()` / `line is None` check could never be true (readline
        # returns empty bytes at EOF), so the loop ended in a JSON decode error.
        for raw_line in f:
            line = raw_line.decode('UTF-8')
            if not line.strip():
                # Tolerate blank lines instead of failing to parse them.
                continue
            example = json.loads(line, object_pairs_hook=collections.OrderedDict)
            # Keep only the id: the parsed example is an unhashable OrderedDict
            # (so it cannot go into a set) and holding every full example in
            # memory is unnecessary for a uniqueness check.
            # NOTE(review): 'example_id' is the id field of the simplified NQ
            # JSONL schema — confirm against the data file.
            example_ids.append(example['example_id'])

In [None]:
# Number of duplicated entries: total item count minus the count of distinct
# items (0 means every entry is unique).
len(example_ids) - len(set(example_ids))

In [5]:
def data_generator(chunk_size=1000):
    """Stream the training archive as lists of parsed examples.

    Each line of 'simplified-nq-train.jsonl' inside FLAGS.train_file is passed
    through tf2baseline.create_example_from_jsonl and then
    tf2baseline.read_nq_entry; the first element of the latter's result is
    kept.

    Args:
        chunk_size: maximum number of examples per yielded list; must be >= 1.

    Yields:
        Non-empty lists holding up to ``chunk_size`` examples. The last list
        may be shorter.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')

    with zipfile.ZipFile(FLAGS.train_file) as zip_file:
        with zip_file.open('simplified-nq-train.jsonl', 'r') as f:
            reached_eof = False
            while not reached_eof:
                examples = []
                for _ in range(chunk_size):
                    line = f.readline().decode('UTF-8')
                    # readline() returns empty bytes at EOF, never None, so
                    # test for emptiness rather than identity with None.
                    if not line:
                        reached_eof = True
                        break
                    entry = tf2baseline.create_example_from_jsonl(line, lowercase=True)
                    examples.append(tf2baseline.read_nq_entry(entry, FLAGS.do_train)[0])
                # Skip the empty trailing chunk produced when the file length
                # is an exact multiple of chunk_size.
                if examples:
                    yield examples

In [None]:
# Examples are pulled from the training archive in chunks of this many lines.
chunk_size = 500
example_gen = data_generator(chunk_size=chunk_size)

# Writer for the precomputed training records, placed inside the output directory.
train_record_path = os.path.join(FLAGS.output_dir, FLAGS.train_precomputed_file)
train_writer = tf2baseline.FeatureWriter(filename=train_record_path,
                                         is_training=FLAGS.do_train)

# Tokenizer built from the model file that the vocab_file flag points at.
tokenizer = tokenization.FullTokenizer(None, spm_model_file=FLAGS.vocab_file)

def append_feature(feature, lock):
    """Pass ``feature`` to the module-level ``train_writer`` while holding ``lock``.

    Args:
        feature: the object handed to ``train_writer.process_feature``.
        lock: context manager held for the duration of the call so that calls
            made from different threads do not overlap.
    """
    with lock:
        train_writer.process_feature(feature)
    
def create_features(examples, lock):
    """Convert a chunk of examples into features and write each one out.

    Runs tf2baseline.convert_examples_to_features with the module-level
    ``tokenizer`` and routes every produced feature through ``append_feature``.

    Args:
        examples: the chunk of examples to convert (one item yielded by
            ``data_generator`` at the call site).
        lock: lock forwarded to ``append_feature`` for each feature.
    """
    def write_feature(feature):
        append_feature(feature, lock)

    # The converter's return value was bound to a local that nothing read, so
    # it is no longer captured.
    tf2baseline.convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        is_training=FLAGS.do_train,
        output_fn=write_feature)



# Drive feature generation: hand one chunk of examples at a time to a worker
# thread, keeping at most n_threads workers alive, until n_steps chunks have
# been dispatched.
n_steps = int(np.ceil(n_train_examples/chunk_size))
n_threads = 1
lock = threading.Lock()
threads = []
curr_step = 0
# Created before the try block so the cleanup code below can always use it.
pbar = tqdm.tqdm(total=n_steps)
try:
    while curr_step < n_steps:
        # Retire workers that have finished and advance the progress bar once
        # for each of them. Thread.isAlive() was removed in Python 3.9;
        # is_alive() is the supported spelling.
        running = [thread for thread in threads if thread.is_alive()]
        n_finished = len(threads) - len(running)
        if n_finished:
            pbar.update(n_finished)
        threads = running

        # Start a new worker when there is a free slot.
        if len(threads) < n_threads:
            examples = next(example_gen, None)
            if examples is None:
                # The generator ran out before n_steps chunks were produced
                # (e.g. the line count was off); stop instead of raising
                # StopIteration.
                break
            thread = threading.Thread(target=create_features, args=(examples, lock))
            threads.append(thread)
            thread.start()
            curr_step += 1

            # Record how many chunks have been dispatched; written only when
            # the value changes rather than on every polling iteration.
            with open('output/albert_finished_loop_idx_2.txt', 'w+') as f:
                f.write(str(curr_step))

        time.sleep(0.05)
finally:
    # Wait for the remaining workers even on error/interrupt so the writer is
    # not closed underneath them.
    for thread in threads:
        thread.join()
        pbar.update(1)

    train_writer._writer.flush()
    train_writer.close()
    train_filename = train_writer.filename
    pbar.close()

    print(f'# Features written: {train_writer.num_features}\n')

 57%|█████▋    | 351/615 [16:58:02<13:14:57, 180.67s/it]