In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from scripts import nqa_utils
from scripts.nqa_utils import AnswerType
from scripts import bert_modeling as modeling
from scripts import bert_optimization
from scripts import albert_optimization
from scripts import albert
from scripts import albert_tokenization as tokenization

import tqdm
import json
import absl
import sys
import os

NQA Utils Loaded!


In [80]:
### Define Flags ###

def del_all_flags(FLAGS):
    """Remove every flag currently registered on ``FLAGS``.

    The flag names are copied into a list first so that the registry is not
    mutated while it is being iterated. This lets the flag-definition cell be
    re-run in the same kernel without duplicate-definition errors.

    Args:
        FLAGS: Object exposing ``_flags()`` (a mapping keyed by flag name) and
            supporting attribute deletion by flag name.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        delattr(FLAGS, flag_name)

del_all_flags(absl.flags.FLAGS)

flags = absl.flags

flags.DEFINE_string(
    "model", "albert",
    "The name of model to use. Choose from ['bert', 'albert'].")

flags.DEFINE_string(
    "config_file", "models/albert_xxl/config.json",
    "The config json file corresponding to the pre-trained BERT/ALBERT model. "
    "This specifies the model architecture.")

flags.DEFINE_string("vocab_file", "models/albert_xxl/vocab/modified-30k-clean.model",
                    "The vocabulary file that the ALBERT/BERT model was trained on.")

flags.DEFINE_string(
    "output_dir", "output/",
    "The output directory where the model checkpoints will be written.")

flags.DEFINE_string("train_precomputed_file", "data/albert_train.tf_record",
                    "Precomputed tf records for training.")

flags.DEFINE_integer("train_num_precomputed", -1,
                     "Number of precomputed tf records for training.")

flags.DEFINE_string(
    "output_checkpoint_file", "tf2_albert_finetuned.ckpt",
    "Where to save finetuned checkpoints to.")

flags.DEFINE_string(
    "output_predictions_file", "predictions.json",
    # The trailing space matters: adjacent string literals are concatenated
    # verbatim, and without it the help text reads "passed tonatural_questions".
    "Where to print predictions in NQ prediction format, to be passed to "
    "natural_questions.nq_eval.")

flags.DEFINE_string(
    "log_dir", "logs/",
    "Where logs, specifically Tensorboard logs, will be saved to.")

flags.DEFINE_integer(
    "log_freq", 128,
    "How many samples between each training log update.")

flags.DEFINE_string(
    "init_checkpoint", "models/bert_joint_baseline/tf2_bert_joint.ckpt",
    "Initial checkpoint (usually from a pre-trained BERT model).")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

# TODO: Change this to 512 at some point, since training was done
# with that value. It may not make a big difference, though.
flags.DEFINE_integer(
    "max_seq_length", 384,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")

flags.DEFINE_integer(
    "doc_stride", 128,
    "When splitting up a long document into chunks, how much stride to "
    "take between chunks.")

flags.DEFINE_integer(
    "max_query_length", 64,
    "The maximum number of tokens for the question. Questions longer than "
    "this will be truncated to this length.")

flags.DEFINE_bool("do_train", True, "Whether to run training.")

flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.")

flags.DEFINE_integer("train_batch_size", 1, "Total batch size for training.")

flags.DEFINE_integer("predict_batch_size", 8,
                     "Total batch size for predictions.")

flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")

flags.DEFINE_integer("num_train_epochs", 3,
                   "Total number of training epochs to perform.")

flags.DEFINE_float(
    "warmup_proportion", 0.1,
    "Proportion of training to perform linear learning rate warmup for. "
    "E.g., 0.1 = 10% of training.")

flags.DEFINE_integer("save_checkpoints_steps", 10000,
                     "How often to save the model checkpoint.")

flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")

flags.DEFINE_integer(
    "n_best_size", 20,
    "The total number of n-best predictions to generate in the "
    "nbest_predictions.json output file.")

flags.DEFINE_integer(
    "verbosity", 1, "How verbose our error messages should be")

flags.DEFINE_integer(
    "max_answer_length", 30,
    "The maximum length of an answer that can be generated. This is needed "
    "because the start and end predictions are not conditioned on one another.")

flags.DEFINE_float(
    "include_unknowns", 1.0,
    "If positive, probability of including answers of type `UNKNOWN`.")

flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")

flags.DEFINE_string("tpu_name", None, "Name of the TPU to use.")

flags.DEFINE_string("tpu_zone", None, "Which zone the TPU is in.")

flags.DEFINE_bool("use_one_hot_embeddings", False, "Whether to use use_one_hot_embeddings")

absl.flags.DEFINE_string(
    "gcp_project", None,
    "[Optional] Project name for the Cloud TPU-enabled project. If not "
    "specified, we will attempt to automatically detect the GCE project from "
    "metadata.")

flags.DEFINE_bool(
    "verbose_logging", False,
    "If true, all of the warnings related to data processing will be printed. "
    "A number of warnings are expected for a normal NQ evaluation.")

# TODO(Edan): Look at nested contexts too at some point.
# Around 5% of long answers are nested, and around 50% of questions have
# long answers, so this setting alone rules out the correct answer
# around 2.5% of the time (0.05 * 0.50).
flags.DEFINE_boolean(
    "skip_nested_contexts", True,
    "Completely ignore context that are not top level nodes in the page.")

flags.DEFINE_integer("task_id", 0,
                     "Train and dev shard to read from and write to.")

flags.DEFINE_integer("max_contexts", 48,
                     "Maximum number of contexts to output for an example.")

flags.DEFINE_integer(
    "max_position", 50,
    "Maximum context position for which to generate special tokens.")

## Custom flags

flags.DEFINE_integer(
    "n_examples", -1,
    "Number of examples to read from files. Only applicable during testing")

flags.DEFINE_string(
    "train_file", "data/simplified-nq-train.jsonl",
    "NQ json for training. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")

## Special flags - do not change

flags.DEFINE_string(
    "predict_file", "data/simplified-nq-test.jsonl",
    "NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean("logtostderr", True, "Logs to stderr")
flags.DEFINE_boolean("undefok", True, "it's okay to be undefined")
# NOTE(review): the two flags below (both described as 'kernel') presumably
# exist to absorb arguments that the Jupyter kernel places in sys.argv, so that
# parsing does not fail on unknown flags -- confirm before removing.
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_string('HistoryManager.hist_file', '', 'kernel')

# Parsing happens against the process's real argv, so every flag above takes
# its default unless the kernel was launched with an override.
FLAGS = flags.FLAGS
FLAGS(sys.argv) # Parse the flags

# NOTE(review): not referenced anywhere else in the visible cells; presumably
# the tokenizer/embedding vocabulary size -- confirm against the model config.
VOCAB_SIZE = 30209

In [47]:
def blocks(f, size=65536):
    """Yield successive reads of at most ``size`` units from ``f``.

    Args:
        f: Object with a ``read(size)`` method (text or binary file).
        size: Amount requested per read.

    Yields:
        Each non-empty result of ``f.read(size)``; iteration stops at the
        first empty read.
    """
    chunk = f.read(size)
    while chunk:
        yield chunk
        chunk = f.read(size)

# Count the serialized examples stored in the precomputed training TFRecord file.
n_records = sum(
    1 for _ in tf.compat.v1.python_io.tf_record_iterator(FLAGS.train_precomputed_file))

# with open(FLAGS.train_file, 'r') as f:
#     n_train_examples = sum([bl.count('\n') for bl in blocks(f)])

# print('# Training Examples:', n_train_examples)
print('# Training Records:', n_records)

# When training, make the flag agree with what is actually on disk.
if FLAGS.do_train:
    if FLAGS.train_num_precomputed != n_records:
        print('Changing the number of precomuted records listed to use all avaliable data.')
        FLAGS.train_num_precomputed = n_records

# Training Records: 457137
Changing the number of precomuted records listed to use all avaliable data.


In [48]:
import os
# Hide every CUDA device from this process (presumably to force CPU execution).
# NOTE(review): TensorFlow was already imported in the first cell -- confirm
# that setting this afterwards still takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [67]:
### Create Generator for Training Data ###

train_filenames = tf.io.gfile.glob(FLAGS.train_precomputed_file)

# Parsing spec: three int64 vectors, each of length max_seq_length.
name_to_features = {
    feature_key: tf.io.FixedLenFeature([FLAGS.max_seq_length], tf.int64)
    for feature_key in ("input_ids", "input_mask", "segment_ids")
}
# When training, the spec additionally includes three scalar int64 labels.
if FLAGS.do_train:
    name_to_features.update({
        label_key: tf.io.FixedLenFeature([], tf.int64)
        for label_key in ("start_positions", "end_positions", "answer_types")
    })

def decode_record(record, name_to_features):
    """Parse one serialized tf.Example into an ``(inputs, labels)`` pair.

    Args:
        record: Serialized example handed to ``tf.io.parse_single_example``.
        name_to_features: Feature spec used for parsing.

    Returns:
        A 2-tuple of dicts. The first holds ``input_ids``, ``input_mask`` and
        ``segment_ids``. The second maps ``tf_op_layer_start_logits``,
        ``tf_op_layer_end_logits`` and ``ans_type_logits`` to the parsed
        ``start_positions``, ``end_positions`` and ``answer_types``.

    Note:
        The label features are read unconditionally, so a spec that omits
        them raises ``KeyError`` here.
    """
    parsed = tf.io.parse_single_example(serialized=record, features=name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
    # so every int64 tensor is cast down; other dtypes pass through untouched.
    parsed = {
        key: tf.cast(tensor, dtype=tf.int32) if tensor.dtype == tf.int64 else tensor
        for key, tensor in parsed.items()
    }

    model_inputs = {
        key: parsed[key] for key in ('input_ids', 'input_mask', 'segment_ids')
    }
    targets = {
        'tf_op_layer_start_logits': parsed['start_positions'],
        'tf_op_layer_end_logits': parsed['end_positions'],
        'ans_type_logits': parsed['answer_types'],
    }
    return model_inputs, targets

def data_generator(batch_size=32, seed=42, valid_frac=0.05):
    """Build batched ``tf.data`` pipelines over the precomputed training records.

    Args:
        batch_size: Number of examples per batch.
        seed: Shuffle seed; only used when ``valid_frac <= 0``.
        valid_frac: Fraction of ``FLAGS.train_num_precomputed`` records held
            out for validation.

    Returns:
        ``(train_dataset, valid_dataset)``. When ``valid_frac <= 0`` the
        training dataset is shuffled, batched and repeated, and
        ``valid_dataset`` is ``None``. Otherwise the leading records form the
        training split and the remainder the validation split; both are
        batched, and neither is shuffled or repeated.

    Note:
        A later cell rebinds the name ``data_generator`` to a JSONL chunk
        reader with a different signature.
    """
    parsed = tf.data.TFRecordDataset(train_filenames).map(
        lambda serialized: decode_record(serialized, name_to_features))

    if valid_frac <= 0:
        # No hold-out requested: shuffle, batch and repeat the whole dataset.
        everything = (
            parsed
            .shuffle(buffer_size=20000, seed=seed, reshuffle_each_iteration=True)
            .batch(batch_size=batch_size, drop_remainder=False)
            .repeat()
        )
        return everything, None

    # Split by position: the first n_train records train, the rest validate.
    n_train = int(FLAGS.train_num_precomputed * (1.0 - valid_frac))
    train_split = parsed.take(n_train).batch(batch_size=batch_size, drop_remainder=False)
    valid_split = parsed.skip(n_train).batch(batch_size=batch_size, drop_remainder=False)

    # NOTE: per-split shuffling is currently disabled.
    return train_split, valid_split

### Train the Model ###

# Fraction of the precomputed records held out as the validation split.
valid_frac = 0.02
# One very large batch size is used here; the cells below only pull single
# batches from these datasets for inspection.
train_dataset, valid_dataset = data_generator(batch_size=10000, valid_frac=valid_frac)
# Size of the held-out fraction, rounded up. np.ceil returns a float.
n_valid = np.ceil(FLAGS.train_num_precomputed * valid_frac)

In [68]:
test_data = next(iter(train_dataset))

In [74]:
# Number of examples in the inspected batch whose start position equals 0.
# Counting on the boolean array directly avoids a Python-level loop and the
# `x == True` comparison.
int(np.count_nonzero(test_data[1]['tf_op_layer_start_logits'].numpy() == 0))

6465

In [57]:
test_data = next(iter(valid_dataset))

In [66]:
test_data[1]['tf_op_layer_start_logits'].numpy()

False

In [50]:
# First validation batch -> its inputs dict -> input_ids as a NumPy array.
valid_data = next(iter(valid_dataset))[0]
valid_ids = valid_data['input_ids'].numpy()

In [51]:
# Hashable view of each input_ids row, for fast membership checks.
valid_ids_set = {tuple(row) for row in valid_ids}

In [76]:
def data_generator(chunk_size=1000, train_path='data/simplified-nq-train.jsonl'):
    """Yield the NQ training file in chunks of parsed examples plus raw lines.

    This rebinds ``data_generator``, replacing the TFRecord-based function of
    the same name defined in an earlier cell.

    Args:
        chunk_size: Maximum number of lines per yielded chunk.
        train_path: JSONL file to read. Defaults to the path this notebook
            originally hard-coded.

    Yields:
        ``(examples, raw_examples)``: two parallel lists where ``examples[i]``
        is the first item returned by ``nqa_utils.read_nq_entry`` for the
        example built from ``raw_examples[i]``. The final chunk may be shorter
        than ``chunk_size``; an empty chunk is never yielded.
    """
    import itertools

    with open(train_path, 'r') as f:
        while True:
            # islice stops cleanly at end of file. The previous
            # `readline() is None` check could never fire, because readline()
            # returns '' at EOF.
            raw_examples = list(itertools.islice(f, chunk_size))
            if not raw_examples:
                return
            examples = []
            for line in raw_examples:
                entry = nqa_utils.create_example_from_jsonl(line, lowercase=True)
                examples.append(nqa_utils.read_nq_entry(entry, FLAGS.do_train)[0])
            yield examples, raw_examples

In [77]:
chunk_size = 500
example_gen = data_generator(chunk_size=chunk_size)

In [78]:
def convert_examples_to_features(examples, tokenizer, is_training, output_fn, lowercase=False):
    """Featurize every example whose answer type is 0 and emit each feature.

    Args:
        examples: Sequence of examples exposing ``answer.type`` and ``example_id``.
        tokenizer: Passed through to ``nqa_utils.convert_single_example``.
        is_training: Passed through to ``nqa_utils.convert_single_example``.
        output_fn: Callback invoked as ``output_fn(position, feature)``, where
            ``position`` is the example's index within ``examples``.
        lowercase: Passed through to ``nqa_utils.convert_single_example``.

    Examples with a non-zero answer type are skipped entirely.
    # NOTE(review): 0 presumably corresponds to AnswerType.UNKNOWN -- confirm.
    """
    for position, example in enumerate(examples):
        if example.answer.type != 0:
            continue

        source_id = example.example_id
        per_example_features = nqa_utils.convert_single_example(
            example, tokenizer, is_training, lowercase=lowercase)

        for feature in per_example_features:
            # Tag the feature with its example and derive its unique id.
            feature.example_index = source_id
            feature.unique_id = feature.example_index + feature.doc_span_index
            output_fn(position, feature)

In [79]:
n_valid

9143.0

In [81]:
# Running count of lines written. This rebinds the `n_valid` name that an
# earlier cell set to the expected validation size.
n_valid = 0

tokenizer = tokenization.FullTokenizer(
    None,
    spm_model_file=FLAGS.vocab_file)

def append_feature(i, feature, raw_examples):
    """Append raw example ``i`` to the output file and log every 50 writes.

    Called once per feature, so an example that produces several features is
    written several times. ``feature`` itself is not inspected.
    """
    global n_valid

    new_file.write(raw_examples[i])

    n_valid += 1
    if n_valid % 50 == 0:
        print('# Valid Examples:', n_valid)

# Hard-coded example count; only used to size the progress bar.
n_examples = 307373
chunk_size = 500
example_gen = data_generator(chunk_size=chunk_size)

curr_step = 0
new_file = open('output/actual_valid.jsonl', 'a')
pbar = tqdm.tqdm(total=int(np.ceil(n_examples/chunk_size)))
try:
    while True:
        try:
            examples, raw_examples = next(example_gen)
        except Exception:
            # The generator ends the stream by raising (StopIteration once
            # exhausted; it may also raise on an unparsable final read).
            # Unlike a bare `except:`, this lets KeyboardInterrupt propagate.
            break

        convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            is_training=FLAGS.do_train,
            output_fn=lambda x, y: append_feature(x, y, raw_examples))

        curr_step += 1
        pbar.update(1)
finally:
    # Always release the progress bar and flush/close the output file, even
    # when the run is interrupted.
    pbar.close()
    new_file.close()


  0%|          | 0/615 [00:00<?, ?it/s][A

# Valid Examples: 50
# Valid Examples: 100
# Valid Examples: 150
# Valid Examples: 200
# Valid Examples: 250
# Valid Examples: 300
# Valid Examples: 350
# Valid Examples: 400
# Valid Examples: 450
# Valid Examples: 500
# Valid Examples: 550
# Valid Examples: 600
# Valid Examples: 650
# Valid Examples: 700
# Valid Examples: 750


KeyboardInterrupt: 

In [86]:
def data_generator(chunk_size=1000, train_path='data/simplified-nq-train.jsonl'):
    """Yield the NQ training file in chunks of parsed examples plus raw lines.

    This redefines ``data_generator``; earlier cells bind the same name.

    Args:
        chunk_size: Maximum number of lines per yielded chunk.
        train_path: JSONL file to read. Defaults to the path this notebook
            originally hard-coded.

    Yields:
        ``(examples, raw_examples)``: two parallel lists where ``examples[i]``
        is the first item returned by ``nqa_utils.read_nq_entry`` for the
        example built from ``raw_examples[i]``. The final chunk may be shorter
        than ``chunk_size``; an empty chunk is never yielded.
    """
    import itertools

    with open(train_path, 'r') as f:
        while True:
            # islice stops cleanly at end of file. The previous
            # `readline() is None` check could never fire, because readline()
            # returns '' at EOF.
            raw_examples = list(itertools.islice(f, chunk_size))
            if not raw_examples:
                return
            examples = []
            for line in raw_examples:
                entry = nqa_utils.create_example_from_jsonl(line, lowercase=True)
                examples.append(nqa_utils.read_nq_entry(entry, FLAGS.do_train)[0])
            yield examples, raw_examples

In [87]:
import random

In [88]:
def convert_examples_to_features(examples, tokenizer, is_training, output_fn, lowercase=False):
    """Emit at most one randomly chosen answer-type-0 feature per example.

    Args:
        examples: Sequence of examples exposing ``answer.type`` and ``example_id``.
        tokenizer: Passed through to ``nqa_utils.convert_single_example``.
        is_training: Passed through to ``nqa_utils.convert_single_example``.
        output_fn: Callback invoked as ``output_fn(position, feature)``, where
            ``position`` is the example's index within ``examples``.
        lowercase: Passed through to ``nqa_utils.convert_single_example``.

    Examples with a non-zero answer type are skipped. For the rest, the
    feature list is shuffled in place with the global ``random`` state, and the
    first feature whose ``answer_type`` is 0 (if any) is emitted.
    """
    for position, example in enumerate(examples):
        if example.answer.type != 0:
            continue
        source_id = example.example_id
        candidates = nqa_utils.convert_single_example(
            example, tokenizer, is_training, lowercase=lowercase)
        random.shuffle(candidates)

        # The first match after shuffling is a random pick among the matches.
        chosen = next((feat for feat in candidates if feat.answer_type == 0), None)
        if chosen is not None:
            chosen.example_index = source_id
            chosen.unique_id = chosen.example_index + chosen.doc_span_index
            output_fn(position, chosen)

In [89]:
FLAGS.include_unknowns = 1.0

In [90]:
new_file = open('output/actual_valid.jsonl', 'a')

In [91]:
# example_gen = data_generator(chunk_size=5)

# def append_feature(i, feature):
#     print(feature.answer_type)

# tokenizer = tokenization.FullTokenizer(
#     None,
#     spm_model_file=FLAGS.vocab_file)

# convert_examples_to_features(
#     examples=next(example_gen)[0],
#     tokenizer=tokenizer,
#     is_training=FLAGS.do_train,
#     output_fn=append_feature)

In [93]:
# Running count of lines written to `new_file`.
n_valid = 0
# Not used anywhere in the visible cells; kept so the name stays defined.
used_ids = set([])

def append_feature(i, feature, raw_examples):
    """Write raw example ``i`` when ``feature.answer_type`` is 0; log every 50 writes."""
    global n_valid, used_ids

    if feature.answer_type == 0:
        new_file.write(raw_examples[i])

        n_valid += 1
        if n_valid % 50 == 0:
            print('# Valid Examples:', n_valid)

# Hard-coded example count; only used to size the progress bar.
n_examples = 307373
chunk_size = 100
example_gen = data_generator(chunk_size=chunk_size)

curr_step = 0
pbar = tqdm.tqdm(total=int(np.ceil(n_examples/chunk_size)))
try:
    while True:
        try:
            examples, raw_examples = next(example_gen)
        except Exception:
            # The generator ends the stream by raising (StopIteration once
            # exhausted; it may also raise on an unparsable final read).
            # Unlike a bare `except:`, this lets KeyboardInterrupt propagate.
            break

        convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            is_training=FLAGS.do_train,
            output_fn=lambda x, y: append_feature(x, y, raw_examples))

        curr_step += 1
        pbar.update(1)
finally:
    # Release the progress bar even when the run is interrupted. `new_file` is
    # closed by a separate cell further down.
    pbar.close()



  0%|          | 0/3074 [00:00<?, ?it/s][A[A

  0%|          | 1/3074 [00:13<11:09:25, 13.07s/it][A[A

# Valid Examples: 50




  0%|          | 2/3074 [00:25<10:56:07, 12.82s/it][A[A

# Valid Examples: 100




  0%|          | 3/3074 [00:39<11:18:38, 13.26s/it][A[A

# Valid Examples: 150




  0%|          | 4/3074 [00:50<10:47:56, 12.66s/it][A[A

# Valid Examples: 200




  0%|          | 5/3074 [01:07<11:43:56, 13.76s/it][A[A

# Valid Examples: 250




  0%|          | 6/3074 [01:37<16:02:31, 18.82s/it][A[A

# Valid Examples: 300




  0%|          | 7/3074 [02:01<17:23:36, 20.42s/it][A[A

# Valid Examples: 350




  0%|          | 8/3074 [02:12<14:57:55, 17.57s/it][A[A

# Valid Examples: 400




  0%|          | 9/3074 [02:31<15:14:00, 17.89s/it][A[A

# Valid Examples: 450




  0%|          | 10/3074 [03:04<19:00:16, 22.33s/it][A[A

# Valid Examples: 500




  0%|          | 11/3074 [03:24<18:33:55, 21.82s/it][A[A

# Valid Examples: 550




  0%|          | 12/3074 [03:52<20:07:31, 23.66s/it][A[A

# Valid Examples: 600


KeyboardInterrupt: 

In [94]:
new_file.close()

In [95]:
# Load every line of the collected validation file (newlines retained).
lines = []
with open('output/actual_valid.jsonl', 'r') as f:
    lines.extend(f)

In [96]:
for line in lines:
    if not line.endswith('\n'):
        print('a')

In [97]:
import random

In [98]:
random.shuffle(lines)

In [99]:
# Overwrite the validation file with the lines in their current (shuffled) order.
with open('output/actual_valid.jsonl', 'w+') as f:
    f.writelines(lines)

In [100]:
len(lines)

1235