In [1]:
# Install Prerequisites
# TX2 prerequisites: https://docs.nvidia.com/deeplearning/frameworks/install-tf-jetson-platform/index.html

# !pip3 install --upgrade pip
# !pip3 install numpy
# !pip3 install tensorflow_hub==0.7
# !pip3 install --extra-index-url https://developer.download.nvidia.com/compute/redist/jp/v43 tensorflow-gpu==1.15  # GPU version of TensorFlow
# !pip3 install sentencepiece==0.1.2  # Last version that works due to wheel being out of date on the TX2.

In [1]:
from pathlib import Path

albert_root = Path("ALBERT/albert_xlarge")
output_dir_base = Path("output")

data_dir = str(albert_root/"glue")
albert_config_file = str(albert_root/"albert_config.json")
vocab_file = str(albert_root/"30k-clean.vocab")
hub_module = "https://tfhub.dev/google/albert_base/3"
spm_model_file = str(albert_root/"30k-clean.model")
max_seq_length = 128  # Max length of a tweet is 280.

optimizer = "adamw"
batch_size = 8
save_checkpoint_steps = 1000

In [3]:
task_name = "CoLA"
warmup_steps = 320
learning_rate = 1e-5
train_steps = 5336

task_name = "MNLI".lower()
warmup_steps = 1000
learning_rate = 3e-5
train_steps = 20000

task_name = "SST-2".lower()
warmup_steps = 1256
learning_rate = 1e-5
train_steps = 20935

output_dir = str(output_dir_base/task_name)

In [4]:
# Helper Class

from datetime import datetime

class Stopwatch:
    """Prints progress details."""
    def __init__(self, msg, numb_of_samples=0):
        self.current_time = datetime.now()
        self.numb_of_samples = numb_of_samples
        print(msg, end="")
    def stop(self):
        runtime = datetime.now().timestamp() - self.current_time.timestamp()
        if self.numb_of_samples > 0:
            print("done.  Runtime:", datetime.now() - self.current_time, " Samples:", self.numb_of_samples, 
                  " Samples per Second:", self.numb_of_samples / runtime)
        else:
            print("done.  Runtime:", datetime.now() - self.current_time)

In [6]:
import classifier_utils

processor = classifier_utils.ColaProcessor(use_spm=True, do_lower_case=True)
# processor.get_train_examples(data_dir)
# processor.get_labels()

[<classifier_utils.InputExample at 0x7f2e68632048>,
 <classifier_utils.InputExample at 0x7f2e68f74cc0>,
 <classifier_utils.InputExample at 0x7f2e68f74dd8>,
 <classifier_utils.InputExample at 0x7f2e68f74e10>,
 <classifier_utils.InputExample at 0x7f2e68f74f28>,
 <classifier_utils.InputExample at 0x7f2e68f74f60>,
 <classifier_utils.InputExample at 0x7f2e68f7c0b8>,
 <classifier_utils.InputExample at 0x7f2e68f7c0f0>,
 <classifier_utils.InputExample at 0x7f2e68f7c208>,
 <classifier_utils.InputExample at 0x7f2e68f7c240>,
 <classifier_utils.InputExample at 0x7f2e68f7c2b0>,
 <classifier_utils.InputExample at 0x7f2e68f7c320>,
 <classifier_utils.InputExample at 0x7f2e68f7c390>,
 <classifier_utils.InputExample at 0x7f2e68f7c400>,
 <classifier_utils.InputExample at 0x7f2e68f7c470>,
 <classifier_utils.InputExample at 0x7f2e68f7c4e0>,
 <classifier_utils.InputExample at 0x7f2e68f7c550>,
 <classifier_utils.InputExample at 0x7f2e68f7c5c0>,
 <classifier_utils.InputExample at 0x7f2e68f7c630>,
 <classifier

In [5]:
# Tokenize
# Uses sentencepiece to tokenize tweets

import classifier_utils
import fine_tuning_utils
import os
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Tensorflow WARNING can be safely ignored.
import warnings
warnings.filterwarnings("ignore")  # Ignore more warnings


# Changeme, preprocess data
# processor = classifier_utils.MnliProcessor(use_spm=True, do_lower_case=True)
processor = classifier_utils.Sst2Processor(use_spm=True, do_lower_case=True)
label_list = processor.get_labels()


sw = Stopwatch("Getting training data...")

train_examples = processor.get_train_examples(data_dir=data_dir)  # Load training data
tf.logging.info("Example tweet:  %s", train_examples[0].__dict__)

sw.stop()


sw = Stopwatch("Tokenizing train data...", len(train_examples))

# Tokenizer fn
tokenizer = fine_tuning_utils.create_vocab(vocab_file=vocab_file, do_lower_case=True, spm_model_file=spm_model_file, 
                                           hub_module=hub_module)

tf.gfile.MakeDirs(output_dir) # Make output dirs if doesn't exist.
train_file = os.path.join(output_dir, task_name + "_train.tf_record")  # Load tokenized data, if exists

# Tokenize data if not already done.
if not tf.gfile.Exists(train_file):
    classifier_utils.file_based_convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer,
                                                             train_file, task_name)

sw.stop()

Getting training data...done.  Runtime: 0:00:01.118836
Tokenizing train data...done.  Runtime: 0:00:03.006684  Samples: 67349  Samples per Second: 22399.819909629066


In [6]:
# Create estimator

import modeling
from tensorflow.contrib import tpu

run_config = tpu.RunConfig(cluster=None, master=None, model_dir=output_dir, save_checkpoints_steps=save_checkpoint_steps, 
                           keep_checkpoint_max=0, 
                           tpu_config=tpu.TPUConfig(iterations_per_loop=save_checkpoint_steps, num_shards=8,
                                                    per_host_input_for_training=tpu.InputPipelineConfig.PER_HOST_V2))

model_fn = classifier_utils.model_fn_builder(albert_config=None, num_labels=len(label_list), init_checkpoint=None,
                                             learning_rate=learning_rate, num_train_steps=train_steps, 
                                             num_warmup_steps=warmup_steps, use_tpu=False, use_one_hot_embeddings=False, 
                                             task_name=task_name, hub_module=hub_module, optimizer=optimizer)

# Estimator supports TPU, but using GPU here.
estimator = tpu.TPUEstimator(use_tpu=False, model_fn=model_fn, config=run_config, train_batch_size=batch_size, 
                             eval_batch_size=batch_size, predict_batch_size=batch_size)

In [7]:
# Train ALBERT

train_input_fn = classifier_utils.file_based_input_fn_builder(input_file=train_file, seq_length=max_seq_length,
                                                              is_training=True, drop_remainder=True, task_name=task_name,
                                                              use_tpu=False, bsz=batch_size)

sw = Stopwatch("Training...", train_steps)

estimator.train(input_fn=train_input_fn, max_steps=train_steps)

sw.stop()

Training...done.  Runtime: 1:50:54.465966  Samples: 20935  Samples per Second: 3.146007528704789


In [8]:
# Eval helper functions
# Functions stolen from run_classifier.py in ALBERT repo

best_trial_info_file = os.path.join(output_dir, "best_trial.txt")

def _best_trial_info():
    """Returns information about which checkpoints have been evaled so far."""
    if tf.gfile.Exists(best_trial_info_file):
        with tf.gfile.GFile(best_trial_info_file, "r") as best_info:
            global_step, best_metric_global_step, metric_value = (best_info.read().split(":"))
            global_step = int(global_step)
            best_metric_global_step = int(best_metric_global_step)
            metric_value = float(metric_value)
    else:
        metric_value = -1
        best_metric_global_step = -1
        global_step = -1
    tf.logging.info("Best trial info: Step: %s, Best Value Step: %s, "
                    "Best Value: %s", global_step, best_metric_global_step, metric_value)
    return global_step, best_metric_global_step, metric_value

def _remove_checkpoint(checkpoint_path):
    for ext in ["meta", "data-00000-of-00001", "index"]:
        src_ckpt = checkpoint_path + ".{}".format(ext)
        tf.logging.info("removing {}".format(src_ckpt))
        tf.gfile.Remove(src_ckpt)

def _find_valid_cands(curr_step):
    filenames = tf.gfile.ListDirectory(output_dir)
    candidates = []
    for filename in filenames:
        if filename.endswith(".index"):
            ckpt_name = filename[:-6]
            idx = ckpt_name.split("-")[-1]
            if int(idx) > curr_step:
                candidates.append(filename)
    return candidates

In [9]:
# Evaluate model

import pandas as pd


sw = Stopwatch("Getting development data...")

eval_examples = processor.get_dev_examples(data_dir)
tf.logging.info("Example tweet:  %s", eval_examples[0].__dict__)

sw.stop()


sw = Stopwatch("Tokenizing development data...", len(eval_examples))

eval_file = os.path.join(output_dir, task_name + "_eval.tf_record")  # Load eval data, if exists

# Load eval data if not already done.
if not tf.gfile.Exists(eval_file):
    classifier_utils.file_based_convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer,
                                                             eval_file, task_name)

sw.stop()


# Eval fn
eval_input_fn = classifier_utils.file_based_input_fn_builder(input_file=eval_file, seq_length=max_seq_length, 
                                                             is_training=False, drop_remainder=False,
                                                             task_name=task_name, use_tpu=False, bsz=batch_size)


sw = Stopwatch("Evaluating model...", len(eval_examples))

def evaluate_model():
    """Outputs to file, and returns a dataframe."""
    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    eval_steps = int(len(eval_examples) // save_checkpoint_steps) + 1
    out = []

    # Goes through the saved trained data and evaluates model
    global_step, best_perf_global_step, best_perf = _best_trial_info()
    writer = tf.gfile.GFile(output_eval_file, "w")
    while global_step < train_steps:
        steps_and_files = {}
        filenames = tf.gfile.ListDirectory(output_dir)
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(output_dir, ckpt_name)
                if cur_filename.split("-")[-1] == "best":
                    continue
                gstep = int(cur_filename.split("-")[-1])
                if gstep not in steps_and_files:
                    tf.logging.info("Add {} to eval list.".format(cur_filename))
                    steps_and_files[gstep] = cur_filename
        tf.logging.info("found {} files.".format(len(steps_and_files)))
        if not steps_and_files:
            tf.logging.info("found 0 file, global step: {}. Sleeping.".format(global_step))
            time.sleep(60)
        else:
            for checkpoint in sorted(steps_and_files.items()):
                step, checkpoint_path = checkpoint
                if global_step >= step:
                    if (best_perf_global_step != step and len(_find_valid_cands(step)) > 1):
                        _remove_checkpoint(checkpoint_path)
                    continue
                
                # Evaluate model accuraccy
                result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path)
                global_step = result["global_step"]
                
                out.append(result)  # Save result

                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write("best = {}\n".format(best_perf))

                if result["eval_accuracy"] > best_perf:
                    best_perf = result["eval_accuracy"]
                    best_perf_global_step = global_step
                elif len(_find_valid_cands(global_step)) > 1:
                    _remove_checkpoint(checkpoint_path)
                writer.write("=" * 50 + "\n")
                writer.flush()
                with tf.gfile.GFile(best_trial_info_file, "w") as best_info:
                    best_info.write("{}:{}:{}".format(global_step, best_perf_global_step, best_perf))
    writer.close()
    
    return out

eval_results = evaluate_model()

sw.stop()

print(eval_results)

Getting development data...done.  Runtime: 0:00:00.021762
Tokenizing development data...done.  Runtime: 0:00:00.343835  Samples: 872  Samples per Second: 2536.1661397770354
Evaluating model...done.  Runtime: 0:01:59.658626  Samples: 872  Samples per Second: 7.287398165041215
[{'eval_accuracy': 0.5, 'eval_loss': 0.726717, 'loss': 0.726717, 'global_step': 0}, {'eval_accuracy': 1.0, 'eval_loss': 0.06715764, 'loss': 0.06715764, 'global_step': 1000}, {'eval_accuracy': 1.0, 'eval_loss': 0.027459215, 'loss': 0.027459215, 'global_step': 2000}, {'eval_accuracy': 0.875, 'eval_loss': 0.5638903, 'loss': 0.5638903, 'global_step': 3000}, {'eval_accuracy': 1.0, 'eval_loss': 0.001639137, 'loss': 0.001639137, 'global_step': 4000}, {'eval_accuracy': 1.0, 'eval_loss': 0.0045564356, 'loss': 0.0045564356, 'global_step': 5000}, {'eval_accuracy': 1.0, 'eval_loss': 0.0028660398, 'loss': 0.0028660398, 'global_step': 6000}, {'eval_accuracy': 1.0, 'eval_loss': 0.0039797523, 'loss': 0.0039797523, 'global_step': 7

In [10]:
# # Predict

# predict_examples = processor.get_test_examples(data_dir)
# num_actual_predict_examples = len(predict_examples)

# # Get eval data
# predict_file = os.path.join(output_dir, "predict.tf_record")
# classifier_utils.file_based_convert_examples_to_features(predict_examples, label_list, max_seq_length, tokenizer, 
#                                                          predict_file, task_name)
    
# tf.logging.info("***** Running prediction*****")
# tf.logging.info("  Num examples = %d (%d actual, %d padding)",
#                 len(predict_examples), num_actual_predict_examples,
#                 len(predict_examples) - num_actual_predict_examples)

# predict_input_fn = classifier_utils.file_based_input_fn_builder(input_file=predict_file, seq_length=max_seq_length, 
#                                                                 is_training=False, drop_remainder=False,
#                                                                 task_name=task_name, use_tpu=False, bsz=batch_size)

# checkpoint_path = os.path.join(output_dir, "model.ckpt-best")

# # Predict on test data
# result = estimator.predict(input_fn=predict_input_fn, checkpoint_path=checkpoint_path)

# tf.logging.info("***** Predict results *****")

# for (i, (example, prediction)) in enumerate(zip(predict_examples, result)):
#     if i >= num_actual_predict_examples:
#         break
#     probabilities = prediction["probabilities"]
#     output_line = "\t".join(str(class_probability) for class_probability in probabilities) + "\n"
#     print(output_line)
# #     actual_label = label_list[int(prediction["predictions"])]
# #     print(example.guid + "\t" + actual_label + "\n")