<a href="https://colab.research.google.com/github/demelin/ai_reimplementations/blob/main/dialoue_continuation_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
import os
import json
import torch
import pickle
import random
import numpy as np

from tqdm import trange, tqdm
from torch.utils.data import (
    Dataset, DataLoader, RandomSampler, SequentialSampler)
from transformers import (
    AutoConfig, AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup, get_constant_schedule)
from datasets import load_dataset

In [None]:
# Define experiment configuration


class Config(object):

  """ Bundle of default hyper-parameters and run settings.

  Every constructor argument is stored unchanged as an instance attribute of
  the same name; pass keyword arguments to override individual defaults. """

  def __init__(
      self,
      max_seq_len=256,
      train_batch_size=32,
      eval_batch_size=32,
      gradient_accumulation_steps=4,
      num_train_steps=1000,
      eval_after_steps=50,
      patience=3,
      learning_rate=1e-5,
      weight_decay=0.01,
      adam_epsilon=1e-8,
      max_grad_norm=1.0,
      random_seed=42,
      logging_steps=25,
      use_cpu=False,
      checkpoint="facebookAI/roberta-base",
      dataset="daily_dialog"
      ):

    # Input length and batching
    self.max_seq_len = max_seq_len
    self.train_batch_size = train_batch_size
    self.eval_batch_size = eval_batch_size
    self.gradient_accumulation_steps = gradient_accumulation_steps

    # Training schedule: length, logging / evaluation cadence, early stopping
    self.num_train_steps = num_train_steps
    self.logging_steps = logging_steps
    self.eval_after_steps = eval_after_steps
    self.patience = patience

    # Optimizer settings
    self.learning_rate = learning_rate
    self.weight_decay = weight_decay
    self.adam_epsilon = adam_epsilon
    self.max_grad_norm = max_grad_norm

    # Run environment: device choice, model checkpoint, dataset, RNG seed
    self.use_cpu = use_cpu
    self.checkpoint = checkpoint
    self.dataset = dataset
    self.random_seed = random_seed

In [None]:
class NewDataset(Dataset):

    """ Index-addressable container of encoded samples, as expected by
    DataLoader. Holds three parallel collections: input ids, attention masks
    and labels. """

    def __init__(self, input_ids, attention_masks, labels):

        self.input_ids, self.attention_masks, self.labels = (
            input_ids, attention_masks, labels)

    def __len__(self):

        # The dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, item_id):

        # Keys are chosen so that a batch can be passed to the model as
        # keyword arguments
        return {
            "input_ids": self.input_ids[item_id],
            "labels": self.labels[item_id],
            "attention_mask": self.attention_masks[item_id],
            }

In [None]:
# Prepare data for training / evaluation / testing

class DataProcessor(object):

  """ Prepares data for use with the model and returs
    data + DataLoader objects. """

  def __init__(self, config, tokenizer):

    self.config = config
    self.tokenizer = tokenizer
    self.labels_list = [0, 1]
    self.num_labels = len(self.labels_list)

    # Load data
    self.train_data, self.eval_data, self.test_data = self._get_raw_data()
    # Transorm data
    self.train_data = self._transform_data(self.train_data)
    self.eval_data = self._transform_data(self.eval_data)
    self.test_data = self._transform_data(self.test_data, is_test=True)
    # Embed training and valiadtion data
    train_samples = self.encode_truncate_pad(self.train_data)
    eval_samples = self.encode_truncate_pad(self.eval_data)
    # Create Datasets
    train_dataset =  self._make_dataset(train_samples)
    eval_dataset =  self._make_dataset(eval_samples)

    # Create DataLoaders
    # Use a random sampler for training
    train_sampler = RandomSampler(train_dataset)
    self.train_dataloader = DataLoader(train_dataset,
                                       sampler=train_sampler,
                                       batch_size=self.config.train_batch_size)

    # Use a sequential sampler for evaluation for consistency
    eval_sampler = SequentialSampler(eval_dataset)
    self.eval_dataloader = DataLoader(eval_dataset,
                                      sampler=eval_sampler,
                                      batch_size=self.config.eval_batch_size)


  def _get_raw_data(self):

    """ Downloads datasets from HuggingFace Datasets;
      Assumes use of the DailyDialog dataset, may require adjustment for other
      datasets. """

    print("Dowloading data ...")
    # For prototyping purposes, we only look at a subset of the data
    train_data = load_dataset(
        self.config.dataset, split="train")["dialog"]
    eval_data = load_dataset(
        self.config.dataset, split="validation")[:500]["dialog"]
    test_data = load_dataset(
        self.config.dataset, split="test")["dialog"]

    return train_data, eval_data, test_data


  def _transform_data(self, dialog_list, num_distractors=1, is_test=False):

    """ Transforms available dialogs for the dialog-completion task. """

    print("Transforming data ...")
    # Declare dialogue participant "tags"; customizable / optional
    person_tags = ["Person 1", "Person 2"]

    # dialog_list is a list of lists, with the final element in each list
    # representing the target continuation
    # 1. Split dialogues into context and target
    # 2. Sample distractor continuations form other dialogs for each dialog
    # 3. Create model inputs by paring contexts with continuations and assigning
    # a label to each sample

    # Note: Here the distractors are selected at random, but we can exploit
    # semantic similarity and other methods to "mine" hard distractors for
    # more robust models

    # Obtain true samples
    true_samples = list()
    distractor_bank = set()
    for dlg in dialog_list:
      # Add person tags to the context sentences (may not be neccessary?)
      context_text = dlg[:-1] + [" "]
      aug_context_text = list()
      for turn_id, turn in enumerate(context_text):
        turn = "{:s}: {}".format(person_tags[turn_id % len(person_tags)],
                                 turn.strip())

        aug_context_text.append(turn)

      new_sample = {"context": ' '.join(aug_context_text),
                    "target": dlg[-1],
                    "label": 1}
      true_samples.append(new_sample)
      distractor_bank.add(new_sample["target"])

    # Get negative samples for training and eval
    if not is_test:
      # Obtain false samples
      false_samples = list()
      distractor_bank = list(distractor_bank)
      for ts in true_samples:
        for _ in range(num_distractors):
          sampled_distractor = random.choice(distractor_bank)
          # make sure the distractor is different from the true target
          while sampled_distractor == ts["target"]:
            sampled_distractor = random.choice(distractor_bank)
          false_samples.append({"context": ts["context"],
                                "target": sampled_distractor,
                                "label": 0})

        # Combine and shuffle
        all_samples = true_samples + false_samples

    else:
      # Test data is created differently, by pairing each context with N
      # (e.g. 5 here) distractors; in 50% of cases correct continuation is
      # replaced with another distractor

      random.uniform(0, 1)

      # Create test samples
      all_samples = list()
      distractor_bank = list(distractor_bank)
      for ts in true_samples:
        keep_true = random.uniform(0., 1.)
        if keep_true < 0.5:
          # Keep the true continuation
          target_list = [ts["target"]]
          num_distractors = 5
        else:
          # Replace true continuation with distractor
          target_list = list()
          num_distractors = 6

        for _ in range(num_distractors):
          sampled_distractor = random.choice(distractor_bank)
          while sampled_distractor == ts["target"] or sampled_distractor in target_list:
            sampled_distractor = random.choice(distractor_bank)
          target_list.append(sampled_distractor)

        # Get ID of correct continuation
        if keep_true < 0.5:
          label = ts["target"]
        else:
          label = -1

        all_samples.append({"context": ts["context"],
                            "targets": target_list,
                            "label": label})

    return all_samples


  def encode_truncate_pad(self, dialog_samples):
      """ Encodes, truncates, and pads the individual samples. """

      # RoBERTa specific tokenizer properties for smart-ish truncation
      num_special_tokens = 4  # 1 * <s> + 3 * </s>
      max_sample_len = self.config.max_seq_len - num_special_tokens

      # Truncation strategy: Only runcate the context, by truncating the
      # beginning of the context rather than its end, as the context end is
      # likely to be more predictive of the dialog continuation
      print("Encoding data ...")
      encoded_samples = list()
      for ds in dialog_samples:
        # Encode context and target
        context_ids = self.tokenizer.encode(ds["context"], add_special_tokens=False)
        target_ids = self.tokenizer.encode(ds["target"], add_special_tokens=False)

        # Skip sample if the response is too long
        if len(target_ids) >= (max_sample_len - 100):
          continue

        max_context_len = max_sample_len - len(target_ids)
        # Truncate context
        truncated_context_ids = context_ids[
            max(0, len(context_ids) - max_context_len):]
        sample_ids = [self.tokenizer.bos_token_id] + truncated_context_ids + [
            self.tokenizer.sep_token_id] * 2 + target_ids
        # Create attention mask
        attention_mask = [1] * len(sample_ids)
        # Pad sequences to max_seq_len if necessary
        pad_size = self.config.max_seq_len - len(sample_ids)
        if pad_size > 0:
            sample_ids += [self.tokenizer.pad_token_id] * pad_size
            attention_mask += [0] * pad_size

        # Collect
        encoded_samples.append({"input_ids": sample_ids,
                                "attention_mask": attention_mask,
                                "label": ds["label"]})

      return encoded_samples


  def _make_dataset(self, encoded_samples):

    """ Converts encoded sample into a format usable by the PLM. """

    # Convert to tensors
    feature_dict = {
      "input_ids": torch.tensor(
          [s["input_ids"] for s in encoded_samples], dtype=torch.long),
      "attention_masks": torch.tensor(
          [s["attention_mask"] for s in encoded_samples], dtype=torch.long),
      "labels": torch.tensor(
          [s["label"] for s in encoded_samples], dtype=torch.long)
      }

    # Create DataSet
    return NewDataset(feature_dict["input_ids"],
                      feature_dict["attention_masks"],
                      feature_dict["labels"])


In [None]:
# Models to use for sequence classification method: "roberta-base" (implemented here, see Config.checkpoint)
# Models to use for embedding-based method: "bert-base" + transformer layer stack (initialized randomly)
# Datasets:
#   English-only: DailyDialog
#   Multilingual: OpenSubtitles (requires splitting into turns and cleaning, but seems promising overall)
#       Model: xlm-roberta


class ModelTrainer(object):

  """ Trains and evaluates a dialogue-continuation classifier. """

  def __init__(self, config, tokenizer, data_processor,
               model_id=0, model_path=None):

    """ Loads the model, selects the device and prepares the output folder.

    Args:
      config: settings object; `checkpoint` and `use_cpu` are read here.
      tokenizer: stored on the trainer.
      data_processor: provides `train_dataloader`, `eval_dataloader` and
        `test_data`.
      model_id: integer run id, used in the model name and the save folder.
      model_path: if given, the model is loaded from this path instead of
        `config.checkpoint`.
    """

    # Instantiate variables
    self.config = config
    self.tokenizer = tokenizer
    self.data_processor = data_processor

    # Initialize model to fine-tune (either from HF or from disk)
    # We use two target labels, one denoting a valid dialogue continuation
    # and another denoting an invalid one
    if model_path is None:
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.config.checkpoint, num_labels=2)
    else:
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path)

    # Check if GPU can be used
    cuda_available = torch.cuda.is_available()
    self.device = "cuda" if cuda_available and not self.config.use_cpu else "cpu"
    if self.device == "cuda":
      print("CUDA is available, using GPU for model training and evaluation.")
    elif cuda_available:
      # Do not claim that CUDA is missing when the CPU was requested
      print("CUDA is available but use_cpu is set, using CPU for model training and evaluation.")
    else:
      print("CUDA is unavailable, using CPU for model training and evaluation.")
    self.model.to(self.device)

    # Define model name
    self.model_name = "next_sentence_cls_{}".format(model_id)  # for multiple runs

    # Create dataloaders for training and validation
    self.train_dataloader = self.data_processor.train_dataloader
    self.eval_dataloader = self.data_processor.eval_dataloader
    # Get test data for testing
    self.test_data = self.data_processor.test_data

    # Model saving path, derived from the root folder instead of repeating it
    self.root_dir = "/content/"
    self.model_save_path = os.path.join(
        self.root_dir, "train_run_{:d}".format(model_id))
    # Also creates missing parent folders and tolerates an existing folder
    os.makedirs(self.model_save_path, exist_ok=True)


  def train(self):

    """ Trains the model. """

    # Compute number of training epochs from total training steps
    num_train_epochs = max(1, int(np.round(self.config.num_train_steps // max(
        1, (len(self.train_dataloader)) // self.config.gradient_accumulation_steps))))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters() if not any(
            nd in n for nd in no_decay)],
          'weight_decay': self.config.weight_decay},
        {'params': [p for n, p in self.model.named_parameters() if any(
            nd in n for nd in no_decay)],
          'weight_decay': 0.0}]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=self.config.learning_rate,
                                  eps=self.config.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1,
        num_training_steps=self.config.num_train_steps)

    # Initialize traning variables
    step, global_step = 0, 0
    tr_loss, last_reported_loss = 0.0, 0.0
    mean_training_loss = 0.0
    best_eval_acc = 0.
    stale_count = 0
    # Reset model gradient
    self.model.zero_grad()

    print("Starting training model: {:s}".format(self.model_name))

    # Report hyper-parameters used during training
    print('=' * 20)
    print('Hyperparameters:')
    print("\tmax_seq_len: {}".format(self.config.max_seq_len))
    print("\tgradient_accumulation_steps: {}".format(
        self.config.gradient_accumulation_steps))
    print("\tnum_train_steps: {}".format(self.config.num_train_steps))
    print("\tlearning_rate: {}".format(self.config.learning_rate))
    print("\tweight_decay: {}".format(self.config.weight_decay))
    print("\tadam_epsilon: {}".format(self.config.adam_epsilon))
    print("\tmax_grad_norm: {}".format(self.config.max_grad_norm))
    print("\ttrain_batch_size: {}".format(self.config.train_batch_size))
    print("\teval_batch_size: {}".format(self.config.eval_batch_size))
    print('=' * 20)

    train_iterator = trange(num_train_epochs, desc="Epoch")
    for epoch_id, _ in enumerate(train_iterator):
      epoch_iterator = tqdm(self.train_dataloader, desc="Iteration")
      for batch_id, batch in enumerate(epoch_iterator):

        # Training mode
        self.model.train()
        batch_inputs = {k: t.to(self.device) for k, t in batch.items()}
        outputs = self.model(**batch_inputs)

        # Compute model loss
        loss = outputs.loss

        # Accumulate loss for gradient accumulation
        loss = loss / self.config.gradient_accumulation_steps

        # Track training loss
        tr_loss += loss.item()

        # Get gradients
        loss.backward()
        step += 1

        # Update model after N gradient accumulation steps
        if step % self.config.gradient_accumulation_steps == 0:
          # Backpropagate, reglarize, optimize
          torch.nn.utils.clip_grad_norm_(
              self.model.parameters(), self.config.max_grad_norm)
          optimizer.step()
          scheduler.step()
          # Reset gradient
          self.model.zero_grad()
          global_step += 1

          # Report
          if global_step % self.config.logging_steps == 0:
            curr_mean_loss = (
                tr_loss - last_reported_loss) / self.config.logging_steps
            curr_learning_rate = scheduler.get_last_lr()
            last_reported_loss = tr_loss
            print("\tGlobal step: {:d} | LR: {} | Avg. loss: {:.3f}".format(
                global_step, curr_learning_rate, curr_mean_loss))

          # Evaluate
          if global_step % self.config.eval_after_steps == 0:
            print("=" * 20)
            print("\tEvaluating on valiadtion set!")
            eval_acc = self.eval()
            if eval_acc > best_eval_acc:
              best_eval_acc = eval_acc
              stale_count = 0
              # Save best model
              self._save_model()
            else:
              stale_count += 1
            print("\tEvaluation accuracy: {:.3f} | Stale count: {:d}".format(
                eval_acc, stale_count))
            print("=" * 20)
            if stale_count >= self.config.patience:
              print("Early stop!")
              break

        # Stop iteration after maximum number of training steps
        if self.config.num_train_steps < global_step:
          epoch_iterator.close()
          break

      # Stop iteration after maximum number of training steps
      if 0 < self.config.num_train_steps < global_step or stale_count >= self.config.patience:
        train_iterator.close()
        mean_training_loss = tr_loss / global_step
        print("== Finished training model: {:s} ==".format(self.model_name))
        print("\tMean training loss: {:.3f}".format(mean_training_loss))
        break

    # Report mean training loss for weighted logit combination
    return global_step, mean_training_loss


  def eval(self):

    """ Evaluates the model. """

    # Track performance
    eval_step = 0
    eval_acc = 0.

    # Iterate over evaluation data
    for batch_id, batch in enumerate(
        tqdm(self.eval_dataloader, desc="Evaluating")):

      # Evaluation mode
      self.model.eval()

      # Iterate over evaluation data
      with torch.no_grad():
        # Prepare inputs
        batch_inputs = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"]
            }
        labels = batch["labels"].to(self.device)

        batch_inputs = {k: t.to(self.device) for k, t in batch_inputs.items()}
        # Get outputs
        outputs = self.model(**batch_inputs)
        # Get model loss
        logits = outputs[0]

      # Compute model accuracy
      batch_acc = self._compute_accuracy(logits, labels)
      print(batch_acc)
      eval_acc += batch_acc
      eval_step += 1

    # Report model accuracy on the evaluation set
    mean_eval_acc = eval_acc / eval_step
    print("== Finished evaluating model: {:s} ==".format(self.model_name))
    print("\tMean evaluation acc: {:.3f}".format(mean_eval_acc))

    return mean_eval_acc


  def test(self, context=None, targets=None, label=None):

    """ Tests the model on some arbitrary context-target pair.
      In case multiple continuations are marked as valid, returns the one that
      the model is most confident in. """

    # Prepare data for input to model
    # If no data was provided by user, sample from test data

    if context is None or targets is None:
      print("Sampling a test item from the test data!")
      test_sample = random.choice(self.test_data)
      context = test_sample["context"]
      targets = test_sample["targets"]
      label = test_sample["label"]
    else:
      if type(context) == list:
        context = " ".join([c.strip() for c in context])

    print("Context: {}".format(context))
    print("Targets: {}".format(targets))
    print("Expected output: {}".format(label))

    # Create test samples
    test_samples = list()
    for tgt in targets:
      test_samples.append({"context": context,
                           "target": tgt,
                           "label": int(tgt == label)})
    test_samples = self.data_processor.encode_truncate_pad(test_samples)


    # Evaluation mode
    self.model.eval()

    out_id = None
    max_conf = 0.
    # Iterate over continuations
    for smp_id, smp in enumerate(test_samples):

      with torch.no_grad():
        # Prepare inputs
        input_ids = torch.tensor([smp["input_ids"]], dtype=torch.long)
        attention_mask = torch.tensor([smp["attention_mask"]], dtype=torch.long)
        batch_inputs = {
          "input_ids": input_ids,
          "attention_mask": attention_mask
          }
        batch_inputs = {k: t.to(self.device) for k, t in batch_inputs.items()}
        # Get outputs
        outputs = self.model(**batch_inputs)
        # Get model loss
        logits = outputs[0]
        # Get model prediction and confidence
        probs = torch.nn.functional.softmax(logits, dim=-1)
        conf, cls = torch.max(probs, -1)
        if cls == 1:
          if conf > max_conf:
            out_id = smp_id
            max_conf = conf

    if out_id is None:
      return -1

    else:
      return targets[out_id]


  @staticmethod
  def _compute_accuracy(preds, labels):

    """ Helper for computing evaluation accuracy """

    return (torch.argmax(preds, -1) == labels).float().mean().item()


  def _save_model(self):

    """ Saves a model checkpoint to disk. """

    # Save model
    model = self.model.module if hasattr(self.model, 'module') else self.model
    model.save_pretrained(self.model_save_path)



In [None]:
# Initialize config with its default settings
config = Config()

# Set random seed for Python's `random` module and for PyTorch (CPU and,
# when CUDA is available, all CUDA devices)
random.seed(config.random_seed)
torch.manual_seed(config.random_seed)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(config.random_seed)

# Initialize the tokenizer that belongs to the configured checkpoint
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Prepare data: downloads, transforms and encodes the dataset and builds the
# training / validation DataLoaders
data_processor = DataProcessor(config, tokenizer)

Dowloading data ...
Transforming data ...
Transforming data ...
Transforming data ...
Encoding data ...


Token indices sequence length is longer than the specified maximum sequence length for this model (785 > 512). Running this sequence through the model will result in indexing errors


Encoding data ...


In [None]:
# Reset GPU: empty PyTorch's CUDA memory cache before the model is created
torch.cuda.empty_cache()

In [None]:
# Create the trainer: loads the model and moves it to the selected device
trainer = ModelTrainer(config, tokenizer, data_processor)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at facebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CUDA is available, using GPU for model training and evaluation.


In [None]:
# Train model; returns a tuple (global_step, mean_training_loss)
trainer.train()


Starting training model: next_sentence_cls_0
Hyperparameters:
	max_seq_len: 256
	gradient_accumulation_steps: 4
	num_train_steps: 1000
	learning_rate: 1e-05
	weight_decay: 0.01
	adam_epsilon: 1e-08
	max_grad_norm: 1.0
	train_batch_size: 32
	eval_batch_size: 32


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/695 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/695 [00:01<20:20,  1.76s/it][A
Iteration:   0%|          | 2/695 [00:02<11:27,  1.01it/s][A
Iteration:   0%|          | 3/695 [00:03<12:29,  1.08s/it][A
Iteration:   1%|          | 4/695 [00:05<16:53,  1.47s/it][A
Iteration:   1%|          | 5/695 [00:05<12:41,  1.10s/it][A
Iteration:   1%|          | 6/695 [00:07<13:06,  1.14s/it][A
Iteration:   1%|          | 7/695 [00:08<13:19,  1.16s/it][A
Iteration:   1%|          | 8/695 [00:09<13:28,  1.18s/it][A
Iteration:   1%|▏         | 9/695 [00:10<13:45,  1.20s/it][A
Iteration:   1%|▏         | 10/695 [00:12<13:50,  1.21s/it][A
Iteration:   2%|▏         | 11/695 [00:13<13:53,  1.22s/it][A
Iteration:   2%|▏         | 12/695 [00:14<13:57,  1.23s/it][A
Iteration:   2%|▏         | 13/695 [00:15<14:06,  1.24s/it][A
Iteration:   2%|▏         | 14/695 [00:17<14:06,  1.24s/it][A
Iteration:   2%|▏         | 

	Global step: 25 | LR: [9.75975975975976e-06, 9.75975975975976e-06] | Avg. loss: 0.697



Iteration:  15%|█▍        | 101/695 [02:07<12:36,  1.27s/it][A
Iteration:  15%|█▍        | 102/695 [02:08<12:35,  1.27s/it][A
Iteration:  15%|█▍        | 103/695 [02:10<12:32,  1.27s/it][A
Iteration:  15%|█▍        | 104/695 [02:11<12:31,  1.27s/it][A
Iteration:  15%|█▌        | 105/695 [02:12<12:35,  1.28s/it][A
Iteration:  15%|█▌        | 106/695 [02:14<12:31,  1.28s/it][A
Iteration:  15%|█▌        | 107/695 [02:15<12:28,  1.27s/it][A
Iteration:  16%|█▌        | 108/695 [02:16<12:24,  1.27s/it][A
Iteration:  16%|█▌        | 109/695 [02:17<12:28,  1.28s/it][A
Iteration:  16%|█▌        | 110/695 [02:19<12:23,  1.27s/it][A
Iteration:  16%|█▌        | 111/695 [02:20<12:24,  1.27s/it][A
Iteration:  16%|█▌        | 112/695 [02:21<12:26,  1.28s/it][A
Iteration:  16%|█▋        | 113/695 [02:22<12:26,  1.28s/it][A
Iteration:  16%|█▋        | 114/695 [02:24<12:19,  1.27s/it][A
Iteration:  17%|█▋        | 115/695 [02:25<12:17,  1.27s/it][A
Iteration:  17%|█▋        | 116/695 [02

	Global step: 50 | LR: [9.50950950950951e-06, 9.50950950950951e-06] | Avg. loss: 0.683
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.25s/it][A[A

0.84375




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.29it/s][A[A

0.65625




Evaluating:   9%|▉         | 3/32 [00:02<00:18,  1.61it/s][A[A

0.75




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.82it/s][A[A

0.75




Evaluating:  16%|█▌        | 5/32 [00:03<00:13,  1.96it/s][A[A

0.84375




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.06it/s][A[A

0.75




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.12it/s][A[A

0.625




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.16it/s][A[A

0.8125




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.20it/s][A[A

0.78125




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.78125




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.75




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.90625




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.71875




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.78125




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.26it/s][A[A

0.65625




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.71875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.26it/s][A[A

0.71875




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.625




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.8125




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.27it/s][A[A

0.625




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.6875




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.65625




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.625




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.75




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.65625




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.75




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.71875




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.27it/s][A[A

0.65625




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.59375




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.71875




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.27it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

0.625
0.75
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.722




Iteration:  29%|██▉       | 200/695 [04:29<51:24,  6.23s/it][A

	Evaluation accuracy: 0.722 | Stale count: 0



Iteration:  29%|██▉       | 201/695 [04:30<37:12,  4.52s/it][A
Iteration:  29%|██▉       | 202/695 [04:31<29:03,  3.54s/it][A
Iteration:  29%|██▉       | 203/695 [04:32<23:22,  2.85s/it][A
Iteration:  29%|██▉       | 204/695 [04:34<19:27,  2.38s/it][A
Iteration:  29%|██▉       | 205/695 [04:35<16:42,  2.05s/it][A
Iteration:  30%|██▉       | 206/695 [04:36<14:42,  1.80s/it][A
Iteration:  30%|██▉       | 207/695 [04:37<13:19,  1.64s/it][A
Iteration:  30%|██▉       | 208/695 [04:39<12:22,  1.52s/it][A
Iteration:  30%|███       | 209/695 [04:40<11:46,  1.45s/it][A
Iteration:  30%|███       | 210/695 [04:41<11:15,  1.39s/it][A
Iteration:  30%|███       | 211/695 [04:42<10:55,  1.35s/it][A
Iteration:  31%|███       | 212/695 [04:44<10:41,  1.33s/it][A
Iteration:  31%|███       | 213/695 [04:45<10:35,  1.32s/it][A
Iteration:  31%|███       | 214/695 [04:46<10:25,  1.30s/it][A
Iteration:  31%|███       | 215/695 [04:47<10:16,  1.28s/it][A
Iteration:  31%|███       | 216/695 [04

	Global step: 75 | LR: [9.25925925925926e-06, 9.25925925925926e-06] | Avg. loss: 0.634



Iteration:  43%|████▎     | 301/695 [06:36<08:19,  1.27s/it][A
Iteration:  43%|████▎     | 302/695 [06:38<08:16,  1.26s/it][A
Iteration:  44%|████▎     | 303/695 [06:39<08:14,  1.26s/it][A
Iteration:  44%|████▎     | 304/695 [06:40<08:12,  1.26s/it][A
Iteration:  44%|████▍     | 305/695 [06:41<08:15,  1.27s/it][A
Iteration:  44%|████▍     | 306/695 [06:43<08:11,  1.26s/it][A
Iteration:  44%|████▍     | 307/695 [06:44<08:09,  1.26s/it][A
Iteration:  44%|████▍     | 308/695 [06:45<08:07,  1.26s/it][A
Iteration:  44%|████▍     | 309/695 [06:47<08:10,  1.27s/it][A
Iteration:  45%|████▍     | 310/695 [06:48<08:06,  1.26s/it][A
Iteration:  45%|████▍     | 311/695 [06:49<08:05,  1.26s/it][A
Iteration:  45%|████▍     | 312/695 [06:50<08:05,  1.27s/it][A
Iteration:  45%|████▌     | 313/695 [06:52<08:05,  1.27s/it][A
Iteration:  45%|████▌     | 314/695 [06:53<08:02,  1.27s/it][A
Iteration:  45%|████▌     | 315/695 [06:54<08:00,  1.27s/it][A
Iteration:  45%|████▌     | 316/695 [06

	Global step: 100 | LR: [9.00900900900901e-06, 9.00900900900901e-06] | Avg. loss: 0.563
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.25s/it][A[A

0.65625




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.30it/s][A[A

0.5




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.61it/s][A[A

0.625




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.82it/s][A[A

0.625




Evaluating:  16%|█▌        | 5/32 [00:03<00:13,  1.96it/s][A[A

0.625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.06it/s][A[A

0.71875




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.12it/s][A[A

0.5625




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.16it/s][A[A

0.78125




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.20it/s][A[A

0.625




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.65625




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.71875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.78125




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.5




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.65625




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.26it/s][A[A

0.5




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.26it/s][A[A

0.71875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.90625




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.84375




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.84375




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.27it/s][A[A

0.9375




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.8125




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.28it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.90625




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.875




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.875




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.90625




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.84375




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.27it/s][A[A

0.90625




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.9375




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.875




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.27it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

0.875
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.762




Iteration:  58%|█████▊    | 400/695 [09:00<33:44,  6.86s/it][A

	Evaluation accuracy: 0.762 | Stale count: 0



Iteration:  58%|█████▊    | 401/695 [09:01<24:21,  4.97s/it][A
Iteration:  58%|█████▊    | 402/695 [09:02<18:48,  3.85s/it][A
Iteration:  58%|█████▊    | 403/695 [09:04<14:55,  3.07s/it][A
Iteration:  58%|█████▊    | 404/695 [09:05<12:14,  2.52s/it][A
Iteration:  58%|█████▊    | 405/695 [09:06<10:24,  2.15s/it][A
Iteration:  58%|█████▊    | 406/695 [09:07<09:04,  1.88s/it][A
Iteration:  59%|█████▊    | 407/695 [09:09<08:08,  1.69s/it][A
Iteration:  59%|█████▊    | 408/695 [09:10<07:29,  1.57s/it][A
Iteration:  59%|█████▉    | 409/695 [09:11<07:05,  1.49s/it][A
Iteration:  59%|█████▉    | 410/695 [09:12<06:42,  1.41s/it][A
Iteration:  59%|█████▉    | 411/695 [09:14<06:27,  1.36s/it][A
Iteration:  59%|█████▉    | 412/695 [09:15<06:17,  1.34s/it][A
Iteration:  59%|█████▉    | 413/695 [09:16<06:13,  1.32s/it][A
Iteration:  60%|█████▉    | 414/695 [09:17<06:06,  1.30s/it][A
Iteration:  60%|█████▉    | 415/695 [09:19<06:00,  1.29s/it][A
Iteration:  60%|█████▉    | 416/695 [09

	Global step: 125 | LR: [8.75875875875876e-06, 8.75875875875876e-06] | Avg. loss: 0.514



Iteration:  72%|███████▏  | 501/695 [11:08<04:06,  1.27s/it][A
Iteration:  72%|███████▏  | 502/695 [11:09<04:03,  1.26s/it][A
Iteration:  72%|███████▏  | 503/695 [11:10<04:01,  1.26s/it][A
Iteration:  73%|███████▎  | 504/695 [11:11<04:01,  1.26s/it][A
Iteration:  73%|███████▎  | 505/695 [11:13<04:02,  1.28s/it][A
Iteration:  73%|███████▎  | 506/695 [11:14<03:59,  1.27s/it][A
Iteration:  73%|███████▎  | 507/695 [11:15<03:58,  1.27s/it][A
Iteration:  73%|███████▎  | 508/695 [11:16<03:56,  1.27s/it][A
Iteration:  73%|███████▎  | 509/695 [11:18<03:56,  1.27s/it][A
Iteration:  73%|███████▎  | 510/695 [11:19<03:54,  1.27s/it][A
Iteration:  74%|███████▎  | 511/695 [11:20<03:52,  1.26s/it][A
Iteration:  74%|███████▎  | 512/695 [11:22<03:51,  1.26s/it][A
Iteration:  74%|███████▍  | 513/695 [11:23<03:51,  1.27s/it][A
Iteration:  74%|███████▍  | 514/695 [11:24<03:48,  1.26s/it][A
Iteration:  74%|███████▍  | 515/695 [11:25<03:46,  1.26s/it][A
Iteration:  74%|███████▍  | 516/695 [11

	Global step: 150 | LR: [8.50850850850851e-06, 8.50850850850851e-06] | Avg. loss: 0.464
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.23s/it][A[A

0.90625




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.31it/s][A[A

0.65625




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.62it/s][A[A

0.875




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.83it/s][A[A

0.8125




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.97it/s][A[A

0.75




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.07it/s][A[A

0.875




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.13it/s][A[A

0.78125




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.17it/s][A[A

0.875




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.20it/s][A[A

0.84375




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.8125




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.9375




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.84375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.8125




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.26it/s][A[A

0.75




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.26it/s][A[A

0.6875




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.78125




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.875




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.26it/s][A[A

0.75




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.8125




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.26it/s][A[A

0.78125




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.75




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.8125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.875




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.26it/s][A[A

0.8125




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.26it/s][A[A

0.8125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.25it/s][A[A

0.8125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.26it/s][A[A

0.8125




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.27it/s][A[A

0.8125




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.8125




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.27it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

0.71875
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.814




Iteration:  86%|████████▋ | 600/695 [13:34<11:44,  7.42s/it][A

	Evaluation accuracy: 0.814 | Stale count: 0



Iteration:  86%|████████▋ | 601/695 [13:34<08:23,  5.36s/it][A
Iteration:  87%|████████▋ | 602/695 [13:35<06:23,  4.12s/it][A
Iteration:  87%|████████▋ | 603/695 [13:37<04:59,  3.26s/it][A
Iteration:  87%|████████▋ | 604/695 [13:38<04:02,  2.67s/it][A
Iteration:  87%|████████▋ | 605/695 [13:39<03:21,  2.24s/it][A
Iteration:  87%|████████▋ | 606/695 [13:40<02:52,  1.94s/it][A
Iteration:  87%|████████▋ | 607/695 [13:42<02:32,  1.74s/it][A
Iteration:  87%|████████▋ | 608/695 [13:43<02:18,  1.60s/it][A
Iteration:  88%|████████▊ | 609/695 [13:44<02:09,  1.50s/it][A
Iteration:  88%|████████▊ | 610/695 [13:46<02:01,  1.43s/it][A
Iteration:  88%|████████▊ | 611/695 [13:47<01:55,  1.38s/it][A
Iteration:  88%|████████▊ | 612/695 [13:48<01:52,  1.35s/it][A
Iteration:  88%|████████▊ | 613/695 [13:49<01:49,  1.34s/it][A
Iteration:  88%|████████▊ | 614/695 [13:51<01:46,  1.31s/it][A
Iteration:  88%|████████▊ | 615/695 [13:52<01:43,  1.30s/it][A
Iteration:  89%|████████▊ | 616/695 [13

	Global step: 175 | LR: [8.258258258258259e-06, 8.258258258258259e-06] | Avg. loss: 0.444



Iteration:   1%|          | 6/695 [00:07<14:32,  1.27s/it][A
Iteration:   1%|          | 7/695 [00:08<14:30,  1.26s/it][A
Iteration:   1%|          | 8/695 [00:10<14:28,  1.26s/it][A
Iteration:   1%|▏         | 9/695 [00:11<14:28,  1.27s/it][A
Iteration:   1%|▏         | 10/695 [00:12<14:34,  1.28s/it][A
Iteration:   2%|▏         | 11/695 [00:13<14:29,  1.27s/it][A
Iteration:   2%|▏         | 12/695 [00:15<14:25,  1.27s/it][A
Iteration:   2%|▏         | 13/695 [00:16<14:25,  1.27s/it][A
Iteration:   2%|▏         | 14/695 [00:17<14:27,  1.27s/it][A
Iteration:   2%|▏         | 15/695 [00:18<14:23,  1.27s/it][A
Iteration:   2%|▏         | 16/695 [00:20<14:19,  1.27s/it][A
Iteration:   2%|▏         | 17/695 [00:21<14:20,  1.27s/it][A
Iteration:   3%|▎         | 18/695 [00:22<14:26,  1.28s/it][A
Iteration:   3%|▎         | 19/695 [00:24<14:20,  1.27s/it][A
Iteration:   3%|▎         | 20/695 [00:25<14:17,  1.27s/it][A
Iteration:   3%|▎         | 21/695 [00:26<14:13,  1.27s/it

	Global step: 200 | LR: [8.00800800800801e-06, 8.00800800800801e-06] | Avg. loss: 0.413
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:37,  1.22s/it][A[A

1.0




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.31it/s][A[A

0.6875




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.62it/s][A[A

0.84375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.82it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.96it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.06it/s][A[A

0.875




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.12it/s][A[A

0.84375




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.16it/s][A[A

0.90625




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.19it/s][A[A

0.875




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.21it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.90625




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.24it/s][A[A

0.84375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.25it/s][A[A

0.78125




Evaluating:  44%|████▍     | 14/32 [00:06<00:08,  2.25it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.26it/s][A[A

0.75




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.84375




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.84375




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.78125




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.75




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.26it/s][A[A

0.78125




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.71875




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.75




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.26it/s][A[A

0.78125




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.6875




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.6875




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.78125




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.27it/s][A[A

0.84375




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.75




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.75




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.27it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

Iteration:  15%|█▌        | 105/695 [02:27<55:31,  5.65s/it][A

0.6875
0.875
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.812
	Evaluation accuracy: 0.812 | Stale count: 1



Iteration:  15%|█▌        | 106/695 [02:28<40:14,  4.10s/it][A
Iteration:  15%|█▌        | 107/695 [02:29<31:50,  3.25s/it][A
Iteration:  16%|█▌        | 108/695 [02:30<25:57,  2.65s/it][A
Iteration:  16%|█▌        | 109/695 [02:32<21:51,  2.24s/it][A
Iteration:  16%|█▌        | 110/695 [02:33<19:05,  1.96s/it][A
Iteration:  16%|█▌        | 111/695 [02:34<17:00,  1.75s/it][A
Iteration:  16%|█▌        | 112/695 [02:35<15:32,  1.60s/it][A
Iteration:  16%|█▋        | 113/695 [02:37<14:32,  1.50s/it][A
Iteration:  16%|█▋        | 114/695 [02:38<13:56,  1.44s/it][A
Iteration:  17%|█▋        | 115/695 [02:39<13:23,  1.39s/it][A
Iteration:  17%|█▋        | 116/695 [02:40<12:59,  1.35s/it][A
Iteration:  17%|█▋        | 117/695 [02:42<12:45,  1.32s/it][A
Iteration:  17%|█▋        | 118/695 [02:43<12:40,  1.32s/it][A
Iteration:  17%|█▋        | 119/695 [02:44<12:28,  1.30s/it][A
Iteration:  17%|█▋        | 120/695 [02:46<12:18,  1.28s/it][A
Iteration:  17%|█▋        | 121/695 [02

	Global step: 225 | LR: [7.757757757757758e-06, 7.757757757757758e-06] | Avg. loss: 0.397



Iteration:  30%|██▉       | 206/695 [04:35<10:22,  1.27s/it][A
Iteration:  30%|██▉       | 207/695 [04:36<10:19,  1.27s/it][A
Iteration:  30%|██▉       | 208/695 [04:37<10:15,  1.26s/it][A
Iteration:  30%|███       | 209/695 [04:38<10:13,  1.26s/it][A
Iteration:  30%|███       | 210/695 [04:40<10:17,  1.27s/it][A
Iteration:  30%|███       | 211/695 [04:41<10:15,  1.27s/it][A
Iteration:  31%|███       | 212/695 [04:42<10:13,  1.27s/it][A
Iteration:  31%|███       | 213/695 [04:44<10:13,  1.27s/it][A
Iteration:  31%|███       | 214/695 [04:45<10:14,  1.28s/it][A
Iteration:  31%|███       | 215/695 [04:46<10:10,  1.27s/it][A
Iteration:  31%|███       | 216/695 [04:47<10:06,  1.27s/it][A
Iteration:  31%|███       | 217/695 [04:49<10:06,  1.27s/it][A
Iteration:  31%|███▏      | 218/695 [04:50<10:08,  1.28s/it][A
Iteration:  32%|███▏      | 219/695 [04:51<10:04,  1.27s/it][A
Iteration:  32%|███▏      | 220/695 [04:52<10:01,  1.27s/it][A
Iteration:  32%|███▏      | 221/695 [04

	Global step: 250 | LR: [7.507507507507507e-06, 7.507507507507507e-06] | Avg. loss: 0.384
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.23s/it][A[A

1.0




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.30it/s][A[A

0.90625




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.61it/s][A[A

0.9375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.83it/s][A[A

0.90625




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.97it/s][A[A

0.96875




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.07it/s][A[A

0.9375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.12it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.17it/s][A[A

0.90625




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.20it/s][A[A

0.96875




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.96875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.90625




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.9375




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.90625




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.27it/s][A[A

0.84375




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.90625




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.90625




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.65625




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.71875




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.27it/s][A[A

0.75




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.6875




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.28it/s][A[A

0.8125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.8125




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.6875




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.6875




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.75




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.28it/s][A[A

0.71875




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.6875




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.71875




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.28it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.20it/s]

0.5625
0.875
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.831




Iteration:  44%|████▍     | 305/695 [07:05<56:58,  8.77s/it][A

	Evaluation accuracy: 0.831 | Stale count: 0



Iteration:  44%|████▍     | 306/695 [07:06<40:51,  6.30s/it][A
Iteration:  44%|████▍     | 307/695 [07:07<30:55,  4.78s/it][A
Iteration:  44%|████▍     | 308/695 [07:08<23:59,  3.72s/it][A
Iteration:  44%|████▍     | 309/695 [07:09<19:06,  2.97s/it][A
Iteration:  45%|████▍     | 310/695 [07:11<15:47,  2.46s/it][A
Iteration:  45%|████▍     | 311/695 [07:12<13:24,  2.10s/it][A
Iteration:  45%|████▍     | 312/695 [07:13<11:46,  1.85s/it][A
Iteration:  45%|████▌     | 313/695 [07:15<10:37,  1.67s/it][A
Iteration:  45%|████▌     | 314/695 [07:16<09:52,  1.56s/it][A
Iteration:  45%|████▌     | 315/695 [07:17<09:16,  1.46s/it][A
Iteration:  45%|████▌     | 316/695 [07:18<08:51,  1.40s/it][A
Iteration:  46%|████▌     | 317/695 [07:20<08:36,  1.37s/it][A
Iteration:  46%|████▌     | 318/695 [07:21<08:27,  1.35s/it][A
Iteration:  46%|████▌     | 319/695 [07:22<08:15,  1.32s/it][A
Iteration:  46%|████▌     | 320/695 [07:23<08:07,  1.30s/it][A
Iteration:  46%|████▌     | 321/695 [07

	Global step: 275 | LR: [7.257257257257258e-06, 7.257257257257258e-06] | Avg. loss: 0.352



Iteration:  58%|█████▊    | 406/695 [09:13<06:08,  1.27s/it][A
Iteration:  59%|█████▊    | 407/695 [09:14<06:05,  1.27s/it][A
Iteration:  59%|█████▊    | 408/695 [09:15<06:03,  1.27s/it][A
Iteration:  59%|█████▉    | 409/695 [09:17<06:01,  1.26s/it][A
Iteration:  59%|█████▉    | 410/695 [09:18<06:02,  1.27s/it][A
Iteration:  59%|█████▉    | 411/695 [09:19<05:59,  1.26s/it][A
Iteration:  59%|█████▉    | 412/695 [09:21<05:58,  1.27s/it][A
Iteration:  59%|█████▉    | 413/695 [09:22<05:57,  1.27s/it][A
Iteration:  60%|█████▉    | 414/695 [09:23<05:57,  1.27s/it][A
Iteration:  60%|█████▉    | 415/695 [09:24<05:54,  1.27s/it][A
Iteration:  60%|█████▉    | 416/695 [09:26<05:52,  1.26s/it][A
Iteration:  60%|██████    | 417/695 [09:27<05:50,  1.26s/it][A
Iteration:  60%|██████    | 418/695 [09:28<05:53,  1.28s/it][A
Iteration:  60%|██████    | 419/695 [09:29<05:50,  1.27s/it][A
Iteration:  60%|██████    | 420/695 [09:31<05:47,  1.26s/it][A
Iteration:  61%|██████    | 421/695 [09

	Global step: 300 | LR: [7.007007007007007e-06, 7.007007007007007e-06] | Avg. loss: 0.377
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.23s/it][A[A

0.96875




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.30it/s][A[A

0.8125




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.61it/s][A[A

0.90625




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.82it/s][A[A

0.8125




Evaluating:  16%|█▌        | 5/32 [00:03<00:13,  1.96it/s][A[A

0.8125




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.06it/s][A[A

0.84375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.11it/s][A[A

0.875




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.15it/s][A[A

0.8125




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.19it/s][A[A

0.875




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.84375




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.9375




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.90625




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.875




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.90625




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.27it/s][A[A

0.78125




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.26it/s][A[A

0.9375




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.26it/s][A[A

0.875




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.78125




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.26it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.8125




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.26it/s][A[A

0.90625




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.26it/s][A[A

0.75




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.8125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.71875




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.8125




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.27it/s][A[A

0.875




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.90625




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.78125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.27it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

0.6875
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.849




Iteration:  73%|███████▎  | 505/695 [11:43<27:48,  8.78s/it][A

	Evaluation accuracy: 0.849 | Stale count: 0



Iteration:  73%|███████▎  | 506/695 [11:44<19:51,  6.31s/it][A
Iteration:  73%|███████▎  | 507/695 [11:45<14:59,  4.78s/it][A
Iteration:  73%|███████▎  | 508/695 [11:46<11:35,  3.72s/it][A
Iteration:  73%|███████▎  | 509/695 [11:48<09:13,  2.97s/it][A
Iteration:  73%|███████▎  | 510/695 [11:49<07:36,  2.47s/it][A
Iteration:  74%|███████▎  | 511/695 [11:50<06:25,  2.10s/it][A
Iteration:  74%|███████▎  | 512/695 [11:51<05:36,  1.84s/it][A
Iteration:  74%|███████▍  | 513/695 [11:53<05:02,  1.66s/it][A
Iteration:  74%|███████▍  | 514/695 [11:54<04:41,  1.55s/it][A
Iteration:  74%|███████▍  | 515/695 [11:55<04:23,  1.46s/it][A
Iteration:  74%|███████▍  | 516/695 [11:56<04:11,  1.40s/it][A
Iteration:  74%|███████▍  | 517/695 [11:58<04:02,  1.36s/it][A
Iteration:  75%|███████▍  | 518/695 [11:59<03:57,  1.34s/it][A
Iteration:  75%|███████▍  | 519/695 [12:00<03:51,  1.32s/it][A
Iteration:  75%|███████▍  | 520/695 [12:02<03:48,  1.31s/it][A
Iteration:  75%|███████▍  | 521/695 [12

	Global step: 325 | LR: [6.7567567567567575e-06, 6.7567567567567575e-06] | Avg. loss: 0.346



Iteration:  87%|████████▋ | 606/695 [13:51<01:53,  1.28s/it][A
Iteration:  87%|████████▋ | 607/695 [13:52<01:51,  1.27s/it][A
Iteration:  87%|████████▋ | 608/695 [13:53<01:50,  1.27s/it][A
Iteration:  88%|████████▊ | 609/695 [13:55<01:49,  1.27s/it][A
Iteration:  88%|████████▊ | 610/695 [13:56<01:48,  1.28s/it][A
Iteration:  88%|████████▊ | 611/695 [13:57<01:47,  1.28s/it][A
Iteration:  88%|████████▊ | 612/695 [13:59<01:45,  1.27s/it][A
Iteration:  88%|████████▊ | 613/695 [14:00<01:43,  1.27s/it][A
Iteration:  88%|████████▊ | 614/695 [14:01<01:43,  1.27s/it][A
Iteration:  88%|████████▊ | 615/695 [14:02<01:41,  1.27s/it][A
Iteration:  89%|████████▊ | 616/695 [14:04<01:40,  1.27s/it][A
Iteration:  89%|████████▉ | 617/695 [14:05<01:38,  1.27s/it][A
Iteration:  89%|████████▉ | 618/695 [14:06<01:38,  1.28s/it][A
Iteration:  89%|████████▉ | 619/695 [14:07<01:36,  1.27s/it][A
Iteration:  89%|████████▉ | 620/695 [14:09<01:35,  1.27s/it][A
Iteration:  89%|████████▉ | 621/695 [14

	Global step: 350 | LR: [6.506506506506507e-06, 6.506506506506507e-06] | Avg. loss: 0.347
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.23s/it][A[A

0.96875




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.30it/s][A[A

0.8125




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.62it/s][A[A

0.875




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.83it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.97it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.06it/s][A[A

0.875




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.12it/s][A[A

0.84375




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.17it/s][A[A

0.84375




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.21it/s][A[A

0.8125




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.84375




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.24it/s][A[A

0.9375




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.90625




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.78125




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.90625




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.26it/s][A[A

0.8125




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.84375




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.26it/s][A[A

0.9375




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.84375




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.90625




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.26it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.75




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.84375




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.75




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.26it/s][A[A

0.8125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.71875




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.8125




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.27it/s][A[A

0.8125




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.875




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.28it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.20it/s]

Iteration:   1%|▏         | 10/695 [00:27<1:05:49,  5.77s/it][A

0.78125
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.848
	Evaluation accuracy: 0.848 | Stale count: 1



Iteration:   2%|▏         | 11/695 [00:27<47:19,  4.15s/it]  [A
Iteration:   2%|▏         | 12/695 [00:28<37:15,  3.27s/it][A
Iteration:   2%|▏         | 13/695 [00:30<30:13,  2.66s/it][A
Iteration:   2%|▏         | 14/695 [00:31<25:22,  2.24s/it][A
Iteration:   2%|▏         | 15/695 [00:32<22:07,  1.95s/it][A
Iteration:   2%|▏         | 16/695 [00:33<19:40,  1.74s/it][A
Iteration:   2%|▏         | 17/695 [00:35<18:01,  1.60s/it][A
Iteration:   3%|▎         | 18/695 [00:36<16:50,  1.49s/it][A
Iteration:   3%|▎         | 19/695 [00:37<16:10,  1.44s/it][A
Iteration:   3%|▎         | 20/695 [00:38<15:32,  1.38s/it][A
Iteration:   3%|▎         | 21/695 [00:40<15:03,  1.34s/it][A
Iteration:   3%|▎         | 22/695 [00:41<14:49,  1.32s/it][A
Iteration:   3%|▎         | 23/695 [00:42<14:44,  1.32s/it][A
Iteration:   3%|▎         | 24/695 [00:44<14:29,  1.30s/it][A
Iteration:   4%|▎         | 25/695 [00:45<14:21,  1.29s/it][A
Iteration:   4%|▎         | 26/695 [00:46<14:18,  1.

	Global step: 375 | LR: [6.2562562562562565e-06, 6.2562562562562565e-06] | Avg. loss: 0.322



Iteration:  16%|█▌        | 111/695 [02:34<12:21,  1.27s/it][A
Iteration:  16%|█▌        | 112/695 [02:35<12:16,  1.26s/it][A
Iteration:  16%|█▋        | 113/695 [02:36<12:13,  1.26s/it][A
Iteration:  16%|█▋        | 114/695 [02:38<12:12,  1.26s/it][A
Iteration:  17%|█▋        | 115/695 [02:39<12:18,  1.27s/it][A
Iteration:  17%|█▋        | 116/695 [02:40<12:12,  1.26s/it][A
Iteration:  17%|█▋        | 117/695 [02:42<12:09,  1.26s/it][A
Iteration:  17%|█▋        | 118/695 [02:43<12:07,  1.26s/it][A
Iteration:  17%|█▋        | 119/695 [02:44<12:12,  1.27s/it][A
Iteration:  17%|█▋        | 120/695 [02:45<12:06,  1.26s/it][A
Iteration:  17%|█▋        | 121/695 [02:47<12:04,  1.26s/it][A
Iteration:  18%|█▊        | 122/695 [02:48<12:02,  1.26s/it][A
Iteration:  18%|█▊        | 123/695 [02:49<12:07,  1.27s/it][A
Iteration:  18%|█▊        | 124/695 [02:50<12:01,  1.26s/it][A
Iteration:  18%|█▊        | 125/695 [02:52<11:59,  1.26s/it][A
Iteration:  18%|█▊        | 126/695 [02

	Global step: 400 | LR: [6.006006006006007e-06, 6.006006006006007e-06] | Avg. loss: 0.309
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.24s/it][A[A

0.9375




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.29it/s][A[A

0.84375




Evaluating:   9%|▉         | 3/32 [00:02<00:18,  1.60it/s][A[A

0.84375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.81it/s][A[A

0.8125




Evaluating:  16%|█▌        | 5/32 [00:03<00:13,  1.96it/s][A[A

0.875




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.06it/s][A[A

0.84375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.12it/s][A[A

0.84375




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.16it/s][A[A

0.90625




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.20it/s][A[A

0.8125




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.9375




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.90625




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.84375




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.90625




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.26it/s][A[A

0.78125




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.90625




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.26it/s][A[A

0.875




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.25it/s][A[A

0.875




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.26it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.26it/s][A[A

0.75




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.875




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.78125




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.8125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.84375




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.28it/s][A[A

0.84375




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.78125




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.78125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.27it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

0.78125
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.850




Iteration:  30%|███       | 210/695 [05:05<1:11:51,  8.89s/it][A

	Evaluation accuracy: 0.850 | Stale count: 0



Iteration:  30%|███       | 211/695 [05:05<51:30,  6.38s/it]  [A
Iteration:  31%|███       | 212/695 [05:07<38:56,  4.84s/it][A
Iteration:  31%|███       | 213/695 [05:08<30:08,  3.75s/it][A
Iteration:  31%|███       | 214/695 [05:09<24:02,  3.00s/it][A
Iteration:  31%|███       | 215/695 [05:10<19:52,  2.48s/it][A
Iteration:  31%|███       | 216/695 [05:12<16:50,  2.11s/it][A
Iteration:  31%|███       | 217/695 [05:13<14:44,  1.85s/it][A
Iteration:  31%|███▏      | 218/695 [05:14<13:18,  1.67s/it][A
Iteration:  32%|███▏      | 219/695 [05:15<12:21,  1.56s/it][A
Iteration:  32%|███▏      | 220/695 [05:17<11:34,  1.46s/it][A
Iteration:  32%|███▏      | 221/695 [05:18<11:05,  1.40s/it][A
Iteration:  32%|███▏      | 222/695 [05:19<10:44,  1.36s/it][A
Iteration:  32%|███▏      | 223/695 [05:20<10:34,  1.34s/it][A
Iteration:  32%|███▏      | 224/695 [05:22<10:23,  1.32s/it][A
Iteration:  32%|███▏      | 225/695 [05:23<10:15,  1.31s/it][A
Iteration:  33%|███▎      | 226/695 [

	Global step: 425 | LR: [5.755755755755756e-06, 5.755755755755756e-06] | Avg. loss: 0.301



Iteration:  45%|████▍     | 311/695 [07:12<08:11,  1.28s/it][A
Iteration:  45%|████▍     | 312/695 [07:14<08:08,  1.27s/it][A
Iteration:  45%|████▌     | 313/695 [07:15<08:06,  1.27s/it][A
Iteration:  45%|████▌     | 314/695 [07:16<08:06,  1.28s/it][A
Iteration:  45%|████▌     | 315/695 [07:18<08:07,  1.28s/it][A
Iteration:  45%|████▌     | 316/695 [07:19<08:04,  1.28s/it][A
Iteration:  46%|████▌     | 317/695 [07:20<08:03,  1.28s/it][A
Iteration:  46%|████▌     | 318/695 [07:21<07:59,  1.27s/it][A
Iteration:  46%|████▌     | 319/695 [07:23<08:01,  1.28s/it][A
Iteration:  46%|████▌     | 320/695 [07:24<07:55,  1.27s/it][A
Iteration:  46%|████▌     | 321/695 [07:25<07:52,  1.26s/it][A
Iteration:  46%|████▋     | 322/695 [07:26<07:51,  1.26s/it][A
Iteration:  46%|████▋     | 323/695 [07:28<07:55,  1.28s/it][A
Iteration:  47%|████▋     | 324/695 [07:29<07:50,  1.27s/it][A
Iteration:  47%|████▋     | 325/695 [07:30<07:49,  1.27s/it][A
Iteration:  47%|████▋     | 326/695 [07

	Global step: 450 | LR: [5.505505505505506e-06, 5.505505505505506e-06] | Avg. loss: 0.302
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.24s/it][A[A

0.9375




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.30it/s][A[A

0.875




Evaluating:   9%|▉         | 3/32 [00:02<00:18,  1.61it/s][A[A

0.875




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.81it/s][A[A

0.8125




Evaluating:  16%|█▌        | 5/32 [00:03<00:13,  1.96it/s][A[A

0.875




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.06it/s][A[A

0.84375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.11it/s][A[A

0.84375




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.16it/s][A[A

0.9375




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.19it/s][A[A

0.8125




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.8125




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.96875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.84375




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.875




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.27it/s][A[A

0.8125




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.8125




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.26it/s][A[A

0.90625




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.26it/s][A[A

0.875




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.25it/s][A[A

0.875




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.26it/s][A[A

0.84375




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.6875




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.84375




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.26it/s][A[A

0.90625




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.26it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.78125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.90625




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.27it/s][A[A

0.875




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.875




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.27it/s][A[A

0.78125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.28it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

0.75
0.875
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.851




Iteration:  59%|█████▉    | 410/695 [09:39<36:01,  7.58s/it][A

	Evaluation accuracy: 0.851 | Stale count: 0



Iteration:  59%|█████▉    | 411/695 [09:40<25:55,  5.48s/it][A
Iteration:  59%|█████▉    | 412/695 [09:41<19:49,  4.20s/it][A
Iteration:  59%|█████▉    | 413/695 [09:42<15:34,  3.31s/it][A
Iteration:  60%|█████▉    | 414/695 [09:43<12:38,  2.70s/it][A
Iteration:  60%|█████▉    | 415/695 [09:45<10:37,  2.28s/it][A
Iteration:  60%|█████▉    | 416/695 [09:46<09:10,  1.97s/it][A
Iteration:  60%|██████    | 417/695 [09:47<08:07,  1.75s/it][A
Iteration:  60%|██████    | 418/695 [09:48<07:24,  1.61s/it][A
Iteration:  60%|██████    | 419/695 [09:50<06:56,  1.51s/it][A
Iteration:  60%|██████    | 420/695 [09:51<06:34,  1.43s/it][A
Iteration:  61%|██████    | 421/695 [09:52<06:17,  1.38s/it][A
Iteration:  61%|██████    | 422/695 [09:53<06:06,  1.34s/it][A
Iteration:  61%|██████    | 423/695 [09:55<06:02,  1.33s/it][A
Iteration:  61%|██████    | 424/695 [09:56<05:54,  1.31s/it][A
Iteration:  61%|██████    | 425/695 [09:57<05:49,  1.29s/it][A
Iteration:  61%|██████▏   | 426/695 [09

	Global step: 475 | LR: [5.255255255255256e-06, 5.255255255255256e-06] | Avg. loss: 0.302



Iteration:  74%|███████▎  | 511/695 [11:47<03:55,  1.28s/it][A
Iteration:  74%|███████▎  | 512/695 [11:48<03:52,  1.27s/it][A
Iteration:  74%|███████▍  | 513/695 [11:49<03:50,  1.27s/it][A
Iteration:  74%|███████▍  | 514/695 [11:50<03:48,  1.26s/it][A
Iteration:  74%|███████▍  | 515/695 [11:52<03:49,  1.27s/it][A
Iteration:  74%|███████▍  | 516/695 [11:53<03:46,  1.27s/it][A
Iteration:  74%|███████▍  | 517/695 [11:54<03:45,  1.26s/it][A
Iteration:  75%|███████▍  | 518/695 [11:55<03:43,  1.26s/it][A
Iteration:  75%|███████▍  | 519/695 [11:57<03:43,  1.27s/it][A
Iteration:  75%|███████▍  | 520/695 [11:58<03:41,  1.27s/it][A
Iteration:  75%|███████▍  | 521/695 [11:59<03:39,  1.26s/it][A
Iteration:  75%|███████▌  | 522/695 [12:00<03:38,  1.26s/it][A
Iteration:  75%|███████▌  | 523/695 [12:02<03:38,  1.27s/it][A
Iteration:  75%|███████▌  | 524/695 [12:03<03:36,  1.27s/it][A
Iteration:  76%|███████▌  | 525/695 [12:04<03:35,  1.27s/it][A
Iteration:  76%|███████▌  | 526/695 [12

	Global step: 500 | LR: [5.005005005005006e-06, 5.005005005005006e-06] | Avg. loss: 0.290
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.24s/it][A[A

1.0




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.30it/s][A[A

0.9375




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.62it/s][A[A

0.9375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.83it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.97it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.07it/s][A[A

0.90625




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.13it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.17it/s][A[A

0.9375




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.20it/s][A[A

0.875




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.23it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.24it/s][A[A

0.96875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.26it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.27it/s][A[A

0.875




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.26it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.27it/s][A[A

0.875




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.84375




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.8125




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.875




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.27it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.28it/s][A[A

0.65625




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.28it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.78125




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.75




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.27it/s][A[A

0.71875




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.28it/s][A[A

0.8125




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.28it/s][A[A

0.84375




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.28it/s][A[A

0.78125




Evaluating:  94%|█████████▍| 30/32 [00:13<00:00,  2.27it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.28it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.20it/s]

0.75
0.875
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.854




Iteration:  88%|████████▊ | 610/695 [14:12<10:23,  7.34s/it][A

	Evaluation accuracy: 0.854 | Stale count: 0



Iteration:  88%|████████▊ | 611/695 [14:13<07:25,  5.30s/it][A
Iteration:  88%|████████▊ | 612/695 [14:14<05:38,  4.08s/it][A
Iteration:  88%|████████▊ | 613/695 [14:15<04:24,  3.22s/it][A
Iteration:  88%|████████▊ | 614/695 [14:16<03:33,  2.63s/it][A
Iteration:  88%|████████▊ | 615/695 [14:18<02:58,  2.23s/it][A
Iteration:  89%|████████▊ | 616/695 [14:19<02:32,  1.93s/it][A
Iteration:  89%|████████▉ | 617/695 [14:20<02:14,  1.72s/it][A
Iteration:  89%|████████▉ | 618/695 [14:21<02:01,  1.58s/it][A
Iteration:  89%|████████▉ | 619/695 [14:23<01:53,  1.50s/it][A
Iteration:  89%|████████▉ | 620/695 [14:24<01:47,  1.43s/it][A
Iteration:  89%|████████▉ | 621/695 [14:25<01:41,  1.37s/it][A
Iteration:  89%|████████▉ | 622/695 [14:26<01:37,  1.34s/it][A
Iteration:  90%|████████▉ | 623/695 [14:28<01:35,  1.33s/it][A
Iteration:  90%|████████▉ | 624/695 [14:29<01:32,  1.31s/it][A
Iteration:  90%|████████▉ | 625/695 [14:30<01:30,  1.29s/it][A
Iteration:  90%|█████████ | 626/695 [14

	Global step: 525 | LR: [4.754754754754755e-06, 4.754754754754755e-06] | Avg. loss: 0.289



Iteration:   2%|▏         | 16/695 [00:20<14:29,  1.28s/it][A
Iteration:   2%|▏         | 17/695 [00:21<14:21,  1.27s/it][A
Iteration:   3%|▎         | 18/695 [00:22<14:18,  1.27s/it][A
Iteration:   3%|▎         | 19/695 [00:23<14:18,  1.27s/it][A
Iteration:   3%|▎         | 20/695 [00:25<14:23,  1.28s/it][A
Iteration:   3%|▎         | 21/695 [00:26<14:16,  1.27s/it][A
Iteration:   3%|▎         | 22/695 [00:27<14:11,  1.27s/it][A
Iteration:   3%|▎         | 23/695 [00:29<14:10,  1.26s/it][A
Iteration:   3%|▎         | 24/695 [00:30<14:16,  1.28s/it][A
Iteration:   4%|▎         | 25/695 [00:31<14:11,  1.27s/it][A
Iteration:   4%|▎         | 26/695 [00:32<14:05,  1.26s/it][A
Iteration:   4%|▍         | 27/695 [00:34<14:04,  1.26s/it][A
Iteration:   4%|▍         | 28/695 [00:35<14:11,  1.28s/it][A
Iteration:   4%|▍         | 29/695 [00:36<14:06,  1.27s/it][A
Iteration:   4%|▍         | 30/695 [00:37<14:06,  1.27s/it][A
Iteration:   4%|▍         | 31/695 [00:39<14:00,  1.27

	Global step: 550 | LR: [4.504504504504505e-06, 4.504504504504505e-06] | Avg. loss: 0.267
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:37,  1.22s/it][A[A

0.9375




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.32it/s][A[A

0.9375




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.63it/s][A[A

0.96875




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.84it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.97it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.08it/s][A[A

0.875




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.13it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.17it/s][A[A

0.875




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.21it/s][A[A

0.84375




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.23it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.24it/s][A[A

0.9375




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.26it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.27it/s][A[A

0.875




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.26it/s][A[A

0.90625




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.27it/s][A[A

0.90625




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.28it/s][A[A

0.90625




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.90625




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.84375




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.27it/s][A[A

0.84375




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.27it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:09<00:04,  2.28it/s][A[A

0.75




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.29it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.28it/s][A[A

0.84375




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.8125




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.8125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.27it/s][A[A

0.78125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.28it/s][A[A

0.90625




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.28it/s][A[A

0.84375




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.28it/s][A[A

0.8125




Evaluating:  94%|█████████▍| 30/32 [00:13<00:00,  2.28it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.28it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.20it/s]

0.6875
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.867




Iteration:  17%|█▋        | 115/695 [02:45<1:11:21,  7.38s/it][A

	Evaluation accuracy: 0.867 | Stale count: 0



Iteration:  17%|█▋        | 116/695 [02:46<51:23,  5.32s/it]  [A
Iteration:  17%|█▋        | 117/695 [02:47<39:27,  4.10s/it][A
Iteration:  17%|█▋        | 118/695 [02:48<31:08,  3.24s/it][A
Iteration:  17%|█▋        | 119/695 [02:50<25:23,  2.64s/it][A
Iteration:  17%|█▋        | 120/695 [02:51<21:25,  2.24s/it][A
Iteration:  17%|█▋        | 121/695 [02:52<18:31,  1.94s/it][A
Iteration:  18%|█▊        | 122/695 [02:53<16:29,  1.73s/it][A
Iteration:  18%|█▊        | 123/695 [02:55<15:07,  1.59s/it][A
Iteration:  18%|█▊        | 124/695 [02:56<14:16,  1.50s/it][A
Iteration:  18%|█▊        | 125/695 [02:57<13:32,  1.43s/it][A
Iteration:  18%|█▊        | 126/695 [02:58<13:03,  1.38s/it][A
Iteration:  18%|█▊        | 127/695 [03:00<12:42,  1.34s/it][A
Iteration:  18%|█▊        | 128/695 [03:01<12:34,  1.33s/it][A
Iteration:  19%|█▊        | 129/695 [03:02<12:23,  1.31s/it][A
Iteration:  19%|█▊        | 130/695 [03:04<12:16,  1.30s/it][A
Iteration:  19%|█▉        | 131/695 [

	Global step: 575 | LR: [4.254254254254255e-06, 4.254254254254255e-06] | Avg. loss: 0.255



Iteration:  31%|███       | 216/695 [04:53<10:12,  1.28s/it][A
Iteration:  31%|███       | 217/695 [04:54<10:06,  1.27s/it][A
Iteration:  31%|███▏      | 218/695 [04:55<10:04,  1.27s/it][A
Iteration:  32%|███▏      | 219/695 [04:57<10:03,  1.27s/it][A
Iteration:  32%|███▏      | 220/695 [04:58<10:07,  1.28s/it][A
Iteration:  32%|███▏      | 221/695 [04:59<10:03,  1.27s/it][A
Iteration:  32%|███▏      | 222/695 [05:00<10:00,  1.27s/it][A
Iteration:  32%|███▏      | 223/695 [05:02<09:59,  1.27s/it][A
Iteration:  32%|███▏      | 224/695 [05:03<10:02,  1.28s/it][A
Iteration:  32%|███▏      | 225/695 [05:04<09:58,  1.27s/it][A
Iteration:  33%|███▎      | 226/695 [05:05<09:54,  1.27s/it][A
Iteration:  33%|███▎      | 227/695 [05:07<09:52,  1.27s/it][A
Iteration:  33%|███▎      | 228/695 [05:08<09:53,  1.27s/it][A
Iteration:  33%|███▎      | 229/695 [05:09<09:49,  1.27s/it][A
Iteration:  33%|███▎      | 230/695 [05:11<09:49,  1.27s/it][A
Iteration:  33%|███▎      | 231/695 [05

	Global step: 600 | LR: [4.004004004004005e-06, 4.004004004004005e-06] | Avg. loss: 0.260
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:37,  1.23s/it][A[A

1.0




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.31it/s][A[A

0.96875




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.62it/s][A[A

0.96875




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.83it/s][A[A

0.90625




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.97it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.07it/s][A[A

0.90625




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.13it/s][A[A

0.9375




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.17it/s][A[A

0.9375




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.21it/s][A[A

1.0




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.24it/s][A[A

1.0




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.26it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.27it/s][A[A

0.9375




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.26it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.27it/s][A[A

0.90625




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.28it/s][A[A

0.90625




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.27it/s][A[A

0.84375




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.27it/s][A[A

0.8125




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.26it/s][A[A

0.84375




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.27it/s][A[A

0.84375




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.28it/s][A[A

0.71875




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.28it/s][A[A

0.75




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.28it/s][A[A

0.75




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.28it/s][A[A

0.78125




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.27it/s][A[A

0.78125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.84375




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.28it/s][A[A

0.8125




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.28it/s][A[A

0.75




Evaluating:  94%|█████████▍| 30/32 [00:13<00:00,  2.27it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.28it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.20it/s]

Iteration:  45%|████▌     | 315/695 [07:13<35:37,  5.63s/it][A

0.65625
0.875
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.865
	Evaluation accuracy: 0.865 | Stale count: 1



Iteration:  45%|████▌     | 316/695 [07:13<25:48,  4.08s/it][A
Iteration:  46%|████▌     | 317/695 [07:14<20:23,  3.24s/it][A
Iteration:  46%|████▌     | 318/695 [07:16<16:35,  2.64s/it][A
Iteration:  46%|████▌     | 319/695 [07:17<13:58,  2.23s/it][A
Iteration:  46%|████▌     | 320/695 [07:18<12:10,  1.95s/it][A
Iteration:  46%|████▌     | 321/695 [07:20<10:49,  1.74s/it][A
Iteration:  46%|████▋     | 322/695 [07:21<09:54,  1.59s/it][A
Iteration:  46%|████▋     | 323/695 [07:22<09:14,  1.49s/it][A
Iteration:  47%|████▋     | 324/695 [07:23<08:50,  1.43s/it][A
Iteration:  47%|████▋     | 325/695 [07:25<08:29,  1.38s/it][A
Iteration:  47%|████▋     | 326/695 [07:26<08:14,  1.34s/it][A
Iteration:  47%|████▋     | 327/695 [07:27<08:04,  1.32s/it][A
Iteration:  47%|████▋     | 328/695 [07:28<08:00,  1.31s/it][A
Iteration:  47%|████▋     | 329/695 [07:30<07:52,  1.29s/it][A
Iteration:  47%|████▋     | 330/695 [07:31<07:48,  1.28s/it][A
Iteration:  48%|████▊     | 331/695 [07

	Global step: 625 | LR: [3.7537537537537537e-06, 3.7537537537537537e-06] | Avg. loss: 0.263



Iteration:  60%|█████▉    | 416/695 [09:20<05:55,  1.28s/it][A
Iteration:  60%|██████    | 417/695 [09:21<05:52,  1.27s/it][A
Iteration:  60%|██████    | 418/695 [09:22<05:50,  1.26s/it][A
Iteration:  60%|██████    | 419/695 [09:24<05:48,  1.26s/it][A
Iteration:  60%|██████    | 420/695 [09:25<05:49,  1.27s/it][A
Iteration:  61%|██████    | 421/695 [09:26<05:46,  1.27s/it][A
Iteration:  61%|██████    | 422/695 [09:27<05:44,  1.26s/it][A
Iteration:  61%|██████    | 423/695 [09:29<05:43,  1.26s/it][A
Iteration:  61%|██████    | 424/695 [09:30<05:44,  1.27s/it][A
Iteration:  61%|██████    | 425/695 [09:31<05:43,  1.27s/it][A
Iteration:  61%|██████▏   | 426/695 [09:32<05:41,  1.27s/it][A
Iteration:  61%|██████▏   | 427/695 [09:34<05:38,  1.26s/it][A
Iteration:  62%|██████▏   | 428/695 [09:35<05:39,  1.27s/it][A
Iteration:  62%|██████▏   | 429/695 [09:36<05:36,  1.27s/it][A
Iteration:  62%|██████▏   | 430/695 [09:38<05:34,  1.26s/it][A
Iteration:  62%|██████▏   | 431/695 [09

	Global step: 650 | LR: [3.5035035035035036e-06, 3.5035035035035036e-06] | Avg. loss: 0.266
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.23s/it][A[A

0.96875




Evaluating:   6%|▋         | 2/32 [00:01<00:23,  1.30it/s][A[A

0.9375




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.62it/s][A[A

0.9375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.83it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.97it/s][A[A

0.875




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.07it/s][A[A

0.84375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.13it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.16it/s][A[A

0.90625




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.20it/s][A[A

0.90625




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.22it/s][A[A

0.84375




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.23it/s][A[A

0.96875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.25it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.26it/s][A[A

0.90625




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.25it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.26it/s][A[A

0.875




Evaluating:  50%|█████     | 16/32 [00:07<00:07,  2.27it/s][A[A

0.90625




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.26it/s][A[A

0.90625




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.26it/s][A[A

0.875




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.25it/s][A[A

0.875




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.26it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:10<00:04,  2.27it/s][A[A

0.75




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.27it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.27it/s][A[A

0.84375




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.27it/s][A[A

0.8125




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.27it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.26it/s][A[A

0.78125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.27it/s][A[A

0.875




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.28it/s][A[A

0.84375




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.27it/s][A[A

0.8125




Evaluating:  94%|█████████▍| 30/32 [00:14<00:00,  2.26it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.27it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.19it/s]

0.75
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.871




Iteration:  74%|███████▍  | 515/695 [11:46<22:22,  7.46s/it][A

	Evaluation accuracy: 0.871 | Stale count: 0



Iteration:  74%|███████▍  | 516/695 [11:46<16:03,  5.38s/it][A
Iteration:  74%|███████▍  | 517/695 [11:48<12:16,  4.14s/it][A
Iteration:  75%|███████▍  | 518/695 [11:49<09:38,  3.27s/it][A
Iteration:  75%|███████▍  | 519/695 [11:50<07:48,  2.66s/it][A
Iteration:  75%|███████▍  | 520/695 [11:51<06:33,  2.25s/it][A
Iteration:  75%|███████▍  | 521/695 [11:53<05:38,  1.94s/it][A
Iteration:  75%|███████▌  | 522/695 [11:54<05:00,  1.74s/it][A
Iteration:  75%|███████▌  | 523/695 [11:55<04:34,  1.60s/it][A
Iteration:  75%|███████▌  | 524/695 [11:57<04:17,  1.51s/it][A
Iteration:  76%|███████▌  | 525/695 [11:58<04:03,  1.43s/it][A
Iteration:  76%|███████▌  | 526/695 [11:59<03:52,  1.38s/it][A
Iteration:  76%|███████▌  | 527/695 [12:00<03:46,  1.35s/it][A
Iteration:  76%|███████▌  | 528/695 [12:02<03:42,  1.33s/it][A
Iteration:  76%|███████▌  | 529/695 [12:03<03:38,  1.31s/it][A
Iteration:  76%|███████▋  | 530/695 [12:04<03:34,  1.30s/it][A
Iteration:  76%|███████▋  | 531/695 [12

	Global step: 675 | LR: [3.2532532532532535e-06, 3.2532532532532535e-06] | Avg. loss: 0.252



Iteration:  89%|████████▊ | 616/695 [13:53<01:40,  1.28s/it][A
Iteration:  89%|████████▉ | 617/695 [13:55<01:39,  1.27s/it][A
Iteration:  89%|████████▉ | 618/695 [13:56<01:37,  1.27s/it][A
Iteration:  89%|████████▉ | 619/695 [13:57<01:36,  1.27s/it][A
Iteration:  89%|████████▉ | 620/695 [13:58<01:35,  1.28s/it][A
Iteration:  89%|████████▉ | 621/695 [14:00<01:34,  1.27s/it][A
Iteration:  89%|████████▉ | 622/695 [14:01<01:32,  1.27s/it][A
Iteration:  90%|████████▉ | 623/695 [14:02<01:31,  1.27s/it][A
Iteration:  90%|████████▉ | 624/695 [14:03<01:30,  1.27s/it][A
Iteration:  90%|████████▉ | 625/695 [14:05<01:29,  1.27s/it][A
Iteration:  90%|█████████ | 626/695 [14:06<01:27,  1.27s/it][A
Iteration:  90%|█████████ | 627/695 [14:07<01:26,  1.27s/it][A
Iteration:  90%|█████████ | 628/695 [14:09<01:25,  1.28s/it][A
Iteration:  91%|█████████ | 629/695 [14:10<01:24,  1.27s/it][A
Iteration:  91%|█████████ | 630/695 [14:11<01:22,  1.27s/it][A
Iteration:  91%|█████████ | 631/695 [14

	Global step: 700 | LR: [3.0030030030030034e-06, 3.0030030030030034e-06] | Avg. loss: 0.249
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:37,  1.22s/it][A[A

0.96875




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.31it/s][A[A

0.96875




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.63it/s][A[A

0.9375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.84it/s][A[A

0.8125




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.98it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.08it/s][A[A

0.84375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.14it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:11,  2.18it/s][A[A

0.9375




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.22it/s][A[A

0.875




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.23it/s][A[A

0.84375




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.25it/s][A[A

0.96875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.27it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.28it/s][A[A

0.90625




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.27it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.28it/s][A[A

0.84375




Evaluating:  50%|█████     | 16/32 [00:07<00:06,  2.29it/s][A[A

0.875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.28it/s][A[A

0.875




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.28it/s][A[A

0.84375




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.28it/s][A[A

0.90625




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.28it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:09<00:04,  2.29it/s][A[A

0.75




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.28it/s][A[A

0.8125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.29it/s][A[A

0.84375




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.28it/s][A[A

0.84375




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.28it/s][A[A

0.8125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.28it/s][A[A

0.8125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.28it/s][A[A

0.875




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.29it/s][A[A

0.84375




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.28it/s][A[A

0.875




Evaluating:  94%|█████████▍| 30/32 [00:13<00:00,  2.28it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.29it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.21it/s]

0.78125
0.875
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.872




Iteration:   3%|▎         | 20/695 [00:50<1:38:08,  8.72s/it][A

	Evaluation accuracy: 0.872 | Stale count: 0



Iteration:   3%|▎         | 21/695 [00:50<1:10:24,  6.27s/it][A
Iteration:   3%|▎         | 22/695 [00:51<53:18,  4.75s/it]  [A
Iteration:   3%|▎         | 23/695 [00:52<41:22,  3.69s/it][A
Iteration:   3%|▎         | 24/695 [00:54<33:00,  2.95s/it][A
Iteration:   4%|▎         | 25/695 [00:55<27:16,  2.44s/it][A
Iteration:   4%|▎         | 26/695 [00:56<23:06,  2.07s/it][A
Iteration:   4%|▍         | 27/695 [00:57<20:15,  1.82s/it][A
Iteration:   4%|▍         | 28/695 [00:59<18:16,  1.64s/it][A
Iteration:   4%|▍         | 29/695 [01:00<16:59,  1.53s/it][A
Iteration:   4%|▍         | 30/695 [01:01<15:56,  1.44s/it][A
Iteration:   4%|▍         | 31/695 [01:02<15:14,  1.38s/it][A
Iteration:   5%|▍         | 32/695 [01:04<14:46,  1.34s/it][A
Iteration:   5%|▍         | 33/695 [01:05<14:30,  1.31s/it][A
Iteration:   5%|▍         | 34/695 [01:06<14:12,  1.29s/it][A
Iteration:   5%|▌         | 35/695 [01:07<13:59,  1.27s/it][A
Iteration:   5%|▌         | 36/695 [01:09<13:52,  

	Global step: 725 | LR: [2.752752752752753e-06, 2.752752752752753e-06] | Avg. loss: 0.236



Iteration:  17%|█▋        | 121/695 [02:55<12:00,  1.25s/it][A
Iteration:  18%|█▊        | 122/695 [02:56<11:56,  1.25s/it][A
Iteration:  18%|█▊        | 123/695 [02:57<11:55,  1.25s/it][A
Iteration:  18%|█▊        | 124/695 [02:59<11:56,  1.25s/it][A
Iteration:  18%|█▊        | 125/695 [03:00<12:00,  1.26s/it][A
Iteration:  18%|█▊        | 126/695 [03:01<11:54,  1.26s/it][A
Iteration:  18%|█▊        | 127/695 [03:02<11:51,  1.25s/it][A
Iteration:  18%|█▊        | 128/695 [03:04<11:49,  1.25s/it][A
Iteration:  19%|█▊        | 129/695 [03:05<11:56,  1.27s/it][A
Iteration:  19%|█▊        | 130/695 [03:06<11:51,  1.26s/it][A
Iteration:  19%|█▉        | 131/695 [03:07<11:47,  1.25s/it][A
Iteration:  19%|█▉        | 132/695 [03:09<11:46,  1.25s/it][A
Iteration:  19%|█▉        | 133/695 [03:10<11:53,  1.27s/it][A
Iteration:  19%|█▉        | 134/695 [03:11<11:48,  1.26s/it][A
Iteration:  19%|█▉        | 135/695 [03:12<11:45,  1.26s/it][A
Iteration:  20%|█▉        | 136/695 [03

	Global step: 750 | LR: [2.502502502502503e-06, 2.502502502502503e-06] | Avg. loss: 0.231
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:38,  1.23s/it][A[A

0.96875




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.31it/s][A[A

0.9375




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.63it/s][A[A

0.9375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.84it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.98it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.08it/s][A[A

0.8125




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.14it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:10,  2.19it/s][A[A

0.875




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.22it/s][A[A

0.875




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.24it/s][A[A

0.8125




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.26it/s][A[A

0.96875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.27it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.28it/s][A[A

0.90625




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.28it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.28it/s][A[A

0.84375




Evaluating:  50%|█████     | 16/32 [00:07<00:06,  2.29it/s][A[A

0.875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.28it/s][A[A

0.9375




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.28it/s][A[A

0.84375




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.28it/s][A[A

0.90625




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.28it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:09<00:04,  2.28it/s][A[A

0.78125




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.29it/s][A[A

0.8125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.28it/s][A[A

0.8125




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.29it/s][A[A

0.84375




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.28it/s][A[A

0.8125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.28it/s][A[A

0.84375




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.29it/s][A[A

0.90625




Evaluating:  88%|████████▊ | 28/32 [00:13<00:01,  2.29it/s][A[A

0.875




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.29it/s][A[A

0.875




Evaluating:  94%|█████████▍| 30/32 [00:13<00:00,  2.29it/s][A[A

0.84375




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.29it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.21it/s]

0.75
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.877




Iteration:  32%|███▏      | 220/695 [05:20<59:15,  7.49s/it][A

	Evaluation accuracy: 0.877 | Stale count: 0



Iteration:  32%|███▏      | 221/695 [05:20<42:41,  5.40s/it][A
Iteration:  32%|███▏      | 222/695 [05:21<32:41,  4.15s/it][A
Iteration:  32%|███▏      | 223/695 [05:23<25:45,  3.27s/it][A
Iteration:  32%|███▏      | 224/695 [05:24<20:54,  2.66s/it][A
Iteration:  32%|███▏      | 225/695 [05:25<17:34,  2.24s/it][A
Iteration:  33%|███▎      | 226/695 [05:26<15:08,  1.94s/it][A
Iteration:  33%|███▎      | 227/695 [05:28<13:28,  1.73s/it][A
Iteration:  33%|███▎      | 228/695 [05:29<12:18,  1.58s/it][A
Iteration:  33%|███▎      | 229/695 [05:30<11:33,  1.49s/it][A
Iteration:  33%|███▎      | 230/695 [05:31<10:57,  1.41s/it][A
Iteration:  33%|███▎      | 231/695 [05:33<10:28,  1.36s/it][A
Iteration:  33%|███▎      | 232/695 [05:34<10:10,  1.32s/it][A
Iteration:  34%|███▎      | 233/695 [05:35<10:02,  1.30s/it][A
Iteration:  34%|███▎      | 234/695 [05:36<09:50,  1.28s/it][A
Iteration:  34%|███▍      | 235/695 [05:38<09:43,  1.27s/it][A
Iteration:  34%|███▍      | 236/695 [05

	Global step: 775 | LR: [2.2522522522522524e-06, 2.2522522522522524e-06] | Avg. loss: 0.223



Iteration:  46%|████▌     | 321/695 [07:25<07:50,  1.26s/it][A
Iteration:  46%|████▋     | 322/695 [07:26<07:46,  1.25s/it][A
Iteration:  46%|████▋     | 323/695 [07:28<07:45,  1.25s/it][A
Iteration:  47%|████▋     | 324/695 [07:29<07:43,  1.25s/it][A
Iteration:  47%|████▋     | 325/695 [07:30<07:45,  1.26s/it][A
Iteration:  47%|████▋     | 326/695 [07:31<07:42,  1.25s/it][A
Iteration:  47%|████▋     | 327/695 [07:33<07:39,  1.25s/it][A
Iteration:  47%|████▋     | 328/695 [07:34<07:39,  1.25s/it][A
Iteration:  47%|████▋     | 329/695 [07:35<07:40,  1.26s/it][A
Iteration:  47%|████▋     | 330/695 [07:36<07:37,  1.25s/it][A
Iteration:  48%|████▊     | 331/695 [07:38<07:34,  1.25s/it][A
Iteration:  48%|████▊     | 332/695 [07:39<07:32,  1.25s/it][A
Iteration:  48%|████▊     | 333/695 [07:40<07:34,  1.26s/it][A
Iteration:  48%|████▊     | 334/695 [07:41<07:31,  1.25s/it][A
Iteration:  48%|████▊     | 335/695 [07:43<07:28,  1.25s/it][A
Iteration:  48%|████▊     | 336/695 [07

	Global step: 800 | LR: [2.0020020020020023e-06, 2.0020020020020023e-06] | Avg. loss: 0.214
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:37,  1.21s/it][A[A

0.96875




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.32it/s][A[A

0.9375




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.64it/s][A[A

0.96875




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.84it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.99it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.09it/s][A[A

0.84375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.15it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:10,  2.19it/s][A[A

0.9375




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.22it/s][A[A

0.875




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.24it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.26it/s][A[A

0.96875




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.28it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.28it/s][A[A

0.90625




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.29it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.30it/s][A[A

0.875




Evaluating:  50%|█████     | 16/32 [00:07<00:06,  2.30it/s][A[A

0.875




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.30it/s][A[A

0.875




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.30it/s][A[A

0.84375




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.30it/s][A[A

0.90625




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.30it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:09<00:04,  2.30it/s][A[A

0.78125




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.30it/s][A[A

0.78125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.30it/s][A[A

0.8125




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.30it/s][A[A

0.84375




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.30it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.30it/s][A[A

0.8125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.30it/s][A[A

0.90625




Evaluating:  88%|████████▊ | 28/32 [00:12<00:01,  2.30it/s][A[A

0.875




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.29it/s][A[A

0.875




Evaluating:  94%|█████████▍| 30/32 [00:13<00:00,  2.30it/s][A[A

0.84375




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.30it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.22it/s]

0.75
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.879




Iteration:  60%|██████    | 420/695 [09:54<39:54,  8.71s/it][A

	Evaluation accuracy: 0.879 | Stale count: 0



Iteration:  61%|██████    | 421/695 [09:54<28:33,  6.25s/it][A
Iteration:  61%|██████    | 422/695 [09:56<21:33,  4.74s/it][A
Iteration:  61%|██████    | 423/695 [09:57<16:42,  3.68s/it][A
Iteration:  61%|██████    | 424/695 [09:58<13:17,  2.94s/it][A
Iteration:  61%|██████    | 425/695 [09:59<10:57,  2.44s/it][A
Iteration:  61%|██████▏   | 426/695 [10:01<09:17,  2.07s/it][A
Iteration:  61%|██████▏   | 427/695 [10:02<08:06,  1.82s/it][A
Iteration:  62%|██████▏   | 428/695 [10:03<07:18,  1.64s/it][A
Iteration:  62%|██████▏   | 429/695 [10:04<06:46,  1.53s/it][A
Iteration:  62%|██████▏   | 430/695 [10:06<06:19,  1.43s/it][A
Iteration:  62%|██████▏   | 431/695 [10:07<06:01,  1.37s/it][A
Iteration:  62%|██████▏   | 432/695 [10:08<05:48,  1.33s/it][A
Iteration:  62%|██████▏   | 433/695 [10:09<05:42,  1.31s/it][A
Iteration:  62%|██████▏   | 434/695 [10:10<05:33,  1.28s/it][A
Iteration:  63%|██████▎   | 435/695 [10:12<05:28,  1.26s/it][A
Iteration:  63%|██████▎   | 436/695 [10

	Global step: 825 | LR: [1.7517517517517518e-06, 1.7517517517517518e-06] | Avg. loss: 0.233



Iteration:  75%|███████▍  | 521/695 [11:59<03:41,  1.27s/it][A
Iteration:  75%|███████▌  | 522/695 [12:00<03:41,  1.28s/it][A
Iteration:  75%|███████▌  | 523/695 [12:01<03:32,  1.24s/it][A
Iteration:  75%|███████▌  | 524/695 [12:03<03:33,  1.25s/it][A
Iteration:  76%|███████▌  | 525/695 [12:04<03:33,  1.25s/it][A
Iteration:  76%|███████▌  | 526/695 [12:05<03:30,  1.25s/it][A
Iteration:  76%|███████▌  | 527/695 [12:06<03:30,  1.26s/it][A
Iteration:  76%|███████▌  | 528/695 [12:08<03:26,  1.24s/it][A
Iteration:  76%|███████▌  | 529/695 [12:09<03:27,  1.25s/it][A
Iteration:  76%|███████▋  | 530/695 [12:10<03:25,  1.24s/it][A
Iteration:  76%|███████▋  | 531/695 [12:11<03:23,  1.24s/it][A
Iteration:  77%|███████▋  | 532/695 [12:13<03:22,  1.24s/it][A
Iteration:  77%|███████▋  | 533/695 [12:14<03:23,  1.25s/it][A
Iteration:  77%|███████▋  | 534/695 [12:15<03:20,  1.25s/it][A
Iteration:  77%|███████▋  | 535/695 [12:16<03:18,  1.24s/it][A
Iteration:  77%|███████▋  | 536/695 [12

	Global step: 850 | LR: [1.5015015015015017e-06, 1.5015015015015017e-06] | Avg. loss: 0.220
	Evaluating on valiadtion set!




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s][A[A

Evaluating:   3%|▎         | 1/32 [00:01<00:37,  1.22s/it][A[A

1.0




Evaluating:   6%|▋         | 2/32 [00:01<00:22,  1.32it/s][A[A

0.96875




Evaluating:   9%|▉         | 3/32 [00:02<00:17,  1.63it/s][A[A

0.9375




Evaluating:  12%|█▎        | 4/32 [00:02<00:15,  1.84it/s][A[A

0.84375




Evaluating:  16%|█▌        | 5/32 [00:02<00:13,  1.99it/s][A[A

0.90625




Evaluating:  19%|█▉        | 6/32 [00:03<00:12,  2.09it/s][A[A

0.84375




Evaluating:  22%|██▏       | 7/32 [00:03<00:11,  2.15it/s][A[A

0.90625




Evaluating:  25%|██▌       | 8/32 [00:04<00:10,  2.19it/s][A[A

0.9375




Evaluating:  28%|██▊       | 9/32 [00:04<00:10,  2.23it/s][A[A

0.90625




Evaluating:  31%|███▏      | 10/32 [00:05<00:09,  2.25it/s][A[A

0.875




Evaluating:  34%|███▍      | 11/32 [00:05<00:09,  2.26it/s][A[A

1.0




Evaluating:  38%|███▊      | 12/32 [00:06<00:08,  2.28it/s][A[A

0.9375




Evaluating:  41%|████      | 13/32 [00:06<00:08,  2.29it/s][A[A

0.90625




Evaluating:  44%|████▍     | 14/32 [00:06<00:07,  2.29it/s][A[A

0.9375




Evaluating:  47%|████▋     | 15/32 [00:07<00:07,  2.30it/s][A[A

0.875




Evaluating:  50%|█████     | 16/32 [00:07<00:06,  2.30it/s][A[A

0.90625




Evaluating:  53%|█████▎    | 17/32 [00:08<00:06,  2.30it/s][A[A

0.8125




Evaluating:  56%|█████▋    | 18/32 [00:08<00:06,  2.30it/s][A[A

0.84375




Evaluating:  59%|█████▉    | 19/32 [00:09<00:05,  2.29it/s][A[A

0.90625




Evaluating:  62%|██████▎   | 20/32 [00:09<00:05,  2.29it/s][A[A

0.875




Evaluating:  66%|██████▌   | 21/32 [00:09<00:04,  2.30it/s][A[A

0.78125




Evaluating:  69%|██████▉   | 22/32 [00:10<00:04,  2.30it/s][A[A

0.8125




Evaluating:  72%|███████▏  | 23/32 [00:10<00:03,  2.29it/s][A[A

0.8125




Evaluating:  75%|███████▌  | 24/32 [00:11<00:03,  2.30it/s][A[A

0.84375




Evaluating:  78%|███████▊  | 25/32 [00:11<00:03,  2.30it/s][A[A

0.78125




Evaluating:  81%|████████▏ | 26/32 [00:12<00:02,  2.29it/s][A[A

0.8125




Evaluating:  84%|████████▍ | 27/32 [00:12<00:02,  2.30it/s][A[A

0.90625




Evaluating:  88%|████████▊ | 28/32 [00:12<00:01,  2.30it/s][A[A

0.875




Evaluating:  91%|█████████ | 29/32 [00:13<00:01,  2.30it/s][A[A

0.875




Evaluating:  94%|█████████▍| 30/32 [00:13<00:00,  2.30it/s][A[A

0.8125




Evaluating:  97%|█████████▋| 31/32 [00:14<00:00,  2.31it/s][A[A

Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.22it/s]

0.75
1.0
== Finished evaluating model: next_sentence_cls_0 ==
	Mean evaluation acc: 0.881




Iteration:  89%|████████▉ | 620/695 [14:23<09:14,  7.39s/it][A

	Evaluation accuracy: 0.881 | Stale count: 0



Iteration:  89%|████████▉ | 621/695 [14:24<06:35,  5.34s/it][A
Iteration:  89%|████████▉ | 622/695 [14:25<04:59,  4.11s/it][A
Iteration:  90%|████████▉ | 623/695 [14:26<03:53,  3.24s/it][A
Iteration:  90%|████████▉ | 624/695 [14:28<03:07,  2.64s/it][A
Iteration:  90%|████████▉ | 625/695 [14:29<02:35,  2.23s/it][A
Iteration:  90%|█████████ | 626/695 [14:30<02:13,  1.93s/it][A
Iteration:  90%|█████████ | 627/695 [14:31<01:56,  1.72s/it][A
Iteration:  90%|█████████ | 628/695 [14:33<01:45,  1.57s/it][A
Iteration:  91%|█████████ | 629/695 [14:34<01:37,  1.48s/it][A
Iteration:  91%|█████████ | 630/695 [14:35<01:31,  1.41s/it][A
Iteration:  91%|█████████ | 631/695 [14:36<01:26,  1.36s/it][A
Iteration:  91%|█████████ | 632/695 [14:38<01:23,  1.32s/it][A
Iteration:  91%|█████████ | 633/695 [14:39<01:21,  1.31s/it][A
Iteration:  91%|█████████ | 634/695 [14:40<01:18,  1.29s/it][A
Iteration:  91%|█████████▏| 635/695 [14:41<01:16,  1.27s/it][A
Iteration:  92%|█████████▏| 636/695 [14

(868, 0.0)

In [None]:
# Test model
# Called without arguments, so no context/targets are supplied by hand; the
# cell output reports that a test item is sampled from the test data instead.
trainer.test()


Sampling a test item from the test data!
Context: Person 1: It seems that nobody in your class likes Paul . Person 2: The boy is good for nothing . Person 1: As a teacher , I don ’ t know how to help him . Person 2: 
Targets: [" Well , you're right , the resume is limited in how much it can tell someone about a person . That's why job interviews are important to let people know the real you that they can't see from a piece of paper . But resumes can be helpful in explaining things and giving a good impression to a potential employer . ", " I love it . It's so festive ! And where did you get that cup ? It's got the same logo . ", " I'm sorry , dad . But I would rather stay at school than go to cram school . ", ' No , thank you . ', ' That looks great . Do you have ties here ? ', " Yes , I have , Mr . Thomas . We'll pack them two dozen to one carton , and the gross weight is around 25 kilos a carton . "]
Expected output: -1
Encoding data ...


-1

In [None]:
# Run the model on a hand-written example instead of a sampled test item:
# `context` holds the dialogue turns so far (ending with the open "Person 1:"
# turn) and `targets` holds the candidate continuations passed to the model.
context = ["Person 1: Do you happen to have their email or phone number so we get in touch?", "Person 2: Email of who? Myself or the store owner ?", "Person 1:"]
targets = ["I dont want to answer.",
           "I dont know.",
           "Email.",
           "Sure, the email is example@companyname.com.",
           "No.",
           "Yes.",
           "Maybe.",
           "Something not understood.",
           "Phone number.",
           "I'm not entirely confident if I'm permitted to share contacts."]
# NOTE(review): `label` is assigned here but never passed to trainer.test(),
# and the cell output prints "Expected output: None". Presumably -1 is meant
# as the expected answer (possibly "none of the targets fits") -- confirm which
# keyword trainer.test() accepts for the expected label and pass it, or drop
# this variable.
label = -1

trainer.test(context=context, targets=targets)

Context: Person 1: Do you happen to have their email or phone number so we get in touch? Person 2: Email of who? Myself or the store owner ? Person 1:
Targets: ['I dont want to answer.', 'I dont know.', 'Email.', 'Sure, the email is example@companyname.com.', 'No.', 'Yes.', 'Maybe.', 'Something not understood.', 'Phone number.', "I'm not entirely confident if I'm permitted to share contacts."]
Expected output: None
Encoding data ...


'Phone number.'