In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m122.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.2 MB/s[0m eta [36m0:00:

In [98]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup
import pandas as pd
import numpy as np
import json
import math
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F

In [99]:
model_name = r"bert-base-uncased"
qasper_classification_path = r"/content/drive/MyDrive/Colab Notebooks/Qasper_classification.json"

device = r"cuda"
tokenizer = AutoTokenizer.from_pretrained(model_name)
attributes = [
                "unanswerable",
                "extractive_spans",
                "yes_no",
                "abstractive"
            ]
max_token_length = 512
bert_model = AutoModel.from_pretrained(model_name, return_dict = True)
batch_size = 8

n_labels = 4
lr = 1.5e-6
warmup = 0.2
weight_decay = 0.001
n_epochs = 1
MODEL_PATH = r"/content/drive/MyDrive/Colab Notebooks/model.bin"

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    reader_close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 271, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239

In [100]:
class Qasper_Dataset(Dataset):

    def __init__(self, data_path, data_type, tokenizer, attributes, max_token_length = 128, sample = None):
        self.data_path = data_path
        self.data_type = data_type
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_length = max_token_length
        self.data = self.__load_data__()

    def __load_data__(self):
        data_fd = open(self.data_path)
        self.data = json.load(data_fd)
        return (
            self.data['data'][self.data_type]
        )

    def __len__(self):
        return (
            len(self.data)
        )

    def __getitem__(self, index):
        context = ""
        for i in range(len(self.data[index][0])):
            if i != 0:
                context = context + " "
            context = context + self.data[index][0][i]
        # context = self.data[index][0]
        question = self.data[index][1]
        labels = [self.data[index][2][self.attributes[0]], self.data[index][2][self.attributes[1]], self.data[index][2][self.attributes[2]], self.data[index][2][self.attributes[3]]]
        labels = torch.FloatTensor(labels)
        # labels = Variable(labels, requires_grad = True)
        tokens = self.tokenizer.encode_plus(
                                            question,
                                            context,
                                            add_special_tokens=True,
                                            return_tensors='pt',
                                            truncation=True,
                                            padding='max_length',
                                            max_length=self.max_token_length,
                                            return_attention_mask = True
                                            )
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': labels}

In [101]:
train_data = Qasper_Dataset(
                data_path = qasper_classification_path,
                data_type = "train_data",
                tokenizer = tokenizer,
                attributes = attributes,
                max_token_length = 512,
                sample = None
                )

val_data = Qasper_Dataset(
                    data_path = qasper_classification_path,
                    data_type = "validation_data",
                    tokenizer = tokenizer,
                    attributes = attributes,
                    max_token_length = 512,
                    sample = None
                    )

train_dataloader = DataLoader(
                  train_data,
                  batch_size = batch_size,
                  num_workers = 2,
                  shuffle = True
              )

val_dataloader = DataLoader(
                  val_data,
                  batch_size = batch_size,
                  num_workers = 2,
                  shuffle = True
              )

In [102]:
class Qasper_Classifier(nn.Module):

    def __init__(self, model, n_labels):

        super(Qasper_Classifier, self).__init__()
        self.pretrained_model = model # bert model
        self.n_labels = n_labels

        self.hidden = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        print(f"hidden - {self.hidden.weight.requires_grad}")
        self.classifier = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.n_labels)
        print(f"classifier - {self.classifier.weight.requires_grad}")
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        # print(f"loss - {self.loss_func.weight.requires_grad}")
        self.dropout = nn.Dropout(0.3)
        # print(f"dropout - {self.dropout.weight.requires_grad}")


    def forward(self, input_ids, attention_mask, labels = None):
        # print("Forward Propagation started")
        # input_ids.requires_grad = True; attention_mask,requires_grad = True
        print(f"input - {input_ids.requires_grad}, attention_mask - {attention_mask.requires_grad}")
        output = self.pretrained_model(
                                    input_ids = input_ids,
                                    attention_mask = attention_mask
                                       )
        pooled_output = torch.mean(output.last_hidden_state, 1)
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # loss = 0
        # loss.requires_grad = True
        # logits.requires_grad = True
        # labels.required_grad = True
        # loss = Variable(loss, requires_grad = True)
        return logits

In [103]:
def train_fn(data_loader, model, optimizer, device, scheduler):

  model.train()

  for idx, data in tqdm(enumerate(data_loader), total = len(data_loader)):
    # print(data.keys())
    print(f"Training loop {idx}")
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    labels = data['labels']

    input_ids = input_ids.to(device, dtype=torch.long)
    attention_mask = attention_mask.to(device, dtype=torch.long)
    labels = labels.to(device, dtype=torch.float)

    optimizer.zero_grad()
    print(input_ids)
    print(type(input_ids))
    print(attention_mask)
    print(type(attention_mask))
    logits = model(
                  input_ids = input_ids,
                  attention_mask = attention_mask,
              )

    loss = torch.zeros(1, requires_grad=True)
    # if labels is not None:
    print(f"type - {type(logits)} type - {type(labels)}")
    loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
    loss = loss_fn(logits.view(-1, len(attributes)), labels.view(-1, len(attributes)))
    print(f"loss - {loss}, logits - {logits}")
    print(f"loss__ - {loss.requires_grad}, logits__ - {logits.requires_grad}")

    loss.backward()
    optimizer.step()
    scheduler.step()

In [104]:
def eval_fn(data_loader, model):

  model.eval()

  tot_logits = []
  tot_labels = []

  with torch.no_grad():
    for idx, data in tqdm(enumerate(data_loader), total = len(data_loader)):
      print(f"Validation Loop {idx}")
      input_ids = data['input_ids']
      attention_mask = data['attention_mask']
      labels = data['labels']

      input_ids = input_ids.to(device, dtype=torch.long)
      attention_mask = attention_mask.to(device, dtype=torch.long)
      labels = labels.to(device, dtype=torch.float)

      logits = model(
                    input_ids = input_ids,
                    attention_mask = attention_mask,
                )
      logits = logits.cpu().detach().numpy().tolist()
      labels = labels.cpu().detach().numpy().tolist()

      tot_logits.extend(logits)
      tot_labels.extend(labels)

    return (
      tot_logits,
      tot_labels
   )

In [105]:
model = Qasper_Classifier(bert_model, len(attributes))
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
  {
      "params": [
          p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
      ],
      "weight_decay": 0.001,
  },
  {
      "params": [
          p for n, p in param_optimizer if any(nd in n for nd in no_decay)
      ],
      "weight_decay": 0.0,
  },
]

num_train_steps = int(len(train_data) / batch_size * n_epochs)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

hidden - True
classifier - True




In [106]:
def clipping_fn(logits, max_val, min_val):

  for i in range(len(logits)):
    for j in range(len(logits[i])):

        if logits[i][j] >= (max_val + min_val)/2:
          logits[i][j] = max_val
        else:
          logits[i][j] = min_val

  return (
      logits
  )

def metric_accuracy(logits, labels):
  confusion = [
      {
         'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0,
      },
      {
         'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0,
      },
      {
         'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0,
      },
      {
         'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0,
      },
  ]

  for i in range(len(logits)):
    for j in range(len(logits[i])):

      if logits[i][j] == 1 and labels[i][j] == 1:
          confusion[j]['tp'] += 1

      if logits[i][j] == 0 and labels[i][j] == 0:
          confusion[j]['tn'] += 1

      if logits[i][j] == 1 and labels[i][j] == 0:
          confusion[j]['fn'] += 1

      if logits[i][j] == 0 and labels[i][j] == 1:
          confusion[j]['fp'] += 1

  print(confusion)

  accuracy = list()

  for i in range(len(confusion)):
    accuracy.append((confusion[i]['tp'] + confusion[i]['tn'])/(confusion[i]['tp'] + confusion[i]['tn'] + confusion[i]['fp'] + confusion[i]['fn']))

  return (
      accuracy
  )


In [107]:
def __run__():
    best_accuracy = 0
    model.to(device)
    print(model)

    for epoch in range(n_epochs):
        train_fn(train_dataloader, model, optimizer, device, scheduler)
        logits, labels = eval_fn(val_dataloader, model)
        logits = clipping_fn(logits, 1.0, 0.0)
        accuracy = metric_accuracy(logits, labels)
        overall_accuracy = (accuracy[0] + accuracy[1] + accuracy[2] + accuracy[3])/len(accuracy)

        print(f"Detailed accuracy after {epoch} epoch:")
        print(f"unanswerable accuarcy: {accuracy[0]}")
        print(f"extractive accuarcy: {accuracy[1]}")
        print(f"yes_no accuarcy: {accuracy[2]}")
        print(f"abstractive accuarcy: {accuracy[3]}")
        print(f"Overall accuarcy: {overall_accuracy}")
        print(f"Best accuarcy: {best_accuracy}")

        if overall_accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = overall_accuracy
            print(best_accuracy)
            print("Model Updated")

In [108]:
__run__()

Qasper_Classifier(
  (pretrained_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2064,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 15756,  ...,     0,     0,     0],
        [  101,  2029,  2944,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6920968890190125, logits - tensor([[ 0.1180, -0.3500, -0.4331,  0.1804],
        [-0.0655, -0.2152, -0.0058,  0.1718],
        [-0.2948, -0.1481, -0.1785, -0.0338],
        [ 0.0591, -0.2294, -0.0189, -0.0320],
      

  0%|          | 1/289 [00:00<04:05,  1.18it/s]

Training loop 1
tensor([[ 101, 2054, 4895,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ..., 1035, 1045,  102],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6461156606674194, logits - tensor([[-0.3202, -0.2056, -0.3761, -0.1771],
        [-0.4019,  0.0182, -0.2976, -0.1176],
        [ 0.0619, -0.0971, -0.5334, -0.2860],
        [-0.3154,  0.0052, -0.3687,  0.1308],
        [-0.1562, -0.0896, -0.5025, -0.296

  1%|          | 2/289 [00:01<03:47,  1.26it/s]

Training loop 2
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 1999, 2029,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 2045,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2058, 2029,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  1%|          | 3/289 [00:02<03:41,  1.29it/s]

loss - 0.5809512138366699, logits - tensor([[-0.2264,  0.3189, -0.3762,  0.0681],
        [-0.0288, -0.0655, -0.3265, -0.4341],
        [-0.1343,  0.3904, -0.6490, -0.2909],
        [-0.1817, -0.0768, -0.2722, -0.1447],
        [-0.2206,  0.2070, -0.5501, -0.1082],
        [-0.3288,  0.0993,  0.0320, -0.6249],
        [-0.4405,  0.4121, -0.5332, -0.3437],
        [-0.3684,  0.1069, -0.7283, -0.3278]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 3
tensor([[  101,  2003, 13675,  ...,  2714,  4708,   102],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0

  1%|▏         | 4/289 [00:03<03:39,  1.30it/s]

Training loop 4
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2003,  2045,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  6254,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054, 12783,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.49041447043418884, logits - tensor([[-0.9776,  0.1814, -0.5667, -0.6912],
        [-0.4892, -0.0968, -0.2912, -0.7046],
        [-0.8261,  0.3883, -0.8459, -0.5499],
        [-0.7716, -0.1870, -0.9442, -1.0169],
     

  2%|▏         | 5/289 [00:03<03:38,  1.30it/s]

Training loop 5
tensor([[ 101, 2054, 2785,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5328041911125183, logits - tensor([[-1.2396e+00,  4.0273e-02, -9.9085e-01, -5.8711e-01],
        [-1.0005e+00,  5.1171e-03, -1.0168e+00, -8.2322e-01],
        [-4.1178e-01, -3.8900e-01, -9.0118e-01, -7.1879e-01],
        [-1.1872e+00,  2.9388e-01, -7.11

  2%|▏         | 6/289 [00:04<03:37,  1.30it/s]

Training loop 6
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5238460302352905, logits - tensor([[-1.2514,  0.4463, -0.8593, -0.7394],
        [-1.1071,  0.4024, -0.9518, -0.2748],
        [-1.3860,  0.1395, -1.4812, -1.1582],
        [-1.4472,  0.1385, -0.9304, -0.6437],
        [-1.2240,  0.7764, -1.0134, -0.793

  2%|▏         | 7/289 [00:05<03:38,  1.29it/s]

Training loop 7
tensor([[  101,  2054, 10873,  ...,     0,     0,     0],
        [  101,  2054,  4275,  ...,     0,     0,     0],
        [  101,  2029,  4275,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 12978,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,  1996,  6878,   102],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43743830919265747, logits - tensor([[-1.6524,  0.4207, -1.1303, -0.7061],
        [-1.3019,  0.2522, -1.0646, -0.7430],
        [-1.3545,  0.3437, -1.1226, -0.8970],
        [-1.9594,  0.0488, -1.5459, -1.0220],
     

  3%|▎         | 8/289 [00:06<03:37,  1.29it/s]

Training loop 8
tensor([[  101,  2054,  4800,  ...,     0,     0,     0],
        [  101,  2054, 19962,  ...,     0,     0,     0],
        [  101,  2054,  7241,  ...,     0,     0,     0],
        ...,
        [  101,  2024,  2951,  ...,     0,     0,     0],
        [  101,  2054,  7861,  ...,     0,     0,     0],
        [  101,  2054,  2110,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47149884700775146, logits - tensor([[-2.1291,  0.5518, -1.5467, -0.8577],
        [-1.6768,  0.7352, -1.1082, -1.1111],
        [-1.4283,  0.5309, -1.2390, -1.3017],
        [-1.5515,  0.4769, -1.2891, -0.7052],
     

  3%|▎         | 9/289 [00:06<03:37,  1.29it/s]

Training loop 9
tensor([[  101,  2003,  2023,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  3716,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,  1996,  6903,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4427841901779175, logits - tensor([[-2.4979,  0.4825, -1.7471, -1.3434],
        [-1.6914,  0.4284, -1.6471, -1.6462],
        [-2.6133,  0.0848, -1.5791, -1.0965],
        [-2.1961,  0.5772, -1.9203, -1.2853],
      

  3%|▎         | 10/289 [00:07<03:38,  1.27it/s]

Training loop 10
tensor([[  101,  2054,  2944,  ...,     0,     0,     0],
        [  101,  2054, 12637,  ...,     0,     0,     0],
        [  101,  2054,  2060,  ...,     0,     0,     0],
        ...,
        [  101,  2043,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4495290517807007, logits - tensor([[-2.2147,  0.5901, -1.8414, -1.2268],
        [-2.2454,  0.6098, -1.7974, -1.4070],
        [-2.7858,  0.5515, -1.8432, -1.5046],
        [-2.5531,  0.7018, -2.0096, -1.5745],
     

  4%|▍         | 11/289 [00:08<03:40,  1.26it/s]

Training loop 11
tensor([[ 101, 2054, 3350,  ..., 1997, 3375,  102],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5719358921051025, logits - tensor([[-2.3326e+00,  1.1587e+00, -1.9420e+00, -1.1427e+00],
        [-2.7102e+00,  8.0093e-01, -2.2695e+00, -1.5112e+00],
        [-2.8464e+00,  2.1613e-01, -2.0158e+00, -1.2973e+00],
        [-2.6425e+00,  1.2616e+00, -2.1

  4%|▍         | 12/289 [00:09<03:39,  1.26it/s]

Training loop 12
tensor([[ 101, 2054, 3115,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2146,  ...,    0,    0,    0],
        [ 101, 2054, 1005,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37132567167282104, logits - tensor([[-3.4579e+00,  4.7959e-01, -2.6316e+00, -1.2940e+00],
        [-3.1650e+00,  7.5336e-01, -2.2364e+00, -8.1842e-01],
        [-2.8379e+00,  7.5461e-01, -2.7201e+00, -1.4361e+00],
        [-3.6909e+00, -1.8453e-03, -2.

  4%|▍         | 13/289 [00:10<03:40,  1.25it/s]

Training loop 13
tensor([[ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 1020,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 4556,  ...,    0,    0,    0],
        [ 101, 2003, 2944,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5268291234970093, logits - tensor([[-3.8880,  0.1926, -2.7480, -0.9926],
        [-2.7976,  0.0905, -2.1701, -0.6677],
        [-3.1034,  0.5669, -2.7155, -1.1517],
        [-2.1834, -0.1384, -1.2067, -0.4455],
        [-3.2960,  0.6456, -2.7487, -0.99

  5%|▍         | 14/289 [00:11<03:41,  1.24it/s]

Training loop 14
tensor([[ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2079, 6048,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ..., 8043, 6561,  102],
        [ 101, 2129, 3020,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46780070662498474, logits - tensor([[-3.0307,  0.5826, -2.3017, -0.8239],
        [-3.5374,  0.2496, -1.9102, -0.7275],
        [-3.1806,  0.2867, -2.2328, -1.3371],
        [-3.1360,  0.6453, -2.1781, -0.9345],
        [-2.8553,  0.3835, -2.0651, -1.4

  5%|▌         | 15/289 [00:11<03:40,  1.24it/s]

Training loop 15
tensor([[  101,  2079, 14391,  ...,     0,     0,     0],
        [  101,  2001,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  6388,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2129,  3020,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46494409441947937, logits - tensor([[-2.5643,  0.4779, -2.2086, -0.4502],
        [-3.4263,  0.0985, -2.4075, -1.6116],
        [-3.1153, -0.2807, -1.6289, -0.6184],
        [-2.9587,  0.5049, -2.2533, -0.7054],
    

  6%|▌         | 16/289 [00:12<03:40,  1.24it/s]

Training loop 16
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2515,  1037,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2054,  3793,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5117313265800476, logits - tensor([[-2.6115,  0.3537, -1.6154, -0.4686],
        [-3.1888,  0.1103, -2.0664, -1.1904],
        [-3.5532, -0.2295, -1.9988, -1.5088],
        [-3.3818,  0.7025, -2.6134, -0.7986],
     

  6%|▌         | 17/289 [00:13<03:40,  1.23it/s]

Training loop 17
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2040, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34693682193756104, logits - tensor([[-3.9349,  0.1339, -2.1473, -1.7025],
        [-4.2150,  0.4974, -1.7520, -1.6996],
        [-2.9972,  0.2423, -2.1666, -0.7546],
        [-3.6387,  0.7392, -1.9692, -1.3141],
        [-3.5549,  0.3289, -2.0652, -0.7

  6%|▌         | 18/289 [00:14<03:38,  1.24it/s]

Training loop 18
tensor([[  101,  2054,  2773,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054, 28616,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2851128578186035, logits - tensor([[-3.4995,  0.1831, -2.1233, -1.2491],
        [-4.2126,  0.1785, -1.8117, -1.5966],
        [-4.0337, -0.0717, -1.8500, -1.2706],
        [-3.6994,  0.3036, -1.8122, -1.1644],
     

  7%|▋         | 19/289 [00:15<03:37,  1.24it/s]

Training loop 19
tensor([[  101,  2054,  3025,  ...,  7782,  7127,   102],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2029, 16105,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2073,  2024,  ...,     0,     0,     0],
        [  101,  2054,  9312,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  7%|▋         | 20/289 [00:15<03:36,  1.24it/s]

loss - 0.2585883140563965, logits - tensor([[-3.0300,  0.5356, -0.9143, -1.1028],
        [-3.2137,  0.3828, -1.1291, -1.2018],
        [-3.7169,  1.1277, -1.0398, -1.3253],
        [-3.3169,  0.3937, -2.0124, -0.6769],
        [-3.4552,  0.3423, -1.1141, -0.9133],
        [-3.9745,  0.0450, -1.3345, -1.4126],
        [-4.3320,  0.2954, -1.7935, -1.0179],
        [-3.5118,  0.8949, -1.8024, -1.3823]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 20
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054, 17463,  ...,     0,     0,     0],
        [  101,  2040,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 

  7%|▋         | 21/289 [00:16<03:34,  1.25it/s]

Training loop 21
tensor([[ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5385749340057373, logits - tensor([[-3.5092,  1.0461, -0.8301, -1.0093],
        [-3.9289,  0.4666, -1.3967, -1.2157],
        [-3.7672,  0.8105, -2.1392, -1.2682],
        [-4.2591,  0.6373, -2.3156, -1.5306],
        [-3.8336,  0.6128, -1.5597, -1.10

  8%|▊         | 22/289 [00:17<03:33,  1.25it/s]

Training loop 22
tensor([[ 101, 2054, 2465,  ...,    0,    0,    0],
        [ 101, 2054, 6123,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2001, 1996,  ...,    0,    0,    0],
        [ 101, 2339, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 8310,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40473026037216187, logits - tensor([[-4.0806,  0.3742, -1.4246, -0.7824],
        [-3.9689,  0.7864, -2.1194, -1.8508],
        [-3.8620,  0.6439, -1.1676, -0.4206],
        [-3.5322,  0.4866, -1.6317, -1.1236],
        [-3.6258,  0.7513, -1.9385, -0.4

  8%|▊         | 23/289 [00:18<03:31,  1.25it/s]

Training loop 23
tensor([[  101,  2029,  7885,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.7024661898612976, logits - tensor([[-4.2164,  1.0417, -1.9080, -1.6329],
        [-3.6068,  0.5757, -1.7629, -1.2372],
        [-3.6627,  0.5932, -1.5282, -1.0900],
        [-4.3559,  0.9789, -1.5729, -1.3071],
     

  8%|▊         | 24/289 [00:19<03:30,  1.26it/s]

Training loop 24
tensor([[ 101, 2003, 2070,  ...,    0,    0,    0],
        [ 101, 2054, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 6882,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 6048,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35189658403396606, logits - tensor([[-3.9727,  0.5629, -1.5859, -1.2579],
        [-3.6917,  0.3167, -1.7051, -1.0514],
        [-4.5916,  0.5006, -1.6972, -1.0152],
        [-3.6538,  0.7319, -1.6084, -1.1624],
        [-3.3726,  0.4109, -1.9482, -0.6

  9%|▊         | 25/289 [00:19<03:29,  1.26it/s]

Training loop 25
tensor([[  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2064,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 11541,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5484951734542847, logits - tensor([[-3.9593,  0.4737, -1.2833, -1.3606],
        [-4.2988,  0.6466, -2.2202, -1.4976],
        [-4.8069,  0.5907, -2.0609, -1.7764],
        [-3.8949,  1.1654, -1.7776, -1.7615],
     

  9%|▉         | 26/289 [00:20<03:28,  1.26it/s]

Training loop 26
tensor([[  101,  2054, 28616,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  7814,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4137881398200989, logits - tensor([[-4.8030,  0.3758, -1.7793, -1.5312],
        [-3.9048,  1.1176, -1.4976, -0.8470],
        [-3.8750,  0.9237, -1.4671, -1.1810],
        [-4.7367,  0.7440, -2.1339, -1.7852],
     

  9%|▉         | 27/289 [00:21<03:26,  1.27it/s]

Training loop 27
tensor([[ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3466132879257202, logits - tensor([[-3.6277,  0.1735, -1.5605, -1.1803],
        [-4.1524,  0.4312, -1.9004, -0.5913],
        [-3.9098,  0.7442, -1.4110, -1.8186],
        [-4.3667,  0.4975, -1.6563, -1.1377],
        [-4.3470,  0.8789, -1.1935, -1.03

 10%|▉         | 28/289 [00:22<03:24,  1.27it/s]

Training loop 28
tensor([[  101,  2029,  2838,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054, 15973,  ...,     0,     0,     0],
        ...,
        [  101,  2106,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0],
        [  101,  2038,  2045,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36353158950805664, logits - tensor([[-3.8013, -0.0283, -0.5087, -1.5934],
        [-4.7352,  0.6298, -1.1132, -1.1779],
        [-3.9954,  0.5084, -1.6383, -1.8113],
        [-4.4875,  0.7938, -1.4860, -0.4537],
    

 10%|█         | 29/289 [00:22<03:23,  1.28it/s]

Training loop 29
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 3130,  ...,    0,    0,    0],
        ...,
        [ 101, 2339, 2003,  ..., 8270, 4667,  102],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2129, 2146,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40200456976890564, logits - tensor([[-4.4983, -0.1059, -1.6743, -1.4671],
        [-3.8579,  0.3690, -1.1586, -1.1446],
        [-3.8478,  0.4797, -1.4016, -0.8422],
        [-3.9412, -0.1288, -1.8937, -0.9935],
        [-4.3857,  0.8270, -1.8996, -0.6

 10%|█         | 30/289 [00:23<03:22,  1.28it/s]

Training loop 30
tensor([[  101,  2054,  2093,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 16745,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3616459369659424, logits - tensor([[-3.7329,  1.2782, -1.5843, -1.2661],
        [-3.9607,  0.3083, -1.5139, -0.6960],
        [-4.4308,  0.5587, -0.9528, -1.0624],
        [-3.5810,  0.2107, -1.3195, -1.2069],
     

 11%|█         | 31/289 [00:24<03:20,  1.29it/s]

Training loop 31
tensor([[  101,  2054, 26163,  ...,  2000,  4468,   102],
        [  101,  2054,  2838,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029,  3563,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2866433262825012, logits - tensor([[-3.8506, -0.0130, -1.4947, -0.8635],
        [-4.4918,  0.4245, -1.6550, -1.0582],
        [-4.0397,  0.3510, -1.1892, -1.6186],
        [-4.0289,  0.7591, -1.0974, -1.7038],
     

 11%|█         | 32/289 [00:25<03:20,  1.28it/s]

Training loop 32
tensor([[ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2129, 2020,  ...,    0,    0,    0],
        [ 101, 2073, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5367078185081482, logits - tensor([[-4.2869,  0.0360, -1.7760, -1.4983],
        [-3.8299,  0.0628, -1.7002, -1.0357],
        [-4.2634,  0.5129, -1.4735, -1.1813],
        [-4.0812,  1.0284, -1.7768, -0.9744],
        [-3.2901,  0.7246, -0.9451, -1.14

 11%|█▏        | 33/289 [00:26<03:19,  1.28it/s]

Training loop 33
tensor([[ 101, 2029, 2416,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 1056,  ...,    0,    0,    0],
        [ 101, 2054, 4730,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2469981461763382, logits - tensor([[-3.8878,  0.4429, -1.7969, -1.4378],
        [-3.6474, -0.0398, -2.2498, -1.7822],
        [-4.0683,  1.1536, -1.3928, -1.4054],
        [-4.5258, -0.4216, -2.2050, -1.0017],
        [-3.3584,  0.7107, -1.1766, -0.59

 12%|█▏        | 34/289 [00:26<03:18,  1.29it/s]

Training loop 34
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 4493,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4640332758426666, logits - tensor([[-3.6251,  0.0615, -1.6318, -1.1093],
        [-3.4142,  1.1392, -1.9695, -1.7369],
        [-4.5239,  0.0299, -1.9575, -1.1031],
        [-4.8456,  0.7631, -1.2518, -2.1562],
        [-3.4876,  0.5194, -2.0084, -0.73

 12%|█▏        | 35/289 [00:27<03:17,  1.29it/s]

Training loop 35
tensor([[  101,  2054, 27425,  ...,     0,     0,     0],
        [  101,  2054,  2785,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5159815549850464, logits - tensor([[-4.0707,  0.5704, -2.3819, -0.8774],
        [-4.1329,  0.6018, -1.6617, -0.8485],
        [-3.9807,  0.8572, -1.3532, -1.1351],
        [-3.5497,  1.5702, -1.1078, -0.8175],
     

 12%|█▏        | 36/289 [00:28<03:16,  1.29it/s]

Training loop 36
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 4275,  ..., 2241, 2944,  102],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26731443405151367, logits - tensor([[-4.1132,  0.8381, -1.4280, -1.4695],
        [-4.5101,  0.2895, -1.6763, -1.0957],
        [-4.3230,  0.7208, -1.4082, -2.1536],
        [-4.8241, -0.0927, -2.0291, -0.0535],
        [-4.0728,  0.2609, -1.9206, -0.5

 13%|█▎        | 37/289 [00:29<03:14,  1.30it/s]

Training loop 37
tensor([[ 101, 2054, 8310,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2006, 2054,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4079580008983612, logits - tensor([[-4.3455,  1.3871, -2.6593, -0.7895],
        [-3.7983,  0.6282, -1.6056, -1.5700],
        [-4.3451,  0.5889, -2.1207, -1.6028],
        [-3.9728,  0.2240, -2.4806, -0.9622],
        [-3.9136,  0.3987, -2.2743, -0.31

 13%|█▎        | 38/289 [00:29<03:12,  1.30it/s]

Training loop 38
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 15152,  ...,     0,     0,     0],
        [  101,  2054,  2529,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40886881947517395, logits - tensor([[-4.2138,  0.3335, -2.1207, -0.8863],
        [-4.8091,  0.1189, -2.2701, -1.3881],
        [-4.9704,  0.5631, -2.5756, -1.0093],
        [-4.2564,  0.9899, -1.6323, -1.8454],
    

 13%|█▎        | 39/289 [00:30<03:10,  1.31it/s]

Training loop 39
tensor([[  101,  2054,  4275,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46615415811538696, logits - tensor([[-4.2977,  0.7741, -2.2923, -0.8889],
        [-4.5108,  0.8753, -2.0355, -0.4193],
        [-4.8104,  0.2524, -2.7725, -0.7674],
        [-3.0810, -0.1241, -1.9591, -0.8371],
    

 14%|█▍        | 40/289 [00:31<03:09,  1.32it/s]

Training loop 40
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5631566047668457, logits - tensor([[-3.1987, -0.1810, -1.8362, -0.5352],
        [-3.6262, -0.7166, -2.2941, -0.0279],
        [-4.2024,  0.3078, -3.0063, -1.3569],
        [-3.7145,  0.3408, -1.8197,  0.3640],
        [-4.0213,  0.4653, -1.8646, -0.15

 14%|█▍        | 41/289 [00:32<03:07,  1.32it/s]

Training loop 41
tensor([[ 101, 2003, 2045,  ...,    0,    0,    0],
        [ 101, 2054, 4895,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 3431,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5241851806640625, logits - tensor([[-4.3475,  0.6344, -2.9523, -0.7490],
        [-4.7532,  0.5033, -1.9972, -0.9948],
        [-3.6138,  1.2688, -1.7165, -0.7338],
        [-4.8562,  0.3571, -2.6429, -1.0563],
        [-3.6050,  0.6415, -2.8052, -1.50

 15%|█▍        | 42/289 [00:32<03:06,  1.32it/s]

Training loop 42
tensor([[ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3630523681640625, logits - tensor([[-4.7660,  0.0711, -1.7312, -0.9575],
        [-3.3329, -0.0215, -2.7332, -1.3077],
        [-4.7541, -0.0574, -2.1466, -1.1018],
        [-4.1273,  0.2344, -1.8824, -0.9719],
        [-3.2411, -0.1931, -1.3700, -0.99

 15%|█▍        | 43/289 [00:33<03:05,  1.32it/s]

Training loop 43
tensor([[  101,  2054,  6364,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054, 15756,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30612534284591675, logits - tensor([[-4.4486,  0.2158, -1.8254, -0.6795],
        [-4.3942, -0.1044, -2.0569, -0.8167],
        [-4.1108,  0.1449, -1.9302, -0.7121],
        [-4.0651,  0.5000, -2.0439, -0.6219],
    

 15%|█▌        | 44/289 [00:34<03:04,  1.33it/s]

Training loop 44
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2079,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  3921,  ...,     0,     0,     0],
        [  101,  2029, 15792,  ...,  1024,  1024,   102],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3782041072845459, logits - tensor([[-4.1831e+00,  1.4442e-03, -1.9809e+00, -1.1117e+00],
        [-4.1535e+00, -9.7532e-02, -1.6165e+00, -7.5459e-01],
        [-4.4663e+00,  5.3201e-01, -2.2855e+00, -6.1027e-01],
   

 16%|█▌        | 45/289 [00:35<03:04,  1.33it/s]

Training loop 45
tensor([[  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054, 14676,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45440673828125, logits - tensor([[-4.7618,  0.3421, -1.7861,  0.0173],
        [-4.0523,  0.4900, -1.6364, -1.3814],
        [-3.5125, -0.4890, -2.3575, -0.8264],
        [-3.9944,  1.1015, -2.0855, -0.8565],
       

 16%|█▌        | 46/289 [00:35<03:02,  1.33it/s]

Training loop 46
tensor([[  101,  2001,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2029, 10056,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 15756,  ...,     0,     0,     0],
        [  101,  2054, 11380,  ...,     0,     0,     0],
        [  101,  2029,  1997,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4037497043609619, logits - tensor([[-4.1848, -0.0309, -2.5503, -0.5062],
        [-4.1381, -1.0399, -2.1180, -0.3834],
        [-3.8380, -0.0579, -2.0494, -0.7644],
        [-4.0934,  0.1912, -1.5247, -0.7889],
     

 16%|█▋        | 47/289 [00:36<03:01,  1.33it/s]

Training loop 47
tensor([[ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 3671,  ...,    0,    0,    0],
        [ 101, 2006, 2029,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ..., 2023, 2862,  102],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3596497178077698, logits - tensor([[-3.8878, -1.1201, -1.3045,  0.7345],
        [-4.8753, -0.1921, -1.2943, -0.8216],
        [-4.2728,  0.5491, -1.8346, -1.5681],
        [-4.8216,  0.3426, -2.5237, -1.3769],
        [-3.9643, -0.0854, -2.2876, -1.13

 17%|█▋        | 48/289 [00:37<03:00,  1.33it/s]

Training loop 48
tensor([[  101,  2064,  1996,  ...,  2005,  6885,   102],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2054, 12878,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3946689963340759, logits - tensor([[-4.3805,  0.5143, -2.1046, -1.2165],
        [-4.2628,  0.0751, -1.9670, -0.6143],
        [-4.6607,  0.9525, -2.3819, -0.5138],
        [-4.6296,  0.7428, -1.7215, -0.7609],
     

 17%|█▋        | 49/289 [00:38<03:00,  1.33it/s]

Training loop 49
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2029, 3653,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2844,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5586422681808472, logits - tensor([[-4.1608,  0.7785, -1.8743, -1.0616],
        [-4.7545,  0.8043, -1.1248, -0.8204],
        [-4.5790,  0.8573, -1.9760, -0.2651],
        [-3.9711,  0.5284, -1.8390, -1.1285],
        [-4.1005,  0.0189, -1.9544, -1.02

 17%|█▋        | 50/289 [00:38<02:58,  1.34it/s]

Training loop 50
tensor([[ 101, 2054, 2828,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2024, 3463,  ...,    0,    0,    0],
        [ 101, 2029, 9844,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45527905225753784, logits - tensor([[-4.0772,  0.8152, -2.0694, -1.2676],
        [-3.6304,  0.9102, -1.7844, -1.1229],
        [-5.1645,  1.1327, -2.9306, -1.5079],
        [-4.2742,  0.6136, -1.8185, -1.5583],
        [-4.1980,  0.4907, -1.3313, -1.4

 18%|█▊        | 51/289 [00:39<02:58,  1.33it/s]

Training loop 51
tensor([[  101,  2029, 15756,  ...,     0,     0,     0],
        [  101,  2054, 21641,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  1997,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2937636077404022, logits - tensor([[-4.5283,  0.2981, -1.6654, -1.5475],
        [-4.5144,  0.3253, -1.5950, -1.0410],
        [-4.3877,  0.3508, -1.6906, -1.4379],
        [-4.1147,  0.5867, -2.1466, -1.4251],
     

 18%|█▊        | 52/289 [00:40<02:57,  1.33it/s]

Training loop 52
tensor([[ 101, 2129, 2003,  ..., 1055, 2012,  102],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2785,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5051567554473877, logits - tensor([[-3.6065,  0.4646, -1.9575, -1.2294],
        [-4.2891, -0.2205, -1.7605, -0.7667],
        [-3.9953,  0.5206, -1.5898, -1.4481],
        [-4.3646,  0.8046, -2.5862, -1.9485],
        [-3.9818,  0.8233, -2.1541, -1.39

 18%|█▊        | 53/289 [00:41<02:57,  1.33it/s]

Training loop 53
tensor([[  101,  2058,  2054,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2054, 23807,  ...,     0,     0,     0],
        [  101,  2029,  4155,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4588855504989624, logits - tensor([[-4.2320,  0.3529, -1.4556, -1.1722],
        [-3.5202,  0.3609, -1.4162, -1.2197],
        [-5.0624,  0.9303, -1.7417, -1.1825],
        [-5.3378,  0.5703, -2.4157, -0.7942],
     

 19%|█▊        | 54/289 [00:41<02:57,  1.33it/s]

Training loop 54
tensor([[  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054, 21677,  ...,     0,     0,     0],
        [  101,  2054, 16325,  ...,  2323,  9125,   102],
        ...,
        [  101,  2054, 15306,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3578398525714874, logits - tensor([[-3.9282,  0.0693, -1.2344, -0.6282],
        [-3.7693,  0.2070, -1.8441, -1.7358],
        [-4.7746,  0.4282, -2.6750, -1.0467],
        [-4.2327,  0.7171, -1.2345, -1.2030],
     

 19%|█▉        | 55/289 [00:42<02:56,  1.33it/s]

Training loop 55
tensor([[ 101, 2054, 2003,  ..., 2022, 3697,  102],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 9646,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3546020984649658, logits - tensor([[-4.2569,  0.0917, -2.1373, -0.6979],
        [-4.3470,  0.2107, -1.6475, -1.0070],
        [-4.2790,  0.1363, -1.2219, -1.0319],
        [-4.7083,  0.6536, -1.6598, -1.1300],
        [-4.2480,  0.2212, -1.4994, -1.20

 19%|█▉        | 56/289 [00:43<02:54,  1.33it/s]

Training loop 56
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2048,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ..., 2013, 7418,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 20%|█▉        | 57/289 [00:44<02:53,  1.33it/s]

loss - 0.33201777935028076, logits - tensor([[-4.3060,  0.1483, -1.1330, -1.0473],
        [-4.7746,  0.1719, -2.1526, -0.8195],
        [-3.9235,  0.2254, -1.6365, -1.2356],
        [-4.3126,  0.0409, -2.3595, -1.4480],
        [-4.8344,  0.2261, -2.0680, -1.6406],
        [-4.5698, -0.1984, -1.9972, -0.6576],
        [-5.0258, -0.2982, -1.5296, -0.8251],
        [-4.1418,  0.3380, -2.1385, -1.0284]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 57
tensor([[  101,  2029, 12046,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  4800,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0,

 20%|██        | 58/289 [00:44<02:52,  1.34it/s]

Training loop 58
tensor([[  101,  2054,  7017,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2029,  4563,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31869614124298096, logits - tensor([[-4.7313,  0.6663, -2.4761, -0.2813],
        [-3.7586, -0.9344, -1.8201,  0.5757],
        [-4.8298,  0.3227, -2.8566, -1.4659],
        [-4.4995,  0.0468, -2.1546, -0.9774],
    

 20%|██        | 59/289 [00:45<02:51,  1.34it/s]

Training loop 59
tensor([[ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2731,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31248560547828674, logits - tensor([[-5.0458,  1.3269, -3.1032, -1.2259],
        [-4.7330,  0.0638, -1.0599, -0.4640],
        [-4.2067,  0.3535, -1.9765, -0.8282],
        [-4.2763, -0.7667, -1.8338,  0.9792],
        [-5.3614,  0.5785, -1.9228, -1.4

 21%|██        | 60/289 [00:46<02:49,  1.35it/s]

Training loop 60
tensor([[  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054,  4127,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 10640,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2029,  9896,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3588885962963104, logits - tensor([[-4.6059,  1.1803, -1.7226, -0.9160],
        [-4.5479,  0.6452, -2.7201, -1.3505],
        [-4.1446, -0.3615, -2.2776,  1.3895],
        [-4.3304,  0.7246, -2.2791, -0.6310],
     

 21%|██        | 61/289 [00:47<02:48,  1.35it/s]

Training loop 61
tensor([[  101,  2054, 12046,  ...,  2015,  1012,   102],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2054,  6882,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2367,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3776007890701294, logits - tensor([[-4.4409,  0.8899, -2.3481, -0.8463],
        [-4.0858,  1.0517, -2.0387, -0.5436],
        [-4.4670,  0.5888, -2.7371, -1.2144],
        [-4.4061,  1.6936, -2.5178, -1.4180],
     

 21%|██▏       | 62/289 [00:47<02:47,  1.35it/s]

Training loop 62
tensor([[ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2001, 2836,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.48704904317855835, logits - tensor([[-4.3805,  1.1922, -2.3278, -0.5213],
        [-4.6143,  0.0969, -2.8737, -0.6712],
        [-4.0848,  0.8026, -2.0235, -0.6850],
        [-4.8425,  0.8043, -2.1945, -1.4986],
        [-4.6626,  1.0980, -2.9336, -0.7

 22%|██▏       | 63/289 [00:48<02:46,  1.36it/s]

Training loop 63
tensor([[ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2129, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 4942,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2515, 2944,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3092392683029175, logits - tensor([[-4.2831,  1.3246, -2.1031, -0.2562],
        [-5.0088,  0.6141, -2.6860, -0.3222],
        [-4.5864,  0.9864, -2.2710, -1.1733],
        [-3.5226, -0.7917, -2.1110,  0.8099],
        [-4.6489,  0.1678, -3.0385, -1.10

 22%|██▏       | 64/289 [00:49<02:45,  1.36it/s]

Training loop 64
tensor([[ 101, 2054, 1996,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2003, 2107,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5298585295677185, logits - tensor([[-4.1283,  0.5866, -2.8681, -0.5344],
        [-5.4348,  0.7756, -3.2723, -1.0373],
        [-4.4995, -0.2512, -2.2374, -0.2567],
        [-3.9714,  1.1461, -2.3182, -0.4048],
        [-4.4869, -0.2856, -2.7784, -0.27

 22%|██▏       | 65/289 [00:50<02:45,  1.36it/s]

Training loop 65
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2773,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3839721381664276, logits - tensor([[-5.0384,  0.6998, -2.3763, -1.1210],
        [-3.5638, -0.2316, -2.3370,  0.5152],
        [-4.5681,  0.8436, -2.9446, -1.7039],
        [-2.9448, -1.0332, -2.0586,  1.4853],
        [-4.1301,  1.4810, -1.9883, -0.86

 23%|██▎       | 66/289 [00:50<02:43,  1.36it/s]

Training loop 66
tensor([[ 101, 2129, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 4493,  ...,    0,    0,    0],
        [ 101, 2003, 2023,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46614527702331543, logits - tensor([[-4.0103,  1.1095, -1.9346, -1.3804],
        [-5.1072,  0.8917, -2.1426, -0.9326],
        [-4.1996,  0.6437, -2.3779, -1.3935],
        [-4.1835,  0.8846, -2.1240, -1.0857],
        [-3.9300,  1.3306, -3.5497, -0.8

 23%|██▎       | 67/289 [00:51<02:43,  1.36it/s]

Training loop 67
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029,  3730,  ...,     0,     0,     0],
        [  101,  2024,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2073,  2079,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47473278641700745, logits - tensor([[-5.4585,  1.0486, -2.5807, -0.5068],
        [-3.6245,  0.6296, -1.9314, -0.1325],
        [-3.9892,  0.8403, -2.5315, -1.2307],
        [-4.7330,  1.2360, -1.9894, -1.1380],
    

 24%|██▎       | 68/289 [00:52<02:42,  1.36it/s]

Training loop 68
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 6847,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3819042444229126, logits - tensor([[-4.5316,  0.1733, -2.1619, -1.2163],
        [-4.8845,  0.7697, -2.4751, -1.2558],
        [-3.5078, -0.1017, -2.5435, -0.9587],
        [-2.4871, -1.2084, -1.8753,  0.8385],
        [-4.9445,  0.7891, -1.9011, -0.31

 24%|██▍       | 69/289 [00:52<02:41,  1.36it/s]

Training loop 69
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  3180,  ...,     0,     0,     0],
        [  101,  2054,  2946,  ...,  2871,  1013,   102],
        [  101,  2024,  2023,  ..., 16406,  1007,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4244527220726013, logits - tensor([[-5.1416,  0.1644, -2.6493, -0.4118],
        [-4.3638,  0.5841, -2.1598, -1.1514],
        [-3.2745, -1.6075, -2.4007,  1.0227],
        [-4.5970,  0.3173, -2.3359, -1.0858],
     

 24%|██▍       | 70/289 [00:53<02:40,  1.36it/s]

Training loop 70
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 3471,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2060,  ...,    0,    0,    0],
        [ 101, 2054, 6847,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34584516286849976, logits - tensor([[-3.6197,  0.7647, -1.3716, -1.0726],
        [-5.0775, -0.1263, -1.9813, -1.1336],
        [-4.6971, -0.1197, -1.7263, -0.2939],
        [-2.3292, -1.6431, -1.5169,  1.1820],
        [-4.0271, -0.1142, -2.8166, -1.2

 25%|██▍       | 71/289 [00:54<02:40,  1.36it/s]

Training loop 71
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 6753,  ...,    0,    0,    0],
        ...,
        [ 101, 2001, 2023,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2003, 2151,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3705423176288605, logits - tensor([[-3.8077,  0.7816, -1.8515, -1.3484],
        [-4.5806,  0.6486, -1.8798, -1.0233],
        [-5.2347,  0.4951, -2.0626, -1.3601],
        [-3.8681,  0.3360, -1.8682, -0.4533],
        [-4.5871,  0.9515, -1.9514, -1.54

 25%|██▍       | 72/289 [00:55<02:40,  1.35it/s]

Training loop 72
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2176,  ...,    0,    0,    0],
        [ 101, 2054, 5579,  ...,    0,    0,    0],
        [ 101, 2054, 4708,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4743537902832031, logits - tensor([[-3.5371,  0.9424, -2.2904, -2.1460],
        [-3.6530,  0.4631, -2.0943, -2.0089],
        [-3.5786,  1.1953, -1.8747, -2.1588],
        [-3.1686,  0.7581, -2.2112, -1.0644],
        [-4.5917,  0.6420, -1.5331, -1.08

 25%|██▌       | 73/289 [00:55<02:39,  1.36it/s]

Training loop 73
tensor([[  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2054,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054, 24828,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3333877623081207, logits - tensor([[-2.5697e+00, -1.5275e+00, -1.9197e+00,  1.2828e+00],
        [-4.1442e+00, -3.6577e-03, -2.2057e+00, -1.3214e+00],
        [-4.5168e+00,  1.9358e-01, -2.2374e+00, -1.2912e+00],
   

 26%|██▌       | 74/289 [00:56<02:38,  1.36it/s]

Training loop 74
tensor([[ 101, 2054, 8518,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2003, 5746,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3907780051231384, logits - tensor([[-4.7815, -0.1417, -1.5511, -1.5494],
        [-4.7109,  0.4849, -2.2059, -1.3086],
        [-3.4984, -0.7398, -2.1976,  0.1018],
        [-3.7985,  0.0817, -1.4083, -1.2050],
        [-4.1549,  0.1322, -1.9099, -0.84

 26%|██▌       | 75/289 [00:57<02:38,  1.35it/s]

Training loop 75
tensor([[  101,  2054,  7861,  ...,     0,     0,     0],
        [  101,  2073,  2515,  ...,     0,     0,     0],
        [  101,  2029, 13221,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2617841362953186, logits - tensor([[-4.2529,  0.4186, -0.9503, -1.4372],
        [-3.9594, -0.2915, -2.1791, -1.0969],
        [-3.9781, -0.0825, -1.4511, -1.3703],
        [-4.1016,  1.5430, -1.8754, -2.2717],
     

 26%|██▋       | 76/289 [00:58<02:37,  1.35it/s]

Training loop 76
tensor([[  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42993563413619995, logits - tensor([[-3.7029, -0.3536, -1.3659, -1.1575],
        [-3.7161, -0.1855, -1.4996, -1.1810],
        [-5.2097,  0.9545, -1.7940, -1.7584],
        [-3.7908, -0.5896, -1.8047, -0.9547],
    

 27%|██▋       | 77/289 [00:58<02:37,  1.34it/s]

Training loop 77
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2029,  7271,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101, 26384,  2012,  ...,  4696,  2302,   102],
        [  101,  2029,  2944,  ...,  1010,  1002,   102],
        [  101,  2129,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27226710319519043, logits - tensor([[-4.5297,  0.4951, -2.0581, -1.3203],
        [-3.8207, -0.7804, -1.5922, -0.9616],
        [-4.0468,  0.2508, -1.4132, -1.2845],
        [-2.3311, -2.1696, -1.7465,  1.5644],
    

 27%|██▋       | 78/289 [00:59<02:37,  1.34it/s]

Training loop 78
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2006,  2054,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2003,  2045,  ...,     0,     0,     0],
        [  101,  2054, 12158,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3624168932437897, logits - tensor([[-5.0822,  0.7091, -1.1622, -1.3118],
        [-5.2122,  0.6121, -2.3872, -1.7704],
        [-4.8037,  0.3140, -1.9643, -1.5698],
        [-4.3152,  0.5894, -1.5203, -1.9700],
     

 27%|██▋       | 79/289 [01:00<02:35,  1.35it/s]

Training loop 79
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 4275,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.351767361164093, logits - tensor([[-3.7843,  1.0710, -1.5244, -1.8874],
        [-4.5822,  1.6218, -1.6770, -1.6074],
        [-4.1968,  0.4934, -1.4578, -1.9435],
        [-3.9543,  0.6683, -1.9448, -1.8768],
        [-4.4558,  1.1002, -1.7030, -2.182

 28%|██▊       | 80/289 [01:01<02:34,  1.35it/s]

Training loop 80
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 4275,  ...,    0,    0,    0],
        [ 101, 2029, 3716,  ...,    0,    0,    0],
        ...,
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4512695074081421, logits - tensor([[-4.5291,  0.9141, -2.1199, -1.2906],
        [-4.6819,  0.7018, -1.7314, -1.7736],
        [-4.5702,  0.9019, -1.9626, -1.6092],
        [-4.3534,  1.6632, -2.0664, -1.8112],
        [-4.5939,  0.9594, -2.1294, -1.76

 28%|██▊       | 81/289 [01:01<02:33,  1.35it/s]

Training loop 81
tensor([[ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2836,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ..., 1024, 2065,  102],
        ...,
        [ 101, 2003, 2019,  ...,    0,    0,    0],
        [ 101, 2054, 4725,  ...,    0,    0,    0],
        [ 101, 2024, 2045,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5137412548065186, logits - tensor([[-4.7970,  0.9832, -1.3660, -2.0434],
        [-3.7484, -0.3593, -2.3726,  0.7222],
        [-3.7620,  0.6717, -2.7601, -1.2749],
        [-4.5841,  0.9996, -2.2248, -1.7763],
        [-4.2908,  1.8513, -1.6004, -1.73

 28%|██▊       | 82/289 [01:02<02:32,  1.36it/s]

Training loop 82
tensor([[ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 9324,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24069397151470184, logits - tensor([[-3.7561,  0.9810, -1.0686, -2.0020],
        [-4.3660,  1.7829, -1.9670, -2.3036],
        [-3.8474,  1.5520, -1.6699, -1.9607],
        [-4.6398,  1.1123, -1.8604, -1.7981],
        [-3.0166, -1.3918, -2.0832,  0.8

 29%|██▊       | 83/289 [01:03<02:31,  1.36it/s]

Training loop 83
tensor([[ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2027,  ...,    0,    0,    0],
        [ 101, 2106, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2515, 2037,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.744330644607544, logits - tensor([[-3.1493, -0.7693, -1.6377,  0.5245],
        [-4.6149,  0.7273, -1.3508, -1.6902],
        [-3.5655, -0.4170, -1.5241,  0.2818],
        [-3.9909,  2.0026, -1.6776, -2.2784],
        [-4.1485,  1.9392, -2.0070, -2.087

 29%|██▉       | 84/289 [01:04<02:31,  1.36it/s]

Training loop 84
tensor([[  101,  2029, 15792,  ...,     0,     0,     0],
        [  101,  1999,  1996,  ...,     0,     0,     0],
        [  101,  2054,  4800,  ...,     0,     0,     0],
        ...,
        [  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5334857106208801, logits - tensor([[-3.0227, -0.5847, -1.7998,  1.1865],
        [-4.4187,  0.9705, -1.4187, -1.4491],
        [-4.8822,  0.9983, -0.9549, -1.5561],
        [-4.6580,  1.6624, -1.6642, -2.1079],
     

 29%|██▉       | 85/289 [01:04<02:29,  1.36it/s]

Training loop 85
tensor([[ 101, 2054, 2110,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ..., 2891, 6038,  102],
        [ 101, 2003, 2023,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 2023,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 4275,  ..., 2232, 2000,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5304061770439148, logits - tensor([[-4.2156,  0.8396, -1.7611, -1.1163],
        [-3.6641, -0.2305, -1.6462, -0.0606],
        [-4.6350,  0.5801, -1.5702, -1.2677],
        [-4.1455,  1.0850, -1.4263, -1.8246],
        [-4.0231,  0.9053, -2.6124, -1.69

 30%|██▉       | 86/289 [01:05<02:29,  1.36it/s]

Training loop 86
tensor([[ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2110,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 2037,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29243719577789307, logits - tensor([[-4.1493,  0.9942, -2.0915, -1.5286],
        [-2.2547, -1.3828, -1.8137,  2.2900],
        [-4.0867,  0.5089, -2.3790, -0.9873],
        [-3.9348,  0.0967, -2.4695, -0.1640],
        [-4.0987, -0.1908, -2.0182, -0.2

 30%|███       | 87/289 [01:06<02:28,  1.36it/s]

Training loop 87
tensor([[ 101, 2129, 8321,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 7705,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37136590480804443, logits - tensor([[-4.1252,  0.5611, -1.5562, -1.1003],
        [-3.7975,  0.7939, -1.0767, -1.3983],
        [-4.1734,  0.0429, -1.8569, -0.7772],
        [-4.1104,  0.4123, -2.0983, -1.1866],
        [-4.3996,  0.2181, -1.9301, -0.2

 30%|███       | 88/289 [01:07<02:27,  1.36it/s]

Training loop 88
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 1997,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 7620,  ...,    0,    0,    0],
        [ 101, 2029, 2093,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33635032176971436, logits - tensor([[-4.3212,  0.3708, -1.4190, -1.5862],
        [-4.2972,  0.2679, -1.7131, -0.7800],
        [-4.5937,  0.4355, -1.6297, -1.2091],
        [-4.7834,  0.6406, -2.2536, -1.0350],
        [-4.9930,  0.0555, -1.6786, -2.1

 31%|███       | 89/289 [01:07<02:26,  1.36it/s]

Training loop 89
tensor([[ 101, 2339, 3816,  ...,    0,    0,    0],
        [ 101, 2054, 4730,  ...,    0,    0,    0],
        [ 101, 2054, 4127,  ..., 2013, 1996,  102],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34936338663101196, logits - tensor([[-4.7961,  0.4441, -1.8302, -1.2274],
        [-4.8689,  0.3355, -1.2174, -0.8372],
        [-4.0426,  0.3320, -1.6003, -0.4637],
        [-2.1724, -1.9126, -1.8048,  2.0557],
        [-4.8218,  0.8815, -1.6830, -1.4

 31%|███       | 90/289 [01:08<02:25,  1.37it/s]

Training loop 90
tensor([[  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054, 11633,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  6177,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  6695,  ...,  2196,  2583,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3935856223106384, logits - tensor([[-3.8056, -1.1336, -1.2036, -1.5107],
        [-3.2930, -1.7741, -1.7584,  0.9775],
        [-3.7597,  0.1765, -1.7032, -1.3329],
        [-3.8460,  0.2729, -1.4842, -0.2163],
     

 31%|███▏      | 91/289 [01:09<02:25,  1.36it/s]

Training loop 91
tensor([[ 101, 2029, 2773,  ...,    0,    0,    0],
        [ 101, 2054, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 4013,  ..., 2029, 6698,  102],
        ...,
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38946956396102905, logits - tensor([[-4.7035, -0.0381, -1.5151, -1.6642],
        [-4.0003,  0.0271, -1.5337, -1.3112],
        [-4.4227,  0.7634, -1.8909, -0.9276],
        [-4.5380,  0.2587, -1.9603, -1.1530],
        [-3.8027, -0.1486, -0.8937, -0.9

 32%|███▏      | 92/289 [01:09<02:24,  1.36it/s]

Training loop 92
tensor([[ 101, 2054, 6123,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        [ 101, 2054, 8476,  ...,    0,    0,    0],
        [ 101, 2054, 2397,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3832547664642334, logits - tensor([[-4.0652, -1.1743, -2.5237,  0.3797],
        [-4.4441, -0.2469, -2.0204, -1.5633],
        [-2.7516, -1.5158, -2.0621,  1.1595],
        [-3.7598,  0.3713, -1.3395, -0.5412],
        [-3.7048,  0.1600, -1.6664, -1.22

 32%|███▏      | 93/289 [01:10<02:25,  1.35it/s]

Training loop 93
tensor([[  101,  2054,  9312,  ...,     0,     0,     0],
        [  101,  2054, 13100,  ...,     0,     0,     0],
        [  101,  2003,  2045,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2029,  2653,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28587836027145386, logits - tensor([[-4.7151,  0.6458, -2.1122, -0.5409],
        [-3.7388, -1.0553, -2.0829,  0.0468],
        [-2.9664, -0.2685, -1.1292, -0.6673],
        [-3.6489,  0.0096, -1.5691, -0.6010],
    

 33%|███▎      | 94/289 [01:11<02:25,  1.34it/s]

Training loop 94
tensor([[  101,  2029,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2054, 23807,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2029, 12739,  ...,     0,     0,     0],
        [  101,  2029,  4432,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 33%|███▎      | 95/289 [01:12<02:25,  1.34it/s]

loss - 0.35739243030548096, logits - tensor([[-4.4184, -0.4950, -1.3335, -0.3028],
        [-4.4469,  0.2904, -1.7311, -0.9218],
        [-3.7867,  0.5180, -1.6302, -0.5273],
        [-4.2302, -0.1032, -1.7702, -1.2558],
        [-4.4999, -0.3709, -1.5485, -0.5534],
        [-3.9807,  0.4190, -1.4739, -1.0254],
        [-3.9134, -0.1218, -1.3368, -0.9743],
        [-4.3856,  0.0971, -2.4505, -0.4725]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 95
tensor([[  101,  2054, 12783,  ...,     0,     0,     0],
        [  101,  2024,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,  2006,  2023,   102],
        [  101,  2029, 14336,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0,

 33%|███▎      | 96/289 [01:12<02:24,  1.34it/s]

Training loop 96
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 10474,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2054,  3001,  ...,  1998,  1996,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3172416687011719, logits - tensor([[-4.7032,  0.6252, -2.6967, -0.5770],
        [-3.1750, -2.2070, -2.1798,  1.8627],
        [-4.0880,  0.0425, -1.3573, -1.4528],
        [-4.4165,  0.2474, -1.3821, -0.7739],
     

 34%|███▎      | 97/289 [01:13<02:23,  1.33it/s]

Training loop 97
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 5579,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29974299669265747, logits - tensor([[-3.8426,  0.1721, -1.6655, -0.3784],
        [-4.3928,  0.1864, -1.6115, -0.6647],
        [-4.6709,  0.4719, -2.0672, -0.9226],
        [-4.4858,  1.3130, -2.2370, -0.8779],
        [-5.3395, -0.1250, -1.9151, -0.7

 34%|███▍      | 98/289 [01:14<02:23,  1.33it/s]

Training loop 98
tensor([[  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2029,  4655,  ...,     0,     0,     0],
        [  101,  2054,  2060,  ...,     0,     0,     0],
        ...,
        [  101,  2024, 12943,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2064,  2037,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43265289068222046, logits - tensor([[-2.9110, -1.8680, -1.9050,  1.6827],
        [-2.5159, -1.3404, -2.4572,  1.8087],
        [-3.7859, -0.6210, -2.3211,  0.4552],
        [-4.8478,  1.0401, -2.5312, -1.2652],
    

 34%|███▍      | 99/289 [01:15<02:22,  1.33it/s]

Training loop 99
tensor([[  101,  2054, 14965,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  9312,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4012534022331238, logits - tensor([[-3.8993,  0.4430, -1.9872, -0.5610],
        [-5.1184,  1.3249, -2.4920, -2.1981],
        [-4.8267,  1.4199, -1.7371, -1.0479],
        [-4.2589,  0.8952, -2.1818, -1.2323],
     

 35%|███▍      | 100/289 [01:15<02:21,  1.34it/s]

Training loop 100
tensor([[ 101, 2012, 2054,  ...,    0,    0,    0],
        [ 101, 2054, 4083,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36658769845962524, logits - tensor([[-4.1970,  1.0844, -1.7173, -1.3844],
        [-4.1844,  1.1658, -2.1520, -0.8081],
        [-4.3497,  0.6558, -1.7594, -0.3880],
        [-4.2401,  1.1568, -2.3440, -1.3695],
        [-4.3698,  0.7560, -2.2058, -0.

 35%|███▍      | 101/289 [01:16<02:20,  1.33it/s]

Training loop 101
tensor([[ 101, 2029, 3279,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2029, 5449,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3369804620742798, logits - tensor([[-4.4585,  0.5574, -1.9025, -1.8836],
        [-4.4842,  0.7516, -2.2986, -1.0991],
        [-4.9217,  1.4169, -2.1158, -1.8357],
        [-5.4225,  1.6518, -3.2566, -1.9109],
        [-4.8005,  0.3286, -1.7913, -1.1

 35%|███▌      | 102/289 [01:17<02:20,  1.33it/s]

Training loop 102
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2731,  ...,  2224,  1037,   102],
        [  101,  2054, 15066,  ...,     0,     0,     0],
        ...,
        [  101,  2106,  1996,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2024,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3553321659564972, logits - tensor([[-4.4397,  0.2295, -2.4133, -1.0000],
        [-4.5246,  1.4688, -2.2950, -1.0306],
        [-5.1231,  1.0319, -2.6983, -1.1800],
        [-2.6835, -2.5195, -1.7882,  1.9335],
    

 36%|███▌      | 103/289 [01:18<02:20,  1.32it/s]

Training loop 103
tensor([[ 101, 2054, 2024,  ..., 1012, 5310,  102],
        [ 101, 2054, 9324,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 7060,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19856950640678406, logits - tensor([[-4.5016,  0.8775, -2.4522, -0.9558],
        [-3.9096,  0.3701, -2.2994, -1.5996],
        [-4.8751,  0.0297, -2.3778, -1.3431],
        [-4.3605,  1.3110, -2.3577, -0.9176],
        [-3.8387,  0.8859, -1.6312, -0.

 36%|███▌      | 104/289 [01:19<02:19,  1.32it/s]

Training loop 104
tensor([[ 101, 2054, 2944,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 6327,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23904958367347717, logits - tensor([[-5.2816,  1.8110, -2.1504, -1.4224],
        [-4.1684,  1.8324, -2.2877, -1.4111],
        [-4.7235,  1.8048, -2.8017, -1.1218],
        [-4.3731,  0.8555, -2.3615, -0.9282],
        [-4.7337, -0.3015, -2.3462, -1.

 36%|███▋      | 105/289 [01:19<02:18,  1.33it/s]

Training loop 105
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2073, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 2106,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23730099201202393, logits - tensor([[-3.1827,  0.2049, -0.8278, -1.2158],
        [-4.2527,  1.4880, -2.5737, -1.7030],
        [-4.1025,  1.3229, -2.7382, -0.8666],
        [-3.6256,  0.9425, -2.1323, -1.5861],
        [-4.3045,  1.0660, -1.7087, -1.

 37%|███▋      | 106/289 [01:20<02:17,  1.33it/s]

Training loop 106
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2029, 22822,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32030820846557617, logits - tensor([[-4.7504,  1.7746, -2.4032, -1.2075],
        [-4.3315,  1.1300, -2.0760, -1.7964],
        [-4.7430,  1.8290, -2.6331, -0.9237],
        [-4.2354,  1.5695, -2.0560, -1.9132],
   

 37%|███▋      | 107/289 [01:21<02:16,  1.33it/s]

Training loop 107
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2411,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3489397168159485, logits - tensor([[-4.3354,  1.7451, -2.5994, -1.1901],
        [-2.6341, -0.4842, -0.2790, -0.9998],
        [-3.7479, -0.2473, -1.5841, -1.4392],
        [-5.1818,  1.2754, -2.6128, -2.3115],
        [-4.8782,  0.9370, -2.4689, -1.1

 37%|███▋      | 108/289 [01:21<02:15,  1.34it/s]

Training loop 108
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 1999, 2054,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2047,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4561995565891266, logits - tensor([[-3.8123, -0.5919, -2.3014,  0.1203],
        [-2.6133,  0.2142, -0.3266, -2.1587],
        [-3.9986,  1.4273, -2.1941, -1.3376],
        [-4.8383,  1.2783, -2.2798, -1.7253],
        [-5.0206,  1.3784, -2.1164, -1.3

 38%|███▊      | 109/289 [01:22<02:14,  1.34it/s]

Training loop 109
tensor([[  101,  2054, 10954,  ...,     0,     0,     0],
        [  101,  2003,  3793,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2176,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4147528409957886, logits - tensor([[-3.5885,  0.7232, -2.7614, -1.6768],
        [-3.8959,  0.3880, -1.4856, -1.6426],
        [-2.8898, -1.9960, -2.1849,  1.7725],
        [-4.7412,  1.5549, -2.7726, -2.4453],
    

 38%|███▊      | 110/289 [01:23<02:13,  1.34it/s]

Training loop 110
tensor([[ 101, 1999, 2029,  ...,    0,    0,    0],
        [ 101, 2029, 4275,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 4022,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25234270095825195, logits - tensor([[-3.8776, -1.7062, -2.6445,  1.9096],
        [-3.2080, -1.6852, -2.7473,  2.5038],
        [-3.9062,  1.0579, -2.6897, -1.7180],
        [-4.2196,  1.0224, -2.2571, -1.2651],
        [-3.9250, -0.7653, -3.2750,  0.

 38%|███▊      | 111/289 [01:24<02:12,  1.34it/s]

Training loop 111
tensor([[  101,  2029, 13588,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2031,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2515,  3653,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2687731683254242, logits - tensor([[-4.6186,  1.7422, -2.6390, -1.8571],
        [-4.7069,  1.6161, -3.0566, -2.3987],
        [-3.6983, -0.8586, -3.2575,  1.5231],
        [-4.7416,  1.6272, -2.5572, -1.3549],
    

 39%|███▉      | 112/289 [01:24<02:12,  1.34it/s]

Training loop 112
tensor([[ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2093,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2064,  ...,    0,    0,    0],
        [ 101, 2029, 6847,  ..., 1998, 3231,  102],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26370882987976074, logits - tensor([[-4.7502,  1.7151, -2.4682, -1.9150],
        [-4.5212,  0.7607, -2.3483, -1.7098],
        [-5.4530,  1.1685, -3.0604, -2.1142],
        [-4.7593,  1.6880, -2.6207, -1.7647],
        [-2.6468,  0.2854, -0.4349, -1.

 39%|███▉      | 113/289 [01:25<02:11,  1.34it/s]

Training loop 113
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054, 26293,  ...,     0,     0,     0],
        [  101,  1999,  2029,  ...,  1010, 27593,   102],
        ...,
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2073,  2106,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 39%|███▉      | 114/289 [01:26<02:11,  1.33it/s]

loss - 0.22094440460205078, logits - tensor([[-4.0461,  1.4308, -3.0099, -1.6279],
        [-4.0731, -1.3548, -3.0449,  1.2162],
        [-4.1965,  0.8136, -1.9526, -0.9188],
        [-1.7622, -0.1524,  0.4366, -1.2820],
        [-3.1537, -1.2566, -2.7907,  0.2553],
        [-4.3521,  1.2653, -2.5193, -1.2085],
        [-4.4581,  2.0904, -2.6845, -2.1056],
        [-4.9528,  1.8305, -2.2135, -2.4763]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 114
tensor([[ 101, 2129, 2488,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1,

 40%|███▉      | 115/289 [01:27<02:11,  1.32it/s]

Training loop 115
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,  2367,  3793,   102],
        [  101,  2003, 14255,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  4577,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2097,  2122,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25890523195266724, logits - tensor([[-4.7080,  1.3791, -2.6703, -1.6015],
        [-5.1901,  1.4892, -3.3977, -0.9053],
        [-2.4793, -0.5833, -0.1658, -1.9173],
        [-4.2749,  1.2704, -1.8035, -1.5984],
   

 40%|████      | 116/289 [01:27<02:10,  1.32it/s]

Training loop 116
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2001, 14324,  ...,     0,     0,     0],
        [  101,  2054, 11338,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5768263339996338, logits - tensor([[-5.1834, -0.7076, -3.6285,  0.8264],
        [-4.3545,  1.3278, -2.9018, -1.3799],
        [-5.4726,  2.1304, -2.4478, -2.1090],
        [-4.9261,  2.0021, -3.4072, -2.2120],
    

 40%|████      | 117/289 [01:28<02:10,  1.32it/s]

Training loop 117
tensor([[  101,  2024,  2070,  ...,     0,     0,     0],
        [  101,  2054,  2785,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  3653,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2054, 17537,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3412935137748718, logits - tensor([[-3.5495,  0.0116, -0.1303, -1.3111],
        [-4.3194,  1.8064, -2.7022, -2.1320],
        [-2.5635, -0.7768,  0.4534, -1.4136],
        [-4.7364,  1.4736, -2.9939, -1.7053],
    

 41%|████      | 118/289 [01:29<02:10,  1.31it/s]

Training loop 118
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 2986,  ...,    0,    0,    0],
        [ 101, 2006, 2029,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38672521710395813, logits - tensor([[-5.1056,  1.5439, -3.0336, -1.3675],
        [-4.5673,  1.5235, -2.7327, -1.4753],
        [-4.5906,  0.8063, -2.4580, -1.7629],
        [-4.4954,  1.8140, -2.7474, -2.3306],
        [-4.8395,  1.2389, -2.6992, -2.

 41%|████      | 119/289 [01:30<02:09,  1.31it/s]

Training loop 119
tensor([[  101,  2029,  4155,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 13058,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21855367720127106, logits - tensor([[-4.5692,  1.0445, -3.0707, -1.5018],
        [-4.3718,  1.7025, -2.6538, -1.6429],
        [-4.0380,  1.5867, -3.0186, -1.1613],
        [-3.9606,  1.7469, -2.9344, -1.7741],
   

 42%|████▏     | 120/289 [01:31<02:08,  1.31it/s]

Training loop 120
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2106,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46084529161453247, logits - tensor([[-2.5374, -0.3762, -0.2921, -1.1211],
        [-5.4647,  1.6180, -3.6751, -1.1218],
        [-3.3950,  0.0829, -2.1125, -1.5677],
        [-4.5382,  1.5027, -2.4260, -2.0140],
        [-4.9964,  2.1793, -3.2245, -2.

 42%|████▏     | 121/289 [01:31<02:08,  1.31it/s]

Training loop 121
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2332189530134201, logits - tensor([[-5.3196,  1.8880, -3.0393, -0.7027],
        [-4.8311,  0.6438, -3.2628, -0.1452],
        [-1.5939, -1.2980,  1.1637, -0.7327],
        [-3.2088, -0.0744, -2.2444,  0.6662],
        [-3.8130, -0.1353, -1.1699, -1.4

 42%|████▏     | 122/289 [01:32<02:06,  1.32it/s]

Training loop 122
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054, 12783,  ...,     0,     0,     0],
        [  101,  2106,  5579,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17939847707748413, logits - tensor([[-2.4374, -0.9425,  0.1572, -1.3775],
        [-4.9141,  0.5720, -3.2950, -1.3444],
        [-1.3761, -1.9169,  2.0728, -1.5865],
        [-4.3891,  1.5417, -3.4722, -1.8044],
   

 43%|████▎     | 123/289 [01:33<02:05,  1.32it/s]

Training loop 123
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ..., 4349, 8484,  102],
        [ 101, 2029, 4942,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2602578401565552, logits - tensor([[-3.7210, -1.6814, -2.5981,  0.9705],
        [-4.3586,  0.3482, -3.2764, -0.7766],
        [-4.5714,  0.4034, -2.8417, -1.1140],
        [-2.1766, -0.8516, -0.0422, -1.2256],
        [-4.8767, -0.4721, -2.0494, -0.9

 43%|████▎     | 124/289 [01:34<02:04,  1.32it/s]

Training loop 124
tensor([[ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2339, 2027,  ...,    0,    0,    0],
        [ 101, 2029, 2653,  ...,    0,    0,    0],
        [ 101, 2054, 2897,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3685789704322815, logits - tensor([[-2.4010, -1.8034,  0.9265, -1.2189],
        [-3.5713, -0.4564, -1.5103, -0.9997],
        [-3.9289, -0.7425, -2.2737, -1.1255],
        [-4.8981,  1.4268, -3.6116, -1.4990],
        [-4.5764,  1.1727, -3.3461, -1.4

 43%|████▎     | 125/289 [01:34<02:03,  1.33it/s]

Training loop 125
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  5754,  ...,     0,     0,     0],
        [  101,  2029, 15756,  ...,     0,     0,     0],
        ...,
        [  101,  2339,  2515,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22756749391555786, logits - tensor([[-4.6823,  0.7648, -3.1751, -1.9929],
        [-5.0439,  1.7184, -3.1507, -1.6979],
        [-4.8344,  1.7170, -3.3381, -1.5052],
        [-5.5206,  1.5812, -3.5239, -1.6531],
   

 44%|████▎     | 126/289 [01:35<02:02,  1.33it/s]

Training loop 126
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2054, 2944,  ...,    0,    0,    0],
        [ 101, 2054, 4800,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3746846914291382, logits - tensor([[-4.5398,  1.5322, -3.2724, -1.6011],
        [-5.0612,  0.6296, -3.3645, -1.2776],
        [-1.8892, -1.9796,  1.8971, -1.8207],
        [-4.5485,  1.2957, -3.9333, -1.4374],
        [-5.3370,  1.3225, -3.9250, -2.1

 44%|████▍     | 127/289 [01:36<02:01,  1.33it/s]

Training loop 127
tensor([[  101,  2003, 10709,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2488,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  6254,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ..., 19962,  1009,   102],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45498591661453247, logits - tensor([[-3.1864, -0.8474, -0.2043, -1.5615],
        [-4.2967, -0.1825, -2.2731, -0.8023],
        [-5.0975,  0.9625, -4.0359, -1.5838],
        [-5.4770,  1.5827, -3.2499, -1.6103],
   

 44%|████▍     | 128/289 [01:37<02:01,  1.33it/s]

Training loop 128
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,  2168, 21641,   102],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3379128575325012, logits - tensor([[-4.7711e+00,  6.7489e-01, -2.6212e+00, -1.5897e+00],
        [-2.4702e+00, -1.4926e+00,  6.2827e-01, -1.5396e+00],
        [-4.5418e+00,  9.2228e-01, -3.1624e+00, -1.4765e+00],
  

 45%|████▍     | 129/289 [01:37<02:00,  1.33it/s]

Training loop 129
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2029, 12046,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 16105,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2556264102458954, logits - tensor([[-5.2097,  1.3319, -3.3386, -0.5664],
        [-5.4520,  0.8571, -3.1167, -1.6692],
        [-4.9410,  1.8924, -4.3235, -2.0482],
        [-4.7038,  0.7867, -2.6997, -1.3945],
    

 45%|████▍     | 130/289 [01:38<02:00,  1.32it/s]

Training loop 130
tensor([[  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ..., 12170, 13578,   102],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2515, 24665,  ...,     0,     0,     0],
        [  101,  2054,  8377,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1885446012020111, logits - tensor([[-5.0344,  0.7818, -2.6616, -1.6728],
        [-4.9385,  0.0676, -3.5762, -0.6302],
        [-4.2972, -0.7036, -2.8540,  0.4921],
        [-5.0147,  0.2460, -3.3505, -1.2874],
    

 45%|████▌     | 131/289 [01:39<01:59,  1.33it/s]

Training loop 131
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17300504446029663, logits - tensor([[-3.5401, -0.5144, -0.2561, -1.6642],
        [-4.4274, -1.8221, -3.0355,  1.4378],
        [-2.8288, -1.1864,  0.9459, -1.1991],
        [-4.8383,  1.3555, -4.0908, -1.1138],
        [-4.8200,  1.9635, -3.2583, -1.

 46%|████▌     | 132/289 [01:40<01:58,  1.33it/s]

Training loop 132
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2079,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2735,  ...,     0,     0,     0],
        [  101,  2054, 13588,  ...,  2095,  1997,   102],
        [  101,  2029,  4294,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1996915489435196, logits - tensor([[-4.7158,  1.3685, -2.7481, -1.0337],
        [-5.8275,  0.3804, -3.0183, -1.5727],
        [-4.8551,  0.6736, -3.5189, -1.5340],
        [-5.2820,  1.3012, -3.2545, -1.7161],
    

 46%|████▌     | 133/289 [01:40<01:58,  1.32it/s]

Training loop 133
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 8107,  ...,    0,    0,    0],
        [ 101, 2515, 2023,  ...,    0,    0,    0],
        [ 101, 2003, 2023,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 46%|████▋     | 134/289 [01:41<01:57,  1.32it/s]

loss - 0.325959175825119, logits - tensor([[-2.9655, -1.5523,  0.1241, -2.2580],
        [-3.9964,  1.4606, -2.6198, -1.3370],
        [-5.2218,  1.8907, -3.7137, -2.0203],
        [-5.3463,  1.0433, -3.8279, -1.5904],
        [-5.3293,  0.7094, -3.0757, -1.0756],
        [-5.0946,  0.8501, -3.3351, -1.5944],
        [-2.4422, -2.0020,  1.5100, -1.8604],
        [-2.2827, -2.2306,  2.0665, -2.2087]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 134
tensor([[  101,  2029,  2944,  ...,     0,     0,     0],
        [  101,  2029,  2048,  ...,     0,     0,     0],
        [  101,  2029,  2416,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054, 20062,  ...,  1037,  1010,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 

 47%|████▋     | 135/289 [01:42<01:57,  1.31it/s]

Training loop 135
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2024, 2037,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20894911885261536, logits - tensor([[-4.4632,  1.5331, -2.4981, -1.9271],
        [-5.2064,  1.7552, -3.1460, -1.9984],
        [-5.5713,  0.8194, -3.4499, -1.9802],
        [-4.4651,  0.9252, -2.1103, -1.6921],
        [-4.7521,  1.3170, -2.5522, -1.

 47%|████▋     | 136/289 [01:43<01:56,  1.31it/s]

Training loop 136
tensor([[ 101, 2054, 4725,  ..., 2344, 2000,  102],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.477866530418396, logits - tensor([[-4.8096,  0.5643, -3.0982, -1.3117],
        [-3.9111, -1.1818, -4.0728,  0.0641],
        [-3.0244, -1.1337,  1.5305, -2.3758],
        [-4.5740,  2.0216, -3.4754, -1.8175],
        [-2.5918, -1.7422,  1.8360, -2.03

 47%|████▋     | 137/289 [01:43<01:56,  1.31it/s]

Training loop 137
tensor([[  101,  2054,  6747,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3952009975910187, logits - tensor([[-4.3543,  1.7548, -3.4103, -2.0989],
        [-4.4963,  1.3608, -4.1795, -1.6115],
        [-5.2442,  1.4164, -3.6269, -1.7773],
        [-4.6299,  0.5474, -3.6371, -0.6108],
    

 48%|████▊     | 138/289 [01:44<01:55,  1.31it/s]

Training loop 138
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2070,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2268776148557663, logits - tensor([[-4.4509, -1.8471, -3.7908,  1.6618],
        [-2.7796, -1.2707,  0.4963, -1.7664],
        [-4.9123,  2.0599, -3.6298, -2.4067],
        [-5.0306,  1.9686, -3.5972, -1.7067],
        [-2.5292, -1.4098,  1.5749, -1.7

 48%|████▊     | 139/289 [01:45<01:54,  1.31it/s]

Training loop 139
tensor([[ 101, 2073, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2048,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1191614642739296, logits - tensor([[-3.9785,  0.0392, -1.4069, -2.2940],
        [-5.0052,  1.4053, -2.3807, -1.4891],
        [-4.5579,  1.4830, -2.9081, -2.0262],
        [-5.7159,  0.9838, -3.2851, -1.7726],
        [-4.9682,  2.1679, -3.7631, -1.6

 48%|████▊     | 140/289 [01:46<01:53,  1.32it/s]

Training loop 140
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2011, 2129,  ..., 2897, 1006,  102],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25283506512641907, logits - tensor([[-4.9116,  1.0981, -3.1145, -0.1246],
        [-5.6578,  2.9658, -3.8168, -1.8518],
        [-4.9549,  0.8207, -3.8477, -1.0393],
        [-4.1960,  0.8146, -2.9121, -1.5092],
        [-5.0425,  1.8190, -3.7142, -1.

 49%|████▉     | 141/289 [01:46<01:52,  1.32it/s]

Training loop 141
tensor([[ 101, 2054, 7461,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 5337,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ..., 1997, 1996,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41128918528556824, logits - tensor([[-5.6623,  1.8154, -3.3106, -2.2996],
        [-2.5345, -3.1449,  2.5415, -2.5341],
        [-4.6846,  1.7303, -3.5343, -1.9116],
        [-4.4384, -0.0356, -1.6993, -2.6183],
        [-2.4147, -2.3847,  2.3399, -2.

 49%|████▉     | 142/289 [01:47<01:51,  1.32it/s]

Training loop 142
tensor([[  101,  2054, 27024,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33645349740982056, logits - tensor([[-4.1484,  1.2244, -3.0693, -1.6374],
        [-4.2307, -0.0680, -3.1334, -0.1037],
        [-4.6017,  1.7887, -3.9746, -2.4794],
        [-3.9228,  0.2496, -3.0289, -0.1983],
   

 49%|████▉     | 143/289 [01:48<01:50,  1.32it/s]

Training loop 143
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15717713534832, logits - tensor([[-5.2623,  1.5309, -3.2691, -1.9599],
        [-5.2412,  1.5882, -3.5265, -1.7633],
        [-4.9708,  2.0462, -3.3902, -1.3403],
        [-3.7739,  1.0050, -3.1707, -0.6427],
        [-4.6448,  1.4103, -3.5050, -1.593

 50%|████▉     | 144/289 [01:49<01:49,  1.32it/s]

Training loop 144
tensor([[ 101, 2129, 2020,  ...,    0,    0,    0],
        [ 101, 2129, 2146,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 2698,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14378465712070465, logits - tensor([[-5.1063,  1.0680, -3.2351, -0.9262],
        [-4.6197, -0.0208, -3.1819,  0.5223],
        [-4.5436,  1.8213, -3.5185, -1.9211],
        [-3.1183, -2.1247,  1.7767, -1.5203],
        [-3.0383, -0.6608, -3.0616,  1.

 50%|█████     | 145/289 [01:49<01:48,  1.32it/s]

Training loop 145
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2785,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2367,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2914455235004425, logits - tensor([[-5.4088,  1.1415, -3.4007, -1.6548],
        [-6.0014,  1.5468, -3.9002, -1.8158],
        [-5.0813,  1.3249, -2.9441, -1.5704],
        [-4.7824,  1.0949, -3.7971, -1.4002],
        [-4.0165, -0.6436, -3.4889,  0.2

 51%|█████     | 146/289 [01:50<01:48,  1.32it/s]

Training loop 146
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 4773,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2054, 7885,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11854878067970276, logits - tensor([[-2.1287, -2.7966,  3.3066, -2.5429],
        [-4.4416, -0.5512, -3.3524,  0.4051],
        [-5.0725,  1.5400, -4.0334, -1.4030],
        [-4.9318,  1.1926, -2.9541, -2.1890],
        [-4.5884,  1.6838, -4.1597, -2.

 51%|█████     | 147/289 [01:51<01:47,  1.32it/s]

Training loop 147
tensor([[  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,  1997, 17953,   102],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,  2006,  2087,   102],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  5107,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25260263681411743, logits - tensor([[-4.9108, -1.1476, -4.5453,  1.4863],
        [-4.2562,  0.6366, -3.5895,  0.4962],
        [-4.2127,  0.6846, -2.0673, -1.1471],
        [-5.3143,  1.3340, -3.5668, -1.6045],
   

 51%|█████     | 148/289 [01:52<01:47,  1.31it/s]

Training loop 148
tensor([[ 101, 2071, 2017,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 7860,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 7861,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12199892848730087, logits - tensor([[-3.1346, -2.4933,  1.4287, -2.4764],
        [-5.5678,  1.1924, -3.6645, -1.1272],
        [-4.5354,  1.0619, -3.3936, -1.2799],
        [-3.7432,  1.5130, -3.0668, -1.2035],
        [-5.5454,  1.8576, -3.6749, -1.

 52%|█████▏    | 149/289 [01:53<01:46,  1.32it/s]

Training loop 149
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 12719,  ...,  2890,  2546,   102],
        [  101,  2054,  4275,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3459552526473999, logits - tensor([[-5.2818,  1.4224, -3.5656, -2.2324],
        [-5.3530,  0.7788, -3.1276, -1.6308],
        [-5.2760,  0.9027, -2.9780, -1.4317],
        [-5.3974,  0.6732, -4.1688, -1.6785],
    

 52%|█████▏    | 150/289 [01:53<01:45,  1.32it/s]

Training loop 150
tensor([[  101,  2129,  2024,  ...,  1051,  4492,   102],
        [  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2054, 11433,  ...,  3278, 12157,   102],
        ...,
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2029, 13931,  ...,     0,     0,     0],
        [  101,  2054,  2004,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4566183090209961, logits - tensor([[-4.6426,  0.8053, -2.7190, -1.2435],
        [-2.5043, -3.0178,  2.7390, -1.5200],
        [-5.4762,  1.2716, -2.7897, -1.3884],
        [-3.8326,  1.3634, -2.6321, -1.6928],
    

 52%|█████▏    | 151/289 [01:54<01:44,  1.32it/s]

Training loop 151
tensor([[ 101, 2029, 4563,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 5981,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2944,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2627556324005127, logits - tensor([[-4.9596,  2.1397, -4.1713, -2.1417],
        [-4.1014,  0.9408, -2.5831, -1.4462],
        [-4.1823, -0.8535, -4.1225,  1.3541],
        [-4.9061,  1.4762, -3.6340, -1.8572],
        [-5.3905,  0.9599, -3.2870, -1.7

 53%|█████▎    | 152/289 [01:55<01:43,  1.32it/s]

Training loop 152
tensor([[  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2024,  2151,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  7861,  ...,     0,     0,     0],
        [  101,  2515,  4748,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34004876017570496, logits - tensor([[-5.3093,  1.8357, -3.9451, -2.8046],
        [-2.2684, -3.0963,  2.8035, -1.8732],
        [-2.7259, -3.4292,  2.2354, -1.3138],
        [-4.9304,  1.5913, -2.9357, -1.8025],
   

 53%|█████▎    | 153/289 [01:56<01:42,  1.32it/s]

Training loop 153
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,  6026, 11834,   102],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  3176,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20631739497184753, logits - tensor([[-5.7060,  1.8037, -4.7467, -1.4222],
        [-4.5759,  0.6134, -3.3579, -1.4962],
        [-5.5766,  0.9517, -3.3465, -1.6899],
        [-1.7108, -2.9918,  1.7248, -1.3674],
   

 53%|█████▎    | 154/289 [01:56<01:42,  1.32it/s]

Training loop 154
tensor([[ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.488764226436615, logits - tensor([[-4.4122,  1.4843, -3.6146, -1.8293],
        [-4.6597,  1.6212, -3.4413, -1.8312],
        [-5.1400,  1.9333, -4.0466, -0.9539],
        [-6.3477,  1.0835, -4.5063, -1.2638],
        [-5.4682,  0.9550, -3.2233, -2.04

 54%|█████▎    | 155/289 [01:57<01:41,  1.32it/s]

Training loop 155
tensor([[ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2003, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 2529,  ..., 2058, 2035,  102],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16234147548675537, logits - tensor([[-4.4414,  1.3056, -4.3106, -1.4357],
        [-2.7189, -2.4530,  1.6406, -1.9000],
        [-4.9256,  0.6947, -3.9111, -1.5098],
        [-4.7920,  0.3233, -2.4336, -1.6361],
        [-4.9607,  1.3560, -3.6015, -1.

 54%|█████▍    | 156/289 [01:58<01:40,  1.32it/s]

Training loop 156
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  3653,  ...,     0,     0,     0],
        ...,
        [  101,  1059, 14949,  ...,     0,     0,     0],
        [  101,  2054,  8107,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4790741503238678, logits - tensor([[-3.6696, -2.3032, -3.6980,  1.4377],
        [-4.2891, -1.4879, -2.7042,  2.0091],
        [-4.5458,  1.4707, -2.8821, -1.4340],
        [-4.7800,  1.1117, -4.0372, -1.9504],
    

 54%|█████▍    | 157/289 [01:59<01:40,  1.31it/s]

Training loop 157
tensor([[ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        ...,
        [ 101, 2073, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 2048,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36901283264160156, logits - tensor([[-4.3740,  1.6788, -3.9526, -1.5905],
        [-5.5615,  2.0278, -4.3002, -0.8112],
        [-4.5166,  1.2010, -2.9751, -1.4725],
        [-3.3195, -1.8548, -3.2460,  1.0881],
        [-5.1772,  1.7801, -3.0767, -1.

 55%|█████▍    | 158/289 [01:59<01:39,  1.32it/s]

Training loop 158
tensor([[ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2129, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.215627059340477, logits - tensor([[-4.7081, -2.4791, -3.9361,  1.2243],
        [-5.1615,  2.1617, -3.8232, -1.7583],
        [-4.9561,  0.8584, -3.7478, -1.5125],
        [-4.1680, -1.1272, -3.0416,  1.2506],
        [-2.1900, -2.1748,  1.2315, -1.44

 55%|█████▌    | 159/289 [02:00<01:38,  1.32it/s]

Training loop 159
tensor([[  101,  2054,  4294,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ..., 13103,  6922,   102],
        ...,
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2054,  2060,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30419886112213135, logits - tensor([[-5.0367,  0.5875, -3.5553, -1.6796],
        [-5.0292,  1.3562, -4.2456, -1.5496],
        [-5.1456,  0.3625, -3.5479, -0.7704],
        [-2.9736, -2.0500,  1.2251, -1.5137],
   

 55%|█████▌    | 160/289 [02:01<01:37,  1.32it/s]

Training loop 160
tensor([[  101,  2029,  3086,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2024, 22594,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14408957958221436, logits - tensor([[-5.5993,  2.0515, -3.3166, -1.4453],
        [-5.3344,  1.1443, -4.1925, -0.7756],
        [-5.6942,  1.2613, -3.3603, -0.9151],
        [-5.1404,  1.7921, -4.0925, -0.9071],
   

 56%|█████▌    | 161/289 [02:02<01:37,  1.31it/s]

Training loop 161
tensor([[ 101, 2054, 2465,  ..., 2344, 2000,  102],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 2330,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 6388,  ...,    0,    0,    0],
        [ 101, 2031, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4248327612876892, logits - tensor([[-4.8197,  0.9282, -3.2716, -0.8208],
        [-5.4603,  2.4456, -4.1320, -1.1397],
        [-5.3763,  1.7036, -4.4980, -2.2679],
        [-5.7060,  1.4937, -4.2371, -1.2519],
        [-5.1447,  0.7746, -4.0422, -1.2

 56%|█████▌    | 162/289 [02:02<01:36,  1.32it/s]

Training loop 162
tensor([[ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ...,    0,    0,    0],
        ...,
        [ 101, 1999, 2054,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21831203997135162, logits - tensor([[-4.6986,  0.9291, -3.1844, -1.1840],
        [-4.9348,  1.8052, -3.5488, -1.2487],
        [-4.1894,  1.7561, -3.1959, -1.0172],
        [-4.9048,  0.9499, -4.3373, -1.1880],
        [-4.3347,  1.6351, -3.1451, -1.

 56%|█████▋    | 163/289 [02:03<01:35,  1.33it/s]

Training loop 163
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2125,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2653,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25545573234558105, logits - tensor([[-4.1321,  0.5323, -3.3370, -1.1963],
        [-3.8184, -2.6900, -0.0636, -1.0847],
        [-5.7429,  1.6562, -4.5192, -1.6953],
        [-5.5690,  1.6613, -3.9926, -1.5150],
        [-5.5069,  1.9737, -4.5130, -1.

 57%|█████▋    | 164/289 [02:04<01:34,  1.33it/s]

Training loop 164
tensor([[  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38839247822761536, logits - tensor([[-5.5649,  0.7327, -3.5608, -0.2823],
        [-4.7576,  2.3300, -4.1088, -1.9608],
        [-4.7677, -1.3547, -3.6137,  0.8802],
        [-5.2140,  0.7515, -3.9957, -1.2008],
   

 57%|█████▋    | 165/289 [02:05<01:33,  1.33it/s]

Training loop 165
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2029,  3698,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25180232524871826, logits - tensor([[-5.0762,  1.6142, -3.7845, -0.8787],
        [-4.9964,  1.0715, -4.1448, -0.6225],
        [-5.0228,  1.1236, -3.2563, -0.6212],
        [-2.7979, -3.2274,  2.0526, -1.8375],
   

 57%|█████▋    | 166/289 [02:05<01:32,  1.33it/s]

Training loop 166
tensor([[  101,  2129,  2001,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2003, 10488,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2488,  ...,     0,     0,     0],
        [  101,  2054,  3084,  ...,     0,     0,     0],
        [  101,  2029,  9312,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3671301305294037, logits - tensor([[-5.9786,  1.7687, -3.9355, -0.9703],
        [-3.1553, -2.4809,  0.4240, -1.5846],
        [-4.6676, -1.6102,  0.3492, -1.7590],
        [-5.1869,  0.5014, -3.7992, -0.9988],
    

 58%|█████▊    | 167/289 [02:06<01:31,  1.33it/s]

Training loop 167
tensor([[ 101, 2029, 2061,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ..., 2008, 7236,  102],
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2024, 2045,  ...,    0,    0,    0],
        [ 101, 2054, 2785,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5479092597961426, logits - tensor([[-5.0346,  1.0358, -2.7639, -0.2513],
        [-4.6681,  1.0075, -3.5599, -0.6010],
        [-4.7635, -2.7809, -3.4447,  1.5990],
        [-4.7748,  0.7467, -3.4666, -0.9193],
        [-5.3170,  1.5962, -3.2131, -1.4

 58%|█████▊    | 168/289 [02:07<01:30,  1.33it/s]

Training loop 168
tensor([[  101,  2071,  2017,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2129,  2488,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1824173629283905, logits - tensor([[-3.5774, -3.1808, -0.2321,  0.3190],
        [-5.9276,  0.9287, -3.9097, -0.7694],
        [-3.4053, -2.0088,  1.4596, -2.2970],
        [-6.2621,  0.8165, -4.1794, -1.4559],
    

 58%|█████▊    | 169/289 [02:08<01:29,  1.33it/s]

Training loop 169
tensor([[  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ..., 10719, 10697,   102],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  4155,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3499930500984192, logits - tensor([[-4.6011,  0.8631, -4.0476, -1.9288],
        [-4.7743, -0.1676, -3.2754, -0.2399],
        [-3.9704,  1.2003, -3.7862, -1.0208],
        [-5.7739,  1.0729, -4.5556, -0.5976],
    

 59%|█████▉    | 170/289 [02:08<01:29,  1.33it/s]

Training loop 170
tensor([[  101,  2339,  5796,  ...,     0,     0,     0],
        [  101,  2054, 16134,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28371310234069824, logits - tensor([[-5.0169,  2.0236, -4.0351, -0.0172],
        [-4.9822,  0.7796, -4.0194, -0.4427],
        [-5.3539,  0.2960, -3.3337, -0.2097],
        [-5.1656,  0.9319, -4.0940, -1.4143],
   

 59%|█████▉    | 171/289 [02:09<01:28,  1.33it/s]

Training loop 171
tensor([[ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 7860,  ...,    0,    0,    0],
        ...,
        [ 101, 2323, 2037,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23956668376922607, logits - tensor([[-3.5461, -1.8974, -2.8617,  1.3240],
        [-5.2301,  0.1918, -3.2430,  0.3262],
        [-4.9717,  1.0611, -3.8633, -1.0548],
        [-5.6845,  1.0205, -4.1919, -0.3327],
        [-6.0048,  1.2602, -4.1611, -0.

 60%|█████▉    | 172/289 [02:10<01:27,  1.34it/s]

Training loop 172
tensor([[ 101, 2029, 2653,  ...,    0,    0,    0],
        [ 101, 2054, 7885,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 7060,  ..., 2270, 5193,  102],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3488752841949463, logits - tensor([[-4.7270, -0.0923, -3.0543, -0.6909],
        [-5.5017,  1.1605, -3.9992, -1.1318],
        [-2.4767, -2.8712,  2.1641, -2.2061],
        [-5.4957,  0.4176, -4.2247, -1.0129],
        [-4.2059,  1.0037, -3.3927,  0.0

 60%|█████▉    | 173/289 [02:11<01:26,  1.34it/s]

Training loop 173
tensor([[ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2029, 2773,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3031541705131531, logits - tensor([[-3.7612, -3.3734,  0.7216, -0.5606],
        [-5.4959,  0.5030, -4.1782, -0.4257],
        [-5.0427,  0.7191, -3.4107, -0.1623],
        [-4.5759,  0.6473, -4.6680, -0.9958],
        [-5.7407,  0.5119, -5.0110, -0.6

 60%|██████    | 174/289 [02:11<01:25,  1.34it/s]

Training loop 174
tensor([[  101,  2054, 15066,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,  2946,  1012,   102],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2834744155406952, logits - tensor([[-5.8193e+00,  4.3433e-01, -3.9758e+00, -9.0771e-01],
        [-5.9516e+00,  7.0870e-01, -4.1743e+00, -7.9012e-01],
        [-5.3359e+00,  1.1004e+00, -3.9962e+00, -5.7367e-01],
  

 61%|██████    | 175/289 [02:12<01:25,  1.33it/s]

Training loop 175
tensor([[  101,  2006, 10474,  ...,     0,     0,     0],
        [  101,  2054,  7551,  ...,  2217,  2515,   102],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054,  2512,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16748389601707458, logits - tensor([[-3.4322, -2.4788,  2.0761, -2.1190],
        [-4.7532,  0.5721, -3.5653, -0.8073],
        [-5.7160,  0.8692, -4.1063, -1.5419],
        [-5.4931,  1.3122, -4.0546, -0.3688],
   

 61%|██████    | 176/289 [02:13<01:25,  1.32it/s]

Training loop 176
tensor([[  101,  2003,  2023,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2429,  2000,  ...,     0,     0,     0],
        [  101,  2029, 18224,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1383751481771469, logits - tensor([[-2.8255, -2.3436,  1.6769, -1.8213],
        [-2.8719, -1.8051,  1.0271, -1.1195],
        [-4.3547, -2.1643, -3.1081,  1.5805],
        [-3.4191, -2.4191,  2.4583, -1.7106],
    

 61%|██████    | 177/289 [02:14<01:24,  1.32it/s]

Training loop 177
tensor([[  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,  1996,  3737,   102],
        [  101,  2129,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23994356393814087, logits - tensor([[-4.0579,  1.2255, -4.2733, -0.7280],
        [-3.9957, -1.9278, -3.6682,  1.7924],
        [-4.8027,  1.1709, -3.8861, -1.0949],
        [-2.4457, -2.5559,  1.2430, -2.6055],
   

 62%|██████▏   | 178/289 [02:14<01:24,  1.32it/s]

Training loop 178
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2838,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2129, 5656,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2251177430152893, logits - tensor([[-4.9353,  1.4975, -3.9035, -1.5797],
        [-4.6968,  1.2736, -4.0596, -0.7900],
        [-4.6959,  1.2568, -3.3354, -1.8256],
        [-4.9499,  1.3623, -2.8078, -0.8629],
        [-4.5369,  1.0357, -2.9998, -1.6

 62%|██████▏   | 179/289 [02:15<01:23,  1.32it/s]

Training loop 179
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2027,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 6847,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28519153594970703, logits - tensor([[-3.8900, -2.4297,  2.0112, -2.2727],
        [-5.9388,  1.6300, -5.2467, -1.1219],
        [-3.7310, -2.6106,  2.0118, -2.1881],
        [-4.4474, -2.6131, -3.0497,  1.4369],
        [-4.5324,  1.3125, -3.0810, -1.

 62%|██████▏   | 180/289 [02:16<01:22,  1.32it/s]

Training loop 180
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  3698,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2024,  1996,  ...,     0,     0,     0],
        [  101,  2054, 11541,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17020195722579956, logits - tensor([[-4.8775,  1.9526, -4.4082, -1.9717],
        [-4.8770,  1.6748, -3.2836, -1.5893],
        [-6.0721,  0.9831, -4.6440, -1.2947],
        [-5.1299,  1.7860, -4.6338, -1.6618],
   

 63%|██████▎   | 181/289 [02:17<01:21,  1.32it/s]

Training loop 181
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2759,  ...,    0,    0,    0],
        [ 101, 2029, 2048,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3104891777038574, logits - tensor([[-4.7275, -2.0960, -3.4384,  1.6490],
        [-4.5630,  1.0930, -3.1003, -1.6647],
        [-4.4578,  1.1038, -3.2113, -0.9951],
        [-5.5141,  1.7170, -4.2407, -1.5970],
        [-5.0480, -0.7445, -3.4647,  0.8

 63%|██████▎   | 182/289 [02:17<01:20,  1.32it/s]

Training loop 182
tensor([[  101,  2038, 26261,  ...,     0,     0,     0],
        [  101,  2515,  2037,  ...,     0,     0,     0],
        [  101,  2054,  9312,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029,  2944,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22018267214298248, logits - tensor([[-3.5549, -2.8779,  2.0647, -2.4719],
        [-3.2933, -2.3105,  1.7244, -2.2116],
        [-4.9899,  1.2171, -3.5999, -0.8436],
        [-3.3278, -1.8679, -3.7552,  1.8733],
   

 63%|██████▎   | 183/289 [02:18<01:19,  1.33it/s]

Training loop 183
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 3431,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3360389173030853, logits - tensor([[-4.0497, -2.0600, -3.6148,  0.7178],
        [-5.0546,  1.0416, -3.9133, -1.6397],
        [-4.4797,  1.2316, -2.9698, -1.2713],
        [-5.7466,  1.4772, -4.9420, -1.4100],
        [-5.0111,  1.9597, -3.5901, -1.0

 64%|██████▎   | 184/289 [02:19<01:18,  1.33it/s]

Training loop 184
tensor([[  101,  2029,  7708,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054, 12247,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10020739585161209, logits - tensor([[-4.7547,  2.0114, -4.1619, -2.0953],
        [-5.0865,  0.8047, -2.9977, -1.6091],
        [-5.4146,  1.6218, -4.5688, -1.8980],
        [-5.0227,  0.9772, -4.6228, -2.1608],
   

 64%|██████▍   | 185/289 [02:20<01:17,  1.33it/s]

Training loop 185
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 4512,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2024, 2653,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32976335287094116, logits - tensor([[-5.5397,  2.1633, -4.3444, -3.2096],
        [-5.6799,  2.2123, -4.4528, -2.2503],
        [-4.7422, -2.5369, -4.4272,  1.4259],
        [-5.6978,  1.4247, -4.1211, -1.8487],
        [-4.1975, -0.9316,  0.6106, -2.

 64%|██████▍   | 186/289 [02:20<01:17,  1.33it/s]

Training loop 186
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2073,  2515,  ...,  1010,  1040,   102],
        [  101,  2029,  2060,  ...,  3207, 23460,   102],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2003,  3793,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5527716279029846, logits - tensor([[-5.8745,  2.5642, -5.0244, -1.4636],
        [-5.1693,  1.5019, -3.8929, -0.7443],
        [-5.0030, -0.1267, -3.5269, -0.0988],
        [-3.2089, -3.0409,  1.0899, -2.5016],
    

 65%|██████▍   | 187/289 [02:21<01:16,  1.34it/s]

Training loop 187
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2029, 2731,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09047961980104446, logits - tensor([[-4.0399, -2.8215, -3.3403,  2.1945],
        [-5.4110,  1.8250, -4.1586, -1.3075],
        [-4.4912,  1.2562, -3.4436, -1.6314],
        [-5.6012,  2.2748, -4.6478, -2.3349],
        [-5.0298,  1.6803, -4.0439, -1.

 65%|██████▌   | 188/289 [02:22<01:15,  1.34it/s]

Training loop 188
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2006, 2054,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 1013,  ...,    0,    0,    0],
        [ 101, 2054, 2465,  ...,    0,    0,    0],
        [ 101, 2029, 3698,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1245877742767334, logits - tensor([[-5.2679,  1.1655, -3.9899, -1.8215],
        [-4.2967, -0.6936, -3.5618,  0.1986],
        [-5.1811,  1.6919, -3.5227, -1.6339],
        [-5.1515,  0.5432, -3.6559, -1.0509],
        [-4.2708, -2.0712, -3.6814,  1.9

 65%|██████▌   | 189/289 [02:23<01:14,  1.34it/s]

Training loop 189
tensor([[ 101, 2024, 7885,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2092,  ..., 1037, 3618,  102],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15373355150222778, logits - tensor([[-2.8743, -1.1580,  1.4034, -1.2818],
        [-3.4285, -3.5899,  2.6695, -1.8859],
        [-3.0985, -2.5066,  2.0819, -2.5723],
        [-3.5877, -2.0761,  1.2384, -2.0151],
        [-6.0931,  1.6493, -4.2770, -2.

 66%|██████▌   | 190/289 [02:23<01:13,  1.34it/s]

Training loop 190
tensor([[  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054, 13931,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  3599,  ...,     0,     0,     0],
        [  101,  2054,  2739,  ...,     0,     0,     0],
        [  101,  2129,  2106,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42919057607650757, logits - tensor([[-4.8682, -1.6691, -4.1065,  2.4250],
        [-4.9511,  1.1618, -4.0733, -2.5158],
        [-4.7770,  2.3046, -4.4049, -2.5535],
        [-5.6957,  0.9503, -3.2964, -1.0451],
   

 66%|██████▌   | 191/289 [02:24<01:13,  1.34it/s]

Training loop 191
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 4275,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2048,  ...,    0,    0,    0],
        [ 101, 2129, 2146,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21231701970100403, logits - tensor([[-4.6917,  1.6296, -3.6467, -1.8608],
        [-4.4179, -1.5042, -3.5487,  0.9790],
        [-5.6002,  2.3827, -4.3688, -2.1154],
        [-3.5539, -2.0517,  1.6683, -2.6276],
        [-4.6285,  1.6306, -4.3251, -2.

 66%|██████▋   | 192/289 [02:25<01:12,  1.34it/s]

Training loop 192
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  4127,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2024, 11345,  ...,     0,     0,     0],
        [  101,  2029,  9942,  ...,     0,     0,     0],
        [  101,  2005,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5865799784660339, logits - tensor([[-4.2948, -2.2912, -4.5322,  2.0839],
        [-5.5020,  2.4959, -4.5777, -2.6992],
        [-5.3790,  2.5389, -4.5736, -1.8644],
        [-5.7043,  1.7953, -3.8873, -1.8841],
    

 67%|██████▋   | 193/289 [02:26<01:11,  1.34it/s]

Training loop 193
tensor([[ 101, 2339, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 4708,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 1999,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11370159685611725, logits - tensor([[-5.0955,  0.5523, -4.5475, -1.4533],
        [-4.5516,  1.7636, -3.5007, -1.1100],
        [-3.4761, -3.1212,  2.3585, -2.6390],
        [-4.9556,  1.9360, -4.1743, -0.9585],
        [-5.1500,  0.6607, -4.0145, -1.

 67%|██████▋   | 194/289 [02:26<01:10,  1.35it/s]

Training loop 194
tensor([[ 101, 2003, 2045,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 4118,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10459282249212265, logits - tensor([[-4.4684, -1.9607,  0.8999, -2.1676],
        [-6.2994,  1.0822, -4.2920, -1.4661],
        [-5.4022,  2.3728, -4.8363, -1.9546],
        [-3.6300, -3.0672,  1.4070, -2.2464],
        [-3.9701, -2.9817,  1.6218, -2.

 67%|██████▋   | 195/289 [02:27<01:09,  1.35it/s]

Training loop 195
tensor([[ 101, 2054, 6882,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ..., 2015, 1012,  102],
        [ 101, 2054, 6123,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 68%|██████▊   | 196/289 [02:28<01:08,  1.35it/s]

loss - 0.42069944739341736, logits - tensor([[-4.9849,  1.0363, -3.9138, -1.2789],
        [-4.9213,  1.8119, -3.7673, -1.4354],
        [-5.6881, -1.9834, -3.2826,  2.2493],
        [-5.0367,  1.6299, -3.8619, -1.2401],
        [-5.3152,  1.1159, -4.5565, -1.9104],
        [-3.4589, -2.4623,  2.0451, -1.9100],
        [-4.1132, -1.6175,  0.4998, -1.8632],
        [-5.4906,  1.9279, -4.3570, -1.5449]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 196
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1,

 68%|██████▊   | 197/289 [02:29<01:08,  1.34it/s]

Training loop 197
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2982088625431061, logits - tensor([[-5.4437,  1.3534, -4.3490, -1.6828],
        [-5.1392,  1.0468, -3.5062, -2.2345],
        [-4.9417,  1.3779, -5.4717, -1.4408],
        [-3.9007, -2.9080,  1.7654, -3.0354],
        [-4.7837,  1.2484, -4.3128, -1.2

 69%|██████▊   | 198/289 [02:29<01:08,  1.34it/s]

Training loop 198
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2097,  ...,     0,     0,     0],
        [  101,  2006,  2029,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2029,  2653,  ...,     0,     0,     0],
        [  101,  2029,  6612,  ..., 12963,  5198,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26912546157836914, logits - tensor([[-3.4384, -1.9320, -3.4356,  1.7227],
        [-5.0724,  1.2110, -4.6895, -1.0591],
        [-4.7055,  1.2329, -4.0040, -0.9479],
        [-5.4121,  1.4458, -4.8404, -1.9843],
   

 69%|██████▉   | 199/289 [02:30<01:07,  1.33it/s]

Training loop 199
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 6801,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2809,  ...,    0,    0,    0],
        [ 101, 2515, 2023,  ...,    0,    0,    0],
        [ 101, 2054, 3015,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2545899450778961, logits - tensor([[-3.4439, -2.7914,  2.6568, -2.5858],
        [-3.9861, -3.0391,  1.8325, -2.7681],
        [-5.1915,  1.0819, -4.2613, -1.4602],
        [-5.3425,  1.4170, -4.0713, -0.4493],
        [-3.5647, -1.9762,  1.4147, -2.0

 69%|██████▉   | 200/289 [02:31<01:06,  1.33it/s]

Training loop 200
tensor([[ 101, 2054, 2944,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2081121802330017, logits - tensor([[-4.9530,  0.9686, -4.8832, -1.6443],
        [-6.0394,  1.5187, -4.0353, -0.6497],
        [-2.7359, -2.3375,  2.3669, -2.4615],
        [-5.4926,  1.2432, -4.7144, -1.4199],
        [-5.2170,  1.6499, -4.7165, -0.9

 70%|██████▉   | 201/289 [02:32<01:06,  1.33it/s]

Training loop 201
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2836,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 5887,  ...,    0,    0,    0],
        [ 101, 2054, 8467,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27584612369537354, logits - tensor([[-4.8628,  1.3930, -4.6736, -1.2199],
        [-4.8981,  1.5886, -4.2499, -1.0708],
        [-4.7503, -1.2893, -4.1406,  1.5375],
        [-4.8490,  1.5149, -4.2590, -0.8052],
        [-4.8979,  1.2932, -4.4265, -1.

 70%|██████▉   | 202/289 [02:32<01:05,  1.33it/s]

Training loop 202
tensor([[  101,  2054,  9312,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2339,  2106,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054, 10640,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28725215792655945, logits - tensor([[-5.4654,  1.0648, -5.0850, -1.2067],
        [-3.1892, -3.3859,  2.8268, -2.2999],
        [-5.6086,  1.5760, -4.3049, -0.1446],
        [-4.6150,  0.4277, -4.5671, -0.4783],
   

 70%|███████   | 203/289 [02:33<01:04,  1.33it/s]

Training loop 203
tensor([[ 101, 2054, 1005,  ..., 3588, 6251,  102],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 5876,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2029, 8518,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23503044247627258, logits - tensor([[-5.1180,  1.5481, -3.8681, -0.9487],
        [-6.1910,  1.4525, -4.9483, -0.6102],
        [-5.0234,  1.0523, -3.0925, -1.2518],
        [-5.4710,  1.0393, -4.2464, -0.5078],
        [-4.5389, -2.0238, -4.1575,  2.

 71%|███████   | 204/289 [02:34<01:03,  1.34it/s]

Training loop 204
tensor([[  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2029,  8789,  ...,     0,     0,     0],
        [  101,  2029,  2653,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054, 16913,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1579379439353943, logits - tensor([[-5.2042, -0.0846, -5.0085,  0.0757],
        [-6.5312,  1.6165, -5.0589, -1.3771],
        [-4.9575,  1.1197, -3.5935, -0.5419],
        [-4.0552, -3.8516,  2.8936, -1.3035],
    

 71%|███████   | 205/289 [02:35<01:02,  1.34it/s]

Training loop 205
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2003, 2023,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 6549,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13028612732887268, logits - tensor([[-4.4593,  1.1871, -4.5278, -0.4883],
        [-3.9030, -2.5854,  2.7943, -2.5372],
        [-5.8941,  1.5295, -4.4845, -1.0324],
        [-4.0157, -2.8341, -3.9641,  2.6687],
        [-5.5606,  0.6503, -4.0763, -0.

 71%|███████▏  | 206/289 [02:35<01:01,  1.34it/s]

Training loop 206
tensor([[ 101, 2029, 4275,  ...,    0,    0,    0],
        [ 101, 2054, 3025,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2047,  ...,    0,    0,    0],
        [ 101, 2003, 2037,  ...,    0,    0,    0],
        [ 101, 2054, 5579,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3524874448776245, logits - tensor([[-5.5848,  0.5535, -4.6288, -1.0313],
        [-5.4823,  0.5269, -4.2568, -0.1780],
        [-5.6905,  1.0413, -4.3265, -0.9738],
        [-5.2727,  0.0760, -5.2791, -1.4946],
        [-5.9692,  0.2293, -3.6053, -0.5

 72%|███████▏  | 207/289 [02:36<01:01,  1.34it/s]

Training loop 207
tensor([[  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2079,  2035,  ...,     0,     0,     0],
        [  101,  2003, 12801,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1748390793800354, logits - tensor([[-4.5930, -2.5712, -3.7229,  2.5928],
        [-3.4274, -2.9875,  1.9113, -1.8452],
        [-3.8999, -3.3630,  1.2900, -1.7063],
        [-5.3829,  0.9981, -4.9103, -0.4157],
    

 72%|███████▏  | 208/289 [02:37<01:00,  1.34it/s]

Training loop 208
tensor([[ 101, 2073, 2106,  ...,    0,    0,    0],
        [ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2029, 1997,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3293265104293823, logits - tensor([[-5.2423,  1.2247, -4.3432, -0.9457],
        [-4.1176, -2.2584,  1.6393, -2.5781],
        [-5.5630,  1.3174, -4.2562, -1.6663],
        [-5.4247,  1.1621, -4.9680, -0.7495],
        [-5.9703,  0.8665, -3.6302, -0.2

 72%|███████▏  | 209/289 [02:38<00:59,  1.33it/s]

Training loop 209
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2592,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 3787,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3175809979438782, logits - tensor([[-4.6619, -0.8982, -4.4069,  1.0778],
        [-5.3426,  1.0355, -4.7404, -0.5083],
        [-4.6851,  1.2322, -3.8125, -1.1750],
        [-4.5516, -1.9575, -3.1420,  2.5894],
        [-3.0984, -3.3910,  2.4133, -1.4

 73%|███████▎  | 210/289 [02:38<00:59,  1.34it/s]

Training loop 210
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2515, 5887,  ...,    0,    0,    0],
        [ 101, 2054, 7860,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36136144399642944, logits - tensor([[-5.0421,  1.1154, -3.9254, -0.6749],
        [-2.6904, -3.2580,  2.0883, -1.9899],
        [-5.9158,  1.6926, -4.5246, -0.7579],
        [-4.2965, -2.5042, -3.5330,  2.8706],
        [-5.2959,  1.0295, -4.8025, -0.

 73%|███████▎  | 211/289 [02:39<00:57,  1.35it/s]

Training loop 211
tensor([[ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2029, 2944,  ..., 2029, 2024,  102],
        [ 101, 2339, 2515,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 5109,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30174335837364197, logits - tensor([[-3.7567, -2.1087,  1.8366, -2.0913],
        [-5.2503,  0.2905, -4.3949, -0.3200],
        [-5.5077,  1.4363, -4.9930, -0.6513],
        [-4.5510, -2.1077, -4.1584,  2.6752],
        [-5.0377,  1.2177, -4.1134, -0.

 73%|███████▎  | 212/289 [02:40<00:57,  1.35it/s]

Training loop 212
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2515,  2037,  ...,     0,     0,     0],
        [  101,  2054, 15756,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34835708141326904, logits - tensor([[-6.4751,  1.2707, -4.9744, -0.5347],
        [-5.9071,  0.5167, -4.8744, -1.1335],
        [-5.4790, -0.0491, -4.8291, -1.2873],
        [-5.5369,  0.4855, -4.9928, -1.2623],
   

 74%|███████▎  | 213/289 [02:41<00:56,  1.34it/s]

Training loop 213
tensor([[  101,  2054,  2024,  ...,  1037,  2944,   102],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,  1998,  2382,   102],
        [  101,  2079, 22889,  ...,     0,     0,     0],
        [  101,  2054,  2512,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23331597447395325, logits - tensor([[-4.9074,  1.1302, -3.9796, -0.9768],
        [-5.5071,  1.4063, -5.7047, -1.1054],
        [-4.5165, -2.5042, -4.4761,  2.5301],
        [-4.5657,  0.4254, -4.1835, -0.3952],
   

 74%|███████▍  | 214/289 [02:41<00:55,  1.34it/s]

Training loop 214
tensor([[ 101, 2129, 2020,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 5754,  ...,    0,    0,    0],
        [ 101, 2054, 4487,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16157080233097076, logits - tensor([[-5.0425,  0.3337, -4.3020, -0.2861],
        [-5.6095,  0.7031, -4.4845, -0.3765],
        [-6.2085,  0.5286, -4.9249, -1.0036],
        [-4.3704, -3.1749,  2.1712, -2.8804],
        [-3.3314, -2.6070,  2.2956, -1.

 74%|███████▍  | 215/289 [02:42<00:55,  1.34it/s]

Training loop 215
tensor([[  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2029,  3485,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1413261890411377, logits - tensor([[-5.4824,  0.8717, -3.8695, -1.0578],
        [-5.7011,  1.1122, -4.3050, -1.3841],
        [-3.3224, -3.1386, -3.6249,  2.6168],
        [-5.4350,  1.1416, -5.2504, -1.2890],
    

 75%|███████▍  | 216/289 [02:43<00:54,  1.33it/s]

Training loop 216
tensor([[  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2029,  2653,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  5919,  ...,     0,     0,     0],
        [  101,  2054,  2785,  ...,     0,     0,     0],
        [  101,  2054, 10864,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37133073806762695, logits - tensor([[-5.5813,  1.1480, -6.0176, -1.9843],
        [-3.9736, -3.9767,  1.7361, -1.8938],
        [-4.7292,  1.4643, -4.6123, -1.5467],
        [-5.1914,  1.5034, -4.6239, -1.4831],
   

 75%|███████▌  | 217/289 [02:44<00:54,  1.33it/s]

Training loop 217
tensor([[ 101, 2054, 1005,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2785,  ...,    0,    0,    0],
        [ 101, 2029, 7861,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2653425335884094, logits - tensor([[-5.2645,  1.7231, -4.3660, -1.2218],
        [-6.1168,  0.4322, -4.6783, -1.6877],
        [-4.9384,  1.2900, -4.0826, -1.1818],
        [-5.6422,  1.1539, -4.6964, -1.1587],
        [-5.0927,  1.5829, -5.0437, -1.2

 75%|███████▌  | 218/289 [02:44<00:53,  1.33it/s]

Training loop 218
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054, 13100,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  1997,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  4127,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16974061727523804, logits - tensor([[-4.8055,  1.2784, -4.3706, -0.8613],
        [-4.7628,  1.4976, -5.5043, -1.6379],
        [-3.5286, -1.5627,  0.6702, -2.3022],
        [-5.1830, -1.4440, -1.1526, -2.3308],
   

 76%|███████▌  | 219/289 [02:45<00:52,  1.32it/s]

Training loop 219
tensor([[  101,  2129,  2024,  ...,  1043,  1006,   102],
        [  101,  2054, 20062,  ...,     0,     0,     0],
        [  101,  2003,  2944,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2024,  4738,  ..., 20284,  2072,   102],
        [  101,  2029,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 76%|███████▌  | 220/289 [02:46<00:52,  1.32it/s]

loss - 0.3814570903778076, logits - tensor([[-4.9413,  1.0079, -4.1631, -1.1334],
        [-4.9252,  1.5016, -4.1052, -1.4405],
        [-4.9052, -0.6563, -0.6264, -2.7180],
        [-4.4428,  1.8169, -4.4624, -1.0678],
        [-5.4824,  1.5734, -4.9639, -1.5844],
        [-5.3657,  1.0420, -5.1271, -0.7021],
        [-4.2328, -0.8407, -0.8225, -2.0381],
        [-5.8365,  2.0565, -3.4785, -1.9422]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 220
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2003,  2047,  ...,     0,     0,     0],
        ...,
        [  101,  2024,  5813,  ...,     0,     0,     0],
        [  101,  2129,  2106,  ...,     0,     0,     0],
        [  101,  2029, 13764,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0,

 76%|███████▋  | 221/289 [02:47<00:51,  1.33it/s]

Training loop 221
tensor([[ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2944,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3573072552680969, logits - tensor([[-4.6277, -2.8525,  1.5286, -1.9904],
        [-5.1746,  1.2495, -4.0837, -1.6923],
        [-5.2792,  1.5063, -5.1115, -1.5202],
        [-4.7494,  0.7843, -3.5886, -1.5332],
        [-4.2569, -2.8240,  3.1693, -2.7

 77%|███████▋  | 222/289 [02:47<00:50,  1.33it/s]

Training loop 222
tensor([[  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2129,  5514,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ..., 14526,  1012,   102],
        ...,
        [  101,  2003,  2047,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10306918621063232, logits - tensor([[-4.0488, -2.8027,  1.8209, -2.5236],
        [-4.9185, -2.1344, -4.3982,  1.3153],
        [-5.3305,  0.9954, -4.8257, -1.5947],
        [-5.5632,  0.7414, -4.2877, -1.1650],
   

 77%|███████▋  | 223/289 [02:48<00:49,  1.34it/s]

Training loop 223
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2512,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 12046,  ...,     0,     0,     0],
        [  101,  2054,  4275,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20622992515563965, logits - tensor([[-4.6780,  1.5992, -4.6038, -1.0240],
        [-4.6414,  1.8233, -4.7883, -2.0952],
        [-3.3877, -2.5398,  2.2846, -2.5940],
        [-5.6384,  1.2856, -4.0504, -0.9748],
   

 78%|███████▊  | 224/289 [02:49<00:48,  1.34it/s]

Training loop 224
tensor([[ 101, 2001, 2151,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5713174343109131, logits - tensor([[-3.4735, -3.2701,  2.0678, -3.1407],
        [-5.4782,  1.3634, -5.4219, -1.7037],
        [-5.7425,  1.9783, -4.9528, -1.3718],
        [-3.8217, -1.6508,  1.6321, -2.8878],
        [-5.3393,  1.5253, -4.9410, -0.9

 78%|███████▊  | 225/289 [02:50<00:47,  1.34it/s]

Training loop 225
tensor([[  101,  2029,  2274,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2024,  2045,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 14402,  ..., 13462,  2036,   102],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09009647369384766, logits - tensor([[-5.0250,  1.0322, -3.6629, -1.3730],
        [-4.4622, -3.0031, -4.0040,  2.4977],
        [-4.1271, -2.6366,  1.3186, -2.5246],
        [-6.2375,  1.7069, -4.6809, -1.5331],
   

 78%|███████▊  | 226/289 [02:50<00:47,  1.34it/s]

Training loop 226
tensor([[ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ..., 1999, 1996,  102],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4225054681301117, logits - tensor([[-5.1336,  2.0928, -4.1689, -1.4323],
        [-5.3481,  1.3814, -4.4093, -1.5199],
        [-5.0716,  1.3937, -4.3377, -1.6453],
        [-3.9240,  0.8511, -3.3030, -1.4307],
        [-4.0561, -1.6037, -3.0680,  1.7

 79%|███████▊  | 227/289 [02:51<00:46,  1.34it/s]

Training loop 227
tensor([[ 101, 2054, 4275,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 4294,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 1999,  ...,    0,    0,    0],
        [ 101, 2054, 2048,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22613438963890076, logits - tensor([[-5.1067,  1.8194, -4.2405, -2.2062],
        [-4.6129,  1.2138, -4.1770, -1.0033],
        [-5.4389,  0.7166, -3.4628, -2.0180],
        [-4.9799,  1.7140, -4.3782, -1.3731],
        [-4.6431,  1.1473, -4.0471, -2.

 79%|███████▉  | 228/289 [02:52<00:45,  1.34it/s]

Training loop 228
tensor([[  101,  2054, 18804,  ...,     0,     0,     0],
        [  101,  2054, 23760,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  9324,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029,  6970,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43900632858276367, logits - tensor([[-4.9808,  1.3556, -4.1770, -1.4498],
        [-5.6026,  1.2879, -3.8517, -2.1778],
        [-4.8016,  1.2043, -3.7722, -0.5883],
        [-4.5034, -1.8642,  1.3839, -2.2843],
   

 79%|███████▉  | 229/289 [02:53<00:44,  1.34it/s]

Training loop 229
tensor([[  101,  2029,  1997,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ..., 29160,  5166,   102],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2146,  ...,     0,     0,     0],
        [  101,  2003,  2023,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20885567367076874, logits - tensor([[-3.6408, -2.9490, -3.1480,  2.5177],
        [-4.6444,  0.6864, -2.5263, -0.9981],
        [-5.6029,  1.8110, -5.2079, -2.0368],
        [-6.1479,  1.2146, -5.5770, -1.3491],
   

 80%|███████▉  | 230/289 [02:53<00:44,  1.33it/s]

Training loop 230
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09913821518421173, logits - tensor([[-5.0504,  0.7379, -3.7240, -1.6078],
        [-5.3085,  1.9127, -4.9035, -2.2209],
        [-5.0658,  1.4088, -3.9089, -2.1969],
        [-5.0517,  1.2624, -4.1762, -1.0321],
        [-6.6810,  1.7695, -4.7541, -1.

 80%|███████▉  | 231/289 [02:54<00:43,  1.33it/s]

Training loop 231
tensor([[  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,  2109,  2000,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28913289308547974, logits - tensor([[-3.2750, -2.6148, -2.6095,  2.5311],
        [-3.5786, -3.6851, -2.9335,  2.3799],
        [-5.6378,  1.6423, -4.9574, -2.0843],
        [-4.6429,  1.6219, -3.6579, -0.9798],
   

 80%|████████  | 232/289 [02:55<00:42,  1.33it/s]

Training loop 232
tensor([[ 101, 2515, 2037,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2029, 2613,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2838,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2294325828552246, logits - tensor([[-3.0795, -2.3408,  2.8148, -2.2866],
        [-5.0809,  1.7080, -4.4502, -1.4031],
        [-4.0633,  1.5385, -3.8719, -1.1114],
        [-5.2005,  1.1250, -4.4100, -1.4938],
        [-3.8807, -0.8303, -2.8074,  0.3

 81%|████████  | 233/289 [02:56<00:42,  1.33it/s]

Training loop 233
tensor([[ 101, 2029, 9324,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 2037,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18612805008888245, logits - tensor([[-6.3678,  1.4614, -5.2947, -1.5164],
        [-5.6649,  1.3833, -3.8968, -1.2888],
        [-5.6462,  1.6187, -4.5774, -1.6027],
        [-4.7347,  1.3506, -4.4109, -1.0724],
        [-5.4151,  1.5324, -5.2489, -1.

 81%|████████  | 234/289 [02:56<00:41,  1.33it/s]

Training loop 234
tensor([[ 101, 2129, 3144,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2024, 2045,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2330,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10275503993034363, logits - tensor([[-4.5861,  1.7586, -3.5774, -1.5601],
        [-3.8369, -3.0970,  2.7649, -2.8894],
        [-4.1621, -2.3370,  2.1267, -3.1335],
        [-4.7088,  0.9361, -4.4155, -1.2208],
        [-4.7223,  1.9407, -4.9355, -2.

 81%|████████▏ | 235/289 [02:57<00:40,  1.34it/s]

Training loop 235
tensor([[  101,  2029,  7060,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  7511,  ...,     0,     0,     0],
        ...,
        [  101,  2003, 10640,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  4155,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13930213451385498, logits - tensor([[-4.8595,  0.2823, -2.2451, -2.0064],
        [-6.2798,  2.0320, -5.6206, -2.0655],
        [-5.4025,  1.5104, -4.8762, -2.0144],
        [-3.8255, -2.2121,  1.3915, -2.3394],
   

 82%|████████▏ | 236/289 [02:58<00:39,  1.34it/s]

Training loop 236
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 3319,  ...,    0,    0,    0],
        [ 101, 2054, 5884,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35252445936203003, logits - tensor([[-4.5093,  2.2511, -4.6021, -1.4445],
        [-4.5884,  1.2457, -4.0353, -2.0703],
        [-5.3307,  1.5422, -4.1238, -0.7609],
        [-5.1458,  1.5698, -4.2552, -1.6413],
        [-5.2156,  1.4788, -4.5642, -2.

 82%|████████▏ | 237/289 [02:59<00:38,  1.34it/s]

Training loop 237
tensor([[  101,  2029,  2773,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2054, 12158,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1631889045238495, logits - tensor([[-6.1147,  2.0532, -4.0932, -1.9897],
        [-3.0467, -3.1540, -3.2277,  2.6563],
        [-3.7810, -2.5537,  2.6802, -3.4741],
        [-5.5274,  0.6449, -3.1512, -2.1965],
    

 82%|████████▏ | 238/289 [02:59<00:38,  1.33it/s]

Training loop 238
tensor([[  101,  2054,  4155,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ..., 10230,  1010,   102],
        [  101,  2003,  2023,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2003,  2045,  ...,     0,     0,     0],
        [  101,  2054, 17953,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09892480820417404, logits - tensor([[-5.8409,  2.1529, -4.6088, -1.3725],
        [-5.1804,  0.7551, -4.5006, -0.3714],
        [-4.1808, -3.2426,  2.5674, -2.9834],
        [-5.0261,  1.4531, -5.3280, -2.5331],
   

 83%|████████▎ | 239/289 [03:00<00:37,  1.33it/s]

Training loop 239
tensor([[ 101, 2054, 3176,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 4294,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3912365734577179, logits - tensor([[-6.1847,  1.0437, -4.4763, -1.3511],
        [-5.7663,  1.5431, -4.2390, -2.1241],
        [-4.9943,  1.5738, -4.9758, -1.3706],
        [-5.4433,  0.4313, -3.7644, -1.6624],
        [-5.2469,  1.1239, -4.6389, -0.9

 83%|████████▎ | 240/289 [03:01<00:37,  1.32it/s]

Training loop 240
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ..., 2005, 1996,  102],
        [ 101, 2029, 9312,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1595018208026886, logits - tensor([[-5.1482,  1.9686, -4.2911, -1.3361],
        [-4.4505, -2.8899,  3.2521, -3.4277],
        [-5.4790,  1.9686, -4.5748, -1.5522],
        [-4.8525,  1.4099, -3.8819, -0.6470],
        [-6.0485,  1.0549, -4.9317, -1.8

 83%|████████▎ | 241/289 [03:02<00:36,  1.32it/s]

Training loop 241
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2986257076263428, logits - tensor([[-5.0603,  1.1444, -3.5113, -1.2648],
        [-4.2934, -3.7026,  3.1413, -2.4761],
        [-5.3911, -2.7946, -4.0075,  1.7444],
        [-4.4462, -2.4789,  2.9491, -2.9927],
        [-4.7248,  2.1135, -4.2508, -1.3

 84%|████████▎ | 242/289 [03:02<00:35,  1.32it/s]

Training loop 242
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2054, 2060,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4063212275505066, logits - tensor([[-5.6685,  1.2664, -4.1561, -2.0723],
        [-4.0463, -2.2900, -2.9253,  2.3910],
        [-4.3648,  1.3755, -4.6004, -1.3137],
        [-4.1830, -2.7524, -3.7225,  1.9869],
        [-3.7414, -2.8098,  1.2888, -2.6

 84%|████████▍ | 243/289 [03:03<00:34,  1.32it/s]

Training loop 243
tensor([[  101,  2054,  3130,  ...,     0,     0,     0],
        [  101,  2029, 13221,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  4353,  ...,     0,     0,     0],
        [  101,  2024,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26624614000320435, logits - tensor([[-4.7683,  1.9371, -4.5376, -0.5833],
        [-6.3277,  1.8565, -4.9388, -1.3697],
        [-5.1147,  1.1554, -3.6209, -0.7967],
        [-4.6071, -3.1483,  2.5935, -2.5247],
   

 84%|████████▍ | 244/289 [03:04<00:34,  1.32it/s]

Training loop 244
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2106,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2054, 24710,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21841482818126678, logits - tensor([[-3.6446, -3.3923,  3.6430, -3.1609],
        [-5.2321,  1.1719, -4.7643, -0.8606],
        [-5.9192,  1.3330, -5.3887, -2.6521],
        [-3.8249, -2.3347, -4.4083,  2.7395],
   

 85%|████████▍ | 245/289 [03:05<00:33,  1.32it/s]

Training loop 245
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ..., 2006, 1996,  102],
        [ 101, 2054, 4275,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ..., 1996, 2944,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2801172733306885, logits - tensor([[-6.0719,  0.3746, -5.3453, -0.4004],
        [-4.8584,  1.4139, -4.3951, -1.1437],
        [-4.3397,  1.3374, -3.6031, -0.4916],
        [-4.6249, -2.2359, -4.0635,  2.0733],
        [-6.0474,  1.4082, -5.3605, -0.7

 85%|████████▌ | 246/289 [03:05<00:32,  1.32it/s]

Training loop 246
tensor([[ 101, 2073, 2106,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2108679711818695, logits - tensor([[-5.2246,  1.0189, -4.4888, -1.1132],
        [-4.5951,  1.7105, -4.7225, -0.3919],
        [-3.7899, -1.3572, -3.4468,  0.8767],
        [-5.2757,  0.3148, -4.9500, -0.6071],
        [-5.5646,  1.7270, -4.4696, -1.3

 85%|████████▌ | 247/289 [03:06<00:31,  1.33it/s]

Training loop 247
tensor([[  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2339,  2515,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 10640,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,  1037, 15743,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28860679268836975, logits - tensor([[-3.9742, -3.4409,  2.6491, -2.5283],
        [-5.2145,  0.6190, -4.3249, -0.8570],
        [-4.9313,  1.1659, -4.3034, -1.2234],
        [-6.0077,  1.4344, -5.5515, -0.6470],
   

 86%|████████▌ | 248/289 [03:07<00:30,  1.33it/s]

Training loop 248
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2176,  ..., 4294, 2015,  102],
        ...,
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13618488609790802, logits - tensor([[-5.6587,  0.7565, -5.3433, -1.2693],
        [-5.1338,  1.2546, -4.9783, -0.9709],
        [-5.6591,  0.8903, -4.1694, -0.3089],
        [-4.7592,  1.1508, -4.1104, -1.4801],
        [-5.2847,  1.4344, -4.4168, -1.

 86%|████████▌ | 249/289 [03:08<00:29,  1.34it/s]

Training loop 249
tensor([[ 101, 2029, 2653,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2020, 2151,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11582124978303909, logits - tensor([[-6.2704,  1.5729, -5.3749, -1.0454],
        [-6.0708,  1.1869, -4.9131, -1.0496],
        [-5.4025,  1.1498, -5.3157, -1.4906],
        [-5.4673,  1.8276, -4.6496, -1.7299],
        [-5.0100,  1.5193, -5.7353, -0.

 87%|████████▋ | 250/289 [03:08<00:29,  1.34it/s]

Training loop 250
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 3698,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23956376314163208, logits - tensor([[-4.0475, -2.0733, -3.7988,  2.1134],
        [-4.5928, -2.8907,  2.3569, -2.7249],
        [-4.3157,  0.9404, -4.8346, -1.3663],
        [-5.2092,  1.3160, -5.6469, -0.7914],
        [-5.4418,  1.2939, -4.6808, -1.

 87%|████████▋ | 251/289 [03:09<00:28,  1.34it/s]

Training loop 251
tensor([[ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 7236,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2838,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2064, 8040,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1560632735490799, logits - tensor([[-4.6328, -1.0980, -4.5859,  1.9650],
        [-6.0625,  1.3665, -5.4720, -0.9058],
        [-4.9620, -2.0204,  2.0528, -2.9515],
        [-4.4037, -2.2631, -3.8788,  1.8062],
        [-5.3910,  1.7121, -4.7082, -1.1

 87%|████████▋ | 252/289 [03:10<00:27,  1.34it/s]

Training loop 252
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029, 10474,  ...,  1010,  1189,   102],
        [  101,  2054,  2838,  ...,     0,     0,     0],
        ...,
        [  101,  2024,  1996,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2612287998199463, logits - tensor([[-5.2994,  0.9600, -4.3861, -0.4236],
        [-4.7366,  1.1136, -4.6558, -0.8232],
        [-5.1295,  0.9917, -4.4870, -1.5677],
        [-5.7497,  1.2574, -4.8164, -0.8799],
    

 88%|████████▊ | 253/289 [03:11<00:27,  1.33it/s]

Training loop 253
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2000, 2054,  ...,    0,    0,    0],
        [ 101, 2515, 2169,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16660138964653015, logits - tensor([[-5.4817,  1.1011, -5.5834, -1.0012],
        [-4.6444,  0.5217, -4.2975, -1.3271],
        [-4.0214, -2.1290, -4.3086,  2.3570],
        [-4.5182, -3.4693,  3.0533, -2.2355],
        [-5.3942,  1.4359, -4.5694, -0.

 88%|████████▊ | 254/289 [03:11<00:26,  1.33it/s]

Training loop 254
tensor([[  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2291,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 21641,  ...,     0,     0,     0],
        [  101,  2054,  5449,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22595879435539246, logits - tensor([[-5.0269,  0.9383, -5.0301, -0.3183],
        [-4.5248,  0.5901, -4.5818, -0.2512],
        [-5.6384,  0.7631, -3.7031, -1.1915],
        [-5.6097,  1.9723, -4.6410, -1.3871],
   

 88%|████████▊ | 255/289 [03:12<00:25,  1.33it/s]

Training loop 255
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 4012,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3404565751552582, logits - tensor([[-5.3998,  1.1031, -4.9277, -1.1741],
        [-5.0230,  0.8834, -4.0630, -0.7062],
        [-4.8524,  1.4042, -4.5905, -0.3572],
        [-5.2910,  1.1661, -4.5191, -1.2459],
        [-5.5809,  1.2787, -5.1833, -0.8

 89%|████████▊ | 256/289 [03:13<00:24,  1.33it/s]

Training loop 256
tensor([[ 101, 2029, 2350,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2054, 6907,  ...,    0,    0,    0],
        [ 101, 2054, 4118,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26319560408592224, logits - tensor([[-5.1416,  1.0319, -4.8164, -1.0319],
        [-5.1619,  1.6102, -5.2152, -0.7696],
        [-5.6792,  0.8932, -5.6194, -0.6406],
        [-5.3069,  1.1184, -5.0793, -0.8519],
        [-4.8633,  1.7110, -4.5213, -1.

 89%|████████▉ | 257/289 [03:14<00:24,  1.33it/s]

Training loop 257
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2515, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 89%|████████▉ | 258/289 [03:14<00:23,  1.33it/s]

loss - 0.7840906381607056, logits - tensor([[-5.6001,  1.5490, -5.1934, -0.5433],
        [-4.5256,  1.4826, -3.9621, -0.7289],
        [-2.4926, -3.1543,  2.7843, -2.5685],
        [-5.4307,  1.1669, -4.4551, -1.1780],
        [-4.4704, -1.9907, -4.0126,  2.4998],
        [-3.3194, -2.9069, -3.7040,  2.8862],
        [-3.5046, -4.3752,  3.5434, -2.1235],
        [-4.3948, -2.0053, -4.3720,  1.9558]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 258
tensor([[ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 4275,  ...,    0,    0,    0],
        [ 101, 2515, 2037,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2054, 7848,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 

 90%|████████▉ | 259/289 [03:15<00:22,  1.33it/s]

Training loop 259
tensor([[  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2488,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19754891097545624, logits - tensor([[-3.7904, -2.1751, -3.7779,  2.7339],
        [-4.5605, -3.2224,  2.4651, -2.8764],
        [-5.7372,  0.6921, -4.6043, -0.9319],
        [-4.8545,  1.1165, -4.4279, -2.3291],
   

 90%|████████▉ | 260/289 [03:16<00:21,  1.33it/s]

Training loop 260
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2058, 2029,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 3642,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11892379820346832, logits - tensor([[-3.2421, -3.1013, -3.2632,  3.7213],
        [-5.2459,  1.3714, -5.3071, -1.5324],
        [-5.6234,  1.7875, -5.3835, -1.0668],
        [-5.3022,  1.2220, -5.3867, -0.9108],
        [-5.6637,  1.0405, -4.6692, -2.

 90%|█████████ | 261/289 [03:17<00:21,  1.33it/s]

Training loop 261
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2529,  ..., 7644, 1012,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2070193588733673, logits - tensor([[-5.4479,  1.6014, -4.7523, -0.5127],
        [-5.5087,  1.0729, -4.5793, -0.9825],
        [-5.8420,  0.7166, -4.4914, -1.2905],
        [-2.9735, -2.9965, -2.6354,  2.4526],
        [-4.1506, -2.6150, -3.9260,  2.1

 91%|█████████ | 262/289 [03:17<00:20,  1.33it/s]

Training loop 262
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  4800,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2944,  ...,     0,     0,     0],
        [  101,  2054, 16105,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23726209998130798, logits - tensor([[-4.4268,  0.6614, -4.6140, -0.9413],
        [-4.4564, -2.8271,  1.6113, -3.1321],
        [-4.8846,  0.7197, -4.5749, -1.8901],
        [-5.2117,  1.6534, -4.1171, -1.1039],
   

 91%|█████████ | 263/289 [03:18<00:19,  1.33it/s]

Training loop 263
tensor([[  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2029, 26384,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2006,  2029,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2054,  3176,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31382784247398376, logits - tensor([[-4.2097, -2.7445,  2.2698, -3.3781],
        [-4.7959,  1.0658, -4.8677, -1.2058],
        [-3.7690, -2.6004, -3.7183,  2.6892],
        [-4.6817,  1.3120, -4.0815, -1.1420],
   

 91%|█████████▏| 264/289 [03:19<00:18,  1.33it/s]

Training loop 264
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2060,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12834018468856812, logits - tensor([[-5.9320,  0.5847, -4.5077, -1.1081],
        [-3.9955, -3.0938, -3.6957,  2.8806],
        [-4.4378, -3.8860,  2.0350, -2.4020],
        [-6.6842,  0.5653, -4.9483, -1.8970],
        [-5.6105,  1.4343, -4.5972, -1.

 92%|█████████▏| 265/289 [03:20<00:17,  1.34it/s]

Training loop 265
tensor([[ 101, 2024, 2060,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12573666870594025, logits - tensor([[-4.3165, -3.0062,  2.4943, -2.0832],
        [-4.1230,  1.7474, -3.5763, -2.1614],
        [-5.5851,  1.0442, -5.7580, -0.8432],
        [-5.7611,  0.5156, -5.5669, -1.5885],
        [-5.6899,  1.1583, -4.5617, -0.

 92%|█████████▏| 266/289 [03:20<00:17,  1.34it/s]

Training loop 266
tensor([[  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ..., 26163,  1010,   102],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3148752748966217, logits - tensor([[-4.6592, -2.8621,  2.0678, -3.0249],
        [-6.1298,  1.3924, -5.4147, -1.3534],
        [-4.5139, -3.2617,  2.0438, -3.5280],
        [-4.5068, -2.6139,  2.2197, -3.5187],
    

 92%|█████████▏| 267/289 [03:21<00:16,  1.33it/s]

Training loop 267
tensor([[  101,  2054,  2024,  ...,  3512,  8651,   102],
        [  101,  2054,  3130,  ...,     0,     0,     0],
        [  101,  2029,  4895,  ...,     0,     0,     0],
        ...,
        [  101,  2073,  2515,  ...,     0,     0,     0],
        [  101,  2054, 15306,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.266954243183136, logits - tensor([[-4.2357,  0.8567, -3.4742, -0.5484],
        [-5.7318,  1.6525, -6.1735, -2.6096],
        [-5.3274,  1.4039, -4.2780, -1.6279],
        [-4.5431,  1.7687, -4.2795, -1.5699],
     

 93%|█████████▎| 268/289 [03:22<00:15,  1.33it/s]

Training loop 268
tensor([[ 101, 2001, 9312,  ...,    0,    0,    0],
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2054, 2093,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 7885,  ...,    0,    0,    0],
        [ 101, 2054, 2465,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2363833487033844, logits - tensor([[-5.3363, -3.1794,  1.5808, -2.8717],
        [-5.1565, -1.9729, -4.3688,  1.4139],
        [-5.1761,  1.1054, -4.8703, -1.0839],
        [-4.5102, -2.2886,  2.0462, -2.6994],
        [-6.0380,  0.7858, -5.4191, -2.3

 93%|█████████▎| 269/289 [03:23<00:15,  1.33it/s]

Training loop 269
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 2110,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 3787,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4816567301750183, logits - tensor([[-4.6129,  1.1240, -4.4114, -0.8148],
        [-4.9179, -1.5312, -3.9873,  2.1976],
        [-4.3095, -2.3596,  1.6289, -2.4561],
        [-5.3095,  1.4808, -5.5403, -1.5287],
        [-4.8627, -1.8914, -4.0942,  1.8

 93%|█████████▎| 270/289 [03:23<00:14,  1.33it/s]

Training loop 270
tensor([[ 101, 2001, 2023,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        ...,
        [ 101, 2073, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5085135698318481, logits - tensor([[-4.6327, -1.6296,  0.2226, -2.9149],
        [-5.2914, -2.2497, -3.7472,  1.2581],
        [-5.0187, -2.0518, -4.0319,  2.4000],
        [-5.7889,  1.0781, -4.7270, -0.7210],
        [-6.3229,  1.2958, -6.0104, -2.0

 94%|█████████▍| 271/289 [03:24<00:13,  1.33it/s]

Training loop 271
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2029, 7982,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2060,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32564157247543335, logits - tensor([[-5.1134, -1.5835, -5.3905,  1.6602],
        [-5.8236,  0.9149, -5.0455, -1.7341],
        [-5.0306,  1.6295, -4.5800, -1.8505],
        [-4.6895,  1.3255, -4.1819, -0.9491],
        [-6.3987,  1.9206, -4.7124, -2.

 94%|█████████▍| 272/289 [03:25<00:12,  1.33it/s]

Training loop 272
tensor([[ 101, 2129, 2488,  ...,    0,    0,    0],
        [ 101, 2054, 2060,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23535093665122986, logits - tensor([[-4.7437, -1.2349, -4.3313,  1.0594],
        [-5.7972,  1.8598, -4.6997, -2.5921],
        [-5.7042,  1.4512, -5.0253, -1.7837],
        [-4.9348,  1.4758, -4.6997, -1.4657],
        [-4.2050, -1.5719,  2.1689, -3.

 94%|█████████▍| 273/289 [03:26<00:12,  1.33it/s]

Training loop 273
tensor([[ 101, 2054, 2048,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2900090515613556, logits - tensor([[-4.6380,  2.4951, -5.2540, -1.7061],
        [-4.9532,  1.5673, -3.8972, -1.8085],
        [-5.2493,  1.5812, -4.5289, -1.6053],
        [-3.9873, -2.9986,  2.3919, -2.2637],
        [-5.4000,  2.3145, -4.6950, -1.3

 95%|█████████▍| 274/289 [03:26<00:11,  1.33it/s]

Training loop 274
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,  2041,  1997,   102],
        [  101,  2054, 13100,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2838,  ...,  2029,  2097,   102],
        [  101,  2054,  9312,  ...,     0,     0,     0],
        [  101,  2129,  2106,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21115592122077942, logits - tensor([[-6.0044,  1.9691, -4.9996, -2.0549],
        [-3.9860,  0.3797, -3.5740, -0.5400],
        [-4.5895, -1.6699, -3.6278,  1.6813],
        [-6.4864,  2.0926, -5.2705, -1.4441],
   

 95%|█████████▌| 275/289 [03:27<00:10,  1.33it/s]

Training loop 275
tensor([[ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2000, 2029,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 2925,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17899343371391296, logits - tensor([[-4.1013, -1.5455,  1.9377, -2.4012],
        [-3.9669, -2.6849,  2.0795, -2.0407],
        [-6.1779,  1.4007, -5.0424, -1.5524],
        [-4.2305, -2.3816,  0.8457, -2.3102],
        [-5.5450, -2.2263,  0.8851, -2.

 96%|█████████▌| 276/289 [03:28<00:09,  1.33it/s]

Training loop 276
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 6210,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2073, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2358378767967224, logits - tensor([[-4.5778,  2.2512, -4.7114, -2.3877],
        [-5.1878,  2.0643, -3.9120, -0.6422],
        [-6.5901,  1.0112, -5.4518, -2.0974],
        [-3.8419, -2.4851,  0.3800, -2.1581],
        [-5.2642,  1.8822, -4.6074, -1.3

 96%|█████████▌| 277/289 [03:29<00:08,  1.33it/s]

Training loop 277
tensor([[  101,  2129,  2001,  ...,     0,     0,     0],
        [  101,  2040,  2024,  ...,     0,     0,     0],
        [  101,  2054,  4800,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2047,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2029, 16227,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27517426013946533, logits - tensor([[-5.6939,  1.8127, -4.7766, -1.6366],
        [-5.4405,  1.6362, -5.3204, -2.5409],
        [-5.7800,  1.6715, -5.9670, -2.2709],
        [-5.1396,  1.6625, -4.4307, -1.8826],
   

 96%|█████████▌| 278/289 [03:29<00:08,  1.34it/s]

Training loop 278
tensor([[ 101, 2029, 2270,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2092,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2944,  ...,    0,    0,    0],
        [ 101, 2029, 2613,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18384817242622375, logits - tensor([[-5.4973,  0.9761, -4.7884, -1.8177],
        [-5.5081, -2.1441,  1.3988, -3.2269],
        [-5.6885,  0.5691, -4.3602, -0.9949],
        [-4.6847,  1.4498, -4.9800, -2.4631],
        [-5.4240, -0.1569, -4.4663,  0.

 97%|█████████▋| 279/289 [03:30<00:07,  1.33it/s]

Training loop 279
tensor([[ 101, 2054, 6882,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        [ 101, 2040, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08843068033456802, logits - tensor([[-5.5543,  1.8119, -5.0057, -1.9521],
        [-4.5417, -2.7291,  1.7164, -3.5434],
        [-5.6963,  1.7012, -4.8615, -2.3451],
        [-5.0599,  1.1967, -3.9429, -1.4389],
        [-5.7417,  1.3208, -5.5537, -1.

 97%|█████████▋| 280/289 [03:31<00:06,  1.33it/s]

Training loop 280
tensor([[ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2529,  ...,    0,    0,    0],
        [ 101, 2054, 2777,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2064, 2023,  ...,    0,    0,    0],
        [ 101, 2054, 4493,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25396037101745605, logits - tensor([[-4.6274, -1.5393, -0.2520, -3.2941],
        [-4.9718,  1.6000, -3.8770, -1.6855],
        [-5.7238,  2.2938, -5.8755, -2.7254],
        [-4.5013, -0.3263, -3.2236,  0.7807],
        [-5.4975,  1.6200, -4.3671, -1.

 97%|█████████▋| 281/289 [03:32<00:06,  1.32it/s]

Training loop 281
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ...,    0,    0,    0],
        [ 101, 2003, 2037,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33069828152656555, logits - tensor([[-5.9908,  1.0616, -3.9329, -1.1789],
        [-5.2717,  0.5368, -4.5600, -1.6776],
        [-4.1012, -2.6247,  2.4290, -3.1344],
        [-4.7298,  2.4549, -4.1015, -1.2485],
        [-4.9072,  0.9061, -5.1884, -2.

 98%|█████████▊| 282/289 [03:32<00:05,  1.32it/s]

Training loop 282
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2367,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2054, 6847,  ...,    0,    0,    0],
        [ 101, 2054, 4207,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4869537949562073, logits - tensor([[-5.1660,  0.6288, -4.0041, -0.5101],
        [-5.0932,  1.9688, -4.3386, -1.5163],
        [-4.6807, -3.1217,  2.7497, -2.7294],
        [-5.2518, -1.4351, -4.5628,  0.4418],
        [-5.3444,  2.4571, -5.6479, -2.5

 98%|█████████▊| 283/289 [03:33<00:04,  1.32it/s]

Training loop 283
tensor([[  101,  2054,  2003,  ...,  2312, 29536,   102],
        [  101,  2054,  2110,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2029, 25957,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23608249425888062, logits - tensor([[-5.8344,  0.5885, -4.5099, -0.6002],
        [-5.2388,  1.6218, -4.1465, -1.9822],
        [-4.7811,  1.8275, -5.1751, -2.3167],
        [-6.2956,  0.7670, -5.3003, -1.7095],
   

 98%|█████████▊| 284/289 [03:34<00:03,  1.33it/s]

Training loop 284
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2029,  2653,  ...,     0,     0,     0],
        [  101,  2054, 21641,  ...,     0,     0,     0],
        ...,
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13875186443328857, logits - tensor([[-5.4648,  1.2790, -5.5771, -1.7078],
        [-5.6040,  1.2033, -4.6521, -0.9195],
        [-5.2139, -0.2610, -4.3706,  0.0960],
        [-5.9941, -1.4106, -4.8429,  2.0549],
   

 99%|█████████▊| 285/289 [03:35<00:03,  1.33it/s]

Training loop 285
tensor([[ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2106,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22748126089572906, logits - tensor([[-4.1122, -0.7873, -3.9434,  1.0861],
        [-5.5968,  1.6935, -4.6140, -2.1006],
        [-5.4963,  1.1752, -4.9531, -2.5555],
        [-5.3709,  1.3568, -4.6867, -1.5815],
        [-4.6236, -2.3928,  2.1778, -2.

 99%|█████████▉| 286/289 [03:35<00:02,  1.33it/s]

Training loop 286
tensor([[ 101, 2054, 3784,  ...,    0,    0,    0],
        [ 101, 2054, 2836,  ...,    0,    0,    0],
        [ 101, 2129, 2106,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2009, 3504,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40784621238708496, logits - tensor([[-6.0168,  1.6755, -4.1835, -1.8366],
        [-6.1453,  1.8419, -4.6510, -1.8082],
        [-5.5538,  1.3703, -4.0126, -1.6581],
        [-5.3574,  1.1897, -4.6921, -2.5776],
        [-5.9999,  1.7680, -4.4625, -2.

 99%|█████████▉| 287/289 [03:36<00:01,  1.34it/s]

Training loop 287
tensor([[ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2003, 2045,  ...,    0,    0,    0],
        [ 101, 2031, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1929818093776703, logits - tensor([[-5.3051,  2.0596, -4.9793, -1.2822],
        [-5.3465, -3.0167,  2.3568, -3.2046],
        [-4.1471, -2.6011,  2.7666, -2.9612],
        [-5.6617,  1.8532, -3.9047, -1.6844],
        [-4.8136, -3.3146,  2.6413, -2.5

100%|█████████▉| 288/289 [03:37<00:00,  1.34it/s]

Training loop 288
tensor([[  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,  1998, 26839,   102],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09572974592447281, logits - tensor([[-4.2665, -3.2418,  1.3946, -1.9885],
        [-4.9306, -2.2154,  1.1617, -2.5061],
        [-5.7642,  1.6925, -5.2455, -1.7319],
        [-6.0857,  1.8875, -5.0661, -1.6338]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:37<00:00,  1.33it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:53,  3.59it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:49,  3.90it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:47,  4.02it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:48,  3.88it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:47,  3.95it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:46,  4.02it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:46,  4.03it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.96it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:46,  3.97it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  4.00it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:45,  4.04it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:45,  3.99it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  4.01it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:44,  4.03it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:44,  4.05it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:44,  4.04it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:44,  4.01it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:43,  4.02it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:43,  4.06it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:04<00:43,  4.04it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:42,  4.05it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:42,  4.04it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:42,  4.03it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:05<00:42,  4.02it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:42,  4.00it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  3.99it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:41,  4.02it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:06<00:41,  4.00it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.99it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.99it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:40,  3.98it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:41,  3.94it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:41,  3.92it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:41,  3.89it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:40,  3.91it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:39,  3.97it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:39,  3.94it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.96it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.96it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:38,  3.95it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:38,  3.97it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.95it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:37,  3.99it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:37,  4.00it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:37,  4.00it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:37,  4.00it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:36,  4.00it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:36,  3.99it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  4.02it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:35,  4.03it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:35,  4.07it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:35,  4.02it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:35,  4.01it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:35,  3.97it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:34,  3.99it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:34,  3.98it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:34,  3.99it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  4.00it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:33,  4.02it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:33,  4.01it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:33,  3.98it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:32,  4.03it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:15<00:32,  4.02it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  4.03it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  4.03it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:31,  4.01it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:16<00:31,  4.01it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  4.02it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  4.00it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:30,  4.00it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:17<00:30,  4.01it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  4.05it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:29,  4.05it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:29,  4.07it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:18<00:29,  4.03it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:29,  4.00it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.99it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:28,  4.02it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:19<00:28,  4.03it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:19<00:28,  4.07it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:27,  4.08it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:27,  4.07it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:20<00:27,  4.06it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:20<00:27,  4.04it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:26,  4.06it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:26,  4.06it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:21<00:26,  4.02it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:21<00:26,  4.03it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  4.03it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:25,  4.02it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:22<00:25,  4.03it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:22<00:25,  4.02it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  4.00it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:24,  4.00it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:23<00:24,  3.99it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:23<00:24,  3.98it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  4.03it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.96it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:24<00:23,  3.99it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:24<00:23,  4.01it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  4.00it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  4.00it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:25<00:22,  3.98it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:25<00:22,  3.97it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.94it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.91it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:26<00:22,  3.93it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.94it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  3.94it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:21,  3.95it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:27<00:20,  3.96it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  4.00it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.95it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:20,  3.94it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:28<00:19,  3.96it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.98it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.96it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.99it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:29<00:18,  4.02it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  4.03it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:17,  4.06it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:17,  4.05it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:30<00:17,  4.03it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  4.03it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  4.05it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:16,  4.04it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:31<00:16,  4.05it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:31<00:16,  4.03it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  4.05it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:15,  4.07it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:32<00:15,  4.09it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:32<00:15,  4.06it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  4.05it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:33<00:14,  4.05it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:33<00:14,  4.06it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:33<00:14,  4.10it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:13,  4.10it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:34<00:13,  4.09it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:34<00:13,  4.10it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:34<00:13,  4.05it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  4.05it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:35<00:12,  4.05it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:35<00:12,  4.07it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:35<00:12,  4.10it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:11,  4.11it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:36<00:11,  4.09it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:36<00:11,  4.08it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:36<00:11,  4.06it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  4.06it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:37<00:10,  4.05it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:37<00:10,  4.05it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:37<00:10,  4.04it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  4.04it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:38<00:10,  3.99it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:38<00:09,  3.95it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:38<00:09,  3.97it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.95it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:39<00:09,  3.94it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:39<00:08,  3.94it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:39<00:08,  3.94it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  3.97it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:40<00:08,  3.96it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:40<00:07,  3.99it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:40<00:07,  3.97it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.99it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:41<00:07,  3.98it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:41<00:06,  3.97it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:41<00:06,  3.98it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  4.00it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:42<00:06,  3.96it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:42<00:05,  3.96it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:42<00:05,  3.95it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.94it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:43<00:05,  3.95it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:43<00:04,  3.94it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:43<00:04,  3.94it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.92it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:44<00:04,  3.99it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:44<00:03,  3.95it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:44<00:03,  3.99it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  4.04it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:45<00:02,  4.07it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:45<00:02,  4.00it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:45<00:02,  4.03it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  4.05it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:46<00:01,  4.06it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:46<00:01,  4.07it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:46<00:01,  4.08it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  4.07it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:47<00:00,  4.06it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:47<00:00,  4.05it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:47<00:00,  4.03it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  4.05it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:48<00:00,  4.00it/s]


[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 928, 'tn': 329, 'fp': 33, 'fn': 262}, {'tp': 157, 'tn': 1365, 'fp': 3, 'fn': 27}, {'tp': 152, 'tn': 1098, 'fp': 279, 'fn': 23}]
Detailed accuracy after 0 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.8099226804123711
yes_no accuarcy: 0.9806701030927835
abstractive accuarcy: 0.8054123711340206
Overall accuarcy: 0.8990012886597938
Best accuarcy: 0
0.8990012886597938
Model Updated


  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ..., 3512, 8651,  102],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2515, 2023,  ...,    0,    0,    0],
        [ 101, 2339, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29370707273483276, logits - tensor([[-5.8305,  0.6722, -4.0738, -2.0037],
        [-5.0734,  1.0319, -4.6316, -1.4720],
        [-5.0458,  1.2961, -4.3867, -1.6387],
        [-4.5938, -2.3633, -4.0337,  1.9145],
        [-5.2633, -2.6111,  2.1162, -3.06

  0%|          | 1/289 [00:00<03:45,  1.28it/s]

Training loop 1
tensor([[  101,  2071,  2017,  ...,     0,     0,     0],
        [  101,  2054,  2515,  ...,     0,     0,     0],
        [  101,  2029, 12046,  ...,     0,     0,     0],
        ...,
        [  101,  2006,  2029,  ...,     0,     0,     0],
        [  101,  2073,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19038671255111694, logits - tensor([[-4.3928, -1.6747,  1.5117, -3.6181],
        [-5.2244,  1.3448, -5.3623, -1.9483],
        [-5.1000,  2.3616, -5.3077, -1.8770],
        [-4.8779,  1.8745, -4.7378, -2.2381],
     

  1%|          | 2/289 [00:01<03:41,  1.30it/s]

Training loop 2
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  8476,  ...,     0,     0,     0],
        [  101,  2129,  1013,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 10474,  ...,  1010,  1189,   102],
        [  101,  2024,  2951,  ...,     0,     0,     0],
        [  101,  2006,  2029,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2855498194694519, logits - tensor([[-5.5719,  1.7621, -4.9363, -1.5807],
        [-4.6137,  2.0663, -3.8080, -1.5880],
        [-5.3408,  0.8896, -5.0688, -1.1240],
        [-6.5951,  1.6029, -5.3008, -1.0128],
      

  1%|          | 3/289 [00:02<03:37,  1.31it/s]

Training loop 3
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 3793,  ...,    0,    0,    0],
        [ 101, 2029, 4563,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20119114220142365, logits - tensor([[-4.9860,  2.1286, -4.0611, -0.6791],
        [-4.4217, -3.4056,  2.2351, -2.3100],
        [-4.8111, -3.8483,  1.6501, -1.9439],
        [-4.4752,  1.4872, -4.4330, -1.5039],
        [-4.1303, -3.0343,  1.2227, -2.17

  1%|▏         | 4/289 [00:03<03:34,  1.33it/s]

Training loop 4
tensor([[  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2054, 28616,  ...,     0,     0,     0],
        [  101,  2054, 12783,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054, 12546,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35747605562210083, logits - tensor([[-5.4871,  1.2455, -5.3862, -1.1798],
        [-6.2736,  1.3476, -4.3435, -2.1639],
        [-5.1740,  0.6023, -4.4039, -0.7335],
        [-4.8043,  0.9919, -3.9279, -0.4816],
     

  2%|▏         | 5/289 [00:03<03:33,  1.33it/s]

Training loop 5
tensor([[  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  6177,  ...,     0,     0,     0],
        [  101,  2054,  5468,  ...,     0,     0,     0],
        [  101,  2054, 13100,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30004650354385376, logits - tensor([[-5.6661,  1.5478, -5.2788, -0.6027],
        [-6.6718,  2.5147, -4.6776, -0.9343],
        [-5.1864,  1.2728, -3.7400, -0.4976],
        [-4.5699,  0.8231, -4.5588, -1.4501],
     

  2%|▏         | 6/289 [00:04<03:32,  1.33it/s]

Training loop 6
tensor([[ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 3471,  ...,    0,    0,    0],
        [ 101, 2031, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 4748,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2488,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44521644711494446, logits - tensor([[-4.9834, -1.1634, -4.2510,  1.3810],
        [-4.4219,  1.0421, -4.0128, -1.3753],
        [-3.9943, -3.3769,  2.4924, -3.0093],
        [-4.7944,  0.9512, -3.8342, -1.5051],
        [-5.6222,  1.7283, -4.9480, -0.97

  2%|▏         | 7/289 [00:05<03:31,  1.33it/s]

Training loop 7
tensor([[ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2047,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ..., 2006, 1996,  102],
        [ 101, 2024, 2653,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1150812953710556, logits - tensor([[-4.4607,  0.7737, -2.7810, -1.7304],
        [-7.0893,  2.0436, -5.3018, -1.9464],
        [-6.1927,  1.8538, -4.7812, -1.6308],
        [-4.7296,  0.4124, -3.8295, -0.7965],
        [-3.7059, -0.7645, -3.3677,  1.576

  3%|▎         | 8/289 [00:06<03:31,  1.33it/s]

Training loop 8
tensor([[ 101, 2054, 6177,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20881816744804382, logits - tensor([[-5.4252,  0.5264, -4.5961, -2.0617],
        [-5.9270,  1.6578, -5.3277, -1.5990],
        [-3.4840, -3.8474,  2.6903, -3.2562],
        [-5.8519,  1.3752, -4.0038, -0.8025],
        [-4.5808,  1.6490, -5.0660, -0.79

  3%|▎         | 9/289 [00:06<03:30,  1.33it/s]

Training loop 9
tensor([[ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 5887,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 9646,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  3%|▎         | 10/289 [00:07<03:28,  1.34it/s]

loss - 0.4883597791194916, logits - tensor([[-4.9331, -3.0068,  1.6875, -2.9054],
        [-5.9174, -1.5413, -5.3461,  2.7780],
        [-4.2317, -3.3745,  2.5165, -2.2633],
        [-7.0199,  1.8884, -5.1489, -1.0508],
        [-4.3660, -2.9289,  2.3773, -2.3874],
        [-6.6200,  0.9700, -5.5242, -1.4926],
        [-5.3827,  1.0964, -5.4802, -2.0162],
        [-6.6443,  1.3899, -6.0026, -1.7351]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 10
tensor([[  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2844,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 26384,  ...,     0,     0,     0],
        [  101,  2029, 17953,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 

  4%|▍         | 11/289 [00:08<03:27,  1.34it/s]

Training loop 11
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2129, 2488,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 1056,  ...,    0,    0,    0],
        [ 101, 2054, 6770,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1170356497168541, logits - tensor([[-5.1125,  1.5159, -4.3525, -1.3260],
        [-4.8155, -2.7415, -4.2852,  3.2290],
        [-5.4203,  1.0153, -4.6932, -1.3267],
        [-4.1463,  0.4547, -3.5205, -1.3625],
        [-4.7108, -2.9143, -4.8638,  2.44

  4%|▍         | 12/289 [00:09<03:26,  1.34it/s]

Training loop 12
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ...,    0,    0,    0],
        [ 101, 2054, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 2037,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2626610994338989, logits - tensor([[-3.9264, -3.1629,  1.9374, -2.3788],
        [-5.5224,  1.0467, -4.1598, -1.1411],
        [-5.4560,  1.5690, -4.8457, -1.3101],
        [-6.4423,  1.8884, -5.3105, -1.5725],
        [-4.1175, -1.0253, -3.4738,  1.26

  4%|▍         | 13/289 [00:09<03:29,  1.32it/s]

Training loop 13
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2024, 2045,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ..., 2006, 2023,  102],
        [ 101, 2054, 2024,  ..., 1012, 5310,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39603525400161743, logits - tensor([[-6.1370,  1.4903, -4.5222, -0.8735],
        [-4.4312, -3.0891,  2.3769, -2.5734],
        [-4.5861, -2.0776, -3.8344,  1.8618],
        [-6.3075,  1.2157, -4.2766, -1.3526],
        [-5.2385,  1.3545, -4.8834, -1.0

  5%|▍         | 14/289 [00:10<03:29,  1.31it/s]

Training loop 14
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ..., 1007, 1996,  102],
        [ 101, 2054, 2060,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30367690324783325, logits - tensor([[-5.7258,  0.3323, -4.4908, -0.4659],
        [-4.2942, -2.1743, -3.0724,  2.0710],
        [-4.4866,  0.9426, -4.3988, -2.1160],
        [-5.4250,  0.7576, -5.0955, -0.4586],
        [-3.8081, -1.8672, -4.0220,  1.6

  5%|▌         | 15/289 [00:11<03:28,  1.32it/s]

Training loop 15
tensor([[ 101, 2054, 2024,  ..., 1037, 2944,  102],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2013, 2043,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17939238250255585, logits - tensor([[-6.1212,  1.0958, -5.1048, -0.8668],
        [-5.7445, -1.7777, -4.0569,  1.0264],
        [-6.3796,  0.5658, -4.1838, -1.0168],
        [-4.6840, -2.4422,  2.2504, -2.9145],
        [-5.9240,  1.6223, -4.6733, -1.4

  6%|▌         | 16/289 [00:12<03:27,  1.31it/s]

Training loop 16
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  4155,  ...,     0,     0,     0],
        [  101,  2001, 14324,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2597610056400299, logits - tensor([[-5.6456,  1.5064, -4.6942, -0.6384],
        [-5.5574,  1.1160, -5.5161, -0.7428],
        [-3.7758, -2.7834,  2.3143, -3.0494],
        [-5.4878,  1.5861, -4.8805, -0.5437],
     

  6%|▌         | 17/289 [00:12<03:26,  1.32it/s]

Training loop 17
tensor([[  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2029, 14402,  ..., 13462,  2036,   102],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  6801,  ...,     0,     0,     0],
        [  101,  2054, 15152,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21352998912334442, logits - tensor([[-6.5215,  0.7134, -5.0031, -1.6834],
        [-5.8893,  1.4817, -3.8797, -0.7307],
        [-6.0483,  1.2614, -4.8330, -1.6828],
        [-5.7315,  1.2316, -4.6488, -0.0426],
    

  6%|▌         | 18/289 [00:13<03:24,  1.32it/s]

Training loop 18
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2020,  ...,     0,     0,     0],
        [  101,  2054,  4275,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  6847,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054, 18847,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.304597944021225, logits - tensor([[-4.2188, -2.1201, -4.4256,  1.8543],
        [-6.5068,  0.9748, -4.7728, -2.1520],
        [-5.2218, -2.5011, -4.6189,  2.1350],
        [-5.5582,  0.4803, -4.6711, -1.0156],
      

  7%|▋         | 19/289 [00:14<03:24,  1.32it/s]

Training loop 19
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 3115,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 8321,  ...,    0,    0,    0],
        [ 101, 2339, 2003,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ..., 2015, 1012,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.186684250831604, logits - tensor([[-5.8086,  0.8287, -4.7154, -1.1455],
        [-5.1889,  1.0450, -3.9083, -1.6207],
        [-5.8747,  0.7265, -4.6223, -2.0775],
        [-5.4727,  0.7468, -4.9689, -1.4499],
        [-5.6638, -0.2890, -5.0129,  0.281

  7%|▋         | 20/289 [00:15<03:23,  1.32it/s]

Training loop 20
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 8377,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2653,  ...,    0,    0,    0],
        [ 101, 2054, 5449,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3641277551651001, logits - tensor([[-5.1665,  0.7389, -4.5628, -1.0576],
        [-5.6509,  1.1754, -5.0720, -1.4675],
        [-4.7930, -3.7407,  3.0132, -2.9304],
        [-5.1704,  1.5207, -4.6731, -1.1053],
        [-5.1376,  0.5504, -4.1232, -0.35

  7%|▋         | 21/289 [00:15<03:23,  1.32it/s]

Training loop 21
tensor([[ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 1999, 2029,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2944,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25816160440444946, logits - tensor([[-5.3490, -2.9065,  1.7427, -3.3675],
        [-5.2030,  0.9263, -5.2786, -1.3812],
        [-4.4184, -1.6094, -4.5042,  1.5032],
        [-4.7983, -1.4048, -3.9169,  2.1693],
        [-6.0182,  1.7627, -5.3833, -1.4

  8%|▊         | 22/289 [00:16<03:21,  1.32it/s]

Training loop 22
tensor([[ 101, 2054, 3463,  ...,    0,    0,    0],
        [ 101, 2054, 2093,  ...,    0,    0,    0],
        [ 101, 2339, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 4275,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.365478515625, logits - tensor([[-6.0173,  1.3395, -4.7892, -0.9578],
        [-5.4770,  1.5943, -4.9962, -1.8907],
        [-6.2109,  1.2484, -5.1893, -1.3118],
        [-4.7910,  0.8082, -4.7616, -1.3508],
        [-5.7882,  1.2813, -5.0933, -0.8689],

  8%|▊         | 23/289 [00:17<03:20,  1.33it/s]

Training loop 23
tensor([[  101,  2054,  9312,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2515,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 11541,  ...,     0,     0,     0],
        [  101,  2054,  2944,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32026124000549316, logits - tensor([[-5.4475,  1.3445, -4.7108, -0.9749],
        [-5.0302, -3.3852,  2.9565, -2.5290],
        [-4.8685, -3.3748,  1.8696, -2.3002],
        [-5.4745,  1.8654, -5.2931, -1.2132],
    

  8%|▊         | 24/289 [00:18<03:18,  1.33it/s]

Training loop 24
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2029, 17841,  ...,     0,     0,     0],
        [  101,  2054,  2274,  ...,  2079,  2017,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34015265107154846, logits - tensor([[-6.2231,  1.3084, -4.9039, -0.6146],
        [-6.0816,  0.7562, -5.0853, -0.5155],
        [-5.7117,  1.5396, -4.6515, -1.2167],
        [-5.5187,  1.9195, -4.3173, -1.0174],
    

  9%|▊         | 25/289 [00:18<03:17,  1.34it/s]

Training loop 25
tensor([[  101,  2054,  6847,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2515,  1037,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  3176,  ...,     0,     0,     0],
        [  101,  2054, 15756,  ...,     0,     0,     0],
        [  101,  2054,  2944,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24975517392158508, logits - tensor([[-6.6654,  1.4314, -4.7970, -0.7800],
        [-3.9280, -2.4294,  1.7362, -2.1252],
        [-3.8392, -2.9362,  2.7022, -2.6649],
        [-5.1914,  0.7750, -4.4297, -0.5547],
    

  9%|▉         | 26/289 [00:19<03:16,  1.34it/s]

Training loop 26
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,  1997, 17953,   102],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2001,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2322511374950409, logits - tensor([[-5.5167,  1.4623, -4.5800, -0.8436],
        [-5.1687, -1.1010, -4.2795,  1.2355],
        [-4.3018, -1.0984, -4.5942,  2.4774],
        [-6.6495,  0.3959, -4.6591, -0.0792],
     

  9%|▉         | 27/289 [00:20<03:16,  1.33it/s]

Training loop 27
tensor([[ 101, 2054, 3793,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 8310,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36513036489486694, logits - tensor([[-6.5457,  1.4760, -5.2301, -1.8853],
        [-5.8919,  0.1537, -4.4534, -0.1010],
        [-5.2251,  0.8171, -4.5635, -1.2405],
        [-5.3544,  0.2705, -5.4276, -1.3450],
        [-4.9837, -1.0627, -4.4903,  2.1

 10%|▉         | 28/289 [00:21<03:16,  1.33it/s]

Training loop 28
tensor([[  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12267743051052094, logits - tensor([[-6.3912,  1.6963, -5.1549, -1.2377],
        [-5.6489,  0.9945, -5.7755, -1.1838],
        [-4.5746, -2.0183, -5.1251,  2.3806],
        [-5.0027,  1.8298, -4.8072, -0.4395],
    

 10%|█         | 29/289 [00:21<03:15,  1.33it/s]

Training loop 29
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2029, 2944,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2529,  ...,    0,    0,    0],
        [ 101, 2054, 2698,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2850421071052551, logits - tensor([[-5.8069,  1.2955, -5.1773, -1.0178],
        [-6.0154,  1.6410, -5.5681, -0.7196],
        [-6.0945,  1.6112, -5.5347, -0.9813],
        [-6.0873,  0.6436, -5.2374, -1.1062],
        [-5.7711,  0.5147, -4.9632, -1.21

 10%|█         | 30/289 [00:22<03:15,  1.33it/s]

Training loop 30
tensor([[ 101, 2054, 4275,  ...,    0,    0,    0],
        [ 101, 2029, 2416,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2192,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16648831963539124, logits - tensor([[-5.1453,  0.6617, -4.7240, -0.9058],
        [-6.0765,  1.9364, -5.5976, -1.6108],
        [-6.6704,  1.4694, -5.2457, -0.3576],
        [-5.2429,  0.6395, -4.4203, -0.2178],
        [-4.9681, -1.9886, -4.1236,  2.2

 11%|█         | 31/289 [00:23<03:14,  1.33it/s]

Training loop 31
tensor([[ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2064, 1996,  ...,    0,    0,    0],
        [ 101, 2001, 2836,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15909472107887268, logits - tensor([[-4.7817, -1.3091, -3.5255,  1.8954],
        [-4.5477,  0.7415, -4.5751, -0.7840],
        [-4.2999, -3.1092,  2.1882, -1.6871],
        [-5.8269,  0.2108, -5.1639, -0.6400],
        [-5.9770,  0.8984, -4.4331, -0.3

 11%|█         | 32/289 [00:24<03:14,  1.32it/s]

Training loop 32
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029, 15792,  ...,  1024,  1024,   102],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15208956599235535, logits - tensor([[-4.8489,  1.2152, -4.5198, -1.3168],
        [-5.3326,  0.3344, -4.4721, -0.2921],
        [-5.6015,  1.0498, -5.4927, -1.4303],
        [-3.6910, -2.1497,  2.8365, -2.7461],
    

 11%|█▏        | 33/289 [00:24<03:12,  1.33it/s]

Training loop 33
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054, 15756,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2367759644985199, logits - tensor([[-6.0534,  0.9541, -5.0746, -1.1585],
        [-4.5945, -2.5268, -4.3235,  2.6071],
        [-4.1115, -2.2492, -3.2850,  2.2705],
        [-5.7688,  1.2661, -5.1924, -0.7932],
     

 12%|█▏        | 34/289 [00:25<03:12,  1.32it/s]

Training loop 34
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2029, 2330,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38767510652542114, logits - tensor([[-5.3822,  1.2733, -5.0856, -0.9919],
        [-5.6741,  0.9881, -5.5845, -1.2539],
        [-6.3830,  1.9293, -5.5082, -1.9126],
        [-6.1663,  1.2851, -5.2743, -1.0241],
        [-5.4240, -1.3098, -4.6093,  1.6

 12%|█▏        | 35/289 [00:26<03:12,  1.32it/s]

Training loop 35
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,  1998,  2382,   102],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1934145987033844, logits - tensor([[-5.6722,  0.7579, -4.7931, -1.1338],
        [-3.8693, -2.6044,  2.1241, -2.8950],
        [-5.8457,  1.3172, -4.4512, -1.0240],
        [-5.5171,  1.4903, -5.0847, -1.6682],
     

 12%|█▏        | 36/289 [00:27<03:11,  1.32it/s]

Training loop 36
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2515,  2944,  ...,     0,     0,     0],
        [  101,  2515,  2037,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2079, 22889,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13897374272346497, logits - tensor([[-6.2725,  1.4961, -5.0440, -1.1040],
        [-4.6911, -3.1045,  1.6923, -2.7220],
        [-5.2419, -3.0754,  2.6037, -2.5331],
        [-6.4116,  1.5239, -5.4146, -1.0429],
    

 13%|█▎        | 37/289 [00:27<03:10,  1.32it/s]

Training loop 37
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 3563,  ...,    0,    0,    0],
        [ 101, 2024, 2037,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2073, 2106,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11361856758594513, logits - tensor([[-5.6238,  1.8789, -5.6058, -1.8778],
        [-6.1626,  0.4305, -5.1059, -1.1858],
        [-4.7619, -2.9066,  1.8804, -2.7641],
        [-4.7730, -4.0172,  2.4437, -3.2003],
        [-5.5841,  0.4813, -5.1253, -1.2

 13%|█▎        | 38/289 [00:28<03:09,  1.32it/s]

Training loop 38
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2006,  2054,  ...,     0,     0,     0],
        [  101,  2029,  2784,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,  1998, 26839,   102],
        [  101,  2029,  2838,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12761321663856506, logits - tensor([[-6.4782,  1.4571, -5.1494, -1.7510],
        [-4.7199, -1.9156, -4.9259,  0.9921],
        [-5.8779,  2.2027, -4.4976, -1.7551],
        [-5.5619,  0.6513, -4.7817, -0.2272],
    

 13%|█▎        | 39/289 [00:29<03:08,  1.33it/s]

Training loop 39
tensor([[ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31868910789489746, logits - tensor([[-5.8714,  0.5839, -4.4753, -1.1709],
        [-4.8192,  1.1461, -3.9896, -0.9006],
        [-6.0922,  1.2550, -5.8695, -1.2632],
        [-5.6534,  0.9647, -4.2049, -1.0442],
        [-4.4212, -1.4545, -3.6291,  1.0

 14%|█▍        | 40/289 [00:30<03:08,  1.32it/s]

Training loop 40
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  4730,  ...,     0,     0,     0],
        [  101,  2024,  2070,  ...,     0,     0,     0],
        ...,
        [  101,  2106,  1996,  ...,     0,     0,     0],
        [  101,  2054, 17537,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.168476402759552, logits - tensor([[-4.5301, -2.8420,  1.8604, -2.2858],
        [-4.5051,  1.1337, -5.0348, -1.4377],
        [-4.9545, -3.1280,  2.1604, -3.3967],
        [-4.7651, -0.0132, -3.8967, -1.6521],
      

 14%|█▍        | 41/289 [00:30<03:07,  1.32it/s]

Training loop 41
tensor([[  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23189681768417358, logits - tensor([[-5.2533, -2.7185, -4.0454,  1.9629],
        [-4.7966, -3.0851,  2.8472, -2.3560],
        [-4.6481, -3.1329, -4.5453,  3.1006],
        [-4.6782, -3.3704,  2.7985, -3.8473],
    

 15%|█▍        | 42/289 [00:31<03:06,  1.33it/s]

Training loop 42
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2003, 5754,  ...,    0,    0,    0],
        [ 101, 2054, 2097,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2339, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2653,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26193904876708984, logits - tensor([[-3.8656, -2.2504, -4.1552,  2.3001],
        [-5.2157, -3.1874,  1.8149, -2.2233],
        [-5.2736,  1.9131, -5.0385, -0.7367],
        [-5.5450,  1.2371, -5.2829, -1.4965],
        [-5.4933, -1.4735, -4.4744,  1.0

 15%|█▍        | 43/289 [00:32<03:04,  1.33it/s]

Training loop 43
tensor([[ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 2019,  ...,    0,    0,    0],
        [ 101, 2029, 1997,  ...,    0,    0,    0],
        [ 101, 2024, 2045,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2933211624622345, logits - tensor([[-6.5136,  0.1487, -5.1892, -1.4167],
        [-5.2247,  0.7880, -4.6472, -0.6458],
        [-6.3809,  1.0223, -5.1837, -1.3592],
        [-4.9174,  1.1674, -4.5139, -2.1369],
        [-5.6551,  0.6002, -4.9371, -1.99

 15%|█▌        | 44/289 [00:33<03:04,  1.33it/s]

Training loop 44
tensor([[  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2054, 28616,  ...,     0,     0,     0],
        [  101,  2054,  7861,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2512,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16231122612953186, logits - tensor([[-4.4937, -3.0220,  2.1886, -2.3988],
        [-5.1950,  1.6363, -4.5829, -1.2350],
        [-6.2813,  1.0844, -4.5238, -1.7606],
        [-6.5796,  1.9162, -5.1513, -1.4669],
    

 16%|█▌        | 45/289 [00:33<03:03,  1.33it/s]

Training loop 45
tensor([[ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2054, 2397,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0929606705904007, logits - tensor([[-5.9707,  0.6948, -4.8298, -1.1272],
        [-6.0346,  1.6929, -5.9053, -1.6845],
        [-6.1004,  1.9444, -5.4463, -1.2619],
        [-4.6175, -1.7349, -3.8717,  1.4741],
        [-4.8585,  1.1171, -3.4545, -1.24

 16%|█▌        | 46/289 [00:34<03:02,  1.33it/s]

Training loop 46
tensor([[ 101, 2054, 2836,  ...,    0,    0,    0],
        [ 101, 2024, 2045,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 3787,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1903163343667984, logits - tensor([[-5.2505,  1.1901, -3.8636, -1.5862],
        [-4.9456, -3.3794,  2.4035, -3.3850],
        [-3.9460, -1.5406, -4.1051,  2.0592],
        [-5.1150,  1.2431, -4.0881, -1.0641],
        [-5.6425,  1.0082, -5.1949, -2.35

 16%|█▋        | 47/289 [00:35<03:01,  1.33it/s]

Training loop 47
tensor([[ 101, 2054, 2785,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2001, 2023,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 3176,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07609138637781143, logits - tensor([[-5.2262,  2.0084, -3.9367, -2.0679],
        [-5.9292,  0.8264, -4.7544, -1.9382],
        [-3.7688, -2.6015,  1.9999, -2.0332],
        [-4.9037, -2.9263,  2.4807, -2.8181],
        [-4.7367, -3.0098,  1.8619, -2.8

 17%|█▋        | 48/289 [00:36<03:01,  1.32it/s]

Training loop 48
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054, 13588,  ...,     0,     0,     0],
        [  101,  2029, 13248,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2028,  ...,     0,     0,     0],
        [  101,  2001,  9312,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5543839931488037, logits - tensor([[-5.5537,  2.0556, -4.2645, -1.9199],
        [-5.4336,  1.3559, -5.0311, -1.1142],
        [-4.2394, -1.9365, -3.5096,  2.2675],
        [-6.3990,  1.6279, -5.9945, -1.7299],
     

 17%|█▋        | 49/289 [00:36<03:01,  1.32it/s]

Training loop 49
tensor([[ 101, 2054, 2515,  ...,    0,    0,    0],
        [ 101, 2003, 2023,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 3350,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6093313694000244, logits - tensor([[-5.2331,  0.6415, -4.1001, -1.4995],
        [-4.9235, -2.4529,  1.9194, -2.8621],
        [-5.5467,  0.8795, -5.3123, -2.0556],
        [-4.8205,  1.8747, -3.8973, -1.6322],
        [-5.4786,  0.9789, -4.7696, -1.67

 17%|█▋        | 50/289 [00:37<03:01,  1.32it/s]

Training loop 50
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 2613,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ..., 2022, 3697,  102],
        ...,
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22685536742210388, logits - tensor([[-5.0199, -2.0383, -4.5416,  2.3285],
        [-5.4361,  1.3188, -3.8055, -1.1964],
        [-5.1815,  0.8344, -4.6111, -0.8141],
        [-6.4509,  0.8137, -5.2497, -1.3185],
        [-4.9155, -2.9059,  2.4852, -3.1

 18%|█▊        | 51/289 [00:38<03:01,  1.31it/s]

Training loop 51
tensor([[  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054, 12247,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1606941670179367, logits - tensor([[-5.2684,  1.0590, -4.7870, -1.0519],
        [-4.0344, -3.0468,  3.1218, -2.8398],
        [-4.8071,  0.9596, -4.3941, -1.4836],
        [-5.7033,  2.2621, -5.3550, -1.9298],
     

 18%|█▊        | 52/289 [00:39<03:00,  1.31it/s]

Training loop 52
tensor([[  101,  2029,  4942,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2009,  3504,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  8107,  ..., 12170, 13578,   102],
        [  101,  2054,  4275,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3083067536354065, logits - tensor([[-6.1457,  1.7704, -5.1126, -2.0913],
        [-4.0335, -3.2613,  1.9835, -2.4484],
        [-6.3836,  1.1320, -4.3712, -1.7274],
        [-5.8594,  1.9756, -4.4583, -0.9401],
     

 18%|█▊        | 53/289 [00:40<02:59,  1.31it/s]

Training loop 53
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2048,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08489511907100677, logits - tensor([[-4.7885, -2.9949,  2.7551, -3.5593],
        [-4.7687, -3.3519,  2.2193, -3.1469],
        [-6.3338,  1.2334, -5.6218, -1.0031],
        [-4.8088, -3.1289,  2.5018, -2.9801],
        [-5.9444,  0.9959, -5.2399, -2.0

 19%|█▊        | 54/289 [00:40<02:59,  1.31it/s]

Training loop 54
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 3921,  ...,    0,    0,    0],
        [ 101, 2054, 7885,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24706797301769257, logits - tensor([[-4.2591, -2.4457, -4.6875,  1.7348],
        [-5.8466,  1.6058, -5.2691, -1.3519],
        [-4.7309,  1.0029, -3.5482, -0.8509],
        [-5.1838,  1.5010, -3.8502, -1.7752],
        [-4.8964, -3.5011,  2.7066, -3.2

 19%|█▉        | 55/289 [00:41<02:58,  1.31it/s]

Training loop 55
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 3176,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 3716,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10843240469694138, logits - tensor([[-6.1233,  0.9740, -3.9711, -0.9173],
        [-5.8178,  1.7279, -5.3632, -0.8817],
        [-5.1951,  1.4430, -5.2946, -2.1336],
        [-6.1741,  1.2579, -5.1265, -2.5445],
        [-5.6827,  1.7484, -4.6816, -1.1

 19%|█▉        | 56/289 [00:42<02:57,  1.32it/s]

Training loop 56
tensor([[  101,  2054, 16134,  ...,     0,     0,     0],
        [  101,  2515,  2037,  ...,     0,     0,     0],
        [  101,  2079,  3431,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  7236,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23809444904327393, logits - tensor([[-5.7108,  0.6305, -4.1564, -1.4994],
        [-5.0775, -3.5684,  2.3280, -3.3709],
        [-3.2289, -3.0373,  3.2586, -2.5860],
        [-4.9083, -0.5473, -0.6552, -2.1640],
    

 20%|█▉        | 57/289 [00:43<02:55,  1.32it/s]

Training loop 57
tensor([[ 101, 2058, 2029,  ...,    0,    0,    0],
        [ 101, 1999, 2029,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11346347630023956, logits - tensor([[-5.1820,  1.8884, -4.5747, -1.1695],
        [-4.6998, -1.8940, -3.5910,  2.1061],
        [-5.0607,  2.0631, -4.2288, -1.0124],
        [-5.7817,  1.2459, -4.8216, -0.5306],
        [-5.8348,  1.3784, -4.3667, -1.4

 20%|██        | 58/289 [00:43<02:54,  1.32it/s]

Training loop 58
tensor([[ 101, 2054, 8310,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2003, 2944,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1881226897239685, logits - tensor([[-4.8228, -1.8784, -4.8442,  2.2520],
        [-5.1717, -1.4815, -4.5981,  1.1228],
        [-4.6921, -2.9483,  1.6848, -2.2168],
        [-6.8782,  0.6868, -4.9733, -1.9987],
        [-4.8236,  1.5795, -4.3618, -1.14

 20%|██        | 59/289 [00:44<02:53,  1.33it/s]

Training loop 59
tensor([[ 101, 2029, 2048,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 3818,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24005216360092163, logits - tensor([[-5.5560,  1.5988, -5.0291, -2.1953],
        [-5.7157,  2.4072, -4.1413, -1.8565],
        [-5.7064,  1.0434, -3.8449, -1.1901],
        [-6.2432, -2.3336, -4.9132,  1.8170],
        [-5.9826,  1.3006, -5.3188, -0.6

 21%|██        | 60/289 [00:45<02:52,  1.33it/s]

Training loop 60
tensor([[ 101, 2054, 2048,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 6747,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2844,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.312772274017334, logits - tensor([[-5.3002,  0.9863, -4.5545, -0.9887],
        [-6.0234,  0.8243, -5.0888, -0.1545],
        [-5.1465,  1.1645, -4.7336, -1.6114],
        [-6.0540,  0.1223, -5.0630, -0.8813],
        [-5.5821,  1.4563, -4.7442, -1.029

 21%|██        | 61/289 [00:46<02:51,  1.33it/s]

Training loop 61
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 6882,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 8190,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33762624859809875, logits - tensor([[-3.5838, -3.2421,  3.1265, -2.4950],
        [-4.8353,  1.2588, -4.5199, -0.9062],
        [-6.4581,  0.9074, -4.4373, -0.7988],
        [-6.4889,  1.2818, -5.5725, -2.0074],
        [-4.4807, -0.7757, -4.7626,  0.6

 21%|██▏       | 62/289 [00:46<02:50,  1.33it/s]

Training loop 62
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029, 16227,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19092433154582977, logits - tensor([[-3.2001, -2.6565,  2.9368, -2.1707],
        [-5.2445,  0.9627, -5.5952, -1.5558],
        [-5.1167,  1.7338, -5.0585, -1.0432],
        [-7.0386,  1.6020, -7.1434, -1.9024],
    

 22%|██▏       | 63/289 [00:47<02:50,  1.33it/s]

Training loop 63
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054, 24828,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2700265049934387, logits - tensor([[-5.7410,  0.9022, -5.4097, -1.8039],
        [-4.9410, -3.3937,  2.0665, -2.2679],
        [-5.5953,  2.1837, -5.1629, -1.4675],
        [-5.3759, -1.5105, -4.4557,  1.1132],
     

 22%|██▏       | 64/289 [00:48<02:49,  1.33it/s]

Training loop 64
tensor([[ 101, 2129, 2024,  ..., 1051, 4492,  102],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 4155,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2769066393375397, logits - tensor([[-6.0183,  1.2183, -3.6110, -0.7259],
        [-5.4990,  1.5119, -5.9862, -0.9752],
        [-3.5396, -2.8108, -4.1116,  2.8465],
        [-5.9616,  1.4050, -5.2888, -1.0215],
        [-6.4344,  1.8704, -4.9156, -1.38

 22%|██▏       | 65/289 [00:49<02:48,  1.33it/s]

Training loop 65
tensor([[ 101, 2054, 2785,  ...,    0,    0,    0],
        [ 101, 2054, 5903,  ...,    0,    0,    0],
        [ 101, 2054, 6695,  ..., 2196, 2583,  102],
        ...,
        [ 101, 2000, 2054,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2146,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2923792600631714, logits - tensor([[-5.6803, -1.9391, -5.0525,  1.9111],
        [-5.2903,  1.2263, -5.4606, -2.1053],
        [-5.2315,  1.4051, -3.9026, -0.5994],
        [-5.6036, -2.0190, -4.6182,  2.8191],
        [-5.0071, -2.3435, -1.9778, -0.13

 23%|██▎       | 66/289 [00:49<02:48,  1.33it/s]

Training loop 66
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 2488,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 4127,  ..., 2013, 1996,  102],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38876670598983765, logits - tensor([[-5.7813,  1.2545, -5.9927, -1.9004],
        [-5.0218, -1.5066, -4.0009,  2.0033],
        [-6.4958,  1.4188, -4.7983, -1.2383],
        [-4.8710,  0.2654, -4.1284, -0.6702],
        [-4.4907, -3.0762,  2.5327, -3.0

 23%|██▎       | 67/289 [00:50<02:47,  1.33it/s]

Training loop 67
tensor([[ 101, 2054, 4773,  ...,    0,    0,    0],
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4291473627090454, logits - tensor([[-5.8055, -0.9404, -3.8234,  1.7647],
        [-5.9102,  0.8707, -4.0296, -0.4528],
        [-6.4880,  1.6641, -5.0770, -1.1135],
        [-3.6954, -3.3393,  2.1237, -2.6402],
        [-5.5347,  0.8105, -3.6682, -0.39

 24%|██▎       | 68/289 [00:51<02:47,  1.32it/s]

Training loop 68
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22078253328800201, logits - tensor([[-5.7777,  0.3022, -4.6947, -0.5311],
        [-4.9708, -4.6895,  2.8537, -2.7835],
        [-6.2549,  0.9867, -4.2098, -1.2165],
        [-6.0684,  0.8171, -4.8455, -0.5090],
        [-5.0028,  1.2051, -4.0368, -0.8

 24%|██▍       | 69/289 [00:52<02:46,  1.32it/s]

Training loop 69
tensor([[ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 6210,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 7017,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21060161292552948, logits - tensor([[-6.4834,  1.5646, -5.6126, -1.1523],
        [-6.4956,  1.7954, -4.6951, -0.6546],
        [-5.6658,  1.5818, -5.3208, -1.1571],
        [-5.9960,  1.6375, -5.0952, -1.2955],
        [-5.0781,  1.3399, -4.6944, -0.9

 24%|██▍       | 70/289 [00:52<02:46,  1.32it/s]

Training loop 70
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2006, 2029,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07582725584506989, logits - tensor([[-5.4394,  1.7704, -4.9246, -0.4386],
        [-5.9009,  1.9167, -5.0233, -1.1991],
        [-5.2228, -3.7798,  2.5025, -3.6221],
        [-4.8449, -2.4248, -4.6551,  3.4318],
        [-4.2914, -2.7307, -3.8330,  2.6

 25%|██▍       | 71/289 [00:53<02:45,  1.32it/s]

Training loop 71
tensor([[ 101, 2001, 1996,  ...,    0,    0,    0],
        [ 101, 2029, 1997,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2035,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 6975,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.348618745803833, logits - tensor([[-4.9355, -3.4157,  2.5476, -1.7072],
        [-5.3121, -2.8114, -4.6641,  2.7640],
        [-6.8189,  1.2293, -5.2400, -1.3441],
        [-5.6466,  1.1636, -5.1698, -1.4385],
        [-6.1986,  0.8422, -5.7514, -1.148

 25%|██▍       | 72/289 [00:54<02:43,  1.32it/s]

Training loop 72
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 2070,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 6123,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17843393981456757, logits - tensor([[-5.8021,  1.3566, -4.7734, -1.1940],
        [-3.5171, -2.5160,  2.7857, -3.0041],
        [-4.2915, -4.0298,  3.1879, -3.0023],
        [-5.5248,  0.7780, -4.2048, -1.1546],
        [-5.6402,  1.1632, -5.0394, -1.0

 25%|██▌       | 73/289 [00:55<02:43,  1.32it/s]

Training loop 73
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2003,  2023,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  6882,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ..., 20446, 13087,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19104589521884918, logits - tensor([[-3.5703, -3.2412,  2.3467, -2.0394],
        [-5.8604,  1.7018, -4.2646, -0.8287],
        [-4.0604, -2.5200,  2.6383, -3.0112],
        [-4.9882, -0.7783, -4.2973,  1.4375],
    

 26%|██▌       | 74/289 [00:55<02:43,  1.32it/s]

Training loop 74
tensor([[  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2399,  ...,     0,     0,     0],
        [  101,  2029, 18224,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  4012,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18150153756141663, logits - tensor([[-4.9888, -3.5818,  2.9605, -2.8942],
        [-6.1263,  0.1138, -5.4616, -1.2360],
        [-5.0056, -2.0462, -4.8187,  1.8475],
        [-6.0252,  1.1749, -4.8810, -1.0019],
    

 26%|██▌       | 75/289 [00:56<02:42,  1.32it/s]

Training loop 75
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,  5372, 12139,   102],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  2653,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14537647366523743, logits - tensor([[-6.0843,  1.7856, -5.5064, -1.4267],
        [-5.8291,  0.7375, -4.5674, -0.8398],
        [-5.5492,  1.4586, -5.2270, -0.4609],
        [-6.5058,  1.0947, -4.9694, -0.7415],
    

 26%|██▋       | 76/289 [00:57<02:41,  1.32it/s]

Training loop 76
tensor([[  101,  2029,  2350,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  4127,  ...,     0,     0,     0],
        [  101,  2029,  2944,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2582353353500366, logits - tensor([[-5.4931,  1.2555, -4.7932, -1.1342],
        [-4.6719, -4.0864,  3.4520, -2.7408],
        [-6.0602,  1.4092, -4.9394, -1.4932],
        [-4.6010, -3.4924,  3.0373, -1.9230],
     

 27%|██▋       | 77/289 [00:58<02:40,  1.32it/s]

Training loop 77
tensor([[  101,  2054,  2024,  ...,  6026, 11834,   102],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  7620,  ...,     0,     0,     0],
        [  101,  2054,  2785,  ...,     0,     0,     0],
        [  101,  2064,  2023,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11562266945838928, logits - tensor([[-5.7334,  0.6662, -4.7461, -0.8265],
        [-4.7238, -3.4222,  2.8557, -2.3324],
        [-6.3007,  1.2619, -5.3518, -1.4688],
        [-5.2463, -2.6756, -3.9143,  3.0704],
    

 27%|██▋       | 78/289 [00:58<02:39,  1.32it/s]

Training loop 78
tensor([[ 101, 2029, 4800,  ...,    0,    0,    0],
        [ 101, 2029, 1997,  ...,    0,    0,    0],
        [ 101, 2058, 2054,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2092,  ...,    0,    0,    0],
        [ 101, 2024, 2151,  ...,    0,    0,    0],
        [ 101, 2129, 5656,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13649089634418488, logits - tensor([[-5.7123,  1.3829, -3.6432, -0.8698],
        [-5.7877,  1.1815, -4.4714, -1.1541],
        [-5.4098,  1.3401, -4.5336, -1.2314],
        [-6.3602,  1.3843, -5.1613, -1.4585],
        [-4.6436,  0.9884, -4.7456, -0.3

 27%|██▋       | 79/289 [00:59<02:38,  1.32it/s]

Training loop 79
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2367,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25336748361587524, logits - tensor([[-4.1275, -3.4713,  3.4627, -2.4289],
        [-6.1431,  1.4493, -4.8645, -1.7871],
        [-6.0604,  0.7978, -5.0879, -1.7482],
        [-5.5601,  1.4776, -4.8912, -1.0241],
        [-4.0873, -2.5755,  3.0165, -2.5

 28%|██▊       | 80/289 [01:00<02:37,  1.32it/s]

Training loop 80
tensor([[ 101, 2054, 2024,  ..., 1997, 1996,  102],
        [ 101, 2054, 2951,  ..., 2023, 2862,  102],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 3716,  ...,    0,    0,    0],
        [ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 5835,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18018777668476105, logits - tensor([[-6.1273,  0.5044, -4.6354, -0.4178],
        [-5.8549,  1.1528, -4.5278, -1.2437],
        [-6.1703,  0.5860, -5.7846, -1.4431],
        [-5.6389, -2.4312, -5.6194,  1.5274],
        [-5.8197,  1.6308, -5.2210, -1.4

 28%|██▊       | 81/289 [01:01<02:37,  1.32it/s]

Training loop 81
tensor([[  101,  2054,  4800,  ...,     0,     0,     0],
        [  101,  2054,  2944,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054,  6327,  ...,  2839,  1010,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12488789856433868, logits - tensor([[-6.1650,  1.1014, -5.0169, -1.9728],
        [-6.2383,  1.3912, -5.6711, -1.4615],
        [-5.4428,  1.9861, -5.1627, -0.7295],
        [-5.9269,  1.2771, -4.0642, -2.0010],
    

 28%|██▊       | 82/289 [01:01<02:36,  1.33it/s]

Training loop 82
tensor([[  101,  2003,  2037,  ...,     0,     0,     0],
        [  101,  2040,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,  2312, 29536,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13375504314899445, logits - tensor([[-5.6032,  0.3443, -4.1945, -0.8074],
        [-5.1001,  2.5281, -4.6962, -1.4616],
        [-5.0047, -1.7082, -5.1261,  1.9091],
        [-4.9556,  0.7693, -4.6721, -0.5146],
    

 29%|██▊       | 83/289 [01:02<02:36,  1.32it/s]

Training loop 83
tensor([[ 101, 2054, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 3130,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18365535140037537, logits - tensor([[-5.6699,  1.6360, -5.2446, -1.1120],
        [-6.0081,  0.6854, -5.0526, -1.1534],
        [-6.1166,  1.9147, -5.9554, -1.8388],
        [-5.2585,  1.6880, -4.9901, -1.1833],
        [-4.2331, -4.2840,  2.2835, -2.8

 29%|██▉       | 84/289 [01:03<02:34,  1.32it/s]

Training loop 84
tensor([[ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 6847,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1909846067428589, logits - tensor([[-5.3038, -3.0905,  1.8467, -3.5412],
        [-6.7898,  1.8291, -6.7677, -1.7506],
        [-5.7377,  1.5834, -3.8953, -0.8806],
        [-5.3717, -2.0297, -5.2770,  2.2499],
        [-5.6148,  1.0378, -5.1547, -1.26

 29%|██▉       | 85/289 [01:04<02:33,  1.33it/s]

Training loop 85
tensor([[ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ..., 1996, 3643,  102],
        ...,
        [ 101, 2029, 2112,  ...,    0,    0,    0],
        [ 101, 2029, 8789,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1417095959186554, logits - tensor([[-4.7366,  1.1181, -4.5032, -1.5294],
        [-5.5621,  1.6334, -5.2955, -1.9092],
        [-6.1247,  0.3242, -4.1528, -0.9753],
        [-5.9245,  1.8255, -5.6004, -1.5905],
        [-4.5959, -3.1308,  2.4778, -2.79

 30%|██▉       | 86/289 [01:04<02:32,  1.33it/s]

Training loop 86
tensor([[  101,  2054, 21641,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2073,  2515,  ...,  1010,  1040,   102],
        ...,
        [  101,  2054, 16745,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054, 12783,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2838349938392639, logits - tensor([[-4.5873, -1.3355, -4.4811,  1.4386],
        [-4.9755, -3.6885,  3.0181, -2.8969],
        [-5.4658,  1.0971, -4.0318, -1.1206],
        [-5.4353,  0.7888, -4.8710, -0.4687],
     

 30%|███       | 87/289 [01:05<02:31,  1.33it/s]

Training loop 87
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 3280,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3639225363731384, logits - tensor([[-6.0312,  1.0403, -5.1584, -1.7683],
        [-5.4710,  1.4927, -4.3262, -0.5715],
        [-5.5691, -2.2043, -4.7673,  0.9616],
        [-6.3762,  1.6993, -4.9271, -2.4226],
        [-6.4791,  1.2422, -5.1182, -1.41

 30%|███       | 88/289 [01:06<02:30,  1.33it/s]

Training loop 88
tensor([[  101,  2054,  3025,  ..., 10229,  2000,   102],
        [  101,  2054,  4973,  ...,     0,     0,     0],
        [  101,  2040,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2339,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3443485498428345, logits - tensor([[-5.0834,  1.4088, -4.0709, -0.5126],
        [-6.5910,  1.3007, -5.0636, -1.1760],
        [-5.5132,  1.9281, -5.6889, -1.9723],
        [-4.5954, -2.9610,  1.8633, -2.4043],
     

 31%|███       | 89/289 [01:07<02:30,  1.33it/s]

Training loop 89
tensor([[  101,  2029, 12719,  ...,  2890,  2546,   102],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2029,  4155,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054, 16105,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1699449121952057, logits - tensor([[-4.4297,  1.0489, -3.8168, -0.7636],
        [-6.0086,  1.9501, -6.3238, -1.7536],
        [-5.1940,  1.3846, -4.8865, -1.0835],
        [-5.4457,  1.2790, -4.7028, -2.0898],
     

 31%|███       | 90/289 [01:07<02:30,  1.33it/s]

Training loop 90
tensor([[ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3730745315551758, logits - tensor([[-4.6419, -2.9041,  2.2842, -2.5343],
        [-5.0292, -0.2596, -3.6749,  0.8766],
        [-6.1724,  1.3476, -4.4660, -1.2592],
        [-5.9326,  1.9095, -5.1312, -2.0161],
        [-4.2097, -3.7987,  2.2009, -2.69

 31%|███▏      | 91/289 [01:08<02:30,  1.32it/s]

Training loop 91
tensor([[ 101, 2029, 1017,  ...,    0,    0,    0],
        [ 101, 2029, 4275,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3632650375366211, logits - tensor([[-4.4112, -2.5284, -3.8845,  1.3922],
        [-5.5479, -0.9555, -5.4348,  0.8345],
        [-5.8625,  0.9079, -4.2090, -0.7603],
        [-5.2370,  1.5749, -4.3349, -0.8412],
        [-5.6643,  0.7088, -4.7723, -1.10

 32%|███▏      | 92/289 [01:09<02:30,  1.31it/s]

Training loop 92
tensor([[ 101, 2011, 2129,  ..., 2897, 1006,  102],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2029, 3653,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 32%|███▏      | 93/289 [01:10<02:29,  1.31it/s]

loss - 0.35654011368751526, logits - tensor([[-5.0717, -2.1835, -5.0155,  2.2691],
        [-6.4023,  1.6888, -5.6317, -1.6354],
        [-3.9735, -1.5715,  2.2995, -2.7334],
        [-5.4263, -3.2361, -4.3607,  3.3392],
        [-5.8529,  1.3373, -4.6911, -1.3329],
        [-4.6411, -3.2074,  1.9470, -3.4051],
        [-4.7867, -2.2379, -5.1384,  0.6538],
        [-6.2999,  1.9340, -5.2519, -1.9057]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 93
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054, 10640,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0,

 33%|███▎      | 94/289 [01:11<02:28,  1.31it/s]

Training loop 94
tensor([[  101,  2001,  1996,  ...,     0,     0,     0],
        [  101,  2054,  6388,  ...,     0,     0,     0],
        [  101,  2054,  2944,  ...,     0,     0,     0],
        ...,
        [  101,  2515,  2169,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2054, 15306,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26324760913848877, logits - tensor([[-4.7294,  0.2093, -2.5955, -1.6985],
        [-5.7133,  1.9456, -5.3354, -1.3382],
        [-6.2092,  1.4177, -4.9714, -1.0064],
        [-5.4942, -3.6249,  1.7565, -3.5661],
    

 33%|███▎      | 95/289 [01:11<02:26,  1.32it/s]

Training loop 95
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 5109,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ..., 4349, 8484,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18872766196727753, logits - tensor([[-6.4958,  1.1816, -5.6939, -1.7414],
        [-6.3142,  1.8918, -5.5965, -1.3978],
        [-5.8695,  1.6340, -4.2214, -1.9451],
        [-5.1261,  1.5612, -4.6962, -0.9188],
        [-5.5743,  1.8538, -4.5998, -1.5

 33%|███▎      | 96/289 [01:12<02:26,  1.31it/s]

Training loop 96
tensor([[  101,  2054, 12783,  ...,     0,     0,     0],
        [  101,  2024,  2060,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09622922539710999, logits - tensor([[-5.9428,  2.0582, -6.6984, -1.0572],
        [-5.4939, -3.8454,  2.5200, -2.0044],
        [-5.4760, -2.1432, -5.5900,  1.7094],
        [-4.7533, -2.0545,  1.5925, -2.5360],
    

 34%|███▎      | 97/289 [01:13<02:26,  1.31it/s]

Training loop 97
tensor([[ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2836,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2128620445728302, logits - tensor([[-5.4129,  2.0135, -4.5272, -2.0979],
        [-5.7650,  1.5741, -4.7874, -1.8680],
        [-4.6489,  1.0774, -3.7483, -1.4726],
        [-4.8140, -4.3005,  1.9824, -3.2598],
        [-5.9360,  2.0230, -4.6231, -1.99

 34%|███▍      | 98/289 [01:14<02:24,  1.32it/s]

Training loop 98
tensor([[ 101, 2129, 4294,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 2020,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15140241384506226, logits - tensor([[-5.9105,  1.2300, -5.3022, -1.8840],
        [-4.7960, -2.6736, -4.6080,  2.0240],
        [-5.8133,  0.7823, -4.5468, -1.8106],
        [-6.0080,  1.0233, -4.8114, -1.7459],
        [-6.9142,  1.5543, -5.7747, -1.7

 34%|███▍      | 99/289 [01:14<02:24,  1.32it/s]

Training loop 99
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2029, 12046,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2110,  ...,     0,     0,     0],
        [  101,  2054, 14965,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32357341051101685, logits - tensor([[-5.7330,  1.0753, -4.9978, -1.2699],
        [-5.2512,  1.6838, -5.4976, -1.2498],
        [-6.0346,  2.1328, -5.1475, -1.5179],
        [-4.9724, -2.5245,  1.9379, -3.2825],
    

 35%|███▍      | 100/289 [01:15<02:23,  1.32it/s]

Training loop 100
tensor([[ 101, 2054, 6847,  ...,    0,    0,    0],
        [ 101, 2029, 9324,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ..., 1055, 2012,  102],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 4127,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36158907413482666, logits - tensor([[-7.1105,  1.6318, -5.3292, -1.4506],
        [-5.5222,  1.5630, -5.3774, -2.7869],
        [-5.8624,  1.0319, -5.7112, -1.5848],
        [-6.0871,  1.8250, -5.2568, -1.9833],
        [-6.7747,  0.5554, -5.7552, -1.

 35%|███▍      | 101/289 [01:16<02:22,  1.32it/s]

Training loop 101
tensor([[  101,  2054, 20062,  ...,  1037,  1010,   102],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2129,  6048,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1812431812286377, logits - tensor([[-5.4195,  1.1183, -3.8717, -1.0132],
        [-4.7604,  2.1968, -4.1415, -1.5811],
        [-5.9130,  1.4766, -5.1172, -1.5068],
        [-4.8938, -2.3130, -4.2258,  2.1426],
    

 35%|███▌      | 102/289 [01:17<02:21,  1.32it/s]

Training loop 102
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29977232217788696, logits - tensor([[-6.5370,  0.7459, -5.6932, -2.2926],
        [-6.2938,  1.2447, -4.4218, -1.3875],
        [-5.4395,  1.4766, -4.1736, -0.8396],
        [-5.8905,  1.1728, -4.9757, -1.3306],
        [-5.3031,  1.6043, -4.6649, -0.

 36%|███▌      | 103/289 [01:17<02:20,  1.33it/s]

Training loop 103
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25049108266830444, logits - tensor([[-4.7237, -2.0333, -4.8730,  2.0896],
        [-5.4357,  1.4264, -4.5723, -1.3874],
        [-4.9220, -3.4273,  2.2588, -2.3633],
        [-4.7186, -1.9581, -5.4825,  2.5162],
        [-5.6640,  0.8651, -5.6198, -1.

 36%|███▌      | 104/289 [01:18<02:18,  1.33it/s]

Training loop 104
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0],
        [  101,  2029, 13221,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  3020,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2054,  7551,  ...,  2217,  2515,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17320922017097473, logits - tensor([[-4.4851, -1.9342, -3.3605,  2.2254],
        [-5.3396,  0.7581, -4.6119, -1.3498],
        [-6.0874,  1.7842, -5.2726, -1.9877],
        [-5.1281,  0.6532, -4.0883, -1.8538],
   

 36%|███▋      | 105/289 [01:19<02:18,  1.33it/s]

Training loop 105
tensor([[  101,  2054,  2591,  ...,     0,     0,     0],
        [  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  8107,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  1005,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27666574716567993, logits - tensor([[-5.3281,  0.7248, -4.0188, -1.0117],
        [-5.7677,  1.1624, -4.7001, -1.9409],
        [-6.2270,  1.2113, -4.6381, -1.1895],
        [-4.4066,  1.5537, -4.4521, -1.0165],
   

 37%|███▋      | 106/289 [01:20<02:17,  1.33it/s]

Training loop 106
tensor([[  101,  2129,  3020,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0],
        [  101,  2054,  2653,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054, 15792,  ...,     0,     0,     0],
        [  101,  2024,  2151,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09662185609340668, logits - tensor([[-5.2280,  1.6766, -4.2430, -0.8549],
        [-5.4553,  0.8314, -5.4613, -1.8757],
        [-4.9069,  1.0451, -4.8840, -0.9619],
        [-4.4445, -1.7784, -4.6298,  1.3483],
   

 37%|███▋      | 107/289 [01:20<02:17,  1.32it/s]

Training loop 107
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2029, 2653,  ...,    0,    0,    0],
        [ 101, 2029, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2073, 2106,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36742427945137024, logits - tensor([[-5.7493,  1.1068, -5.3141, -2.2988],
        [-5.4862,  2.7837, -4.2100, -1.6890],
        [-6.0954,  1.6334, -5.0143, -1.4074],
        [-5.1739, -3.0348, -4.5726,  2.9718],
        [-4.6308, -2.4237, -4.9526,  2.

 37%|███▋      | 108/289 [01:21<02:16,  1.33it/s]

Training loop 108
tensor([[  101,  2071,  2017,  ...,     0,     0,     0],
        [  101,  2054, 12158,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  4275,  ...,     0,     0,     0],
        [  101,  2054,  7860,  ...,     0,     0,     0],
        [  101,  2003,  5746,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26122766733169556, logits - tensor([[-5.5628, -3.1066, -0.6920, -1.7298],
        [-5.0023,  1.6803, -4.2795, -1.5553],
        [-5.2313, -4.0215,  2.9267, -3.6348],
        [-5.7955,  1.9196, -4.9870, -2.8498],
   

 38%|███▊      | 109/289 [01:22<02:15,  1.33it/s]

Training loop 109
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2946,  ..., 2871, 1013,  102],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 38%|███▊      | 110/289 [01:23<02:15,  1.32it/s]

loss - 0.2276443988084793, logits - tensor([[-6.1700,  1.5747, -4.8195, -1.6358],
        [-4.0971, -2.0055, -4.2056,  2.1162],
        [-5.3839,  0.7609, -4.8688, -2.0320],
        [-5.7368,  1.5000, -4.5384, -2.1502],
        [-6.0711,  0.9179, -4.6817, -0.2438],
        [-5.2007,  0.8687, -3.9414, -0.6611],
        [-5.2555,  1.2082, -5.1341, -1.4248],
        [-5.8765,  1.3694, -4.9499, -1.8926]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 110
tensor([[ 101, 1999, 2054,  ...,    0,    0,    0],
        [ 101, 2129, 2844,  ...,    0,    0,    0],
        [ 101, 2054, 3653,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2001, 2151,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 

 38%|███▊      | 111/289 [01:23<02:14,  1.32it/s]

Training loop 111
tensor([[ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2897,  ...,    0,    0,    0],
        [ 101, 2323, 2037,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07760181277990341, logits - tensor([[-5.5244,  1.3571, -4.8659, -1.0894],
        [-6.4513,  1.9746, -5.5876, -1.1888],
        [-4.2142, -3.6847,  2.6948, -2.1560],
        [-6.5312,  1.8829, -4.8923, -1.5623],
        [-5.5430,  1.7302, -4.7032, -2.

 39%|███▉      | 112/289 [01:24<02:13,  1.32it/s]

Training loop 112
tensor([[  101,  2029,  4155,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ..., 19962,  1009,   102],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054,  2838,  ...,     0,     0,     0],
        [  101,  2054,  6753,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15335538983345032, logits - tensor([[-5.2282,  0.5748, -4.7967, -0.9724],
        [-4.7916,  0.4575, -4.3448,  0.2594],
        [-6.0072,  0.8656, -5.0890, -0.9107],
        [-5.5071,  2.2436, -5.9165, -1.7292],
   

 39%|███▉      | 113/289 [01:25<02:12,  1.33it/s]

Training loop 113
tensor([[  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2106,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2653,  ...,     0,     0,     0],
        [  101,  2024,  1996,  ...,     0,     0,     0],
        [  101,  2029, 26163,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21007999777793884, logits - tensor([[-6.3475,  0.9303, -4.3185, -0.3988],
        [-5.6657,  1.8125, -4.6704, -0.5018],
        [-6.2918,  1.7747, -5.0660, -1.2101],
        [-5.4186,  1.0551, -5.1434, -0.7741],
   

 39%|███▉      | 114/289 [01:26<02:11,  1.33it/s]

Training loop 114
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2073, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2106, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2785458564758301, logits - tensor([[-5.1463,  1.5480, -4.8650, -1.6300],
        [-5.0950,  2.0399, -4.7188, -0.7252],
        [-5.7253,  2.0129, -4.3792, -0.4650],
        [-5.2110, -1.5702, -4.8586,  1.8333],
        [-5.1420,  1.2553, -5.2680, -1.1

 40%|███▉      | 115/289 [01:26<02:10,  1.33it/s]

Training loop 115
tensor([[  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029, 26163,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24263203144073486, logits - tensor([[-5.6953,  1.8371, -5.8275, -1.6930],
        [-5.0778,  1.5260, -4.7733, -1.5278],
        [-6.3202,  2.3018, -5.7436, -2.2897],
        [-6.1114,  1.6961, -5.0114, -1.7926],
   

 40%|████      | 116/289 [01:27<02:10,  1.33it/s]

Training loop 116
tensor([[  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2003, 10709,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24640969932079315, logits - tensor([[-5.9649,  2.1794, -5.5617, -2.2839],
        [-6.8192,  1.4538, -5.5306, -1.6893],
        [-5.2439,  1.3302, -5.4594, -1.1390],
        [-5.7675,  1.7110, -4.8469, -1.4681],
   

 40%|████      | 117/289 [01:28<02:09,  1.33it/s]

Training loop 117
tensor([[  101,  2129,  2001,  ...,     0,     0,     0],
        [  101,  2054,  2512,  ...,     0,     0,     0],
        [  101,  2029,  6612,  ..., 12963,  5198,   102],
        ...,
        [  101,  2029,  2024,  ...,     0,     0,     0],
        [  101,  2054,  9324,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12467905879020691, logits - tensor([[-5.3012,  0.6605, -4.5310, -0.8602],
        [-5.6894,  1.6924, -4.4078, -1.2736],
        [-5.5903,  0.6898, -4.6347, -1.3239],
        [-5.2757,  0.8917, -4.1877, -1.4186],
   

 41%|████      | 118/289 [01:29<02:09,  1.33it/s]

Training loop 118
tensor([[ 101, 2054, 7860,  ...,    0,    0,    0],
        [ 101, 2029, 4655,  ...,    0,    0,    0],
        [ 101, 2054, 6388,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2060,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08767937123775482, logits - tensor([[-5.7673,  1.4511, -4.1910, -1.7825],
        [-5.0393, -1.9755, -4.5019,  3.1523],
        [-5.7836,  1.3825, -5.5022, -1.3429],
        [-6.6034,  1.9221, -5.6926, -1.5337],
        [-5.6485,  1.4760, -5.6826, -1.

 41%|████      | 119/289 [01:29<02:08,  1.32it/s]

Training loop 119
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2006,  2054,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2024, 28667,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,  2006, 16475,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22850725054740906, logits - tensor([[-5.3172,  1.3579, -4.6525, -1.0454],
        [-5.6850, -1.8563, -5.0604,  1.7576],
        [-6.0734,  1.4487, -5.1707, -0.6617],
        [-5.9656,  1.4375, -5.0595, -1.3315],
   

 42%|████▏     | 120/289 [01:30<02:08,  1.32it/s]

Training loop 120
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2029, 1997,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44777172803878784, logits - tensor([[-5.5836,  1.5107, -4.3658, -1.5051],
        [-6.0588,  1.8282, -5.2400, -2.1604],
        [-4.9347, -1.6515, -4.8768,  1.3871],
        [-5.0639,  1.2572, -4.7156, -0.3316],
        [-5.5981,  1.5012, -5.2367, -2.

 42%|████▏     | 121/289 [01:31<02:06,  1.33it/s]

Training loop 121
tensor([[  101,  2054, 11633,  ...,     0,     0,     0],
        [  101,  2054,  7461,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2029,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19476599991321564, logits - tensor([[-5.9645,  1.9127, -5.5316, -1.3319],
        [-7.0054,  1.3908, -4.7003, -2.1256],
        [-4.6093, -3.7952,  1.7071, -2.2716],
        [-5.9148,  1.9459, -6.0676, -2.9189],
   

 42%|████▏     | 122/289 [01:32<02:05,  1.33it/s]

Training loop 122
tensor([[  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2029,  4275,  ...,     0,     0,     0],
        [  101,  2054, 23760,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  7861,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2787618935108185, logits - tensor([[-3.7543, -3.5668,  1.5176, -3.1814],
        [-6.1168,  1.9926, -5.3485, -1.5831],
        [-6.3870,  2.0494, -5.2282, -2.2040],
        [-5.7256,  1.9004, -5.6342, -2.8548],
    

 43%|████▎     | 123/289 [01:32<02:04,  1.33it/s]

Training loop 123
tensor([[ 101, 2054, 5919,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22011196613311768, logits - tensor([[-5.4437,  2.4345, -5.6368, -1.8722],
        [-5.0856, -1.1482, -4.6211,  1.4974],
        [-5.0707,  1.4694, -4.8103, -2.3658],
        [-6.2009,  1.5461, -5.1691, -0.9243],
        [-5.4181,  1.7255, -5.1738, -2.

 43%|████▎     | 124/289 [01:33<02:03,  1.33it/s]

Training loop 124
tensor([[ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2060,  ...,    0,    0,    0],
        [ 101, 2029, 2944,  ..., 2029, 2024,  102],
        [ 101, 2003, 2045,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2276020050048828, logits - tensor([[-5.3281, -4.4100,  2.4698, -3.0996],
        [-5.3589, -1.7394,  1.1377, -2.7054],
        [-5.3539, -0.5389, -4.2494,  0.4590],
        [-6.3908,  1.6775, -5.0272, -1.6929],
        [-4.5252, -3.4801,  2.0497, -1.7

 43%|████▎     | 125/289 [01:34<02:03,  1.33it/s]

Training loop 125
tensor([[ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 3698,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5009070038795471, logits - tensor([[-6.3951,  0.8882, -4.7036, -1.1382],
        [-5.6276,  1.3274, -4.6118, -1.3581],
        [-5.0568,  2.1013, -4.6415, -1.8838],
        [-5.2135,  0.3383, -4.1046, -0.8170],
        [-5.0170,  1.3214, -4.8570, -1.9

 44%|████▎     | 126/289 [01:35<02:03,  1.32it/s]

Training loop 126
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2079,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5481994152069092, logits - tensor([[-5.7025,  1.6548, -5.1012, -1.3945],
        [-4.8239,  1.6561, -5.7517, -1.9836],
        [-5.2657,  1.6339, -4.3185, -1.3738],
        [-5.9677,  0.8904, -4.5100, -1.5104],
        [-5.6549,  1.7919, -5.2431, -1.5

 44%|████▍     | 127/289 [01:35<02:02,  1.32it/s]

Training loop 127
tensor([[ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09703379124403, logits - tensor([[-5.6320,  2.1467, -4.4506, -1.7729],
        [-6.5679,  1.5019, -5.4735, -1.3986],
        [-5.9902,  1.0739, -4.4470, -1.2561],
        [-4.9447, -1.7974, -4.0905,  1.5695],
        [-6.0227,  1.5334, -5.1023, -1.566

 44%|████▍     | 128/289 [01:36<02:01,  1.32it/s]

Training loop 128
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054, 15756,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4037054777145386, logits - tensor([[-6.6666,  1.9900, -5.9339, -1.0893],
        [-5.2305, -2.8082,  1.8422, -2.2131],
        [-5.9467,  1.7902, -4.7746, -1.7105],
        [-5.2896,  1.3113, -5.3221, -1.3577],
    

 45%|████▍     | 129/289 [01:37<02:00,  1.33it/s]

Training loop 129
tensor([[ 101, 2054, 4275,  ..., 2232, 2000,  102],
        [ 101, 2054, 2024,  ..., 2041, 1997,  102],
        [ 101, 2129, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31686753034591675, logits - tensor([[-5.6973e+00,  3.7717e-04, -4.3711e+00, -1.3369e+00],
        [-5.5831e+00,  1.2900e+00, -5.2180e+00, -1.0612e+00],
        [-5.8538e+00, -1.3432e+00, -6.1583e+00,  8.3550e-01],
        [-4.3506e+00, -2.2314e+00,  1

 45%|████▍     | 130/289 [01:38<01:59,  1.33it/s]

Training loop 130
tensor([[  101,  2029, 14402,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  9896,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 45%|████▌     | 131/289 [01:38<01:59,  1.33it/s]

loss - 0.1626502126455307, logits - tensor([[-5.8212,  1.9628, -4.8571, -2.4575],
        [-5.1448, -2.3550, -5.1475,  1.5822],
        [-6.3534,  0.8153, -5.4003, -1.3366],
        [-5.7882,  1.2308, -4.9430, -1.2559],
        [-5.5359,  1.4015, -4.7866, -0.5938],
        [-6.6597,  1.2562, -5.8302, -0.9870],
        [-5.8891,  0.7467, -4.9914, -0.4709],
        [-6.0996,  2.3572, -5.4348, -1.9957]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 131
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2048,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2838,  ...,    0,    0,    0],
        [ 101, 2024, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 

 46%|████▌     | 132/289 [01:39<01:58,  1.33it/s]

Training loop 132
tensor([[ 101, 1999, 2054,  ...,    0,    0,    0],
        [ 101, 2054, 1020,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2739,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2768746316432953, logits - tensor([[-5.8930,  1.5444, -4.9181, -1.5176],
        [-5.0518, -1.6853, -5.0602,  1.8507],
        [-5.4600,  1.2144, -4.6551, -1.9843],
        [-6.2305,  1.4714, -4.6896, -1.4462],
        [-5.4590, -4.5316,  2.4024, -2.7

 46%|████▌     | 133/289 [01:40<01:57,  1.33it/s]

Training loop 133
tensor([[  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  3350,  ...,  1997,  3375,   102],
        [  101,  2024,  2023,  ..., 16406,  1007,   102],
        ...,
        [  101,  2003, 10488,  ...,     0,     0,     0],
        [  101,  2029, 13764,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.262630432844162, logits - tensor([[-5.3527,  0.6869, -4.7620, -2.2801],
        [-5.4093,  0.2716, -3.8888, -0.1609],
        [-5.4067, -1.8048, -0.6244, -2.3551],
        [-5.0028,  0.7996, -5.3839, -2.1956],
     

 46%|████▋     | 134/289 [01:41<01:57,  1.32it/s]

Training loop 134
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2029,  2060,  ...,  3207, 23460,   102],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23045578598976135, logits - tensor([[-4.8230, -2.5374,  1.2479, -2.1575],
        [-5.5380,  0.6016, -5.1740, -1.1708],
        [-6.1467,  0.9493, -5.3315, -1.5643],
        [-6.2789,  2.3259, -5.6078, -1.4170],
   

 47%|████▋     | 135/289 [01:41<01:56,  1.32it/s]

Training loop 135
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2000, 2029,  ...,    0,    0,    0],
        [ 101, 2054, 6123,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23235458135604858, logits - tensor([[-5.9984,  0.6494, -4.9249, -1.2771],
        [-6.1144,  2.3559, -5.1883, -2.4401],
        [-5.9978,  1.4889, -5.3994, -1.3402],
        [-5.4504,  2.0205, -5.3276, -2.0186],
        [-6.5206,  1.1900, -5.5360, -1.

 47%|████▋     | 136/289 [01:42<01:55,  1.33it/s]

Training loop 136
tensor([[  101,  2054,  2951,  ...,  2006, 16475,   102],
        [  101,  2054,  5754,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        ...,
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20600774884223938, logits - tensor([[-5.4199,  0.8241, -4.4705, -1.6317],
        [-5.0570,  1.3307, -4.5884, -0.7145],
        [-5.6498,  1.7392, -4.1521, -1.6943],
        [-5.9567,  1.6195, -5.5677, -1.2619],
   

 47%|████▋     | 137/289 [01:43<01:54,  1.32it/s]

Training loop 137
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 6549,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 1997,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 1997,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2513583302497864, logits - tensor([[-5.3865,  1.4843, -5.0616, -0.6382],
        [-5.1640,  0.9371, -4.8889, -1.2900],
        [-5.7997,  1.8622, -6.0433, -1.8886],
        [-5.2770,  1.0129, -3.9223, -0.3575],
        [-6.0821,  0.9731, -5.1939, -1.6

 48%|████▊     | 138/289 [01:44<01:54,  1.32it/s]

Training loop 138
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2079, 2151,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0890297144651413, logits - tensor([[-6.2281,  0.8253, -5.9642, -2.0822],
        [-5.4366, -3.6420,  2.4640, -3.1461],
        [-5.2469,  0.6514, -5.0921, -1.0614],
        [-5.5035, -3.6796,  2.7958, -3.6709],
        [-5.2514,  1.9019, -4.3613, -0.3

 48%|████▊     | 139/289 [01:44<01:53,  1.33it/s]

Training loop 139
tensor([[  101,  2129,  2106,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2054,  4207,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ..., 12225,  1013,   102],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2029,  2986,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2542753219604492, logits - tensor([[-5.3117,  1.8904, -5.5128, -1.8758],
        [-5.4494,  0.8573, -4.6540, -0.7293],
        [-6.0549,  1.2192, -4.4728, -1.7848],
        [-5.7023,  1.4043, -4.7292, -1.4557],
    

 48%|████▊     | 140/289 [01:45<01:52,  1.33it/s]

Training loop 140
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2024, 3463,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07168134301900864, logits - tensor([[-4.3038, -3.9641,  2.4073, -2.8255],
        [-5.7338,  1.4865, -4.2110, -1.0625],
        [-4.8327, -3.7111,  2.3783, -2.2933],
        [-5.2022, -2.8609, -3.4800,  2.7442],
        [-6.3304,  2.5890, -5.9759, -2.

 49%|████▉     | 141/289 [01:46<01:51,  1.33it/s]

Training loop 141
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2073, 2515,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2092,  ..., 1037, 3618,  102],
        [ 101, 2129, 2001,  ...,    0,    0,    0],
        [ 101, 2029, 2048,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2929902672767639, logits - tensor([[-5.9795,  1.0396, -5.2874, -1.1186],
        [-5.7498,  1.8063, -5.6179, -1.3115],
        [-4.8431,  1.6139, -5.3337, -1.6626],
        [-5.5873,  1.2308, -5.5170, -0.9731],
        [-5.8257,  0.2346, -4.6109, -1.0

 49%|████▉     | 142/289 [01:47<01:50,  1.33it/s]

Training loop 142
tensor([[ 101, 2024, 2045,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2029, 2004,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2005, 1996,  ...,    0,    0,    0],
        [ 101, 2006, 2054,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34942352771759033, logits - tensor([[-5.0919, -3.4540,  2.2607, -2.1810],
        [-4.8621, -3.4451,  1.8483, -2.4070],
        [-5.3848,  1.0472, -4.8004, -0.9179],
        [-6.1083,  1.9568, -5.2361, -2.1695],
        [-6.3052,  1.1446, -4.9456, -1.

 49%|████▉     | 143/289 [01:47<01:49,  1.33it/s]

Training loop 143
tensor([[  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2054, 12878,  ...,     0,     0,     0],
        [  101,  2029,  3787,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2759,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24527981877326965, logits - tensor([[-5.2769, -2.1571,  1.2577, -2.1770],
        [-5.6528,  1.0534, -5.3084, -1.3423],
        [-5.7552,  1.8640, -5.3501, -2.0201],
        [-5.0401,  1.6134, -5.1361, -1.4534],
   

 50%|████▉     | 144/289 [01:48<01:49,  1.33it/s]

Training loop 144
tensor([[ 101, 2029, 8518,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 4493,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.312702476978302, logits - tensor([[-5.8469,  0.9173, -5.2902, -1.8413],
        [-6.5669,  1.5278, -5.0550, -1.3185],
        [-4.5788,  1.0912, -5.2326, -2.2233],
        [-6.0252,  0.2037, -4.8201, -0.9479],
        [-5.9717,  1.7356, -5.7791, -1.33

 50%|█████     | 145/289 [01:49<01:48,  1.33it/s]

Training loop 145
tensor([[ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09920937567949295, logits - tensor([[-5.4845, -3.1671,  1.4906, -1.5446],
        [-6.2258,  2.2071, -5.0333, -2.4870],
        [-5.6029,  1.3821, -4.9502, -1.0338],
        [-5.6301,  1.2779, -4.7879, -1.2343],
        [-6.3670,  1.6290, -5.6397, -1.

 51%|█████     | 146/289 [01:50<01:47,  1.33it/s]

Training loop 146
tensor([[ 101, 2038, 2045,  ...,    0,    0,    0],
        [ 101, 2515, 2037,  ...,    0,    0,    0],
        [ 101, 2129, 2146,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 2045,  ...,    0,    0,    0],
        [ 101, 2054, 4556,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17707431316375732, logits - tensor([[-5.4136, -2.8750,  1.3640, -2.6347],
        [-5.0565, -3.0198,  1.5356, -1.3363],
        [-5.8626,  1.0869, -6.0167, -1.2404],
        [-4.5231, -2.6693,  2.2918, -2.1368],
        [-5.9351,  1.4053, -4.7888, -1.

 51%|█████     | 147/289 [01:50<01:46,  1.33it/s]

Training loop 147
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 27425,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2054,  3642,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16766445338726044, logits - tensor([[-5.6669,  0.8037, -5.6308, -1.6410],
        [-5.1571, -4.0301,  1.9937, -2.6715],
        [-4.5113, -2.4009, -5.1351,  2.8747],
        [-4.4830, -3.1054,  1.9851, -2.7689],
   

 51%|█████     | 148/289 [01:51<01:46,  1.33it/s]

Training loop 148
tensor([[ 101, 2054, 7860,  ...,    0,    0,    0],
        [ 101, 2029, 6847,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3992519676685333, logits - tensor([[-6.0167,  1.5272, -6.1501, -1.5456],
        [-5.4794,  1.4562, -4.8331, -1.3583],
        [-6.0594, -2.0949, -5.2725,  2.3321],
        [-4.0337, -2.5944,  1.0766, -2.1715],
        [-5.7532,  0.8357, -5.5221, -1.8

 52%|█████▏    | 149/289 [01:52<01:44,  1.34it/s]

Training loop 149
tensor([[  101,  2054,  2529,  ...,  7644,  1012,   102],
        [  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2064,  1996,  ...,  2005,  6885,   102],
        ...,
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2653,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3492242395877838, logits - tensor([[-6.5677,  0.9583, -5.5004, -1.2276],
        [-6.8402,  1.9162, -4.6277, -1.6256],
        [-5.8620, -1.8933, -3.1226,  1.0637],
        [-4.8336,  1.6383, -4.2423, -1.4876],
    

 52%|█████▏    | 150/289 [01:53<01:44,  1.33it/s]

Training loop 150
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 2591,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3761634826660156, logits - tensor([[-4.5435, -1.5422, -4.8963,  1.6734],
        [-6.2890,  1.6520, -5.2693, -1.3037],
        [-6.1256,  1.2886, -4.8552, -1.4943],
        [-4.8823,  1.7462, -4.3823, -1.5151],
        [-5.3106,  1.1857, -5.4179, -1.3

 52%|█████▏    | 151/289 [01:53<01:43,  1.33it/s]

Training loop 151
tensor([[  101,  2515, 24665,  ...,     0,     0,     0],
        [  101,  2106,  6818,  ...,  3445,  3891,   102],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  1999,  2054,  ...,     0,     0,     0],
        [  101,  2029, 13100,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1235198900103569, logits - tensor([[-4.4622, -4.0490,  2.8579, -2.7077],
        [-4.8633, -2.5772,  0.6944, -2.4441],
        [-6.1104,  1.5984, -4.9537, -0.9012],
        [-5.7746,  1.0091, -4.5909, -1.0983],
    

 53%|█████▎    | 152/289 [01:54<01:43,  1.33it/s]

Training loop 152
tensor([[  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2785,  ...,     0,     0,     0],
        ...,
        [  101,  1999,  2029,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2054, 21641,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32216542959213257, logits - tensor([[-6.0686,  2.6006, -5.8227, -1.7254],
        [-5.0516,  1.4125, -4.8932, -1.6742],
        [-6.0843,  1.2448, -5.3708, -1.3282],
        [-5.8681,  1.7295, -6.2888, -2.3283],
   

 53%|█████▎    | 153/289 [01:55<01:42,  1.32it/s]

Training loop 153
tensor([[  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2079,  7551,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ..., 14526,  1012,   102],
        [  101,  2054,  2106,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20493006706237793, logits - tensor([[-5.2923, -0.3637, -4.2913,  1.4382],
        [-6.0041,  1.7983, -5.5828, -1.6865],
        [-5.3866, -3.0162,  1.1118, -1.8864],
        [-4.6936,  1.6385, -4.7962, -1.2735],
   

 53%|█████▎    | 154/289 [01:56<01:42,  1.32it/s]

Training loop 154
tensor([[  101,  2029, 16105,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16041235625743866, logits - tensor([[-6.1391,  1.5078, -5.2202, -1.4145],
        [-5.6872,  1.4718, -5.4713, -1.2608],
        [-5.1842,  0.9671, -4.5630, -1.0944],
        [-5.8556,  2.0151, -5.7049, -1.9408],
   

 54%|█████▎    | 155/289 [01:57<01:41,  1.32it/s]

Training loop 155
tensor([[  101,  2079, 10474,  ...,     0,     0,     0],
        [  101,  2054,  4275,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2110,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  7861,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33838212490081787, logits - tensor([[-5.0632,  0.2732, -3.5534, -2.1421],
        [-5.2327,  0.8312, -4.5618, -1.9927],
        [-5.4137,  1.4906, -4.3389, -0.8684],
        [-5.4127,  0.7517, -4.4631, -1.2402],
   

 54%|█████▍    | 156/289 [01:57<01:40,  1.32it/s]

Training loop 156
tensor([[  101,  2129,  2488,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2029,  2613,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2125,  ...,     0,     0,     0],
        [  101,  2054, 17953,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19597335159778595, logits - tensor([[-5.4367, -1.9466, -5.3858,  1.7935],
        [-5.6147,  1.2257, -5.3485, -1.6902],
        [-6.2681,  1.0126, -5.5043, -1.5843],
        [-5.8166,  1.1103, -5.4364, -1.2690],
   

 54%|█████▍    | 157/289 [01:58<01:39,  1.32it/s]

Training loop 157
tensor([[  101,  2003,  2944,  ...,     0,     0,     0],
        [  101,  2054,  3921,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  4942,  ...,     0,     0,     0],
        [  101,  2054, 10640,  ...,     0,     0,     0],
        [  101,  2129,  8321,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25513139367103577, logits - tensor([[-6.1480,  0.7923, -4.1422, -1.6064],
        [-5.8161,  1.7041, -4.9990, -1.0910],
        [-5.2822, -1.2574, -4.9049,  2.3022],
        [-5.3518,  1.7005, -4.8036, -1.5751],
   

 55%|█████▍    | 158/289 [01:59<01:38,  1.32it/s]

Training loop 158
tensor([[  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054, 24710,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32240650057792664, logits - tensor([[-5.0552,  0.8842, -4.4155, -1.3215],
        [-5.6301,  0.8395, -5.7348, -1.2938],
        [-4.8036, -2.3763, -4.7092,  1.7663],
        [-5.7995,  1.5428, -4.8301, -1.1289],
   

 55%|█████▌    | 159/289 [02:00<01:37,  1.33it/s]

Training loop 159
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029, 10474,  ...,     0,     0,     0],
        [  101,  2054, 15306,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2029,  4118,  ...,     0,     0,     0],
        [  101,  2054,  4118,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19805538654327393, logits - tensor([[-5.5139,  0.9874, -4.9425, -1.1375],
        [-6.3170,  2.3247, -5.8935, -2.1130],
        [-4.6904,  0.6359, -4.6601, -1.8194],
        [-4.8516, -3.0400, -4.5627,  3.0260],
   

 55%|█████▌    | 160/289 [02:00<01:36,  1.33it/s]

Training loop 160
tensor([[  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2054,  2515,  ...,     0,     0,     0],
        [  101,  2029,  2944,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10071965306997299, logits - tensor([[-4.4835, -2.0709, -4.8968,  2.7714],
        [-5.4785,  2.0244, -4.5861, -2.4656],
        [-6.6932,  2.5439, -5.5412, -1.9043],
        [-5.2268,  2.0480, -5.2774, -1.7823],
   

 56%|█████▌    | 161/289 [02:01<01:35,  1.34it/s]

Training loop 161
tensor([[ 101, 2054, 6327,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 3025,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19563832879066467, logits - tensor([[-5.3813,  1.8095, -4.9865, -1.8624],
        [-5.8204,  1.6052, -5.9771, -2.3550],
        [-5.6232,  1.8668, -4.9167, -1.2971],
        [-5.2079,  1.9084, -4.9713, -1.3211],
        [-5.4376,  1.0315, -5.1049, -1.

 56%|█████▌    | 162/289 [02:02<01:34,  1.34it/s]

Training loop 162
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2020,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2887383699417114, logits - tensor([[-7.2413,  1.4732, -5.7549, -0.9187],
        [-5.4025,  1.5479, -4.3031, -0.9811],
        [-6.0427,  0.9899, -5.4269,  0.4068],
        [-6.3703,  1.2271, -5.1352, -1.4104],
        [-6.1338,  0.9344, -4.8804, -1.1

 56%|█████▋    | 163/289 [02:03<01:33,  1.34it/s]

Training loop 163
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2029, 4725,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09920618683099747, logits - tensor([[-4.9288, -1.5397, -4.8283,  2.4264],
        [-5.2292,  1.3688, -4.7855, -0.3911],
        [-4.6573,  1.3504, -3.9908, -1.2156],
        [-5.6557,  1.0834, -4.6583, -1.6284],
        [-5.7055,  2.2661, -5.1387, -1.

 57%|█████▋    | 164/289 [02:03<01:33,  1.34it/s]

Training loop 164
tensor([[  101,  2339,  2106,  ...,     0,     0,     0],
        [  101,  2054, 15306,  ...,     0,     0,     0],
        [  101,  2054,  1999,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2515,  2023,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20067861676216125, logits - tensor([[-5.6429,  1.2790, -5.0787, -0.8584],
        [-5.8173,  1.6595, -3.6573, -1.2582],
        [-5.3489,  1.9394, -5.4116, -1.8964],
        [-5.3255,  1.6684, -5.5423, -1.3237],
   

 57%|█████▋    | 165/289 [02:04<01:32,  1.34it/s]

Training loop 165
tensor([[  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2040, 21118,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 12783,  ...,     0,     0,     0],
        [  101,  2011,  2129,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17210054397583008, logits - tensor([[-7.2120,  1.5141, -5.7466, -1.7296],
        [-5.1464,  0.9248, -4.4410, -0.7968],
        [-4.6658, -1.9823, -3.7224,  2.5745],
        [-5.3410,  1.0056, -5.0931, -0.8318],
   

 57%|█████▋    | 166/289 [02:05<01:31,  1.34it/s]

Training loop 166
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2785,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20687291026115417, logits - tensor([[-5.6952,  1.5415, -5.2155, -1.5962],
        [-5.7523,  1.4025, -5.4328, -1.8102],
        [-6.0153,  1.5118, -5.0361, -1.0982],
        [-5.2479, -3.3625,  2.2929, -2.3254],
        [-4.8332,  1.4151, -4.9156, -0.

 58%|█████▊    | 167/289 [02:05<01:31,  1.34it/s]

Training loop 167
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029, 10056,  ...,     0,     0,     0],
        [  101,  2040,  2001,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3882545232772827, logits - tensor([[-5.5281,  1.3214, -4.2763, -0.8867],
        [-4.3033, -1.8152, -3.6543,  1.6363],
        [-6.1603,  0.9261, -5.4336, -1.1978],
        [-5.6192, -4.3283,  2.8668, -3.2182],
    

 58%|█████▊    | 168/289 [02:06<01:30,  1.33it/s]

Training loop 168
tensor([[  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2029,  3032,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2054, 23807,  ...,     0,     0,     0],
        [  101,  2029,  2944,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14845693111419678, logits - tensor([[-5.7739,  1.6658, -4.3803, -0.0615],
        [-5.8122,  1.6353, -5.5854, -1.9903],
        [-5.9515,  1.3626, -5.6270, -1.9009],
        [-5.5568,  1.6371, -5.1307, -0.9289],
   

 58%|█████▊    | 169/289 [02:07<01:30,  1.33it/s]

Training loop 169
tensor([[ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2029, 4275,  ...,    0,    0,    0],
        [ 101, 2003, 2023,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21888534724712372, logits - tensor([[-5.1507, -3.2823,  2.6841, -2.5612],
        [-5.0312, -3.2512, -4.4481,  3.4387],
        [-4.4805, -3.4131,  2.5343, -2.3140],
        [-3.7916, -1.8319, -3.9440,  3.3762],
        [-5.3956,  0.8915, -5.4037, -1.

 59%|█████▉    | 170/289 [02:08<01:29,  1.33it/s]

Training loop 170
tensor([[ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2054, 5754,  ...,    0,    0,    0],
        ...,
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 8107,  ...,    0,    0,    0],
        [ 101, 2024, 2045,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19577306509017944, logits - tensor([[-5.5168,  0.8815, -5.7987, -1.5977],
        [-6.1421,  1.9082, -4.8489, -1.5954],
        [-5.6970,  0.9874, -4.7970, -1.5068],
        [-6.1533,  1.8822, -5.2524, -1.2128],
        [-5.5142,  1.0569, -5.5672, -1.

 59%|█████▉    | 171/289 [02:09<01:28,  1.33it/s]

Training loop 171
tensor([[  101,  2054,  2027,  ...,     0,     0,     0],
        [  101,  2029,  7708,  ...,     0,     0,     0],
        [  101,  2029, 15756,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2029,  3086,  ...,     0,     0,     0],
        [  101,  2079,  6048,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16120702028274536, logits - tensor([[-5.5216,  1.0479, -4.8415, -1.4948],
        [-6.2948,  2.2470, -5.6602, -1.1981],
        [-5.2805,  1.4908, -4.9730, -1.8250],
        [-6.9589,  1.7491, -6.5233, -1.2197],
   

 60%|█████▉    | 172/289 [02:09<01:28,  1.33it/s]

Training loop 172
tensor([[ 101, 2054, 3653,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2809,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2006, 2029,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43614017963409424, logits - tensor([[-6.2114,  0.7825, -5.3649, -1.3367],
        [-5.6011,  0.6421, -4.9635, -0.8142],
        [-6.0746, -3.6411,  2.0162, -2.3496],
        [-5.4943,  1.5064, -5.0354, -0.7425],
        [-5.7557,  1.4618, -4.5157, -1.

 60%|█████▉    | 173/289 [02:10<01:27,  1.32it/s]

Training loop 173
tensor([[  101,  2106,  1996,  ...,     0,     0,     0],
        [  101,  2003,  2151,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 15066,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  4493,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16090573370456696, logits - tensor([[-4.2358, -4.3249,  2.1775, -2.8717],
        [-5.4085, -1.2660, -3.5536,  1.5989],
        [-5.9517,  1.6324, -5.1870, -0.6589],
        [-6.1354,  0.4642, -3.7128,  0.0470],
   

 60%|██████    | 174/289 [02:11<01:26,  1.33it/s]

Training loop 174
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2838,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38114312291145325, logits - tensor([[-6.4360,  1.2267, -5.5703, -1.3926],
        [-6.4623,  1.1204, -4.9725, -1.3105],
        [-5.9662,  1.5264, -6.1814, -1.9649],
        [-4.4188,  0.9901, -4.1889, -0.2005],
        [-4.9619, -3.5043,  2.3639, -2.

 61%|██████    | 175/289 [02:12<01:26,  1.32it/s]

Training loop 175
tensor([[  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054,  2512,  ...,     0,     0,     0],
        [  101,  2054,  6254,  ...,     0,     0,     0],
        ...,
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2787105441093445, logits - tensor([[-4.1649, -3.6465, -3.5565,  3.2743],
        [-5.4079,  0.9023, -5.4010, -0.9834],
        [-5.0847,  1.1536, -4.7351, -1.6829],
        [-4.9578, -2.5655,  2.2627, -2.2694],
    

 61%|██████    | 176/289 [02:12<01:25,  1.32it/s]

Training loop 176
tensor([[  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2054,  3793,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2129,  2146,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24232569336891174, logits - tensor([[-5.6044,  0.8229, -4.7114, -0.0886],
        [-5.7768,  0.7697, -4.3188, -0.7048],
        [-6.5504,  1.6682, -5.0205, -1.5826],
        [-5.7493,  0.9480, -5.6565, -1.8576],
   

 61%|██████    | 177/289 [02:13<01:24,  1.32it/s]

Training loop 177
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054, 15756,  ...,     0,     0,     0],
        [  101,  2054,  2582,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 11338,  ...,     0,     0,     0],
        [  101,  2054,  7241,  ...,     0,     0,     0],
        [  101,  2129,  2106,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2059309184551239, logits - tensor([[-5.5834,  0.9826, -5.4071, -1.5958],
        [-5.0268,  1.7289, -5.0716, -1.5580],
        [-4.6915,  1.3185, -4.4406, -1.9841],
        [-5.4504,  1.3855, -4.8786, -1.0888],
    

 62%|██████▏   | 178/289 [02:14<01:24,  1.31it/s]

Training loop 178
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2024,  ..., 1043, 1006,  102],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3099523186683655, logits - tensor([[-6.1940,  0.8864, -5.2778, -1.0964],
        [-5.5325, -0.8800, -5.6018,  1.6736],
        [-6.2623,  1.7791, -5.5534, -1.2005],
        [-4.7867,  1.7623, -5.5813, -1.4798],
        [-5.3049,  0.5438, -5.3613, -2.0

 62%|██████▏   | 179/289 [02:15<01:23,  1.31it/s]

Training loop 179
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 16325,  ...,  2323,  9125,   102],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24345281720161438, logits - tensor([[-6.1012,  0.7543, -5.9580, -2.0469],
        [-4.9162, -0.2524, -6.1551, -1.4453],
        [-7.2079,  1.9229, -6.4622, -1.7003],
        [-5.6503,  1.9014, -4.4022, -1.0692],
   

 62%|██████▏   | 180/289 [02:15<01:22,  1.32it/s]

Training loop 180
tensor([[  101,  2054, 13100,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2029,  2416,  ...,     0,     0,     0],
        [  101,  2054,  4022,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1651739925146103, logits - tensor([[-5.8883,  1.9087, -4.7839, -2.1717],
        [-5.0001, -3.7556,  2.4376, -3.4658],
        [-5.5724,  1.9010, -4.9975, -1.7116],
        [-5.3531,  1.2689, -4.8456, -0.6387],
    

 63%|██████▎   | 181/289 [02:16<01:21,  1.32it/s]

Training loop 181
tensor([[ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2031, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1764230877161026, logits - tensor([[-5.0424, -2.2572, -4.7356,  1.9197],
        [-4.4648, -3.1698,  2.5521, -2.3533],
        [-5.6477,  1.4983, -4.9927, -1.8325],
        [-5.7083,  0.5945, -4.9521, -1.7541],
        [-5.7323,  0.9275, -4.3120, -0.9

 63%|██████▎   | 182/289 [02:17<01:21,  1.32it/s]

Training loop 182
tensor([[  101,  2054,  2785,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2003,  2023,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 15756,  ...,     0,     0,     0],
        [  101,  2129,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27426210045814514, logits - tensor([[-4.9756,  1.2960, -5.1385, -2.0076],
        [-6.0714,  1.4981, -5.8857, -1.0578],
        [-4.9525, -2.9059,  2.1692, -2.4236],
        [-5.5547,  1.2783, -4.7066, -1.5312],
   

 63%|██████▎   | 183/289 [02:18<01:20,  1.32it/s]

Training loop 183
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2024, 7885,  ...,    0,    0,    0],
        [ 101, 2079, 2035,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2106,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3281848430633545, logits - tensor([[-5.6365,  1.2536, -5.1071, -1.3611],
        [-5.5452, -1.3067, -1.4346, -1.6735],
        [-5.4813, -3.8554,  2.5313, -3.0666],
        [-5.0943, -1.3791, -4.7998,  0.6705],
        [-4.4299, -2.8454,  2.8981, -2.4

 64%|██████▎   | 184/289 [02:18<01:19,  1.32it/s]

Training loop 184
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ..., 2367, 3793,  102],
        ...,
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.271503210067749, logits - tensor([[-4.6127,  2.3118, -4.9737, -1.8615],
        [-6.3897,  1.4790, -5.3329, -1.9937],
        [-5.4513,  1.2177, -4.5577, -1.4491],
        [-6.3546,  1.6155, -4.8912, -0.8286],
        [-5.6622,  1.0086, -4.0800, -1.37

 64%|██████▍   | 185/289 [02:19<01:18,  1.33it/s]

Training loop 185
tensor([[  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054, 12637,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2116,  ..., 15756,  5754,   102],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2054,  4155,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2986219823360443, logits - tensor([[-4.5431,  1.7263, -4.6497, -1.3513],
        [-5.5219,  1.4446, -5.3225, -1.9722],
        [-5.6023,  0.2901, -4.7330, -1.9024],
        [-6.1059,  1.2080, -5.3104, -1.8917],
    

 64%|██████▍   | 186/289 [02:20<01:17,  1.33it/s]

Training loop 186
tensor([[  101,  2054,  5754,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2001,  ...,     0,     0,     0],
        [  101,  2029,  7861,  ...,     0,     0,     0],
        [  101,  2029, 13931,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34384045004844666, logits - tensor([[-6.8487,  1.1562, -5.7767, -1.8873],
        [-6.2389,  0.9472, -5.8184, -1.3468],
        [-4.9685, -4.1289,  2.7359, -3.2329],
        [-5.1175, -2.1505, -5.1460,  1.9959],
   

 65%|██████▍   | 187/289 [02:21<01:16,  1.34it/s]

Training loop 187
tensor([[  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2653,  ...,     0,     0,     0],
        [  101,  2054,  2785,  ...,     0,     0,     0],
        [  101,  2054, 13100,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24489542841911316, logits - tensor([[-4.5685, -3.1993,  2.2086, -2.2284],
        [-5.0359, -1.7797, -4.5918,  2.0160],
        [-6.5721,  2.1597, -5.5477, -1.7485],
        [-5.0165,  0.8151, -5.0120, -1.3064],
   

 65%|██████▌   | 188/289 [02:21<01:15,  1.33it/s]

Training loop 188
tensor([[  101,  2029,  3698,  ...,     0,     0,     0],
        [  101,  2054,  2093,  ...,     0,     0,     0],
        [  101,  2054, 10864,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  4895,  ...,     0,     0,     0],
        [  101,  2054,  2060,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08089528977870941, logits - tensor([[-6.3064,  2.7433, -6.4662, -2.4886],
        [-6.2670,  1.9049, -5.2485, -1.7349],
        [-5.2957,  1.5228, -4.7535, -0.9051],
        [-5.4995,  0.7207, -5.1617, -1.7216],
   

 65%|██████▌   | 189/289 [02:22<01:14,  1.33it/s]

Training loop 189
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2592,  ...,    0,    0,    0],
        [ 101, 2054, 4013,  ..., 2029, 6698,  102],
        ...,
        [ 101, 2054, 7885,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2029, 2093,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2687663733959198, logits - tensor([[-5.9755,  1.0827, -4.3907, -0.6166],
        [-5.5876,  1.6586, -5.2894, -1.5017],
        [-4.9210,  1.2282, -4.5298, -0.6977],
        [-6.0735,  1.1316, -3.7943, -1.3471],
        [-5.5772,  1.9306, -4.9692, -0.9

 66%|██████▌   | 190/289 [02:23<01:14,  1.33it/s]

Training loop 190
tensor([[  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2040,  2020,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2064,  1996,  ...,     0,     0,     0],
        [  101,  2054, 13100,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22617630660533905, logits - tensor([[-4.0676, -3.1885, -4.2397,  3.1877],
        [-5.0017,  1.9028, -4.5917, -1.9149],
        [-5.9340,  1.5141, -5.1366, -1.6672],
        [-6.4516,  1.7502, -4.9598, -0.9677],
   

 66%|██████▌   | 191/289 [02:24<01:13,  1.34it/s]

Training loop 191
tensor([[ 101, 2054, 2176,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18694007396697998, logits - tensor([[-5.5803,  1.5839, -5.2064, -0.8155],
        [-4.7892, -3.6055, -4.2001,  4.0349],
        [-5.8626,  0.5731, -4.4742, -0.9267],
        [-6.0575,  1.0686, -5.7512, -1.5359],
        [-4.6982,  1.7037, -3.7960, -1.

 66%|██████▋   | 192/289 [02:24<01:12,  1.34it/s]

Training loop 192
tensor([[  101,  2029,  4432,  ...,     0,     0,     0],
        [  101,  2054,  2048,  ...,     0,     0,     0],
        [  101,  2054,  2041,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 10954,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  1059, 14949,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21011123061180115, logits - tensor([[-6.1526,  1.8145, -5.3510, -1.1065],
        [-5.7776,  1.6816, -5.2156, -1.2432],
        [-6.0607,  1.6499, -5.3567, -1.4492],
        [-5.9661,  1.5908, -5.5139, -1.9284],
   

 67%|██████▋   | 193/289 [02:25<01:11,  1.34it/s]

Training loop 193
tensor([[ 101, 2054, 3653,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2048,  ...,    0,    0,    0],
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        [ 101, 2054, 4512,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26544249057769775, logits - tensor([[-5.3236,  1.3631, -4.7477, -1.5008],
        [-5.9192,  0.8068, -5.0786, -1.1681],
        [-4.0870, -3.2572, -3.4364,  2.8188],
        [-5.9900,  1.5945, -5.3480, -1.3377],
        [-6.5478,  1.7121, -5.9327, -1.

 67%|██████▋   | 194/289 [02:26<01:11,  1.33it/s]

Training loop 194
tensor([[ 101, 2129, 6048,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2029, 2944,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3540734052658081, logits - tensor([[-6.1352,  1.8054, -5.8021, -2.2340],
        [-5.7802,  1.2822, -5.1025, -1.6712],
        [-5.6930,  1.2224, -4.9412, -1.4170],
        [-6.5082,  0.9281, -5.1290, -1.1474],
        [-5.8363,  1.2468, -4.9230, -0.9

 67%|██████▋   | 195/289 [02:27<01:11,  1.32it/s]

Training loop 195
tensor([[ 101, 2029, 2731,  ...,    0,    0,    0],
        [ 101, 2012, 2054,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2003, 2023,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2788888216018677, logits - tensor([[-4.8660,  1.9827, -4.2860, -1.7946],
        [-5.3663,  1.3295, -4.2169, -0.8722],
        [-6.5135,  0.8421, -5.6284, -1.1867],
        [-6.3063,  1.8880, -5.3175, -1.1384],
        [-6.2899,  1.9224, -6.0049, -1.5

 68%|██████▊   | 196/289 [02:27<01:10,  1.32it/s]

Training loop 196
tensor([[ 101, 2054, 4294,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2073, 2106,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2224292904138565, logits - tensor([[-5.8060,  1.5636, -5.6573, -1.5417],
        [-4.4280, -1.5875, -3.6436,  1.1909],
        [-4.9909, -3.1035,  2.1284, -2.4905],
        [-4.8151, -1.6972, -4.6669,  1.3602],
        [-4.5393, -3.7238,  3.5863, -3.7

 68%|██████▊   | 197/289 [02:28<01:09,  1.32it/s]

Training loop 197
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28862613439559937, logits - tensor([[-5.6232,  0.9729, -4.1423, -0.7666],
        [-5.5679,  1.4331, -5.5818, -1.4725],
        [-4.0139, -2.9889, -3.9982,  3.4640],
        [-5.6662, -2.3487, -4.8542,  2.2986],
        [-4.3919, -3.1903, -3.8211,  3.

 69%|██████▊   | 198/289 [02:29<01:08,  1.32it/s]

Training loop 198
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2003,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2656596302986145, logits - tensor([[-5.8533,  1.3887, -4.7909, -1.8199],
        [-4.5017, -3.2969,  2.4218, -2.6544],
        [-4.5676, -2.9734,  2.0932, -2.7618],
        [-5.0181,  0.8818, -4.2155, -0.7910],
    

 69%|██████▉   | 199/289 [02:30<01:08,  1.31it/s]

Training loop 199
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  3431,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2064,  ...,     0,     0,     0],
        [  101,  2024, 22594,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,  1999,  1996,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40073132514953613, logits - tensor([[-4.9035, -1.4396, -4.4785,  1.4057],
        [-4.9700,  1.6053, -5.1479, -0.8367],
        [-3.9849, -3.3200,  2.4330, -2.8570],
        [-5.0077, -3.0544,  1.1033, -2.3373],
   

 69%|██████▉   | 200/289 [02:30<01:07,  1.32it/s]

Training loop 200
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0],
        [  101,  2029, 12739,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3489387035369873, logits - tensor([[-5.3556,  0.9737, -4.6440, -1.4406],
        [-5.8872,  1.0780, -6.0402, -0.8908],
        [-6.0161,  0.9643, -5.2269, -0.6372],
        [-5.1579,  1.1255, -4.6375, -0.6456],
    

 70%|██████▉   | 201/289 [02:31<01:06,  1.32it/s]

Training loop 201
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2029,  6847,  ...,  1998,  3231,   102],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09869317710399628, logits - tensor([[-6.5689,  1.2228, -5.2637, -0.8819],
        [-4.5577,  0.7339, -3.7319, -1.2112],
        [-5.3081, -2.9901, -4.0230,  3.4136],
        [-5.3106, -3.5973,  2.2838, -3.0398],
   

 70%|██████▉   | 202/289 [02:32<01:05,  1.33it/s]

Training loop 202
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2465,  ...,  2344,  2000,   102],
        ...,
        [  101,  2029, 25957,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20436111092567444, logits - tensor([[-4.9010,  1.6223, -5.4848, -1.7187],
        [-4.5310, -3.1732, -4.0760,  3.1857],
        [-5.9218,  1.1975, -5.8770, -1.7452],
        [-4.8090, -2.4214, -4.3698,  2.3238],
   

 70%|███████   | 203/289 [02:33<01:04,  1.33it/s]

Training loop 203
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2060,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 9329,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35347214341163635, logits - tensor([[-5.9593,  1.6775, -4.6432, -2.1788],
        [-5.7350, -1.9823, -5.5477,  1.6023],
        [-6.4159,  1.2263, -6.4071, -1.6901],
        [-6.1358,  0.6163, -5.0179, -1.5645],
        [-6.1886,  1.5307, -5.2214, -1.

 71%|███████   | 204/289 [02:33<01:03,  1.33it/s]

Training loop 204
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2323, 2037,  ...,    0,    0,    0],
        [ 101, 2339, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08937931060791016, logits - tensor([[-5.8800,  2.2074, -5.4892, -1.3303],
        [-5.4362,  1.5753, -4.1374, -1.3645],
        [-5.7463,  1.6118, -4.2345, -1.3660],
        [-5.3598,  1.5808, -5.3100, -1.5870],
        [-5.6469,  1.5188, -4.8508, -1.

 71%|███████   | 205/289 [02:34<01:02,  1.34it/s]

Training loop 205
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 9324,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12839651107788086, logits - tensor([[-5.4370,  1.6442, -5.3594, -0.9778],
        [-5.5709,  1.0294, -4.6800, -1.5824],
        [-6.7844,  1.4625, -6.3617, -1.0140],
        [-5.6278,  1.0059, -4.5249, -0.6539],
        [-5.8418,  1.1015, -4.8742, -1.

 71%|███████▏  | 206/289 [02:35<01:02,  1.34it/s]

Training loop 206
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2176,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ..., 1037, 9621,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22109568119049072, logits - tensor([[-6.1315,  1.2660, -5.7154, -2.3302],
        [-5.0673,  2.5078, -5.9277, -1.8385],
        [-5.6641,  0.5893, -4.9209, -1.9252],
        [-4.2853, -2.6568, -4.0368,  2.7129],
        [-6.2392,  0.6321, -5.2674, -0.

 72%|███████▏  | 207/289 [02:36<01:01,  1.34it/s]

Training loop 207
tensor([[  101,  2054,  2777,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ..., 13103,  6922,   102],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2054,  3599,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31504932045936584, logits - tensor([[-5.9678,  0.7716, -5.2664, -1.4831],
        [-6.0668,  1.0795, -5.7232, -1.4427],
        [-5.8370,  1.6339, -4.9187, -0.8178],
        [-5.2462,  1.4360, -5.1926, -2.4374],
   

 72%|███████▏  | 208/289 [02:36<01:00,  1.33it/s]

Training loop 208
tensor([[ 101, 2029, 2110,  ...,    0,    0,    0],
        [ 101, 2054, 3698,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 2236,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4247164726257324, logits - tensor([[-5.1936, -0.9829, -4.5491,  1.0493],
        [-4.8754,  1.1588, -4.9882, -1.6271],
        [-5.5677,  1.5530, -4.6614, -0.8188],
        [-5.7783,  1.6616, -4.8394, -1.9228],
        [-5.4073,  2.3217, -5.7313, -1.6

 72%|███████▏  | 209/289 [02:37<01:00,  1.32it/s]

Training loop 209
tensor([[ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30555975437164307, logits - tensor([[-4.8135, -2.8167,  0.5237, -0.8124],
        [-5.5793,  2.1481, -4.3597, -1.6030],
        [-5.6201,  1.3442, -4.7278, -1.4810],
        [-5.7870,  2.4262, -4.3241, -2.0083],
        [-3.9671, -1.9322, -4.1365,  2.

 73%|███████▎  | 210/289 [02:38<00:59,  1.33it/s]

Training loop 210
tensor([[ 101, 2054, 2060,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 4275,  ..., 2241, 2944,  102],
        ...,
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3639070391654968, logits - tensor([[-6.3393,  1.4265, -5.7469, -1.7099],
        [-5.3272,  0.8682, -4.5752, -0.4241],
        [-5.4827,  0.4973, -3.7277, -1.0934],
        [-5.2500, -2.5110, -4.8029,  1.9555],
        [-5.8464,  0.2701, -5.0925, -1.6

 73%|███████▎  | 211/289 [02:39<00:58,  1.32it/s]

Training loop 211
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2029, 7705,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19359007477760315, logits - tensor([[-6.2647,  1.5745, -5.0159, -1.7494],
        [-4.8846, -3.9624,  3.3476, -2.8699],
        [-5.6625,  1.3701, -5.2144, -1.5476],
        [-5.1717,  1.7345, -4.8126, -1.4334],
        [-5.7860,  1.3435, -5.8448, -0.

 73%|███████▎  | 212/289 [02:39<00:57,  1.33it/s]

Training loop 212
tensor([[  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2106,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 26163,  ...,  2000,  4468,   102],
        [  101,  2129,  2001,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21352988481521606, logits - tensor([[-5.8091,  2.3377, -4.6308, -0.9758],
        [-5.8374,  1.5135, -5.2317, -2.0757],
        [-5.6438,  2.6319, -5.3181, -1.4304],
        [-5.4828,  1.0784, -5.7988, -1.5321],
   

 74%|███████▎  | 213/289 [02:40<00:56,  1.33it/s]

Training loop 213
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2054, 13931,  ...,     0,     0,     0],
        ...,
        [  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28974419832229614, logits - tensor([[-6.0316,  1.8225, -4.7297, -1.4023],
        [-6.0915,  0.6100, -4.8949, -0.5477],
        [-5.0274,  0.5491, -5.8759, -1.4435],
        [-5.3420,  1.0177, -5.3307, -1.7585],
   

 74%|███████▍  | 214/289 [02:41<00:56,  1.33it/s]

Training loop 214
tensor([[  101,  2054,  2047,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ..., 26163,  1010,   102],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1432531774044037, logits - tensor([[-5.1145,  1.2305, -5.2667, -1.5822],
        [-3.8065, -2.1494, -4.1169,  2.3961],
        [-6.4273,  0.6647, -4.2923, -1.2891],
        [-5.1605,  1.9697, -5.1861, -2.4111],
    

 74%|███████▍  | 215/289 [02:42<00:55,  1.33it/s]

Training loop 215
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2515, 2023,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10203395038843155, logits - tensor([[-5.9970,  1.0085, -4.3829, -1.4561],
        [-5.5146,  2.4311, -4.5698, -1.2270],
        [-4.7883, -2.5886,  2.2203, -2.8175],
        [-6.3374,  0.4686, -5.8205, -1.4376],
        [-5.1839,  1.6250, -5.0310, -1.

 75%|███████▍  | 216/289 [02:42<00:55,  1.32it/s]

Training loop 216
tensor([[  101,  2054,  7885,  ...,     0,     0,     0],
        [  101,  2029, 13058,  ...,     0,     0,     0],
        [  101,  2029, 13221,  ...,     0,     0,     0],
        ...,
        [  101,  2001,  2023,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,  2946,  1012,   102],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10835342854261398, logits - tensor([[-5.3426,  1.5017, -4.8409, -0.9075],
        [-5.8173,  2.0763, -5.7900, -0.7938],
        [-4.4721,  1.4253, -4.9579, -2.3223],
        [-5.7648,  1.0799, -4.9577, -1.0217],
   

 75%|███████▌  | 217/289 [02:43<00:54,  1.31it/s]

Training loop 217
tensor([[ 101, 2011, 2129,  ...,    0,    0,    0],
        [ 101, 2054, 4487,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 6847,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ..., 1996, 3737,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23905514180660248, logits - tensor([[-4.6339, -3.8459, -3.3296,  3.0432],
        [-6.2775,  1.0375, -4.6130, -1.9935],
        [-4.7354, -2.7903,  0.8386, -1.8798],
        [-5.7615,  2.0786, -4.8144, -1.1782],
        [-5.3437,  1.2770, -4.8626, -1.

 75%|███████▌  | 218/289 [02:44<00:53,  1.32it/s]

Training loop 218
tensor([[  101,  2054,  5579,  ...,     0,     0,     0],
        [  101,  2003,  2045,  ...,     0,     0,     0],
        [  101,  2054, 20062,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2146,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21590639650821686, logits - tensor([[-5.7916,  1.8656, -5.5553, -1.3534],
        [-5.1747, -0.4212, -2.4045, -1.3141],
        [-5.9394,  1.1652, -5.1505, -1.3126],
        [-5.5664,  1.4424, -4.7147, -1.2996],
   

 76%|███████▌  | 219/289 [02:45<00:53,  1.32it/s]

Training loop 219
tensor([[ 101, 2054, 2093,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ..., 2109, 2000,  102],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2291,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2073, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20872829854488373, logits - tensor([[-6.6255,  1.9585, -5.5961, -1.6813],
        [-5.1934,  1.1792, -4.7429, -0.4075],
        [-4.8394, -2.2194, -4.3422,  2.9081],
        [-6.2626,  1.6666, -5.3818, -1.5679],
        [-4.2356, -2.6632, -4.0591,  2.

 76%|███████▌  | 220/289 [02:45<00:52,  1.32it/s]

Training loop 220
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 5337,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18158334493637085, logits - tensor([[-5.8093,  1.8619, -5.0041, -1.1930],
        [-4.4180, -4.3553,  3.4018, -2.2574],
        [-6.1489,  1.4641, -4.4153, -1.6350],
        [-5.2132,  0.8319, -5.3204, -0.8880],
        [-5.4334,  1.8231, -5.0960, -1.

 76%|███████▋  | 221/289 [02:46<00:51,  1.32it/s]

Training loop 221
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2330,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18539880216121674, logits - tensor([[-6.4448,  1.5628, -6.1721, -1.4363],
        [-5.3168,  1.4418, -4.9304, -1.3319],
        [-5.2431, -3.6807,  2.6751, -3.0774],
        [-4.1307, -3.7761,  1.7626, -2.1965],
        [-5.2439,  0.7577, -4.1700, -1.

 77%|███████▋  | 222/289 [02:47<00:50,  1.33it/s]

Training loop 222
tensor([[  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  9312,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,  2217,  1012,   102],
        [  101,  2029, 22822,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16160306334495544, logits - tensor([[-4.3912, -4.3747,  2.7863, -2.9775],
        [-6.2678,  1.4810, -4.2262, -1.3183],
        [-5.2797,  1.1666, -5.8034, -1.2919],
        [-5.7844,  0.7953, -5.4162, -0.9144],
   

 77%|███████▋  | 223/289 [02:48<00:49,  1.33it/s]

Training loop 223
tensor([[ 101, 2054, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35361555218696594, logits - tensor([[-5.8934,  1.6798, -5.2389, -1.3847],
        [-6.0826,  1.2662, -6.0295, -2.1328],
        [-5.6506,  0.5871, -4.0876, -2.1880],
        [-5.8780,  1.6492, -5.0595, -1.8390],
        [-5.9578,  1.5930, -5.8619, -1.

 78%|███████▊  | 224/289 [02:48<00:48,  1.33it/s]

Training loop 224
tensor([[ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2058, 2029,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2146,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2732993960380554, logits - tensor([[-5.0159, -1.1938, -5.3598,  1.3602],
        [-6.0112,  1.1198, -4.7486, -1.4038],
        [-5.1007, -2.0129, -4.3646,  1.9706],
        [-5.4337, -2.5457, -4.5766,  2.4460],
        [-5.4812, -4.3758,  3.5085, -3.6

 78%|███████▊  | 225/289 [02:49<00:48,  1.33it/s]

Training loop 225
tensor([[  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  8518,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17075777053833008, logits - tensor([[-6.0759,  0.9918, -4.8667, -1.9703],
        [-7.2077,  1.5883, -5.6108, -0.9726],
        [-4.3895, -3.9982,  2.7928, -2.8084],
        [-6.1992,  1.6218, -5.4126, -1.6595],
   

 78%|███████▊  | 226/289 [02:50<00:47,  1.33it/s]

Training loop 226
tensor([[ 101, 2054, 2773,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2515,  ..., 1010, 1053,  102],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2029, 2784,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32397037744522095, logits - tensor([[-4.9503,  2.5585, -4.6506, -1.4017],
        [-4.8039, -3.1088,  3.0240, -2.2200],
        [-4.5721, -1.8322, -4.7473,  1.5289],
        [-5.7084,  0.9553, -3.9938, -1.0808],
        [-5.5065,  2.1143, -4.9927, -1.

 79%|███████▊  | 227/289 [02:51<00:46,  1.34it/s]

Training loop 227
tensor([[  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,  2168, 21641,   102],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29976969957351685, logits - tensor([[-5.3364,  0.9247, -4.8940, -1.7118],
        [-4.7771, -3.1439,  1.7140, -1.5390],
        [-6.3807,  1.9130, -5.0199, -1.4795],
        [-6.0188,  1.6546, -5.0773, -1.3511],
   

 79%|███████▉  | 228/289 [02:51<00:45,  1.34it/s]

Training loop 228
tensor([[ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2653,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2312,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1887388974428177, logits - tensor([[-4.8920, -3.4809,  3.4044, -2.3082],
        [-6.0757,  1.7771, -4.7465, -1.4453],
        [-5.0229,  1.4865, -4.4654, -1.5962],
        [-5.4816,  0.8338, -4.6436, -1.7297],
        [-5.0014,  1.0425, -4.1362, -0.4

 79%|███████▉  | 229/289 [02:52<00:44,  1.34it/s]

Training loop 229
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2003,  2023,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  6882,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ..., 29160,  5166,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10483904182910919, logits - tensor([[-6.0515,  1.9984, -5.9201, -1.8316],
        [-4.3773, -4.4372,  3.8704, -3.1258],
        [-4.8049, -3.3421,  1.4734, -1.7276],
        [-6.1594,  2.2447, -6.2327, -2.0885],
   

 80%|███████▉  | 230/289 [02:53<00:44,  1.34it/s]

Training loop 230
tensor([[ 101, 2054, 2093,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17294524610042572, logits - tensor([[-5.8151,  2.1159, -5.9835, -1.6551],
        [-4.2647, -3.6982,  2.5868, -2.5145],
        [-5.1222,  0.3416, -4.1102, -0.5637],
        [-5.5427,  1.1941, -4.6549, -1.3106],
        [-4.7551, -2.7499,  1.1401, -2.

 80%|███████▉  | 231/289 [02:54<00:43,  1.34it/s]

Training loop 231
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2411,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  3015,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12776526808738708, logits - tensor([[-6.0689,  2.2576, -5.9130, -1.0536],
        [-6.1929,  1.3796, -5.6679, -1.4333],
        [-7.1466,  1.3145, -5.7510, -1.3340],
        [-4.6438, -0.4469, -5.2483,  0.3036],
   

 80%|████████  | 232/289 [02:54<00:42,  1.33it/s]

Training loop 232
tensor([[  101,  2064,  8040,  ...,     0,     0,     0],
        [  101,  2003, 14255,  ...,     0,     0,     0],
        [  101,  2029,  2653,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2110,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2073,  2515,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16845300793647766, logits - tensor([[-5.3608, -4.2963,  2.7598, -2.9640],
        [-4.9828, -2.6598,  1.8147, -2.3033],
        [-4.6133, -2.8876, -4.6443,  2.8975],
        [-5.7150, -3.2216,  2.0066, -2.7919],
   

 81%|████████  | 233/289 [02:55<00:41,  1.33it/s]

Training loop 233
tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2944,  ..., 1010, 1002,  102],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 4730,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34949713945388794, logits - tensor([[-4.7304,  1.6839, -5.5772, -0.9541],
        [-4.1916, -3.5526,  1.9478, -2.4739],
        [-6.1101,  1.6525, -5.2081, -1.3119],
        [-4.8541, -4.2100,  1.9996, -2.6282],
        [-5.5495,  2.1218, -4.4052, -1.

 81%|████████  | 234/289 [02:56<00:41,  1.33it/s]

Training loop 234
tensor([[  101,  2054, 23760,  ...,  1998,  3231,   102],
        [  101,  2054,  3001,  ...,  1998,  1996,   102],
        [  101,  2054,  9312,  ...,     0,     0,     0],
        ...,
        [  101,  2106,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06990090012550354, logits - tensor([[-5.8434,  1.5156, -5.5239, -1.2502],
        [-5.4529, -1.5022, -4.5392,  1.5659],
        [-6.9991,  2.2344, -5.5929, -1.8981],
        [-4.8820, -2.5004, -3.8999,  2.3783],
   

 81%|████████▏ | 235/289 [02:57<00:40,  1.32it/s]

Training loop 235
tensor([[ 101, 2129, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 4708,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2073, 2079,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 3716,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4091085195541382, logits - tensor([[-5.6347,  1.2889, -5.0012, -1.2140],
        [-6.1142,  1.5551, -5.8888, -1.7687],
        [-5.0075, -2.2614, -4.1233,  2.5995],
        [-5.4441,  1.8120, -4.7196, -1.3771],
        [-6.3412,  2.0738, -5.5317, -2.2

 82%|████████▏ | 236/289 [02:57<00:40,  1.32it/s]

Training loop 236
tensor([[  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 14336,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2106,  5579,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2691138982772827, logits - tensor([[-4.9337, -3.8753,  2.4119, -2.5987],
        [-5.0770, -3.1107, -3.7722,  2.8113],
        [-5.9354,  1.2023, -5.2880, -1.4774],
        [-5.0744, -4.3187,  2.6097, -2.4973],
    

 82%|████████▏ | 237/289 [02:58<00:39,  1.32it/s]

Training loop 237
tensor([[  101,  2054, 11433,  ...,     0,     0,     0],
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2003,  2035,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4321606159210205, logits - tensor([[-5.5119,  2.0372, -4.1900, -1.5324],
        [-6.3409,  1.7137, -5.8314, -2.1992],
        [-5.0305, -2.5075, -4.8964,  2.0897],
        [-5.2702, -3.5158,  2.0366, -2.6668],
    

 82%|████████▏ | 238/289 [02:59<00:38,  1.32it/s]

Training loop 238
tensor([[  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2003,  2023,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0],
        [  101,  2054,  5754,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3487175405025482, logits - tensor([[-5.6122, -1.8301, -4.6027,  2.1901],
        [-5.3311, -3.9679,  3.2592, -2.4306],
        [-6.1612,  0.7597, -5.6497, -1.7084],
        [-5.0438,  0.8206, -4.4363, -1.5046],
    

 83%|████████▎ | 239/289 [03:00<00:38,  1.31it/s]

Training loop 239
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24052929878234863, logits - tensor([[-5.4518, -2.3521, -5.3280,  2.6828],
        [-5.7004,  1.5371, -5.3912, -1.5214],
        [-5.6075,  0.8882, -4.9502, -1.6621],
        [-5.1154, -3.4286,  2.4817, -2.5609],
        [-5.6648,  2.7078, -5.7639, -1.

 83%|████████▎ | 240/289 [03:01<00:37,  1.31it/s]

Training loop 240
tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 8518,  ...,    0,    0,    0],
        [ 101, 2029, 8518,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ..., 1035, 1045,  102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24598504602909088, logits - tensor([[-4.4952, -2.0747, -5.0032,  1.7878],
        [-4.8091, -1.7709, -3.7288,  2.6293],
        [-3.7950, -4.1176,  2.7069, -2.7278],
        [-4.7175, -3.4537,  2.1451, -3.6070],
        [-5.7300,  1.4777, -4.4331, -0.

 83%|████████▎ | 241/289 [03:01<00:36,  1.31it/s]

Training loop 241
tensor([[ 101, 2129, 2020,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2029, 7271,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1934427171945572, logits - tensor([[-5.1610,  0.8242, -5.0051, -1.8376],
        [-4.0470, -3.5192, -3.7898,  2.6264],
        [-5.8366,  1.5681, -4.9128, -1.5519],
        [-5.0219,  1.4123, -4.5023, -1.0269],
        [-5.1856, -2.8458,  2.4226, -2.8

 84%|████████▎ | 242/289 [03:02<00:35,  1.32it/s]

Training loop 242
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2029,  2731,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 15792,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11050739884376526, logits - tensor([[-5.1739,  1.0834, -4.5837, -0.8628],
        [-4.1137, -1.7065, -4.1909,  2.0196],
        [-4.9861, -2.7723, -5.0242,  2.6381],
        [-6.5783,  1.8449, -5.3057, -1.8984],
   

 84%|████████▍ | 243/289 [03:03<00:34,  1.33it/s]

Training loop 243
tensor([[  101,  2079, 14391,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2029,  2773,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        [  101,  2029,  2773,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09345881640911102, logits - tensor([[-5.5757, -3.2687,  2.2791, -2.4874],
        [-4.6561, -3.4045,  2.0448, -3.0366],
        [-5.7954,  1.9918, -4.3538, -1.3528],
        [-4.4405, -3.9090,  2.6559, -2.0972],
   

 84%|████████▍ | 244/289 [03:04<00:33,  1.33it/s]

Training loop 244
tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0],
        [  101,  2054, 17463,  ...,     0,     0,     0],
        ...,
        [  101,  2515,  2037,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4447620213031769, logits - tensor([[-6.8958,  1.7455, -6.3204, -1.4654],
        [-5.3988,  1.7538, -5.6412, -1.7330],
        [-4.5021, -2.4187, -4.7017,  1.4966],
        [-5.9687,  1.4964, -4.4527, -1.4173],
    

 85%|████████▍ | 245/289 [03:04<00:33,  1.33it/s]

Training loop 245
tensor([[ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2029, 2653,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 7982,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 5981,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07211492955684662, logits - tensor([[-5.1436,  1.5563, -4.4860, -1.9846],
        [-6.2882,  1.6092, -5.8147, -2.4026],
        [-5.9802,  1.8207, -5.0478, -1.5173],
        [-5.1436, -2.9373,  2.1443, -2.2228],
        [-4.9345, -4.4773,  3.1717, -2.

 85%|████████▌ | 246/289 [03:05<00:32,  1.34it/s]

Training loop 246
tensor([[  101,  2054, 12158,  ...,     0,     0,     0],
        [  101,  2003,  1996,  ...,     0,     0,     0],
        [  101,  2029,  3698,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  6907,  ...,     0,     0,     0],
        [  101,  2054,  2330,  ...,     0,     0,     0],
        [  101,  2129,  2116,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12602020800113678, logits - tensor([[-5.2312, -2.8429, -4.2594,  2.1298],
        [-5.0657, -3.1861,  2.0026, -2.4434],
        [-6.6088,  1.5705, -5.9360, -1.4511],
        [-4.5119, -3.7507,  2.6644, -2.6068],
   

 85%|████████▌ | 247/289 [03:06<00:31,  1.34it/s]

Training loop 247
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 7060,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ..., 1007, 1027,  102],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15995724499225616, logits - tensor([[-6.2630, -3.0143, -5.2901,  2.8369],
        [-5.8070,  2.0043, -5.9143, -1.7635],
        [-5.4354,  1.2898, -5.2664, -1.4770],
        [-6.2197,  2.2118, -5.2717, -1.6819],
        [-5.6100,  1.2770, -5.5590, -1.

 86%|████████▌ | 248/289 [03:07<00:30,  1.33it/s]

Training loop 248
tensor([[  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2106,  2027,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2944,  ...,     0,     0,     0],
        [  101,  2054,  2060,  ...,     0,     0,     0],
        [  101,  2054, 16105,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10748379677534103, logits - tensor([[-5.5115,  0.7746, -3.5732, -0.0096],
        [-7.1098,  1.8650, -5.4879, -1.5407],
        [-4.4602, -3.2558,  2.3731, -2.2937],
        [-6.0744,  1.7041, -5.7602, -1.6274],
   

 86%|████████▌ | 249/289 [03:07<00:30,  1.33it/s]

Training loop 249
tensor([[ 101, 2097, 2122,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 7885,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2836,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20491495728492737, logits - tensor([[-4.5222, -3.6372,  2.6759, -2.2777],
        [-5.7695,  1.0898, -4.9609, -1.5883],
        [-5.8632,  1.6928, -5.4983, -0.9388],
        [-5.1018, -4.3316,  2.5239, -3.2547],
        [-5.7057, -2.9253,  2.3205, -2.

 87%|████████▋ | 250/289 [03:08<00:29,  1.33it/s]

Training loop 250
tensor([[  101,  2054,  2003,  ...,  1012,   102,     0],
        [  101,  1999,  2029,  ...,  1010, 27593,   102],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ..., 12170, 13578,   102],
        [  101,  2054, 26163,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2793918550014496, logits - tensor([[-4.7852,  1.1243, -4.6389, -1.8144],
        [-4.3778,  0.7949, -4.0794, -1.0270],
        [-4.6793, -3.2291,  2.6255, -2.5612],
        [-5.4116,  1.6553, -4.3397, -1.4738],
    

 87%|████████▋ | 251/289 [03:09<00:28,  1.33it/s]

Training loop 251
tensor([[  101,  2029,  2653,  ...,     0,     0,     0],
        [  101,  2129,  2003,  ...,     0,     0,     0],
        [  101,  2029,  9312,  ...,     0,     0,     0],
        ...,
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  2054, 12046,  ...,     0,     0,     0],
        [  101,  2054,  2367,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17496128380298615, logits - tensor([[-5.6895,  1.5965, -3.8334, -1.8954],
        [-6.3828,  0.8895, -6.4668, -1.8341],
        [-5.2283,  1.8990, -5.2387, -2.3627],
        [-5.5572,  1.6435, -5.3320, -0.8764],
   

 87%|████████▋ | 252/289 [03:10<00:27,  1.33it/s]

Training loop 252
tensor([[  101,  2129,  2024,  ...,     0,     0,     0],
        [  101,  2029,  2394,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2129,  2502,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4169408679008484, logits - tensor([[-5.6543,  1.5729, -5.7913, -2.0167],
        [-4.0438, -2.2237, -3.8235,  2.3418],
        [-6.5604,  1.0969, -5.4102, -1.6772],
        [-5.4355,  1.5911, -5.1433, -1.4612],
    

 88%|████████▊ | 253/289 [03:10<00:27,  1.33it/s]

Training loop 253
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2944,  ...,    0,    0,    0],
        ...,
        [ 101, 2024, 2070,  ...,    0,    0,    0],
        [ 101, 2054, 2093,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36059629917144775, logits - tensor([[-6.6088,  1.0783, -6.2164, -1.8446],
        [-7.1085,  1.4192, -5.7081, -1.5725],
        [-5.8730,  1.8645, -4.4798, -1.2490],
        [-3.8329, -1.8004, -4.3785,  3.1087],
        [-4.4681,  1.4780, -4.9614, -1.

 88%|████████▊ | 254/289 [03:11<00:26,  1.33it/s]

Training loop 254
tensor([[ 101, 2054, 3176,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2029, 2270,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2031,  ...,    0,    0,    0],
        [ 101, 2024, 3463,  ...,    0,    0,    0],
        [ 101, 2054, 4800,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3580220341682434, logits - tensor([[-5.7827,  1.4086, -5.4296, -2.0712],
        [-5.9731,  1.1676, -5.4362, -1.5757],
        [-5.5072,  1.2246, -4.4777, -1.2169],
        [-6.3906,  1.7042, -5.7127, -1.3814],
        [-4.7688,  1.6794, -5.1797, -1.5

 88%|████████▊ | 255/289 [03:12<00:25,  1.33it/s]

Training loop 255
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2060,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3318859934806824, logits - tensor([[-5.4728,  1.8153, -4.9497, -1.4646],
        [-6.5040,  1.2296, -4.9044, -0.8287],
        [-6.6676,  0.8690, -4.9538, -1.0724],
        [-5.7371,  1.6635, -4.9894, -2.1374],
        [-5.2997, -2.6050, -4.2353,  2.7

 89%|████████▊ | 256/289 [03:13<00:25,  1.32it/s]

Training loop 256
tensor([[  101,  2054, 15488,  ...,     0,     0,     0],
        [  101,  2129,  1999,  ...,     0,     0,     0],
        [  101,  2054,  6364,  ...,     0,     0,     0],
        ...,
        [  101,  1999,  1996,  ...,     0,     0,     0],
        [  101,  2054,  4155,  ...,     0,     0,     0],
        [  101,  2054, 24828,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 89%|████████▉ | 257/289 [03:13<00:24,  1.31it/s]

loss - 0.3671034574508667, logits - tensor([[-5.0575,  1.8319, -5.3755, -2.7271],
        [-5.7948,  1.4563, -4.6235, -0.9295],
        [-5.2735,  2.4119, -5.2099, -3.1326],
        [-6.5704,  1.5714, -5.5931, -0.9386],
        [-5.9186,  2.4145, -6.1613, -2.2508],
        [-5.5547,  1.9734, -4.7080, -1.5663],
        [-4.7939,  0.8751, -5.0310, -1.4983],
        [-4.7211, -2.7778, -4.4700,  2.9028]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 257
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2129, 2020,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2054, 4725,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 

 89%|████████▉ | 258/289 [03:14<00:23,  1.31it/s]

Training loop 258
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 3319,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19236379861831665, logits - tensor([[-5.4803,  1.7208, -4.9794, -1.7822],
        [-4.2214, -3.4562, -3.3243,  3.2663],
        [-5.8901,  2.1988, -5.1055, -1.5516],
        [-5.3043, -3.1042,  2.3554, -2.4640],
        [-5.3481,  0.7538, -3.4125, -1.

 90%|████████▉ | 259/289 [03:15<00:22,  1.31it/s]

Training loop 259
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2029, 26163,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 13100,  ...,     0,     0,     0],
        [  101,  2029,  4275,  ...,     0,     0,     0],
        [  101,  2129,  2488,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25070810317993164, logits - tensor([[-4.9606,  1.8856, -4.3112, -1.0978],
        [-5.3021,  1.7949, -4.3387, -1.4476],
        [-5.8143,  1.2761, -5.1015, -1.6309],
        [-6.3417,  1.7342, -5.6634, -1.2622],
   

 90%|████████▉ | 260/289 [03:16<00:22,  1.31it/s]

Training loop 260
tensor([[  101,  2054, 15973,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  1999,  2029,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2176,  ...,     0,     0,     0],
        [  101,  2029,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2001,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1821904331445694, logits - tensor([[-5.4919,  1.1837, -4.1628, -1.5243],
        [-6.1327,  1.4398, -4.9164, -0.7552],
        [-6.1289,  1.4457, -5.3413, -0.9148],
        [-5.9349,  1.1346, -5.0230, -2.0799],
    

 90%|█████████ | 261/289 [03:16<00:21,  1.32it/s]

Training loop 261
tensor([[ 101, 2129, 2024,  ..., 2009, 3397,  102],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2006, 2029,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42528870701789856, logits - tensor([[-5.1093,  1.3025, -4.2954, -1.3124],
        [-6.3222,  2.1456, -5.4126, -1.3642],
        [-5.2878,  1.1881, -4.2280, -1.1207],
        [-5.3985,  1.5138, -4.3715, -0.5738],
        [-5.3002,  0.8111, -4.3439, -0.

 91%|█████████ | 262/289 [03:17<00:20,  1.32it/s]

Training loop 262
tensor([[  101,  2054,  2465,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,     0,     0,     0],
        [  101,  2054,  2838,  ...,     0,     0,     0],
        ...,
        [  101,  2064,  2023,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23931056261062622, logits - tensor([[-5.6764,  2.4939, -5.0233, -1.8355],
        [-6.1800,  2.1212, -5.7602, -1.9050],
        [-6.4966,  1.5086, -5.7020, -2.0436],
        [-4.1911, -2.9294,  1.8853, -2.6900],
   

 91%|█████████ | 263/289 [03:18<00:19,  1.33it/s]

Training loop 263
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2339, 2027,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 6388,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11723395437002182, logits - tensor([[-5.1882, -2.0738, -5.6579,  1.9587],
        [-6.2947,  1.5570, -5.2307, -1.2551],
        [-5.8655,  1.9329, -5.9212, -0.8499],
        [-4.3782,  1.9479, -5.2676, -1.6172],
        [-4.2268, -3.0835,  2.7622, -2.

 91%|█████████▏| 264/289 [03:19<00:18,  1.33it/s]

Training loop 264
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2129,  2020,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2024,  ...,     0,     0,     0],
        [  101,  2054, 10640,  ...,     0,     0,     0],
        [  101,  2054, 17953,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08582600951194763, logits - tensor([[-5.1411, -4.2357,  2.6921, -2.6126],
        [-6.0577,  1.3410, -4.9958, -1.6491],
        [-5.7190,  2.0022, -5.3357, -1.3119],
        [-4.2308, -3.0593,  2.5883, -2.2621],
   

 92%|█████████▏| 265/289 [03:19<00:18,  1.33it/s]

Training loop 265
tensor([[  101,  2003,  2009,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  3722,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 14676,  ...,     0,     0,     0],
        [  101,  2024,  1996,  ...,     0,     0,     0],
        [  101,  2029,  4155,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29428038001060486, logits - tensor([[-5.4661, -3.4715,  3.1007, -3.0171],
        [-5.4557,  0.2477, -5.0515, -1.2614],
        [-5.7458,  2.0089, -5.6696, -1.1175],
        [-5.5971,  1.6614, -4.8463, -1.4312],
   

 92%|█████████▏| 266/289 [03:20<00:17,  1.33it/s]

Training loop 266
tensor([[ 101, 2000, 2029,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 4083,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 3698,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1988646388053894, logits - tensor([[-6.0848,  1.5551, -5.1307, -1.6917],
        [-5.5047,  1.9502, -4.4760, -1.1118],
        [-5.9207,  1.1405, -5.5057, -0.9188],
        [-6.0771,  2.0222, -5.2947, -1.4861],
        [-5.0362,  0.7497, -4.6422, -1.4

 92%|█████████▏| 267/289 [03:21<00:16,  1.33it/s]

Training loop 267
tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 4155,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2060,  ...,    0,    0,    0],
        [ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36625903844833374, logits - tensor([[-4.9875, -3.1301, -4.6570,  3.0154],
        [-4.9929, -3.8571,  3.0905, -2.5263],
        [-5.6537,  1.7873, -4.6273, -2.2630],
        [-6.4547,  1.2802, -5.0372, -1.6049],
        [-5.9491,  1.8905, -5.0271, -2.

 93%|█████████▎| 268/289 [03:22<00:15,  1.33it/s]

Training loop 268
tensor([[  101,  2054, 10873,  ...,     0,     0,     0],
        [  101,  2029,  2028,  ...,     0,     0,     0],
        [  101,  2029,  9896,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  4725,  ...,     0,     0,     0],
        [  101,  2054,  2176,  ...,  4294,  2015,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15994475781917572, logits - tensor([[-4.2554, -3.1766, -3.8939,  2.5787],
        [-5.7080,  1.0770, -5.0881, -1.7753],
        [-5.5301,  2.3695, -4.7242, -2.0884],
        [-5.0835,  1.3759, -5.1218, -1.1435],
   

 93%|█████████▎| 269/289 [03:22<00:14,  1.33it/s]

Training loop 269
tensor([[  101,  2054, 12046,  ...,  2096,  1996,   102],
        [  101,  2003, 13675,  ...,  2714,  4708,   102],
        [  101,  2129,  2172,  ...,     0,     0,     0],
        ...,
        [  101,  2054, 15756,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,  2057,  2036,   102]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29279160499572754, logits - tensor([[-5.1508, -0.1849, -4.3990,  0.5554],
        [-5.4888,  1.2258, -3.7555, -0.9502],
        [-5.3857,  2.3817, -5.8088, -1.6624],
        [-5.9844,  1.3285, -4.8454, -1.2475],
   

 93%|█████████▎| 270/289 [03:23<00:14,  1.33it/s]

Training loop 270
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2029,  2925,  ...,     0,     0,     0],
        [  101,  2054, 18804,  ...,     0,     0,     0],
        [  101,  2029,  4493,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2812651991844177, logits - tensor([[-4.5782,  1.0718, -4.3273, -1.2080],
        [-4.3857, -0.9280, -4.1758,  1.9734],
        [-6.1858,  2.0068, -5.0445, -1.3874],
        [-5.3182,  0.1073, -4.4907, -1.6709],
    

 94%|█████████▍| 271/289 [03:24<00:13,  1.33it/s]

Training loop 271
tensor([[ 101, 2106, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2785,  ...,    0,    0,    0],
        [ 101, 2029, 7718,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 4155,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.287493497133255, logits - tensor([[-5.1004, -4.0808,  2.2093, -3.5646],
        [-5.2545, -0.4232, -5.1084,  2.1536],
        [-5.1734, -1.8694, -4.3281,  2.0089],
        [-4.6973, -1.0377, -4.5433,  1.3215],
        [-4.9850, -3.8189,  2.6451, -2.76

 94%|█████████▍| 272/289 [03:25<00:12,  1.33it/s]

Training loop 272
tensor([[ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2073, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2773,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11898056417703629, logits - tensor([[-5.1467,  1.4026, -4.6959, -1.6400],
        [-5.6016,  1.7131, -5.3985, -1.5094],
        [-6.2665,  1.8186, -4.4711, -1.3218],
        [-6.1503,  0.8285, -5.1440, -1.4419],
        [-5.1755,  0.9225, -4.3082, -0.

 94%|█████████▍| 273/289 [03:25<00:12,  1.33it/s]

Training loop 273
tensor([[ 101, 2029, 2061,  ...,    0,    0,    0],
        [ 101, 2129, 2502,  ...,    0,    0,    0],
        [ 101, 2029, 5461,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2029, 4563,  ...,    0,    0,    0],
        [ 101, 2054, 2048,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29090428352355957, logits - tensor([[-5.4571,  0.9658, -4.5300, -0.5008],
        [-6.3878,  1.8680, -5.5966, -1.3050],
        [-5.3774,  0.9527, -5.0467, -1.2367],
        [-6.5866,  1.8674, -4.8227, -0.3619],
        [-6.7926,  2.0592, -4.7775, -1.

 95%|█████████▍| 274/289 [03:26<00:11,  1.33it/s]

Training loop 274
tensor([[ 101, 2129, 2515,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2024, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 3784,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2905234098434448, logits - tensor([[-6.1434,  1.1689, -4.9741, -2.0541],
        [-6.1388,  1.7029, -5.1649, -1.4106],
        [-5.5705,  1.6294, -5.1544, -1.7169],
        [-6.2179,  1.1843, -5.0291, -0.8987],
        [-5.8224,  0.9475, -4.7977, -1.9

 95%|█████████▌| 275/289 [03:27<00:10,  1.33it/s]

Training loop 275
tensor([[ 101, 2054, 9312,  ...,    0,    0,    0],
        [ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2029, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34854328632354736, logits - tensor([[-6.5193,  0.7528, -4.8141, -1.9735],
        [-5.0625, -3.9021,  2.3662, -2.4404],
        [-5.7760,  1.2121, -5.2454, -1.6433],
        [-5.3647,  1.3647, -4.7375, -0.9363],
        [-5.6605,  1.5543, -5.1030, -1.

 96%|█████████▌| 276/289 [03:28<00:09,  1.32it/s]

Training loop 276
tensor([[  101,  2079,  2027,  ...,     0,     0,     0],
        [  101,  2029, 13588,  ...,     0,     0,     0],
        [  101,  2054,  2024,  ...,     0,     0,     0],
        ...,
        [  101,  2024,  4738,  ..., 20284,  2072,   102],
        [  101,  2054, 10640,  ...,     0,     0,     0],
        [  101,  2129,  2001,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2576863765716553, logits - tensor([[-5.4064, -4.0420,  2.7079, -2.7901],
        [-5.8015,  1.9664, -4.6692, -1.4406],
        [-5.3826,  0.8959, -4.6157, -1.7113],
        [-4.8048,  1.8834, -4.2196, -1.3894],
    

 96%|█████████▌| 277/289 [03:28<00:09,  1.32it/s]

Training loop 277
tensor([[  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2054, 26163,  ...,  1996,  6903,   102],
        ...,
        [  101,  2054,  7957,  ...,     0,     0,     0],
        [  101,  2054, 26293,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2566952109336853, logits - tensor([[-5.3683,  1.8122, -5.3253, -1.2402],
        [-5.9736,  2.1010, -5.4136, -1.6340],
        [-5.9200,  1.6002, -4.5967, -1.0421],
        [-5.1753, -1.3873, -4.6153,  1.1667],
    

 96%|█████████▌| 278/289 [03:29<00:08,  1.31it/s]

Training loop 278
tensor([[ 101, 2054, 2785,  ...,    0,    0,    0],
        [ 101, 2054, 7885,  ...,    0,    0,    0],
        [ 101, 2079, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 9986,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27594849467277527, logits - tensor([[-6.0590,  0.9797, -5.8832, -1.4390],
        [-5.7665,  2.2569, -4.9013, -1.9959],
        [-5.6602, -4.2598,  3.9301, -2.7223],
        [-6.4505,  2.1416, -5.2014, -1.7413],
        [-6.4698,  1.6957, -5.6487, -2.

 97%|█████████▋| 279/289 [03:30<00:07,  1.31it/s]

Training loop 279
tensor([[ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2003, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2024,  ...,    0,    0,    0],
        [ 101, 2029, 6177,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2850600779056549, logits - tensor([[-5.5404,  2.1312, -5.6382, -0.9512],
        [-5.6200,  1.0299, -5.2031, -0.8299],
        [-4.3700, -2.7388,  0.5130, -0.7103],
        [-5.5724,  1.1722, -4.3755, -1.3498],
        [-4.8453, -3.2415,  2.3340, -1.6

 97%|█████████▋| 280/289 [03:31<00:06,  1.30it/s]

Training loop 280
tensor([[  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  2653,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2029, 25957,  ...,     0,     0,     0],
        [  101,  2079,  1996,  ...,     0,     0,     0],
        [  101,  2029,  2951,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10453754663467407, logits - tensor([[-6.3147,  1.2500, -4.6709, -0.7380],
        [-5.7145,  1.6298, -5.2572, -1.4385],
        [-5.4320,  2.0034, -5.4060, -1.5314],
        [-5.6168,  1.4761, -4.5572, -2.2278],
   

 97%|█████████▋| 281/289 [03:32<00:06,  1.31it/s]

Training loop 281
tensor([[ 101, 2129, 2079,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 4800,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2001, 2234,  ...,    0,    0,    0],
        [ 101, 2001, 1996,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09907788038253784, logits - tensor([[-4.7906,  0.8796, -3.8331, -1.2016],
        [-6.6856,  1.9588, -5.7921, -1.6507],
        [-5.3045,  1.5701, -4.8427, -1.3402],
        [-5.6729,  1.2168, -4.3711, -1.7393],
        [-5.3587,  1.7209, -4.5765, -1.

 98%|█████████▊| 282/289 [03:32<00:05,  1.31it/s]

Training loop 282
tensor([[ 101, 2054, 2842,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0],
        [ 101, 2054, 2731,  ..., 2224, 1037,  102],
        ...,
        [ 101, 2054, 2828,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10353469848632812, logits - tensor([[-5.6106,  1.8423, -4.5362, -2.2321],
        [-5.1459, -3.7405,  2.0609, -2.7991],
        [-5.8160,  0.5466, -4.8652, -1.3749],
        [-6.7168,  1.5342, -6.0550, -1.1851],
        [-4.9355,  1.0041, -4.8129, -1.

 98%|█████████▊| 283/289 [03:33<00:04,  1.32it/s]

Training loop 283
tensor([[  101,  2106,  2027,  ...,     0,     0,     0],
        [  101,  2003,  2045,  ...,     0,     0,     0],
        [  101,  2029, 12739,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2951,  ...,     0,     0,     0],
        [  101,  2054,  4127,  ...,     0,     0,     0],
        [  101,  2054,  2828,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0921248346567154, logits - tensor([[-5.2897, -3.5390,  3.2082, -3.2974],
        [-5.2521, -3.2647,  2.8828, -2.5765],
        [-7.4572,  1.9534, -5.4794, -0.5017],
        [-5.8980,  1.8334, -6.1543, -1.4841],
    

 98%|█████████▊| 284/289 [03:34<00:03,  1.32it/s]

Training loop 284
tensor([[ 101, 2054, 2515,  ...,    0,    0,    0],
        [ 101, 2029, 6254,  ...,    0,    0,    0],
        [ 101, 2054, 3291,  ...,    0,    0,    0],
        ...,
        [ 101, 2029, 2951,  ...,    0,    0,    0],
        [ 101, 2054, 2020,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18213985860347748, logits - tensor([[-4.9146,  1.7064, -4.7274, -1.1360],
        [-6.5290,  1.6717, -4.7600, -1.3023],
        [-6.4708,  2.3335, -5.5465, -1.8592],
        [-5.7660,  2.2264, -4.9011, -0.6072],
        [-5.3929, -1.2927, -4.3960,  1.

 99%|█████████▊| 285/289 [03:35<00:03,  1.32it/s]

Training loop 285
tensor([[  101,  2054,  2020,  ...,     0,     0,     0],
        [  101,  2054, 21677,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2129,  2515,  ...,     0,     0,     0],
        [  101,  2003,  2045,  ...,     0,     0,     0],
        [  101,  2054,  2739,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11719745397567749, logits - tensor([[-5.6963,  1.5894, -5.7669, -0.5210],
        [-5.3273,  1.5193, -5.4683, -1.6389],
        [-5.7945,  1.0083, -5.0175, -1.2913],
        [-5.8234,  1.1379, -4.4080, -1.5104],
   

 99%|█████████▉| 286/289 [03:35<00:02,  1.32it/s]

Training loop 286
tensor([[ 101, 2079, 2027,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2488,  ...,    0,    0,    0],
        [ 101, 2029, 2773,  ...,    0,    0,    0],
        [ 101, 2054, 2951,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27160531282424927, logits - tensor([[-4.4683, -3.3124,  2.6993, -2.4022],
        [-3.7578, -3.0571, -3.6270,  2.8356],
        [-6.6286,  1.2641, -6.1821, -0.7725],
        [-6.0575,  1.7984, -4.9285, -2.3640],
        [-5.4766, -0.9200, -0.6331, -2.

 99%|█████████▉| 287/289 [03:36<00:01,  1.33it/s]

Training loop 287
tensor([[  101,  2003, 10640,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2951,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  4275,  ...,     0,     0,     0],
        [  101,  2129,  5514,  ...,     0,     0,     0],
        [  101,  2129,  2312,  ...,     0,     0,     0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2394292801618576, logits - tensor([[-4.1288, -2.9928,  2.1250, -2.1583],
        [-4.8536,  1.8542, -4.5342, -1.6992],
        [-6.9079,  1.1513, -5.8469, -2.0346],
        [-6.4665,  1.3039, -5.6068, -1.5498],
    

100%|█████████▉| 288/289 [03:37<00:00,  1.34it/s]

Training loop 288
tensor([[ 101, 2029, 7511,  ...,    0,    0,    0],
        [ 101, 2129, 2172,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2011, 2129,  ...,    0,    0,    0]], device='cuda:0')
<class 'torch.Tensor'>
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
<class 'torch.Tensor'>
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06234811246395111, logits - tensor([[-5.5039,  2.5746, -4.3634, -1.8349],
        [-5.4136, -1.7509, -4.2076,  1.3158],
        [-4.7529, -2.9744, -4.2582,  2.1666],
        [-5.6210, -2.2625, -5.3322,  2.8919]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:37<00:00,  1.33it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:51,  3.72it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:48,  3.98it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:47,  3.99it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:48,  3.89it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:47,  3.98it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:47,  3.99it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:46,  4.00it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:46,  3.98it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:46,  4.01it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:45,  4.01it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:45,  4.05it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:45,  4.04it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  4.00it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:44,  4.02it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:44,  4.04it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:03<00:43,  4.05it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:43,  4.07it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:43,  4.02it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:43,  4.04it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:04<00:42,  4.05it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:42,  4.03it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:42,  4.01it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:42,  3.99it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:43,  3.94it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:42,  3.93it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:43,  3.90it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:42,  3.89it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:42,  3.89it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:42,  3.86it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.91it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:41,  3.89it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:41,  3.93it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:40,  3.94it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:40,  3.93it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:40,  3.91it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:40,  3.92it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:40,  3.91it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:40,  3.88it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.93it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:39,  3.92it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:38,  3.94it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.97it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:37,  4.01it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:36,  4.06it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:37,  4.00it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:36,  4.00it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.97it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:37,  3.94it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.96it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.99it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:35,  4.01it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:35,  4.03it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:34,  4.05it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:34,  4.05it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:34,  4.03it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:34,  4.05it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:34,  4.03it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:33,  4.03it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:33,  3.99it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:33,  4.00it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:33,  3.99it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.97it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:15<00:32,  4.00it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  3.95it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  3.98it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  4.00it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:16<00:31,  4.03it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  4.01it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  4.00it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:31,  4.00it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:17<00:30,  3.97it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  3.98it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.99it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  3.99it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:18<00:30,  3.96it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:29,  3.96it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.96it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.98it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:19<00:28,  3.99it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  4.02it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  4.01it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:27,  4.02it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:20<00:27,  3.98it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  3.97it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  4.03it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:27,  3.97it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:21<00:26,  4.00it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  4.04it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:25,  4.05it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:26,  3.99it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:22<00:26,  3.93it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:26,  3.92it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  3.93it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  3.97it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:23<00:24,  3.97it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  4.00it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.97it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.97it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:24<00:23,  3.97it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  3.98it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  3.95it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.95it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:25<00:23,  3.94it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:22,  3.95it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.97it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.92it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:26<00:22,  3.93it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.97it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  3.94it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:21,  3.96it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:27<00:20,  3.99it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  3.95it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.93it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:20,  3.98it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:28<00:19,  3.98it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.95it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.96it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.97it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:29<00:18,  4.00it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  3.99it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  4.01it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:17,  4.04it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:30<00:17,  4.05it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  4.03it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  4.05it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:16,  4.06it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:31<00:16,  4.04it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  4.04it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  4.06it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:15,  4.04it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:32<00:15,  4.06it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  4.04it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  4.05it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:33<00:14,  4.07it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:33<00:14,  4.10it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  4.09it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  4.06it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:34<00:13,  4.04it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:34<00:13,  4.01it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  4.01it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.97it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:35<00:13,  3.99it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:35<00:12,  3.96it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.97it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.99it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:36<00:12,  3.97it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:36<00:11,  4.01it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  4.02it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  4.03it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:37<00:10,  4.02it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:37<00:10,  4.02it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  4.03it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  4.03it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:38<00:10,  3.98it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:38<00:09,  3.98it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  3.98it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.98it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:39<00:09,  3.97it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:39<00:08,  4.01it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  4.03it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  4.04it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:40<00:07,  4.02it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:40<00:07,  4.01it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  4.03it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  4.01it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:41<00:07,  3.96it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:41<00:06,  3.95it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.91it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.93it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:42<00:06,  3.96it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:42<00:05,  3.95it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.99it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.96it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:43<00:04,  4.00it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:43<00:04,  4.02it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.95it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.92it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:44<00:04,  4.00it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:44<00:03,  3.94it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.93it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  3.94it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:45<00:03,  3.96it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:45<00:02,  4.00it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.96it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  3.99it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:46<00:01,  4.03it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:46<00:01,  4.03it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  4.01it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  3.98it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:47<00:00,  4.01it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:47<00:00,  4.04it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  4.03it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  4.02it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:48<00:00,  3.98it/s]


[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 927, 'tn': 331, 'fp': 34, 'fn': 260}, {'tp': 158, 'tn': 1367, 'fp': 2, 'fn': 25}, {'tp': 153, 'tn': 1098, 'fp': 278, 'fn': 23}]
Detailed accuracy after 1 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.8105670103092784
yes_no accuarcy: 0.9826030927835051
abstractive accuarcy: 0.8060567010309279
Overall accuarcy: 0.8998067010309279
Best accuarcy: 0.8990012886597938
0.8998067010309279
Model Updated


In [185]:
context = ["Alice and Bob are playing chess. Alice is a beginner and Bob is an expert. Alice makes a blunder and loses her queen. Bob decides to give her a chance and offers to trade his queen for a pawn. Alice accepts the offer"]
question1 = "Was Alice playing Chess?"
question2 = "Why did Bob offer to trade his queen to Alice for a pawn?"
question3 = "Comment down on the expertise level of the players."

dummy_init = {
                    "unanswerable": False,
                    "extractive_spans": False,
                    "yes_no": False,
                    "abstractive": False
                }

layer1 = [context, question1, dummy_init]
layer2 = [context, question2, dummy_init]
layer3 = [context, question3, dummy_init]
# json.dump(data, f, indent=4)

predictData = json.load(open(r"/content/drive/MyDrive/Colab Notebooks/examplepredict.json"))
# predictData['data']['predict_data'][0][0] = context
# predictData['data']['predict_data'][0][1] = question

predictData['data']['predict_data'] = [layer1, layer2, layer3]

print(predictData['data']['predict_data'][0][1])

with open("/content/drive/MyDrive/Colab Notebooks/examplepredict.json", "w") as f:
  json.dump(predictData, f, indent=4)

Was Alice playing Chess?


In [186]:
# device = "cpu"
predict_data = Qasper_Dataset(
                    data_path = r"/content/drive/MyDrive/Colab Notebooks/examplepredict.json",
                    data_type = "predict_data",
                    tokenizer = tokenizer,
                    attributes = attributes,
                    max_token_length = 512,
                    sample = None
                    )

predict_dataloader = DataLoader(
                  predict_data,
                  batch_size = 3,
                  num_workers = 1,
                  shuffle = False
              )

In [187]:
for idx, data in tqdm(enumerate(predict_dataloader), total = len(predict_dataloader)):
    # print(data.keys())
    # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # print(f"Training loop {idx}")
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    # labels = data['labels']

    input_ids = input_ids.to(device, dtype=torch.long)
    attention_mask = attention_mask.to(device, dtype=torch.long)
    # labels = labels.to(device, dtype=torch.float)

    optimizer.zero_grad()
    # print(input_ids)
    # print(type(input_ids))
    # print(attention_mask)
    # print(type(attention_mask))
    logits = model(
                  input_ids = input_ids,
                  attention_mask = attention_mask,
              )
    sigmoid = nn.Sigmoid()
    # logits = clipping_fn(logits, 1.0, 0.0)
    logits = sigmoid(logits)
    print(logits)

100%|██████████| 1/1 [00:00<00:00,  5.29it/s]

Training loop 0
input - False, attention_mask - False
tensor([[0.0058, 0.0159, 0.9633, 0.0451],
        [0.0027, 0.7935, 0.0072, 0.1872],
        [0.0024, 0.6740, 0.0208, 0.1372]], device='cuda:0',
       grad_fn=<SigmoidBackward0>)





In [None]:
context = "Hello my name name is Preetam."
question = "What is my name?"

print("Loading Model...")

classification_model = Qasper_Classifier(bert_model, len(attributes))
classification_model.load_state_dict(torch.load(MODEL_PATH))

# classification_model = torch.load(MODEL_PATH)
print(classification_model)

In [None]:
print(input_ids)
print(type(input_ids))
print(attention_mask)
print(type(attention_mask))

In [None]:
# print(classification_model)

In [None]:
# optimizer.zero_grad()
# print(model)
# logits = model(
#               input_ids = input_ids,
#               attention_mask = attention_mask,
#           )

In [None]:
print(logits)