In [1]:
# Mount Google Drive: the dataset JSON and the saved checkpoint used below both
# live under /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.4 MB/s[0m eta [36m0:00:

In [3]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup
import pandas as pd
import numpy as np
import json
import math
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F

In [14]:
# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives in this cell.
# ---------------------------------------------------------------------------

# Seed the RNGs so weight init, dropout and DataLoader shuffling are repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Pretrained checkpoint and Google Drive locations (Colab-specific paths).
model_name = "bert-base-uncased"
qasper_classification_path = "/content/drive/MyDrive/Colab Notebooks/Qasper_classification.json"
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model.bin"

# Use the GPU when the runtime has one, otherwise fall back to CPU instead of
# failing on the first `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name, return_dict = True)

# Label names read from each example's label dict, in output-column order.
attributes = [
                "unanswerable",
                "extractive_spans",
                "yes_no",
                "abstractive"
            ]
# Derived from `attributes` so the two can never drift apart.
n_labels = len(attributes)

max_token_length = 512
batch_size = 8

lr = 1.5e-6
warmup = 0.2
weight_decay = 0.001
n_epochs = 8

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
class Qasper_Dataset(Dataset):
    """Torch dataset over one split of the Qasper classification JSON file.

    The file is read as ``{"data": {<data_type>: [example, ...]}}`` where each
    example is indexed as ``[context_parts, question, label_dict]``:
    ``context_parts`` is a sequence of strings that are joined with single
    spaces, and ``label_dict`` maps every name in ``attributes`` to its label.

    Args:
        data_path: Path of the JSON file.
        data_type: Key of the split to load from ``payload["data"]``.
        tokenizer: Object exposing ``encode_plus(question, context, ...)``.
        attributes: Label names, in the order they appear in the label tensor.
        max_token_length: Length every encoded pair is padded/truncated to.
        sample: Accepted for compatibility; not used by this class.
    """

    def __init__(self, data_path, data_type, tokenizer, attributes, max_token_length = 128, sample = None):
        self.data_path = data_path
        self.data_type = data_type
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_length = max_token_length
        self.data = self.__load_data__()

    def __load_data__(self):
        """Read the JSON file and return the list of examples for ``data_type``."""
        # The context manager closes the handle even if parsing fails.
        with open(self.data_path, encoding="utf-8") as data_fd:
            payload = json.load(data_fd)
        return payload['data'][self.data_type]

    def __len__(self):
        """Number of examples in the loaded split."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode example ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (each flattened to
            1-D) and ``labels`` (float tensor, one entry per attribute).
        """
        record = self.data[index]
        context = " ".join(record[0])
        question = record[1]
        label_dict = record[2]
        # One label per configured attribute, so the label width always
        # follows `attributes` rather than a fixed count.
        labels = torch.tensor([label_dict[name] for name in self.attributes], dtype=torch.float)

        tokens = self.tokenizer.encode_plus(
                                            question,
                                            context,
                                            add_special_tokens=True,
                                            return_tensors='pt',
                                            truncation=True,
                                            padding='max_length',
                                            max_length=self.max_token_length,
                                            return_attention_mask = True
                                            )
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': labels}

In [6]:
# Build the train/validation datasets from the same JSON file; the sequence
# length comes from the config cell rather than a repeated literal.
train_data = Qasper_Dataset(
                data_path = qasper_classification_path,
                data_type = "train_data",
                tokenizer = tokenizer,
                attributes = attributes,
                max_token_length = max_token_length,
                sample = None
                )

val_data = Qasper_Dataset(
                    data_path = qasper_classification_path,
                    data_type = "validation_data",
                    tokenizer = tokenizer,
                    attributes = attributes,
                    max_token_length = max_token_length,
                    sample = None
                    )

train_dataloader = DataLoader(
                  train_data,
                  batch_size = batch_size,
                  num_workers = 2,
                  shuffle = True
              )

# Validation metrics do not depend on example order, so keep the order fixed;
# this makes evaluation runs directly comparable.
val_dataloader = DataLoader(
                  val_data,
                  batch_size = batch_size,
                  num_workers = 2,
                  shuffle = False
              )

In [7]:
class Qasper_Classifier(nn.Module):
    """Multi-label classification head on top of a pretrained encoder.

    The encoder's token states are mean-pooled over the non-padding positions,
    passed through dropout -> linear -> ReLU -> dropout, and projected to
    ``n_labels`` raw logits (no sigmoid is applied here).

    Args:
        model: Encoder exposing ``config.hidden_size`` and returning an object
            with a ``last_hidden_state`` attribute when called with
            ``input_ids`` and ``attention_mask``.
        n_labels: Number of output logits.
    """

    def __init__(self, model, n_labels):

        super(Qasper_Classifier, self).__init__()
        self.pretrained_model = model # bert model
        self.n_labels = n_labels

        hidden_size = self.pretrained_model.config.hidden_size
        self.hidden = torch.nn.Linear(hidden_size, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, self.n_labels)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.dropout = nn.Dropout(0.3)


    def forward(self, input_ids, attention_mask, labels = None):
        """Return raw logits, one row per input sequence.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: 1 for real tokens, 0 for padding; assumed to have
                the same leading two dimensions as ``last_hidden_state``.
            labels: Unused; kept so existing call sites keep working.
        """
        output = self.pretrained_model(
                                    input_ids = input_ids,
                                    attention_mask = attention_mask
                                       )
        token_states = output.last_hidden_state

        # Masked mean over dim 1: inputs are padded to a fixed length, so a
        # plain mean would let padding positions dilute the representation.
        mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
        token_count = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on an all-padding row
        pooled_output = (token_states * mask).sum(dim=1) / token_count

        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [8]:
def train_fn(data_loader, model, optimizer, device, scheduler):
  """Run one training epoch over ``data_loader``.

  Each batch is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
  The model is called with the first two and must return raw logits, which
  are scored against ``labels`` with ``BCEWithLogitsLoss``. The optimizer and
  the scheduler are both stepped once per batch.

  Args:
      data_loader: Iterable of batches that also supports ``len()``.
      model: Module to train (expected to already be on ``device``).
      optimizer: Optimizer over the model's parameters.
      device: Device the batch tensors are moved to.
      scheduler: Learning-rate scheduler, stepped after every optimizer step.

  Returns:
      Mean training loss over the epoch (0.0 if the loader is empty).
  """

  model.train()

  # The loss module is stateless, so build it once rather than per batch.
  loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
  running_loss = 0.0
  n_batches = 0

  progress = tqdm(data_loader, total = len(data_loader), desc = "train")
  for data in progress:
    input_ids = data['input_ids'].to(device, dtype=torch.long)
    attention_mask = data['attention_mask'].to(device, dtype=torch.long)
    labels = data['labels'].to(device, dtype=torch.float)

    optimizer.zero_grad()
    logits = model(
                  input_ids = input_ids,
                  attention_mask = attention_mask,
              )

    # Take the label width from the logits themselves instead of a global.
    n_outputs = logits.size(-1)
    loss = loss_fn(logits.view(-1, n_outputs), labels.view(-1, n_outputs))

    loss.backward()
    optimizer.step()
    scheduler.step()

    loss_value = loss.item()
    running_loss += loss_value
    n_batches += 1
    # Show the loss on the progress bar rather than printing tensors per step.
    progress.set_postfix(loss = f"{loss_value:.4f}")

  return running_loss / n_batches if n_batches else 0.0

In [9]:
def eval_fn(data_loader, model, device = None):
  """Run the model over ``data_loader`` without gradients.

  Args:
      data_loader: Iterable of batch dicts with ``input_ids``,
          ``attention_mask`` and ``labels``; must support ``len()``.
      model: Module returning raw logits for a batch.
      device: Device the batch tensors are moved to. When omitted, the device
          of the model's first parameter is used (CPU if it has none), so the
          function no longer depends on a notebook-level global.

  Returns:
      ``(tot_logits, tot_labels)``: two nested Python lists with one row per
      example. The logits are raw model outputs; no sigmoid or threshold is
      applied here.
  """

  model.eval()

  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  tot_logits = []
  tot_labels = []

  with torch.no_grad():
    for data in tqdm(data_loader, total = len(data_loader), desc = "eval"):
      input_ids = data['input_ids'].to(device, dtype=torch.long)
      attention_mask = data['attention_mask'].to(device, dtype=torch.long)
      labels = data['labels'].to(device, dtype=torch.float)

      logits = model(
                    input_ids = input_ids,
                    attention_mask = attention_mask,
                )

      # Under no_grad the tensors carry no graph, so tolist() is sufficient.
      tot_logits.extend(logits.cpu().tolist())
      tot_labels.extend(labels.cpu().tolist())

  return (
    tot_logits,
    tot_labels
  )

In [10]:
# Wrap the pretrained encoder with the classification head.
model = Qasper_Classifier(bert_model, len(attributes))

# Biases and LayerNorm parameters are excluded from weight decay; every other
# parameter uses the decay configured in the config cell.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
  {
      "params": [
          p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
      ],
      "weight_decay": weight_decay,
  },
  {
      "params": [
          p for n, p in param_optimizer if any(nd in n for nd in no_decay)
      ],
      "weight_decay": 0.0,
  },
]

# The scheduler is stepped once per batch, so count batches: the loader keeps
# the final partial batch, which len(train_data) / batch_size truncated away.
num_train_steps = len(train_dataloader) * n_epochs

# NOTE(review): the config cell defines `lr` (1.5e-6) and `warmup` (0.2), but
# this cell has always trained with lr=3e-5 and no warmup. The values are kept
# as they were; confirm which settings are intended before wiring them in.
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

hidden - True
classifier - True




In [11]:
def clipping_fn(logits, max_val, min_val):
  """Binarise a nested list of scores in place.

  Every entry that is at least the midpoint of ``max_val`` and ``min_val`` is
  overwritten with ``max_val``; every other entry with ``min_val``.

  Returns:
      The same ``logits`` object that was passed in, after modification.
  """
  for row in logits:
    for col, score in enumerate(row):
      row[col] = max_val if score >= (max_val + min_val)/2 else min_val

  return logits

def metric_accuracy(logits, labels):
  """Per-label accuracy from binarised predictions.

  Args:
      logits: Nested list of predictions, one row per example; only entries
          equal to 0 or 1 are counted.
      labels: Nested list of ground-truth values with the same layout.

  Returns:
      List with one accuracy per label column, ``(tp + tn) / total``. A column
      with no counted entries yields 0.0 instead of dividing by zero, and an
      empty input yields an empty list.

  Side effects:
      Prints the per-label confusion counts.
  """
  # Size the table from the data instead of assuming exactly four labels.
  n_labels = max((len(row) for row in logits), default=0)
  confusion = [{'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0} for _ in range(n_labels)]

  for i, predicted_row in enumerate(logits):
    label_row = labels[i]
    for j, predicted in enumerate(predicted_row):
      actual = label_row[j]

      if predicted == 1 and actual == 1:
        confusion[j]['tp'] += 1
      elif predicted == 0 and actual == 0:
        confusion[j]['tn'] += 1
      elif predicted == 1 and actual == 0:
        # Predicted positive, actually negative -> false positive.
        confusion[j]['fp'] += 1
      elif predicted == 0 and actual == 1:
        # Predicted negative, actually positive -> false negative.
        confusion[j]['fn'] += 1

  print(confusion)

  accuracy = list()

  for counts in confusion:
    total = counts['tp'] + counts['tn'] + counts['fp'] + counts['fn']
    accuracy.append((counts['tp'] + counts['tn'])/total if total else 0.0)

  return (
      accuracy
  )


In [12]:
def __run__():
    """Train for ``n_epochs`` epochs, validating after each one.

    After every epoch the validation logits are converted to probabilities,
    binarised, and scored per label. Whenever the mean accuracy across labels
    beats the best value seen so far, the model's state dict is written to
    ``MODEL_PATH``.
    """
    best_accuracy = 0
    model.to(device)
    print(model)

    for epoch in range(n_epochs):
        train_fn(train_dataloader, model, optimizer, device, scheduler)
        logits, labels = eval_fn(val_dataloader, model)

        # eval_fn returns raw logits. The model is trained with
        # BCEWithLogitsLoss, so map them through a sigmoid before the 0.5
        # midpoint cut; thresholding raw logits at 0.5 would correspond to a
        # probability of about 0.62 and under-predict positives.
        probabilities = torch.sigmoid(torch.tensor(logits)).tolist()
        predictions = clipping_fn(probabilities, 1.0, 0.0)

        accuracy = metric_accuracy(predictions, labels)
        overall_accuracy = sum(accuracy)/len(accuracy)

        print(f"Detailed accuracy after {epoch} epoch:")
        print(f"unanswerable accuarcy: {accuracy[0]}")
        print(f"extractive accuarcy: {accuracy[1]}")
        print(f"yes_no accuarcy: {accuracy[2]}")
        print(f"abstractive accuarcy: {accuracy[3]}")
        print(f"Overall accuarcy: {overall_accuracy}")
        print(f"Best accuarcy: {best_accuracy}")

        if overall_accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = overall_accuracy
            print(best_accuracy)
            print("Model Updated")

In [15]:
# Start the train/validate loop defined in __run__.
__run__()

Qasper_Classifier(
  (pretrained_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.7028533816337585, logits - tensor([[ 0.1042,  0.0097, -0.2244, -0.1596],
        [ 0.1670, -0.0519, -0.0425,  0.0588],
        [ 0.1172, -0.0451, -0.1672, -0.1193],
        [-0.0832,  0.1046,  0.0711, -0.1118],
        [-0.1584,  0.1696,  0.0891, -0.1099],
        [ 0.4863, -0.0772, -0.3482,  0.2355],
        [ 0.1764,  0.0063, -0.0162,  0.0363],
        [ 0.1982,  0.0634, -0.0465,  0.2797]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:01<09:13,  1.92s/it]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6853101253509521, logits - tensor([[-0.1424, -0.0101, -0.3388,  0.0840],
        [-0.0852, -0.1045, -0.0972,  0.1649],
        [-0.3189, -0.2214, -0.4059,  0.4939],
        [ 0.0824, -0.1066, -0.1465,  0.1066],
        [ 0.0138, -0.2534, -0.2397,  0.1971],
        [ 0.0809,  0.1549, -0.3624,  0.2311],
        [-0.3401,  0.2418,  0.1447, -0.1136],
        [ 0.0332, -0.1696, -0.1445, -0.0409]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:02<05:43,  1.20s/it]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6147778034210205, logits - tensor([[-0.4106, -0.4785, -0.5138,  0.2478],
        [-0.2846, -0.1528, -0.5409,  0.0443],
        [-0.3495,  0.1380, -0.1606, -0.1931],
        [-0.3280, -0.1689, -0.1162,  0.0031],
        [-0.3541, -0.0143, -0.1969,  0.0283],
        [-0.5343, -0.0571, -0.2391, -0.0469],
        [-0.3851, -0.1578, -0.4680,  0.0748],
        [-0.2937, -0.1071, -0.1639, -0.3436]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:03<04:39,  1.02it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5957533717155457, logits - tensor([[-0.4819, -0.2597, -0.3032, -0.1127],
        [-0.4697, -0.2522, -0.2284, -0.2188],
        [-0.6736, -0.4450, -0.4620, -0.3260],
        [-0.5177, -0.0302, -0.5503,  0.0062],
        [-0.4071, -0.5279, -0.4282,  0.1710],
        [-0.5101, -0.3590, -0.5314, -0.2878],
        [-0.8873, -0.6347, -0.5324, -0.2357],
        [-0.3635, -0.0956, -0.4562, -0.2305]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:04<04:08,  1.15it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5818080306053162, logits - tensor([[-1.0069, -0.6679, -0.5300, -0.0125],
        [-0.7948, -0.3089, -0.7872, -0.2601],
        [-0.4711, -0.3987, -0.7247, -0.2407],
        [-0.6551, -0.7820, -0.3909, -0.1286],
        [-0.2151, -0.2935, -0.5887, -0.0130],
        [-0.8544, -0.5565, -0.6047,  0.0713],
        [-0.7502, -0.3691, -0.7070, -0.4118],
        [-1.1612, -0.8339, -0.7577, -0.1022]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:04<03:50,  1.23it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.551196813583374, logits - tensor([[-1.0126, -0.1755, -0.9058, -0.2336],
        [-1.6195, -0.5557, -0.7622, -0.3558],
        [-1.1998, -0.2310, -0.7059,  0.1139],
        [-0.9249, -0.1117, -0.6436, -0.0208],
        [-1.2634, -0.3007, -0.7514,  0.1372],
        [-0.6774, -0.0947,  0.0315, -0.3829],
        [-0.7381,  0.0833, -0.5035, -0.3253],
        [-1.0455, -0.1571, -0.7979, -0.4488]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:05<03:39,  1.29it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5209705829620361, logits - tensor([[-1.8281, -0.4920, -1.5392, -0.1220],
        [-0.9116, -0.2818, -1.0426, -0.5625],
        [-1.3844, -0.6705, -1.2763, -0.3706],
        [-1.8776, -0.5641, -1.1224, -0.1320],
        [-1.5939, -0.1862, -0.7548,  0.0676],
        [-1.7353, -0.1506, -0.7913, -0.1114],
        [-1.3389, -0.4225, -0.6555, -0.2416],
        [-1.4168,  0.1924, -0.7973, -0.1538]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:06<03:33,  1.32it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  3%|▎         | 8/289 [00:06<03:28,  1.35it/s]

loss - 0.4873426556587219, logits - tensor([[-2.1008, -0.4520, -1.5375, -0.4752],
        [-2.4284, -0.0949, -1.6188, -0.2740],
        [-2.0106, -0.7987, -1.2694, -0.2528],
        [-1.8112, -0.6187, -1.3448, -0.0565],
        [-1.5617, -0.8794, -1.0617, -0.1979],
        [-1.8821, -0.4487, -1.2231, -0.1110],
        [-2.5179, -0.4146, -2.0991, -0.0082],
        [-1.8930, -0.3746, -1.6042, -0.4460]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43921059370040894, logits - tensor([[-2.2210, -0.9024, -1.6777, -1.0045],
        [-2.0487, -0.3383, -1.8436, -0.7008],
        [-1.9218, -0.5244, -1.4592, -0.4444],
        [-2.3729, -0.2881, -1.5280, -0.3435],
        [-1.7751, -0.3162, -1.1943, -0.7713],
        [-2.4526, -0.3124, -2.2694, -0.6014],
        [-2.5940, -0.8656, -2.1520, -0.2375],
        [-2.0641, -0.0401, -1.9668, -0.28

  3%|▎         | 9/289 [00:07<03:24,  1.37it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  3%|▎         | 10/289 [00:08<03:21,  1.38it/s]

loss - 0.44565173983573914, logits - tensor([[-2.2696e+00,  1.4955e-03, -1.2453e+00, -2.0129e-01],
        [-2.3053e+00, -4.7729e-02, -1.4867e+00, -1.0293e+00],
        [-2.2568e+00,  5.4823e-02, -1.6943e+00, -1.0197e+00],
        [-2.9667e+00,  1.2054e-01, -1.8318e+00, -3.5008e-01],
        [-2.0127e+00, -6.4067e-01, -1.7404e+00, -4.3490e-01],
        [-2.7316e+00,  2.4668e-01, -1.5940e+00, -1.1918e+00],
        [-2.5440e+00,  3.1309e-01, -1.6080e+00, -5.8534e-02],
        [-2.2981e+00, -1.6988e-01, -1.5217e+00, -7.1446e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3658773899078369, logits - tensor([[-2.2973,  0.3393, -1.8163, -0.4363],
        [-2.4970,  0.3274, -2.4342, -0.6383],
        [-3.5747,  0.1868, -2.0751, -0.0884],
        [-2.5596,  0.3159, -1.6312, -0.2965],
        [-2.4723,  0.4640, -1.5620, -0.7738],
    

  4%|▍         | 11/289 [00:08<03:19,  1.39it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41273200511932373, logits - tensor([[-3.2058, -0.4984, -1.9928, -0.3593],
        [-2.6127,  0.3226, -1.4714, -1.2611],
        [-3.0126,  0.6461, -2.5039, -0.9938],
        [-3.3564,  0.6025, -3.0150, -0.4697],
        [-3.7968,  0.6113, -2.2555, -1.2558],
        [-3.1510, -0.1892, -2.1265, -1.0281],
        [-2.8331,  0.6715, -1.9173, -0.3520],
        [-2.7212,  0.9445, -2.0758, -0.2570]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 12/289 [00:09<03:18,  1.40it/s]

Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34853583574295044, logits - tensor([[-3.1357,  0.6242, -2.0420, -1.3550],
        [-3.1063,  0.6320, -1.9796, -0.8802],
        [-2.8834,  0.6093, -2.8197, -0.7142],
        [-2.6831,  1.1948, -1.6301, -0.8660],
        [-2.9055,  0.8573, -2.4511, -0.9397],
        [-3.0180,  0.2744, -2.7330, -0.7285],
        [-3.3917,  1.1396, -2.1552, -0.5891],
        [-3.1272,  0.6339, -2.0000, -0.7913]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 13/289 [00:10<03:17,  1.40it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5642852783203125, logits - tensor([[-2.5978,  1.7167, -2.1404, -0.8103],
        [-2.4816,  0.7892, -1.8581, -1.1691],
        [-3.5542,  0.5526, -2.6968, -1.1173],
        [-2.8421,  0.3906, -2.0663, -0.7653],
        [-3.3097,  1.0994, -2.2684, -1.0998],
        [-2.6149,  0.7441, -2.1425, -0.8076],
        [-3.0452,  0.3081, -2.5575, -1.2112],
        [-2.4853,  1.2990, -2.9195, -0.3889]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:11<03:16,  1.40it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3325369954109192, logits - tensor([[-3.0294,  0.5599, -1.4744, -0.9726],
        [-3.0168,  0.4842, -2.1420, -1.0216],
        [-3.5147,  0.9928, -1.9652, -0.5287],
        [-2.8349,  1.5234, -2.1164, -1.3693],
        [-2.9524,  0.6174, -2.4257, -1.1265],
        [-2.9389,  0.6850, -1.9043, -0.8095],
        [-3.3386,  0.2916, -2.6238, -0.6921],
        [-3.1118,  0.6774, -1.7245, -0.2166]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▌         | 15/289 [00:11<03:15,  1.40it/s]

Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4000723958015442, logits - tensor([[-2.6536,  0.9437, -1.8866, -0.7514],
        [-3.0857,  0.9122, -2.8256, -0.6020],
        [-3.3645,  1.0734, -2.6977, -1.0298],
        [-3.1460,  0.9543, -2.0046, -0.8545],
        [-3.0407,  0.4268, -2.1479, -0.7116],
        [-3.6324,  1.6314, -2.0349, -0.9106],
        [-3.0523,  0.6472, -2.1870, -1.6252],
        [-2.8036,  1.2250, -2.1132, -0.9327]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 16/289 [00:12<03:14,  1.40it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5787496566772461, logits - tensor([[-2.6119,  0.6274, -1.5869, -1.2079],
        [-3.4685,  0.1300, -1.8962, -0.9470],
        [-2.8585,  0.0984, -1.6784, -1.3256],
        [-3.4418,  1.0628, -1.8033, -0.9043],
        [-2.8844,  1.4043, -2.0842, -1.6651],
        [-3.2470,  0.8092, -2.4320, -1.1422],
        [-3.0618,  0.9674, -2.0035, -1.3747],
        [-3.4318,  0.9840, -2.2421, -1.7087]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:13<03:13,  1.41it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.49585357308387756, logits - tensor([[-3.4528,  0.4444, -2.2217, -1.2021],
        [-3.4790,  1.0091, -2.5476, -0.7512],
        [-3.1236,  0.6218, -2.0180, -1.0968],
        [-3.2919,  0.2114, -2.7847, -0.8966],
        [-3.2069,  0.5916, -1.6925, -0.8536],
        [-3.7394,  1.1870, -2.3160, -1.1554],
        [-4.1507,  1.2559, -2.3674, -1.2966],
        [-2.9583,  0.4196, -1.6055, -0.8316]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:12,  1.41it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5274248123168945, logits - tensor([[-3.2382,  0.3069, -1.8410, -0.7966],
        [-3.5771,  0.9290, -2.7733, -1.3464],
        [-2.8229,  1.1712, -1.9225, -1.6136],
        [-3.5131,  0.0745, -2.3722, -1.6906],
        [-3.2290,  0.5511, -2.3977, -1.0662],
        [-2.8254,  0.7690, -2.2516, -1.4998],
        [-4.0248,  0.1865, -2.6816, -1.1045],
        [-3.1120,  0.5917, -2.0354, -1.2034]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:12,  1.41it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4605872333049774, logits - tensor([[-3.4260,  1.2671, -3.0524, -0.8093],
        [-2.7964,  1.0598, -2.5729, -0.8875],
        [-4.0412,  0.0526, -2.7760, -0.5945],
        [-3.2524,  0.8070, -2.0176, -0.5480],
        [-4.2260,  0.1393, -2.4338, -1.1516],
        [-3.7221,  0.4421, -2.4227, -1.0173],
        [-3.6938,  0.4579, -2.3579, -1.6524],
        [-3.5764,  0.6317, -2.4948, -1.2945]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:11,  1.41it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.297846257686615, logits - tensor([[-3.2766,  0.6102, -2.0002, -1.1520],
        [-3.4744, -0.0106, -2.1330, -2.0457],
        [-3.1606,  0.2944, -2.2999, -0.9949],
        [-4.1496,  0.9979, -1.8060, -1.3520],
        [-3.7860,  0.6724, -2.6051, -0.8088],
        [-2.9760,  0.3267, -1.9378, -0.8608],
        [-3.8736, -0.2199, -2.0253, -0.3704],
        [-3.5076,  0.5773, -1.8555, -0.5898]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:16<03:10,  1.41it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3134767413139343, logits - tensor([[-3.9355,  0.5061, -2.0680, -0.9154],
        [-2.8354,  0.2991, -1.5208, -0.5124],
        [-3.7631,  0.6990, -1.9610, -1.6524],
        [-3.7190,  0.3360, -1.3711, -1.3093],
        [-3.0930,  0.1315, -2.1127, -0.8468],
        [-3.4240,  0.1882, -1.4310, -1.6789],
        [-3.2374,  0.1698, -1.6570, -1.3027],
        [-3.2703,  0.2667, -1.9460, -1.4754]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:09,  1.41it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3301808834075928, logits - tensor([[-4.0111,  0.6196, -2.2501, -0.9490],
        [-3.3294, -0.0125, -2.3256, -0.2807],
        [-3.7865,  0.8621, -2.3250, -1.1592],
        [-3.6742,  0.4397, -2.5776, -1.1179],
        [-3.2345,  0.6581, -2.0303, -0.8646],
        [-3.2469,  0.4591, -1.4684, -0.7547],
        [-3.4476,  0.1032, -1.5928, -1.2856],
        [-3.7588,  0.2410, -2.0876, -1.4607]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:08,  1.41it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5792046785354614, logits - tensor([[-3.1933,  0.6471, -0.9557, -1.6368],
        [-3.3246,  0.5108, -2.0485, -0.8385],
        [-4.2043, -0.2743, -2.2717, -2.0288],
        [-3.7171,  0.2711, -1.6769, -1.4871],
        [-3.4028,  0.1875, -1.4072, -1.6303],
        [-3.4417,  0.5218, -1.9978, -1.5619],
        [-3.3822,  0.1489, -1.4375, -1.2577],
        [-4.1221,  0.9053, -2.0405, -1.4696]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:08,  1.41it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4480438232421875, logits - tensor([[-3.9233,  0.6285, -2.2783, -1.4921],
        [-4.3788,  0.4852, -1.7934, -1.3501],
        [-3.8137,  0.4023, -2.1610, -1.0979],
        [-3.4776,  0.7877, -2.0771, -1.3649],
        [-2.8496,  0.1935, -1.8190, -1.0891],
        [-3.4505,  0.4681, -2.0523, -0.7066],
        [-3.1019, -0.8916, -1.5840, -1.3444],
        [-3.2522,  0.6523, -1.1114, -1.6863]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:18<03:07,  1.41it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5902776718139648, logits - tensor([[-4.1060,  0.7826, -1.7414, -1.9494],
        [-3.5851,  0.2574, -2.0190, -1.8035],
        [-3.9549,  1.0663, -1.7096, -0.7390],
        [-3.2072,  0.4420, -1.3062, -0.9933],
        [-3.4934,  0.3504, -2.0360, -1.0728],
        [-3.6631,  0.1867, -2.0195, -0.4550],
        [-3.3770,  0.5036, -2.1469, -1.4194],
        [-3.4445,  0.4682, -1.8333, -1.2880]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:08,  1.40it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  9%|▉         | 27/289 [00:20<03:07,  1.40it/s]

loss - 0.37700289487838745, logits - tensor([[-3.6817,  0.1372, -1.2531, -0.8605],
        [-3.6264,  0.4157, -2.1783, -1.4598],
        [-3.6205, -0.0347, -1.7501, -1.6753],
        [-4.6728,  0.7637, -2.0407, -1.7295],
        [-3.2758,  0.1026, -1.6126, -0.8169],
        [-3.8614,  0.2696, -1.8902, -1.1627],
        [-3.9837,  0.1172, -2.1773, -0.9875],
        [-3.6468,  0.3516, -1.5260, -1.9503]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5062513947486877, logits - tensor([[-4.2271,  0.0164, -1.3706, -1.0077],
        [-3.7473,  0.4524, -1.7235, -0.9338],
        [-4.0063,  0.1375, -2.0940, -1.6322],
        [-3.4776,  1.0034, -2.3348, -1.8230],
        [-3.8223,  0.7769, -2.2138, -1.7077],
        [-3.4693,  0.1109, -1.8786, -0.4130],
        [-5.1183,  0.5387, -2.3141, -0.7415],
        [-3.6957,  0.7521, -1.4390, -1.1

 10%|▉         | 28/289 [00:21<03:07,  1.39it/s]

Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45028388500213623, logits - tensor([[-3.9877,  0.1727, -1.6603, -0.6895],
        [-4.3372,  0.4601, -1.5785, -0.9480],
        [-3.1655,  0.5580, -1.9981, -1.7074],
        [-3.8902,  0.3583, -1.7573, -1.3052],
        [-3.8823,  0.7787, -2.0734, -1.6532],
        [-4.3794,  0.0596, -2.0565, -1.3613],
        [-2.9358,  0.5184, -1.4391, -0.9139],
        [-4.4454,  0.5413, -1.9431, -0.9300]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 29/289 [00:21<03:07,  1.38it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39034324884414673, logits - tensor([[-3.2789,  0.0419, -1.6846, -1.2405],
        [-3.6645,  0.2650, -1.3304, -1.2779],
        [-3.8510,  0.6752, -1.4736, -0.9478],
        [-3.3719,  0.1607, -1.7237, -1.3388],
        [-3.5879, -0.1390, -1.8255, -0.8184],
        [-3.9769,  0.4165, -1.9509, -1.1623],
        [-4.0039,  0.7995, -0.9577, -1.3724],
        [-3.8413,  0.1886, -1.5581, -0.9898]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:07,  1.38it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30371344089508057, logits - tensor([[-4.0738,  0.8546, -2.6207, -1.4945],
        [-3.4638,  0.5155, -1.1959, -0.9918],
        [-3.9863,  0.2655, -1.7542, -1.7741],
        [-3.5243,  0.9368, -1.3242, -1.4260],
        [-3.2483,  0.6587, -1.3840, -1.2078],
        [-3.9334,  0.7670, -1.4562, -1.3078],
        [-4.0044,  0.4141, -1.5254, -0.8605],
        [-4.0507, -0.2103, -1.6515, -1.5265]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 31/289 [00:23<03:07,  1.38it/s]

Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5023117065429688, logits - tensor([[-3.4220,  0.7728, -0.9724, -0.9040],
        [-3.1368,  0.2522, -1.7207, -0.2109],
        [-3.6820,  0.2855, -1.4668, -0.8284],
        [-3.7889,  0.0852, -1.9782, -0.9423],
        [-3.2907,  0.3982, -1.6696, -0.9761],
        [-3.4145,  0.2933, -2.1498, -1.5545],
        [-3.8889,  0.3084, -1.3498, -0.9567],
        [-4.0335,  0.0988, -1.8862, -0.5843]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 32/289 [00:24<03:05,  1.38it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4230709373950958, logits - tensor([[-3.4401e+00,  7.6747e-01, -8.1931e-01, -1.2710e+00],
        [-3.9939e+00,  9.2588e-01, -1.5458e+00, -1.7178e-01],
        [-4.3239e+00,  1.3072e-01, -2.2277e+00, -1.1182e+00],
        [-3.4570e+00,  4.5057e-01, -2.0907e+00, -1.0100e+00],
        [-3.6148e+00,  8.4222e-01, -1.5544e+00, -1.1994e+00],
        [-3.8588e+00, -6.6343e-02, -2.3002e+00, -9.3838e-01],
        [-4.4144e+00,  8.0923e-01, -2.2491e+00, -1.6447e+00],
        [-4.0463e+00,  4.2100e-03, -2.2581e+00, -3.7767e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:24<03:05,  1.38it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3968566358089447, logits - tensor([[-4.0027,  0.9072, -1.8549, -0.5101],
        [-2.8395,  0.0683, -1.3917, -0.5810],
        [-4.1702,  0.2036, -2.2274, -0.6888],
        [-3.6905,  0.4011, -2.1896, -0.5624],
        [-3.4308,  0.4776, -2.3257, -0.4849],
        [-4.4746,  0.4884, -1.2970, -0.8806],
        [-3.5922,  0.8615, -1.5394,  0.0813],
        [-4.0336,  0.1610, -1.3133, -0.8823]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:04,  1.38it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4503580927848816, logits - tensor([[-3.8771,  0.6978, -1.4131, -1.2806],
        [-4.1832,  0.2888, -1.8645, -1.2494],
        [-4.4355,  0.1834, -1.9226, -0.8632],
        [-3.7373,  0.4567, -1.9885, -0.5615],
        [-3.7986,  0.4687, -2.1220, -0.6407],
        [-3.8639,  0.4493, -2.2158, -0.7269],
        [-3.9295,  0.5641, -1.8028, -1.2536],
        [-4.4671,  0.3603, -1.8846, -0.9820]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 35/289 [00:26<03:03,  1.39it/s]

Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28478556871414185, logits - tensor([[-3.5451,  0.4740, -2.1660, -0.8376],
        [-4.0523,  0.6881, -1.7126, -0.9711],
        [-3.7567,  0.4059, -2.3252, -0.6324],
        [-3.7816,  0.3257, -1.8210, -0.6127],
        [-4.0638,  0.6417, -2.1226, -1.1035],
        [-3.6379, -0.0921, -1.7038, -0.7849],
        [-4.0489,  0.7182, -2.0160, -0.5924],
        [-3.5002,  0.6868, -2.1957, -1.0142]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 36/289 [00:26<03:02,  1.39it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4058401882648468, logits - tensor([[-4.2658,  0.3332, -2.5026, -0.7833],
        [-4.3949,  0.0412, -2.0416, -1.4328],
        [-4.2639,  0.1636, -2.1673,  0.0539],
        [-4.4935,  1.0667, -2.2621, -1.1054],
        [-3.3849, -0.2480, -1.4952, -0.4496],
        [-4.1014,  0.4138, -1.5957, -1.0894],
        [-4.3760,  0.7756, -2.0497, -1.0913],
        [-3.3963,  0.7163, -1.9293, -1.0889]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:27<03:01,  1.39it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4038187861442566, logits - tensor([[-4.2458,  0.7346, -3.0852, -0.4275],
        [-3.8972,  0.6334, -2.3909, -0.7193],
        [-4.1839,  0.1398, -1.7454, -1.0871],
        [-4.6414,  0.2355, -2.4909, -0.3417],
        [-3.7487,  0.3098, -2.2048,  0.0519],
        [-4.2567,  0.6051, -1.5442, -0.9490],
        [-4.4452,  0.0905, -2.3771, -1.0252],
        [-3.6457,  0.0558, -1.8994, -0.7598]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 38/289 [00:28<03:01,  1.39it/s]

Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 13%|█▎        | 39/289 [00:29<03:00,  1.39it/s]

loss - 0.4023360013961792, logits - tensor([[-3.5583,  0.3479, -2.0396, -0.7095],
        [-4.9980,  0.1914, -2.7178, -0.6690],
        [-4.4587,  0.0838, -1.7552, -0.5757],
        [-4.1669,  0.2407, -1.7619, -0.8010],
        [-4.3324,  0.3030, -2.9007, -0.4796],
        [-3.9280,  0.5585, -1.9006, -0.8656],
        [-4.2345,  0.0525, -1.4281, -0.9485],
        [-3.9795,  0.2060, -2.4517, -0.8562]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30486875772476196, logits - tensor([[-4.0795,  1.0111, -2.8938, -0.5094],
        [-4.4062,  0.9185, -2.8588, -0.1450],
        [-4.4000,  0.6073, -2.2381, -0.3611],
        [-4.2806,  0.7009, -2.5434, -0.3418],
        [-3.6755,  0.4431, -2.2078, -0.9555],
        [-4.9847,  1.0961, -2.6629, -1.1378],
        [-4.5602,  0.6165, -2.6617, -0.7705],
        [-3.7834, -0.1845, -2.2787, -0.6

 14%|█▍        | 40/289 [00:29<02:59,  1.39it/s]

Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4254859685897827, logits - tensor([[-4.2372,  0.6978, -2.4330, -0.7489],
        [-4.2016,  0.3702, -1.9750, -1.0875],
        [-3.9346,  0.5178, -2.8125, -0.8468],
        [-4.2182,  0.8385, -2.2991, -0.5282],
        [-3.8481,  0.5991, -2.7854,  0.1681],
        [-5.1838,  0.0067, -2.2126, -0.3794],
        [-4.2389,  0.4546, -1.6688, -0.3081],
        [-3.7560,  0.6226, -1.6743, -0.6255]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 41/289 [00:30<02:59,  1.38it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5289998054504395, logits - tensor([[-4.6523,  0.6168, -2.3315, -1.1489],
        [-4.3545,  0.6461, -2.7300, -1.0505],
        [-4.5383,  0.2387, -2.3712, -1.0836],
        [-3.4518,  0.6792, -2.1994, -0.7619],
        [-4.4222,  0.1571, -2.0207, -0.5677],
        [-3.8026,  0.3498, -2.4437, -0.5832],
        [-4.3321,  0.4672, -2.8107, -1.0421],
        [-4.2124,  0.6796, -2.3395, -0.1674]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:31<02:58,  1.38it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5776796936988831, logits - tensor([[-4.3700, -0.1159, -2.3763, -0.9441],
        [-4.4597,  1.3979, -2.8376, -1.1886],
        [-4.5797,  0.2240, -3.0101, -1.2496],
        [-4.6509,  0.2048, -2.6885, -1.5149],
        [-4.5096,  0.4171, -2.1487, -0.4255],
        [-4.4974,  0.5355, -2.1493, -0.7174],
        [-4.2326,  0.3003, -2.9640, -1.0931],
        [-4.1485,  0.7763, -2.5421, -1.2216]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:31<02:57,  1.38it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37970614433288574, logits - tensor([[-3.8939,  0.2183, -2.1123, -1.4825],
        [-5.0980,  0.4270, -2.9292, -1.3391],
        [-4.7460, -0.2214, -2.3246, -1.5359],
        [-4.4956,  0.6887, -2.2777, -1.3247],
        [-4.2948,  0.6459, -1.9322, -1.1345],
        [-4.7069,  0.0804, -2.3060, -0.7898],
        [-4.0115,  0.7058, -2.2285, -1.4799],
        [-3.5895,  0.3945, -1.6952, -0.7446]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:32<02:57,  1.38it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36739495396614075, logits - tensor([[-5.1363,  0.7004, -2.2090, -0.4860],
        [-4.4771,  0.4017, -1.9967, -1.0918],
        [-3.9994,  0.6214, -1.4031, -0.9395],
        [-4.3449,  0.4970, -2.2592, -0.7222],
        [-4.4115,  0.5266, -2.0425, -1.0635],
        [-5.0826, -0.0474, -2.3644, -1.3124],
        [-3.8943,  1.0793, -2.1272, -1.1635],
        [-4.1657,  0.1549, -2.1967, -0.9350]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:33<02:56,  1.38it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.48993009328842163, logits - tensor([[-4.4183,  0.1564, -2.1383, -0.7086],
        [-5.1463,  0.7772, -2.2198, -0.9178],
        [-3.9616,  0.4318, -2.2179, -1.2504],
        [-4.3550,  0.3199, -2.4021, -1.5780],
        [-4.1732,  0.2312, -2.8677, -1.3887],
        [-4.0762,  0.4749, -1.6923, -1.1861],
        [-4.5851, -0.1304, -2.2158, -1.8770],
        [-4.4343,  0.2332, -2.3824, -2.0251]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 46/289 [00:34<02:56,  1.38it/s]

Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.551551103591919, logits - tensor([[-4.8638,  0.8279, -2.3838, -1.2754],
        [-4.3941,  0.6167, -1.8653, -1.3913],
        [-4.7633, -0.0387, -2.2627, -0.9647],
        [-4.2233,  0.2531, -1.8369, -1.1765],
        [-4.7399,  0.7228, -1.7413, -1.2447],
        [-4.4921,  0.1949, -2.3066, -1.4700],
        [-4.7539,  0.1542, -2.1436, -1.3537],
        [-4.2221,  0.3356, -1.8650, -1.0842]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▋        | 47/289 [00:34<02:55,  1.38it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36646848917007446, logits - tensor([[-3.9357,  0.7899, -1.8278, -1.5755],
        [-4.7407,  0.1172, -1.2129, -0.8421],
        [-4.1054,  0.5988, -3.0596, -1.3780],
        [-4.8000, -0.0864, -1.6014, -0.3710],
        [-4.3112, -0.0315, -1.9028, -0.9213],
        [-4.7501,  0.4260, -2.0676, -1.3177],
        [-4.1739,  0.1385, -2.5795, -1.3939],
        [-4.7203,  0.2383, -1.6712, -1.1203]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:35<02:55,  1.37it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37137019634246826, logits - tensor([[-3.8485,  0.0983, -1.4220, -1.6423],
        [-4.4992,  0.5153, -1.9200, -0.6162],
        [-4.3303,  0.5199, -1.8495, -1.3440],
        [-4.6303, -0.0068, -2.5682, -1.1954],
        [-4.0503,  0.8434, -1.5204, -1.7194],
        [-3.8029,  0.1133, -1.3493, -1.3867],
        [-4.1162,  0.2071, -2.6652, -0.9257],
        [-4.5395,  0.6300, -1.4385, -0.6954]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:36<02:55,  1.37it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3524828553199768, logits - tensor([[-4.0171, -0.2665, -1.2358, -1.0637],
        [-4.5577,  0.5627, -2.0471, -1.3100],
        [-4.1018,  0.0958, -1.6769, -1.3706],
        [-4.2986,  0.5149, -2.1523, -1.7696],
        [-4.0760,  0.6605, -1.4406, -1.6127],
        [-4.7522,  1.4758, -2.0416, -1.6831],
        [-4.4891,  0.9844, -2.2648, -2.1322],
        [-3.9803,  0.3368, -1.4705, -1.4048]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 50/289 [00:37<02:54,  1.37it/s]

Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.477887362241745, logits - tensor([[-4.1918,  0.9879, -2.0684, -1.4585],
        [-3.8429,  0.3430, -1.6507, -1.6337],
        [-4.6719,  0.5279, -2.1146, -1.6872],
        [-4.0349,  1.0389, -1.1942, -2.0235],
        [-4.1673,  1.0053, -1.8323, -1.4885],
        [-4.6048, -0.4693, -1.5267, -1.1919],
        [-4.5728,  0.9172, -1.6849, -2.3867],
        [-3.9543,  0.7687, -1.3626, -1.6135]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 51/289 [00:37<02:53,  1.37it/s]

Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28358742594718933, logits - tensor([[-4.8527,  0.8485, -1.7186, -1.3297],
        [-4.3308,  0.4182, -1.8813, -0.9316],
        [-4.6235,  0.5686, -1.9954, -1.4201],
        [-4.1255,  0.8431, -1.2706, -1.7245],
        [-4.0116,  0.0929, -1.7060, -1.5642],
        [-4.3809,  0.6557, -2.1830, -1.7752],
        [-3.9245,  0.7084, -2.4179, -2.4995],
        [-3.8240,  0.5217, -1.4708, -1.6997]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 52/289 [00:38<02:52,  1.37it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35495811700820923, logits - tensor([[-4.0399,  0.9227, -1.1907, -1.9412],
        [-4.1576,  0.5623, -1.9813, -1.6873],
        [-3.9483,  0.2520, -1.4921, -1.3361],
        [-4.0343,  0.5836, -1.8481, -1.1716],
        [-2.9261,  0.4556, -1.6872, -0.7967],
        [-3.9225,  0.7087, -1.2776, -1.5453],
        [-4.1157,  1.3054, -2.5700, -2.2786],
        [-3.5285,  0.9858, -1.9076, -1.2963]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:39<02:52,  1.37it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44824182987213135, logits - tensor([[-4.2003,  0.7448, -1.3846, -0.8958],
        [-4.5598,  0.7826, -1.6358, -2.2111],
        [-4.4928,  0.9263, -2.3233, -1.6002],
        [-4.9771,  0.5140, -1.5784, -1.7910],
        [-4.4179,  0.5769, -1.6578, -1.4470],
        [-4.4483,  0.9061, -1.5690, -0.8943],
        [-3.9620,  0.8588, -1.7723, -1.6111],
        [-4.1329,  0.6821, -1.3197, -1.3203]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:39<02:51,  1.37it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5874117612838745, logits - tensor([[-4.1885,  0.1913, -2.2991, -1.8822],
        [-4.6597,  0.3612, -2.2169, -0.6583],
        [-4.9128,  1.8215, -2.1247, -1.4107],
        [-5.2301,  0.3944, -2.2805, -1.8058],
        [-4.1011,  1.0805, -1.7387, -1.7475],
        [-3.7583,  0.7091, -1.8962, -1.6288],
        [-3.9472,  0.3720, -1.9761, -0.9303],
        [-4.7320,  0.9323, -2.3189, -1.6766]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:40<02:50,  1.37it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5730757713317871, logits - tensor([[-3.8020, -0.1816, -1.5728, -1.3811],
        [-4.1907,  0.6371, -1.6643, -1.5130],
        [-4.7917,  1.1790, -1.8104, -2.6250],
        [-5.3664,  0.9385, -2.0080, -1.6613],
        [-4.9154,  0.9820, -1.7037, -1.3948],
        [-4.5193,  1.0456, -1.6054, -1.3502],
        [-4.5576,  0.4492, -1.9913, -1.6459],
        [-3.4886,  0.0728, -1.6635, -1.7915]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:41<02:50,  1.37it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39747536182403564, logits - tensor([[-4.4368,  0.9488, -1.3469, -0.9468],
        [-4.4847,  0.2007, -1.7560, -1.0229],
        [-4.6511,  0.5500, -1.0446, -1.1363],
        [-3.7990,  0.2167, -1.0746, -0.9842],
        [-4.1615,  0.2129, -1.4955, -0.7501],
        [-5.3966,  1.1095, -2.2064, -1.8752],
        [-4.0140,  0.3507, -1.6015, -0.7379],
        [-4.5643,  1.3097, -1.9993, -1.8305]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:42<02:49,  1.37it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28708982467651367, logits - tensor([[-4.8496,  0.2760, -1.6019, -0.4982],
        [-4.3603,  0.0284, -1.1210, -0.4459],
        [-3.6486, -0.0338, -1.8126, -0.6590],
        [-4.5520,  0.9024, -1.9011, -1.0260],
        [-4.0495,  0.1373, -1.9517, -0.9762],
        [-2.9643, -0.1217, -1.4729, -1.2986],
        [-4.8662,  0.1329, -2.1139, -0.7322],
        [-4.4695,  1.0701, -1.4209, -0.8023]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 58/289 [00:42<02:48,  1.37it/s]

Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4400586485862732, logits - tensor([[-4.0913,  0.1063, -1.3146, -1.3344],
        [-4.4451,  0.5280, -2.2454, -1.1296],
        [-4.4573,  0.7242, -1.5521, -0.9412],
        [-5.0826,  0.7061, -1.6040, -1.8733],
        [-3.8931,  0.4117, -1.9671, -0.6238],
        [-4.6899,  0.7520, -1.9134, -0.5532],
        [-3.7426, -0.4848, -1.4444,  0.0344],
        [-3.8767,  0.6150, -2.0230, -0.7184]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 59/289 [00:43<02:47,  1.37it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.444618821144104, logits - tensor([[-4.8052,  0.8587, -2.3094, -1.8412],
        [-4.5038,  0.5368, -1.4184, -0.8835],
        [-3.4173,  0.2549, -1.4678, -0.2281],
        [-3.9978,  0.9776, -2.1180, -0.8962],
        [-4.2967,  0.2247, -2.3944, -1.8597],
        [-4.7127,  1.1188, -1.6612, -0.6580],
        [-4.3178,  0.2730, -1.9404, -0.8856],
        [-3.2496, -1.1586, -1.7988,  0.0276]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:44<02:46,  1.37it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4276222586631775, logits - tensor([[-4.6261,  0.2841, -1.3322, -0.6444],
        [-4.2446,  0.5917, -2.4503, -1.4990],
        [-4.1988,  0.9278, -1.9551, -1.3134],
        [-4.4468,  0.1489, -1.9365, -1.6193],
        [-3.8710,  0.8609, -1.7943, -0.8674],
        [-3.9804,  0.9101, -1.9933, -1.6090],
        [-4.4417,  0.6123, -1.7636, -1.6766],
        [-4.8177, -0.2669, -2.0473,  0.0737]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:45<02:46,  1.37it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34515655040740967, logits - tensor([[-4.3542,  0.1927, -1.3200, -1.5203],
        [-3.3479,  0.6321, -2.2642, -1.4887],
        [-4.3658,  0.4851, -2.1118, -1.5231],
        [-4.5168,  0.4397, -2.1686, -1.0445],
        [-3.9214,  0.2533, -1.6054, -1.0721],
        [-4.0474,  0.4559, -1.8598, -1.1542],
        [-3.8411,  1.1540, -1.5078, -1.0667],
        [-4.5007,  0.4959, -1.5492, -0.9122]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██▏       | 62/289 [00:45<02:45,  1.37it/s]

Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4283688962459564, logits - tensor([[-3.5494,  0.0749, -1.6287, -1.3924],
        [-4.3194,  0.3863, -1.8629, -1.6967],
        [-3.2216,  0.2284, -1.2615, -1.5552],
        [-3.7490,  0.8134, -1.9499, -1.1487],
        [-3.9570,  0.8944, -1.3814, -1.7829],
        [-3.8224,  0.4975, -1.4705, -0.6301],
        [-3.8636,  0.5746, -1.3842, -1.5849],
        [-4.1363,  0.3451, -1.5421, -1.6936]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 63/289 [00:46<02:45,  1.36it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42069393396377563, logits - tensor([[-4.1268,  0.3295, -1.8083, -1.0444],
        [-4.3930,  1.6332, -2.2214, -0.9544],
        [-3.7081,  0.3707, -1.5970, -0.9967],
        [-4.3548,  0.6401, -1.5949, -1.3672],
        [-4.1602,  0.1959, -2.2317, -1.4880],
        [-3.8370,  0.3404, -1.8337, -1.0059],
        [-4.3089,  0.4495, -1.3584, -1.1342],
        [-4.2035,  0.6705, -1.3638, -0.8429]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:47<02:45,  1.36it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3371809124946594, logits - tensor([[-4.0594,  0.8717, -1.5995, -1.1771],
        [-3.7504,  0.8195, -1.5044, -1.7491],
        [-4.1374,  0.0049, -1.7652, -1.0553],
        [-4.3069,  0.0736, -1.7382, -1.7682],
        [-4.1628,  0.3428, -1.3899, -1.4234],
        [-3.3500,  0.2874, -2.1018, -1.6007],
        [-4.7415, -0.3119, -2.1636, -1.9264],
        [-4.3943,  0.0600, -1.8916, -1.0294]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:48<02:44,  1.36it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33841371536254883, logits - tensor([[-3.6618,  0.5012, -1.1032, -0.8924],
        [-3.7697,  0.3496, -1.8092, -1.7382],
        [-4.0370,  0.2539, -2.0885, -1.5690],
        [-5.0722,  1.4423, -1.6545, -0.9949],
        [-3.8282,  0.4352, -1.6326, -1.4669],
        [-4.4931,  0.7674, -1.8629, -1.1521],
        [-3.9341,  0.5653, -1.6074, -2.0451],
        [-4.3489,  0.8742, -2.2570, -0.5637]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 66/289 [00:48<02:44,  1.35it/s]

Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 23%|██▎       | 67/289 [00:49<02:43,  1.35it/s]

loss - 0.4557535648345947, logits - tensor([[-3.8097,  0.0883, -1.8691, -0.6867],
        [-3.9682,  0.4665, -1.9591, -1.1074],
        [-3.5983, -0.9810, -1.7594,  0.6449],
        [-4.6731,  0.5866, -1.8416, -1.0927],
        [-4.8564,  0.8905, -1.8018, -1.0846],
        [-4.1047,  1.0076, -1.6301, -1.0939],
        [-3.6327,  0.1187, -1.3331, -1.5083],
        [-4.3876,  1.0447, -1.9690, -0.6299]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39491569995880127, logits - tensor([[-3.5021, -0.1862, -1.4759, -0.1820],
        [-4.5762,  0.5953, -1.9889, -1.2722],
        [-4.4798,  0.2281, -2.5806, -1.2135],
        [-4.5647,  1.1558, -2.3871, -1.1824],
        [-4.0964,  0.3334, -1.9157, -1.7124],
        [-4.2002,  1.1306, -1.4580, -1.5783],
        [-3.4143,  0.7829, -1.1329, -0.9051],
        [-4.2107,  1.6330, -1.4041, -0.8

 24%|██▎       | 68/289 [00:50<02:44,  1.35it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3456002473831177, logits - tensor([[-4.4722,  1.2760, -2.6671, -1.2858],
        [-4.1743,  1.3171, -1.9773, -1.2906],
        [-3.7906, -0.4816, -2.5104, -0.9980],
        [-4.5755,  0.7709, -2.1152, -1.2941],
        [-3.9310,  0.6737, -1.9799, -1.0103],
        [-4.2881,  1.1302, -2.1433, -1.1352],
        [-4.2446,  0.5767, -1.4987, -1.0146],
        [-4.4635,  0.9711, -2.2668, -1.4285]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:51<02:43,  1.35it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40623098611831665, logits - tensor([[-4.6740,  0.9637, -1.9399, -0.8659],
        [-4.1979,  1.0965, -2.1287, -1.4788],
        [-4.4105,  1.4686, -2.1626, -1.1689],
        [-4.2650,  0.6755, -2.0665, -0.9978],
        [-4.3643,  0.9379, -2.8454, -0.8207],
        [-3.9260,  0.3537, -1.9375, -1.1958],
        [-4.5301,  0.6148, -1.2232, -0.7124],
        [-3.9543,  0.8295, -1.9092, -1.3539]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:51<02:42,  1.34it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43973881006240845, logits - tensor([[-4.8671,  0.1848, -2.4443, -1.5526],
        [-3.8068, -0.3581, -1.5258,  0.2666],
        [-4.2671,  0.8497, -2.2055, -1.8619],
        [-4.4358,  0.5883, -1.8700, -1.6153],
        [-4.9199,  0.6848, -2.0616, -1.4067],
        [-4.1852,  0.7255, -2.0088, -1.6254],
        [-4.7238,  1.4602, -2.6061, -0.7341],
        [-4.4446,  0.8406, -3.0140, -1.2679]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:52<02:41,  1.35it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45700743794441223, logits - tensor([[-4.0994,  0.8966, -1.5298, -2.1117],
        [-4.4663,  1.0388, -2.7137, -0.9614],
        [-4.4860,  0.7621, -2.4129, -0.9714],
        [-4.0518,  0.8251, -2.3762, -1.8373],
        [-2.9988,  0.9260, -1.6045, -0.8874],
        [-4.9191,  0.5424, -2.5576, -1.0663],
        [-4.8120,  0.4837, -2.8117, -1.4054],
        [-4.8036,  0.9265, -2.0828, -0.9499]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:53<02:41,  1.35it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44047021865844727, logits - tensor([[-4.3616,  0.9079, -2.0860, -1.8247],
        [-4.1518, -0.3301, -2.2595,  0.1128],
        [-4.4773,  0.4069, -1.9182, -1.4268],
        [-4.6550,  1.6752, -2.0894, -0.6307],
        [-5.0682,  0.8275, -1.8057, -1.7740],
        [-4.6000,  1.0236, -2.4727, -2.0537],
        [-4.2995,  0.3588, -2.2051, -0.9722],
        [-5.0654,  0.4441, -1.6039, -1.1587]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:53<02:40,  1.35it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5085415840148926, logits - tensor([[-4.2002,  0.6949, -1.1535, -1.5541],
        [-4.3257,  1.0051, -1.8606, -1.0892],
        [-4.7821,  0.9565, -1.9574, -1.2359],
        [-3.8028,  0.8483, -2.1486, -1.3497],
        [-4.9963,  1.3760, -2.3670, -1.0713],
        [-4.3495,  0.2151, -2.6434, -1.4225],
        [-4.3880,  1.3509, -1.8975, -1.3098],
        [-3.2389, -0.7054, -1.0405, -0.1173]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 74/289 [00:54<02:39,  1.35it/s]

Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20248252153396606, logits - tensor([[-4.2654,  0.5666, -2.2156, -1.2111],
        [-4.5642,  1.1673, -2.0002, -0.6771],
        [-4.7044,  1.2266, -2.1413, -1.7083],
        [-3.8229,  0.4120, -1.7820, -1.1833],
        [-4.1903,  0.7054, -2.0237, -1.5637],
        [-4.4600,  0.9891, -2.0232, -1.1919],
        [-3.7803, -0.3599, -1.4941, -1.7573],
        [-4.9156,  1.0700, -1.9527, -1.6216]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 75/289 [00:55<02:38,  1.35it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4375271201133728, logits - tensor([[-4.7758,  1.1984, -2.8692, -1.2569],
        [-3.7669,  0.3659, -1.9570, -0.7905],
        [-4.2123,  0.4294, -1.7897, -0.9909],
        [-4.0547, -0.2600, -1.6936, -1.0564],
        [-3.4185,  0.8701, -1.8550, -1.3292],
        [-4.5571,  0.4241, -1.9512, -1.3413],
        [-3.7066,  0.0545, -2.6193, -1.9853],
        [-4.4082,  0.4633, -1.9891, -1.1262]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:56<02:38,  1.34it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4103638529777527, logits - tensor([[-3.9884,  0.2714, -1.8260, -1.2985],
        [-3.7808,  0.1832, -1.9741, -1.0523],
        [-4.7760,  0.0176, -2.1839, -0.8530],
        [-4.9331,  0.9939, -2.2014, -1.4128],
        [-4.4685,  1.3462, -1.1236, -1.5590],
        [-4.8375,  0.3834, -2.6623, -0.8619],
        [-3.8653,  0.4220, -1.5245, -1.1466],
        [-4.0749,  0.3408, -1.7528, -0.9948]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 77/289 [00:56<02:37,  1.35it/s]

Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29242146015167236, logits - tensor([[-4.1479,  0.9123, -1.4444, -1.0357],
        [-4.0649,  0.2114, -2.4794, -1.1987],
        [-4.4096,  0.6985, -2.0044, -0.9158],
        [-4.0117,  1.0847, -2.1758, -0.9969],
        [-4.4791,  0.4459, -2.7837, -1.5196],
        [-4.6831,  0.2800, -2.1511, -1.2170],
        [-4.9118,  0.7760, -2.1365, -1.2527],
        [-4.2732,  0.0278, -1.7486, -1.2129]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 78/289 [00:57<02:37,  1.34it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34937503933906555, logits - tensor([[-5.0492,  0.7653, -1.9475, -1.7923],
        [-5.0903,  0.3108, -2.5753, -1.1779],
        [-5.3307,  0.6945, -1.9036, -1.9130],
        [-4.5502,  0.6742, -2.8358, -1.7690],
        [-5.2162,  0.8426, -2.1177, -1.7959],
        [-4.0720,  0.8629, -2.7077, -1.7667],
        [-4.4388,  0.4579, -1.9835, -1.1028],
        [-4.4415,  0.6062, -1.9586, -1.4349]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [00:58<02:36,  1.35it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3654979467391968, logits - tensor([[-4.8458,  1.2713, -2.2808, -1.4966],
        [-3.9177,  0.7294, -1.6904, -1.3145],
        [-3.9819,  0.8081, -1.5129, -2.0289],
        [-4.7750,  0.6079, -2.7359, -1.6918],
        [-4.2219,  1.2894, -1.0047, -1.7838],
        [-4.6915,  0.3203, -2.1511, -1.7006],
        [-4.0729,  1.1785, -2.0255, -1.6316],
        [-2.8631, -0.6861, -1.4335,  0.5199]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [00:59<02:35,  1.35it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.509736180305481, logits - tensor([[-4.7200,  1.0278, -2.0720, -1.3363],
        [-4.5568,  0.7640, -1.1417, -1.9473],
        [-4.2948,  0.4682, -2.3847, -0.8241],
        [-3.7459,  0.5731, -1.5991, -0.9983],
        [-4.6672,  0.7354, -2.5771, -1.8848],
        [-4.8606,  0.3328, -2.5879, -0.9040],
        [-4.7306,  0.7767, -1.9865, -1.3266],
        [-4.2225,  1.0578, -2.3688, -0.9383]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [00:59<02:34,  1.34it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29222413897514343, logits - tensor([[-5.3126,  1.0411, -2.3908, -1.5880],
        [-4.2266,  0.3987, -1.5540, -1.3250],
        [-4.2690,  0.5628, -1.9252, -1.2096],
        [-3.3893,  0.1277, -1.8532, -0.3976],
        [-4.6177,  0.7617, -1.5867, -1.7302],
        [-4.5361,  0.9316, -2.2281, -1.5301],
        [-3.3112,  0.4191, -1.6948, -0.7520],
        [-3.1079,  0.6533, -1.4207, -1.6688]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:00<02:34,  1.34it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3526782989501953, logits - tensor([[-3.7166,  0.3734, -1.8605, -1.8648],
        [-4.9395,  0.3297, -2.1776, -1.2291],
        [-4.1738,  0.8001, -2.4015, -1.4337],
        [-4.5311,  0.7360, -2.7375, -1.4721],
        [-3.9456,  0.2061, -2.2204, -0.7874],
        [-2.7853, -0.7178, -1.1866,  1.0637],
        [-5.0966, -0.4507, -2.5657, -0.4790],
        [-5.1640,  0.7164, -1.9645, -1.3429]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▊       | 83/289 [01:01<02:34,  1.34it/s]

Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2042839527130127, logits - tensor([[-4.2831,  1.3473, -2.0120, -1.3671],
        [-3.8972,  0.8466, -1.9508, -1.1863],
        [-4.1485,  0.2082, -2.8613, -2.1569],
        [-3.6272,  0.3833, -2.1676, -0.8564],
        [-3.8709,  1.2861, -1.8940, -1.8953],
        [-3.9842,  0.9763, -2.2677, -0.4945],
        [-4.2142, -1.3167, -1.7529,  0.2334],
        [-4.5068,  0.2128, -1.6899, -1.7902]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 84/289 [01:02<02:33,  1.34it/s]

Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4812365174293518, logits - tensor([[-2.9368, -0.7163, -1.0594,  0.8737],
        [-4.0204,  1.1084, -1.5805, -1.3969],
        [-4.5990,  1.6329, -1.7302, -1.3770],
        [-4.3832,  0.6183, -2.5635, -1.3251],
        [-4.5057,  0.9441, -2.3173, -1.5095],
        [-4.5362,  0.5884, -2.3819, -1.5926],
        [-4.4545,  1.0814, -2.4842, -1.5608],
        [-4.6701,  0.7948, -2.0699, -1.8196]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 85/289 [01:02<02:32,  1.33it/s]

Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4212413430213928, logits - tensor([[-4.4238,  1.1098, -2.0174, -1.9916],
        [-4.4534,  0.4625, -2.0834, -1.2686],
        [-3.8107, -0.3791, -1.4323, -0.7889],
        [-4.0963,  0.6182, -2.4987, -1.3252],
        [-4.3174,  1.0909, -2.2911, -0.8456],
        [-3.9820,  0.9635, -2.6143, -1.9091],
        [-4.3634,  0.7954, -2.1220, -1.2107],
        [-4.5077, -0.5789, -1.9666,  1.0684]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|██▉       | 86/289 [01:03<02:33,  1.33it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3176664710044861, logits - tensor([[-4.3819,  0.9111, -1.7018, -1.3291],
        [-4.0421,  1.1619, -2.3013, -1.5963],
        [-4.3771,  0.5558, -2.0582, -1.3381],
        [-5.1811,  1.1260, -1.6951, -1.7103],
        [-3.9925,  0.5621, -2.3724, -1.2436],
        [-5.1012,  1.1134, -2.5879, -0.8600],
        [-4.9112,  0.9145, -2.1604, -1.3462],
        [-5.0218,  0.2282, -1.8323, -0.6612]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 87/289 [01:04<02:33,  1.32it/s]

Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4201313257217407, logits - tensor([[-4.3375,  1.0731, -2.6670, -2.1932],
        [-4.5163,  0.8909, -2.0296, -1.3465],
        [-4.1576,  0.3960, -2.4578, -1.0986],
        [-4.6125,  0.9993, -1.8443, -1.1504],
        [-5.1725,  0.8080, -2.6047, -1.9126],
        [-4.0918, -1.3961, -1.4180,  0.7910],
        [-4.2099,  0.5898, -2.3043, -1.9484],
        [-4.9621,  0.7977, -2.6215, -1.6356]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 88/289 [01:05<02:33,  1.31it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31523674726486206, logits - tensor([[-4.8124,  0.5985, -2.4523, -0.7747],
        [-5.3132,  0.7082, -1.8537, -0.6947],
        [-4.7230,  0.3878, -2.1326, -0.2657],
        [-4.5557, -0.8173, -1.9218,  0.1938],
        [-4.5800,  0.1338, -2.0820,  0.6311],
        [-3.7104,  1.1610, -1.8439, -1.2647],
        [-4.3208,  0.6497, -2.6912, -1.4399],
        [-5.0094, -0.4732, -1.9469, -0.2498]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:06<02:32,  1.31it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3111651539802551, logits - tensor([[-5.8514,  0.9125, -2.0330, -1.4758],
        [-3.3318, -1.2264, -1.3747,  1.2515],
        [-1.7470, -0.6345, -1.1128,  1.2953],
        [-4.5671,  0.8656, -2.1268, -0.8973],
        [-4.6893,  1.3687, -2.2196, -1.8725],
        [-4.3161, -0.1269, -1.6881, -1.3450],
        [-4.7779,  0.7545, -2.1180, -1.4300],
        [-5.0791,  0.8071, -2.0515, -1.9842]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:06<02:31,  1.31it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3490610122680664, logits - tensor([[-3.9844,  0.5692, -1.9362, -1.3081],
        [-4.2633,  1.3569, -2.2122, -1.6123],
        [-2.8025, -1.4701, -1.0904,  1.8717],
        [-4.3580,  0.1489, -2.3332, -0.9508],
        [-5.4315,  1.1996, -2.3526, -0.6457],
        [-4.7133, -0.4388, -2.0602,  0.2238],
        [-3.8601,  1.0433, -2.3991, -1.5785],
        [-4.3956,  0.8734, -2.0371, -1.2975]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:07<02:30,  1.31it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.590430736541748, logits - tensor([[-3.5268,  0.6576, -1.7776, -1.7982],
        [-4.4414,  1.5053, -1.8367, -1.6582],
        [-3.7804,  1.0835, -2.2161, -1.7855],
        [-4.3603,  1.3565, -1.9862, -2.3713],
        [-5.1890,  0.9660, -2.6526, -1.4384],
        [-4.9278, -0.4596, -2.3366,  0.8260],
        [-2.8005, -1.3450, -1.3727,  0.7119],
        [-4.3411, -0.4546, -2.0090, -0.2620]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:08<02:29,  1.32it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5383152961730957, logits - tensor([[-5.3379,  0.8296, -2.2229, -1.8567],
        [-4.3222,  1.7482, -2.0989, -1.4509],
        [-4.1348,  1.0763, -1.6612, -1.7609],
        [-4.1819,  1.7275, -1.8580, -1.3491],
        [-4.5133,  1.6300, -1.9568, -1.5245],
        [-3.8171,  1.4026, -2.3434, -1.7851],
        [-4.5672,  0.4727, -1.9848, -1.8649],
        [-4.6595,  0.7195, -1.7724, -2.2856]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 93/289 [01:09<02:28,  1.32it/s]

Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3601619005203247, logits - tensor([[-4.8467,  0.5048, -1.9766, -0.0902],
        [-4.6937,  1.2709, -2.3687, -1.6779],
        [-4.3125,  1.5247, -1.9337, -1.3921],
        [-4.2425,  0.8824, -1.3472, -1.7040],
        [-4.0789,  0.9803, -2.0918, -2.2240],
        [-4.4442,  1.2338, -1.9957, -2.5092],
        [-4.3879,  1.9667, -2.2897, -1.9646],
        [-2.7490, -0.7393, -2.4215,  0.5423]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 94/289 [01:09<02:27,  1.32it/s]

Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46714916825294495, logits - tensor([[-3.8224,  1.3698, -1.3804, -1.6661],
        [-4.4731,  1.4434, -2.2001, -2.3448],
        [-4.4072,  0.8601, -2.2313, -2.1785],
        [-4.4968,  1.4246, -2.3831, -1.5157],
        [-3.3246,  0.8157, -1.8857, -1.4953],
        [-4.5075,  1.0032, -2.1364, -2.0228],
        [-4.5753, -0.1195, -2.0010, -0.7899],
        [-4.3486,  0.3354, -2.0399,  0.0789]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 95/289 [01:10<02:27,  1.32it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6956424713134766, logits - tensor([[-3.4829,  0.4254, -2.2537, -0.7597],
        [-3.9526,  0.7749, -2.0754, -1.8037],
        [-3.9455,  0.6665, -1.6507, -1.1666],
        [-4.1497,  0.8735, -1.1368, -1.7724],
        [-4.4600,  1.0450, -1.9497, -1.1803],
        [-4.0529,  0.4232, -2.0625, -1.9325],
        [-5.3099,  1.5560, -1.5759, -2.4189],
        [-4.3258,  0.9271, -2.2732, -1.0768]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:11<02:26,  1.32it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.514121949672699, logits - tensor([[-4.5542,  1.1716, -2.6024, -2.4493],
        [-4.5288,  0.8180, -1.7035, -1.6602],
        [-4.3162,  0.7722, -1.8358, -1.1186],
        [-3.5479,  1.0576, -2.2693, -1.0926],
        [-4.2346,  0.6923, -1.7858, -2.0029],
        [-4.3362,  0.9526, -1.6703, -1.5198],
        [-3.6305,  0.6175, -1.7357, -0.7405],
        [-4.3808,  0.5621, -1.9389, -1.8331]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▎      | 97/289 [01:12<02:26,  1.31it/s]

Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46576082706451416, logits - tensor([[-3.8816,  0.7452, -2.1852, -1.5216],
        [-4.0796,  0.6172, -1.6400, -1.7299],
        [-3.9730,  1.1768, -2.5276, -1.6988],
        [-3.6858,  0.9214, -2.1231, -0.9623],
        [-4.1248,  1.0787, -1.3864, -0.8399],
        [-3.9337,  0.6629, -1.9176, -1.2421],
        [-4.4460,  0.2489, -2.3202, -1.7704],
        [-3.5004,  0.9761, -1.6855, -1.2632]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 98/289 [01:12<02:25,  1.32it/s]

Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3848567008972168, logits - tensor([[-3.5252,  0.1979, -1.0544, -1.1102],
        [-4.0025,  0.6667, -1.5932, -1.3581],
        [-3.5904,  0.2963, -1.8966, -1.2012],
        [-3.4144,  0.6460, -1.3364, -1.1750],
        [-4.0895,  0.4340, -1.6970, -1.0528],
        [-3.7743, -0.0077, -1.5457, -1.3016],
        [-3.7046,  0.4066, -1.2532, -1.7989],
        [-3.7956,  0.5069, -1.9042, -1.9886]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 99/289 [01:13<02:24,  1.31it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.393230140209198, logits - tensor([[-4.1957,  0.4601, -1.6666, -1.3498],
        [-4.1156,  0.8817, -2.0383, -1.3019],
        [-3.0886,  0.0080, -1.1419, -1.9028],
        [-3.9383,  0.0668, -1.4613, -1.1095],
        [-3.9848, -0.9799, -1.9850,  0.7305],
        [-3.6844,  0.2783, -1.6085, -1.2957],
        [-3.3820,  1.0667, -1.3454, -0.4955],
        [-2.8481,  0.7388, -1.2455, -0.8438]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:14<02:24,  1.31it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3900309205055237, logits - tensor([[-4.5123, -0.8183, -1.8930,  0.4921],
        [-4.0120, -0.1572, -1.5380, -1.2738],
        [-3.0951,  0.5174, -1.2685, -1.0380],
        [-3.4474,  0.8405, -1.7888, -1.1017],
        [-4.8279, -0.3412, -2.2245, -0.5797],
        [-3.6232, -0.2873, -2.0202, -0.7415],
        [-4.7439, -0.8684, -2.2376, -0.2610],
        [-3.1662, -0.4037, -1.6314, -1.2951]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:15<02:23,  1.31it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3676695227622986, logits - tensor([[-3.5316,  0.8181, -1.0968, -0.7956],
        [-3.3596,  0.4833, -1.6164, -1.1201],
        [-3.7373, -0.3372, -1.4893, -1.3048],
        [-3.3201, -0.3825, -2.3369, -0.5509],
        [-3.8258,  0.4836, -1.3268, -1.4913],
        [-3.6758, -0.2963, -0.9066, -0.8837],
        [-3.5937, -0.1939, -0.9304, -1.1229],
        [-3.3995,  0.1575, -1.7962, -1.1926]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:15<02:22,  1.31it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45010101795196533, logits - tensor([[-4.3771, -0.9604, -2.6244, -0.7429],
        [-4.7795, -0.3000, -1.7173, -0.8664],
        [-4.1118, -0.3302, -1.2222, -1.3397],
        [-4.6529, -1.0906, -1.8033,  0.1008],
        [-3.6049, -0.0224, -1.0747, -1.1007],
        [-3.6202,  0.3017, -1.3184, -0.8945],
        [-4.2805, -0.2799, -1.6430, -0.9585],
        [-4.0321,  0.0218, -1.8545, -1.1582]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:16<02:22,  1.31it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4705509543418884, logits - tensor([[-3.8624,  0.2103, -1.7606, -0.3878],
        [-3.6815, -0.4671, -2.3721,  0.6524],
        [-3.8198, -0.4360, -1.6651, -1.2974],
        [-4.1961, -0.7390, -1.8732, -0.5250],
        [-3.8450,  0.1863, -2.6736, -1.2282],
        [-4.0365,  0.2525, -1.8163, -0.6731],
        [-4.0181, -0.7612, -1.5179, -0.7015],
        [-3.4641,  0.3692, -2.5690, -0.8204]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:17<02:22,  1.30it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31368157267570496, logits - tensor([[-3.9278,  0.1656, -1.3192, -0.8496],
        [-4.0531,  0.0635, -1.2894, -0.4812],
        [-3.5697,  0.1506, -1.8838, -0.5625],
        [-3.8068, -0.1268, -1.0318, -0.7764],
        [-3.4697,  0.0372, -1.8276, -1.3435],
        [-3.4074,  0.0913, -1.5115, -0.8580],
        [-4.3064, -2.6205, -2.3701,  1.6963],
        [-3.4465,  0.0699, -1.7886, -1.0818]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▋      | 105/289 [01:18<02:21,  1.30it/s]

Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6210504174232483, logits - tensor([[-3.2596, -0.2884, -1.2225, -0.5683],
        [-3.9668, -0.2591, -1.3441, -0.6533],
        [-2.8758,  0.1297, -0.8863, -0.8867],
        [-3.0266,  0.5508, -1.0330, -1.2797],
        [-3.5967, -0.2988, -1.6772, -0.7633],
        [-3.1331,  0.3883, -1.7831, -0.5258],
        [-3.7286,  0.1719, -2.3150, -1.2071],
        [-2.9389, -1.8089, -2.0359,  2.0191]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 106/289 [01:19<02:20,  1.30it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4329356551170349, logits - tensor([[-3.9316,  0.0057, -1.6098, -0.7065],
        [-3.2137, -0.0569, -1.3503, -1.3185],
        [-3.5547,  0.1234, -1.5669, -0.5935],
        [-4.3006,  0.0369, -1.6109, -1.6287],
        [-3.9543, -0.1497, -1.6174, -1.0253],
        [-3.6354, -0.4675, -1.1046, -1.1297],
        [-3.0575, -0.6542, -1.6750, -0.8549],
        [-3.4281,  0.0308, -1.5997, -0.9002]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:19<02:20,  1.30it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46562090516090393, logits - tensor([[-3.7083e+00, -3.0488e-02, -1.3110e+00, -8.0911e-01],
        [-4.3039e+00, -2.9703e-02, -1.6398e+00, -1.2681e+00],
        [-3.5191e+00,  1.9951e-02, -8.2413e-01, -7.2767e-01],
        [-3.4916e+00, -4.0819e-01, -2.1910e+00, -8.7596e-01],
        [-3.6567e+00, -7.5108e-01, -1.8725e+00, -1.2553e+00],
        [-3.4682e+00,  1.7626e-03, -1.5872e+00, -1.0319e+00],
        [-3.9417e+00, -1.4109e-01, -1.7108e+00, -7.3521e-01],
        [-3.6231e+00,  4.0277e-01, -1.4614e+00, -7.9078e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 108/289 [01:20<02:19,  1.30it/s]

Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5831743478775024, logits - tensor([[-3.4358,  0.4792, -1.1850, -0.9622],
        [-3.0324, -0.0860, -0.7490, -0.9877],
        [-3.5050, -0.0831, -1.2208, -1.3088],
        [-3.8989, -0.3666, -1.5437, -0.5177],
        [-3.1674,  0.3559, -1.4818, -1.0108],
        [-3.5493, -0.1198, -1.9155, -0.6758],
        [-4.3149, -1.2228, -1.7891,  0.4609],
        [-3.4596,  0.4305, -1.2078, -1.1824]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 109/289 [01:21<02:19,  1.29it/s]

Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3304525911808014, logits - tensor([[-2.4752, -2.1727, -1.7511,  2.1539],
        [-3.8965, -0.1991, -0.7721, -0.7082],
        [-2.7074, -1.3820, -2.1492,  1.6193],
        [-3.5258,  0.3352, -1.3770,  0.0706],
        [-3.4151,  0.0886, -1.5105, -0.9358],
        [-3.5987,  0.0160, -1.8518, -0.8303],
        [-3.5460,  0.2162, -1.4646, -1.0841],
        [-3.0837,  0.1912, -1.0150, -0.8910]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 110/289 [01:22<02:19,  1.29it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4349977970123291, logits - tensor([[-3.3298e+00, -3.4866e-03, -1.1592e+00, -8.6486e-01],
        [-3.8269e+00,  3.9463e-02, -1.6502e+00, -1.0370e+00],
        [-3.7212e+00,  6.5610e-01, -1.6877e+00, -6.3709e-01],
        [-3.6781e+00,  4.9571e-02, -1.6827e+00, -1.5791e+00],
        [-3.7434e+00,  1.5806e-01, -1.7740e+00, -1.2251e+00],
        [-3.7133e+00, -3.8227e-01, -1.3224e+00, -9.4449e-01],
        [-3.5468e+00, -4.2399e-01, -1.1839e+00, -9.1668e-01],
        [-3.9771e+00,  4.1398e-01, -1.4033e+00, -1.1008e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:22<02:18,  1.28it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.401607871055603, logits - tensor([[-3.1170,  0.1433, -1.1692, -1.1607],
        [-3.8769,  0.2737, -1.2309, -1.0938],
        [-3.8300,  0.1363, -1.6111, -1.2250],
        [-3.1645,  0.2127, -0.7582, -1.3645],
        [-3.2047,  0.5620, -1.0101, -1.0090],
        [-3.0327,  0.2517, -0.8828, -0.9639],
        [-4.0337,  0.0864, -1.8510, -1.2826],
        [-2.8032,  0.0083, -0.8097, -0.6728]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:23<02:17,  1.29it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.346267968416214, logits - tensor([[-3.3510,  0.1920, -1.5374, -1.2992],
        [-3.6731,  0.2834, -1.3824, -1.7878],
        [-3.5183,  0.6213, -1.6947, -1.3685],
        [-3.1527, -0.5096, -1.1390, -0.9279],
        [-4.0161,  0.2654, -1.5330, -0.9417],
        [-4.2442,  0.0269, -0.9763, -1.1876],
        [-4.0859, -0.1824, -1.6138, -0.5041],
        [-3.6188,  0.0725, -1.0699, -1.2063]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:24<02:16,  1.29it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40151727199554443, logits - tensor([[-4.4265,  0.8252, -1.4181, -1.0917],
        [-3.7620,  0.0689, -1.9703, -1.6900],
        [-3.8827, -0.0389, -1.1259, -0.7140],
        [-3.7424,  0.3159, -1.5813, -1.0876],
        [-4.3267, -0.3387, -1.0012, -1.4367],
        [-4.0227,  0.6693, -1.2637, -1.4630],
        [-4.2444, -0.8999, -2.0791,  0.1374],
        [-4.1906,  0.2476, -1.5342, -1.1987]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:25<02:16,  1.28it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4260817766189575, logits - tensor([[-4.2488, -0.2482, -1.3363, -1.7681],
        [-3.6588, -1.3254, -1.1421,  0.9000],
        [-3.7448, -0.0834, -1.3435, -0.7775],
        [-4.3448, -0.4427, -1.8246, -0.5855],
        [-3.6730, -0.3700, -1.4664, -0.6714],
        [-3.6514, -0.0468, -1.3494, -0.9902],
        [-3.6582,  0.3446, -1.0042, -1.0706],
        [-4.7272,  0.3570, -1.0190, -1.1217]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:26<02:16,  1.28it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38212424516677856, logits - tensor([[-4.0165,  0.4290, -1.1798, -1.5918],
        [-3.5315, -1.2621, -1.4377,  0.7206],
        [-3.8707,  1.0923, -0.9388, -0.9560],
        [-3.6621,  0.3394, -1.2312, -2.0931],
        [-4.0612,  0.4412, -1.0418, -1.5677],
        [-3.8421,  0.3125, -1.3153, -1.2152],
        [-4.2918,  0.4280, -1.3346, -1.6572],
        [-4.4225, -0.0425, -1.3292, -1.0420]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:26<02:14,  1.29it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5080259442329407, logits - tensor([[-4.3433,  0.0789, -1.0196, -1.1113],
        [-4.3895,  0.1193, -1.0275, -1.6470],
        [-3.6524,  0.5818, -1.2223, -0.8654],
        [-3.8118, -0.0739, -1.4831, -1.4273],
        [-3.6364,  0.5410, -1.8221, -1.2241],
        [-4.0237,  0.2547, -1.4039, -1.8412],
        [-3.7436, -0.4961, -1.5216, -0.2210],
        [-3.5063,  0.1472, -1.2276, -0.9820]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:27<02:13,  1.29it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6104410886764526, logits - tensor([[-3.8774,  0.9854, -1.0964, -1.6912],
        [-3.9106,  0.3234, -1.3022, -1.6196],
        [-3.5656,  0.7160, -1.2860, -1.3822],
        [-3.6530, -0.2053, -1.7017,  0.2539],
        [-3.6888,  1.1326, -1.1022, -2.1970],
        [-3.8222,  1.0584, -1.1151, -2.0639],
        [-3.9584,  0.4020, -0.9884, -1.4610],
        [-4.6418,  0.0370, -2.2528, -0.6496]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:28<02:12,  1.29it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4271951913833618, logits - tensor([[-4.0850,  0.8217, -0.7285, -1.7878],
        [-3.0057,  0.6619, -1.2236, -0.9340],
        [-3.5283,  0.8585, -1.3353, -1.0195],
        [-3.4405, -0.0714, -1.7202, -1.7362],
        [-4.1462,  0.5342, -1.0249, -0.8147],
        [-3.6024,  0.8751, -1.2328, -1.2988],
        [-3.8282,  1.0349, -1.3643, -1.0739],
        [-3.8071,  0.8415, -1.8595, -1.3498]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:29<02:11,  1.30it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39222973585128784, logits - tensor([[-3.8864,  0.2986, -1.3604, -2.2780],
        [-3.3101, -1.9829, -1.8602,  1.5113],
        [-4.2294,  0.2684, -1.4676, -1.7545],
        [-3.8386,  0.3526, -1.2307, -1.1484],
        [-3.9524,  0.6953, -1.2797, -1.8376],
        [-3.7580,  1.1992, -1.4186, -1.6458],
        [-3.9223,  0.5604, -1.7592, -1.2714],
        [-3.9633,  0.2778, -1.3226, -1.1318]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:29<02:09,  1.30it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5324147939682007, logits - tensor([[-4.2794,  0.8228, -1.1247, -2.1853],
        [-4.3317,  1.0413, -1.5971, -2.4552],
        [-4.7578,  0.5210, -1.8177, -1.2133],
        [-4.6289,  0.9047, -1.8546, -1.8406],
        [-4.5771,  1.1938, -2.0123, -1.6303],
        [-4.2144,  1.0916, -1.6377, -2.2825],
        [-4.2085,  0.6653, -1.5938, -1.2009],
        [-3.5588,  0.5755, -2.0911, -1.4524]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:30<02:08,  1.31it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3316585123538971, logits - tensor([[-3.9157, -1.6499, -2.3236,  1.2533],
        [-4.0274,  1.0352, -1.7411, -1.2024],
        [-4.6999,  0.9963, -1.6323, -0.7785],
        [-4.0652,  0.6051, -1.6280, -1.4382],
        [-3.7187,  0.5451, -1.4709, -1.5271],
        [-4.3604,  0.6897, -1.4214, -2.1194],
        [-3.9194,  0.7613, -1.4167, -1.6551],
        [-4.4410,  1.1059, -1.0952, -1.7807]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 122/289 [01:31<02:07,  1.31it/s]

Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3401477336883545, logits - tensor([[-3.4863,  0.5275, -1.5550, -1.0095],
        [-4.0764,  0.7970, -1.3668, -1.2307],
        [-4.4546,  0.8445, -1.4218, -1.5625],
        [-4.2137,  0.5162, -0.8981, -1.4272],
        [-3.8484,  0.5968, -1.5172, -1.3839],
        [-4.2093,  0.4077, -1.5395, -1.8632],
        [-4.3688,  0.6266, -1.2555, -1.2528],
        [-4.0627, -0.9756, -2.4004,  0.7429]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 123/289 [01:32<02:06,  1.31it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5482234358787537, logits - tensor([[-3.7222, -0.7819, -2.1215,  0.3407],
        [-4.2656,  0.5704, -1.6669, -2.3895],
        [-5.2141,  0.0368, -2.6771, -1.3401],
        [-4.4954,  0.8025, -1.3782, -2.0399],
        [-4.0925, -0.4637, -2.3719, -0.1433],
        [-4.2558,  0.8983, -2.1429, -1.9248],
        [-3.5265,  0.2144, -1.0532, -1.9483],
        [-3.7966,  0.8297, -2.2311, -1.6662]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:32<02:05,  1.32it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.507086455821991, logits - tensor([[-4.3874,  0.8837, -1.0246, -1.6713],
        [-4.8296,  0.6091, -0.9875, -1.4837],
        [-3.9478,  0.1226, -2.0928, -1.7703],
        [-4.3718,  0.9021, -1.4892, -1.3243],
        [-5.4588,  0.4137, -2.0064, -1.7724],
        [-4.0778,  0.6692, -1.7254, -1.7607],
        [-3.5541,  1.0259, -1.7084, -1.8990],
        [-4.0794, -0.5681, -2.0871, -0.3917]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:33<02:04,  1.32it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2854154109954834, logits - tensor([[-4.9279, -0.0532, -1.8955, -0.9273],
        [-4.0558,  0.6556, -1.6857, -1.7280],
        [-5.5673,  0.2858, -1.9077, -1.3018],
        [-4.2265,  0.3185, -2.1143, -2.0303],
        [-4.7173,  0.8823, -1.5832, -1.2863],
        [-4.3000,  1.2047, -1.8325, -1.3526],
        [-3.2238, -1.4640, -1.7372,  1.2770],
        [-4.3510,  0.7841, -2.2612, -0.6911]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:34<02:03,  1.32it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.370702862739563, logits - tensor([[-4.8277,  0.8856, -2.0681, -1.7604],
        [-2.7881, -1.2302, -1.7878, -0.0062],
        [-4.4002,  0.5491, -1.5083, -1.4218],
        [-4.4157,  0.8202, -1.7395, -1.1625],
        [-4.7393, -0.1938, -1.8755,  0.4038],
        [-4.4775,  0.3726, -2.0364, -1.7559],
        [-4.2556,  0.5491, -1.6046, -1.1141],
        [-3.0586, -2.2801, -1.8503,  1.6085]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:35<02:02,  1.32it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5297930240631104, logits - tensor([[-4.8733,  0.6602, -1.7224, -1.7745],
        [-3.8075,  0.1640, -1.4908, -1.9234],
        [-3.9455,  0.9238, -1.4726, -1.4595],
        [-4.1714,  0.5536, -1.7285, -1.6598],
        [-3.5819,  0.5930, -0.9031, -1.3090],
        [-3.1106, -3.1073, -2.3969,  1.6345],
        [-4.2038,  0.6265, -1.6931, -1.8169],
        [-4.6914, -0.4528, -2.2492, -2.2805]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:35<02:01,  1.33it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4807767868041992, logits - tensor([[-3.9365,  0.1542, -1.8627, -0.6958],
        [-4.1929,  0.4806, -1.7000, -1.3686],
        [-4.7617,  1.0984, -1.4654, -1.1498],
        [-3.8525,  0.6932, -1.7436, -1.2685],
        [-3.8197, -1.8119, -1.2416,  1.5752],
        [-4.1574,  0.2156, -1.5971, -1.4299],
        [-4.8729, -0.1318, -2.2334, -2.0052],
        [-3.7500,  0.0620, -1.4731, -1.0620]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:36<02:00,  1.33it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33672383427619934, logits - tensor([[-3.9640, -2.1634, -1.5711,  0.8288],
        [-3.8342,  0.3483, -0.9752, -1.3827],
        [-4.0928,  0.2690, -1.4202, -0.7185],
        [-4.5938,  0.7373, -1.2484, -0.9252],
        [-4.2688,  0.3201, -1.8073, -1.1574],
        [-4.5848,  0.3883, -1.9630, -1.0318],
        [-4.9765,  0.9000, -1.7322, -1.2432],
        [-4.1106,  0.5113, -1.1524, -1.2338]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 130/289 [01:37<01:59,  1.33it/s]

Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43006807565689087, logits - tensor([[-3.7969, -0.7446, -2.0930, -0.3041],
        [-4.7023,  0.7993, -1.7008, -1.1934],
        [-4.7948, -0.0988, -1.7666, -0.9289],
        [-3.3520,  0.5585, -1.3866, -0.6814],
        [-4.4483,  0.5132, -1.3817, -1.3696],
        [-3.8548,  0.1929, -1.3216, -1.0786],
        [-4.3167,  0.7768, -1.3159, -0.8281],
        [-3.9610,  0.6877, -1.2226, -1.1700]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▌     | 131/289 [01:38<01:59,  1.33it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4593733549118042, logits - tensor([[-4.1672,  0.2850, -1.4407, -1.0356],
        [-3.7600, -0.1372, -1.6170, -1.0776],
        [-4.9357,  0.2831, -1.6665, -0.9243],
        [-4.2120,  0.4174, -1.8186, -0.9941],
        [-4.0472,  0.4399, -1.2783, -1.4002],
        [-3.6912,  0.0495, -1.2427, -1.6910],
        [-4.0388, -0.9944, -1.0127, -0.0281],
        [-3.9927,  0.9725, -1.9647, -1.6614]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:38<01:58,  1.33it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3171446919441223, logits - tensor([[-4.6391,  1.0552, -1.5067, -1.1507],
        [-3.8118, -0.0671, -1.0360, -1.2282],
        [-4.0832,  0.0272, -1.1607, -1.0762],
        [-4.4109,  0.2472, -1.6805, -1.5858],
        [-4.3069,  0.1628, -1.8470, -0.7157],
        [-4.8840, -0.1210, -0.7715, -0.6363],
        [-4.2559,  0.5880, -1.5089, -1.0526],
        [-4.2387, -0.2784, -1.1591, -0.9823]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 133/289 [01:39<01:57,  1.33it/s]

Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31827807426452637, logits - tensor([[-4.5048,  0.2144, -1.6054, -1.3161],
        [-4.6188,  0.3239, -2.1787, -1.2182],
        [-3.7789, -2.8263, -2.2148,  1.6349],
        [-4.4872,  0.2576, -1.1554, -1.4428],
        [-3.8624, -0.1732, -1.2383, -1.2371],
        [-5.2651, -0.1587, -1.8703,  0.4648],
        [-3.9527,  0.8580, -1.6807, -1.7086],
        [-4.0556,  0.5374, -1.2180, -1.3515]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▋     | 134/289 [01:40<01:56,  1.33it/s]

Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3760678172111511, logits - tensor([[-4.3659, -0.2376, -1.4903, -1.1692],
        [-3.1563, -2.9551, -2.2925,  1.3709],
        [-4.3700,  0.4907, -1.8926, -0.7112],
        [-4.2169,  0.2883, -1.5432, -1.4644],
        [-4.2697,  0.4180, -1.1317, -0.8951],
        [-4.6152, -0.0629, -1.0443, -0.8065],
        [-4.0012,  0.0705, -1.7283, -0.8818],
        [-4.7596,  0.4001, -1.8966, -0.9379]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 135/289 [01:41<01:57,  1.31it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40745043754577637, logits - tensor([[-4.8347e+00,  3.7018e-01, -1.5593e+00, -4.0758e-01],
        [-4.5995e+00, -2.0359e-01, -1.4277e+00, -1.0815e+00],
        [-5.0207e+00,  3.2784e-01, -1.8558e+00, -8.0248e-01],
        [-3.9216e+00,  5.9655e-01, -1.8661e+00, -1.2929e+00],
        [-3.9420e+00,  7.3915e-04, -1.3429e+00, -8.8782e-01],
        [-4.9132e+00, -1.9071e-01, -1.6684e+00, -6.2267e-01],
        [-3.9651e+00, -9.0969e-01, -1.8849e+00, -2.8846e-01],
        [-4.1148e+00,  7.2990e-01, -1.1338e+00, -9.3006e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:41<01:57,  1.31it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5093657970428467, logits - tensor([[-2.6791, -2.3323, -1.9218,  1.5844],
        [-4.5569,  0.2911, -1.9725, -0.6531],
        [-3.7561, -0.3948, -1.8881, -0.9466],
        [-4.4163,  0.5109, -1.8202, -1.0326],
        [-4.1471,  0.1768, -1.2751, -1.3393],
        [-3.4021, -0.1052, -2.0145, -0.8069],
        [-4.5409,  0.7413, -1.4315, -0.8866],
        [-4.5622,  0.6200, -1.7435, -1.0890]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 137/289 [01:42<01:56,  1.31it/s]

Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.493876576423645, logits - tensor([[-3.8158,  0.0740, -1.4333, -0.7470],
        [-4.2742,  0.6322, -1.3264, -0.6820],
        [-4.1653,  0.2119, -1.7184, -0.7291],
        [-5.5992, -0.3950, -2.0086, -1.4584],
        [-3.5307, -0.2934, -1.2172, -1.2918],
        [-4.5703,  0.2011, -2.0032, -1.4158],
        [-4.7230,  0.5553, -1.8572, -1.2519],
        [-2.4594, -2.9794, -2.1041,  1.1840]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 138/289 [01:43<01:55,  1.31it/s]

Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42394304275512695, logits - tensor([[-4.6030, -0.2782, -1.5591, -1.6943],
        [-4.7275,  0.2925, -1.3283, -1.1278],
        [-3.9454, -0.0084, -0.9752, -1.5118],
        [-5.7121, -0.4846, -2.5077,  0.3187],
        [-4.8317,  0.3070, -1.7603, -0.8752],
        [-4.4490,  0.3386, -1.7148, -1.6603],
        [-4.0424,  0.8984, -1.2673, -0.9808],
        [-4.6810,  0.1984, -1.4951, -1.0184]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 139/289 [01:44<01:53,  1.32it/s]

Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3496275544166565, logits - tensor([[-5.3678e+00,  3.1356e-01, -1.7984e+00, -1.4221e+00],
        [-4.2107e+00, -2.1544e+00, -2.2871e+00,  1.3017e+00],
        [-4.2984e+00,  1.4743e-01, -1.7963e+00, -1.5502e+00],
        [-3.3250e+00, -2.4737e+00, -1.3015e+00,  9.9562e-01],
        [-4.1886e+00,  3.5238e-03, -7.5601e-01, -1.3483e+00],
        [-4.7970e+00, -2.6674e-01, -1.7345e+00, -1.1251e+00],
        [-4.8753e+00,  1.4751e-01, -1.4066e+00, -1.0735e+00],
        [-4.7319e+00,  3.0002e-01, -1.1561e+00, -4.0195e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 140/289 [01:44<01:52,  1.33it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4251556396484375, logits - tensor([[-5.3790,  0.1297, -2.2174, -1.1841],
        [-4.2374,  0.2353, -1.9964, -1.2360],
        [-5.3180,  0.3573, -1.7689, -1.5103],
        [-4.3749, -0.0766, -1.4769, -1.2001],
        [-4.2593, -0.2718, -1.7677, -0.7764],
        [-4.4730,  0.5716, -1.6770, -0.8194],
        [-4.0577,  0.4492, -2.1357, -0.6173],
        [-4.5743,  0.9017, -1.4108, -1.4638]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:45<01:51,  1.33it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2671038806438446, logits - tensor([[-6.3498,  0.2000, -2.1204, -0.6527],
        [-4.0715, -2.0295, -1.4871,  1.1674],
        [-4.3334,  0.2448, -2.4347, -1.1674],
        [-4.5674, -0.1696, -2.1160, -1.0962],
        [-4.6265,  0.9722, -1.9923, -0.7457],
        [-3.9873,  0.2687, -2.0953, -0.6501],
        [-5.0573,  0.3853, -1.6248, -1.6345],
        [-5.3690,  0.4105, -2.6749, -0.7880]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:46<01:50,  1.33it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3485236167907715, logits - tensor([[-4.8862,  0.4014, -1.5608, -1.1621],
        [-4.3874,  0.3645, -1.5021, -1.3904],
        [-5.8930,  0.0630, -1.9847, -0.9678],
        [-5.1797,  0.3039, -1.6485, -1.2690],
        [-4.9034,  0.3754, -1.8612, -0.8017],
        [-4.6668, -0.0563, -1.1083, -0.8180],
        [-5.4962,  0.6049, -2.1272, -1.6465],
        [-5.9099, -1.7162, -1.7463,  0.8213]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 143/289 [01:47<01:49,  1.33it/s]

Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4539851248264313, logits - tensor([[-4.0458,  0.3561, -1.6676, -1.2156],
        [-5.1059, -2.1067, -2.4301,  1.7745],
        [-5.1578,  0.7705, -1.7016, -0.6474],
        [-4.9823,  0.0646, -2.1891, -0.9311],
        [-4.8065, -1.9521, -2.7269,  1.4199],
        [-3.1583, -2.3744, -2.0815,  1.6147],
        [-5.4013,  0.3893, -2.4188, -0.7179],
        [-4.4573,  0.0508, -1.7801, -0.3267]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|████▉     | 144/289 [01:47<01:48,  1.33it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4222699999809265, logits - tensor([[-4.8656,  0.4303, -2.0124, -1.3625],
        [-5.9268,  0.3078, -2.2873, -0.9553],
        [-5.4187,  0.9680, -1.9376, -0.7334],
        [-5.1046,  0.4731, -2.7092, -0.4648],
        [-4.5859,  0.3782, -1.2263, -0.7229],
        [-5.3729,  0.5134, -1.7180, -0.8314],
        [-4.7941,  0.4060, -1.5031, -1.0678],
        [-5.0842,  0.4255, -1.9595, -1.0011]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|█████     | 145/289 [01:48<01:47,  1.34it/s]

Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3498612344264984, logits - tensor([[-4.7693,  0.7421, -1.7353, -1.5421],
        [-5.5683,  0.2315, -1.8366, -0.4942],
        [-3.8669, -1.8891, -2.1010,  1.2649],
        [-4.8386,  0.8890, -2.9207, -0.8518],
        [-5.0396, -0.0854, -2.4508, -0.6380],
        [-4.6427,  0.3674, -2.3251, -1.3226],
        [-4.7261,  1.0028, -1.8134, -0.9402],
        [-5.5115,  1.1964, -2.2426, -0.8739]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 146/289 [01:49<01:46,  1.34it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5056889057159424, logits - tensor([[-4.6147, -0.2307, -1.7551, -1.1586],
        [-4.8101, -0.1926, -1.8243,  0.3810],
        [-4.5447,  0.5583, -1.9204, -0.9389],
        [-5.4267,  0.8374, -2.1637, -1.6781],
        [-5.4732,  0.1911, -2.0868, -1.0996],
        [-5.0163,  1.1185, -2.0449, -0.9742],
        [-5.0802,  0.3301, -1.6073, -0.8996],
        [-4.5403,  0.4669, -2.0304, -1.1078]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 147/289 [01:50<01:46,  1.34it/s]

Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40321803092956543, logits - tensor([[-4.8615,  0.4610, -2.1994, -1.7191],
        [-5.2119,  1.3840, -2.2533, -1.0600],
        [-3.6895, -3.0402, -2.0345,  2.0968],
        [-5.1227,  1.0593, -1.6861, -1.0602],
        [-5.0678,  0.3248, -2.1806, -1.1083],
        [-4.5148,  0.6127, -2.0723, -1.8352],
        [-4.4786,  1.1671, -1.8038, -0.8932],
        [-5.7165,  0.6647, -2.4426, -1.3739]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 148/289 [01:50<01:45,  1.34it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3434022068977356, logits - tensor([[-6.0185,  1.2734, -2.3022, -0.8398],
        [-4.4551,  1.1744, -0.8566, -1.4397],
        [-5.0636,  0.8255, -1.8448, -1.4993],
        [-5.0944,  0.4086, -1.7112, -1.7571],
        [-4.5877,  0.5211, -2.0440, -1.0555],
        [-4.0126, -1.9294, -1.5799,  1.2651],
        [-5.2186,  1.4190, -1.4518, -0.9569],
        [-3.7132, -1.4984, -1.9544,  2.8482]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:51<01:44,  1.34it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25948190689086914, logits - tensor([[-5.5841,  0.6095, -2.0306, -1.3836],
        [-4.7268,  0.8803, -2.0691, -1.1112],
        [-3.4520, -2.4779, -2.0008,  1.7453],
        [-4.8302,  0.6143, -2.1739, -0.8493],
        [-5.6210,  1.4034, -1.9668, -1.1889],
        [-5.0578,  0.5920, -2.1128, -1.5390],
        [-5.0020,  0.8209, -1.7812, -1.7448],
        [-5.1840,  0.9119, -2.8381, -0.6317]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:52<01:43,  1.34it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20220817625522614, logits - tensor([[-4.9747,  1.2997, -2.0392, -0.9567],
        [-4.8359,  1.0997, -1.8108, -0.8322],
        [-4.3026,  0.9946, -2.2253, -0.9218],
        [-4.0931,  0.4099, -1.6900, -1.4644],
        [-6.0821,  0.8397, -2.5898, -0.8998],
        [-5.1212,  1.2997, -1.5956, -0.8504],
        [-4.9668,  0.6654, -1.8802, -1.2826],
        [-4.3887,  0.5989, -1.7445, -0.9872]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:53<01:43,  1.34it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4310142397880554, logits - tensor([[-5.1933,  0.4847, -2.4029, -1.5795],
        [-4.4579, -0.6744, -2.3478,  0.9296],
        [-4.6791, -1.3132, -2.4081, -0.0605],
        [-4.1052,  0.4146, -2.1188, -1.2310],
        [-4.3206,  0.4503, -2.0242, -1.2368],
        [-5.5017,  0.5820, -1.4066, -1.4692],
        [-4.8807, -1.5352, -1.9225,  0.8560],
        [-4.1204,  1.2523, -1.8223, -1.6550]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:53<01:42,  1.34it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4620969295501709, logits - tensor([[-4.9520,  1.0989, -1.8414, -1.4018],
        [-4.4221,  0.8463, -1.2085, -1.2367],
        [-5.4039,  0.5212, -2.4079, -1.5894],
        [-4.3588,  0.9779, -1.8839, -1.3245],
        [-4.1710, -1.7920, -2.0700,  1.0993],
        [-5.2185,  1.3729, -1.7008, -0.8132],
        [-4.7487,  0.6643, -1.9456, -1.8065],
        [-5.8911,  1.0871, -2.6841, -1.6431]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:54<01:41,  1.33it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3690624237060547, logits - tensor([[-5.7571,  0.8034, -2.1426, -1.7919],
        [-5.5035,  1.1186, -2.0777, -1.0368],
        [-4.5430,  0.6723, -2.0870, -1.1898],
        [-4.6626,  1.1673, -2.1197, -1.2910],
        [-3.6196, -2.2966, -1.3101,  1.5951],
        [-5.6249,  1.1569, -1.9312, -1.5739],
        [-5.6149,  1.1772, -2.4036, -1.1239],
        [-4.4668,  0.7836, -2.2984, -1.3809]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 154/289 [01:55<01:41,  1.33it/s]

Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30943572521209717, logits - tensor([[-4.9740,  1.1162, -1.5546, -1.8447],
        [-4.5252,  1.3584, -1.4572, -1.4631],
        [-5.6005,  1.0460, -2.2101, -1.3229],
        [-5.6704,  1.5485, -2.1165, -1.8301],
        [-3.8890,  1.2869, -1.6419, -1.1319],
        [-5.2915,  1.8324, -2.1650, -2.0347],
        [-4.7983, -2.2719, -1.9067,  1.2787],
        [-5.0551,  0.6311, -2.2338, -1.8838]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▎    | 155/289 [01:56<01:40,  1.33it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47117626667022705, logits - tensor([[-4.9574,  0.7266, -1.9857, -2.1698],
        [-4.4635,  0.6649, -2.0089, -2.0205],
        [-4.8260,  1.1755, -1.7792, -1.4738],
        [-5.5947,  1.1097, -2.0215, -1.8498],
        [-4.5225,  0.8894, -1.3516, -2.7746],
        [-4.6940,  0.3541, -1.3865, -2.2944],
        [-5.1484,  0.5659, -1.7609, -1.6021],
        [-5.2070,  0.9539, -2.4903, -2.2750]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:56<01:40,  1.32it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2551581561565399, logits - tensor([[-4.0043, -2.2971, -1.8461,  1.1830],
        [-4.7608,  1.4732, -1.3946, -1.4583],
        [-5.5063,  1.2376, -1.7485, -1.9207],
        [-5.1271,  0.2970, -1.8564, -1.2495],
        [-4.9351, -0.8659, -1.6444,  0.3929],
        [-4.5390,  1.0692, -1.5780, -1.2658],
        [-5.1619,  0.6519, -2.1502, -1.8499],
        [-4.5005,  0.4478, -1.6404, -1.7019]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:57<01:39,  1.32it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3348057270050049, logits - tensor([[-5.3778,  1.8759, -1.5873, -2.4146],
        [-3.9000,  0.6980, -1.7441, -2.3876],
        [-5.3592,  1.2267, -1.8555, -2.1747],
        [-3.8299,  0.9847, -1.5936, -2.0148],
        [-4.3029,  0.8977, -1.5494, -2.1071],
        [-5.2157,  0.9606, -1.7574, -1.5606],
        [-4.9838,  0.9433, -1.6445, -1.6205],
        [-4.9590,  1.6927, -1.5777, -1.6745]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:58<01:38,  1.33it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3560693860054016, logits - tensor([[-5.1593,  0.9774, -1.9540, -2.2406],
        [-3.7403,  0.6648, -1.4023, -1.6142],
        [-4.7151,  0.5938, -1.7337, -2.2656],
        [-5.0736,  1.0388, -1.6321, -2.2191],
        [-5.7078,  0.9439, -1.9599, -1.9970],
        [-3.9770, -2.3865, -1.1136,  0.9830],
        [-4.7977,  1.7003, -1.5082, -1.5011],
        [-5.3258,  1.5281, -1.6529, -1.8896]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [01:59<01:37,  1.33it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3263813555240631, logits - tensor([[-4.6396,  1.2743, -1.5952, -1.8135],
        [-4.8374,  0.9585, -1.7804, -1.7073],
        [-3.9740, -1.8854, -2.0345,  1.0238],
        [-4.5571, -1.5702, -1.4865,  1.3705],
        [-5.2109,  1.6196, -1.4276, -2.3344],
        [-5.6361,  1.5842, -1.7934, -2.0615],
        [-4.8526, -2.0731, -1.3870,  1.6674],
        [-5.5239,  1.2230, -2.2378, -1.8414]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [01:59<01:36,  1.33it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47348982095718384, logits - tensor([[-4.1197, -2.6910, -2.4281,  1.2079],
        [-4.8494,  1.2830, -1.4536, -2.0769],
        [-5.5635,  0.9452, -1.9473, -2.8110],
        [-5.2604,  0.8981, -2.2976, -1.7802],
        [-4.5749,  1.6573, -1.8264, -1.9435],
        [-5.0737,  1.0920, -1.8624, -2.3625],
        [-5.6281,  1.4975, -1.7644, -2.7354],
        [-4.6142,  1.5030, -1.3960, -2.8977]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:00<01:35,  1.34it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32882627844810486, logits - tensor([[-2.9111, -2.1235, -1.3939,  1.4570],
        [-5.1890,  0.6252, -2.1931, -2.0661],
        [-4.5785,  0.8670, -2.3609, -2.3611],
        [-4.7830,  0.9833, -1.6421, -2.3186],
        [-4.3329,  0.8963, -1.5081, -2.1067],
        [-4.9670,  1.4810, -1.8353, -1.9321],
        [-4.6192,  1.1900, -1.8354, -1.8940],
        [-4.6518,  1.3556, -1.6502, -2.1672]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:01<01:34,  1.34it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45097726583480835, logits - tensor([[-4.9793,  0.9672, -1.8917, -2.3229],
        [-5.2037,  1.0439, -1.8970, -1.9564],
        [-4.3652,  0.9470, -1.4173, -1.8286],
        [-4.9321,  1.0132, -1.9131, -1.5257],
        [-4.8055,  1.4414, -1.7920, -1.8748],
        [-4.7835,  0.8825, -1.6757, -2.1161],
        [-4.6281,  1.1564, -1.8929, -2.7273],
        [-5.1809,  1.2000, -1.6827, -2.2019]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▋    | 163/289 [02:02<01:34,  1.34it/s]

Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2998369634151459, logits - tensor([[-4.0526, -2.0651, -0.6737,  1.1543],
        [-5.1932,  0.8541, -1.4449, -2.0110],
        [-4.6984, -1.9302, -2.4940,  1.6082],
        [-4.5779,  0.9075, -1.4859, -1.5145],
        [-4.9955,  1.5332, -0.8180, -2.7984],
        [-4.6937,  1.1961, -1.7963, -1.6909],
        [-4.7345,  0.6006, -1.8162, -2.5920],
        [-4.9995,  1.0632, -1.0543, -2.3571]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 164/289 [02:02<01:33,  1.34it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36142468452453613, logits - tensor([[-4.8172,  1.2980, -1.6877, -2.0633],
        [-4.5180,  1.0653, -1.4536, -1.8746],
        [-5.0912,  1.0389, -2.2276, -2.0268],
        [-4.7333,  1.1868, -1.7613, -2.2617],
        [-5.4454,  1.3061, -2.0445, -2.3098],
        [-4.7595,  1.5092, -2.0381, -2.7162],
        [-4.1469,  0.7589, -1.4660, -1.9611],
        [-5.7309, -1.9453, -2.0405,  0.3406]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 165/289 [02:03<01:32,  1.34it/s]

Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.569930911064148, logits - tensor([[-5.7196, -0.4702, -1.5261, -0.6487],
        [-4.5912,  1.6046, -1.4372, -2.5884],
        [-4.9532,  0.6815, -1.9621, -1.8590],
        [-4.4041,  0.8306, -1.4803, -1.8832],
        [-5.1322,  1.0070, -1.3927, -2.0330],
        [-5.2313,  0.7643, -1.7407, -2.9363],
        [-4.5880, -2.0129, -1.9516,  0.7558],
        [-4.5752,  0.8281, -1.6380, -2.2658]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 166/289 [02:04<01:31,  1.34it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1722918152809143, logits - tensor([[-4.6933,  0.9541, -1.1878, -2.0767],
        [-5.0448,  1.0054, -1.8866, -1.6911],
        [-4.0488, -1.6347, -1.4028,  0.1343],
        [-5.1867,  1.6844, -1.5369, -2.7353],
        [-5.4592,  0.7159, -1.6495, -2.6722],
        [-4.8252,  0.6652, -1.3227, -2.0370],
        [-4.5160,  0.4044, -1.8352, -2.1846],
        [-5.2845,  1.4335, -1.4856, -2.7423]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:05<01:30,  1.34it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13910816609859467, logits - tensor([[-3.7096, -1.9572, -1.8787,  1.4668],
        [-4.3721,  0.5525, -1.6683, -1.9359],
        [-4.6844,  1.6304, -2.0429, -1.6784],
        [-4.7663,  1.5702, -1.5993, -2.1898],
        [-5.0123,  1.4594, -1.7759, -1.7203],
        [-4.6514,  1.4000, -1.5829, -2.7821],
        [-4.9104,  1.5092, -1.5655, -2.3921],
        [-4.5336,  0.7402, -1.5273, -2.1805]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:05<01:29,  1.34it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37074196338653564, logits - tensor([[-3.5477,  0.7222, -2.1157, -2.1068],
        [-4.2275,  1.6034, -1.8969, -2.2967],
        [-4.0305,  0.8086, -1.4921, -1.6520],
        [-4.8575,  0.8681, -1.5680, -2.0229],
        [-3.9404, -2.5947, -2.4143,  0.0932],
        [-4.8673,  1.7331, -2.2600, -1.9871],
        [-5.6842,  1.1593, -1.8407, -2.2943],
        [-3.7968,  0.7289, -1.7338, -2.6752]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:06<01:29,  1.34it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41680192947387695, logits - tensor([[-4.8831,  0.5557, -1.4070, -1.5259],
        [-4.8145,  0.5773, -1.7920, -2.4075],
        [-3.9920, -1.7274, -2.1868,  0.3031],
        [-4.8930,  1.6431, -1.8577, -2.8862],
        [-4.3680, -2.1191, -2.1540,  0.1617],
        [-4.8164,  0.7077, -1.9938, -2.6041],
        [-5.0940,  1.4619, -2.4178, -2.0016],
        [-4.4138, -0.7150, -1.0918, -1.0442]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:07<01:28,  1.35it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3131968677043915, logits - tensor([[-4.2440,  1.0753, -1.4966, -1.9135],
        [-4.5196,  0.4832, -1.9955, -2.2851],
        [-3.9948, -3.1454, -2.2771,  0.4999],
        [-4.7893,  1.1225, -1.3149, -1.8957],
        [-5.3476, -1.9441, -2.3879,  0.9299],
        [-4.5543,  1.1128, -1.8487, -1.6407],
        [-5.0243,  1.0150, -2.0166, -2.5963],
        [-4.9897,  1.0920, -2.2231, -2.4895]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:08<01:27,  1.35it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4624463617801666, logits - tensor([[-4.6365, -2.1935, -1.7851,  1.9114],
        [-4.5570,  1.1859, -2.1522, -2.1914],
        [-4.7178,  1.2898, -1.9968, -2.1372],
        [-5.1086,  1.5786, -2.3396, -1.9900],
        [-4.2353,  1.4713, -1.5488, -1.4574],
        [-4.2055,  1.2789, -1.6406, -2.0782],
        [-4.9185,  1.2460, -1.7569, -2.2120],
        [-4.5178,  1.5883, -1.6246, -2.3050]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 172/289 [02:08<01:27,  1.34it/s]

Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 60%|█████▉    | 173/289 [02:09<01:26,  1.34it/s]

loss - 0.30509960651397705, logits - tensor([[-4.5908,  1.4906, -2.1017, -2.1998],
        [-5.0172,  1.1905, -2.1190, -1.9938],
        [-3.7605,  0.8183, -1.4449, -2.1138],
        [-4.9114,  0.8453, -1.4367, -1.4380],
        [-5.8009,  1.5869, -2.2342, -2.3242],
        [-5.3617,  0.7728, -1.6688, -2.2974],
        [-3.1371, -2.2833, -1.8225,  1.6212],
        [-4.5062,  1.6807, -2.7569, -2.3225]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5890458226203918, logits - tensor([[-4.2411,  1.6937, -1.6765, -2.5921],
        [-4.2383,  1.1620, -2.4642, -1.6249],
        [-4.8229,  1.3039, -2.1667, -1.6148],
        [-4.4308,  1.1316, -2.4961, -1.2170],
        [-4.1168,  0.8357, -2.3223, -1.0731],
        [-4.9467,  1.5085, -2.9802, -1.7216],
        [-5.2287,  1.2404, -1.9737, -1.6128],
        [-3.3086, -1.9895, -2.1329,  0.

 60%|██████    | 174/289 [02:10<01:26,  1.34it/s]

Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 61%|██████    | 175/289 [02:11<01:25,  1.33it/s]

loss - 0.4515891671180725, logits - tensor([[-4.7121,  1.1588, -2.3659, -1.8865],
        [-4.3374,  1.3163, -1.8663, -1.7806],
        [-5.2898,  0.8392, -2.2903, -1.8042],
        [-5.0898,  0.9301, -2.3740, -1.9453],
        [-4.1218,  0.7011, -0.9347, -1.4228],
        [-5.1228,  1.2348, -2.0654, -2.0477],
        [-5.4063,  0.3399, -2.3439, -2.0605],
        [-4.7386,  1.2379, -2.5515, -1.9997]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5472123622894287, logits - tensor([[-5.5597,  1.4518, -2.7011, -1.9128],
        [-4.8957,  0.8008, -2.9138, -1.2560],
        [-4.2636,  0.8826, -2.1954, -1.6523],
        [-4.1285,  1.0231, -1.5412, -1.3319],
        [-4.3703,  1.2909, -2.3925, -2.0900],
        [-5.7161,  0.8981, -2.4540, -1.5922],
        [-5.0724,  1.4098, -2.4335, -2.2746],
        [-4.7470,  0.7425, -2.5796, -1.8

 61%|██████    | 176/289 [02:11<01:24,  1.33it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2645682394504547, logits - tensor([[-4.4272,  0.7676, -2.0255, -1.7145],
        [-4.5065,  1.1581, -2.6097, -1.5337],
        [-4.6230,  1.1574, -1.9903, -1.7673],
        [-4.7778,  0.9067, -2.2089, -1.3424],
        [-4.9249,  1.3958, -2.1661, -1.2893],
        [-4.6364,  0.9275, -2.1526, -1.4062],
        [-4.9769,  0.8422, -2.3773, -1.8346],
        [-4.5822, -1.9444, -2.5391,  1.9080]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:12<01:23,  1.33it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4685550928115845, logits - tensor([[-4.7863,  0.6955, -2.1623, -1.2418],
        [-4.3884,  1.3114, -1.8053, -1.3778],
        [-5.0155,  1.0913, -2.1658, -1.0087],
        [-3.8557,  0.7279, -1.9635, -2.0413],
        [-4.7458,  0.3602, -2.6881, -1.1119],
        [-5.6603,  0.7606, -2.0264, -1.1244],
        [-4.3545,  0.8199, -2.2743, -1.5353],
        [-3.9968, -2.4171, -1.9685,  1.8658]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:13<01:23,  1.33it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2743438482284546, logits - tensor([[-5.1765,  0.9645, -2.8627, -1.0839],
        [-5.0654,  1.0924, -2.5194, -1.2776],
        [-3.9001, -2.1784, -2.1445,  2.3504],
        [-3.8372,  0.8429, -2.1465, -1.4337],
        [-5.0759,  0.9551, -2.0402, -1.4594],
        [-4.1977,  0.4451, -2.2230, -1.5162],
        [-5.1051,  0.8235, -2.3566, -1.7041],
        [-4.6760,  0.8858, -1.6154, -1.4952]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 179/289 [02:14<01:22,  1.34it/s]

Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2854161858558655, logits - tensor([[-5.0112,  0.6203, -1.5592, -1.1176],
        [-4.7388,  0.9256, -1.9971, -1.1563],
        [-4.4550,  0.6270, -2.1494, -1.4526],
        [-4.5448,  1.4904, -2.1496, -1.3157],
        [-3.9081,  0.2841, -1.4770, -1.2937],
        [-3.2116, -2.1688, -2.2461,  2.7303],
        [-3.6161, -1.5375, -2.2319,  1.5799],
        [-4.5798,  1.1852, -1.9340, -1.3292]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 180/289 [02:14<01:21,  1.34it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28047144412994385, logits - tensor([[-5.0631, -0.7809, -2.6505,  1.1906],
        [-4.6829,  0.6510, -1.9054, -1.3109],
        [-5.2845,  0.4672, -1.8951, -1.1950],
        [-3.7372,  0.1469, -1.8069, -0.9182],
        [-4.2173, -2.5487, -2.1698,  1.6631],
        [-4.2051,  0.9622, -1.7940, -0.9511],
        [-5.4627,  0.6330, -2.8462, -1.9352],
        [-4.6268,  1.0822, -2.0950, -1.7816]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:15<01:20,  1.34it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2754872441291809, logits - tensor([[-5.1178,  1.4930, -2.1705, -1.2489],
        [-3.8751,  0.1778, -2.2801, -1.6176],
        [-4.2029,  0.7037, -0.9904, -1.9661],
        [-5.2588,  0.5146, -1.7997, -1.1596],
        [-3.6981,  0.6522, -2.3009, -1.1280],
        [-3.4832, -2.4560, -2.3040,  2.0760],
        [-4.3526,  0.4452, -1.7250, -1.3480],
        [-4.6362,  0.4430, -2.3655, -2.0234]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:16<01:19,  1.34it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44922396540641785, logits - tensor([[-4.5932,  0.4510, -1.5742, -0.9820],
        [-4.4460,  1.3450, -2.2914, -1.3922],
        [-3.9922,  0.3927, -1.9481, -0.9609],
        [-5.0277,  0.0650, -2.3640, -1.7988],
        [-4.9140,  0.1231, -2.4990, -1.3870],
        [-4.3314,  0.5429, -2.5489, -2.1358],
        [-4.9120,  0.4897, -1.8661, -1.4795],
        [-4.8964,  0.7288, -2.5598, -1.4180]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:17<01:18,  1.35it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40706920623779297, logits - tensor([[-3.5526, -2.4508, -2.6752,  2.4493],
        [-4.0178, -2.7400, -2.7032,  2.4512],
        [-3.9144,  0.5605, -1.3539, -1.1216],
        [-4.8035,  0.5076, -1.9751, -1.4110],
        [-4.0580, -3.2668, -2.6691,  3.2361],
        [-3.8642,  0.4975, -1.2778, -1.4515],
        [-4.2485,  0.9492, -1.7779, -1.6140],
        [-4.9385,  0.0355, -1.7471, -1.3123]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:17<01:18,  1.34it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3196552097797394, logits - tensor([[-3.9472, -2.2631, -2.9776,  2.0012],
        [-4.4856,  0.3632, -1.3964, -1.6018],
        [-4.1990,  0.2555, -1.9914, -0.9268],
        [-4.5392,  0.7707, -2.0441, -1.5337],
        [-5.2664,  0.6759, -1.9339, -1.0402],
        [-4.1629,  0.6082, -2.3710, -1.0329],
        [-3.7785,  0.5707, -1.6895, -1.1572],
        [-4.0576,  0.5595, -2.0602, -1.3511]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:18<01:17,  1.34it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2935987412929535, logits - tensor([[-4.3227, -2.5589, -2.5765,  2.6127],
        [-5.1340,  0.9025, -1.9158, -1.3703],
        [-3.9571, -0.3393, -1.9024, -1.1093],
        [-4.2645,  0.5068, -1.5813, -1.4418],
        [-4.7883,  0.1754, -2.0710, -1.1149],
        [-4.8633,  0.1305, -1.4921, -1.7262],
        [-4.6248,  0.4491, -2.2896, -1.3445],
        [-4.0285,  0.4064, -2.0544, -1.1173]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:19<01:16,  1.34it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41903215646743774, logits - tensor([[-5.0592, -1.7921, -2.2603,  1.5909],
        [-4.9389,  1.1282, -2.0135, -0.7930],
        [-4.5834,  0.8235, -1.4847, -1.3889],
        [-4.3980,  0.6783, -1.9879, -1.4706],
        [-4.5612,  0.9547, -1.8498, -0.6692],
        [-4.0279,  0.6548, -2.1293, -1.5098],
        [-4.1652,  0.8006, -1.6404, -1.5355],
        [-4.2992,  0.9588, -1.2251, -0.9330]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:20<01:15,  1.34it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3325532078742981, logits - tensor([[-5.0618,  0.6796, -1.8740, -1.2541],
        [-4.3437,  0.7442, -2.0437, -1.3024],
        [-4.4278,  0.9032, -1.8334, -0.7882],
        [-4.0061,  0.8711, -1.6797, -1.1374],
        [-4.0887,  0.4767, -1.9346, -1.1612],
        [-3.5477, -2.1727, -2.1062,  1.8465],
        [-4.6941,  0.8215, -2.0188, -1.2933],
        [-4.8313,  0.2845, -2.3261, -1.8787]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 188/289 [02:20<01:15,  1.35it/s]

Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33852100372314453, logits - tensor([[-4.7891,  0.2975, -2.3934, -1.2803],
        [-3.7751,  0.9651, -1.3603, -1.4332],
        [-3.9854,  0.5847, -2.0859, -1.3840],
        [-3.9774,  0.2385, -2.0472, -1.0540],
        [-3.6040,  0.7514, -1.5270, -1.5460],
        [-5.1020, -0.2421, -2.2536,  0.5681],
        [-4.6158,  0.0805, -1.8984, -1.6019],
        [-4.3305,  0.3105, -2.3597, -1.1776]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 189/289 [02:21<01:14,  1.34it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2400658279657364, logits - tensor([[-4.4960, -2.3725, -1.9979,  1.9178],
        [-4.1244, -1.4968, -2.7279,  2.0446],
        [-4.2752,  0.7008, -1.7328, -1.5803],
        [-4.2466,  1.2423, -1.5539, -1.5465],
        [-4.9499,  0.2624, -1.4271, -1.2467],
        [-4.0954,  0.9589, -1.7695, -1.5086],
        [-3.7555,  1.1021, -0.9016, -1.5872],
        [-4.4370, -1.5725, -2.6804,  1.4299]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:22<01:13,  1.34it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37522661685943604, logits - tensor([[-4.2490,  0.6597, -1.9177, -2.0938],
        [-4.8160,  0.7374, -1.5684, -1.9948],
        [-4.4279,  0.8384, -1.9202, -1.5175],
        [-4.7552,  0.3617, -2.3961, -1.6264],
        [-3.2425,  0.7979, -1.2666, -0.9710],
        [-4.2299,  0.8162, -1.5960, -1.0273],
        [-4.6318,  0.6373, -1.8457, -1.6521],
        [-4.1948, -1.9364, -3.2635,  1.9411]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 191/289 [02:23<01:13,  1.34it/s]

Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4599771499633789, logits - tensor([[-4.5585,  0.3959, -2.3970, -2.0784],
        [-4.3491,  1.2666, -1.8090, -1.4361],
        [-4.1598, -2.3829, -2.4816,  2.3393],
        [-3.8951,  1.0418, -2.0829, -1.7454],
        [-4.8119,  0.3924, -2.0172, -0.4307],
        [-4.2798,  0.5271, -1.6258, -0.9984],
        [-4.5188,  1.3175, -2.1934, -1.5273],
        [-5.2274, -0.3467, -3.2119,  0.5258]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▋   | 192/289 [02:23<01:12,  1.33it/s]

Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46047547459602356, logits - tensor([[-4.3051, -1.6324, -3.0236,  0.7618],
        [-4.9136, -2.1114, -2.6438,  1.6258],
        [-4.6232, -2.2532, -2.3708,  1.9188],
        [-4.1944,  1.1845, -1.6999, -1.3258],
        [-4.8572, -2.1079, -3.2922,  1.2839],
        [-4.4221,  0.9443, -1.4272, -1.3248],
        [-4.8624, -2.5510, -2.7764,  2.3902],
        [-3.8509, -2.4221, -2.2781,  1.8187]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 193/289 [02:24<01:11,  1.34it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4995318353176117, logits - tensor([[-5.3394, -0.2281, -2.1186, -1.5329],
        [-3.8414,  1.4084, -1.3639, -1.8731],
        [-4.4101,  0.4125, -1.4572, -1.0335],
        [-4.3786,  0.8273, -1.5355, -1.9712],
        [-3.9760,  1.4101, -2.0589, -1.3846],
        [-5.0718, -2.0877, -3.4787,  1.9066],
        [-3.6924,  0.7938, -1.5839, -1.1825],
        [-4.7732,  1.0925, -1.3437, -1.4055]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 194/289 [02:25<01:11,  1.33it/s]

Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31815528869628906, logits - tensor([[-4.5021,  1.4285, -1.9765, -1.5755],
        [-4.4570,  0.8847, -1.8283, -1.4507],
        [-4.5406,  1.4349, -1.8165, -1.4096],
        [-4.8129, -2.3213, -2.8076,  1.7817],
        [-3.6522,  0.4647, -1.6226, -1.5603],
        [-4.9615,  1.2720, -1.7981, -1.9666],
        [-3.8937,  0.8612, -1.5610, -1.5574],
        [-4.3066,  0.7879, -1.6638, -1.1365]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 195/289 [02:26<01:10,  1.33it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27083057165145874, logits - tensor([[-3.9157,  0.4944, -2.0900, -1.6873],
        [-4.4249,  0.7770, -1.1249, -1.5043],
        [-3.9439,  1.1350, -1.6584, -1.8510],
        [-3.6947,  0.7008, -1.8385, -1.9728],
        [-4.9606,  0.7430, -1.5845, -2.1479],
        [-3.9785,  0.5187, -1.1425, -1.3025],
        [-4.6215,  0.3758, -1.7293, -1.0118],
        [-4.0474, -2.8152, -1.9986,  1.2816]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:26<01:09,  1.33it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6564727425575256, logits - tensor([[-5.1059,  0.4344, -1.9412, -2.2907],
        [-4.2180,  0.8879, -1.3589, -1.3270],
        [-4.4864,  0.5377, -1.5868, -1.3692],
        [-4.4303,  0.2573, -1.9589, -1.5328],
        [-3.9300,  0.5714, -1.8622, -1.5800],
        [-5.0680,  0.4688, -1.5851, -2.1327],
        [-4.4939,  1.2372, -1.9064, -1.8864],
        [-4.7636,  0.8410, -1.1933, -2.0278]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 197/289 [02:27<01:09,  1.33it/s]

Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36115702986717224, logits - tensor([[-4.9582,  0.9538, -1.6597, -1.4938],
        [-4.1471,  0.5295, -1.8829, -0.9461],
        [-5.1271,  0.6674, -2.0664, -1.4027],
        [-4.9730,  0.5289, -2.1986, -2.0161],
        [-5.9166, -1.5230, -3.0295,  1.0301],
        [-4.9312, -0.0895, -2.4387, -0.1577],
        [-4.8323,  0.7892, -1.4941, -1.4196],
        [-3.9336,  0.8327, -1.4949, -1.7174]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▊   | 198/289 [02:28<01:08,  1.33it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2792017161846161, logits - tensor([[-3.6402,  0.9318, -1.9638, -1.6018],
        [-3.6304,  0.8791, -1.3188, -1.1799],
        [-4.6624,  0.8612, -1.4356, -1.7400],
        [-4.4001,  0.9723, -0.8935, -0.9537],
        [-4.0863,  0.4345, -1.7433, -1.2451],
        [-4.7498,  0.5455, -1.8101, -1.4461],
        [-3.7022,  0.7344, -1.4578, -1.3027],
        [-4.3347,  0.2333, -1.5532, -1.5423]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:29<01:07,  1.33it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1974276453256607, logits - tensor([[-5.1878,  0.8411, -1.6052, -2.0213],
        [-3.8490,  0.5364, -0.8315, -1.7273],
        [-4.7186,  0.6608, -1.7978, -1.6422],
        [-4.8697,  0.7052, -1.4226, -1.9684],
        [-4.1381,  0.4811, -1.4304, -0.9655],
        [-4.5534,  0.4294, -2.0435, -1.8897],
        [-4.5899,  1.0358, -1.8758, -1.2785],
        [-4.3388,  0.9338, -1.9134, -1.9308]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 200/289 [02:29<01:06,  1.34it/s]

Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5422228574752808, logits - tensor([[-4.7443,  1.1976, -1.7519, -1.7851],
        [-4.6334,  1.2228, -1.6006, -1.6004],
        [-5.2933,  0.2795, -1.6360, -0.8015],
        [-4.5562,  0.7492, -2.2019, -1.2427],
        [-4.7132,  1.0294, -1.6570, -2.3330],
        [-4.8424,  0.4421, -1.8813, -1.4075],
        [-4.1443,  1.4176, -1.5076, -1.9299],
        [-4.9852, -1.6257, -3.2072,  0.8550]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 201/289 [02:30<01:05,  1.34it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42213475704193115, logits - tensor([[-3.8308,  1.0081, -1.4940, -1.7510],
        [-4.1660,  0.8508, -1.4799, -1.7500],
        [-4.3107,  0.9199, -2.1304, -1.3976],
        [-4.9339,  0.8492, -1.6919, -2.1172],
        [-5.4898,  0.9773, -1.7777, -1.9275],
        [-4.7166, -1.9251, -2.0290,  0.7772],
        [-4.0919,  1.1147, -1.5771, -1.6628],
        [-4.6315,  0.6027, -1.5362, -2.3679]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:31<01:05,  1.34it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32771050930023193, logits - tensor([[-4.0150,  0.6726, -1.6110, -1.5122],
        [-4.5229,  0.3061, -1.6727, -1.6458],
        [-5.0094,  0.5394, -1.9484, -1.2965],
        [-5.1773,  0.3608, -1.5532, -1.7573],
        [-4.2107,  0.8226, -1.1525, -1.7535],
        [-5.0623,  0.7746, -1.7058, -1.3395],
        [-4.5989,  0.8175, -1.3331, -2.6973],
        [-4.6289,  0.8668, -2.0637, -1.0580]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:32<01:04,  1.33it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2791506350040436, logits - tensor([[-4.4596,  0.8264, -1.9022, -1.5522],
        [-3.7935,  0.3684, -1.3735, -1.5474],
        [-4.4091,  0.2768, -1.6951, -1.3963],
        [-4.4128,  0.9265, -1.4919, -2.0926],
        [-3.2853, -1.7617, -1.4431, -0.1270],
        [-4.3381,  0.7088, -1.6306, -1.1728],
        [-5.1617,  0.7815, -1.8678, -1.6474],
        [-5.4818,  1.1795, -2.4055, -2.1326]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:32<01:03,  1.34it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5646733045578003, logits - tensor([[-4.6727,  0.7539, -1.3820, -1.4789],
        [-4.4855,  0.9148, -2.3103, -1.8146],
        [-5.1647,  0.6153, -2.2346, -2.2799],
        [-5.2693,  0.5000, -1.9697, -1.6888],
        [-4.3109,  0.5693, -1.3420, -1.3754],
        [-4.0808,  0.3730, -1.6379, -1.0900],
        [-4.2868,  1.0645, -1.9700, -2.0550],
        [-4.2470,  0.4938, -2.0013, -1.6365]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:33<01:03,  1.33it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24924297630786896, logits - tensor([[-5.3054,  0.9701, -1.7347, -2.3004],
        [-4.8345,  1.1788, -1.8802, -1.7269],
        [-4.8331,  0.3572, -2.0309, -1.8147],
        [-4.7509,  0.6423, -2.3082, -2.1032],
        [-5.1234,  1.3818, -1.8004, -1.6685],
        [-4.9891,  1.2826, -2.0314, -1.5933],
        [-5.2143,  0.5700, -2.3104, -1.5879],
        [-5.5048,  1.0309, -2.4517, -1.7259]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:34<01:02,  1.33it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41851937770843506, logits - tensor([[-4.9360,  0.9001, -1.6949, -1.9234],
        [-5.3005,  1.2610, -1.8003, -2.2375],
        [-5.5503,  0.4712, -2.5498, -0.9034],
        [-4.8807,  0.4611, -1.5357, -1.7809],
        [-4.8135,  0.6104, -2.0553, -1.9793],
        [-4.7070,  0.3917, -2.0124, -1.9218],
        [-4.9961,  1.2206, -2.1177, -2.3372],
        [-5.0252,  0.4890, -1.8938, -1.4141]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:35<01:01,  1.33it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5743222236633301, logits - tensor([[-5.0236,  0.6675, -1.6490, -1.4732],
        [-4.4714,  0.7308, -2.3924, -2.0963],
        [-4.3447,  0.6797, -1.7224, -1.5958],
        [-5.1130,  0.2746, -1.9574, -2.0403],
        [-4.6934,  1.0358, -2.0275, -2.1633],
        [-4.3648,  0.8474, -1.6741, -1.1019],
        [-4.4465,  1.5293, -1.3852, -0.8111],
        [-4.7395,  0.7223, -1.9082, -1.6701]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:35<01:00,  1.33it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2698610723018646, logits - tensor([[-4.7739,  1.1933, -1.4970, -1.8554],
        [-5.5122,  0.5399, -2.3915, -1.4555],
        [-4.0563,  0.9286, -1.7794, -1.7411],
        [-5.0458,  0.7127, -1.6451, -2.1769],
        [-5.1683,  0.9057, -1.9384, -1.7086],
        [-4.6980,  0.5508, -2.1768, -1.6532],
        [-4.6002,  0.5577, -1.6732, -1.6163],
        [-4.7343,  0.7326, -2.3037, -1.8667]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:36<01:00,  1.33it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4009503126144409, logits - tensor([[-4.7482,  1.0310, -2.0393, -1.6501],
        [-5.1010,  0.4056, -1.9686, -1.2729],
        [-4.3342,  1.0717, -1.2362, -1.5946],
        [-5.0651,  1.2837, -1.5060, -1.6355],
        [-4.0634,  0.5975, -1.4549, -1.2683],
        [-4.9075,  0.7383, -1.7127, -1.4568],
        [-4.3187,  1.2493, -1.7109, -2.0356],
        [-4.8901,  0.7686, -1.7060, -1.4549]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 210/289 [02:37<00:59,  1.33it/s]

Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35498112440109253, logits - tensor([[-5.2966,  0.7245, -2.5819, -1.8477],
        [-5.0622,  0.7621, -1.7680, -2.0981],
        [-3.7265, -2.8566, -1.4337,  0.9115],
        [-4.5842,  1.0932, -2.2008, -1.5382],
        [-5.4424,  0.7210, -1.7266, -1.8508],
        [-3.9608,  0.5370, -1.6799, -1.9106],
        [-4.6795,  0.9763, -1.6711, -1.3536],
        [-5.2330,  0.4688, -1.8462, -1.8685]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 211/289 [02:38<00:58,  1.34it/s]

Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 73%|███████▎  | 212/289 [02:38<00:57,  1.34it/s]

loss - 0.29153841733932495, logits - tensor([[-5.1075,  1.1664, -2.1820, -1.8516],
        [-4.6207,  1.2683, -2.0533, -1.6486],
        [-3.6710,  0.8905, -1.3853, -2.1185],
        [-4.8935,  1.1545, -1.8420, -2.2961],
        [-5.2970,  1.0202, -1.4166, -1.7490],
        [-5.2255,  0.7226, -2.1168, -1.9042],
        [-4.8674, -0.5167, -1.6158, -0.3480],
        [-4.7269,  0.9799, -1.5105, -1.7925]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2840957045555115, logits - tensor([[-4.5956,  0.5466, -1.8017, -1.7251],
        [-4.7295,  0.4911, -1.9195, -1.2440],
        [-4.2087,  0.9465, -1.5454, -1.9661],
        [-5.1928,  0.8483, -2.0700, -1.6747],
        [-4.3933, -2.0962, -1.4078,  0.3357],
        [-5.2469,  0.1450, -1.3369, -1.4527],
        [-4.8143,  0.8794, -2.0517, -1.8495],
        [-4.8532,  0.4409, -1.4229, -2.

 74%|███████▎  | 213/289 [02:39<00:57,  1.33it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 74%|███████▍  | 214/289 [02:40<00:56,  1.33it/s]

loss - 0.521955132484436, logits - tensor([[-4.3237,  0.3815, -1.9684, -1.4966],
        [-4.2582,  1.1581, -2.0840, -1.7928],
        [-4.7841,  0.9482, -2.3024, -2.5964],
        [-4.3936,  0.5264, -1.6518, -1.9216],
        [-5.4620,  1.4677, -2.0886, -1.8084],
        [-4.9495, -1.2312, -2.1994, -0.0750],
        [-4.9281,  0.8094, -1.6954, -1.9222],
        [-4.9111, -1.1345, -1.9498, -0.4979]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3160671293735504, logits - tensor([[-4.6438,  0.3258, -1.2147, -1.5081],
        [-5.2714,  0.7760, -2.0231, -1.4213],
        [-5.2701,  0.9366, -1.7343, -1.6858],
        [-4.9788,  1.3182, -1.8951, -1.6708],
        [-5.0693,  0.8052, -1.7579, -1.6820],
        [-5.2540, -1.5665, -2.4763,  0.1445],
        [-5.1700,  0.7493, -1.8125, -1.8538],
        [-4.8235,  1.0662, -1.9823, -1.28

 74%|███████▍  | 215/289 [02:41<00:55,  1.33it/s]

Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4242517352104187, logits - tensor([[-5.1271,  0.6483, -2.4788, -1.8800],
        [-5.5986,  1.0509, -1.7725, -1.4620],
        [-4.6662,  0.8558, -2.5906, -1.4982],
        [-3.9286,  0.1053, -2.2494, -1.3149],
        [-4.6945,  0.8723, -1.7997, -1.5610],
        [-4.8711,  1.3366, -2.1121, -1.5474],
        [-5.0691,  0.8906, -1.7860, -1.4704],
        [-5.7470,  1.3525, -2.1631, -1.5948]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▍  | 216/289 [02:41<00:55,  1.32it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44656503200531006, logits - tensor([[-4.0378, -2.2116, -1.6939,  1.1511],
        [-5.3643,  0.8944, -1.8348, -1.5149],
        [-4.5011,  0.5596, -1.7772, -1.3532],
        [-5.8538,  0.8077, -2.1655, -1.4555],
        [-5.0426,  0.4503, -2.0902, -1.0870],
        [-3.4069, -2.4520, -1.2979,  1.1264],
        [-4.9637,  0.7152, -2.6433, -1.3425],
        [-5.1329,  1.2384, -2.4775, -1.5509]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 217/289 [02:42<00:54,  1.32it/s]

Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4817742109298706, logits - tensor([[-5.3889,  0.8369, -2.2716, -1.1427],
        [-5.1892,  0.3526, -1.6293, -1.3769],
        [-4.6639,  0.8563, -1.7167, -0.3891],
        [-5.8478, -0.0189, -2.8856,  0.1337],
        [-4.5003,  1.1167, -2.4630, -0.8461],
        [-5.6662,  0.5824, -2.5880, -2.0827],
        [-4.9578,  0.9981, -2.7623, -1.3432],
        [-5.1257,  0.9726, -2.0176, -1.7910]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 218/289 [02:43<00:53,  1.32it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41291889548301697, logits - tensor([[-4.8218,  0.8151, -2.4551, -1.7487],
        [-4.3560,  0.4839, -1.8157, -1.0544],
        [-4.9995,  0.3881, -2.3841, -1.2411],
        [-4.7877,  1.1803, -1.9104, -1.2045],
        [-6.0141,  0.3935, -2.3412, -1.1460],
        [-5.4972,  0.4706, -2.4983, -1.5949],
        [-4.5322,  0.4288, -2.1434, -1.6279],
        [-3.7624, -3.2409, -1.7199,  1.2607]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:44<00:52,  1.32it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3419472575187683, logits - tensor([[-5.5232e+00,  5.9269e-01, -2.6455e+00, -1.4916e+00],
        [-5.5687e+00,  8.3182e-01, -2.2417e+00, -1.6296e+00],
        [-5.4363e+00,  1.7470e+00, -1.9594e+00, -1.0858e+00],
        [-4.5921e+00,  7.7126e-01, -1.9813e+00, -1.8953e+00],
        [-4.6784e+00,  8.1773e-01, -2.2204e+00, -1.8988e+00],
        [-5.1081e+00,  6.5163e-01, -1.9739e+00, -1.6519e+00],
        [-5.7640e+00,  3.4529e-01, -2.2283e+00, -1.3626e+00],
        [-4.4509e+00, -1.9026e+00, -2.5279e+00,  4.7600e-03]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:44<00:52,  1.33it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4023619592189789, logits - tensor([[-5.7890,  1.0193, -2.1605, -1.5943],
        [-5.1675,  0.9161, -1.8433, -1.5408],
        [-4.2975,  0.3673, -1.6142, -1.6955],
        [-4.6666,  0.3625, -1.9217, -1.8010],
        [-5.3873,  0.6123, -2.3460, -1.2226],
        [-4.5034,  0.1430, -1.9262, -1.1601],
        [-5.0847,  0.7990, -2.1662, -0.7855],
        [-4.5630,  0.3792, -1.7565, -1.0206]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:45<00:51,  1.33it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38122838735580444, logits - tensor([[-2.9303, -3.3275, -1.7498,  2.3636],
        [-3.8407, -3.1745, -1.7360,  1.9095],
        [-4.8427, -0.0368, -2.5348, -1.6265],
        [-5.1339,  0.1690, -2.1942, -0.8168],
        [-4.4208, -2.5320, -2.0060,  2.3043],
        [-4.6946,  0.1815, -2.2565, -0.9573],
        [-5.1672,  0.8294, -1.8188, -1.2794],
        [-2.5601, -2.9946, -1.9615,  2.0525]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:46<00:50,  1.33it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.430057555437088, logits - tensor([[-5.9101,  0.7045, -2.0120, -1.3781],
        [-4.5923,  0.6323, -1.7162, -1.4478],
        [-4.8767,  0.7208, -2.3558, -1.2987],
        [-4.5676,  0.3853, -2.8999, -0.3637],
        [-5.6347,  0.6072, -1.9415, -1.1444],
        [-5.0719,  0.1160, -2.0388, -1.1040],
        [-5.2554,  0.7193, -2.3285, -1.5575],
        [-4.4150,  0.1969, -1.6973, -0.9163]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:47<00:49,  1.33it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3163295388221741, logits - tensor([[-5.5771,  0.3959, -1.8603, -0.9721],
        [-4.8660,  0.5100, -1.6738, -0.9064],
        [-4.5141,  0.1193, -1.6947, -1.1579],
        [-5.8670,  0.5309, -2.4556, -0.8821],
        [-3.5042, -3.3586, -1.8907,  1.6522],
        [-5.8459,  0.8259, -2.4380, -0.6431],
        [-5.3866,  0.5428, -2.8292, -0.5198],
        [-4.9943,  0.9556, -2.5131, -1.2366]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:47<00:48,  1.33it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3637740910053253, logits - tensor([[-4.3238,  0.9281, -1.9505, -0.9144],
        [-4.9621,  0.3722, -2.7347, -0.9119],
        [-5.0921,  0.1289, -2.6328, -1.0413],
        [-3.7406, -2.8773, -2.2453,  1.8592],
        [-5.0463, -0.1563, -2.4473, -0.8701],
        [-5.2233,  0.4128, -2.2409, -0.9109],
        [-6.2849,  0.8266, -2.8299, -1.1812],
        [-5.9753,  0.1219, -2.2175, -1.0009]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:48<00:48,  1.33it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5896949768066406, logits - tensor([[-5.3007,  0.3601, -2.3281, -1.0478],
        [-5.8552,  0.6224, -2.2534, -1.2834],
        [-4.5210,  0.3340, -2.5165, -0.6985],
        [-5.9495,  0.3586, -1.9647, -0.6060],
        [-4.8542,  0.5155, -1.5754, -1.0487],
        [-4.7812, -3.0783, -1.8641,  2.7678],
        [-5.4417,  0.4614, -2.5955, -0.9331],
        [-4.8034,  0.6363, -1.8801, -1.1086]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:49<00:47,  1.33it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.450493186712265, logits - tensor([[-5.0865, -0.1863, -2.2395, -0.8768],
        [-5.2411,  0.9002, -1.9019, -1.0227],
        [-5.2369,  0.3538, -1.8112, -1.5332],
        [-5.0558,  0.6016, -2.3648, -0.9995],
        [-4.3831,  0.8273, -2.1137, -1.0559],
        [-5.8658,  0.8467, -2.3857, -0.9534],
        [-5.3938,  0.7551, -2.3501, -1.0785],
        [-5.5430,  1.0452, -2.6037, -0.6843]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:50<00:46,  1.33it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2989899814128876, logits - tensor([[-5.2342,  0.4100, -2.3025, -0.9349],
        [-4.8163,  0.6656, -1.8766, -1.5940],
        [-5.5885,  0.8536, -1.8783, -0.5810],
        [-4.7337,  0.5773, -1.6987, -1.0261],
        [-5.1524,  0.5335, -2.0883, -0.4784],
        [-5.1601,  0.1958, -1.6295, -1.2444],
        [-5.5434,  0.0418, -1.9704, -0.8717],
        [-3.6913, -3.1511, -1.4773,  1.5401]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:50<00:45,  1.33it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6213014125823975, logits - tensor([[-4.7939,  0.4298, -1.9596, -0.7573],
        [-5.2225, -0.0538, -2.0205, -1.1406],
        [-5.8260,  0.5660, -1.8759, -0.9193],
        [-5.1610,  0.0739, -2.0128,  0.1230],
        [-4.2453, -2.6628, -1.8432,  1.5729],
        [-4.9716,  0.4019, -2.2393, -0.4585],
        [-3.8834, -3.1094, -2.4439,  2.2691],
        [-3.7994, -2.5730, -2.4698,  1.0532]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:51<00:45,  1.33it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36286666989326477, logits - tensor([[-3.3466, -2.7495, -2.1902,  1.8468],
        [-5.2398,  0.5887, -2.2430, -0.7167],
        [-4.7241,  0.0936, -1.8635, -0.6211],
        [-5.0922,  0.2357, -1.7591, -0.8905],
        [-3.8551,  0.8963, -1.7088, -1.1624],
        [-4.6285, -1.4506, -1.9845,  1.2912],
        [-5.5754,  0.6046, -2.1650, -0.8441],
        [-5.4476,  0.2632, -1.7988, -1.3447]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:52<00:44,  1.33it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2706700563430786, logits - tensor([[-5.6349,  0.2495, -1.7972, -0.8498],
        [-5.7628,  0.7176, -2.4732, -1.3632],
        [-4.9593,  0.2413, -2.1629, -0.7556],
        [-5.6233,  1.0684, -2.4425, -0.7581],
        [-3.3168, -2.6584, -2.1494,  2.0033],
        [-5.3350,  0.9026, -2.1160, -0.1103],
        [-5.0392,  0.3496, -1.5874, -0.5411],
        [-3.6679, -1.9406, -2.0719,  1.3675]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:53<00:43,  1.33it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26792410016059875, logits - tensor([[-4.8201, -0.0749, -1.4955, -0.6429],
        [-4.2645,  0.4274, -1.5871, -1.0551],
        [-6.0629,  1.0391, -2.5391, -1.4964],
        [-5.3263,  1.2235, -1.9390, -0.7305],
        [-4.9472,  0.4840, -1.9023, -1.0355],
        [-4.4239, -1.6982, -2.3975,  1.2569],
        [-3.7259, -2.0467, -1.8203,  2.1092],
        [-5.5723,  0.5071, -2.4704, -0.6896]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:53<00:42,  1.33it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3338674008846283, logits - tensor([[-5.5529,  0.4422, -1.3623, -0.7603],
        [-4.9221,  0.6214, -2.3558, -1.2218],
        [-5.1454,  0.5390, -2.2182, -1.2028],
        [-4.8683,  0.1414, -1.8738, -0.9011],
        [-6.0235,  1.0483, -2.3834, -1.1755],
        [-4.8290,  0.3957, -2.1695, -1.2889],
        [-5.4780,  0.8204, -2.3185, -0.5184],
        [-5.5346,  0.5066, -2.1812, -1.3748]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:54<00:42,  1.33it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38059109449386597, logits - tensor([[-5.2696,  0.6160, -1.8831, -1.2190],
        [-3.9856, -2.2077, -2.3543,  1.3817],
        [-4.7071,  1.2461, -1.8640, -1.1316],
        [-4.6350,  0.4385, -1.8338, -1.0787],
        [-3.2157, -1.5428, -1.5649,  0.8717],
        [-5.2473,  0.8364, -2.5540, -1.2848],
        [-5.6640, -0.1542, -2.1751, -1.2173],
        [-4.7545,  0.7901, -1.8420, -0.9626]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:55<00:41,  1.32it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3512827157974243, logits - tensor([[-4.9630,  0.2316, -1.7153, -1.5243],
        [-5.3335,  0.2226, -1.8435, -1.2868],
        [-5.7668,  0.7270, -2.0470, -1.0124],
        [-4.8059,  0.6666, -1.6463, -1.5929],
        [-4.1832, -1.7838, -2.1074,  1.7114],
        [-4.3704, -2.4646, -3.2086,  2.1993],
        [-4.0316,  0.5849, -1.2252, -1.7418],
        [-4.9751,  0.7465, -1.9113, -1.1147]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:56<00:40,  1.32it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3993958830833435, logits - tensor([[-4.2969,  0.4304, -1.7677, -0.9154],
        [-4.3608,  0.9050, -1.3264, -1.3079],
        [-5.1298,  0.4761, -1.8311, -1.2692],
        [-4.2902,  0.7708, -1.8479, -0.6056],
        [-4.9734,  0.5854, -2.0243, -1.1532],
        [-3.8540, -2.9884, -1.7469,  1.4783],
        [-4.9362,  0.9362, -2.2750, -0.9518],
        [-5.3324, -0.2319, -2.2429, -1.0379]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 236/289 [02:56<00:39,  1.33it/s]

Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5559154748916626, logits - tensor([[-4.7378,  0.7337, -1.6379, -1.3295],
        [-4.5891,  0.2765, -1.3160, -1.5742],
        [-5.6881,  0.1687, -1.7327, -1.9601],
        [-4.9554,  0.8442, -1.7349, -1.3432],
        [-5.3953,  1.0193, -2.1032, -2.5976],
        [-5.0074,  0.5735, -1.2358, -1.2245],
        [-4.8673,  0.2988, -1.2106, -1.6015],
        [-4.9037, -0.6577, -2.8580,  0.9883]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 237/289 [02:57<00:39,  1.33it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 82%|████████▏ | 238/289 [02:58<00:38,  1.32it/s]

loss - 0.37235328555107117, logits - tensor([[-2.7475, -0.8175, -1.3642,  1.0385],
        [-5.0834,  0.8380, -1.3228, -1.1848],
        [-5.5430,  1.3488, -1.8167, -1.1006],
        [-5.4073,  1.3510, -2.0579, -1.7139],
        [-5.2353,  0.7937, -1.9307, -2.0664],
        [-4.8096,  0.3550, -2.1619, -1.1138],
        [-4.9995,  0.4344, -2.1545,  0.7145],
        [-4.9847,  0.5710, -1.5156, -1.3644]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.379507839679718, logits - tensor([[-5.0584,  0.2305, -1.8726, -1.8110],
        [-5.2849,  0.3876, -1.9049, -1.3765],
        [-4.5326,  0.3448, -1.2311, -1.0992],
        [-5.6135,  0.7368, -2.1019, -1.3025],
        [-5.4709,  0.7267, -1.8848, -2.0703],
        [-5.9482,  0.5529, -2.7131, -2.1864],
        [-4.1701,  0.7689, -1.0505, -1.5527],
        [-4.1564,  0.6638, -1.2490, -1.6

 83%|████████▎ | 239/289 [02:59<00:37,  1.32it/s]

Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4419984817504883, logits - tensor([[-4.3290,  0.5092, -0.9294, -1.7466],
        [-5.2183,  0.3665, -1.9886, -1.6967],
        [-5.0373,  0.3518, -1.5785, -1.8014],
        [-4.6483,  0.5017, -1.9979, -0.8882],
        [-5.0792,  1.3553, -2.0424, -0.9343],
        [-3.8576,  1.0590, -1.0933, -1.5743],
        [-3.5255, -1.0123, -2.3177,  1.6898],
        [-4.3082, -0.7836, -2.3462,  0.1669]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 240/289 [02:59<00:36,  1.33it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43453899025917053, logits - tensor([[-4.0064,  0.6343, -1.1316, -1.5210],
        [-5.5007,  0.6008, -1.9487, -1.7299],
        [-4.4605, -0.4208, -2.7007,  0.3157],
        [-5.6059,  0.8374, -2.4267, -1.6618],
        [-4.7491,  0.6406, -1.4615, -1.9820],
        [-5.1576,  0.8176, -2.0957, -0.9511],
        [-5.0555,  0.8816, -2.2030, -1.6803],
        [-4.8883,  0.0867, -2.2511,  0.1525]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:00<00:36,  1.33it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.534727156162262, logits - tensor([[-5.3755,  0.9304, -1.9502, -2.4041],
        [-4.6925,  0.8765, -1.3993, -2.0480],
        [-4.7258,  0.8327, -1.9446, -2.0548],
        [-4.9399,  0.3845, -1.9365, -1.4303],
        [-5.3670,  0.6057, -1.6320, -1.2976],
        [-4.9143,  0.7743, -1.3386, -1.5170],
        [-3.6953,  0.7209, -1.7014, -1.6323],
        [-2.8705,  0.0407, -2.3193,  0.2485]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:01<00:35,  1.33it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35784414410591125, logits - tensor([[-4.9789,  0.2313, -0.7629, -1.2355],
        [-5.0687,  0.7210, -1.7470, -1.3813],
        [-4.1004,  0.7378, -0.7565, -1.8704],
        [-5.7291,  0.9691, -1.8240, -1.7473],
        [-5.4171,  0.8450, -1.7396, -2.1939],
        [-6.1662,  1.0598, -2.1530, -1.6469],
        [-4.3324, -1.0118, -2.2190,  0.7685],
        [-5.4075,  0.5318, -1.6942, -2.0671]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:02<00:34,  1.33it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3902655839920044, logits - tensor([[-4.8757,  0.9337, -1.7190, -1.3176],
        [-4.6032,  0.7427, -0.6085, -1.8251],
        [-4.4625,  1.0468, -1.4040, -1.1031],
        [-4.3745,  0.6027, -0.9979, -1.5496],
        [-5.5268,  0.7115, -1.6132, -1.5393],
        [-4.9149,  0.6179, -1.5507, -1.4172],
        [-4.3321,  0.7983, -1.1260, -1.7641],
        [-4.7871,  1.3622, -1.3177, -1.0076]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:02<00:33,  1.33it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.526862621307373, logits - tensor([[-3.8313, -0.1294, -1.2673, -1.7507],
        [-5.5709,  0.5274, -1.7761, -1.4217],
        [-4.6838, -0.5088, -2.2796, -0.0276],
        [-3.6881, -0.5735, -2.4784, -0.0295],
        [-4.4881,  0.4878, -0.9877, -2.6048],
        [-4.7372,  0.9839, -1.7534, -1.8227],
        [-3.5605,  0.7171, -0.7176, -1.8643],
        [-4.5396,  0.1959, -1.3818, -1.7112]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:03<00:33,  1.33it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3448939621448517, logits - tensor([[-5.8202,  0.8038, -2.3100, -1.4445],
        [-4.4100,  0.7235, -1.1875, -1.5225],
        [-5.0452,  0.4635, -1.9960, -2.1054],
        [-3.6884, -1.2517, -2.0906,  0.7608],
        [-4.0225,  0.3362, -0.5349, -1.3087],
        [-4.8584,  0.5337, -1.5735, -0.8066],
        [-5.4801, -0.3461, -2.4151,  0.3679],
        [-4.9709,  0.7662, -1.7811, -1.7291]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:04<00:32,  1.33it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28149861097335815, logits - tensor([[-5.0051,  0.2274, -1.9807, -1.8375],
        [-5.3551,  0.3681, -1.9693, -1.7522],
        [-4.1489,  0.5120, -1.2273, -1.7041],
        [-5.4533,  0.3321, -2.2534, -1.9455],
        [-5.1583,  0.5693, -1.9457, -1.6226],
        [-5.8404,  1.0899, -1.9433, -1.2382],
        [-4.6739, -0.4667, -1.9577, -1.2015],
        [-3.7681, -0.4777, -2.0344,  0.5518]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:05<00:31,  1.33it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4215508699417114, logits - tensor([[-4.5201, -0.0110, -0.8227, -2.1574],
        [-5.6633,  1.2432, -1.9433, -1.9319],
        [-5.6041,  0.7056, -2.6935, -1.7386],
        [-3.5545,  0.1494, -1.0406, -1.3354],
        [-6.2188,  0.9112, -2.1019, -1.2514],
        [-4.5086,  0.5748, -1.0779, -1.5155],
        [-3.8381,  0.2623, -0.8647, -1.8133],
        [-2.9816, -1.2638, -2.5751,  0.7501]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:05<00:30,  1.33it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42427727580070496, logits - tensor([[-3.8607, -1.5957, -2.2772,  1.0934],
        [-4.1635, -1.6052, -2.6119,  0.7519],
        [-5.1971,  0.7975, -2.1049, -1.4241],
        [-5.2146,  0.2387, -2.3031, -0.8373],
        [-6.5638,  0.6528, -2.4751, -2.1360],
        [-5.7761, -0.1166, -2.1921, -1.8085],
        [-5.0002,  0.4009, -1.1485, -1.3002],
        [-4.5949,  0.8878, -1.3651, -2.1853]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:06<00:30,  1.33it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3396022915840149, logits - tensor([[-3.7976, -1.0022, -2.4057,  0.9435],
        [-4.1247, -0.5347, -2.1602,  0.5352],
        [-4.5067,  0.9156, -1.5914, -1.4148],
        [-4.0201, -0.1472, -0.4379, -1.5014],
        [-4.9276,  0.5916, -2.0037, -2.2482],
        [-3.6733,  0.1778, -0.6858, -1.7696],
        [-3.4119, -0.0526, -0.8372, -1.5398],
        [-5.3145,  0.9695, -2.2231, -1.4971]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:07<00:29,  1.32it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41382694244384766, logits - tensor([[-5.0358,  0.8871, -2.1439, -1.4113],
        [-5.4765,  1.3464, -1.7339, -2.1268],
        [-5.1889,  0.7386, -1.8290, -2.1858],
        [-5.7142,  1.5808, -2.8631, -1.2531],
        [-5.5530,  0.9044, -2.1667, -1.3435],
        [-2.9304, -0.6806, -2.2629,  0.5145],
        [-5.0399, -0.1777, -2.2487, -1.7725],
        [-4.3879,  0.2460, -1.2010, -2.0740]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:08<00:28,  1.32it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3153403401374817, logits - tensor([[-4.7594,  0.6136, -1.9696, -1.4641],
        [-3.9831, -1.4160, -2.2659,  0.9243],
        [-5.9467,  0.6548, -2.2472, -1.9003],
        [-3.5832,  0.1057, -0.2936, -1.5268],
        [-5.8272,  0.6731, -2.7744, -1.4409],
        [-3.9940, -1.1908, -1.9361,  1.0349],
        [-5.7519,  0.8000, -2.4925, -1.2808],
        [-5.0424,  0.2113, -2.6141, -1.5658]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:08<00:27,  1.33it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 88%|████████▊ | 253/289 [03:09<00:27,  1.32it/s]

loss - 0.3457241654396057, logits - tensor([[-4.8845,  0.9609, -2.1217, -1.0838],
        [-6.0203,  1.8419, -2.5982, -2.1162],
        [-6.0236,  1.4541, -2.8401, -1.4012],
        [-6.2978,  0.7111, -2.7040, -1.7176],
        [-5.4132,  0.6090, -2.6689, -0.7459],
        [-3.7255, -1.6315, -2.6015,  1.2665],
        [-5.8447,  1.2889, -2.8309, -1.9798],
        [-5.3218,  1.3315, -2.3589, -1.1210]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30537912249565125, logits - tensor([[-5.7487,  1.3513, -2.6523, -1.8348],
        [-4.2632,  0.1188,  0.0541, -1.6229],
        [-3.0894, -0.0126, -0.2989, -1.0295],
        [-3.5297,  0.0074, -0.6120, -1.2128],
        [-3.8953, -0.6143, -0.9372, -1.6667],
        [-3.6545, -1.1840, -2.2178,  1.2169],
        [-5.6795,  0.9478, -2.2318, -0.9786],
        [-6.3099,  1.5736, -2.4315, -1.

 88%|████████▊ | 254/289 [03:10<00:26,  1.32it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35383808612823486, logits - tensor([[-5.3116,  0.7465, -2.2432, -1.4192],
        [-5.8481,  0.4281, -2.9924, -1.7036],
        [-6.2655,  1.1782, -2.7107, -1.0687],
        [-5.0167,  0.8376, -2.1718, -1.2471],
        [-5.3091,  0.7472, -2.3268, -1.6505],
        [-6.2684,  1.3772, -3.0413, -1.7114],
        [-3.8911, -0.3731, -0.5220, -1.5017],
        [-6.0106,  0.8768, -2.2300, -2.0892]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:11<00:25,  1.32it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4198400676250458, logits - tensor([[-7.3146,  1.7769, -2.4208, -0.9762],
        [-4.3291, -2.2061, -1.6157,  0.6548],
        [-5.3807,  0.7650, -2.3977, -0.5289],
        [-3.1673, -0.2864,  0.2268, -1.2278],
        [-6.0649,  0.5854, -2.6700, -0.7846],
        [-3.6748, -1.7251, -2.3081,  1.1583],
        [-6.3620,  1.2407, -2.5321, -0.8763],
        [-5.9488,  1.7710, -2.5188, -1.1652]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:12<00:25,  1.32it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28594011068344116, logits - tensor([[-5.0839,  0.4485, -2.1124, -1.0654],
        [-4.0817, -0.3568,  0.0300, -1.4491],
        [-5.6581,  0.7737, -2.4370, -1.3036],
        [-4.3418, -1.4449, -1.1111,  0.3742],
        [-3.7033,  0.2104,  0.0587, -1.9630],
        [-5.7810,  0.9953, -2.8624, -1.0860],
        [-4.0441, -1.3453, -2.0364,  1.3507],
        [-5.1065,  1.3053, -2.2495, -0.8950]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:12<00:24,  1.32it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3053141236305237, logits - tensor([[-5.6756,  1.9473, -3.0863, -0.8881],
        [-5.7855,  1.3896, -2.5534, -1.5185],
        [-3.3238, -1.9462, -2.0157,  1.3677],
        [-5.4657,  1.1732, -3.4768, -0.8652],
        [-3.1403, -0.3330,  0.3126, -1.9803],
        [-5.8705,  1.5640, -2.8819, -0.2737],
        [-5.7803,  1.9651, -2.9660, -0.3077],
        [-4.6701, -1.3148, -0.9326, -0.6723]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 258/289 [03:13<00:23,  1.31it/s]

Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23995532095432281, logits - tensor([[-6.0476,  0.8680, -2.8025, -1.4524],
        [-5.2722,  1.0564, -3.3915, -0.7521],
        [-6.4243,  1.3890, -3.2764, -1.6541],
        [-4.8019, -1.9084, -2.2325,  1.6079],
        [-5.0268,  0.8227, -2.9125, -1.0266],
        [-3.5885, -0.4420,  0.4641, -1.5471],
        [-5.9453,  1.5455, -2.9806, -1.4144],
        [-3.5322, -0.8525,  0.3980, -1.7494]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 259/289 [03:14<00:22,  1.32it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2742130756378174, logits - tensor([[-6.4268,  1.6753, -3.1822, -1.9123],
        [-5.1032,  0.7898, -2.9007, -0.9411],
        [-5.8288,  0.9205, -2.8661, -1.1880],
        [-2.9200, -0.6506,  0.7467, -1.3891],
        [-3.9474, -2.4115, -1.2450,  0.7053],
        [-6.4081,  0.9814, -2.2665, -0.5138],
        [-5.6426, -1.6958, -2.1513,  1.1536],
        [-5.7552,  1.4438, -3.8006, -1.2552]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 260/289 [03:15<00:21,  1.33it/s]

Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23987406492233276, logits - tensor([[-5.7086,  0.8768, -3.5801, -0.8592],
        [-5.5716,  0.6551, -3.2996, -0.7785],
        [-5.2715,  0.7987, -3.2141, -1.2561],
        [-6.1206,  1.6473, -3.1462, -0.9553],
        [-5.8085,  1.4615, -3.4004, -1.0113],
        [-3.5316, -0.6906,  0.2924, -1.7622],
        [-5.8521,  0.8524, -2.9616, -1.3757],
        [-6.0535,  1.3008, -2.8295, -1.5046]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|█████████ | 261/289 [03:15<00:21,  1.33it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14703300595283508, logits - tensor([[-3.4896, -1.3482,  0.2292, -1.4786],
        [-5.0894,  1.3983, -2.8264, -1.2275],
        [-5.3565,  1.3602, -2.9037, -1.1577],
        [-5.2108,  2.5933, -3.0259, -1.3502],
        [-5.5291,  0.9343, -2.8860, -0.8779],
        [-4.6154, -1.8015, -2.3529,  1.2057],
        [-5.8531,  1.5240, -3.3844, -1.1639],
        [-5.8989,  1.3818, -2.8424, -1.7785]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:16<00:20,  1.33it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3146229386329651, logits - tensor([[-6.2004,  1.6049, -3.6942, -0.9875],
        [-5.3747,  1.0938, -2.9077, -1.6581],
        [-5.5618,  0.6041, -2.7886, -1.2563],
        [-5.8665,  1.4439, -2.2549, -2.1511],
        [-5.5837,  0.8892, -3.0094, -1.1799],
        [-5.3892,  1.5709, -2.9504, -2.3399],
        [-4.8975,  1.1936, -3.5426, -0.6352],
        [-6.8316,  1.7312, -3.7999, -1.7372]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:17<00:19,  1.33it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32051289081573486, logits - tensor([[-2.6345, -0.7871,  0.4871, -1.6932],
        [-5.0887,  1.1860, -3.2202, -1.2078],
        [-5.6677,  1.5083, -3.6064, -1.5062],
        [-5.9865,  1.4147, -3.4241, -1.5136],
        [-6.1763,  1.9637, -3.5598, -1.3699],
        [-5.0638,  1.2120, -2.5949, -1.3233],
        [-5.3600,  2.1049, -3.4943, -1.5829],
        [-4.8513,  1.9798, -3.0474, -1.8570]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████▏| 264/289 [03:18<00:18,  1.33it/s]

Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4225366711616516, logits - tensor([[-4.8187,  1.4522, -2.4020, -1.1908],
        [-5.1567,  1.5413, -2.2543, -1.8860],
        [-3.2316, -1.0607,  0.7187, -1.9022],
        [-5.7038,  1.5561, -3.1111, -0.9454],
        [-5.4828,  1.7624, -3.2037, -1.2989],
        [-3.4335, -0.8006,  0.9641, -1.4191],
        [-5.2707,  1.3986, -2.6385, -1.5411],
        [-5.8793,  1.3300, -3.0474, -1.3949]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 265/289 [03:18<00:18,  1.33it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3122203052043915, logits - tensor([[-5.8198,  1.8845, -3.4640, -1.2386],
        [-5.0688,  1.6131, -3.9188, -1.1977],
        [-4.3338,  1.6135, -2.7334, -1.4822],
        [-5.0074,  1.4002, -2.9365, -0.9642],
        [-2.9025, -0.9694,  0.9377, -1.6824],
        [-6.6790,  1.1997, -2.8167, -1.7230],
        [-5.4693,  1.6458, -3.4210, -1.7575],
        [-3.4984, -1.2497,  0.8196, -1.7402]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 266/289 [03:19<00:17,  1.33it/s]

Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3344871997833252, logits - tensor([[-5.1780, -1.7714, -2.8342,  0.6436],
        [-5.1725,  1.9304, -3.8180, -1.1519],
        [-5.6135,  1.3911, -3.7869, -1.1165],
        [-6.2152,  1.2258, -2.4935, -0.4561],
        [-5.2621,  2.0447, -3.4810, -0.3789],
        [-3.6912, -1.2760,  1.2308, -1.6967],
        [-5.6019,  1.0823, -2.8651, -1.1015],
        [-5.5339,  0.1748, -2.8184, -0.8644]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 267/289 [03:20<00:16,  1.33it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23320288956165314, logits - tensor([[-5.4296, -0.3746, -2.9567, -0.9041],
        [-4.9709,  1.6344, -3.1007, -0.9606],
        [-5.3300,  1.6105, -3.7641, -1.1345],
        [-5.0032,  1.8112, -3.4290, -1.1507],
        [-5.8139,  1.7669, -3.3931, -1.3093],
        [-5.9226,  1.4900, -3.4271, -1.3170],
        [-5.0088,  0.8181, -2.6822, -1.0121],
        [-5.1691,  1.6030, -3.2767, -0.8088]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:21<00:15,  1.33it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2327134758234024, logits - tensor([[-3.4413, -2.2129, -2.4579,  1.0243],
        [-6.2647,  0.7515, -3.6520, -1.0370],
        [-2.7165, -1.5346,  0.9717, -2.1280],
        [-3.8163, -1.6561, -2.1443,  0.2465],
        [-5.9180,  0.6861, -3.2678, -1.0148],
        [-5.1988,  0.9702, -3.2231, -1.3062],
        [-5.2948,  1.7593, -3.2688, -1.0657],
        [-4.6048,  1.5476, -3.4060, -1.2323]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:21<00:15,  1.33it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5129604339599609, logits - tensor([[-4.7931,  1.3571, -3.4949, -1.2385],
        [-5.3748,  1.8317, -3.6785, -1.9597],
        [-2.2877, -1.3003,  0.9440, -0.9947],
        [-5.3122,  2.2286, -4.0000, -1.5887],
        [-4.9089,  1.5280, -3.0107, -1.7427],
        [-4.8654,  1.3014, -3.5498, -0.9042],
        [-4.8428,  0.9807, -3.2468, -1.4501],
        [-5.7662,  2.1297, -3.8059, -1.0371]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 270/289 [03:22<00:14,  1.33it/s]

Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.142452210187912, logits - tensor([[-5.0204, -1.2381, -2.9629,  0.4176],
        [-3.6982, -3.2038, -2.3168,  0.9609],
        [-5.1798,  1.8571, -3.6215, -1.5917],
        [-4.7800,  1.7272, -2.9255, -0.9983],
        [-4.2996, -1.2901, -2.7888,  0.4986],
        [-4.5591,  1.7261, -3.7659, -0.7591],
        [-4.3819,  1.7714, -3.4002, -1.7268],
        [-3.0888, -1.5639,  1.2331, -1.9741]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 271/289 [03:23<00:13,  1.33it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 94%|█████████▍| 272/289 [03:24<00:12,  1.33it/s]

loss - 0.2730560600757599, logits - tensor([[-4.3346, -2.9078, -2.0400,  1.3594],
        [-5.9589, -0.6864, -4.2935, -0.0064],
        [-5.8121,  1.5325, -3.5762, -1.4028],
        [-3.1779, -1.4797,  1.5326, -1.5668],
        [-4.1551,  1.3398, -3.6229, -1.4132],
        [-4.4254,  1.3836, -3.4115, -1.3682],
        [-3.4852, -1.8102,  0.4761, -1.8573],
        [-4.8634,  0.4786, -3.8042, -1.5123]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18544068932533264, logits - tensor([[-5.0882,  0.6609, -3.7736, -1.5666],
        [-4.8203,  1.0313, -3.3873, -0.9281],
        [-5.9939, -0.1031, -4.1487,  1.3735],
        [-4.7911,  0.9937, -3.2617, -1.0637],
        [-4.9383, -2.3257, -2.3451,  1.3853],
        [-5.0238,  1.4897, -3.6962, -0.5303],
        [-4.0535,  0.5614, -2.8267, -0.1931],
        [-5.0153,  0.5208, -3.1264, -0.

 94%|█████████▍| 273/289 [03:24<00:12,  1.33it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 95%|█████████▍| 274/289 [03:25<00:11,  1.33it/s]

loss - 0.3288469910621643, logits - tensor([[-5.8220,  1.4808, -4.5058, -1.1446],
        [-4.7281,  2.1423, -3.3773, -1.4777],
        [-3.0576, -2.1028,  1.3288, -1.3315],
        [-5.2168,  1.5750, -4.1707, -1.4637],
        [-5.3159,  0.8160, -3.9164, -0.5619],
        [-5.5696,  1.5855, -4.5540, -1.1993],
        [-3.2287, -1.8998,  0.8929, -1.1182],
        [-5.0062,  0.6664, -3.4171, -1.1011]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20290333032608032, logits - tensor([[-5.4076,  1.5477, -4.1594, -1.5456],
        [-5.8721,  1.2046, -3.8749, -1.2486],
        [-5.7767,  1.0072, -4.2413, -1.5090],
        [-5.0544,  0.6459, -3.2086, -0.7809],
        [-5.8842,  1.3809, -3.7255, -2.0962],
        [-5.5379,  0.4462, -3.4460, -0.7372],
        [-5.8221,  1.2673, -4.1996, -2.1007],
        [-4.9335,  0.4936, -3.3466, -1.

 95%|█████████▌| 275/289 [03:26<00:10,  1.32it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23109060525894165, logits - tensor([[-5.5094,  0.9696, -3.9652, -1.3242],
        [-4.6981,  0.7235, -4.2481, -0.3475],
        [-5.5914,  0.7585, -4.4340, -0.7883],
        [-4.9781,  0.0095, -2.9422, -0.9343],
        [-5.4970,  0.9766, -4.2875, -1.8474],
        [-4.4466, -1.9280, -2.8400,  1.1917],
        [-3.1383, -2.7474, -1.8118,  1.2237],
        [-5.8029,  0.8772, -3.6307, -0.9652]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:27<00:09,  1.32it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27469944953918457, logits - tensor([[-4.9435,  0.9042, -3.9499, -1.5595],
        [-5.4599,  0.8952, -3.8412,  0.0097],
        [-2.6971, -1.4139,  1.5751, -1.4638],
        [-5.5598,  0.7790, -4.7075, -1.5042],
        [-5.1166,  1.0572, -3.6243, -1.2773],
        [-6.0610,  0.8765, -4.1232, -1.3471],
        [-5.3931,  0.9302, -3.9476, -1.1467],
        [-5.6117,  0.6785, -4.0859, -0.4538]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:27<00:09,  1.32it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3753015398979187, logits - tensor([[-3.5544, -1.8779,  1.4186, -2.1429],
        [-5.3459,  1.1917, -4.3817, -1.0208],
        [-5.3577,  1.6286, -4.0876, -1.5612],
        [-4.6617,  1.6708, -3.2331, -1.5301],
        [-7.1464,  0.4357, -2.8175, -1.0657],
        [-3.2703, -1.7947,  1.5320, -2.1805],
        [-5.4441,  1.6560, -4.1446, -1.3373],
        [-5.7112,  0.8012, -3.4456, -1.2648]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:28<00:08,  1.32it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22626030445098877, logits - tensor([[-3.1320, -1.7946,  1.4116, -1.3370],
        [-4.9402,  0.6505, -3.9515, -0.9503],
        [-3.2132, -1.5655,  1.1664, -2.0219],
        [-6.2757,  1.0573, -3.7817, -0.7211],
        [-3.0245, -1.7828,  1.1631, -2.0372],
        [-3.2895, -1.9512,  1.5393, -1.7975],
        [-5.7329,  0.8405, -3.8965, -1.3705],
        [-5.2311,  0.7795, -4.0253, -1.8828]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 279/289 [03:29<00:07,  1.33it/s]

Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14290817081928253, logits - tensor([[-6.2496,  1.2183, -4.6934, -1.3075],
        [-4.8100,  0.9818, -4.0361, -1.4531],
        [-4.2415, -3.3163, -2.5442,  2.2057],
        [-5.2282,  0.8741, -4.1227, -1.3284],
        [-3.1361, -1.8394,  1.4399, -1.5208],
        [-5.3071,  1.2908, -3.9746, -0.6198],
        [-5.1799,  0.4080, -3.0305, -1.0694],
        [-5.1261,  0.9940, -4.1953, -1.6015]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 280/289 [03:30<00:06,  1.33it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38955187797546387, logits - tensor([[-3.5745, -2.8236, -2.0879,  1.4793],
        [-5.2843,  0.6462, -3.7321, -1.3189],
        [-5.5539,  1.0002, -4.4070, -1.7754],
        [-5.4819,  1.2960, -4.4797, -1.2319],
        [-5.6351,  1.0640, -4.5772, -1.3177],
        [-4.5510, -1.4672, -2.8540,  1.5679],
        [-5.3846,  1.2007, -5.0696, -1.9336],
        [-5.5765,  0.5503, -4.1679, -0.8922]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 281/289 [03:30<00:06,  1.33it/s]

Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11040191352367401, logits - tensor([[-3.6569, -2.1588, -2.2756,  1.6761],
        [-3.3866, -2.1067,  1.3292, -2.4067],
        [-3.2325, -3.1851, -3.1730,  1.9587],
        [-5.1002,  1.6129, -4.1840, -1.6918],
        [-5.6887,  1.6886, -4.1486, -0.8993],
        [-4.4856,  0.9882, -3.8068, -1.6634],
        [-5.2385,  1.2228, -4.3167, -1.1769],
        [-4.1029,  1.6929, -3.9647, -1.3381]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 282/289 [03:31<00:05,  1.33it/s]

Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3209492862224579, logits - tensor([[-5.3847,  1.6429, -4.5758, -1.0685],
        [-3.9590, -2.7136, -1.9180,  1.9004],
        [-4.3704, -2.5718,  1.3348, -2.4758],
        [-4.8397, -3.3368, -2.0764,  2.1874],
        [-3.5062, -2.2322,  1.5029, -1.8971],
        [-4.7217,  1.0909, -4.0473, -1.5143],
        [-6.1151,  0.8830, -3.6997, -1.3174],
        [-4.9625,  1.3518, -3.8263, -1.6358]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 283/289 [03:32<00:04,  1.33it/s]

Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36339810490608215, logits - tensor([[-4.9139,  1.8539, -4.8343, -1.7734],
        [-3.4144, -1.9253,  1.5963, -2.2117],
        [-3.1805, -1.5818,  0.9664, -2.0037],
        [-3.0232, -2.4295,  1.3581, -2.1721],
        [-4.6744,  1.7891, -4.1571, -0.6114],
        [-4.2517,  1.2069, -3.8886, -1.4133],
        [-5.3898,  2.2135, -5.4078, -1.2507],
        [-4.8375,  1.4724, -4.1331, -0.7346]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 284/289 [03:33<00:03,  1.33it/s]

Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22654606401920319, logits - tensor([[-5.2038, -1.2403, -3.1781,  1.4529],
        [-4.4726,  2.4385, -4.2974, -1.4497],
        [-5.1432,  2.3743, -5.2653, -1.4466],
        [-4.9866,  1.0956, -4.3971, -0.7973],
        [-5.9370,  1.2800, -4.5157, -2.3178],
        [-3.7780, -1.8663, -2.6139,  2.6986],
        [-3.9162, -2.0266,  1.7012, -1.8005],
        [-5.0771,  1.7053, -5.1030, -1.0138]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▊| 285/289 [03:33<00:03,  1.33it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25077003240585327, logits - tensor([[-5.8458,  1.5420, -4.6827, -1.1774],
        [-4.9732,  1.4982, -3.6516, -1.5280],
        [-5.9007,  1.7503, -4.2277, -1.4215],
        [-5.2795,  1.2341, -4.4624, -1.9674],
        [-3.2191, -1.5028,  2.2357, -2.3063],
        [-3.6011, -2.2390,  1.7140, -2.0018],
        [-3.4173, -1.9489,  1.3229, -2.6094],
        [-5.6687,  1.7111, -5.1005, -1.1480]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:34<00:02,  1.33it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2572783827781677, logits - tensor([[-5.4213,  2.1824, -4.4877, -1.9893],
        [-5.2756, -0.0835, -3.6993,  0.9401],
        [-4.2609, -2.4070,  1.5552, -2.2525],
        [-4.6715, -1.3979, -3.2252,  0.9870],
        [-4.6255,  1.0557, -4.2986, -1.6790],
        [-4.0879, -1.9779, -3.0903,  1.1483],
        [-4.5284,  1.5275, -3.8724, -1.7447],
        [-5.0715,  0.9831, -4.3406, -1.9227]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:35<00:01,  1.33it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12490174919366837, logits - tensor([[-5.2433,  0.9541, -5.0721, -1.9939],
        [-5.0935,  1.3225, -3.5852, -1.6412],
        [-3.4307, -1.7421,  1.2943, -1.5143],
        [-4.8501,  1.3661, -3.6809, -1.7424],
        [-3.8090, -1.8325,  0.8243, -2.1769],
        [-4.7496,  1.6990, -3.7725, -1.4604],
        [-3.4213, -1.8266,  1.4860, -2.0320],
        [-4.0618,  1.8020, -3.7882, -1.1325]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:36<00:00,  1.33it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12934643030166626, logits - tensor([[-4.7725,  1.6071, -4.2974, -1.1616],
        [-4.1101, -2.0094,  0.5531, -2.2124],
        [-5.7400,  1.4217, -5.0584, -1.9076],
        [-4.5836,  1.1673, -4.6230, -1.3175]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:36<00:00,  1.33it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:50,  3.82it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:47,  4.00it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:46,  4.08it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:48,  3.93it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:47,  4.01it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:46,  4.06it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:45,  4.08it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:01<00:46,  4.02it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:46,  4.00it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  3.98it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:45,  4.00it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:02<00:45,  4.01it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  3.99it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:45,  4.00it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:44,  4.03it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:03<00:43,  4.05it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:44,  4.02it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:43,  4.01it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:43,  4.01it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:04<00:43,  4.01it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:43,  4.00it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:43,  3.98it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:42,  4.00it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:05<00:42,  3.97it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:43,  3.92it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  3.94it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:41,  3.99it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:06<00:41,  4.00it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.99it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:40,  4.01it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:40,  3.98it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:40,  3.99it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:40,  4.02it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:39,  4.01it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:39,  4.02it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:39,  3.98it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:39,  3.97it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.97it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.96it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:38,  3.97it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:38,  3.97it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.97it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:38,  3.96it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:37,  3.97it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:37,  3.97it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:37,  3.96it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.96it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:36,  4.00it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.98it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  4.00it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:35,  4.01it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:35,  3.99it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:35,  3.99it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:35,  3.99it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:34,  3.99it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:34,  3.98it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:34,  3.97it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  3.97it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:34,  3.96it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:33,  4.00it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:33,  4.01it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:32,  4.02it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:15<00:32,  4.00it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  4.01it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  4.00it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:31,  4.02it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:16<00:31,  4.02it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  4.01it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  4.03it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:30,  4.03it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:17<00:30,  4.00it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  4.02it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.95it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  3.95it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:18<00:29,  3.97it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:29,  3.95it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.93it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.95it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:19<00:28,  3.98it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  3.96it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  3.97it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:27,  4.01it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:20<00:27,  3.99it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  3.94it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  3.94it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:27,  3.96it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:21<00:26,  3.96it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  3.98it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  3.96it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:26,  3.95it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:22<00:25,  3.97it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  3.98it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  3.94it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  3.99it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:23<00:24,  4.02it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.99it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  4.01it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:23,  4.03it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:24<00:23,  4.00it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  4.00it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  4.00it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  4.00it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:25<00:22,  4.03it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:22,  4.03it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  4.02it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:21,  4.01it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:26<00:21,  4.04it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  4.02it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  4.02it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:20,  4.02it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:27<00:20,  4.00it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  4.01it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  4.03it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:19,  4.00it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:28<00:19,  3.99it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.99it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  4.00it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  4.00it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:29<00:18,  4.02it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  4.02it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  4.02it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:17,  4.00it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:30<00:17,  4.00it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  4.01it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  4.00it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:16,  4.01it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:31<00:16,  4.01it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  4.02it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  4.00it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:16,  3.98it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:32<00:15,  3.98it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  4.01it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.99it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:33<00:15,  3.96it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:33<00:15,  3.93it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.96it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.93it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:34<00:14,  3.94it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:34<00:13,  3.93it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  3.91it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.92it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:35<00:13,  3.91it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:35<00:13,  3.90it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.95it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.96it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:36<00:12,  3.98it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:36<00:11,  3.97it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  3.96it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  3.95it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:37<00:11,  3.97it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:37<00:10,  3.97it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  3.95it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  3.97it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:38<00:10,  4.00it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:38<00:09,  3.95it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  4.00it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  4.03it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:39<00:08,  4.02it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:39<00:08,  3.99it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  4.01it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  4.01it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:40<00:07,  4.01it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:40<00:07,  3.98it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  3.99it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  4.02it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:41<00:06,  4.04it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:41<00:06,  4.04it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  4.03it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  4.03it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:42<00:05,  4.04it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:42<00:05,  4.03it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  4.02it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  4.02it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:43<00:04,  4.00it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:43<00:04,  3.99it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.98it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.95it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:44<00:04,  3.97it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:44<00:03,  4.01it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  4.00it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  4.01it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:45<00:02,  4.02it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:45<00:02,  4.02it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  4.01it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  4.00it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:46<00:02,  3.97it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:46<00:01,  4.00it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  3.99it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  4.00it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:47<00:01,  3.99it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:47<00:00,  3.98it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  4.01it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  3.99it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:48<00:00,  3.98it/s]


[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 925, 'tn': 327, 'fp': 36, 'fn': 264}, {'tp': 155, 'tn': 1363, 'fp': 5, 'fn': 29}, {'tp': 147, 'tn': 1102, 'fp': 284, 'fn': 19}]
Detailed accuracy after 0 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.8067010309278351
yes_no accuarcy: 0.9780927835051546
abstractive accuarcy: 0.8047680412371134
Overall accuarcy: 0.8973904639175257
Best accuarcy: 0
0.8973904639175257
Model Updated


  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.302915096282959, logits - tensor([[-5.3461,  2.3689, -4.0576, -2.2258],
        [-4.1762, -1.9196,  1.5113, -1.9857],
        [-5.8166,  1.6375, -4.5721, -1.9064],
        [-5.0293, -2.6163, -3.4506,  1.8821],
        [-4.3807,  1.8856, -3.6000, -1.4023],
        [-4.1211, -2.4952,  1.1960, -2.2317],
        [-6.0795,  2.0819, -5.0301, -1.5136],
        [-4.4570,  2.2543, -3.9370, -1.4111]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:00<03:58,  1.21it/s]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19387951493263245, logits - tensor([[-4.5252,  1.8650, -4.1438, -1.8140],
        [-5.7720,  1.6883, -4.6045, -2.1996],
        [-4.8951,  2.1311, -4.2554, -1.7145],
        [-5.6139,  1.9625, -4.8251, -2.6613],
        [-5.4744,  2.3912, -4.2481, -1.6645],
        [-5.4409,  2.1979, -3.9556, -1.8293],
        [-5.0648,  1.6520, -4.4451, -1.5384],
        [-5.7312, -1.4467, -3.1427,  1.4832]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:01<03:44,  1.28it/s]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26335662603378296, logits - tensor([[-3.7658, -1.5238,  1.4003, -1.7859],
        [-4.7214,  1.1615, -3.4274, -1.5602],
        [-4.8553, -3.0747, -2.6013,  1.7987],
        [-5.9784, -0.2485, -2.1942, -2.3010],
        [-5.2232,  1.8759, -4.4782, -2.5099],
        [-5.1161,  2.6344, -4.0572, -1.2017],
        [-3.7949, -1.6908,  1.5129, -2.5020],
        [-5.5209,  1.9704, -4.9335, -1.8155]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:02<03:39,  1.31it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18715956807136536, logits - tensor([[-4.3695,  1.5012, -4.2032, -2.1508],
        [-3.2934, -1.9296,  1.7897, -1.9146],
        [-5.4752,  1.8775, -4.5814, -1.7193],
        [-5.2670,  2.2973, -4.6697, -2.0770],
        [-4.1390, -3.5525, -2.9263,  2.7854],
        [-5.1028,  1.9625, -4.5536, -1.6166],
        [-4.3764, -2.2030, -3.1402,  2.0842],
        [-4.9921,  2.2731, -3.9399, -1.6777]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:03<03:36,  1.32it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21563851833343506, logits - tensor([[-3.1799, -2.2565,  1.1595, -1.4787],
        [-3.9451, -2.3505, -3.2681,  2.0498],
        [-4.6794,  1.6895, -4.8418, -1.8153],
        [-5.2860,  2.1048, -4.7432, -2.3895],
        [-3.7180, -3.5614, -2.8485,  1.9728],
        [-2.8285, -2.0742,  0.6142, -1.4169],
        [-6.0409,  1.7387, -4.6231, -2.5419],
        [-5.0053,  1.3418, -3.5710, -2.4550]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:03<03:33,  1.33it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17810913920402527, logits - tensor([[-5.7890,  1.6965, -4.3162, -1.3523],
        [-4.6173,  2.7238, -4.0422, -1.0950],
        [-3.7696, -2.1515,  1.5805, -1.5283],
        [-5.0083,  1.5126, -4.2579, -2.0984],
        [-5.2483,  1.8643, -4.0228, -1.7878],
        [-4.9430,  1.5753, -4.0655, -1.7008],
        [-5.2829,  1.4380, -3.7592, -1.1021],
        [-4.7284,  1.4904, -4.1217, -2.8579]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:04<03:33,  1.33it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11909127235412598, logits - tensor([[-5.4813, -0.1014, -3.2773, -1.4309],
        [-5.7487,  1.6674, -3.8446, -2.2901],
        [-5.7309,  1.8434, -4.2926, -1.6865],
        [-5.3786,  0.3731, -1.5236, -2.0897],
        [-5.3978,  2.1133, -3.7899, -1.2460],
        [-4.3978, -1.7263, -3.2958,  2.3219],
        [-4.1892,  1.7642, -4.2236, -1.1634],
        [-5.7703,  2.2586, -4.5474, -2.6501]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:05<03:32,  1.33it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3464809060096741, logits - tensor([[-5.3762,  1.0503, -3.6871, -0.9640],
        [-4.0201, -1.6657,  1.1135, -1.8792],
        [-3.7271, -2.8785,  1.1843, -1.8412],
        [-4.9924, -2.9236,  0.6695, -2.2843],
        [-6.2835,  1.3726, -4.0125, -1.9513],
        [-5.8184,  1.7837, -4.4151, -2.9481],
        [-3.9478, -1.8416,  1.0781, -1.6319],
        [-3.8217, -2.4001,  1.4292, -2.7499]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 8/289 [00:06<03:31,  1.33it/s]

Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30164197087287903, logits - tensor([[-5.3546,  1.5874, -4.5991, -1.9521],
        [-5.6913,  2.3311, -4.5634, -2.1161],
        [-5.6605,  1.8504, -3.8253, -2.1028],
        [-4.1095, -2.6709,  1.8736, -1.7943],
        [-4.1422, -2.0143, -3.1528,  1.9543],
        [-4.7264,  1.9746, -2.8131, -1.7088],
        [-5.4245,  1.0595, -2.9115, -1.8584],
        [-4.5934, -2.7048,  1.6480, -2.0891]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 9/289 [00:06<03:30,  1.33it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5486949682235718, logits - tensor([[-3.8324, -2.4346,  1.7677, -2.1481],
        [-5.7900, -0.2389, -2.9715, -1.6294],
        [-4.6405, -1.7281,  0.2271, -2.3352],
        [-4.2963, -1.4335,  0.4891, -2.1881],
        [-4.3438, -2.5887,  1.2049, -2.2332],
        [-4.7653,  1.7114, -4.3521, -2.0770],
        [-4.8944, -1.8797,  0.8636, -2.4766],
        [-6.0049,  0.3609, -2.7472, -1.8750]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 10/289 [00:07<03:29,  1.33it/s]

Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31672006845474243, logits - tensor([[-5.4221,  1.7322, -4.5517, -2.4649],
        [-5.4187,  1.8089, -4.6555, -1.9453],
        [-5.2675,  0.8184, -3.8461, -2.4216],
        [-5.3080,  1.8409, -3.5191, -2.0236],
        [-5.3441,  1.6823, -4.6012, -2.1619],
        [-4.7973, -2.7445,  1.5417, -2.1128],
        [-4.2060, -1.5873, -3.6401,  1.9359],
        [-5.2297, -2.3720, -3.5789,  1.3574]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 11/289 [00:08<03:28,  1.33it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  4%|▍         | 12/289 [00:09<03:27,  1.33it/s]

loss - 0.35371965169906616, logits - tensor([[-5.3754,  1.5663, -4.2793, -1.7123],
        [-4.3087,  1.2534, -3.8445, -2.4596],
        [-5.9794, -2.5607, -3.9253,  2.3522],
        [-5.0558, -3.1929, -2.7770,  3.1969],
        [-4.0792, -2.3291,  1.4295, -1.2426],
        [-5.2704,  2.0754, -4.1273, -1.0653],
        [-4.4875, -2.9776, -2.9289,  3.5109],
        [-3.6293, -2.3195,  1.3600, -2.1253]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22924908995628357, logits - tensor([[-3.9736, -2.5976,  2.1069, -1.4752],
        [-4.2264, -3.1256,  1.9054, -2.2246],
        [-4.3786,  2.4820, -3.6272, -2.0302],
        [-4.4248,  1.6152, -4.6459, -2.2263],
        [-5.5802,  2.3253, -4.6075, -2.8721],
        [-4.0408, -2.8578, -3.3941,  2.2948],
        [-5.1015,  1.9734, -4.4124, -1.5661],
        [-5.2316,  2.1039, -4.2856, -2.

  4%|▍         | 13/289 [00:09<03:27,  1.33it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24084070324897766, logits - tensor([[-4.8386,  2.5642, -4.5085, -1.2707],
        [-5.5327,  1.3821, -4.6293, -1.9868],
        [-4.7008,  2.3962, -4.5054, -1.8741],
        [-4.1569, -2.9467,  1.0226, -1.6462],
        [-4.3460,  1.1262, -4.4583, -1.0525],
        [-5.0691,  1.8702, -3.9594, -2.1841],
        [-4.9370,  1.0038, -4.4787, -1.2942],
        [-5.4558,  2.3352, -4.2844, -1.7488]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:10<03:27,  1.33it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2011812925338745, logits - tensor([[-4.8116, -1.9317, -3.8853,  2.4828],
        [-4.5117,  2.1273, -3.7658, -2.2543],
        [-5.0021,  2.2975, -4.6410, -1.4167],
        [-5.3889,  1.4273, -4.1292, -1.7509],
        [-4.4166,  2.4534, -4.0518, -1.2116],
        [-5.5028,  1.4110, -3.9279, -1.3224],
        [-4.7648, -1.9946, -2.8606,  1.8410],
        [-5.4384, -1.8938, -3.4371,  1.5516]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▌         | 15/289 [00:11<03:26,  1.33it/s]

Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10262288898229599, logits - tensor([[-4.5861, -3.0076,  1.4174, -2.0875],
        [-5.5650,  1.7268, -5.2536, -2.0280],
        [-3.8607, -2.1026,  1.5274, -2.0056],
        [-4.9924,  2.1764, -4.6516, -2.2922],
        [-4.6677, -0.7156, -2.1091,  1.6419],
        [-3.0665, -2.8323, -2.2843,  2.0717],
        [-5.3511,  1.9378, -4.0865, -1.6964],
        [-4.5886, -2.1886,  1.0287, -1.8000]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 16/289 [00:12<03:25,  1.33it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2043123096227646, logits - tensor([[-5.7987,  1.7540, -4.9553, -1.5864],
        [-4.7261,  2.0605, -5.2165, -1.7904],
        [-5.3092,  2.2650, -4.3293, -0.9822],
        [-6.2851,  1.8698, -4.0648, -1.4749],
        [-4.9249,  2.9103, -4.9668, -1.5255],
        [-4.9168, -3.0037,  0.9050, -1.5990],
        [-6.1391, -1.0844, -3.7915,  0.7431],
        [-5.0397,  1.6672, -4.2476, -2.2387]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:12<03:25,  1.32it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09697866439819336, logits - tensor([[-5.0883, -0.8124, -3.5278,  1.3353],
        [-5.8436,  2.2345, -4.8794, -1.2520],
        [-5.5436,  1.3820, -4.4929, -1.9318],
        [-5.6368,  2.0748, -4.6040, -2.0810],
        [-4.8154,  1.2758, -4.1151, -1.5565],
        [-6.3209,  2.3967, -5.1069, -2.0867],
        [-6.5967,  1.7050, -5.0782, -1.3935],
        [-4.4036,  1.1732, -4.6206, -2.0742]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:25,  1.32it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27394020557403564, logits - tensor([[-4.6965, -0.7001, -3.4012,  1.3076],
        [-6.0703,  1.6242, -5.0356, -1.8499],
        [-5.6983,  2.5944, -4.8435, -1.6196],
        [-5.6748,  1.6322, -4.7435, -0.6829],
        [-5.6157,  1.7501, -4.4626, -2.1206],
        [-7.2162,  1.2061, -3.2092, -1.7768],
        [-5.0449, -1.7002, -4.0103,  1.2332],
        [-6.2498,  1.5775, -4.6681, -1.2997]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:25,  1.32it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39536774158477783, logits - tensor([[-5.5119,  2.9422, -4.4493, -0.8313],
        [-4.3736,  1.8609, -3.9206, -1.3417],
        [-5.0941,  1.4041, -4.0632, -2.0547],
        [-5.9528,  2.0751, -5.0775, -2.1197],
        [-4.9127,  1.3599, -4.1190, -1.8644],
        [-5.1847,  1.6789, -4.2448, -1.9699],
        [-5.9356, -1.2795, -4.4407,  2.2805],
        [-5.5567,  2.3173, -4.5506, -1.8815]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:24,  1.32it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24487799406051636, logits - tensor([[-5.5506, -2.0236, -3.7164,  2.7768],
        [-5.8049, -0.4829, -3.7050, -0.0555],
        [-4.1701, -2.4577,  1.5809, -1.4620],
        [-4.3513, -2.2688, -3.6678,  2.0376],
        [-4.5405, -3.0057,  1.2999, -1.5241],
        [-4.3753,  1.2272, -4.1832, -1.3143],
        [-5.1509,  1.8759, -4.2187, -1.5607],
        [-5.5413,  2.1337, -5.1379, -2.1619]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:15<03:23,  1.32it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22083714604377747, logits - tensor([[-6.4468,  1.8563, -4.9791, -0.5128],
        [-4.9240, -2.3041, -3.3367,  2.3661],
        [-5.9545,  2.1084, -4.4892, -1.4188],
        [-5.7625,  1.4024, -4.3733, -1.8115],
        [-5.6925,  1.2011, -4.5209, -0.5922],
        [-5.5078,  1.5706, -4.7189, -1.3241],
        [-4.3013, -2.7471,  1.9195, -1.8738],
        [-5.8335,  1.7066, -5.3329, -1.9616]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:23,  1.31it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36878862977027893, logits - tensor([[-4.3972, -3.1450,  1.1518, -1.6216],
        [-5.3844,  1.8129, -4.6402, -0.9068],
        [-5.1119,  1.5302, -4.4785, -0.9742],
        [-5.4702,  1.1837, -4.4590, -0.7454],
        [-5.2706, -1.5867, -4.7543,  1.2989],
        [-4.1153, -3.1649,  1.4350, -2.3800],
        [-6.1372,  1.4167, -4.7241, -1.8209],
        [-5.2856,  2.2128, -4.8988, -1.6370]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:21,  1.32it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20495577156543732, logits - tensor([[-6.0120,  1.7221, -3.6526, -1.5100],
        [-5.8761,  2.3062, -4.6430, -1.7307],
        [-5.2797,  1.5453, -3.9781, -1.1605],
        [-5.7955,  0.8070, -5.1304, -0.1619],
        [-5.7138,  1.2829, -4.1220, -0.5773],
        [-5.7850,  1.9625, -4.7472, -1.4477],
        [-5.4449, -0.3730, -3.9468,  0.7041],
        [-4.9411,  0.6307, -4.1817, -1.0096]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:20,  1.32it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24034535884857178, logits - tensor([[-4.3712, -2.5091,  1.9354, -2.5085],
        [-4.2981, -1.6793,  1.0741, -1.9049],
        [-4.9725,  1.9159, -4.9166, -1.4129],
        [-4.4537, -2.2003,  2.0336, -1.9152],
        [-5.0477,  1.6998, -4.3663, -1.4990],
        [-5.2623, -0.1473, -3.8879,  0.7221],
        [-5.6777,  1.0388, -4.7055, -1.7437],
        [-4.7126,  1.0983, -3.7484, -1.8707]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:18<03:19,  1.32it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21894770860671997, logits - tensor([[-5.2158,  1.1983, -5.5126, -1.2134],
        [-5.6905,  2.1299, -4.7301, -1.2360],
        [-5.9813,  1.8486, -4.8452, -1.4524],
        [-5.7085,  2.2247, -5.1013, -1.4146],
        [-4.4690, -3.0454,  2.4055, -2.2444],
        [-4.1887, -2.4503,  1.4950, -2.2571],
        [-5.6208, -2.2553, -3.3041,  2.7986],
        [-5.4099,  0.8873, -3.9014, -0.3538]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:18,  1.33it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2076447308063507, logits - tensor([[-3.6445, -1.7583, -3.0477,  1.6842],
        [-5.4233,  0.8531, -4.5262, -1.0795],
        [-5.2816,  1.5080, -4.9721, -1.9473],
        [-4.8231, -2.3235,  1.6088, -1.9047],
        [-5.5858,  0.9660, -4.8242, -1.9800],
        [-6.4382,  1.1712, -4.5481, -2.4097],
        [-5.7825,  1.5630, -4.5159, -1.1334],
        [-5.1878,  0.5251, -4.7610, -0.9662]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 27/289 [00:20<03:17,  1.33it/s]

Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2872840166091919, logits - tensor([[-6.1831,  1.3771, -5.1902, -1.3155],
        [-6.0202,  1.3211, -3.8487, -1.4284],
        [-4.4467, -2.9281,  1.9909, -1.8401],
        [-6.3098,  1.3773, -4.1752, -1.1484],
        [-5.6740,  0.6625, -4.6223, -1.4872],
        [-6.2145,  1.2350, -5.2164, -1.5045],
        [-4.7313,  0.6873, -4.2601, -0.1417],
        [-5.6214,  0.6967, -3.9365, -1.6246]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|▉         | 28/289 [00:21<03:16,  1.33it/s]

Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19754472374916077, logits - tensor([[-4.8171,  1.2384, -4.6800, -2.0594],
        [-5.3952,  1.0146, -3.4707, -1.6534],
        [-5.3239,  1.0745, -4.0271, -2.2115],
        [-5.4740, -2.6166, -3.7117,  2.0691],
        [-4.0271, -2.4119,  1.6165, -2.2579],
        [-4.0819, -2.6923,  2.0276, -2.9602],
        [-5.7786,  1.1539, -4.0908, -1.3677],
        [-3.7529, -2.6093,  2.1412, -2.0654]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 29/289 [00:21<03:15,  1.33it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1191667914390564, logits - tensor([[-5.2936,  1.4163, -4.4272, -1.5025],
        [-5.6181,  1.1435, -4.0565, -1.6071],
        [-5.9517,  1.2935, -4.0880, -1.3841],
        [-5.8074,  1.9237, -4.9803, -1.0565],
        [-4.9547, -2.8900,  1.3420, -1.8682],
        [-4.8227,  0.7110, -3.1558, -1.0922],
        [-5.2701,  1.5941, -4.2834, -0.9365],
        [-5.5132,  1.8162, -4.8761, -2.2938]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:14,  1.33it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25537389516830444, logits - tensor([[-5.9091,  1.1284, -4.0175, -1.8556],
        [-5.1310, -2.3696, -3.1980,  2.1310],
        [-6.2389,  1.0759, -4.9097, -1.1064],
        [-5.0692, -2.5102, -3.7277,  1.1326],
        [-5.4551,  1.5471, -4.5526, -1.9622],
        [-6.0617,  1.5118, -4.7152, -1.3910],
        [-5.8193,  2.2553, -4.9795, -1.6037],
        [-5.5863,  1.2561, -4.4742, -2.3984]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 31/289 [00:23<03:13,  1.33it/s]

Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16933180391788483, logits - tensor([[-5.5251,  1.3207, -5.0098, -1.3905],
        [-5.1459,  0.3162, -3.9924, -1.4944],
        [-4.9372, -3.1063,  2.0437, -2.8979],
        [-5.7542,  0.8307, -4.5724, -1.5047],
        [-5.0564,  1.4132, -3.8846, -2.1746],
        [-5.8668,  0.6435, -3.3337, -1.9600],
        [-4.1331, -2.1027,  2.6974, -2.5210],
        [-4.5522, -2.3421,  1.9508, -2.1578]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 32/289 [00:24<03:13,  1.33it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28256461024284363, logits - tensor([[-6.5535,  1.7714, -4.9013, -2.1575],
        [-5.9364,  1.8648, -4.5435, -1.8614],
        [-5.5635,  1.1589, -4.9832, -2.0132],
        [-5.6138,  1.6205, -4.8216, -2.2387],
        [-3.4234, -2.3792,  1.6632, -1.8583],
        [-5.7226,  0.5494, -4.3682, -2.1446],
        [-5.6420,  0.7696, -3.5503, -1.2078],
        [-5.2841,  1.6331, -4.3040, -1.7076]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:24<03:13,  1.32it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28787562251091003, logits - tensor([[-3.9009, -2.1749,  1.8444, -2.4114],
        [-5.7383,  1.1661, -4.4408, -2.2091],
        [-5.8070,  1.3329, -4.6265, -2.1696],
        [-5.9227,  1.4030, -4.4418, -2.3474],
        [-6.5684,  1.8899, -5.2174, -2.4117],
        [-4.3850,  1.4300, -4.0334, -1.0120],
        [-5.9349, -2.9381, -3.7835,  2.7057],
        [-5.4780,  1.2839, -5.2486, -1.7831]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:12,  1.33it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3324331045150757, logits - tensor([[-5.3282,  1.3973, -4.6375, -1.8380],
        [-5.6566,  2.0460, -4.7223, -2.0901],
        [-5.2262,  1.2234, -3.7087, -1.4716],
        [-5.1289,  1.7838, -3.6641, -1.3272],
        [-5.8458,  0.6700, -3.8956, -1.7388],
        [-6.1376,  1.8708, -4.5758, -1.4470],
        [-4.2173, -2.3401, -4.3666,  2.5571],
        [-5.7525,  0.9233, -4.3415, -2.0171]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 35/289 [00:26<03:11,  1.33it/s]

Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16127067804336548, logits - tensor([[-5.9676,  1.4063, -4.0296, -1.2284],
        [-6.0129, -0.1782, -3.6075, -0.9682],
        [-6.0644,  0.3319, -4.5123, -1.0230],
        [-5.7124, -2.8536, -3.2364,  1.9529],
        [-5.6927,  0.6727, -3.9995, -0.7472],
        [-6.0951,  0.7535, -4.9703, -1.5146],
        [-6.2252,  1.1216, -4.1303, -1.2964],
        [-4.1687, -2.4839,  1.1291, -2.2237]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 36/289 [00:27<03:10,  1.33it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32699739933013916, logits - tensor([[-6.1255, -0.4582, -3.8018,  0.0863],
        [-6.0531,  0.8549, -3.6471, -1.1425],
        [-5.4566,  1.7900, -4.2737, -1.4985],
        [-4.4105, -2.7836,  1.3272, -2.3733],
        [-5.9457,  1.3944, -4.6391, -1.8694],
        [-4.4835, -1.6608, -3.0129,  1.0571],
        [-5.9720,  1.3062, -4.3398, -1.6203],
        [-4.9599, -1.3102, -3.5563,  1.8782]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:27<03:10,  1.33it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45590394735336304, logits - tensor([[-5.6478,  1.3578, -4.3036, -1.5692],
        [-6.4353,  1.7000, -4.5415, -1.6006],
        [-5.5659, -2.4661,  0.8292, -3.2562],
        [-3.4311, -2.8836,  1.9829, -2.9180],
        [-5.5984,  1.2102, -3.7095, -1.2227],
        [-5.1425,  1.2277, -3.4307, -1.1409],
        [-5.9058,  0.8540, -3.8056, -1.7753],
        [-5.8736,  1.4702, -4.6472, -1.2431]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 38/289 [00:28<03:09,  1.32it/s]

Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30871689319610596, logits - tensor([[-3.7470, -2.0556,  1.9322, -2.7138],
        [-5.3377,  1.1664, -4.8116, -1.0614],
        [-6.0301,  1.5383, -4.8710, -1.3430],
        [-5.2080,  1.1879, -3.9638, -1.5893],
        [-5.4225,  1.3869, -3.7576, -0.9049],
        [-5.6696,  0.4152, -4.4511, -1.1064],
        [-4.8502, -1.4982, -2.9061,  1.7732],
        [-6.2829,  0.7710, -3.7567, -1.0130]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 39/289 [00:29<03:35,  1.16it/s]

Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 14%|█▍        | 40/289 [00:30<03:27,  1.20it/s]

loss - 0.17160561680793762, logits - tensor([[-5.4258,  1.0165, -3.7214, -0.6246],
        [-4.0440, -2.3502, -3.2391,  2.5790],
        [-6.3209,  1.1967, -5.1212, -1.3529],
        [-6.5503,  0.3823, -4.3481, -1.1916],
        [-5.1089, -1.9343, -3.2394,  0.4739],
        [-4.6611, -2.0456,  1.6921, -2.9930],
        [-4.2074, -3.0681,  1.9790, -2.2056],
        [-4.5659, -2.6004,  2.0802, -2.8437]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32565808296203613, logits - tensor([[-5.0512,  2.1982, -3.9050, -1.5824],
        [-6.4729,  0.9823, -3.8784, -1.3041],
        [-5.7664, -1.2850, -3.8506,  0.9577],
        [-6.6687, -0.7198, -3.0812, -0.6263],
        [-5.5243,  1.6595, -3.9836, -1.5610],
        [-5.2625,  1.4679, -4.2540, -1.9404],
        [-6.0921,  0.7935, -4.2771, -1.0212],
        [-7.0941,  1.0089, -4.8155, -1.

 14%|█▍        | 41/289 [00:31<03:21,  1.23it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1308177262544632, logits - tensor([[-5.0984,  0.0407, -4.1327, -0.5035],
        [-5.8399,  1.0574, -4.5374, -0.5331],
        [-6.0269,  1.0999, -3.7107, -1.6514],
        [-4.8324, -2.1070, -3.5928,  1.8269],
        [-5.5116,  0.9205, -4.4587, -1.7347],
        [-5.3839, -2.4407, -3.2229,  1.6002],
        [-4.7302, -2.4004,  1.1828, -2.7836],
        [-5.5336, -2.6585, -3.4018,  2.2016]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:32<03:16,  1.26it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4002913236618042, logits - tensor([[-6.0578,  1.2831, -4.2942, -1.4304],
        [-3.9486, -1.5443, -4.3621,  1.1580],
        [-4.8273, -2.0009,  1.6988, -2.2081],
        [-5.5818,  0.9440, -3.6325, -0.6633],
        [-5.9231,  1.0263, -3.2774, -1.6186],
        [-4.2682, -2.8859,  1.8931, -2.1758],
        [-4.7022,  1.5343, -3.3906, -1.4677],
        [-5.8185,  0.0977, -3.8740, -0.9116]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:32<03:13,  1.27it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15787504613399506, logits - tensor([[-5.7865,  1.3992, -4.0965, -1.0690],
        [-5.1139,  0.9120, -3.4679, -1.2605],
        [-4.3368, -1.6354,  1.6860, -2.3552],
        [-4.8238, -2.4945,  1.7544, -1.9526],
        [-5.5199,  1.1775, -3.8644, -0.1695],
        [-5.6991, -0.0424, -3.5142, -1.1331],
        [-5.2082,  1.7374, -3.8039, -1.4303],
        [-6.4134,  1.1627, -4.3270, -0.5961]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:33<03:10,  1.29it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3193661868572235, logits - tensor([[-5.6715,  1.2313, -3.3069, -0.7316],
        [-5.3454, -1.5138, -3.5277,  1.4761],
        [-5.5989, -0.1794, -3.4208,  0.4607],
        [-5.2104,  1.9751, -4.1638, -1.3076],
        [-5.7036,  1.9766, -4.1377, -1.7186],
        [-5.7479,  0.9375, -3.6440, -1.4077],
        [-5.2972, -2.3150, -3.4277,  2.0232],
        [-5.9284,  0.5893, -3.2506, -0.8767]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:34<03:07,  1.30it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3071037530899048, logits - tensor([[-6.0661,  0.7189, -3.6666, -0.6659],
        [-5.1497, -2.5136, -2.9988,  2.5348],
        [-5.1120,  1.3464, -3.7674, -1.0917],
        [-4.5769,  1.4895, -3.8517, -1.2937],
        [-3.6649, -2.1057,  1.8373, -2.5306],
        [-5.0731,  1.5718, -3.1906, -0.8933],
        [-6.4466,  1.6860, -4.8688, -1.6780],
        [-6.2073,  1.3183, -4.3129, -0.7771]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 46/289 [00:35<03:06,  1.31it/s]

Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1081705316901207, logits - tensor([[-5.0507, -2.3238,  1.7150, -2.9205],
        [-5.2711,  0.8190, -3.5113, -0.2803],
        [-4.8140, -2.7989, -3.3454,  3.1750],
        [-5.6728,  0.8474, -3.3481, -0.9092],
        [-5.2341, -2.7927, -3.8965,  3.2898],
        [-4.5096, -1.9621, -3.6464,  1.8641],
        [-4.8035, -2.1941,  1.8216, -2.1176],
        [-6.1372,  1.4732, -4.5534, -1.1502]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▋        | 47/289 [00:35<03:03,  1.32it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20273712277412415, logits - tensor([[-4.9335,  0.9785, -4.2027, -0.8114],
        [-6.3907,  0.8171, -4.0829, -1.1343],
        [-5.1711, -2.3992,  1.8914, -3.0582],
        [-3.7298, -1.8653,  1.2767, -2.6757],
        [-5.2434,  1.3031, -4.3996, -1.1243],
        [-5.9707,  0.4687, -4.5256, -0.6865],
        [-5.3721,  1.2242, -3.5154, -0.8122],
        [-4.9964, -2.8067,  1.5959, -2.9722]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:36<03:02,  1.32it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19193631410598755, logits - tensor([[-4.2794, -2.4695,  2.3416, -2.6184],
        [-5.9902,  1.6870, -4.3396, -0.8752],
        [-4.9537,  1.0435, -3.4090, -1.1895],
        [-5.2905,  1.2401, -3.5644, -1.0369],
        [-5.3115,  1.2473, -3.7183, -1.5213],
        [-5.0949,  1.4278, -3.5864, -1.6654],
        [-4.7888, -1.8447,  1.8069, -2.3476],
        [-5.1601,  1.2835, -3.9701, -1.0175]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:37<03:01,  1.32it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3399236798286438, logits - tensor([[-5.8198,  0.9792, -2.8667, -1.5951],
        [-5.2031,  1.3036, -3.9681, -0.6685],
        [-5.4582,  1.4789, -3.9917, -1.0940],
        [-5.7457,  1.3510, -3.9376, -1.5316],
        [-5.0232,  1.8665, -4.1064, -1.2260],
        [-6.3486,  2.0223, -4.7184, -1.3619],
        [-5.1238,  1.0819, -3.9844, -1.5373],
        [-5.5772,  0.7123, -3.5572, -0.0705]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 50/289 [00:38<03:00,  1.32it/s]

Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 18%|█▊        | 51/289 [00:38<02:59,  1.32it/s]

loss - 0.33916378021240234, logits - tensor([[-4.8198, -1.7180,  1.1384, -2.4358],
        [-6.0201,  1.4934, -4.0545, -0.4614],
        [-5.5456,  1.2046, -3.9755, -1.3628],
        [-4.8331, -2.9476, -3.5398,  2.4588],
        [-5.5454,  1.5572, -4.0672, -0.8843],
        [-6.1447,  1.6534, -4.2648, -1.8131],
        [-5.6403,  1.5558, -4.2490, -1.0564],
        [-5.4226,  0.9049, -3.7118, -1.1602]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17263466119766235, logits - tensor([[-4.9984,  1.4155, -3.9504, -1.0876],
        [-5.9839,  0.8540, -4.0948, -0.7624],
        [-5.7064,  0.7095, -3.8229, -1.2854],
        [-5.2663,  1.9949, -3.9880, -1.6381],
        [-5.5872,  1.5545, -3.5836, -1.9232],
        [-5.6133,  0.4830, -3.5058, -0.4206],
        [-6.0532,  1.9175, -3.5799, -0.7486],
        [-4.3517, -1.9823,  1.8688, -2.

 18%|█▊        | 52/289 [00:39<02:58,  1.32it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29168635606765747, logits - tensor([[-5.2497,  1.3017, -3.5987, -0.8826],
        [-5.7454,  1.7153, -4.2308, -1.3341],
        [-6.5138,  1.3917, -4.5622, -0.6571],
        [-5.1315,  1.4243, -3.7758, -1.3888],
        [-5.7599,  1.6942, -4.3895, -0.8861],
        [-5.5049,  1.5849, -3.8538, -0.9311],
        [-4.5223,  1.6578, -3.7581, -1.1099],
        [-4.9720,  1.1681, -3.7798, -1.4107]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:40<02:58,  1.33it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39335933327674866, logits - tensor([[-5.3111, -1.1678, -3.6471,  1.0578],
        [-5.6794,  1.0595, -4.5483, -0.5432],
        [-5.7779,  1.1788, -4.2671, -1.2931],
        [-4.5190,  1.5753, -3.3589, -1.3618],
        [-6.5185,  1.1049, -5.1880, -1.0448],
        [-5.9306,  2.3333, -4.1806, -0.8937],
        [-4.6565,  0.6128, -3.0003, -0.9631],
        [-6.0031, -1.1197, -3.6904,  1.6149]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:41<02:57,  1.32it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1572839617729187, logits - tensor([[-6.0278,  1.8265, -4.4950, -1.0527],
        [-5.6229,  0.7980, -4.8182, -1.9490],
        [-5.4747, -2.2641,  0.2122, -1.4888],
        [-5.9588,  1.6745, -3.8049, -1.8802],
        [-5.7843,  1.1531, -3.8880, -0.0584],
        [-5.4698,  0.9546, -4.2906, -1.7892],
        [-5.6339,  0.3914, -3.7368, -0.6625],
        [-5.7813,  1.9930, -3.8641, -1.2775]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:41<02:56,  1.32it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2823454737663269, logits - tensor([[-4.4433,  0.7195, -3.5633, -0.7530],
        [-5.2454,  1.6269, -4.2714, -1.0466],
        [-5.4572,  0.7528, -4.2169, -1.2813],
        [-3.7340, -1.4258,  2.1444, -2.2564],
        [-5.3785,  1.6142, -4.0177, -1.7188],
        [-4.8920, -1.2489,  0.6801, -2.2175],
        [-5.1004,  1.5206, -3.9697, -1.1158],
        [-4.8371,  1.4488, -3.2492, -1.0171]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:42<02:55,  1.33it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17597244679927826, logits - tensor([[-5.6859,  1.8509, -4.8916, -1.4764],
        [-5.8805,  1.8485, -4.2275, -1.5617],
        [-6.5483,  0.8134, -4.1266, -0.7689],
        [-5.5499,  1.6410, -4.8316, -1.6594],
        [-5.0541, -1.9775, -2.6594,  1.2978],
        [-5.1976,  0.0578, -3.5333, -0.9838],
        [-4.9916,  0.4160, -4.3138, -0.8684],
        [-5.3280,  1.4695, -4.4116, -1.6119]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:43<02:55,  1.32it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38065487146377563, logits - tensor([[-5.8620,  0.8537, -4.3628, -0.9459],
        [-5.9053,  1.4777, -4.1128, -1.2777],
        [-5.5190,  1.4952, -4.9803, -1.1333],
        [-5.4767,  1.4568, -3.9772, -2.4233],
        [-6.2651,  1.3244, -4.0084, -1.8676],
        [-4.8520,  0.7048, -3.5453, -1.3668],
        [-4.9956,  0.5033, -3.6706, -0.8374],
        [-7.0230,  1.2155, -4.2268, -0.2935]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 58/289 [00:44<02:55,  1.31it/s]

Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.248567596077919, logits - tensor([[-4.8669, -0.0568, -4.0433, -0.2182],
        [-5.8021,  1.2729, -3.7010, -0.6182],
        [-6.0196,  1.5784, -3.9394, -0.8724],
        [-5.7314,  1.6541, -3.8441, -1.4839],
        [-4.7433,  1.1383, -3.5461, -0.6600],
        [-5.2979,  1.5310, -4.7492, -2.4030],
        [-5.2283, -0.0071, -3.4159, -0.2936],
        [-6.5787,  1.5900, -3.7464, -1.2931]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 59/289 [00:44<02:55,  1.31it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.537117600440979, logits - tensor([[-6.0894,  1.6260, -4.6323, -1.3064],
        [-4.9041,  1.3473, -3.2973, -1.1050],
        [-6.0968,  2.0585, -5.0548, -1.6192],
        [-6.4753,  1.4431, -4.6860, -1.9981],
        [-5.8756,  1.1814, -4.7949, -1.7498],
        [-5.2845,  1.6714, -4.1073, -1.7642],
        [-4.1760, -1.7799,  1.3445, -2.1732],
        [-6.2406,  2.0427, -5.0157, -1.2860]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:45<02:55,  1.30it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.442888468503952, logits - tensor([[-5.3794,  0.3864, -4.5682, -0.6273],
        [-6.2343,  1.0992, -3.2451, -2.0406],
        [-4.6248, -3.2481,  2.1272, -2.8624],
        [-5.4820,  1.0948, -3.3418, -1.7250],
        [-5.4248,  2.2041, -4.2344, -1.7344],
        [-5.0864,  1.2194, -4.2876, -1.1123],
        [-4.8089,  0.7851, -4.2600, -1.9638],
        [-5.8112,  0.3340, -4.2834, -0.9600]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:46<02:53,  1.31it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.366097092628479, logits - tensor([[-7.3749,  1.4697, -4.1428, -1.5754],
        [-5.3006,  1.9150, -4.4599, -1.7305],
        [-5.0521,  1.7518, -4.1803, -1.9346],
        [-5.6231,  1.7867, -4.6163, -2.3552],
        [-5.9617,  1.3809, -4.5695, -1.6374],
        [-5.7897, -0.3404, -1.2257, -2.2371],
        [-5.3114,  1.9540, -4.3978, -2.3596],
        [-5.8979, -1.3436, -4.0981,  1.5267]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██▏       | 62/289 [00:47<02:53,  1.31it/s]

Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3899308443069458, logits - tensor([[-5.7449, -1.7592, -3.9183,  1.6763],
        [-5.8767,  0.5548, -4.2125, -0.9229],
        [-5.5794,  0.1532, -2.5384, -2.0277],
        [-4.3904,  1.5987, -3.9174, -1.5866],
        [-3.5577, -1.2159,  0.8220, -2.2222],
        [-6.6087,  1.3309, -4.5004, -2.1112],
        [-3.9103, -1.9577,  1.7162, -2.9936],
        [-4.9033,  1.2338, -3.6575, -2.3336]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 63/289 [00:48<02:52,  1.31it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38890478014945984, logits - tensor([[-5.2603, -2.3524, -4.0531,  2.4117],
        [-5.8693,  1.3948, -4.1062, -1.8361],
        [-6.2655,  1.3483, -4.5724, -1.5971],
        [-5.0641,  0.7473, -3.1858, -1.5090],
        [-5.3783, -0.0558, -3.1179, -0.5649],
        [-5.9738,  1.8104, -3.5369, -2.7647],
        [-4.5521, -2.5140,  1.6971, -2.2729],
        [-5.1568,  1.5238, -3.8493, -2.0213]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:48<02:52,  1.31it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29103171825408936, logits - tensor([[-3.7621, -2.0097,  2.0793, -2.9847],
        [-5.9192,  0.9872, -3.8664, -1.6614],
        [-5.7598, -1.7655, -4.6099,  2.4961],
        [-5.6424,  1.4580, -3.9292, -2.0646],
        [-6.2107,  0.7574, -3.7014, -2.0728],
        [-4.7350,  1.0888, -3.7952, -1.8042],
        [-6.7023,  2.1722, -4.1114, -2.7583],
        [-4.3969, -2.5482,  1.8976, -2.9992]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:49<02:51,  1.31it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3720765709877014, logits - tensor([[-5.2403,  0.9519, -3.3317, -1.9842],
        [-5.8152,  0.9013, -3.5339, -1.5922],
        [-6.0856,  0.4160, -3.0795, -1.5496],
        [-4.0154, -2.2752,  2.1340, -2.5632],
        [-5.7058,  0.7284, -3.2048, -0.9751],
        [-5.3275,  0.5325, -2.7488, -1.0180],
        [-4.5478, -3.0063,  1.4308, -2.9851],
        [-6.0882,  0.8468, -3.2134, -2.1128]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 66/289 [00:50<02:50,  1.31it/s]

Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1851091980934143, logits - tensor([[-6.0959,  1.2285, -4.1852, -1.5606],
        [-4.7973, -1.7349, -3.9182,  1.8516],
        [-4.1928, -1.2876,  0.8237, -2.3117],
        [-4.9586,  1.6631, -4.0669, -1.9966],
        [-5.2515,  0.2994, -3.1095, -1.2589],
        [-5.5470,  1.0330, -3.2197, -1.9135],
        [-5.3815,  1.1368, -3.4288, -1.5717],
        [-4.7682,  0.2329, -3.2867, -1.0863]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 67/289 [00:51<02:49,  1.31it/s]

Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3838667869567871, logits - tensor([[-5.6968,  0.9932, -3.2692, -2.0443],
        [-5.7437,  0.4852, -3.4645, -1.2433],
        [-4.8670,  1.3186, -3.3463, -1.8794],
        [-5.9781,  0.0866, -3.0415,  0.3893],
        [-5.3315,  0.9978, -2.4732, -0.6653],
        [-5.2663, -1.5589, -3.6302,  1.5630],
        [-4.4716, -2.7354,  2.8282, -3.0570],
        [-5.3418,  0.5475, -3.0819, -1.7723]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▎       | 68/289 [00:51<02:48,  1.31it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2560344338417053, logits - tensor([[-6.8357,  0.2953, -3.9335, -1.3005],
        [-5.8169, -0.5439, -3.2172, -0.3334],
        [-5.6554, -2.4578, -2.9073,  1.7892],
        [-5.5857,  1.4780, -3.8130, -1.0590],
        [-5.7205,  1.4497, -4.1971, -1.2543],
        [-5.2021, -0.0866, -3.3560, -1.1054],
        [-5.1160, -1.9932,  1.6177, -2.2766],
        [-5.5320,  1.0733, -3.4387, -1.8431]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:52<02:47,  1.31it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21534475684165955, logits - tensor([[-4.9199, -2.2762,  1.1899, -2.9447],
        [-6.1189,  0.8062, -3.8029, -1.0980],
        [-5.8215,  0.0421, -3.0303, -1.2969],
        [-5.4419, -1.5917, -3.6448,  1.7385],
        [-4.2018, -2.4686,  1.7350, -2.6654],
        [-4.7691,  0.4305, -3.3393, -1.9347],
        [-6.1640, -1.7811, -4.0635,  1.8742],
        [-5.6545,  1.2751, -3.5879, -1.3387]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:53<02:46,  1.32it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14987479150295258, logits - tensor([[-5.0329,  0.8515, -2.9681, -1.7387],
        [-4.8244, -3.2488, -3.5173,  3.3026],
        [-5.6666,  0.4111, -4.3796, -1.1974],
        [-3.8226, -3.2197, -3.2557,  3.0461],
        [-5.0689,  0.2908, -3.1532, -1.1406],
        [-4.3065, -1.5815,  2.5437, -3.0035],
        [-5.1570, -0.1868, -3.4849, -0.5641],
        [-6.1465,  0.6742, -3.5090, -1.2663]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:54<02:44,  1.32it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.205720454454422, logits - tensor([[-5.6323,  0.6460, -3.0993, -1.1264],
        [-5.6976,  0.9024, -3.9685, -1.2147],
        [-5.8401,  0.0992, -3.7745, -1.7789],
        [-4.7196, -3.1625,  2.4778, -3.3805],
        [-5.0101,  1.2580, -3.1875, -0.7846],
        [-5.5610, -2.6079,  1.5252, -2.8146],
        [-5.2315,  0.1252, -2.7661, -1.5329],
        [-5.7058,  1.6147, -3.2659, -1.3913]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:54<02:43,  1.33it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17497292160987854, logits - tensor([[-5.1418, -3.0580, -3.8954,  2.4122],
        [-3.9673, -2.2727,  1.8050, -2.9157],
        [-4.1656, -2.6767,  2.2681, -2.9403],
        [-4.6549,  0.6899, -2.9033, -0.7579],
        [-5.8976,  1.1823, -3.7567, -1.2759],
        [-5.9418,  1.2890, -3.7173, -1.7288],
        [-5.1900,  0.3076, -2.6082, -1.2505],
        [-6.4852,  0.5499, -3.7723, -0.7719]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:55<02:42,  1.33it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27946269512176514, logits - tensor([[-3.9646, -2.6212,  1.8563, -2.9696],
        [-5.3450,  0.8447, -3.4569, -1.3743],
        [-5.7405,  0.7837, -3.3249, -1.0965],
        [-4.3550,  0.7491, -2.5167, -1.0719],
        [-5.4346,  1.0761, -3.4451, -1.5372],
        [-5.4023,  0.6427, -3.7636, -1.4636],
        [-5.4834,  0.2639, -3.7812, -1.1171],
        [-5.7342,  1.2262, -3.8533, -1.0721]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 74/289 [00:56<02:41,  1.33it/s]

Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18389269709587097, logits - tensor([[-6.7619,  0.4227, -3.1871, -0.5805],
        [-5.2345,  0.3979, -2.9467, -1.2021],
        [-6.2416,  1.0963, -3.1799, -0.8780],
        [-5.2167, -2.2019, -3.6898,  2.9248],
        [-5.1049,  1.1807, -4.1123, -2.2080],
        [-5.5646, -3.8197, -2.8941,  2.5305],
        [-5.9781,  1.0370, -3.8106, -1.3158],
        [-5.8775, -1.2889, -3.1916,  1.5454]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 75/289 [00:57<02:40,  1.33it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1177683174610138, logits - tensor([[-5.4895,  0.7330, -3.2718, -0.6718],
        [-5.7895,  1.1173, -3.4812, -1.1025],
        [-5.3678, -3.1613,  2.9432, -2.7484],
        [-6.7472,  0.9737, -3.3165, -0.8100],
        [-6.8161,  1.9502, -4.3333, -1.4924],
        [-4.8552, -2.9341,  2.4255, -2.6406],
        [-5.4100,  1.1655, -3.2192, -1.6485],
        [-6.4556,  1.8444, -3.4480, -1.5048]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:57<02:39,  1.33it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21759191155433655, logits - tensor([[-6.5810, -0.1475, -4.2596,  0.1944],
        [-4.2411, -2.4290,  2.4190, -2.5356],
        [-5.7054,  0.7957, -3.0954, -0.8672],
        [-4.1749, -2.1830,  2.0312, -2.6007],
        [-5.3205,  0.9392, -2.9615, -0.8331],
        [-5.3479,  1.7272, -3.9750, -1.4806],
        [-5.6283,  0.7221, -3.1913, -1.3967],
        [-5.3055,  0.9669, -3.2487, -1.4663]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 77/289 [00:58<02:38,  1.33it/s]

Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07813626527786255, logits - tensor([[-4.8906, -2.9285,  2.1842, -2.4909],
        [-5.0707, -3.2215, -3.9922,  3.4044],
        [-5.6641,  1.3575, -4.0511, -1.8503],
        [-4.2949, -2.0846,  2.1911, -2.7987],
        [-4.0932, -2.4198,  2.0699, -2.4261],
        [-5.9736,  0.9286, -3.5906, -0.3706],
        [-5.7209, -3.3531,  2.4151, -3.5827],
        [-4.8519, -3.0261, -3.6080,  3.1694]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 78/289 [00:59<02:37,  1.34it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32975757122039795, logits - tensor([[-4.4071, -2.4932,  2.1868, -2.4014],
        [-5.8361,  1.6613, -3.3397, -1.2760],
        [-6.0930,  1.4976, -3.1084, -1.4671],
        [-5.5323,  1.8341, -3.7517, -1.0587],
        [-5.7134,  0.9197, -3.3304, -0.9005],
        [-5.3001,  1.5890, -3.7029, -1.8290],
        [-5.8804,  1.6937, -3.2724, -2.0357],
        [-4.0212, -2.4303,  1.7714, -2.7133]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [01:00<02:37,  1.33it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2178335338830948, logits - tensor([[-4.0145, -2.5143, -3.7755,  2.4569],
        [-5.2309,  1.9831, -4.7295, -1.0822],
        [-5.2912,  1.7584, -3.2581, -1.5743],
        [-5.1707,  1.0913, -3.6288, -1.9073],
        [-5.7896,  0.2071, -3.5352, -0.3224],
        [-5.8466,  1.6338, -3.2796, -1.3439],
        [-4.7469,  1.2035, -3.9283, -0.8839],
        [-3.8392, -2.6897,  2.3725, -2.5727]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [01:00<02:36,  1.34it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4272426664829254, logits - tensor([[-3.7494, -2.8408,  2.4863, -2.6404],
        [-6.0443,  1.7417, -3.6039, -1.1058],
        [-5.4071,  1.7870, -4.1678, -1.6279],
        [-4.6751, -2.2926, -4.3045,  2.3027],
        [-5.2314,  1.6599, -3.6620, -1.1003],
        [-4.8913,  0.4393, -1.2045, -1.2770],
        [-5.0053, -3.0230, -4.0125,  3.0767],
        [-5.6543,  2.0142, -3.6962, -1.7225]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [01:01<02:35,  1.34it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30775782465934753, logits - tensor([[-4.6878,  1.5035, -3.4633, -0.4677],
        [-4.0009, -2.6267,  2.2015, -2.7753],
        [-4.5557, -2.8891,  1.5101, -2.0561],
        [-5.4690,  0.2099, -3.3020, -0.2265],
        [-5.8006,  1.1194, -4.0556, -0.1666],
        [-5.5321,  2.0249, -4.1360, -1.6437],
        [-6.0421,  1.7278, -3.7307, -2.1030],
        [-3.7233, -1.8376,  2.6223, -2.6772]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:02<02:35,  1.33it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 29%|██▊       | 83/289 [01:03<02:34,  1.33it/s]

loss - 0.2626808285713196, logits - tensor([[-5.5457,  1.5558, -3.5375, -1.1229],
        [-4.0568, -2.7504,  2.4305, -2.4315],
        [-5.8568,  1.2530, -4.3605, -2.3229],
        [-5.3356,  1.6777, -3.5685, -1.0951],
        [-4.3430, -3.1728, -3.2609,  3.0451],
        [-5.5702,  1.0304, -3.8070, -1.2207],
        [-6.1512,  1.9561, -4.3890, -1.2758],
        [-3.7835, -2.9691,  2.3129, -3.5205]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.630128800868988, logits - tensor([[-4.7553,  1.3764, -3.7086, -1.3554],
        [-4.1197, -2.5118,  2.7011, -3.5454],
        [-5.3010,  1.0527, -3.7301, -1.9200],
        [-5.3205, -2.4723, -3.2495,  3.2139],
        [-5.7418,  1.7978, -4.1102, -0.8325],
        [-5.8327,  1.6999, -3.7918, -0.3234],
        [-5.6337,  1.1684, -2.9866, -0.8168],
        [-5.2902,  2.0561, -4.1250, -1.450

 29%|██▉       | 84/289 [01:03<02:34,  1.33it/s]

Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0923704281449318, logits - tensor([[-5.8335,  2.3054, -4.2524, -1.2101],
        [-5.8271,  1.6447, -4.1733, -1.1576],
        [-6.0496,  1.3392, -4.0183, -1.0219],
        [-6.1532,  2.0278, -4.6749, -1.7245],
        [-6.5876,  1.1897, -3.8411, -1.6437],
        [-5.1148, -1.4961, -3.7381,  2.7683],
        [-5.0254,  1.5165, -3.4430, -2.0171],
        [-5.9518, -2.8436, -3.9274,  2.8219]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 85/289 [01:04<02:33,  1.33it/s]

Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16575834155082703, logits - tensor([[-4.1508, -2.4683,  1.2979, -2.7813],
        [-5.2003,  1.9444, -3.5944, -1.1771],
        [-6.6671,  1.4026, -4.9745, -1.1543],
        [-4.9406, -2.4596, -3.8225,  2.2121],
        [-5.3675, -1.6985, -0.6345, -1.3935],
        [-6.9373,  1.5220, -4.2729, -0.7687],
        [-4.9757,  2.2423, -3.4608, -0.8601],
        [-5.4654,  1.8239, -3.3402, -1.1826]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|██▉       | 86/289 [01:05<02:32,  1.33it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18150191009044647, logits - tensor([[-5.1697, -3.0687,  1.8255, -2.8555],
        [-5.8314,  2.6101, -4.5649, -1.0986],
        [-5.5236,  1.9330, -4.0279, -1.9216],
        [-5.2468,  0.8380, -3.9695, -1.3279],
        [-5.0138,  1.7092, -3.7529, -0.7062],
        [-4.8850,  1.3244, -3.7273, -0.6867],
        [-4.8518, -2.4315,  1.5575, -2.2325],
        [-5.1411,  1.4672, -3.4915, -1.4999]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 87/289 [01:06<02:31,  1.33it/s]

Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24713794887065887, logits - tensor([[-5.9013,  1.4365, -4.2561, -0.9432],
        [-7.2935,  0.0644, -4.6929, -0.1496],
        [-5.7021,  1.1613, -3.3176, -0.9400],
        [-5.4269,  2.4885, -4.8783, -0.6941],
        [-5.3580,  2.1086, -3.9946, -0.9073],
        [-5.0919,  1.3847, -3.4869, -0.8406],
        [-4.8760,  1.1995, -3.7574, -1.5341],
        [-5.5823,  1.6704, -4.0211, -1.2093]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 88/289 [01:06<02:31,  1.33it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5231322646141052, logits - tensor([[-5.3355,  1.8009, -4.4839, -1.4994],
        [-5.4691,  1.5504, -4.2490, -1.1061],
        [-5.3595, -1.6001, -4.1608,  2.7016],
        [-6.4478,  1.8766, -4.6314, -1.7562],
        [-5.5648,  2.5129, -4.6483, -0.8520],
        [-5.8495,  2.6126, -4.4600, -1.6836],
        [-4.9947, -2.0748, -3.8022,  1.7961],
        [-5.4215,  1.8928, -4.1090, -1.4357]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:07<02:30,  1.33it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1073768138885498, logits - tensor([[-3.9451, -2.9630,  1.9375, -1.7280],
        [-5.5960,  0.4878, -3.8176, -1.0110],
        [-5.3762,  1.6244, -4.2970, -0.7187],
        [-4.8455, -2.8669,  1.2732, -2.4302],
        [-5.0955,  1.5238, -3.7078, -1.2844],
        [-5.4795,  1.9211, -4.0715, -2.2167],
        [-5.7473,  2.2549, -3.3417, -1.9405],
        [-3.9262, -2.5603,  2.0364, -3.1214]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:08<02:29,  1.33it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43495655059814453, logits - tensor([[-5.0566, -2.4292,  2.5852, -2.5762],
        [-6.4428,  1.1305, -4.0374, -1.1323],
        [-5.1728,  2.3018, -4.9091, -1.3585],
        [-5.4862,  2.0430, -5.0764, -0.5273],
        [-4.9707,  1.1259, -4.0528, -1.3107],
        [-5.0535,  1.0190, -4.1899, -1.5016],
        [-4.5459,  0.8591, -4.1075, -1.5226],
        [-5.7337,  2.5446, -4.9196, -0.9985]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:09<02:28,  1.33it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19838732481002808, logits - tensor([[-5.4711,  1.1980, -3.9639, -1.7427],
        [-5.3441,  1.6313, -4.6307, -1.4891],
        [-5.6700,  1.7364, -4.8006, -0.5453],
        [-5.4390,  1.2178, -3.4778, -1.4632],
        [-4.6962, -2.7443,  2.4334, -2.6920],
        [-5.8273,  1.5311, -4.0652, -2.2042],
        [-5.7516,  1.2842, -3.8746, -1.8968],
        [-5.1775, -2.7832,  1.7236, -2.6114]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:09<02:28,  1.33it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2404160499572754, logits - tensor([[-5.6620,  1.3675, -4.4390, -1.8205],
        [-6.1541,  1.8429, -5.1468, -1.7679],
        [-6.3664, -1.6171, -4.7409,  1.8489],
        [-5.2140,  1.5916, -3.0242, -0.8854],
        [-4.0169, -2.6680,  2.3379, -2.6211],
        [-6.0533,  1.9318, -4.7196, -1.5464],
        [-5.7196,  1.1351, -3.4135, -1.4847],
        [-5.6867,  1.4742, -3.8244, -1.6109]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 93/289 [01:10<02:27,  1.33it/s]

Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16613999009132385, logits - tensor([[-6.1567,  1.0883, -4.3990, -1.9917],
        [-5.3433,  1.9068, -4.4938, -1.7219],
        [-6.1733,  1.5854, -4.2890, -1.2876],
        [-6.2868,  1.5877, -4.6568, -1.9981],
        [-5.4528,  1.4715, -4.1721, -2.3269],
        [-5.9095,  0.9207, -3.4169, -1.5082],
        [-5.5576, -2.2066, -4.4119,  2.3475],
        [-6.4747,  2.0954, -4.4618, -1.9273]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 94/289 [01:11<02:26,  1.33it/s]

Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16226737201213837, logits - tensor([[-5.8965,  2.0455, -4.6216, -1.5174],
        [-5.4199,  2.2596, -4.5589, -2.0256],
        [-6.4254,  0.6035, -4.0897, -1.8609],
        [-5.9186,  1.3783, -4.8928, -2.1434],
        [-6.1253,  2.1229, -4.5261, -2.5218],
        [-5.2238, -2.5704, -3.9748,  3.9201],
        [-5.6539,  1.6362, -4.4136, -1.4962],
        [-5.6704,  1.7106, -3.9165, -1.0221]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 95/289 [01:12<02:25,  1.33it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08577506244182587, logits - tensor([[-5.4016,  1.6954, -4.4393, -1.5829],
        [-5.0835,  2.0353, -5.0363, -2.3040],
        [-4.7205,  1.3052, -4.0151, -1.6298],
        [-5.6986,  1.9062, -5.1366, -2.1290],
        [-5.6138,  2.0778, -4.4680, -1.6257],
        [-4.1230, -2.5726,  1.4087, -1.6987],
        [-6.3288,  1.7893, -5.0170, -2.3728],
        [-6.3809,  1.7784, -5.3525, -1.3361]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:12<02:24,  1.34it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09702086448669434, logits - tensor([[-6.2448,  1.8848, -5.1937, -0.6957],
        [-5.1105,  1.6937, -4.2736, -1.6075],
        [-4.2997, -2.1036,  1.0659, -2.4751],
        [-4.9599, -3.2855,  1.5111, -2.7219],
        [-4.6653,  1.5498, -3.8549, -1.9961],
        [-4.5519, -2.8054, -4.0571,  2.4763],
        [-5.1369,  2.0141, -3.9252, -1.2004],
        [-4.8477,  1.6517, -3.5873, -1.3717]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▎      | 97/289 [01:13<02:23,  1.33it/s]

Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18714402616024017, logits - tensor([[-5.9735,  1.7568, -4.0986, -1.8630],
        [-4.7378,  1.4149, -3.4329, -1.6279],
        [-6.8300,  0.9648, -4.7604, -1.7404],
        [-4.4021, -2.6495,  1.4487, -2.4660],
        [-4.9140, -1.4272, -3.9937,  2.0067],
        [-6.3721,  1.9286, -5.2989, -3.1288],
        [-4.2602, -3.0642,  1.4090, -2.5410],
        [-5.5581,  1.8185, -4.4483, -1.2725]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 98/289 [01:14<02:23,  1.33it/s]

Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22968091070652008, logits - tensor([[-5.9946, -0.4436, -4.0765,  0.8783],
        [-4.2584, -2.5029,  1.2918, -1.7204],
        [-5.9869, -1.9381, -4.4300,  2.5525],
        [-4.6570, -2.8268,  1.3054, -2.5935],
        [-5.4097,  1.4519, -3.7644, -1.9998],
        [-4.5974, -3.1716, -3.1330,  3.4759],
        [-5.0633, -3.1315,  1.6135, -2.3069],
        [-5.5253, -3.2319, -4.2689,  3.7809]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 99/289 [01:15<02:22,  1.34it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20830421149730682, logits - tensor([[-5.7307,  1.4465, -4.0065, -2.3662],
        [-5.3621,  1.8722, -4.0888, -1.4847],
        [-6.4395,  2.1381, -5.3854, -1.9738],
        [-5.4983,  1.7633, -4.1730, -1.0830],
        [-5.8171,  2.7984, -5.6188, -1.3459],
        [-5.0941,  1.5074, -3.7076, -2.1881],
        [-4.2090, -2.7656,  2.0965, -1.8899],
        [-7.0184,  1.7968, -4.8507, -1.8712]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:15<02:21,  1.33it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06792935729026794, logits - tensor([[-4.8110, -3.1032,  2.6648, -2.6608],
        [-5.3771,  1.1428, -4.1544, -2.0253],
        [-6.2167,  2.2702, -4.7518, -2.1338],
        [-6.8332,  1.9278, -5.0145, -1.9743],
        [-4.2692, -2.4494, -4.3798,  1.8388],
        [-4.1249, -2.7949, -4.3674,  2.6687],
        [-6.2351,  1.6307, -5.5426, -2.2880],
        [-4.1666, -2.6639,  1.7613, -1.9167]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:16<02:21,  1.33it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23727937042713165, logits - tensor([[-4.3338,  2.4644, -4.1933, -1.3500],
        [-5.3082, -1.0027, -3.9918,  0.6050],
        [-4.1382, -2.9307,  1.3234, -1.9322],
        [-4.9969,  1.7936, -4.9022, -2.3267],
        [-5.7513,  1.2524, -4.5046, -1.3777],
        [-5.8089,  0.9636, -4.4885, -1.4045],
        [-4.6185,  1.5622, -3.9899, -1.7631],
        [-5.8942,  2.3893, -5.2008, -1.8910]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:17<02:20,  1.33it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.562666118144989, logits - tensor([[-5.8289,  1.7701, -4.5598, -2.2478],
        [-5.3218, -1.5391, -4.4055,  1.5398],
        [-5.7422,  2.0891, -4.5183, -2.2915],
        [-6.0582,  2.1222, -5.1992, -2.7151],
        [-5.5593,  2.5382, -3.9881, -1.7622],
        [-5.9254,  2.5664, -4.6608, -1.4265],
        [-5.4749,  2.5430, -5.0112, -1.9830],
        [-4.9362, -2.3729, -3.0237,  2.5908]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:18<02:20,  1.32it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40352705121040344, logits - tensor([[-4.7259, -2.0930, -3.9800,  1.4954],
        [-6.1626,  2.2247, -5.2606, -2.8782],
        [-5.4007,  2.2151, -4.3930, -1.9840],
        [-6.2839,  2.1982, -4.6376, -1.4327],
        [-5.7073,  2.7129, -3.9946, -2.4862],
        [-5.9142,  2.0047, -4.7861, -1.7224],
        [-5.2498,  2.2933, -4.8291, -2.0677],
        [-6.1459,  2.6809, -4.6395, -2.2325]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:18<02:19,  1.32it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4763423204421997, logits - tensor([[-5.1818,  1.6059, -4.6635, -1.9149],
        [-5.0889,  2.1997, -4.0141, -2.2035],
        [-5.1390, -2.4133,  1.4207, -2.1262],
        [-5.2990,  1.9721, -5.1822, -2.0566],
        [-6.0379,  2.2358, -5.0462, -2.2785],
        [-4.7357, -2.8519,  1.8125, -2.6688],
        [-5.6370,  0.9354, -4.8624, -2.7656],
        [-5.7439,  1.3451, -4.1506, -1.8171]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▋      | 105/289 [01:19<02:18,  1.32it/s]

Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29699528217315674, logits - tensor([[-5.2536,  1.9476, -4.6715, -2.0242],
        [-4.3422, -3.3146,  1.1929, -2.2635],
        [-5.8355,  2.1105, -5.2905, -1.8914],
        [-6.6120,  2.8741, -4.7702, -2.4620],
        [-5.2137, -2.8141,  1.5846, -1.7374],
        [-4.8307,  1.6032, -3.8868, -2.6748],
        [-6.4010,  1.1605, -4.5054, -2.5628],
        [-5.2907,  2.1408, -4.1739, -2.9935]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 106/289 [01:20<02:18,  1.32it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2138851135969162, logits - tensor([[-6.6141,  1.8798, -4.8067, -1.2508],
        [-4.5783, -2.8698,  1.3080, -1.9034],
        [-5.7153,  1.9254, -4.5010, -2.1159],
        [-6.3382,  1.7528, -4.9650, -2.5783],
        [-5.7166,  1.2252, -4.5730, -1.9980],
        [-6.0712,  1.3613, -5.0590, -2.3693],
        [-4.7964, -3.0994,  1.6361, -1.9244],
        [-5.3343,  1.4021, -4.3156, -2.6950]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:21<02:17,  1.33it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34315699338912964, logits - tensor([[-5.7179,  1.1152, -3.9687, -1.7147],
        [-5.0232,  1.6822, -4.2138, -2.4991],
        [-5.8019,  1.4778, -4.6645, -2.3888],
        [-5.9808,  2.5208, -4.8985, -2.2108],
        [-5.5211,  1.8836, -4.2113, -2.1525],
        [-5.8582, -1.8548, -3.5599,  2.6827],
        [-6.0279,  1.4948, -5.4478, -2.1798],
        [-6.0940,  2.2090, -4.3384, -2.4675]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 108/289 [01:21<02:15,  1.33it/s]

Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31683650612831116, logits - tensor([[-4.7302, -2.9049, -3.9222,  2.9512],
        [-6.5098,  1.3363, -4.1658, -2.5413],
        [-4.9674,  1.4226, -4.0035, -1.8209],
        [-5.9603,  1.8848, -4.2970, -2.0440],
        [-5.7199,  0.8857, -4.0940, -2.0629],
        [-4.7384, -2.9491,  1.6080, -1.8960],
        [-4.9637, -2.9903,  1.7334, -2.4031],
        [-6.2687,  2.0725, -4.9144, -2.6362]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 109/289 [01:22<02:14,  1.33it/s]

Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23039382696151733, logits - tensor([[-5.6608,  1.0757, -4.8580, -2.0963],
        [-6.4406,  2.2535, -4.6047, -2.0259],
        [-5.1721,  1.7692, -4.4983, -1.6401],
        [-5.8464,  2.1579, -4.5865, -2.3597],
        [-4.2869, -2.3744,  1.2528, -2.2931],
        [-6.6211,  1.2504, -4.8228, -1.3239],
        [-4.3221, -2.6960,  2.0548, -2.4375],
        [-5.1990,  0.9692, -3.7917, -1.0745]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 110/289 [01:23<02:14,  1.33it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23467382788658142, logits - tensor([[-6.0629,  1.2644, -3.7618, -1.3720],
        [-5.8296,  1.8151, -5.2972, -1.6242],
        [-4.4230, -2.1054, -3.1344,  1.9113],
        [-5.1729,  1.4618, -3.8839, -1.5895],
        [-6.2729,  1.3101, -4.8480, -2.0100],
        [-5.5027,  1.5314, -4.1858, -1.7188],
        [-5.3203,  1.1536, -3.7678, -2.1495],
        [-4.1084, -2.3493, -3.5737,  2.1861]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:24<02:13,  1.33it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08958901464939117, logits - tensor([[-6.2067,  2.3830, -5.4635, -2.1827],
        [-6.3927,  2.1931, -5.4322, -1.2494],
        [-5.4547,  2.1314, -5.1615, -2.3583],
        [-5.3882,  0.7960, -4.2292, -1.0955],
        [-4.5922, -2.3294,  1.4189, -2.0661],
        [-5.6481,  0.9284, -4.5388, -2.1385],
        [-5.6290, -3.3113, -4.6526,  2.9715],
        [-5.0577, -2.9385,  1.7194, -1.5631]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:24<02:12,  1.34it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09363895654678345, logits - tensor([[-4.8891, -2.1987, -3.5024,  2.3275],
        [-5.8704,  0.5758, -3.8437, -1.0664],
        [-6.1486,  2.5079, -3.8721, -1.2776],
        [-4.8782,  1.5573, -4.3553, -1.5747],
        [-5.5815,  1.7982, -4.3791, -2.5068],
        [-5.9311,  1.7044, -4.1668, -3.0937],
        [-5.2766,  1.9731, -4.4931, -1.2988],
        [-4.6824, -2.8571,  1.4157, -2.2412]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:25<02:11,  1.34it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.225229874253273, logits - tensor([[-5.8266, -0.1500, -4.8607, -0.0442],
        [-6.1799,  0.9322, -4.4785, -2.0125],
        [-5.9427, -3.0565,  1.0894, -1.8072],
        [-5.4538,  1.2207, -4.6513, -1.6118],
        [-5.3291,  1.1996, -4.4494, -1.4753],
        [-6.6427,  0.7143, -4.3953, -1.2027],
        [-5.9770,  2.1357, -4.4717, -2.1478],
        [-6.3345,  1.9111, -5.0226, -2.0480]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:26<02:11,  1.34it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2555336058139801, logits - tensor([[-5.8001, -2.8895,  1.6952, -1.9919],
        [-5.5981, -0.3485, -4.3825,  1.4014],
        [-6.2764,  1.9846, -5.3703, -1.6050],
        [-5.3872,  1.3353, -4.3731, -2.0361],
        [-5.1592, -3.5810,  1.4177, -2.1501],
        [-3.8086, -1.7574, -3.3371,  1.4961],
        [-6.0246,  1.5487, -5.0334, -2.2746],
        [-5.9097, -0.3990, -4.1573,  0.6945]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:27<02:10,  1.34it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21314583718776703, logits - tensor([[-4.9697, -3.0939,  1.3206, -2.2694],
        [-5.8799,  2.0533, -4.6702, -2.2661],
        [-5.9459,  1.6046, -4.5687, -2.9612],
        [-6.2378,  1.7713, -4.6637, -1.8858],
        [-5.9170,  1.7997, -4.1035, -2.1476],
        [-6.8387,  1.7190, -4.9885, -1.7005],
        [-6.2784,  1.7164, -5.0988, -2.3033],
        [-5.5426,  2.5683, -4.3213, -2.2688]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:27<02:09,  1.34it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.341746062040329, logits - tensor([[-6.7360,  1.9575, -4.8901, -1.8067],
        [-4.5831,  1.8141, -4.3274, -2.1526],
        [-5.4157,  1.9686, -4.4457, -1.8879],
        [-4.6394, -3.5086,  1.6266, -1.7747],
        [-6.8325, -0.3139, -5.1434,  0.1919],
        [-6.5631,  1.3583, -5.1632, -1.4373],
        [-5.0552,  1.7264, -4.2175, -1.3935],
        [-5.4741,  1.9031, -4.5930, -1.6939]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:28<02:08,  1.33it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1953149437904358, logits - tensor([[-4.8809,  1.5862, -4.1691, -1.4929],
        [-6.4210,  1.8829, -4.7940, -1.6512],
        [-4.6715,  1.1169, -4.1750, -1.5114],
        [-5.8309,  0.9790, -4.7360, -1.3759],
        [-5.3920,  1.5125, -4.1945, -1.1603],
        [-5.9105,  1.6876, -5.5110, -1.8677],
        [-4.9763, -2.9910,  1.4562, -1.8829],
        [-4.6031, -2.9874,  0.8503, -1.8789]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:29<02:07,  1.34it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22263124585151672, logits - tensor([[-5.6530,  2.0313, -4.6737, -1.2494],
        [-6.0044, -3.3725,  0.9897, -1.9743],
        [-4.8622,  1.6635, -3.6001, -1.6665],
        [-5.7804,  1.3232, -3.8929, -1.5235],
        [-6.3132,  1.8313, -5.2157, -2.0882],
        [-5.4028, -2.8665,  1.5977, -1.7591],
        [-5.5990,  2.1174, -5.0668, -1.9325],
        [-5.1702, -3.7624,  1.2301, -1.8447]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:30<02:07,  1.33it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4155929684638977, logits - tensor([[-4.9801,  1.3094, -4.2937, -1.7629],
        [-5.2033,  1.9092, -3.9715, -2.2746],
        [-6.2964,  1.7053, -4.9152, -0.9796],
        [-6.2684,  1.9011, -5.4145, -1.9586],
        [-5.2654,  1.8720, -3.6944, -1.6007],
        [-5.7407,  2.4359, -5.5291, -2.9981],
        [-5.4208,  1.5822, -4.5800, -1.5581],
        [-6.0433,  2.3183, -4.7854, -2.0050]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:30<02:07,  1.33it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4353731870651245, logits - tensor([[-5.4591,  1.2193, -4.2652, -1.5686],
        [-5.2488,  1.7029, -4.7386, -0.9092],
        [-6.0664,  1.7182, -3.5706, -1.2845],
        [-4.6588, -3.2202,  2.2672, -1.9369],
        [-5.8128,  1.6924, -4.1855, -0.7731],
        [-5.7677,  0.0319, -4.5989,  0.6753],
        [-5.6854,  2.0824, -4.4232, -1.5533],
        [-5.2411,  1.5893, -4.1546, -1.8193]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:31<02:06,  1.33it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 42%|████▏     | 122/289 [01:32<02:05,  1.33it/s]

loss - 0.19158527255058289, logits - tensor([[-5.7290,  1.6245, -4.6123, -2.3981],
        [-5.9013,  2.0550, -4.9997, -1.5084],
        [-6.1400,  1.5619, -4.6102, -1.5677],
        [-5.8565,  1.7576, -5.2305, -1.6883],
        [-4.9418, -1.6474, -4.3680,  1.5889],
        [-5.5960,  1.5012, -3.6130, -2.1547],
        [-5.6932,  2.5462, -5.3174, -1.4121],
        [-5.5147,  1.2059, -4.0306, -0.8547]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17679783701896667, logits - tensor([[-5.6109,  1.1761, -4.1000, -1.6784],
        [-6.3706,  1.6815, -5.3328, -1.3003],
        [-7.0875,  0.8407, -4.7032, -1.5463],
        [-5.5092,  1.8694, -5.1060, -2.0765],
        [-5.3950,  2.1766, -4.1948, -1.5866],
        [-5.9856,  1.5042, -4.3790, -1.5554],
        [-6.2041,  1.5014, -4.3037, -1.4560],
        [-5.9246,  1.8294, -4.6265, -1

 43%|████▎     | 123/289 [01:33<02:04,  1.33it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5238328576087952, logits - tensor([[-6.3067,  1.5829, -4.8805, -1.3492],
        [-5.7959, -0.9398, -4.0793,  1.1471],
        [-5.8791,  1.5775, -4.4142, -2.1766],
        [-4.6669, -3.2172,  1.4021, -1.7005],
        [-5.3318, -3.6985,  2.2457, -2.7368],
        [-4.7029, -2.4857,  1.9085, -2.1364],
        [-6.4907,  1.2481, -5.0231, -1.3197],
        [-6.0976,  0.5508, -4.6795, -1.3912]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:33<02:03,  1.33it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.176200732588768, logits - tensor([[-5.9206,  1.3286, -4.1680, -0.8389],
        [-6.6065,  1.8265, -4.9536, -2.2155],
        [-5.7927,  1.5057, -4.1036, -1.6162],
        [-5.3013,  1.2240, -4.0208, -1.8837],
        [-6.1665,  1.3146, -4.4231, -0.9362],
        [-6.1626,  0.9019, -4.3048, -0.9416],
        [-4.1378, -3.0291,  2.1601, -1.9294],
        [-5.1405,  1.4555, -3.1084, -1.1998]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:34<02:03,  1.33it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20320729911327362, logits - tensor([[-6.5127, -0.0200, -4.9651,  0.9184],
        [-5.0388,  1.6005, -4.5279, -0.6483],
        [-5.6192,  0.2736, -4.3464, -1.5217],
        [-5.9620,  1.9745, -5.0580, -1.3947],
        [-5.8690,  2.4625, -4.5903, -1.4355],
        [-6.3461,  0.9083, -5.1260, -1.6279],
        [-4.1169, -2.7632,  1.3438, -1.1617],
        [-6.6892,  1.0274, -4.5747, -1.5914]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:35<02:03,  1.32it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18478822708129883, logits - tensor([[-4.9950,  1.8482, -4.1734, -1.0194],
        [-5.4566,  1.5682, -4.5297, -1.4521],
        [-6.1975,  1.2850, -4.4482, -1.3090],
        [-6.1622,  1.2933, -5.2705, -1.0792],
        [-6.3794,  2.5029, -4.7114, -1.3220],
        [-5.5320,  1.0928, -4.2643, -0.9661],
        [-5.0024,  1.3166, -4.0729, -0.8504],
        [-5.8400,  0.6574, -3.9454, -1.0009]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:36<02:02,  1.32it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16167786717414856, logits - tensor([[-6.3680, -1.1817, -4.6468,  0.7076],
        [-5.0729,  1.2445, -4.2273, -1.7493],
        [-5.4226,  0.9009, -5.0729, -1.0943],
        [-6.1933,  1.2017, -4.7412, -0.4838],
        [-6.1848,  0.5938, -4.9732, -2.1047],
        [-6.7229,  1.0789, -5.2003, -1.2666],
        [-5.4799, -0.1593, -3.6145, -1.1366],
        [-5.4528, -4.0106,  1.2324, -1.5559]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:36<02:01,  1.32it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14292095601558685, logits - tensor([[-5.9159,  1.3091, -4.7830, -1.3775],
        [-3.9046, -2.9317,  2.1762, -2.4913],
        [-4.5784, -3.2404,  0.9560, -0.7404],
        [-5.9189,  0.4465, -5.0033, -1.3145],
        [-6.0945,  1.3349, -4.1431, -1.3358],
        [-7.2957,  1.3564, -5.0515, -2.4944],
        [-5.7055,  0.7661, -4.2165, -1.5955],
        [-5.8500, -0.5842, -5.0441,  0.4421]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:37<02:01,  1.32it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2019076645374298, logits - tensor([[-5.9727, -3.2419,  0.9521, -2.1447],
        [-5.7901, -3.6811,  1.8959, -1.8634],
        [-5.0500, -0.5544, -4.7033,  1.5387],
        [-5.0813, -3.2892,  1.6244, -2.0797],
        [-5.0910, -3.9501,  1.4309, -1.7893],
        [-6.9991,  1.1170, -5.0558, -1.1380],
        [-4.9222, -2.8886,  2.0289, -2.0276],
        [-5.9762,  1.5499, -5.7785, -1.4339]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 130/289 [01:38<02:00,  1.32it/s]

Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12178713828325272, logits - tensor([[-5.7161,  1.0387, -4.9296, -1.3297],
        [-5.5608, -3.1913,  1.6956, -1.7582],
        [-5.5109,  1.4091, -4.4179, -1.2330],
        [-6.8468, -1.7591, -4.1869,  1.5423],
        [-5.0537,  0.9407, -3.5311, -1.5493],
        [-5.9253,  0.7852, -3.7803, -1.2972],
        [-5.8497,  0.9072, -4.6397, -1.1416],
        [-5.7213, -1.4921, -4.5985,  2.5749]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▌     | 131/289 [01:39<01:58,  1.33it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2813112139701843, logits - tensor([[-6.3300,  2.0198, -4.9597, -1.6599],
        [-5.5659, -1.1736, -4.6007,  1.7052],
        [-6.0660,  1.0657, -4.6143, -0.7168],
        [-5.9545,  1.3590, -4.3357, -0.8469],
        [-5.6320,  1.3829, -4.9041, -1.0931],
        [-5.5838,  0.9595, -4.7329, -1.9333],
        [-5.7736,  1.1989, -3.6703, -0.7080],
        [-5.9311,  1.4213, -5.0438, -0.8742]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:39<01:57,  1.33it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3267289996147156, logits - tensor([[-6.8078,  0.9548, -4.4146, -0.8200],
        [-5.6737,  0.9764, -4.3809, -1.0515],
        [-6.6357, -0.1576, -5.1983,  0.4768],
        [-4.5803,  1.1575, -4.7073, -0.7905],
        [-6.8438,  0.9940, -5.0776, -0.9681],
        [-4.6940, -3.8230,  1.6279, -2.1923],
        [-5.1158, -3.5331,  1.4116, -2.1466],
        [-6.1792,  1.3778, -4.6705, -0.9314]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 133/289 [01:40<01:57,  1.33it/s]

Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4398587942123413, logits - tensor([[-5.7944,  0.1660, -4.5040, -1.0776],
        [-6.6338,  1.3315, -4.4151, -0.7799],
        [-5.9902,  1.2497, -4.7492, -1.1943],
        [-4.3403, -3.3328,  2.8945, -2.1361],
        [-5.1792,  1.4225, -4.6374, -1.8279],
        [-5.7578, -1.9901, -3.9395,  1.1875],
        [-4.8761,  0.2533, -3.6409, -0.6157],
        [-5.4991,  1.1886, -4.3559, -1.5570]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▋     | 134/289 [01:41<01:56,  1.33it/s]

Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.353769987821579, logits - tensor([[-5.9162,  1.2858, -4.3235, -0.9976],
        [-5.1041,  1.3900, -4.8014, -0.8035],
        [-5.2728,  1.5048, -4.0531, -0.6380],
        [-5.7629,  1.1345, -3.9155, -1.3800],
        [-6.7137,  1.9356, -5.4188, -1.6855],
        [-6.1029,  1.3812, -5.5656, -1.6322],
        [-5.9455,  0.4808, -4.9871, -1.8576],
        [-6.5215,  1.4374, -4.2388, -1.2739]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 135/289 [01:42<01:55,  1.33it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13007253408432007, logits - tensor([[-5.7197, -2.1081, -4.8257,  2.2170],
        [-6.5329,  1.6636, -4.0446, -1.7034],
        [-6.7167,  1.5507, -5.0748, -1.4420],
        [-6.2286,  1.1242, -4.2983, -1.6067],
        [-5.6152,  1.6622, -4.3781, -0.7830],
        [-5.6213,  1.1647, -4.4061, -0.5209],
        [-5.7544, -1.2037, -4.6144,  0.4343],
        [-6.5691,  0.8095, -4.3674, -1.4805]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:42<01:54,  1.34it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31891196966171265, logits - tensor([[-5.2144, -3.6541,  1.6275, -1.2149],
        [-5.6670,  1.2502, -4.2502, -1.1702],
        [-6.0964,  0.9887, -4.1954, -1.7949],
        [-6.7313, -2.1492, -4.1354,  3.3463],
        [-6.7023,  1.3178, -5.0117, -1.0979],
        [-4.8021, -4.0304,  2.5065, -1.9259],
        [-6.0107,  0.6023, -4.3446, -1.1904],
        [-5.4820,  1.1143, -3.8780, -1.4533]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 137/289 [01:43<01:53,  1.34it/s]

Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2130366563796997, logits - tensor([[-5.7467,  1.2325, -4.4679, -1.2945],
        [-5.5417,  0.9388, -3.9370, -1.3364],
        [-6.1170,  1.3320, -4.3296, -0.2187],
        [-5.6563, -1.4021, -3.5089,  1.1691],
        [-4.4570, -2.8656,  1.3676, -1.6673],
        [-6.0697,  1.1940, -4.1039, -0.8100],
        [-5.7738,  1.4645, -4.7132, -0.7808],
        [-5.3690,  1.7400, -4.7494, -1.4043]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 138/289 [01:44<01:53,  1.33it/s]

Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2381318211555481, logits - tensor([[-5.9309,  1.8647, -5.4018, -2.1794],
        [-6.3966,  0.5782, -4.2755, -0.6716],
        [-4.2753, -2.8581,  1.5496, -2.1496],
        [-4.9876, -3.4640,  2.2327, -1.9660],
        [-5.9141,  0.9425, -5.5211, -1.1964],
        [-6.0928, -1.4244, -4.5723,  1.5093],
        [-4.1910, -2.3128,  1.4571, -1.4726],
        [-5.8585,  1.5780, -3.5356, -1.0371]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 139/289 [01:45<01:52,  1.33it/s]

Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20030823349952698, logits - tensor([[-5.9551,  1.9288, -4.3824, -0.2931],
        [-5.8864,  1.8208, -4.8764, -2.0576],
        [-5.4294, -1.2451, -4.2832,  1.1965],
        [-6.4402,  1.5884, -4.6084, -1.3570],
        [-6.0384,  1.3421, -4.6002, -1.1368],
        [-5.8198,  1.4474, -4.5198, -1.2266],
        [-5.2338, -2.7284,  1.5720, -2.1993],
        [-5.4702,  1.4250, -4.0046, -0.7860]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 140/289 [01:45<01:51,  1.33it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2167576253414154, logits - tensor([[-5.5388, -3.1344,  0.9011, -2.1540],
        [-4.7690, -0.7134, -4.1879,  0.7254],
        [-5.1883,  2.0078, -4.5124, -1.5050],
        [-4.4279,  0.9566, -4.3872, -0.9821],
        [-5.3313,  2.0424, -4.4237, -1.1461],
        [-6.6608,  1.8706, -4.7866, -1.8290],
        [-5.0146, -2.5955,  1.7176, -2.3795],
        [-4.4030, -2.5701,  1.3869, -1.5141]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:46<01:51,  1.33it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2277442067861557, logits - tensor([[-5.9647,  1.7811, -4.8015, -2.4870],
        [-6.8952,  2.0683, -4.9858, -1.5546],
        [-6.4227,  1.6586, -4.2899, -1.6799],
        [-5.2149,  1.9177, -4.5854, -1.6446],
        [-5.6013,  0.8712, -4.2426, -1.3320],
        [-5.5112,  1.3303, -4.2072, -1.1991],
        [-6.2506,  2.0391, -4.6230, -1.2395],
        [-5.9416,  2.1837, -4.8225, -1.8891]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:47<01:50,  1.33it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20097124576568604, logits - tensor([[-5.3242,  1.2698, -4.3503, -1.6390],
        [-5.7363,  1.2905, -4.3041, -1.5101],
        [-6.3157,  1.2055, -3.5800, -1.5484],
        [-5.4882,  1.1960, -3.8986, -0.9756],
        [-4.1798, -1.8580,  1.1792, -1.6097],
        [-6.0364,  1.2232, -4.4366, -2.1972],
        [-5.8137,  1.9050, -4.0375, -2.0131],
        [-4.6952, -2.7387,  2.2098, -2.4925]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 143/289 [01:48<01:49,  1.33it/s]

Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2882765531539917, logits - tensor([[-4.9678, -2.1404,  2.0963, -2.2236],
        [-4.1544, -2.3585,  1.6277, -2.4144],
        [-6.4060,  1.2262, -5.3961, -1.3514],
        [-5.7179,  1.8496, -4.7445, -1.8949],
        [-6.0960,  1.9503, -3.9881, -1.4793],
        [-6.3720,  0.7556, -4.5353, -1.2065],
        [-4.3910, -1.8348,  1.1838, -2.0389],
        [-5.7805,  1.3857, -5.2762, -1.4047]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|████▉     | 144/289 [01:48<01:49,  1.33it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21354363858699799, logits - tensor([[-5.4863,  2.5061, -4.6042, -1.2412],
        [-4.8763, -3.3157,  1.5953, -2.3206],
        [-6.7766,  2.0281, -5.1688, -1.9003],
        [-4.8773, -2.4426,  1.2626, -2.0610],
        [-4.5347,  0.9383, -3.6194, -1.3825],
        [-6.2031,  1.8281, -5.4363, -2.5516],
        [-5.7797,  2.1097, -4.9513, -0.6937],
        [-5.0441, -2.5455, -3.9727,  2.6610]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|█████     | 145/289 [01:49<01:48,  1.32it/s]

Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06590880453586578, logits - tensor([[-4.7582, -1.9918, -3.7583,  2.5135],
        [-4.4726, -2.6968,  1.8183, -2.8696],
        [-5.3897, -2.7184, -3.9775,  2.6698],
        [-4.5964, -2.8084,  2.3904, -2.0686],
        [-5.9672, -2.2648, -4.6253,  1.6302],
        [-5.7254,  2.0386, -4.5743, -1.6965],
        [-6.2948,  1.6159, -5.0909, -1.8024],
        [-4.7257, -2.1450, -3.9483,  2.4153]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 146/289 [01:50<01:48,  1.32it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19656093418598175, logits - tensor([[-5.1367,  1.6854, -4.4620, -1.6620],
        [-5.9530,  2.5538, -4.9327, -1.2629],
        [-4.2668, -2.5587,  1.5321, -2.4343],
        [-5.8285,  2.2583, -5.3998, -2.4892],
        [-6.4667,  2.0221, -4.7150, -1.9775],
        [-5.6540,  1.4605, -4.4687, -2.2567],
        [-5.2903,  2.5523, -4.9467, -1.9116],
        [-5.5185,  1.2714, -4.8195, -1.8088]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 147/289 [01:51<01:47,  1.32it/s]

Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18088047206401825, logits - tensor([[-5.5527,  1.9332, -3.8070, -0.9632],
        [-6.2864, -0.9827, -4.6346,  0.9830],
        [-5.5241,  1.8602, -4.7533, -2.2553],
        [-4.8588, -1.3352, -4.2851,  1.5350],
        [-5.3355,  2.0218, -5.1943, -2.6158],
        [-6.0016,  2.2309, -5.0635, -2.1143],
        [-5.1202,  1.3903, -4.5063, -1.2974],
        [-5.7582, -2.3427, -4.1468,  2.8192]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 148/289 [01:51<01:47,  1.32it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30144986510276794, logits - tensor([[-6.0574,  2.3019, -5.4387, -2.3920],
        [-5.9293, -1.8986, -3.8384,  1.3972],
        [-4.2594, -2.0044,  1.1630, -2.5105],
        [-5.9691,  1.2622, -4.5192, -2.1314],
        [-5.4969,  1.4642, -4.4769, -2.0925],
        [-5.1919,  1.4557, -4.7838, -1.7705],
        [-6.7216,  1.8811, -4.8942, -1.6708],
        [-4.2522, -1.9185,  1.7805, -2.5029]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:52<01:46,  1.32it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38554126024246216, logits - tensor([[-4.1556, -2.0777,  1.3356, -1.8056],
        [-6.4670,  3.1285, -5.4416, -2.2885],
        [-6.8840,  1.4348, -5.3358, -1.6045],
        [-4.5732, -2.3562, -4.4756,  2.3200],
        [-4.9297, -2.5159,  1.4512, -2.0843],
        [-4.3618, -2.2063,  1.1804, -2.5414],
        [-6.2443,  1.6492, -4.9578, -2.3150],
        [-5.9458,  1.1895, -4.2373, -1.6966]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:53<01:45,  1.32it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2017013132572174, logits - tensor([[-5.8195,  2.0793, -4.3002, -1.5282],
        [-5.4650,  2.0627, -4.3728, -1.9997],
        [-4.2761, -2.0918,  1.5923, -1.8966],
        [-5.5916,  1.3641, -4.8854, -2.3584],
        [-5.4771,  1.9525, -4.6745, -1.9173],
        [-6.4378,  1.7287, -5.0797, -2.0552],
        [-6.0519,  1.6469, -4.8395, -2.0747],
        [-5.5981,  2.6895, -4.8959, -0.9197]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:54<01:43,  1.33it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43854135274887085, logits - tensor([[-6.4963,  1.8684, -4.8316, -2.7799],
        [-6.1888,  1.5244, -4.6917, -1.7403],
        [-5.2759,  1.9622, -5.4160, -1.8889],
        [-5.9795,  2.0050, -4.2861, -2.0010],
        [-6.3719,  2.0134, -5.4459, -1.2818],
        [-6.5289,  2.5797, -5.0822, -1.9678],
        [-5.4098,  1.4895, -4.4592, -2.1862],
        [-4.6450, -2.8869,  0.2808, -0.3411]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:54<01:42,  1.33it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21371161937713623, logits - tensor([[-6.5500,  1.4110, -4.6198, -1.6152],
        [-5.5668, -2.1832, -3.5389,  1.8188],
        [-6.7329,  1.6080, -5.5968, -2.5464],
        [-6.0318,  1.3599, -5.0033, -1.4257],
        [-6.0652,  1.7965, -5.3888, -1.7271],
        [-6.8063,  2.9003, -4.9438, -1.5871],
        [-5.8340,  1.3565, -4.5253, -1.4559],
        [-7.0098,  2.3599, -4.9039, -1.8558]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:55<01:42,  1.33it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.6246262788772583, logits - tensor([[-6.6879,  2.0791, -4.9264, -1.8219],
        [-5.1250, -1.8444, -3.8067,  1.1535],
        [-5.6205,  0.9237, -3.9601, -1.2048],
        [-5.2808, -3.6071, -4.2620,  2.3705],
        [-6.2393,  1.9865, -4.5583, -1.5970],
        [-6.7837,  2.4082, -4.8910, -2.4092],
        [-5.2402, -1.9479, -3.9278,  1.3877],
        [-5.7935,  2.1235, -4.2934, -2.2579]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 154/289 [01:56<01:41,  1.33it/s]

Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2123105823993683, logits - tensor([[-5.7712,  1.6954, -4.5561, -1.5249],
        [-5.2099,  1.9337, -4.9201, -1.5119],
        [-5.0343,  1.9285, -4.5040, -2.6503],
        [-6.9868,  1.0021, -4.3896, -1.1093],
        [-4.9151,  1.3227, -3.4889, -1.2251],
        [-6.1503,  1.8802, -5.8815, -1.7611],
        [-4.7424, -2.9035,  1.1960, -2.2667],
        [-6.1499,  1.0905, -4.4363, -1.3784]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▎    | 155/289 [01:57<01:40,  1.33it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.306363046169281, logits - tensor([[-5.7711,  0.5141, -4.2584, -0.8181],
        [-4.5458, -2.5035,  1.5180, -2.4803],
        [-3.9116, -2.6315,  1.7299, -2.9474],
        [-6.2700,  1.7277, -5.2669, -1.8455],
        [-4.4359, -2.0586,  1.7466, -2.3146],
        [-6.1743,  1.5703, -4.5512, -1.5211],
        [-5.1131,  1.0146, -4.3421, -1.0725],
        [-5.1892,  0.8075, -3.7889, -1.8306]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:57<01:39,  1.33it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25303351879119873, logits - tensor([[-6.8728,  0.9318, -5.6829, -0.9109],
        [-6.3831,  1.2285, -4.2344, -1.8930],
        [-6.7603,  1.5366, -4.4883, -1.3766],
        [-5.8470,  0.7540, -5.1757, -1.5927],
        [-5.4653,  0.8562, -4.6354, -1.2816],
        [-5.4580,  0.6469, -4.4893, -1.5945],
        [-3.6818, -2.2877,  1.8794, -1.6023],
        [-6.8347,  1.5657, -4.1122, -1.9824]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:58<01:39,  1.33it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19973543286323547, logits - tensor([[-5.8170,  1.5918, -3.9447, -1.4995],
        [-6.2033,  0.7710, -4.9822, -1.4139],
        [-5.9018,  0.1356, -4.6066, -1.0873],
        [-4.6451, -2.9927,  1.8530, -2.4133],
        [-6.2755,  0.9344, -4.6621, -1.2409],
        [-6.1667,  2.1107, -4.1791, -1.1553],
        [-6.1708,  0.9534, -4.4425, -1.0316],
        [-6.0913,  1.0737, -3.8712, -0.7221]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:59<01:38,  1.33it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3606681227684021, logits - tensor([[-6.4213,  1.0750, -3.8498, -1.0935],
        [-6.1635,  0.9780, -4.5968, -0.9170],
        [-6.4667,  0.6990, -4.6503, -0.8246],
        [-6.4640,  1.4067, -5.3330, -1.1796],
        [-6.2736,  1.1133, -4.6839, -1.6357],
        [-5.6041,  0.2348, -4.3888, -1.4195],
        [-6.1254,  0.9658, -5.1209, -0.6202],
        [-6.2219,  0.2636, -4.5548, -0.7856]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [02:00<01:37,  1.33it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2775707542896271, logits - tensor([[-6.6137,  0.2082, -4.8566, -1.1819],
        [-6.5521,  0.3091, -5.0000, -1.1757],
        [-7.4016,  0.4068, -4.6803, -0.9768],
        [-5.9847,  1.5094, -4.5748, -1.3227],
        [-6.9533,  0.8468, -4.6898, -0.7117],
        [-4.7871, -2.7233,  2.3973, -2.7013],
        [-6.6825,  1.3473, -5.0433, -1.0776],
        [-6.3584,  0.9591, -4.5507, -1.3933]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [02:00<01:36,  1.33it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30677729845046997, logits - tensor([[-6.2309, -0.2639, -5.3070,  0.2742],
        [-6.1038,  1.3421, -5.2021, -1.2162],
        [-5.9968, -0.5329, -3.6052,  0.5598],
        [-5.3490,  1.7636, -3.3376, -1.1596],
        [-7.4898,  0.8109, -5.2142, -0.8895],
        [-5.8257,  1.2193, -4.7273, -0.7403],
        [-6.6288, -0.4378, -4.6467, -0.5777],
        [-4.0940, -3.2604, -3.2977,  2.1563]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:01<01:36,  1.33it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23530671000480652, logits - tensor([[-6.6205, -0.2756, -4.2622, -0.3733],
        [-4.8961, -2.5854,  1.5310, -1.7519],
        [-7.3398,  0.9064, -4.7420, -1.8996],
        [-6.5856,  0.2318, -4.8678, -0.2038],
        [-6.0508,  0.1022, -4.8244, -0.8963],
        [-7.8709, -0.5181, -4.7451, -0.8352],
        [-5.6967,  1.0514, -3.6164, -1.1874],
        [-7.3822,  1.2121, -5.0248, -0.7489]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:02<01:35,  1.33it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 56%|█████▋    | 163/289 [02:03<01:34,  1.33it/s]

loss - 0.2808243930339813, logits - tensor([[-6.6020,  0.6690, -4.5075, -1.3495],
        [-5.9935, -1.4955, -3.8721,  0.9754],
        [-5.9707,  0.4836, -4.3396, -0.6399],
        [-5.3200, -3.3226, -4.0328,  2.0708],
        [-5.7241, -0.3672, -3.9505,  0.1930],
        [-6.3431,  0.9388, -4.5830, -0.8760],
        [-4.1210, -2.2298,  1.7778, -2.0883],
        [-5.5099, -1.8216, -3.6855,  1.7342]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3040108382701874, logits - tensor([[-6.3087,  1.1219, -5.1221, -0.9398],
        [-4.8328, -2.7240, -3.7889,  2.8028],
        [-7.2684,  0.1205, -4.3023, -0.8933],
        [-5.0006, -3.2070, -4.1523,  2.7422],
        [-5.6227,  0.3605, -3.7442, -0.5503],
        [-5.8530,  0.1193, -4.7641, -0.3284],
        [-6.4142,  0.5856, -4.5660, -1.4921],
        [-4.1328, -2.7549,  1.7618, -1.5

 57%|█████▋    | 164/289 [02:04<01:34,  1.32it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5877867341041565, logits - tensor([[-6.4276,  0.0827, -4.0799, -0.9742],
        [-7.9913,  0.1037, -4.5023, -0.3026],
        [-5.9217,  0.7004, -4.6373, -0.9212],
        [-6.5993,  0.1706, -4.4076, -0.5243],
        [-7.3082, -0.1717, -4.4086, -1.3330],
        [-4.7387, -2.4029,  2.3939, -2.0173],
        [-5.7612, -3.3128, -3.6730,  2.9147],
        [-6.0012, -0.5321, -3.9581, -0.4306]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 165/289 [02:04<01:33,  1.32it/s]

Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23421189188957214, logits - tensor([[-5.4443,  1.5014, -4.5748, -1.3549],
        [-4.9462, -2.6761, -4.1053,  1.6449],
        [-6.5802,  0.9170, -4.2152, -1.1074],
        [-6.0896,  1.2066, -3.3796, -0.5741],
        [-7.0740,  0.4830, -5.4770, -0.8867],
        [-6.5033, -1.2157, -5.0310,  1.6918],
        [-6.6866,  0.3607, -5.2319, -0.0448],
        [-6.6780,  0.6412, -4.5042, -1.0288]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 166/289 [02:05<01:33,  1.32it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5392720103263855, logits - tensor([[-6.4318,  0.3369, -4.5171, -1.5006],
        [-6.5920,  0.4528, -4.1070, -1.3034],
        [-4.1264, -2.9811,  1.6530, -1.8858],
        [-6.3818,  0.4030, -4.6309, -1.3234],
        [-3.9593, -2.4392,  2.0702, -1.7812],
        [-5.9713, -1.6251, -3.7754,  2.2498],
        [-6.1415, -1.8081, -3.8423,  2.0886],
        [-6.5154,  0.2100, -3.8278, -0.8258]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:06<01:32,  1.32it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4771915078163147, logits - tensor([[-6.5827,  0.6810, -4.4577, -1.3099],
        [-6.3983,  0.5518, -4.4434, -1.0825],
        [-6.4322,  0.6139, -5.0097, -0.8337],
        [-6.4990,  0.2290, -4.2619, -0.8660],
        [-6.5396,  0.3059, -3.7119, -0.5362],
        [-6.2662, -2.0822, -4.1203,  1.4543],
        [-5.8388,  0.5730, -3.1164, -1.5143],
        [-5.9859, -2.8992,  1.7647, -2.4875]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:07<01:32,  1.31it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3756062984466553, logits - tensor([[-5.3694, -3.3343, -4.1858,  3.2486],
        [-7.6582,  1.1254, -4.9078, -0.9107],
        [-5.0561,  0.4630, -3.4121, -0.6426],
        [-4.6155, -2.7495,  1.6994, -2.5731],
        [-5.7246,  0.8405, -4.4633, -1.4699],
        [-5.2098, -2.4075,  1.7378, -2.0973],
        [-5.0825, -3.4595, -3.8590,  3.3653],
        [-6.6407,  0.8843, -4.1567, -1.4308]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:07<01:30,  1.32it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2533363997936249, logits - tensor([[-6.2240, -2.6450, -4.8006,  2.0765],
        [-5.0858,  0.1343, -3.7813, -0.9119],
        [-6.6702,  0.1499, -4.6002,  0.9047],
        [-7.1436,  1.1265, -3.9930, -1.4806],
        [-6.0097,  0.9245, -3.9930, -0.8962],
        [-5.8227,  1.1904, -3.7966, -0.7727],
        [-5.3312,  0.9182, -2.9563, -0.6912],
        [-5.0009, -2.6231,  1.3288, -2.3186]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:08<01:29,  1.32it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22620084881782532, logits - tensor([[-4.6728, -2.7738,  1.8057, -1.6896],
        [-4.2754, -1.6893,  1.6647, -1.8110],
        [-4.9786, -2.6264,  1.3958, -1.6058],
        [-5.3217, -2.3502, -3.6570,  2.1314],
        [-5.2748, -3.8751, -3.9127,  3.1178],
        [-5.9355,  0.2817, -3.8732, -1.5059],
        [-4.1188, -2.5866,  0.8474, -1.6592],
        [-5.8536,  0.7741, -3.6089, -0.7840]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:09<01:28,  1.33it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19774021208286285, logits - tensor([[-6.4229,  0.1462, -5.4176, -0.7486],
        [-5.3415, -2.4531, -3.7703,  1.7748],
        [-3.8381, -2.2102,  1.0266, -1.7153],
        [-5.6860,  0.6459, -2.5486, -0.6757],
        [-6.3854,  1.1084, -4.1113, -0.7840],
        [-5.3941,  1.4736, -3.9448, -1.1869],
        [-5.2073,  1.5288, -3.4028, -1.3674],
        [-6.2601,  1.1717, -4.8164, -1.0578]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 172/289 [02:10<01:27,  1.33it/s]

Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31207042932510376, logits - tensor([[-6.8809,  0.3003, -4.5148,  0.1770],
        [-6.2820,  0.0963, -5.3354,  1.0495],
        [-5.7929,  1.3002, -4.4665, -1.3021],
        [-6.1389,  1.2331, -4.7622, -1.3898],
        [-6.7292, -0.1625, -5.7607,  0.9832],
        [-4.3313, -1.9892,  1.1065, -1.9829],
        [-5.6960,  1.6008, -4.9487, -1.1324],
        [-6.3851,  1.2239, -3.8043, -0.3881]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 173/289 [02:10<01:27,  1.33it/s]

Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40990447998046875, logits - tensor([[-6.2443, -2.7257, -3.8831,  2.2052],
        [-4.8656, -2.7167,  2.0315, -2.4725],
        [-6.6979,  1.8165, -4.4031, -1.6530],
        [-4.5629, -1.7388, -3.8760,  2.3681],
        [-5.8866,  1.9053, -5.0463, -1.0815],
        [-4.6666, -2.1780,  1.6687, -2.1935],
        [-6.1726,  1.3525, -4.2361, -1.7625],
        [-5.8207,  1.7364, -3.8547, -1.6732]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|██████    | 174/289 [02:11<01:26,  1.33it/s]

Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3986068367958069, logits - tensor([[-5.8158, -1.7168, -3.6791,  3.4061],
        [-3.6151, -1.6483,  1.3001, -1.9073],
        [-4.8813,  1.2470, -3.7938, -0.4748],
        [-4.7466,  0.6033, -3.4357, -1.3009],
        [-7.4461, -1.6874, -4.9944,  2.1467],
        [-5.7735,  1.8433, -4.5886, -1.5191],
        [-6.1927,  1.5149, -3.8190, -1.0689],
        [-5.7349,  1.7600, -3.6190, -1.7072]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 175/289 [02:12<01:25,  1.33it/s]

Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3636951446533203, logits - tensor([[-6.0726,  1.6840, -3.4967, -0.6947],
        [-5.6965,  1.4999, -4.3568, -1.1379],
        [-6.3947,  1.5236, -4.5415, -1.5703],
        [-5.3993,  1.8264, -3.1090, -1.3806],
        [-4.6973, -2.0944, -3.9930,  2.2116],
        [-5.8717,  1.2220, -4.3095, -0.9839],
        [-6.0327,  1.0824, -4.0093, -0.6137],
        [-6.0731,  2.2091, -4.2385, -1.4097]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 176/289 [02:13<01:24,  1.33it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19926992058753967, logits - tensor([[-4.5865, -2.1245,  0.7257, -2.2808],
        [-5.3986,  2.1616, -4.3087, -2.0573],
        [-5.4850,  1.9526, -3.9405, -0.9614],
        [-6.4746,  1.3261, -3.9824, -1.2738],
        [-5.8199,  1.6612, -4.8438, -1.0956],
        [-5.1710, -2.0434, -4.1967,  2.6194],
        [-6.0214,  1.0739, -4.1604, -1.1569],
        [-6.5857,  2.0052, -4.2964, -1.4748]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:13<01:24,  1.33it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23291879892349243, logits - tensor([[-5.8333,  1.8118, -4.0147, -1.5927],
        [-6.5824,  1.1341, -5.1450, -0.2136],
        [-5.8910,  1.6835, -4.3567, -1.4181],
        [-6.0912,  1.6114, -4.0903, -1.9824],
        [-6.9955,  2.2529, -5.2536, -1.8686],
        [-5.4671, -2.4202,  1.4343, -1.9428],
        [-6.5457,  2.0528, -4.8221, -1.7548],
        [-5.3531,  2.0891, -4.5767, -1.2327]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:14<01:23,  1.33it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20104113221168518, logits - tensor([[-6.2587, -0.8151, -3.7029,  0.9680],
        [-6.1663,  1.0488, -4.9061, -0.8245],
        [-5.0928, -1.6848, -4.7263,  2.2486],
        [-4.2479, -1.8654,  1.1063, -1.8719],
        [-5.7539,  1.3774, -4.0484, -0.3610],
        [-4.8788,  0.9056, -4.8197, -1.6142],
        [-5.9054, -1.4214, -4.1643,  2.1628],
        [-3.8986, -2.4084,  0.9288, -1.0642]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 179/289 [02:15<01:22,  1.33it/s]

Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5222175121307373, logits - tensor([[-5.0683,  2.0176, -3.8981, -2.0919],
        [-5.6151,  1.2058, -4.5989, -1.1437],
        [-4.7542, -2.6150,  1.1004, -1.3756],
        [-6.4248,  0.2929, -4.6403, -0.4190],
        [-6.5676,  2.3136, -4.6551, -1.2807],
        [-5.1751,  0.1321, -4.6839,  1.3138],
        [-6.1853,  2.1393, -4.2074, -1.4668],
        [-5.2692,  1.3091, -2.2857, -1.7354]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 180/289 [02:16<01:21,  1.34it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47477054595947266, logits - tensor([[-5.8038,  1.2578, -3.3755, -0.9640],
        [-6.7374,  1.8258, -3.8574, -1.6468],
        [-5.2734,  1.5873, -3.5961, -1.1614],
        [-6.5737,  1.1755, -4.9457, -1.1884],
        [-5.9955,  1.3141, -4.2890, -1.7945],
        [-6.3767,  1.9213, -4.7687, -0.7825],
        [-6.8309,  0.9417, -4.3648, -1.6326],
        [-5.0854,  1.6356, -3.9467, -1.2312]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:16<01:21,  1.33it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30547237396240234, logits - tensor([[-6.3600,  1.8524, -4.2290, -1.5175],
        [-5.4551,  1.1283, -4.4486, -0.4559],
        [-6.6480,  1.4279, -3.4929, -1.3522],
        [-6.4311,  1.3609, -4.5086, -0.6499],
        [-5.6785,  1.0894, -4.3457, -0.9567],
        [-6.6684,  1.4536, -5.4488, -0.6024],
        [-5.1416,  0.7162, -3.6152, -0.5823],
        [-5.3991,  1.3969, -3.1735, -1.3505]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:17<01:20,  1.33it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19916778802871704, logits - tensor([[-6.0222,  1.6302, -4.1734, -2.0482],
        [-5.2553,  1.2183, -4.6467, -0.9473],
        [-6.4846,  1.1942, -4.2654, -0.8706],
        [-5.9994,  1.5747, -4.5317, -0.6957],
        [-5.2808, -0.8675, -4.7868,  1.3204],
        [-6.0245,  1.8595, -5.3176, -2.0130],
        [-4.9799, -2.2875,  0.6622, -1.6933],
        [-6.4215,  1.3858, -4.0297, -1.2879]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:18<01:20,  1.32it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1572994589805603, logits - tensor([[-5.6002,  1.2822, -3.3274, -1.2323],
        [-6.6050,  1.8956, -4.4795, -0.7831],
        [-5.5060,  1.8866, -4.1129, -0.8272],
        [-6.2089,  1.6689, -3.6575, -1.0382],
        [-3.8200, -1.1000,  0.1809, -1.5426],
        [-7.0741,  1.2522, -4.3648, -0.5455],
        [-5.8668,  1.4935, -4.5434, -0.5287],
        [-5.8270,  1.8356, -4.4643, -1.1963]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:19<01:19,  1.32it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2506240904331207, logits - tensor([[-6.1934,  1.0372, -4.3937, -1.2552],
        [-6.9234,  0.9884, -4.7947, -0.5676],
        [-5.1041, -1.7130, -4.4728,  0.9798],
        [-6.4505, -1.1588, -4.8233,  1.4701],
        [-5.3221,  0.9853, -3.4084, -1.0815],
        [-6.5131,  0.3523, -4.8621, -0.5457],
        [-5.2167, -3.0199,  0.6616, -1.9612],
        [-6.0420,  0.7835, -3.9841, -0.6522]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:19<01:18,  1.32it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3404543995857239, logits - tensor([[-6.5772,  1.1541, -3.9006, -1.6394],
        [-6.1608,  1.5171, -4.9344, -1.0893],
        [-5.4288,  1.5519, -3.7436, -1.2567],
        [-6.3796,  0.9267, -4.3240, -1.6779],
        [-6.1505,  0.7583, -3.8777, -1.3021],
        [-5.9425,  0.7604, -3.5487, -1.0612],
        [-6.9131,  1.6923, -4.0816, -1.0566],
        [-6.2822,  1.8050, -4.7793, -1.2503]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:20<01:18,  1.32it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24921131134033203, logits - tensor([[-6.5854,  1.4185, -4.2744, -1.0563],
        [-6.9910,  1.5912, -5.4431, -1.5598],
        [-6.0239,  1.1961, -4.1044, -1.3455],
        [-5.8359,  0.0515, -4.0484,  0.1728],
        [-6.5774,  0.9251, -5.1209, -1.4627],
        [-5.4931, -0.5025, -3.3952,  1.2981],
        [-5.8231,  1.3450, -4.4160, -1.6416],
        [-5.1673, -2.0887,  0.2574, -2.0984]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:21<01:17,  1.32it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11958642303943634, logits - tensor([[-6.8118,  1.8494, -4.8022, -1.8336],
        [-5.4231,  0.9691, -3.4221, -1.2461],
        [-5.8762,  1.5643, -4.2628, -0.7302],
        [-4.9220, -3.0158,  1.2039, -2.1294],
        [-6.4437,  1.1912, -3.6503, -1.1662],
        [-6.1946,  1.3312, -5.1444, -1.9439],
        [-6.5032,  1.2759, -4.6582, -1.8610],
        [-5.5433,  1.0482, -4.5117, -1.3754]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 188/289 [02:22<01:16,  1.32it/s]

Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36333131790161133, logits - tensor([[-6.8632,  1.1739, -3.9979, -1.2047],
        [-5.5642,  0.2887, -3.6202, -1.3604],
        [-4.4678, -2.2841,  0.7612, -1.8581],
        [-6.0558, -0.3775, -1.9097, -1.4572],
        [-7.3647,  1.3379, -5.1818, -0.7340],
        [-7.6761, -0.9348, -5.5382,  0.8623],
        [-6.9759, -1.0907, -4.8499,  0.0216],
        [-5.6750,  0.0404, -3.3807, -1.3752]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 189/289 [02:22<01:15,  1.32it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21310102939605713, logits - tensor([[-5.7064,  1.2695, -4.7737, -1.6238],
        [-6.6986, -1.3425, -4.5731,  1.0466],
        [-4.2133, -1.9921,  1.5915, -1.0535],
        [-3.9534, -1.8487,  0.2468, -1.1150],
        [-7.0390,  1.0935, -4.7818, -0.6906],
        [-6.6544,  1.0357, -4.1579, -1.4587],
        [-5.2958, -1.9982,  1.2119, -0.9669],
        [-6.1055,  1.4725, -4.7546, -1.3446]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:23<01:14,  1.32it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18383415043354034, logits - tensor([[-5.5872,  0.7477, -4.2169, -1.5581],
        [-6.5540,  1.7025, -4.4785, -0.7131],
        [-7.1627,  1.5153, -4.9053, -1.1379],
        [-6.6536, -0.7620, -4.8021,  0.8395],
        [-6.0684,  1.1786, -5.1993, -1.2093],
        [-5.9179,  0.7862, -3.9969, -1.3420],
        [-6.6810,  1.6299, -4.7608, -1.7040],
        [-5.9903,  0.4195, -3.1424, -0.5084]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 191/289 [02:24<01:14,  1.32it/s]

Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28497445583343506, logits - tensor([[-6.6095,  0.8005, -4.2664, -1.0837],
        [-6.6682,  0.2788, -4.6990, -1.0779],
        [-6.0867,  1.4925, -4.3904, -1.8438],
        [-6.5815, -0.1461, -3.7875,  0.0754],
        [-6.4159,  1.6382, -4.8467, -1.4243],
        [-5.2915,  0.7226, -3.9451, -1.5528],
        [-6.0326,  1.9115, -4.3362, -1.3343],
        [-5.4406,  0.2706, -4.1843, -0.0288]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▋   | 192/289 [02:25<01:13,  1.32it/s]

Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25484520196914673, logits - tensor([[-6.5339,  1.6375, -4.6586, -1.0394],
        [-5.9713,  0.7228, -3.3302, -1.3856],
        [-4.4291, -2.6445,  0.6129, -1.5894],
        [-5.4236,  1.3809, -3.8204, -1.4786],
        [-4.1059, -2.5291,  0.7506, -1.3851],
        [-6.3851,  0.8241, -4.1297, -0.6983],
        [-4.6402, -2.6157,  1.3748, -2.6455],
        [-7.4924,  1.6647, -5.2342, -1.8032]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 193/289 [02:25<01:12,  1.33it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23151911795139313, logits - tensor([[-5.8257,  1.1079, -5.1570, -0.9129],
        [-4.7262, -2.6941,  0.6282, -1.6289],
        [-6.7314, -0.4103, -4.8893,  1.1244],
        [-6.4394,  1.2060, -4.6036, -1.1475],
        [-6.2241,  0.6082, -4.3858, -1.6672],
        [-5.6103,  1.1342, -3.2909, -1.4144],
        [-4.3806, -2.2085,  0.7997, -1.2392],
        [-6.7076,  0.9351, -4.5958, -1.2680]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 194/289 [02:26<01:11,  1.32it/s]

Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1444813311100006, logits - tensor([[-4.9786, -1.7419,  1.2878, -1.6664],
        [-6.6954,  1.1083, -5.4874, -0.5818],
        [-4.9792, -2.9971,  1.8612, -1.7802],
        [-6.2574,  0.4375, -4.8242, -1.0073],
        [-6.2396,  1.6510, -4.1420, -0.6354],
        [-5.5004, -1.3611, -4.0896,  0.7455],
        [-6.2139,  1.3013, -4.4049, -1.2675],
        [-7.7137,  1.7454, -5.3245, -1.4867]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 195/289 [02:27<01:10,  1.33it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13102298974990845, logits - tensor([[-6.2272,  1.1268, -4.8595, -0.9171],
        [-4.5971, -1.8947,  1.6449, -1.5916],
        [-6.8918,  2.3166, -4.7147, -1.5362],
        [-6.9248,  1.1096, -5.2494, -1.9050],
        [-5.4835,  0.8842, -3.9640, -1.8957],
        [-6.1146,  1.2066, -4.0717, -0.9971],
        [-4.6230, -2.9114,  1.0531, -1.8421],
        [-7.0643,  0.7704, -5.0051, -0.9578]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:28<01:10,  1.33it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2901429235935211, logits - tensor([[-5.9168,  1.4236, -4.0157, -1.5156],
        [-5.6345,  0.4617, -4.6115, -0.9368],
        [-4.0978, -2.4899,  1.3384, -2.3749],
        [-6.1830,  2.1484, -4.5414, -1.6344],
        [-6.0467, -1.7065, -5.2684,  1.0821],
        [-4.2550,  0.9710, -3.1626, -1.6378],
        [-6.0944,  0.2886, -4.4792, -0.2433],
        [-5.5302,  1.7313, -4.2870, -0.1801]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 197/289 [02:28<01:09,  1.33it/s]

Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1155940443277359, logits - tensor([[-6.3146,  1.4726, -4.2520, -0.9332],
        [-6.8503,  1.2588, -4.8484, -1.0573],
        [-7.0291,  1.5430, -4.8828, -1.4001],
        [-5.6325,  1.6191, -4.5318, -1.7698],
        [-5.3867, -3.2731,  1.2149, -1.5639],
        [-4.4736, -2.2027,  0.9800, -2.3064],
        [-6.1082, -1.8717, -5.4622,  2.0817],
        [-5.9617,  1.7026, -4.3068, -0.9001]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▊   | 198/289 [02:29<01:08,  1.33it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21670030057430267, logits - tensor([[-5.9289,  0.9448, -4.6039, -2.0053],
        [-6.5465,  1.7224, -4.9230, -1.5235],
        [-5.8709,  0.9838, -3.8730, -0.6182],
        [-4.5503, -2.7683,  1.7454, -1.7216],
        [-3.8765, -2.4083,  0.9630, -1.5565],
        [-5.3634, -3.1243,  1.3498, -1.8657],
        [-6.0393,  1.3900, -4.7670, -1.1565],
        [-6.9858,  1.5732, -4.8507, -1.3504]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:30<01:07,  1.33it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19117572903633118, logits - tensor([[-7.0660,  1.0611, -5.2653, -2.0806],
        [-6.1410, -0.0180, -4.9746,  1.2099],
        [-5.6481, -1.2375, -4.3870,  1.4234],
        [-6.0668,  1.6848, -4.6671, -1.3791],
        [-5.7982,  1.3736, -4.5644, -0.1920],
        [-6.0800,  1.0993, -5.0516, -1.7527],
        [-6.8799,  2.1207, -5.6088, -1.3866],
        [-6.6567,  0.5030, -5.0046, -0.8813]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 200/289 [02:31<01:06,  1.33it/s]

Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2116309255361557, logits - tensor([[-6.4643,  0.8784, -4.5077, -1.0511],
        [-7.2629,  1.2025, -5.2746, -1.9811],
        [-5.7867, -1.7489, -4.1425,  1.3425],
        [-5.1844, -3.0331, -4.8956,  2.6210],
        [-6.3740,  0.8798, -5.2049, -1.3241],
        [-6.1088,  1.1997, -4.4538, -0.7571],
        [-5.8015,  1.1437, -4.8427, -1.7319],
        [-7.1355,  1.9812, -5.7840, -1.2748]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 201/289 [02:31<01:06,  1.33it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40301430225372314, logits - tensor([[-6.0257,  1.2969, -4.5106, -1.9573],
        [-4.0745, -3.0421,  1.2416, -1.7494],
        [-5.7881,  1.7195, -4.2803, -1.5877],
        [-4.5717, -2.7705,  0.9057, -1.2497],
        [-6.5586,  2.0583, -5.1672, -2.4257],
        [-4.8300, -3.3910,  1.7135, -1.4476],
        [-5.3604, -0.1243, -4.9994,  0.5050],
        [-4.4921, -2.7240,  1.1238, -1.6589]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:32<01:05,  1.32it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18746429681777954, logits - tensor([[-7.3321,  2.6112, -5.4218, -1.2404],
        [-6.0040, -2.4669, -4.7921,  2.3084],
        [-6.7344,  1.7085, -4.9363, -1.5574],
        [-6.1697,  1.5387, -4.7102, -2.2415],
        [-6.4616,  1.6392, -5.4625, -1.5000],
        [-5.2812, -3.4338,  1.9972, -2.2487],
        [-6.0628,  0.9260, -4.6038, -1.4620],
        [-6.4743,  1.5886, -4.3479, -1.8975]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:33<01:05,  1.32it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33389151096343994, logits - tensor([[-6.5911, -0.2127, -4.7747,  0.1478],
        [-6.9197,  1.8528, -4.5868, -2.3142],
        [-6.2618,  1.6241, -4.6458, -2.1091],
        [-5.4465,  0.9755, -4.0616, -1.5973],
        [-6.1901, -0.7000, -4.5070, -0.0532],
        [-6.8032,  1.9306, -5.1799, -1.5018],
        [-6.4674,  1.8747, -5.4042, -1.2692],
        [-6.6307,  2.0755, -4.8624, -1.0604]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:34<01:04,  1.32it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2894715368747711, logits - tensor([[-6.4351,  1.2498, -4.7325, -1.8742],
        [-5.5340,  1.8569, -4.3659, -1.7818],
        [-6.4152,  1.1698, -4.8183, -2.3117],
        [-6.2026, -1.9970, -4.1555,  1.8992],
        [-6.0079,  2.0217, -5.0402, -2.0937],
        [-6.8758,  1.6088, -5.2480, -2.0730],
        [-4.9652, -3.0674,  1.8517, -2.5598],
        [-7.2140,  1.5716, -5.6406, -2.0418]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:34<01:03,  1.31it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3587673604488373, logits - tensor([[-6.7371,  2.1124, -5.2332, -2.4856],
        [-6.1042,  0.9658, -5.2510, -0.4634],
        [-5.0367,  1.2972, -4.2018, -1.4293],
        [-4.9747, -1.1162, -4.4182,  1.3117],
        [-6.3438,  1.0840, -5.6035, -2.8840],
        [-6.5077,  1.5156, -4.9709, -1.0714],
        [-6.5828, -0.9709, -5.5107,  0.0527],
        [-6.6768,  2.1164, -4.9813, -2.8593]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:35<01:03,  1.31it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1729697585105896, logits - tensor([[-5.9842,  2.2393, -6.0709, -1.7125],
        [-5.3081,  1.6469, -5.0733, -1.6422],
        [-6.3047,  2.2481, -5.3478, -2.2657],
        [-5.2645,  2.1074, -4.3375, -2.3018],
        [-5.1125,  2.0834, -4.7825, -2.7423],
        [-5.9661, -2.5126, -4.0186,  1.5500],
        [-4.4083, -2.3754,  1.5620, -1.9967],
        [-5.4315,  2.1647, -3.8636, -1.9082]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:36<01:02,  1.31it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1601862609386444, logits - tensor([[-4.4262, -2.7087,  1.3844, -1.4175],
        [-4.9690, -2.5800,  1.7083, -2.4006],
        [-6.0514,  1.3414, -5.3109, -1.6539],
        [-7.1122,  0.5517, -5.2576, -0.2226],
        [-6.0379,  1.3542, -5.2560, -2.3855],
        [-7.1755,  2.0369, -5.0948, -1.5193],
        [-7.3061,  0.3741, -5.1101, -0.5622],
        [-4.6230, -2.6687,  1.5673, -1.9123]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:37<01:01,  1.31it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20188234746456146, logits - tensor([[-5.6448,  1.2452, -5.2188, -1.7296],
        [-6.3731,  2.1996, -4.0066, -1.9869],
        [-4.5062, -2.5765,  1.9123, -1.7518],
        [-5.1201, -2.3355, -4.3870,  1.7482],
        [-7.0917,  2.3146, -6.3547, -1.9228],
        [-6.7162, -2.0915, -5.8420,  2.3234],
        [-5.6769,  2.1307, -4.8568, -2.0360],
        [-5.7043,  1.9316, -4.2510, -2.2960]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:38<01:00,  1.32it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4633018374443054, logits - tensor([[-7.3640,  1.8003, -5.4646, -1.8508],
        [-5.6429,  1.1807, -4.5187, -1.7306],
        [-6.7065,  1.4118, -5.1826, -1.6690],
        [-6.1936,  2.0632, -4.4634, -1.6473],
        [-6.0889,  1.0262, -4.9152, -1.0134],
        [-5.9147,  1.8504, -4.8617, -1.3705],
        [-6.9439,  2.6191, -6.2155, -1.7221],
        [-6.3864,  1.4953, -4.4161, -1.7128]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 210/289 [02:38<00:59,  1.32it/s]

Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2279750555753708, logits - tensor([[-6.4234,  2.2785, -5.1811, -2.4675],
        [-6.9313, -1.4114, -5.7529,  0.9396],
        [-7.1352,  1.5406, -5.2765, -1.3002],
        [-6.1628,  2.0244, -4.8333, -2.4885],
        [-5.0273,  1.6224, -4.3289, -1.1745],
        [-5.8238,  1.9638, -5.0196, -2.0476],
        [-5.6566,  1.8608, -4.6681, -1.4267],
        [-5.4551, -1.7178, -5.0204,  2.1578]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 211/289 [02:39<00:58,  1.32it/s]

Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2384517937898636, logits - tensor([[-6.1815, -1.6530, -4.2471,  0.7530],
        [-5.9945,  2.0162, -5.9091, -1.8807],
        [-4.4336, -2.9718,  1.4111, -2.2503],
        [-6.5540,  1.7066, -5.8606, -2.0846],
        [-6.3286, -0.8128, -4.1535, -0.2700],
        [-6.9217,  1.6877, -4.7127, -1.7402],
        [-6.8217,  0.9836, -5.2772, -1.4898],
        [-6.4349,  2.1898, -5.3596, -2.0707]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 212/289 [02:40<00:58,  1.33it/s]

Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20163074135780334, logits - tensor([[-6.3227,  2.8578, -5.3392, -2.3906],
        [-5.0349, -2.3147, -4.8526,  1.6868],
        [-6.8073,  1.6755, -4.9674, -2.2887],
        [-6.0721,  1.6901, -6.3785, -1.6978],
        [-5.9212,  1.8170, -4.8277, -1.7746],
        [-6.6305,  1.1450, -4.5792, -1.6235],
        [-6.8003,  1.8490, -4.8998, -2.5477],
        [-5.7416,  1.2080, -5.4707, -1.7689]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▎  | 213/289 [02:41<00:57,  1.33it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08715254813432693, logits - tensor([[-5.8349,  1.7325, -5.2339, -1.6676],
        [-7.1453,  2.0433, -5.8925, -1.8907],
        [-5.6576,  1.5741, -4.2475, -2.2059],
        [-6.1818,  2.5474, -4.9391, -1.7885],
        [-6.6816,  1.4731, -5.3131, -1.6228],
        [-5.3925,  1.5494, -4.6185, -1.7497],
        [-5.3935, -3.3449,  1.7362, -1.6669],
        [-6.1305,  1.3177, -4.2982, -1.2950]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 214/289 [02:41<00:56,  1.33it/s]

Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09135367721319199, logits - tensor([[-6.5578,  1.5027, -5.2851, -2.3959],
        [-6.8407,  0.6946, -5.4999, -0.8842],
        [-5.8434,  1.7968, -4.2606, -1.3274],
        [-5.8814,  2.6717, -4.8188, -2.0254],
        [-5.0935, -3.1914,  1.6762, -1.8970],
        [-4.4386, -2.8217,  1.7952, -1.9380],
        [-6.6188,  1.3902, -5.4773, -1.6657],
        [-5.9427, -2.9948, -4.8581,  2.3615]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 215/289 [02:42<00:55,  1.33it/s]

Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12054362893104553, logits - tensor([[-6.4079,  1.8318, -5.1592, -1.1748],
        [-6.2582,  1.7976, -4.6642, -1.8829],
        [-5.0220, -0.1911, -4.3017,  0.5042],
        [-6.5553,  1.9704, -4.5614, -1.7462],
        [-5.0854, -3.0924,  1.7642, -2.4019],
        [-6.8059,  0.9840, -5.5643, -1.0770],
        [-5.5888,  1.6030, -4.7115, -1.4485],
        [-5.9365,  1.3220, -4.8116, -1.7308]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▍  | 216/289 [02:43<00:54,  1.33it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43597865104675293, logits - tensor([[-5.9978,  2.1699, -5.8999, -1.4801],
        [-6.3776,  2.0030, -5.2663, -1.4356],
        [-6.2362,  1.5763, -4.1624, -1.5993],
        [-4.9359, -2.7662,  1.9018, -1.9808],
        [-6.7722,  2.1311, -5.7068, -1.4541],
        [-5.3566, -3.4427,  2.2646, -2.0959],
        [-6.1310,  2.1249, -5.5264, -2.3058],
        [-6.2889,  2.1632, -5.6148, -1.5088]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 217/289 [02:44<00:54,  1.33it/s]

Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2109672874212265, logits - tensor([[-6.3422,  1.4944, -5.3339, -2.1288],
        [-6.3943,  1.9155, -5.2285, -1.6422],
        [-6.2415,  2.0849, -5.7394, -1.4566],
        [-4.1080, -2.9796,  2.1251, -1.7561],
        [-6.5423,  2.6578, -5.2150, -2.2129],
        [-6.7265,  0.7099, -5.2596, -1.0965],
        [-6.7816,  1.9139, -5.1969, -1.9458],
        [-4.7009, -2.5817,  1.8106, -1.5382]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 218/289 [02:44<00:53,  1.33it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29879170656204224, logits - tensor([[-4.3520, -2.6429,  1.8490, -2.2947],
        [-5.9131,  2.1175, -5.5224, -1.8151],
        [-5.8163,  1.8697, -4.8059, -1.9912],
        [-5.8908,  1.6014, -5.0047, -1.6886],
        [-6.6782,  2.3058, -4.9320, -2.0524],
        [-5.6484, -2.5099, -4.1222,  2.2660],
        [-6.3962,  1.2057, -5.1745, -1.5715],
        [-6.0641,  1.4227, -5.5430, -2.4224]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:45<00:52,  1.33it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09512027353048325, logits - tensor([[-7.3061,  1.8966, -5.5997, -1.6312],
        [-6.4054,  1.6731, -5.5340, -1.1066],
        [-6.9619,  1.4411, -5.4605, -1.0781],
        [-6.3127,  2.7041, -5.8374, -2.0045],
        [-4.4579, -2.3994,  2.1096, -2.5202],
        [-6.9321,  1.3829, -6.2122, -1.5412],
        [-6.1140,  2.0744, -5.3365, -1.6516],
        [-6.1836,  1.3890, -5.1850, -1.0152]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:46<00:51,  1.33it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2551324665546417, logits - tensor([[-7.1134,  1.9012, -6.1751, -1.7990],
        [-5.7332,  1.8473, -4.9815, -0.6033],
        [-5.8384,  0.3542, -5.1030, -0.4076],
        [-6.2805,  1.0287, -4.7918, -1.8080],
        [-6.4092,  0.8636, -5.3910, -1.3105],
        [-6.5507,  1.4968, -5.3048, -2.1710],
        [-6.4133,  1.8023, -4.9680, -1.0926],
        [-6.1061,  1.7572, -5.3082, -1.5093]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:47<00:51,  1.33it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08031590282917023, logits - tensor([[-5.3191, -3.3570,  2.7117, -2.4898],
        [-7.4108,  1.9736, -6.5373, -2.1473],
        [-6.7409,  1.8980, -6.0032, -1.3121],
        [-5.9248, -3.6930,  2.4016, -2.7393],
        [-6.8215,  1.3798, -5.1123, -1.4263],
        [-4.7114, -3.7200,  2.2491, -2.2206],
        [-5.0515,  1.6386, -4.3593, -1.2569],
        [-6.3249,  1.3617, -5.5707, -1.3905]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:47<00:50,  1.33it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30900809168815613, logits - tensor([[-5.3253,  1.0122, -4.5731, -1.5504],
        [-5.7155,  2.3096, -5.2187, -1.6468],
        [-5.7747,  1.3758, -5.2740, -0.8800],
        [-6.3175,  2.1047, -5.5183, -0.8714],
        [-6.4649,  2.2327, -6.4183, -2.0945],
        [-6.1473,  2.0830, -5.9593, -1.9410],
        [-5.6946,  2.5062, -5.8089, -1.3004],
        [-6.8357,  2.5179, -6.2392, -1.7742]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:48<00:49,  1.33it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26456406712532043, logits - tensor([[-6.3403,  1.8022, -5.6503, -1.4343],
        [-6.3794,  0.9084, -5.1798, -1.7751],
        [-6.4104,  2.2813, -5.3261, -1.3488],
        [-6.8645,  1.6951, -5.9881, -1.5127],
        [-6.6973,  1.3577, -4.3304, -1.6041],
        [-5.6443,  1.7456, -5.3515, -1.4166],
        [-5.9638,  1.3171, -5.2362, -0.9833],
        [-4.7311, -3.2063,  2.2004, -2.3103]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:49<00:48,  1.33it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30846571922302246, logits - tensor([[-7.0177,  1.6988, -6.0970, -1.3510],
        [-5.9824,  1.9427, -5.2015, -1.9572],
        [-6.0556,  1.1812, -5.2481, -1.5373],
        [-6.6686, -1.0401, -4.5976,  0.7051],
        [-6.5506,  2.9995, -6.1694, -1.7744],
        [-4.6274, -3.1991,  2.0398, -2.3620],
        [-5.3545, -2.7342,  1.8511, -1.9097],
        [-6.1267,  1.5821, -5.3648, -1.6048]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:50<00:48,  1.32it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2652326822280884, logits - tensor([[-6.1675,  1.2957, -5.4052, -0.8787],
        [-7.0063,  1.8467, -5.1307, -2.0893],
        [-5.0527, -3.2900,  2.1592, -2.8551],
        [-5.6764,  1.7693, -5.3201, -2.1239],
        [-6.6191,  1.8151, -5.8310, -2.0874],
        [-5.9124,  1.8556, -5.9847, -1.9807],
        [-6.7887,  1.8763, -5.8383, -1.8044],
        [-5.1748, -3.4297,  2.3369, -2.4006]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:50<00:47,  1.33it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17897579073905945, logits - tensor([[-6.7770,  1.4049, -6.3745, -1.1858],
        [-6.2533,  1.0550, -5.8556, -0.8071],
        [-7.0232,  1.6682, -6.1665, -1.7822],
        [-5.4579,  1.7785, -4.7829, -1.4855],
        [-5.4424,  1.1687, -5.1433, -1.7477],
        [-5.3582, -1.9042, -4.2443,  1.9934],
        [-6.5760,  2.3785, -4.9132, -1.0840],
        [-5.5114,  1.0709, -4.6604, -1.0733]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:51<00:46,  1.32it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15493299067020416, logits - tensor([[-7.0005,  2.0321, -5.4947, -1.9072],
        [-6.3106,  1.5643, -5.1646, -2.0623],
        [-6.5727,  1.6904, -5.2152, -1.7266],
        [-5.5855,  1.6890, -5.1435, -1.7005],
        [-6.3730, -1.9686, -4.8952,  1.4809],
        [-5.6663,  1.1869, -5.0236, -0.9369],
        [-5.6421, -3.0369, -4.9074,  2.7577],
        [-5.6116,  0.9755, -5.5020, -1.7186]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:52<00:46,  1.32it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41731011867523193, logits - tensor([[-4.5432, -3.1079, -4.6332,  2.4363],
        [-5.2689,  1.1048, -4.8341, -2.2983],
        [-5.7026,  2.1301, -4.6595, -2.1299],
        [-6.6109,  1.9688, -5.6523, -1.7418],
        [-6.4635,  1.4518, -5.8556, -1.0585],
        [-6.3991,  1.3312, -4.7086, -0.9952],
        [-6.6240,  1.1648, -4.8366, -0.7066],
        [-7.4747,  1.3550, -5.9047, -1.6449]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:53<00:45,  1.32it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5055209994316101, logits - tensor([[-4.7255, -3.4618,  1.3772, -2.1167],
        [-7.5716,  1.6920, -5.7842, -1.4164],
        [-6.4055,  1.7768, -5.6351, -1.0931],
        [-6.1541,  1.6520, -5.3686, -1.4427],
        [-6.9084,  0.8387, -5.4222, -1.1876],
        [-6.9265,  1.4977, -5.9344, -1.5862],
        [-5.8267,  1.6530, -5.3325, -0.7807],
        [-6.5236,  1.8111, -5.2990, -1.0172]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:53<00:44,  1.32it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18232309818267822, logits - tensor([[-6.6035,  1.6017, -5.6475, -1.6009],
        [-5.1743, -3.0119,  2.6552, -2.3579],
        [-4.7687, -3.2187,  1.2268, -2.4883],
        [-6.1822,  1.9531, -5.3649, -2.4977],
        [-6.1823,  1.5654, -5.5879, -1.0297],
        [-4.5911, -2.6614, -5.0631,  2.9855],
        [-6.7806,  0.8438, -5.3185, -1.2998],
        [-5.4683, -3.5163,  2.0128, -3.0441]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:54<00:43,  1.33it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34394025802612305, logits - tensor([[-3.9640, -3.1774,  2.1809, -2.5194],
        [-5.8556,  1.6406, -5.9873, -0.7871],
        [-4.3807, -3.1275,  1.0770, -2.0958],
        [-6.1893, -1.1097, -4.9322,  1.6733],
        [-5.8128,  1.5379, -5.2409, -1.7803],
        [-6.2538,  1.5715, -5.5161, -1.7478],
        [-5.1915, -3.8282,  2.1957, -2.9411],
        [-6.0819,  1.7030, -5.7367, -1.6029]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:55<00:42,  1.33it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27206605672836304, logits - tensor([[-6.0804,  1.0823, -4.7501, -1.4851],
        [-7.3640,  0.4729, -6.2122, -0.7771],
        [-5.9114,  0.5221, -5.8646, -1.8897],
        [-6.0297,  0.9384, -6.0358, -1.2086],
        [-5.0219, -3.4371,  1.9339, -2.3460],
        [-6.0915,  0.9051, -5.3080, -1.5763],
        [-5.6785, -3.6965,  2.4971, -3.0117],
        [-6.1769,  1.7293, -4.9601, -0.8024]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:56<00:42,  1.33it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1864960789680481, logits - tensor([[-5.1143, -3.7249,  1.8272, -1.9218],
        [-7.0032,  0.7288, -6.1985, -0.6534],
        [-5.3425, -2.7188, -3.5369,  2.9559],
        [-6.4881,  1.1422, -5.3246, -1.7270],
        [-5.7614,  1.0546, -4.5946, -1.1734],
        [-6.5239,  1.2228, -5.1302, -1.2828],
        [-5.5377, -1.4877, -3.8911,  2.4727],
        [-5.8432,  0.7841, -5.3613, -1.3543]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:56<00:41,  1.33it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2784276306629181, logits - tensor([[-6.0957,  0.6138, -5.1716, -1.5732],
        [-5.7083,  0.9611, -4.2779, -1.4477],
        [-5.4664, -3.1355, -4.0451,  2.6563],
        [-6.5470,  1.8495, -5.8364, -0.9730],
        [-6.2891,  1.2911, -5.0606, -1.2048],
        [-4.9443, -0.3661, -4.9427,  0.8246],
        [-4.2373, -2.8979,  1.9972, -2.2767],
        [-6.8905,  1.4892, -6.0938, -1.0011]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:57<00:40,  1.33it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23007671535015106, logits - tensor([[-5.5930,  0.8478, -5.2022, -1.3416],
        [-6.9752,  1.2879, -6.1554, -0.8877],
        [-6.1795,  0.7045, -5.3003, -0.5598],
        [-6.1129, -0.0252, -5.1217, -0.4545],
        [-6.8365,  1.5142, -5.9659, -0.7838],
        [-6.4968,  0.6648, -5.0878, -1.3122],
        [-6.4598,  0.7589, -4.7311, -0.6148],
        [-4.1298, -3.3847,  2.3770, -2.2565]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 236/289 [02:58<00:39,  1.33it/s]

Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35397160053253174, logits - tensor([[-6.6777,  0.1586, -6.3317, -1.3117],
        [-6.0864,  1.1060, -5.1997, -1.7184],
        [-6.1477, -3.0260, -4.6325,  2.7195],
        [-6.3598,  0.2785, -5.4212, -1.0200],
        [-5.6849, -0.3329, -5.1711, -0.0384],
        [-6.5419,  0.3122, -4.9420, -0.4956],
        [-5.9691,  0.9965, -4.5967, -0.9757],
        [-5.2878, -3.8022,  2.0321, -2.5722]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 237/289 [02:59<00:39,  1.33it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33206045627593994, logits - tensor([[-6.8331,  0.3572, -5.0389, -0.6396],
        [-6.3388,  0.4294, -5.0050, -1.0933],
        [-6.1859,  1.1364, -5.2536, -0.5431],
        [-5.7737,  0.2663, -5.1461, -1.4301],
        [-5.6408,  1.0808, -5.7558, -0.5203],
        [-6.3244,  0.1290, -4.9378, -0.5477],
        [-7.5028,  1.2890, -5.5532, -1.1985],
        [-6.2153, -2.6524, -3.9294,  3.0324]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 238/289 [02:59<00:38,  1.33it/s]

Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16619038581848145, logits - tensor([[-4.8855, -3.1417,  2.4588, -2.2273],
        [-6.6463,  0.7617, -5.4528, -0.4036],
        [-5.8468,  0.9649, -5.3051, -0.4930],
        [-5.8171, -2.5576, -5.6681,  1.6443],
        [-5.5174,  0.8058, -4.7741, -0.3258],
        [-6.1458,  1.5203, -5.8122, -1.4313],
        [-6.7458,  1.0226, -6.1896, -0.5204],
        [-6.4376,  0.6171, -5.5036, -0.2818]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 239/289 [03:00<00:37,  1.33it/s]

Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15723171830177307, logits - tensor([[-6.0806,  1.5563, -4.6376, -0.5455],
        [-6.8942,  0.7435, -5.9702, -0.8830],
        [-5.7564,  1.2095, -4.9146, -1.1920],
        [-4.0133, -2.5138, -3.6713,  2.7247],
        [-6.2864, -2.1165, -5.0203,  2.5133],
        [-6.6445, -0.5197, -5.1564, -0.2644],
        [-5.8976,  0.3858, -5.2435, -0.7034],
        [-5.4606, -3.3828,  2.1770, -2.5638]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 240/289 [03:01<00:36,  1.33it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18417301774024963, logits - tensor([[-7.2413,  0.0729, -6.2436, -0.2820],
        [-7.2013,  0.5820, -5.8168, -0.1414],
        [-6.4505,  1.1159, -5.5493, -1.0888],
        [-6.6052, -2.9522, -4.3896,  2.7423],
        [-6.6361,  0.9869, -5.0548, -0.4471],
        [-5.3501, -3.0214, -4.3648,  2.2988],
        [-5.2949, -3.4711,  2.4513, -2.0578],
        [-5.7867,  0.4584, -4.8398, -0.5635]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:02<00:36,  1.33it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24840199947357178, logits - tensor([[-6.6489e+00,  1.2173e+00, -5.9833e+00, -6.6485e-01],
        [-6.7004e+00, -3.8494e+00,  1.9930e+00, -1.8024e+00],
        [-5.0341e+00, -3.3458e+00,  2.2916e+00, -2.7599e+00],
        [-6.3669e+00,  5.0281e-01, -5.7013e+00, -3.2194e-03],
        [-5.0434e+00, -2.8926e+00, -5.3264e+00,  3.2578e+00],
        [-5.4103e+00, -3.0171e+00,  1.6864e+00, -2.3866e+00],
        [-6.2206e+00, -1.6690e+00, -4.1877e+00,  2.5555e+00],
        [-5.1829e+00,  1.6589e+00, -5.1174e+00, -8.3473e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:02<00:35,  1.33it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.367692768573761, logits - tensor([[-7.0759,  0.6471, -5.4847, -0.7051],
        [-5.5341,  0.5701, -4.6725, -0.4699],
        [-5.6797,  0.6800, -5.1545, -0.2951],
        [-5.3285,  1.5724, -4.8838, -0.9889],
        [-5.1780, -3.6399,  1.3936, -2.1436],
        [-7.0584,  0.9159, -4.9134, -0.5720],
        [-5.0145,  0.7994, -5.4415, -0.7622],
        [-5.7684,  0.7983, -5.1367, -1.5749]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:03<00:34,  1.32it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1688021570444107, logits - tensor([[-6.2145,  0.7072, -4.9143, -1.0729],
        [-5.7079,  1.2997, -4.8418, -0.3145],
        [-5.7011,  0.1075, -5.0226, -0.8585],
        [-6.3103, -3.7255,  1.9947, -1.5212],
        [-6.1658,  0.8283, -5.5687, -1.6450],
        [-5.8185,  0.4838, -6.1098, -0.2850],
        [-4.9116, -3.2814,  2.3984, -1.8594],
        [-6.0408,  0.7824, -4.6729, -1.1018]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:04<00:33,  1.32it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3292774558067322, logits - tensor([[-5.0608,  0.1310, -4.4241, -0.8742],
        [-6.1018, -1.6756, -4.8675,  2.3788],
        [-6.1014,  0.5845, -5.2051, -1.3114],
        [-6.5385, -2.8128, -4.1312,  2.4286],
        [-6.2620,  0.9238, -5.5232, -0.8619],
        [-5.7566, -2.2768, -4.4725,  1.9608],
        [-5.5843,  1.3554, -5.0073, -1.0691],
        [-5.7784,  0.8492, -4.8535, -0.5686]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:05<00:33,  1.33it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2519450783729553, logits - tensor([[-6.0183,  0.5078, -4.7591, -1.6624],
        [-6.7675,  0.3650, -6.0062, -0.8790],
        [-5.7784,  1.2870, -4.8111, -0.4998],
        [-5.1579, -4.0383, -4.2869,  3.4235],
        [-5.8883, -3.6954,  2.0839, -2.3622],
        [-6.2248,  1.1514, -5.5994, -1.1459],
        [-6.2079,  1.6169, -5.8300, -0.6909],
        [-6.5203,  0.3443, -5.2564, -1.2465]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:05<00:32,  1.32it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2860146760940552, logits - tensor([[-5.3963, -3.1769, -4.9711,  2.4373],
        [-5.7354,  0.8224, -4.7167, -0.5651],
        [-5.9526,  0.7569, -4.9772, -1.5556],
        [-6.4033,  0.7473, -5.3307, -1.2423],
        [-5.7306,  1.3537, -5.6758, -0.3256],
        [-5.5156,  1.3788, -5.5316, -0.8339],
        [-5.5909,  1.0414, -5.7100, -0.8986],
        [-6.1002,  0.6455, -5.4419, -1.2509]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:06<00:31,  1.32it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35777074098587036, logits - tensor([[-6.6578,  0.4417, -5.9356,  0.0414],
        [-5.9807,  0.8650, -5.9957, -0.7079],
        [-5.1388, -2.2312, -4.5063,  3.2647],
        [-4.9636, -2.7538,  1.6380, -2.7514],
        [-5.1209,  0.6389, -5.1568, -1.6050],
        [-6.9186,  1.2459, -5.5526, -0.6295],
        [-6.5594,  0.2631, -5.9418, -0.5873],
        [-5.2136,  0.2811, -5.0434, -0.4883]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:07<00:31,  1.32it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5747309923171997, logits - tensor([[-5.7279,  1.4537, -5.5203, -1.4764],
        [-5.9798,  0.3896, -4.3072, -1.3654],
        [-6.1066,  1.2905, -5.8761,  0.0868],
        [-6.3598,  0.7227, -5.8511, -1.3889],
        [-7.4074,  1.3509, -6.5951, -0.7266],
        [-5.1555, -2.8363,  2.9354, -3.1914],
        [-5.5617, -2.7651, -4.1850,  3.0301],
        [-6.0069, -3.0514, -4.2779,  2.7617]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:08<00:30,  1.32it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2436646968126297, logits - tensor([[-6.9732,  0.6622, -5.5005, -0.8652],
        [-5.3616, -3.4813,  2.1848, -2.7522],
        [-5.5624,  1.6817, -4.7079, -0.3581],
        [-4.8049, -2.0385, -4.4070,  2.9144],
        [-5.5316,  1.0730, -5.2025, -1.2331],
        [-5.5384,  0.7180, -4.9045, -0.9265],
        [-5.7436,  1.1194, -5.6189, -1.3492],
        [-5.6884,  0.8207, -5.6954, -1.8040]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:08<00:29,  1.32it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12961415946483612, logits - tensor([[-5.5882,  1.3014, -5.7505, -0.7968],
        [-5.9989,  1.0729, -5.1157, -0.8278],
        [-5.1481,  1.2316, -4.1870, -1.2717],
        [-6.1106,  0.5244, -5.7100, -0.7487],
        [-5.3511, -2.5617,  1.2520, -2.1074],
        [-4.8822, -3.1604, -4.4520,  3.3459],
        [-6.5142, -2.3752, -4.1444,  2.4158],
        [-6.0182,  0.6926, -5.1323, -0.9224]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:09<00:28,  1.33it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14144563674926758, logits - tensor([[-6.1384,  0.5106, -5.4930, -1.4539],
        [-5.0197, -1.7642, -4.0022,  2.3246],
        [-5.7773,  1.0236, -4.7819, -1.0708],
        [-6.6296,  0.9398, -4.7126, -0.9213],
        [-6.8913,  0.9683, -5.4784, -0.5838],
        [-6.1223,  0.4378, -4.9285, -1.1887],
        [-6.3176,  1.4719, -5.5158, -1.4065],
        [-4.8903, -2.8722,  2.5129, -1.7524]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:10<00:27,  1.32it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3648662567138672, logits - tensor([[-6.9392,  1.1290, -4.7415, -1.0969],
        [-5.5354,  0.9668, -4.9263, -1.4149],
        [-5.9916,  0.8481, -6.0679, -0.1095],
        [-5.5891,  1.3976, -4.7381, -1.0614],
        [-5.5264,  0.9378, -5.0008, -0.8435],
        [-5.4719, -3.2745,  2.0462, -2.7512],
        [-6.8878,  1.0879, -5.9113, -1.3460],
        [-6.7897,  1.6741, -6.0493, -1.4443]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 253/289 [03:11<00:27,  1.33it/s]

Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40738123655319214, logits - tensor([[-5.6140,  1.1183, -4.8237, -1.3568],
        [-6.2996,  0.7178, -5.1870, -0.1980],
        [-6.2673,  2.1534, -4.8293, -1.1380],
        [-4.9731, -2.6815,  2.2236, -2.2666],
        [-7.0657, -0.3479, -4.8352,  1.3026],
        [-5.4913,  1.3408, -5.2113, -1.6433],
        [-7.0402,  0.7691, -5.6603, -0.3134],
        [-5.7973,  1.5020, -5.0944, -1.7663]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 254/289 [03:11<00:26,  1.32it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1835654079914093, logits - tensor([[-6.1919,  1.4194, -5.1383, -1.9108],
        [-5.6761,  0.7160, -4.0104, -1.5108],
        [-5.9785,  0.5709, -4.8348, -0.8216],
        [-6.2254,  1.5594, -5.4287, -1.0406],
        [-5.7744, -2.7401,  2.0330, -2.3119],
        [-4.4286,  0.0761, -4.5949, -1.3400],
        [-6.0818,  0.5424, -5.1463, -1.0976],
        [-5.2050, -3.2677,  2.7403, -2.2408]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:12<00:25,  1.33it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17299966514110565, logits - tensor([[-5.3911,  1.6666, -5.2093, -2.0891],
        [-7.0871,  0.6033, -5.0475, -2.0170],
        [-7.1272,  1.7899, -5.8408, -1.8342],
        [-7.2952,  1.7933, -5.8859, -1.3550],
        [-6.3108,  1.1462, -5.3306, -2.2173],
        [-5.9609, -3.0585,  1.9568, -1.9284],
        [-5.0166,  1.1100, -5.2984, -1.7679],
        [-4.8071, -2.1432, -3.6368,  2.9128]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:13<00:24,  1.33it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0923096090555191, logits - tensor([[-5.7308,  2.0315, -5.2922, -1.0739],
        [-4.3909, -2.6076,  1.6515, -1.6818],
        [-5.5408, -2.7965, -4.8492,  2.0048],
        [-5.5616,  1.8110, -5.8543, -1.1806],
        [-4.7950, -2.1763,  2.2345, -2.3365],
        [-5.3952, -3.0908,  1.8003, -2.5513],
        [-6.1006, -2.5991, -4.5093,  2.1084],
        [-5.6987,  0.7614, -4.9071, -1.0671]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:14<00:24,  1.33it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11132398247718811, logits - tensor([[-6.1276, -2.9576,  1.7586, -2.9490],
        [-4.9479, -3.0686,  0.8563, -2.1005],
        [-5.6884,  1.4606, -4.9121, -1.8037],
        [-6.2823,  0.3637, -4.7497, -0.4204],
        [-5.2880, -2.9376,  1.1922, -2.8687],
        [-7.0205,  2.2313, -6.0143, -1.7888],
        [-5.8234, -3.1013,  1.2228, -2.4473],
        [-6.3493, -3.1905,  1.7532, -1.9273]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 258/289 [03:14<00:23,  1.33it/s]

Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3412267565727234, logits - tensor([[-5.0665, -2.4567,  1.0244, -1.9551],
        [-4.8827, -3.5934,  2.1304, -2.8659],
        [-5.7127, -1.2750, -5.3665,  1.8802],
        [-5.3000, -3.2566,  1.8864, -2.7962],
        [-6.4708,  1.4959, -5.5141, -2.0409],
        [-5.5364,  1.0445, -5.6714, -1.9178],
        [-4.8381,  1.8477, -4.5907, -2.0646],
        [-6.2341, -1.6770, -4.9942,  1.4816]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 259/289 [03:15<00:22,  1.33it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.235141322016716, logits - tensor([[-6.1365, -1.5488, -4.5601,  1.7184],
        [-7.0031,  1.4747, -5.6755, -1.0859],
        [-5.6288,  0.9905, -5.5218, -1.7906],
        [-6.3433,  1.5316, -4.5100, -2.8961],
        [-6.2766,  1.5322, -5.7624, -1.9001],
        [-6.0957, -2.4128, -4.9513,  2.5294],
        [-5.5956,  1.3968, -4.4467, -1.7018],
        [-6.7866,  1.3644, -5.2274, -0.9190]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 260/289 [03:16<00:21,  1.33it/s]

Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10463167726993561, logits - tensor([[-5.6812, -2.3114,  1.5433, -2.0754],
        [-6.3917,  1.9085, -5.2023, -1.0197],
        [-5.2560,  0.4030, -3.7567, -0.9905],
        [-6.0808, -2.8522,  1.2515, -2.2688],
        [-5.7137,  1.5276, -5.1851, -1.1180],
        [-5.5126, -2.7508, -4.7305,  2.6424],
        [-6.2791, -3.3369,  1.3663, -2.0618],
        [-6.3275,  2.1758, -5.8492, -2.3281]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|█████████ | 261/289 [03:17<00:21,  1.33it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30188751220703125, logits - tensor([[-7.1900,  2.3499, -6.0235, -1.6143],
        [-5.7264, -2.8997,  1.3821, -2.3700],
        [-6.8410,  1.9838, -5.3949, -1.9144],
        [-5.9244,  1.8001, -5.3447, -1.6588],
        [-5.6997,  2.4712, -5.4814, -1.8122],
        [-6.3312,  1.8718, -5.8026, -1.6221],
        [-4.5923,  1.4194, -4.2798, -0.7630],
        [-4.9685, -2.8024,  1.8027, -3.0677]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:17<00:20,  1.33it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09036552160978317, logits - tensor([[-6.0129,  1.8422, -5.6005, -1.8071],
        [-6.0751,  1.3692, -5.0545, -1.8935],
        [-6.7842,  2.5199, -5.2281, -1.4871],
        [-6.2470,  1.4906, -5.4016, -2.0192],
        [-4.8457,  0.6237, -4.0872, -1.3400],
        [-5.7062, -3.0438,  2.1714, -2.5533],
        [-5.6624,  1.4186, -5.1688, -2.1376],
        [-5.3731,  1.4308, -4.8138, -2.1940]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:18<00:19,  1.32it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 91%|█████████▏| 264/289 [03:19<00:18,  1.32it/s]

loss - 0.09442393481731415, logits - tensor([[-6.1543,  2.2147, -5.7865, -1.9023],
        [-6.1913,  1.5287, -5.4868, -1.6043],
        [-5.5995,  1.5887, -5.8157, -2.1772],
        [-5.9846,  1.7829, -4.8870, -1.7829],
        [-6.7543,  1.3626, -5.9106, -2.4844],
        [-7.2604,  1.2168, -5.8685, -2.0929],
        [-6.6421,  0.9465, -5.1177, -1.2546],
        [-6.3770,  1.0152, -5.8316, -1.6465]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2658930718898773, logits - tensor([[-6.0188,  1.9572, -6.0300, -1.6620],
        [-6.4992,  1.6812, -5.3522, -1.3034],
        [-5.3521, -2.8357,  1.9573, -2.1049],
        [-6.2808,  2.0642, -5.3470, -2.0077],
        [-6.4136,  2.4284, -6.0389, -1.3071],
        [-6.4561, -0.1644, -5.3064,  0.3765],
        [-4.3604, -2.0684,  2.1230, -2.5646],
        [-4.4951, -2.4422, -4.4560,  2.

 92%|█████████▏| 265/289 [03:20<00:18,  1.32it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 92%|█████████▏| 266/289 [03:20<00:17,  1.32it/s]

loss - 0.4280073046684265, logits - tensor([[-6.0508, -0.0849, -5.5947,  1.2528],
        [-6.2517,  1.4454, -4.7749, -1.9291],
        [-5.3730, -3.2625,  1.5076, -2.5683],
        [-6.6510,  2.3251, -6.1357, -2.5514],
        [-5.8001,  1.0623, -4.7506, -1.8795],
        [-6.6454,  1.9874, -5.7231, -1.8986],
        [-5.8511,  2.2132, -5.3215, -1.8621],
        [-6.2791,  1.8256, -5.4107, -2.0301]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31644102931022644, logits - tensor([[-6.0473,  1.9913, -4.8838, -1.6797],
        [-5.6910,  1.7332, -5.1931, -1.8698],
        [-6.1568,  2.0262, -6.0382, -1.8857],
        [-5.5154,  1.7464, -4.3758, -1.7237],
        [-5.7463,  1.2670, -4.9914, -1.7267],
        [-6.8824,  1.3833, -5.6085, -1.1762],
        [-5.7728,  1.7652, -5.8473, -1.2614],
        [-6.3307,  1.9052, -5.6431, -1.

 92%|█████████▏| 267/289 [03:21<00:16,  1.32it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3182808458805084, logits - tensor([[-5.2092,  1.8531, -5.3039, -1.1713],
        [-5.4824,  1.1834, -4.2250, -2.0474],
        [-6.1252,  2.1526, -4.9606, -1.9966],
        [-6.7163,  2.0313, -5.4860, -1.4822],
        [-6.1236,  0.9260, -5.1638, -1.2734],
        [-5.5359,  1.7991, -4.8523, -0.9179],
        [-5.4681, -3.0579,  1.8971, -2.5260],
        [-5.7570,  1.7494, -4.6825, -2.6272]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:22<00:15,  1.32it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30916735529899597, logits - tensor([[-5.9539,  2.2932, -6.3053, -1.1560],
        [-6.3587,  1.5989, -5.1840, -2.4840],
        [-5.2173, -2.7556,  1.8564, -2.3057],
        [-6.3214,  2.3899, -4.8493, -1.2416],
        [-5.5786,  2.1888, -5.0186, -1.6126],
        [-6.2749,  1.9442, -5.5480, -1.9491],
        [-5.8162,  2.5725, -6.0377, -1.5938],
        [-5.3559, -3.5491,  2.3242, -2.2962]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:23<00:15,  1.32it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07941029220819473, logits - tensor([[-5.7578,  2.2887, -5.0685, -1.8514],
        [-5.1036,  1.3393, -5.2244, -2.0096],
        [-5.0233,  2.3335, -4.6381, -2.1084],
        [-4.5914, -2.9464,  2.3666, -2.1582],
        [-5.8585,  1.7585, -5.1033, -1.5912],
        [-4.6311, -3.2557,  1.5714, -2.1372],
        [-5.5325,  1.9339, -5.2399, -1.4830],
        [-5.6724,  1.2861, -4.8715, -1.9683]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 270/289 [03:23<00:14,  1.32it/s]

Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19695298373699188, logits - tensor([[-6.5888, -1.4091, -5.1365,  0.5249],
        [-6.5257,  1.4107, -5.7852, -1.3438],
        [-7.0849,  0.3995, -5.7464, -0.2040],
        [-6.5431, -3.1261, -4.2698,  2.6691],
        [-6.5840,  1.6888, -6.7400, -2.2451],
        [-5.9265,  2.4198, -4.6984, -2.0692],
        [-6.8143, -0.4766, -4.6880,  0.0651],
        [-5.0759, -2.6805,  1.6362, -2.3512]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 271/289 [03:24<00:13,  1.32it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29632920026779175, logits - tensor([[-5.9566,  1.4155, -4.4773, -2.0255],
        [-5.0125,  1.2712, -4.4749, -2.1370],
        [-6.1700, -2.2369, -4.6991,  2.1780],
        [-5.9747,  2.3722, -4.8405, -2.0226],
        [-5.7050, -3.0861,  2.4206, -3.0834],
        [-5.6027,  1.6185, -5.3329, -2.0689],
        [-5.8151,  2.0625, -6.1805, -1.6888],
        [-5.9684,  1.5013, -4.6892, -2.0491]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 272/289 [03:25<00:12,  1.32it/s]

Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08306552469730377, logits - tensor([[-6.0999,  1.6986, -5.3115, -2.0396],
        [-5.6732,  1.6374, -4.7319, -1.3093],
        [-5.6725,  1.7212, -4.8142, -2.0460],
        [-5.9027,  1.1079, -4.5070, -1.9473],
        [-5.2472, -2.2218,  2.0447, -2.2206],
        [-4.8972, -2.0267, -4.4638,  2.3007],
        [-5.7184,  1.9471, -4.7498, -1.3252],
        [-5.9552,  1.9322, -5.2679, -2.1847]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 273/289 [03:26<00:12,  1.33it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.600460946559906, logits - tensor([[-5.3438, -3.0992,  2.4995, -3.0254],
        [-6.1723,  1.2305, -5.5302, -2.3608],
        [-5.8608,  2.0110, -5.2615, -1.3394],
        [-4.7234, -3.0682,  2.0737, -2.5793],
        [-6.2491,  1.8080, -5.4410, -1.7133],
        [-5.6266,  1.2111, -4.7100, -1.0326],
        [-5.5879,  1.8345, -5.4668, -1.6187],
        [-6.2728,  1.2375, -5.2221, -1.3337]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▍| 274/289 [03:27<00:11,  1.32it/s]

Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25423645973205566, logits - tensor([[-5.1833, -3.2482,  2.1618, -2.0354],
        [-5.9331,  1.4439, -4.6022, -1.1215],
        [-4.8616,  1.9834, -5.3619, -1.8483],
        [-5.8194,  1.9317, -5.1838, -1.9300],
        [-5.2878,  1.0235, -4.0530, -1.8639],
        [-5.4648, -3.1609,  1.8076, -3.1514],
        [-7.0285,  1.6908, -5.4434, -2.4289],
        [-5.9017,  1.6134, -5.4233, -1.6653]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▌| 275/289 [03:27<00:10,  1.33it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20088748633861542, logits - tensor([[-5.6407,  2.0733, -4.9963, -1.5996],
        [-5.1463, -2.9521,  2.2469, -2.0310],
        [-5.7908,  2.2096, -5.8666, -2.0539],
        [-4.9986,  1.8561, -4.6831, -1.4453],
        [-6.2081, -1.1309, -5.1503,  1.8055],
        [-6.5916,  1.4600, -5.6217, -1.2895],
        [-6.2323, -1.7794, -4.9242,  0.9468],
        [-5.4557,  1.9765, -5.5433, -1.5319]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:28<00:09,  1.33it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20423486828804016, logits - tensor([[-5.7085,  1.1014, -5.8278, -2.1228],
        [-5.0601,  1.2043, -3.6139, -1.2250],
        [-5.1209, -2.2688,  1.7717, -2.1142],
        [-6.0303,  2.3661, -3.9782, -1.7312],
        [-5.8529, -3.2189,  1.7983, -2.5268],
        [-5.8746,  1.5583, -5.2305, -1.3458],
        [-5.1716, -2.7287,  1.7182, -3.0352],
        [-6.2347,  0.1077, -5.2215, -0.1270]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:29<00:09,  1.33it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4210691452026367, logits - tensor([[-5.3297, -3.5908,  1.8176, -2.0227],
        [-5.5369,  1.7571, -4.9116, -1.3221],
        [-7.0885,  2.3837, -5.6494, -2.1766],
        [-6.7054,  2.1949, -6.3240, -1.5269],
        [-4.9678,  1.4907, -4.1250, -1.6428],
        [-5.9916,  1.5267, -5.4650, -1.4462],
        [-6.4316,  1.2068, -5.3280, -1.7943],
        [-5.4852,  1.4182, -4.9417, -1.2463]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:30<00:08,  1.33it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22180812060832977, logits - tensor([[-5.7354,  1.2649, -5.3753, -1.4142],
        [-5.8637,  2.3365, -5.0629, -2.2699],
        [-5.5525,  2.4567, -4.7780, -1.9941],
        [-5.7352,  1.0088, -5.2721, -1.9412],
        [-6.1924,  1.4650, -3.9260, -1.9421],
        [-6.3492,  2.0269, -5.6027, -2.3302],
        [-4.9348,  1.3234, -4.7036, -1.8214],
        [-5.1654,  1.7300, -4.6180, -1.9637]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 279/289 [03:30<00:07,  1.33it/s]

Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2067112773656845, logits - tensor([[-5.4750,  1.6200, -4.8937, -1.8322],
        [-5.9158,  1.8752, -4.5007, -1.1022],
        [-5.1311, -3.3118, -3.6073,  2.0752],
        [-6.2306,  1.7094, -5.5064, -0.8868],
        [-4.1681,  2.0617, -4.0465, -1.7915],
        [-5.6272, -2.1977, -4.2763,  2.2530],
        [-6.1880,  1.8382, -5.7590, -1.3630],
        [-5.6026,  1.7636, -5.1861, -1.5291]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 280/289 [03:31<00:06,  1.33it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2154301404953003, logits - tensor([[-4.3111,  1.7170, -5.0457, -1.6320],
        [-6.2422,  1.5837, -4.9396, -0.8495],
        [-4.9592, -2.3291,  1.6574, -2.4467],
        [-6.9251,  1.5942, -5.4841, -1.7731],
        [-5.9259,  1.1774, -5.2159, -1.4565],
        [-4.8321,  1.9779, -5.0069, -2.2712],
        [-5.3470,  1.5295, -5.0032, -2.0902],
        [-5.9160,  0.9733, -5.3723, -1.1544]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 281/289 [03:32<00:06,  1.33it/s]

Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2332666963338852, logits - tensor([[-5.9738,  1.2556, -5.6011, -1.4350],
        [-5.5323,  1.4023, -5.3273, -1.4474],
        [-5.2969, -2.3862,  0.6989, -2.1405],
        [-5.5698, -3.2582,  2.1055, -2.2849],
        [-6.5831, -1.2507, -5.1495,  0.6557],
        [-6.6213, -0.7599, -5.0103, -0.2705],
        [-6.5819,  0.9190, -5.5894, -1.0900],
        [-5.0448, -2.3922,  2.3818, -1.9395]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 282/289 [03:33<00:05,  1.33it/s]

Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1916576325893402, logits - tensor([[-6.3786,  2.0930, -5.8101, -1.2299],
        [-5.7696,  0.7057, -4.8385, -1.9414],
        [-4.8868, -2.6719, -3.9757,  2.5487],
        [-6.4647,  1.6190, -5.5830, -1.1027],
        [-6.0521, -3.5531,  1.9501, -2.6397],
        [-5.9525,  1.6109, -4.7515, -1.5622],
        [-6.0112,  1.5003, -5.5998, -1.6253],
        [-5.2808,  2.0526, -5.5482, -2.1300]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 283/289 [03:33<00:04,  1.33it/s]

Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3970106244087219, logits - tensor([[-5.5248,  0.5814, -4.7103, -1.8578],
        [-5.9103,  1.6373, -5.1133, -1.7147],
        [-6.0032,  1.9219, -5.8425, -1.1413],
        [-5.9343, -1.3344, -4.0363,  1.5034],
        [-6.3536, -2.4162, -5.6618,  2.4969],
        [-5.2313, -3.1257,  2.0408, -2.4849],
        [-5.6962, -2.8600,  2.4341, -2.0669],
        [-5.7279, -3.0623, -4.1644,  2.2369]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 284/289 [03:34<00:03,  1.32it/s]

Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09586493670940399, logits - tensor([[-5.1821, -2.8672,  1.0561, -1.5792],
        [-6.2040, -1.5184, -5.5466,  2.0727],
        [-6.0865, -2.2071, -4.6727,  2.1008],
        [-6.3689,  1.7430, -5.9644, -1.6909],
        [-5.6365,  1.2600, -5.5277, -1.2593],
        [-4.9296, -3.5850,  1.9983, -2.0282],
        [-4.5857, -2.7745,  1.4528, -1.6779],
        [-6.4393,  1.6669, -4.7704, -1.4821]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▊| 285/289 [03:35<00:03,  1.31it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10003527998924255, logits - tensor([[-5.2212,  1.7855, -4.8199, -0.8989],
        [-4.8147, -3.3746,  1.6583, -2.3470],
        [-6.0975,  1.9622, -6.2593, -0.6968],
        [-5.1395,  1.6906, -5.0657, -2.0148],
        [-4.8593, -3.1302,  1.6033, -1.4056],
        [-5.4815,  1.2525, -4.1606, -1.5171],
        [-6.3012,  2.5496, -6.0734, -1.4168],
        [-6.0672,  2.0232, -5.5162, -1.5771]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:36<00:02,  1.31it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32070469856262207, logits - tensor([[-6.8828,  1.8095, -6.1987, -1.5930],
        [-5.8960, -2.1601, -5.8201,  1.6444],
        [-5.9284,  1.3200, -4.8987, -0.6889],
        [-6.0991, -3.3507,  2.5800, -2.4367],
        [-6.2860,  1.4393, -4.5321, -1.3353],
        [-5.8921,  1.5477, -5.3240, -1.3216],
        [-4.6250, -3.1843,  1.9637, -2.1306],
        [-6.7613,  1.8412, -5.7705, -1.6159]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:36<00:01,  1.32it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20525172352790833, logits - tensor([[-5.6059,  1.5986, -5.2720, -1.6506],
        [-4.5056, -1.3038, -4.3149,  1.8992],
        [-6.6121,  1.7316, -6.0301, -1.1598],
        [-5.9542,  2.0183, -4.9023, -1.6398],
        [-4.4062, -3.0545,  1.2314, -1.9080],
        [-6.0564,  2.2804, -5.5290, -0.9981],
        [-7.5214, -1.0029, -5.4168,  1.5383],
        [-5.8901,  1.4956, -4.9612, -1.6757]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:37<00:00,  1.32it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2959824204444885, logits - tensor([[-5.0972,  1.2158, -5.1294, -1.1200],
        [-6.3203,  2.0061, -5.7728, -1.3775],
        [-6.0572,  1.6087, -5.8307, -1.5201],
        [-4.9934,  2.0600, -5.2416, -1.6271]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:38<00:00,  1.33it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:55,  3.50it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:50,  3.77it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:49,  3.89it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:50,  3.77it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:49,  3.84it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:47,  3.93it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:47,  3.94it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.91it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:46,  3.96it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  4.00it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:45,  4.03it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:45,  3.96it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  4.00it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:44,  4.03it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:44,  4.02it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:44,  4.01it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:44,  3.98it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:44,  3.98it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:43,  3.98it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:05<00:43,  3.96it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:43,  3.96it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:43,  3.97it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:42,  4.00it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:42,  4.02it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:42,  3.98it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  3.97it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:42,  3.97it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:41,  3.98it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.99it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.99it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:40,  3.98it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:40,  3.97it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:40,  3.98it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:40,  3.98it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:40,  3.94it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:39,  3.97it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:39,  4.00it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.96it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:38,  3.98it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:38,  3.99it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:38,  3.98it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.95it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:38,  3.97it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:37,  3.98it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:37,  3.97it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:37,  3.98it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.96it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:37,  3.93it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.94it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.92it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:36,  3.94it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:36,  3.90it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:36,  3.85it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:36,  3.87it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:36,  3.85it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:35,  3.86it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:35,  3.84it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  3.90it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:34,  3.89it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:34,  3.91it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:33,  3.92it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.94it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:15<00:33,  3.96it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  3.97it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  3.98it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  3.99it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:16<00:31,  4.03it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  4.01it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  4.01it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:30,  4.03it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:17<00:30,  4.02it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  4.01it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  4.03it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  4.00it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:18<00:30,  3.95it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:30,  3.91it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.92it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.91it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:19<00:29,  3.95it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  3.96it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  3.94it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:28,  3.97it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:20<00:27,  3.97it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  4.01it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  4.02it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:26,  4.02it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:21<00:26,  4.00it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  4.00it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  4.00it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:25,  4.03it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:22<00:25,  4.02it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  4.00it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  4.01it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  4.00it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:23<00:25,  3.96it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.95it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.97it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.96it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:24<00:23,  4.00it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  4.01it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  4.00it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.96it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:25<00:22,  3.96it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:22,  3.96it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.97it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.99it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:27<00:22,  3.95it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.94it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  3.96it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:21,  3.91it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:28<00:21,  3.93it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  3.93it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.90it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:20,  3.87it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:29<00:20,  3.91it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.91it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.93it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.95it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:30<00:18,  3.96it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  3.95it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  3.97it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:18,  3.95it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:31<00:17,  3.96it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  3.98it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  3.96it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:17,  3.99it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:32<00:16,  4.00it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  4.00it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  4.00it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:15,  4.00it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:33<00:15,  4.01it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  4.00it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.96it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:33<00:15,  3.96it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:34<00:15,  3.91it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.91it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.89it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:34<00:14,  3.90it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:35<00:14,  3.88it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  3.88it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.91it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:35<00:13,  3.90it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:36<00:12,  3.94it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.92it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.96it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:36<00:12,  3.96it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:37<00:11,  3.96it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  4.00it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  3.98it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:37<00:11,  3.98it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:38<00:10,  3.98it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  3.96it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  3.92it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:38<00:10,  3.93it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:39<00:09,  3.94it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  3.89it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.92it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:39<00:09,  3.95it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:40<00:08,  3.94it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  3.89it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  3.92it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:40<00:08,  3.91it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:41<00:07,  3.94it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  3.92it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.90it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:41<00:07,  3.94it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:42<00:06,  3.96it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.94it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.97it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:42<00:06,  3.97it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:43<00:05,  3.98it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.96it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.99it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:43<00:05,  3.99it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:44<00:04,  3.96it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.94it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.98it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:45<00:04,  3.93it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:45<00:03,  3.90it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.95it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  3.94it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:46<00:03,  3.95it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:46<00:02,  3.93it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.95it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  3.92it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:47<00:02,  3.92it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:47<00:01,  3.94it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  3.91it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  3.92it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:48<00:01,  3.95it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:48<00:00,  3.94it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  3.94it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  3.97it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:49<00:00,  3.95it/s]


[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 927, 'tn': 332, 'fp': 34, 'fn': 259}, {'tp': 158, 'tn': 1364, 'fp': 2, 'fn': 28}, {'tp': 155, 'tn': 1097, 'fp': 276, 'fn': 24}]
Detailed accuracy after 1 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.8112113402061856
yes_no accuarcy: 0.9806701030927835
abstractive accuarcy: 0.8067010309278351
Overall accuarcy: 0.899645618556701
Best accuarcy: 0.8973904639175257
0.899645618556701
Model Updated


  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11593576520681381, logits - tensor([[-5.9219,  1.1813, -5.1149, -0.6268],
        [-5.1905,  0.8905, -5.3378, -1.0403],
        [-6.4677,  0.9432, -4.8326, -1.2141],
        [-4.8366,  1.6476, -4.8945, -1.3514],
        [-6.3879,  2.2978, -5.7676, -1.7901],
        [-5.5818, -3.7030,  1.7656, -2.0296],
        [-6.0188,  1.0368, -5.5944, -1.9084],
        [-6.1747, -1.9125, -5.2215,  1.6789]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:00<04:01,  1.19it/s]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09891599416732788, logits - tensor([[-4.8881, -3.1377,  1.2527, -2.3322],
        [-4.8295, -3.0358,  1.6668, -1.2299],
        [-6.1868,  1.8659, -5.1213, -1.5255],
        [-6.1546,  1.7695, -5.4254, -1.3967],
        [-4.9035, -3.1600, -4.5708,  2.9482],
        [-4.3555, -2.7100,  1.9200, -1.6949],
        [-5.8749,  1.7629, -5.2918, -0.8414],
        [-6.1057,  1.3610, -4.9126, -1.0575]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:01<03:45,  1.27it/s]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5322823524475098, logits - tensor([[-5.5780,  1.1730, -5.2031, -0.4354],
        [-6.2716,  1.3312, -5.5773, -1.7751],
        [-6.1124, -1.6203, -4.9229,  0.9027],
        [-6.0716,  1.2536, -5.5139, -1.6115],
        [-5.7886, -3.2681, -5.4789,  2.1339],
        [-6.3548,  1.5347, -5.8187, -1.4358],
        [-5.3642,  0.7899, -5.2838, -1.1358],
        [-5.3134, -2.9316,  1.6032, -2.3526]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:02<03:39,  1.31it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10628339648246765, logits - tensor([[-6.9628,  2.2493, -6.3567, -1.5411],
        [-5.9792,  0.9327, -5.4333, -1.4000],
        [-6.7830,  1.3368, -5.1912, -1.3669],
        [-5.9729,  1.0291, -4.7700, -1.0741],
        [-6.9867,  1.4568, -5.8949, -1.7131],
        [-6.2327,  1.5786, -5.3100, -0.9556],
        [-5.6567, -3.0045, -4.1635,  2.5019],
        [-5.3733,  1.9449, -4.9497, -1.1398]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:03<03:35,  1.32it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20630131661891937, logits - tensor([[-6.8383,  1.4209, -5.2750, -2.0874],
        [-4.9100, -3.2730,  1.7042, -2.7990],
        [-6.9321,  0.5321, -6.3278, -2.0186],
        [-6.1789,  1.6818, -5.6917, -1.4912],
        [-4.7742, -3.1724,  2.0442, -2.1848],
        [-6.5843,  1.9067, -6.3251, -1.6005],
        [-6.7456,  2.3128, -5.1711, -1.5301],
        [-6.3320,  1.6735, -5.9419, -2.2203]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:03<03:33,  1.33it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23231399059295654, logits - tensor([[-6.4186,  1.2092, -5.0965, -1.1546],
        [-5.7019, -1.7108, -5.4811,  2.1774],
        [-6.4306,  1.0992, -5.3509, -1.2123],
        [-5.3214,  1.3574, -5.2019, -1.6647],
        [-6.0193,  1.0824, -5.7762, -1.1689],
        [-6.1394,  1.6152, -5.1431, -2.2527],
        [-5.5999, -2.9992,  0.8562, -1.7319],
        [-4.6418, -3.0847,  1.9239, -1.9107]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:04<03:31,  1.34it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18664588034152985, logits - tensor([[-5.7147, -2.0473, -4.5536,  1.6991],
        [-5.7073, -1.5844, -5.2362,  1.0864],
        [-6.0886,  1.6565, -5.4727, -0.8523],
        [-5.3223, -3.1422,  1.5493, -1.7903],
        [-6.5704,  2.1798, -6.1572, -1.9049],
        [-5.6548,  1.0763, -4.7116, -1.7599],
        [-6.2537,  2.0029, -5.8312, -1.6544],
        [-5.8388,  0.6336, -5.0547, -1.0924]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:05<03:30,  1.34it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21171917021274567, logits - tensor([[-6.7270,  1.7524, -5.8381, -1.0201],
        [-6.1345, -2.8333,  1.1988, -2.0076],
        [-5.2142,  1.4935, -4.9587, -1.7024],
        [-6.3780,  0.9493, -6.3409, -1.4674],
        [-5.1871,  0.7165, -5.0458, -2.1036],
        [-5.3238, -2.8039,  1.6801, -2.3828],
        [-5.5695,  0.9607, -5.2221, -1.3297],
        [-6.0211,  1.9286, -5.9247, -1.6800]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 8/289 [00:06<03:30,  1.34it/s]

Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26744604110717773, logits - tensor([[-6.0855,  1.7400, -5.4046, -1.0563],
        [-5.5991,  1.6127, -5.0618, -1.6047],
        [-5.3294,  1.3068, -4.6448, -1.8599],
        [-6.5831,  2.0089, -6.4272, -1.6203],
        [-6.3141,  1.4627, -5.0859, -0.3191],
        [-6.8869,  0.7033, -5.8741, -0.3261],
        [-5.4233,  1.3506, -4.3694, -1.2163],
        [-6.5944,  0.9684, -5.1830, -1.1933]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 9/289 [00:06<03:30,  1.33it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4156627357006073, logits - tensor([[-5.9991, -2.0515, -4.7669,  1.6315],
        [-4.9614, -3.2322,  0.9340, -1.7239],
        [-6.6141,  2.3133, -5.6098, -0.9978],
        [-6.5219,  2.0969, -5.5915, -1.4090],
        [-5.8209, -3.0169, -4.9324,  2.3230],
        [-6.2638,  1.7075, -5.2708, -0.8679],
        [-6.5082,  0.8995, -4.8465, -1.6230],
        [-5.3992, -2.7627,  1.5683, -1.6083]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 10/289 [00:07<03:29,  1.33it/s]

Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16355104744434357, logits - tensor([[-5.7196,  0.7332, -5.4405, -1.7018],
        [-5.3157, -3.5822,  1.5120, -1.8906],
        [-5.4445,  2.2121, -5.2229, -1.1913],
        [-6.9841,  0.9115, -5.7788, -0.2927],
        [-6.0876,  1.3574, -5.9862, -0.9143],
        [-4.8111, -3.2353,  1.0405, -1.4102],
        [-6.3844,  1.8885, -5.9362, -1.9273],
        [-6.1171, -3.2990,  1.3736, -1.9916]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 11/289 [00:08<03:29,  1.33it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2748834490776062, logits - tensor([[-5.5656, -2.9817,  1.5432, -1.9009],
        [-6.0542,  1.0986, -4.9139, -0.9234],
        [-6.2445,  2.1309, -5.5126, -2.8247],
        [-5.5175,  1.3389, -4.2041, -0.8950],
        [-6.2749,  1.2251, -5.9208, -1.8738],
        [-5.6731,  1.7098, -5.5356, -1.6473],
        [-5.7145,  1.6801, -5.0363, -1.3753],
        [-6.3490,  1.0317, -5.2008, -1.1806]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 12/289 [00:09<03:29,  1.32it/s]

Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41478776931762695, logits - tensor([[-5.9802,  1.5555, -5.1663, -1.9633],
        [-6.5827,  1.0725, -5.7972, -0.4092],
        [-5.6386,  1.5290, -5.8145, -1.1712],
        [-6.0049,  1.2951, -5.9732, -1.6432],
        [-5.2092,  1.0580, -4.6579, -1.0035],
        [-5.1992,  1.4974, -4.9051, -1.0153],
        [-5.6003,  0.9625, -5.2196, -0.8153],
        [-4.7357, -2.9471,  2.2435, -1.6453]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 13/289 [00:09<03:28,  1.32it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20340749621391296, logits - tensor([[-5.3582,  0.7296, -4.9125, -1.2406],
        [-5.0859, -3.6812,  1.3896, -2.4631],
        [-5.6110, -0.7858, -5.0337,  1.4434],
        [-6.5960,  1.3097, -5.1947, -1.2406],
        [-5.2044,  1.4586, -3.9484, -1.8364],
        [-5.7448, -3.2590,  1.0073, -1.4695],
        [-5.3778, -3.1852,  1.6572, -1.6029],
        [-5.0711,  1.4317, -5.1046, -1.1362]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:10<03:27,  1.33it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19713062047958374, logits - tensor([[-5.9770,  1.5154, -5.1595, -1.6208],
        [-4.9740, -2.9972, -4.3769,  2.2916],
        [-6.2042,  1.2032, -4.9059, -1.5434],
        [-5.4957,  1.1047, -5.6946, -1.3718],
        [-5.8122,  1.7742, -4.9343, -0.9899],
        [-7.1387,  1.7966, -5.4031, -1.1240],
        [-5.3435, -3.6028,  1.2979, -1.4765],
        [-5.0303, -2.6757,  1.3843, -1.8061]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▌         | 15/289 [00:11<03:27,  1.32it/s]

Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23562854528427124, logits - tensor([[-5.5011,  0.9411, -5.0992, -1.5593],
        [-5.7897,  0.9762, -5.0788, -1.8461],
        [-5.9077,  0.8492, -5.2926, -1.2945],
        [-6.5308, -1.6264, -4.5385,  1.8515],
        [-5.8631,  1.0418, -5.0264, -0.9455],
        [-4.7577, -2.3368, -3.9995,  3.5456],
        [-5.9322,  1.0084, -5.4828, -1.0980],
        [-5.1702, -2.7028,  0.9708, -1.9542]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 16/289 [00:12<03:26,  1.32it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12664993107318878, logits - tensor([[-6.2386,  1.6849, -5.7363, -0.8012],
        [-6.0014,  1.1466, -5.5407, -1.1660],
        [-5.9814,  1.4176, -5.4277, -2.3163],
        [-5.1099,  0.9951, -5.0723, -0.8397],
        [-5.9210,  0.8742, -5.6168, -0.5220],
        [-6.2470,  2.0077, -6.3357, -1.8406],
        [-5.4549, -2.8489,  1.8415, -2.0838],
        [-5.4034,  1.5687, -5.2059, -0.9487]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:12<03:26,  1.32it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15933439135551453, logits - tensor([[-6.4403,  1.0039, -5.1584, -1.1998],
        [-4.8199, -3.6760,  1.8035, -1.9944],
        [-5.6165, -2.1625, -4.3861,  1.5616],
        [-5.0698, -2.6705, -4.6796,  3.2820],
        [-5.4124,  1.0605, -5.3791, -1.8081],
        [-7.6102,  1.3469, -5.7986, -1.5066],
        [-6.2953,  2.5942, -5.6435, -1.8967],
        [-4.6191, -2.8034,  1.5629, -1.5351]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:24,  1.32it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25638943910598755, logits - tensor([[-6.7617,  0.7862, -5.7229, -1.6123],
        [-6.0235,  1.2704, -4.6908, -0.8705],
        [-5.6374,  1.1648, -4.7137, -1.2183],
        [-5.9294,  1.1754, -5.5900, -0.8376],
        [-5.2904, -3.8665,  2.0008, -2.3922],
        [-5.4236,  1.3144, -5.0902, -1.7738],
        [-6.3786,  0.7687, -5.4550, -0.6803],
        [-5.6951,  0.8299, -5.1568, -1.5761]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:23,  1.33it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17103196680545807, logits - tensor([[-6.1038,  1.3400, -4.9720, -0.8737],
        [-4.9577, -2.7943,  2.5766, -2.3731],
        [-6.4366,  1.2027, -5.1653, -1.0451],
        [-5.0821, -2.0495,  1.9981, -1.7806],
        [-7.1661,  1.7181, -6.3415, -1.1673],
        [-6.0273, -2.2682, -4.0153,  2.3425],
        [-6.1715,  0.5238, -5.4706, -1.3327],
        [-5.3958,  0.9695, -4.6414, -2.0123]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:23,  1.32it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21722950041294098, logits - tensor([[-5.4476, -2.1701, -4.1309,  1.9614],
        [-5.8654, -1.5975, -4.5448,  1.3214],
        [-6.5535, -1.4044, -5.5699,  1.3890],
        [-6.3149,  1.4831, -5.3943, -1.6373],
        [-6.5622,  0.5291, -4.5798, -1.2073],
        [-6.5689,  0.9853, -5.2515, -1.1146],
        [-5.5972, -1.6489, -5.0829,  1.1454],
        [-4.7864,  1.4130, -4.6242, -1.2187]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:15<03:22,  1.32it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11136417835950851, logits - tensor([[-6.2053,  1.3038, -5.7327, -1.4708],
        [-5.8654,  1.5477, -5.3581, -0.6110],
        [-6.4803,  1.2420, -5.5300, -1.7948],
        [-6.2705, -0.7851, -4.6094,  0.6189],
        [-6.4427,  1.2417, -5.3757, -1.3662],
        [-6.6147,  2.3994, -6.2333, -1.3584],
        [-4.9459, -3.3827, -4.2677,  3.1151],
        [-6.1674,  1.8580, -5.4682, -1.5591]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:21,  1.32it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1552116721868515, logits - tensor([[-5.9588,  1.4016, -4.7877, -1.2329],
        [-5.1703, -2.7740, -4.0251,  2.1915],
        [-4.9857, -3.1542, -3.8887,  3.1162],
        [-5.4800, -3.2003,  1.9207, -1.5486],
        [-4.4547, -2.6404,  1.8589, -2.6852],
        [-5.7988,  1.6512, -5.1867, -1.9116],
        [-6.4014,  1.3791, -5.4158, -0.6931],
        [-6.2484,  1.1316, -5.3377, -0.7505]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:21,  1.32it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1425972729921341, logits - tensor([[-6.1394,  0.5630, -5.5490, -1.1474],
        [-7.4881,  0.4753, -6.5465, -0.3667],
        [-4.1604, -2.9819, -4.5415,  2.7487],
        [-6.5046,  1.3352, -5.7939, -1.9675],
        [-5.7234,  1.8946, -6.5193, -1.2985],
        [-4.9113, -3.0560, -3.4686,  2.7290],
        [-6.0468,  1.3270, -5.2707, -1.5623],
        [-4.2409, -2.6117,  1.1804, -1.8170]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:21,  1.32it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.46138790249824524, logits - tensor([[-5.9900,  1.2289, -5.6605, -1.1171],
        [-6.6191,  0.8876, -5.0304, -0.7231],
        [-6.5228, -0.9173, -5.5035,  0.9813],
        [-5.4175, -2.9990,  2.1269, -2.3994],
        [-4.1051, -3.8409, -4.3616,  2.3592],
        [-5.7502, -1.3592, -0.8311, -1.4857],
        [-5.1767,  1.4631, -5.2666, -1.3818],
        [-6.8698,  0.8405, -6.1430, -1.1527]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:18<03:21,  1.31it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21386805176734924, logits - tensor([[-6.4338,  1.2661, -6.0730, -1.8968],
        [-6.2390,  0.7512, -6.0896, -1.6027],
        [-6.6786,  1.6916, -5.2783, -1.1591],
        [-6.2336,  0.5643, -5.8611, -1.8626],
        [-5.0888, -2.5908,  1.4828, -2.1474],
        [-6.4750,  1.6089, -5.4450, -1.2627],
        [-5.7610,  1.6104, -5.4881, -1.7776],
        [-6.7244,  1.8646, -6.1141, -1.4259]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:20,  1.31it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3439120650291443, logits - tensor([[-5.0325, -2.0592, -4.5952,  1.9044],
        [-6.7802,  1.0360, -5.2190, -1.2717],
        [-5.8069,  1.1060, -5.7991, -0.4957],
        [-6.4214,  1.1821, -5.9897, -1.9982],
        [-5.0494, -3.6226, -3.9977,  3.6500],
        [-5.7632, -3.5682,  2.0221, -2.2121],
        [-5.1779, -2.8717, -3.7882,  1.9279],
        [-5.0466, -2.2657, -4.0161,  1.9684]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 27/289 [00:20<03:19,  1.31it/s]

Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 10%|▉         | 28/289 [00:21<03:18,  1.31it/s]

loss - 0.4661181569099426, logits - tensor([[-5.9155,  1.3962, -5.5542, -1.3537],
        [-6.0575,  1.2594, -5.3049, -1.2862],
        [-5.6429,  1.2987, -5.0515, -1.8482],
        [-6.6047, -2.2884, -4.6274,  1.7542],
        [-4.7652, -3.2445,  1.9705, -2.5667],
        [-6.7147,  1.2289, -5.0699, -2.3689],
        [-5.6081,  1.2659, -5.1871, -1.8650],
        [-5.0297, -3.7203,  1.3509, -2.4673]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4667416214942932, logits - tensor([[-5.6501, -3.4248,  2.4017, -2.3070],
        [-5.9847,  1.4798, -5.0303, -1.8190],
        [-4.2666, -3.0512, -3.2191,  2.0022],
        [-6.2306,  1.1492, -5.5975, -2.0139],
        [-5.6037,  1.3530, -5.3294, -2.0146],
        [-5.7261, -4.1800,  1.9527, -2.8195],
        [-6.1397,  1.9600, -5.5092, -1.8315],
        [-6.3626, -3.6422,  2.2189, -2.64

 10%|█         | 29/289 [00:21<03:18,  1.31it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19650408625602722, logits - tensor([[-6.2540,  0.9601, -5.2130, -1.2450],
        [-6.8070,  1.2962, -5.7885, -1.8280],
        [-6.3296,  1.5244, -4.6784, -0.7390],
        [-6.3560,  1.0280, -5.5656, -1.5114],
        [-6.0412,  1.3924, -5.4360, -1.4737],
        [-7.4634,  0.7199, -5.6502, -1.4024],
        [-4.5776, -2.8196, -4.8522,  2.7388],
        [-4.8731,  0.7438, -5.0140, -0.9085]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:18,  1.30it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18305662274360657, logits - tensor([[-7.0116,  1.0236, -5.5521, -1.8021],
        [-7.4409,  1.9399, -5.9088, -1.4304],
        [-4.9809, -4.6090,  2.7823, -3.0807],
        [-6.6023,  1.4400, -5.9337, -1.3772],
        [-4.8569, -2.8366, -3.6360,  2.0110],
        [-6.9702,  0.5909, -6.3383, -1.2291],
        [-6.6500,  0.9955, -5.8816, -2.0265],
        [-3.6765, -2.2082,  2.6688, -2.1928]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 31/289 [00:23<03:18,  1.30it/s]

Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3407317101955414, logits - tensor([[-6.2389,  1.3518, -5.3381, -1.5423],
        [-6.6190,  1.4911, -5.8639, -2.2767],
        [-5.9482,  1.7081, -5.7172, -1.3575],
        [-4.5823, -2.9547, -3.9223,  2.6372],
        [-5.0956, -2.6154, -3.8344,  2.4565],
        [-4.8126, -3.2200,  1.8999, -3.0041],
        [-5.2686,  1.3961, -4.5887, -1.2549],
        [-4.9564, -3.9629,  2.2639, -2.2186]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 32/289 [00:24<03:17,  1.30it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1791730523109436, logits - tensor([[-6.2735,  1.7301, -5.1647, -0.9362],
        [-6.0110,  1.5328, -4.3150, -2.1132],
        [-6.4084,  1.9290, -5.2446, -1.3205],
        [-5.9099,  0.9237, -4.5749, -1.3678],
        [-6.1876,  1.6270, -5.1132, -1.3775],
        [-5.8108,  1.0445, -6.1359, -2.0189],
        [-6.4864,  1.4402, -5.3777, -1.4976],
        [-5.7510,  1.1041, -5.1122, -2.2318]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:25<03:17,  1.29it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20781195163726807, logits - tensor([[-5.1026,  1.6207, -4.7051, -1.4545],
        [-5.3326, -2.0663,  1.3216, -1.7225],
        [-6.6380,  1.2687, -5.5170, -0.8234],
        [-5.3403,  0.9303, -4.7731, -1.0751],
        [-4.0191, -2.6445,  2.2319, -2.7003],
        [-5.2099, -3.1965, -3.3786,  2.3215],
        [-6.5949,  1.6347, -4.9990, -1.5956],
        [-5.0452, -2.5610,  1.9784, -2.6510]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:16,  1.30it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3309967517852783, logits - tensor([[-5.7163, -3.2126,  1.8910, -2.3639],
        [-6.6495,  1.7931, -6.0498, -2.1916],
        [-6.2522, -2.8279, -4.3420,  2.5392],
        [-7.0906,  1.0028, -5.5142, -1.6304],
        [-6.2943,  1.4830, -6.2361, -2.7599],
        [-6.1278,  2.2882, -5.5912, -1.8069],
        [-6.3185,  1.5459, -5.8201, -2.0417],
        [-6.2675,  0.4605, -4.9882, -1.3722]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 35/289 [00:26<03:15,  1.30it/s]

Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43541982769966125, logits - tensor([[-7.5700,  0.5916, -5.2518, -0.2711],
        [-6.1054,  1.8856, -5.4838, -2.1133],
        [-6.1175,  2.0525, -5.7365, -1.3031],
        [-6.2945,  1.6577, -5.3912, -1.4765],
        [-5.7417,  1.9114, -5.3133, -2.7405],
        [-6.1533,  1.7957, -4.6558, -2.2697],
        [-6.5251,  0.6252, -5.2328, -1.5097],
        [-6.3776,  2.0369, -5.2576, -1.2569]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 36/289 [00:27<03:13,  1.31it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5001481771469116, logits - tensor([[-6.6423,  2.1078, -6.2397, -0.9588],
        [-6.5299,  1.5777, -5.5082, -1.9090],
        [-4.5333, -2.9670,  1.6175, -2.2176],
        [-5.0373, -3.0417,  2.7635, -2.3084],
        [-6.4917,  1.6551, -5.6645, -1.5990],
        [-6.3194,  1.4371, -5.8933, -2.0174],
        [-6.0994,  1.6460, -6.0123, -1.6838],
        [-6.5231,  1.8889, -5.6912, -2.2215]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:28<03:12,  1.31it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17327892780303955, logits - tensor([[-6.3576,  1.0287, -4.8782, -2.0439],
        [-5.5562, -3.5617,  2.2986, -3.5064],
        [-5.4457,  1.2332, -5.2761, -2.4094],
        [-4.9303,  1.5162, -3.8950, -1.4542],
        [-4.7589, -2.4630,  1.9196, -2.6597],
        [-5.5422,  1.7543, -5.3017, -2.1939],
        [-4.9641, -2.6495,  1.6480, -2.3139],
        [-5.0435,  1.5743, -4.7463, -1.2114]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 38/289 [00:28<03:11,  1.31it/s]

Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2752733528614044, logits - tensor([[-6.1439,  1.6080, -4.2574, -1.0974],
        [-6.6057,  1.7300, -5.4130, -1.1897],
        [-5.7063, -1.8873, -4.2340,  1.6424],
        [-5.8671, -2.5235, -4.3556,  2.5027],
        [-6.4799,  1.5030, -6.2086, -1.7857],
        [-6.8784,  1.0445, -6.1750, -0.3969],
        [-5.1706, -1.6560, -4.5596,  2.8086],
        [-6.0997, -0.8822, -4.6412,  1.1480]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 39/289 [00:29<03:10,  1.31it/s]

Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3878190517425537, logits - tensor([[-4.7473, -2.5445,  0.9878, -1.9606],
        [-6.0189,  1.9099, -5.6566, -1.7246],
        [-6.5370,  1.2565, -5.2040, -1.7112],
        [-5.7166,  2.1115, -5.4779, -0.6377],
        [-5.1313, -3.0345,  2.0927, -2.9047],
        [-6.4121,  2.0804, -5.5995, -1.2964],
        [-6.1680,  0.8403, -4.4340, -0.9504],
        [-5.2159, -3.3977,  1.8640, -3.2720]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 40/289 [00:30<03:10,  1.31it/s]

Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28351086378097534, logits - tensor([[-4.4832, -3.1537,  1.4530, -2.8522],
        [-4.4038, -2.6887, -4.6718,  2.9399],
        [-5.9501,  1.1939, -4.3781, -1.8406],
        [-5.4912, -3.4186, -4.6169,  2.2315],
        [-6.3926, -0.5150, -4.8608,  0.5589],
        [-5.0366, -1.9693,  1.0034, -2.0788],
        [-6.0232,  2.0472, -5.1200, -1.9065],
        [-6.5592,  1.3236, -4.7913, -1.4017]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 41/289 [00:31<03:09,  1.31it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21310622990131378, logits - tensor([[-6.0890,  1.0579, -5.5320, -1.1953],
        [-5.5720,  0.5761, -5.1566, -0.4304],
        [-6.7590,  0.5157, -4.7945, -0.4902],
        [-5.1340, -1.6474, -4.6853,  2.0524],
        [-6.3313, -1.9664, -5.0269,  1.7782],
        [-7.1363,  0.9407, -5.8494, -1.9096],
        [-7.5385, -0.5849, -5.4290, -0.0667],
        [-6.5268,  0.8893, -5.6708, -1.4460]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:31<03:09,  1.31it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21183277666568756, logits - tensor([[-6.3122,  0.8593, -4.6649, -0.6599],
        [-6.0497,  0.8928, -4.5455, -0.1833],
        [-6.5374,  1.2773, -6.1222, -2.2570],
        [-4.3130, -2.1095,  1.4903, -1.7048],
        [-5.1809, -2.5440,  1.4020, -2.3000],
        [-4.5834, -2.4490, -4.4996,  2.2565],
        [-6.2423, -0.3975, -2.4998, -1.7987],
        [-6.5000,  1.4804, -5.3501, -1.9813]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:32<03:07,  1.31it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24258384108543396, logits - tensor([[-6.0079,  1.0915, -4.5649, -1.4461],
        [-6.2739,  1.1517, -4.7350, -0.6675],
        [-5.3469, -2.5261, -4.0817,  3.0519],
        [-5.8554,  1.3790, -4.7968, -0.4837],
        [-5.8666, -0.7087, -4.9061, -0.5193],
        [-3.9187, -2.3763,  0.7821, -2.8680],
        [-4.1840, -2.0028,  1.8678, -2.8290],
        [-5.7701,  1.3273, -4.9081, -1.4328]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:33<03:06,  1.31it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09578655660152435, logits - tensor([[-5.0279, -2.7861,  1.1189, -2.2450],
        [-6.3073,  1.8455, -5.4728, -0.9947],
        [-4.2835, -2.2855,  1.5447, -2.2381],
        [-6.7100,  1.4091, -5.9306, -1.5329],
        [-5.5379,  1.0936, -4.1746, -1.1633],
        [-5.4937, -2.3344, -5.1998,  2.3551],
        [-4.3246, -3.0704,  1.8026, -2.7794],
        [-4.4200, -1.6178, -5.0231,  2.4692]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:34<03:05,  1.32it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10067398846149445, logits - tensor([[-5.7182,  1.1009, -4.6842, -0.9062],
        [-5.9007,  1.2024, -5.0314, -1.7124],
        [-4.8957, -3.3323,  1.6846, -3.5696],
        [-4.9775,  1.3765, -5.1577, -1.4222],
        [-5.2662, -2.5311,  2.3512, -2.4000],
        [-6.5854,  1.4189, -5.3096, -0.8013],
        [-5.8217, -2.5026, -4.8405,  1.7679],
        [-6.4609,  1.4654, -5.8041, -1.9128]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 46/289 [00:34<03:04,  1.32it/s]

Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.49407923221588135, logits - tensor([[-5.4170, -2.3759, -3.9976,  2.1757],
        [-4.3697, -1.8400,  1.1697, -2.1904],
        [-5.6807, -3.1125,  2.1353, -2.3438],
        [-5.6944,  0.9848, -4.7353, -0.8682],
        [-6.2018,  0.6908, -4.3287, -0.9132],
        [-5.7742,  0.9523, -4.7840, -1.4463],
        [-6.0391,  0.8970, -5.8303, -1.4246],
        [-6.4370,  1.3919, -5.2867, -1.6019]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▋        | 47/289 [00:35<03:03,  1.32it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13542699813842773, logits - tensor([[-6.2795,  1.1957, -4.7778, -1.6856],
        [-6.4161,  1.5409, -5.2329, -1.7039],
        [-4.9540,  1.2373, -4.0761, -1.2304],
        [-6.0433,  0.9220, -4.7079, -1.0337],
        [-6.8022,  1.5510, -5.7178, -0.9161],
        [-5.5550,  1.0069, -5.0042, -0.6186],
        [-5.0993, -2.3467, -4.4915,  2.1599],
        [-7.0575,  0.4163, -5.1068, -0.9819]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:36<03:02,  1.32it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23464882373809814, logits - tensor([[-5.1435,  1.2312, -4.4405, -1.9457],
        [-5.0630, -2.5006, -4.9490,  2.2731],
        [-5.8149, -2.7048,  1.8811, -2.8534],
        [-5.1390,  1.2529, -4.4542, -0.8342],
        [-5.1374, -2.1739,  0.7304, -2.0872],
        [-4.9576, -3.3717,  1.5620, -2.2111],
        [-5.9168,  0.5968, -3.7345, -0.7987],
        [-5.6112,  0.8342, -4.3521, -0.8315]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:37<03:01,  1.32it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17428113520145416, logits - tensor([[-5.6471,  1.6813, -4.3854, -1.3012],
        [-6.0929,  1.4752, -4.1636, -1.4798],
        [-4.9410, -2.3467,  2.1380, -2.1134],
        [-4.1619, -2.9844,  1.8316, -2.5453],
        [-5.8590, -1.9961, -4.2329,  1.9523],
        [-5.0703, -2.9844,  2.2290, -3.0449],
        [-4.3650, -2.5182,  1.6742, -2.8184],
        [-5.3350, -2.4735, -4.2226,  1.3847]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 50/289 [00:38<03:01,  1.32it/s]

Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1598818600177765, logits - tensor([[-7.5483,  1.1697, -6.3320, -1.2502],
        [-5.4421, -0.7268, -4.4262,  1.2414],
        [-5.9499,  0.7403, -4.7387, -0.9965],
        [-5.8989, -2.5546,  2.1929, -2.3353],
        [-6.5863,  0.7235, -4.6075, -0.4348],
        [-6.3290,  1.3919, -5.1231, -0.8412],
        [-6.2566,  1.6129, -4.6407, -1.2533],
        [-6.0052,  0.2903, -5.6733, -0.6131]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 51/289 [00:38<03:00,  1.32it/s]

Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2747902274131775, logits - tensor([[-5.6582,  0.9585, -4.1992, -1.0691],
        [-6.1416,  1.1484, -4.5169, -0.9351],
        [-5.7873,  1.5143, -4.6832, -1.8671],
        [-5.7101,  1.6595, -5.2451, -1.2041],
        [-6.2387,  1.7443, -4.5039, -1.1624],
        [-5.5347,  1.6420, -5.7593, -1.4644],
        [-5.8798, -1.4122, -5.0601,  2.3497],
        [-5.9053,  1.8313, -4.9965, -1.1008]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 52/289 [00:39<02:59,  1.32it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19317349791526794, logits - tensor([[-5.7700,  1.2167, -5.1578, -1.0292],
        [-6.1401,  0.7937, -3.7776, -1.4837],
        [-5.4745,  1.4571, -4.8667, -1.3252],
        [-6.8886,  1.8992, -4.7249, -0.8762],
        [-4.6773,  1.4730, -4.7177, -1.8457],
        [-6.4199,  1.4642, -5.4627, -0.9790],
        [-6.0719, -3.0709,  2.1718, -2.0294],
        [-5.7011, -3.5067, -4.7834,  2.9391]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:40<02:58,  1.32it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3891032338142395, logits - tensor([[-6.6142,  1.3808, -5.6977, -0.9868],
        [-6.0560,  0.4898, -4.7936, -1.3634],
        [-5.4981,  1.3613, -5.1004, -0.6912],
        [-5.7484,  0.9898, -4.7652, -1.5246],
        [-3.8083, -2.0974,  2.7107, -2.1127],
        [-5.1902,  1.7662, -4.3209, -1.6470],
        [-4.6505, -2.7975,  2.5538, -3.0277],
        [-6.4205,  1.7582, -5.7671, -1.4603]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:41<02:58,  1.32it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22871263325214386, logits - tensor([[-5.2430, -2.3146,  1.1822, -2.6168],
        [-6.5521, -2.2252, -2.3353,  0.9981],
        [-6.1093,  1.2767, -4.9499, -0.8999],
        [-5.5003, -3.4168, -4.4632,  2.8642],
        [-5.7774,  0.5853, -5.0531, -0.9017],
        [-6.6658,  1.0727, -5.2506, -0.6510],
        [-4.4408, -2.5229,  1.9988, -2.8235],
        [-3.8930, -3.0016, -3.8266,  1.9694]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:41<02:57,  1.32it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16928730905056, logits - tensor([[-6.9114,  1.4019, -5.8069, -0.7131],
        [-4.2543, -3.1743, -3.6673,  2.4617],
        [-6.0965,  1.1336, -4.8604, -1.6483],
        [-5.3454, -3.3447,  1.6840, -2.8722],
        [-4.8568,  1.1514, -4.4191, -0.5493],
        [-6.7558,  1.2119, -6.3970, -1.4876],
        [-6.2763,  0.8121, -5.1823, -1.2939],
        [-5.9039,  1.7225, -5.2607, -1.5350]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:42<02:56,  1.32it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11367006599903107, logits - tensor([[-5.9167,  1.3100, -5.6533, -1.3494],
        [-4.3042, -2.8382,  2.0502, -2.5836],
        [-5.5606, -2.7114, -3.9131,  3.9779],
        [-4.1394, -2.7001,  1.6862, -2.1104],
        [-6.2084, -3.3181, -5.3815,  3.5323],
        [-6.0789,  1.0507, -4.7037, -0.5593],
        [-6.5204,  0.7410, -4.9576, -0.6966],
        [-5.6216,  1.9155, -5.0401, -0.1059]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:43<02:55,  1.32it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3223085105419159, logits - tensor([[-5.5583,  1.8399, -5.2407, -1.0688],
        [-6.3736,  1.0824, -4.2844, -1.4439],
        [-7.0666,  1.4600, -5.9759, -1.5761],
        [-6.1085,  0.7845, -4.7118, -0.9586],
        [-5.0703, -1.8927, -4.4903,  2.1692],
        [-6.1135,  0.2131, -5.6110,  0.1819],
        [-6.2915,  0.7542, -5.0627, -0.9330],
        [-6.6049,  1.1426, -5.7631, -1.1082]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 58/289 [00:44<02:54,  1.32it/s]

Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23852376639842987, logits - tensor([[-6.3828,  1.3939, -5.3017, -1.3045],
        [-5.6517,  2.4906, -5.4481, -0.9944],
        [-4.3524, -3.0440,  2.3157, -2.4257],
        [-6.0590, -1.7360, -4.8569,  2.5454],
        [-5.9286,  1.6734, -5.2485, -1.2234],
        [-6.4735,  0.7919, -5.0217, -1.1075],
        [-5.2026, -2.9143,  1.8151, -3.1835],
        [-4.9039,  0.9398, -4.2921, -1.0922]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 59/289 [00:44<02:53,  1.32it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3239254951477051, logits - tensor([[-5.7373,  1.5139, -5.0508, -1.2126],
        [-6.5105,  1.5494, -5.3031, -1.1121],
        [-6.0696, -1.5337, -3.7580,  0.8187],
        [-6.4529,  1.3171, -4.8390, -1.0625],
        [-7.3432,  1.2226, -6.2839, -0.7819],
        [-5.8877,  1.3524, -4.6099, -0.8125],
        [-6.8902,  0.5114, -5.3786, -0.4760],
        [-5.5349,  1.2311, -4.5971, -1.6790]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:45<02:52,  1.33it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39218127727508545, logits - tensor([[-5.1048, -1.0656, -4.3824,  1.2425],
        [-4.9260, -3.6140,  1.8249, -2.6806],
        [-5.8249,  1.0549, -5.7204, -1.4171],
        [-5.7866,  1.3824, -5.6453, -1.4939],
        [-6.2709, -2.8061, -4.9302,  2.9469],
        [-6.6693,  1.5231, -5.6834, -1.1618],
        [-5.5365, -3.4875,  3.9516, -2.4181],
        [-5.6025, -2.5087, -4.5707,  2.2580]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:46<02:51,  1.33it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18701177835464478, logits - tensor([[-6.4919,  1.3508, -6.1599, -1.3671],
        [-6.1712,  1.3667, -5.6874, -1.2466],
        [-4.5984, -3.0948,  2.1679, -2.3561],
        [-6.6988,  1.1020, -5.4702, -1.4188],
        [-6.6612,  1.1965, -5.0677, -1.1644],
        [-6.9410,  1.3246, -5.2367, -1.1687],
        [-5.3055, -3.8294, -3.8960,  2.4496],
        [-6.4690,  1.6813, -4.4870, -0.8785]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██▏       | 62/289 [00:47<02:50,  1.33it/s]

Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29444757103919983, logits - tensor([[-5.7818,  0.9479, -4.7264, -1.0648],
        [-5.6531,  1.0186, -5.4660, -1.0792],
        [-6.7499,  1.3181, -4.7412, -1.4583],
        [-5.9702,  1.6746, -4.5728, -1.9504],
        [-3.6406, -3.2570, -3.6694,  1.8330],
        [-5.9030,  0.2808, -4.1395, -1.1633],
        [-7.0412,  1.1811, -5.8297, -0.8850],
        [-7.7336, -0.3299, -5.3468,  0.8350]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 63/289 [00:47<02:49,  1.33it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1303786337375641, logits - tensor([[-5.8288,  1.0236, -5.5114, -0.9638],
        [-6.1801,  0.7282, -5.4242, -1.8520],
        [-5.9831,  1.1540, -4.8288, -0.5276],
        [-6.4990,  0.4607, -5.2028, -1.4762],
        [-5.3590, -1.2676, -4.1131,  3.0620],
        [-6.0797,  0.8390, -6.1804, -1.7130],
        [-5.7514, -3.1041, -4.4049,  2.5033],
        [-6.7157,  1.1369, -4.8815, -1.1721]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:48<02:48,  1.33it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21455813944339752, logits - tensor([[-4.7396, -0.1046, -3.8648, -1.0837],
        [-6.1176,  1.4641, -5.6916, -0.7871],
        [-6.2650,  0.7985, -4.5530, -0.7259],
        [-6.3419,  1.3912, -4.9756, -1.8675],
        [-5.8826,  0.9489, -5.0784, -1.4344],
        [-4.5263, -2.9570, -5.1508,  2.1728],
        [-6.1081,  1.0355, -5.4960, -1.1115],
        [-6.7192,  1.0420, -5.6635, -1.6314]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:49<02:47,  1.33it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10930141806602478, logits - tensor([[-5.8311,  1.5640, -5.8679, -1.2785],
        [-4.9392,  1.4282, -4.5571, -1.9716],
        [-5.3423, -3.4871,  2.5838, -3.2358],
        [-5.0551,  1.2205, -5.6389, -1.6493],
        [-6.6783,  1.4057, -5.1354, -1.0189],
        [-5.4364, -3.2075,  2.0188, -2.2310],
        [-6.1186, -0.6744, -4.5107,  0.9649],
        [-6.0712,  1.1310, -5.1003, -1.2069]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 66/289 [00:50<02:47,  1.33it/s]

Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0849667489528656, logits - tensor([[-6.8514, -2.4555, -5.8402,  2.9250],
        [-7.4076,  1.3074, -5.9003, -1.4101],
        [-6.6040,  1.2318, -5.1444, -1.4703],
        [-5.9350, -3.0145, -5.2371,  3.7596],
        [-4.6184, -3.8208,  3.0292, -3.0673],
        [-6.3459,  1.6947, -5.6710, -0.6080],
        [-6.1598,  1.0553, -5.9261, -0.9225],
        [-4.2133, -3.0373,  2.7923, -2.5065]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 67/289 [00:50<02:46,  1.33it/s]

Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08450880646705627, logits - tensor([[-4.7968, -2.3652, -3.8818,  2.1701],
        [-5.0365, -2.8836,  1.4779, -2.8030],
        [-5.8782,  1.1810, -5.0144, -1.1395],
        [-4.8933, -2.4549,  2.0937, -2.3832],
        [-6.8765,  1.9596, -5.9748, -1.7158],
        [-5.7936,  1.7505, -5.4308, -1.6362],
        [-4.5684, -2.7739,  2.9077, -2.1046],
        [-6.3929,  1.4805, -5.9469, -1.4596]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▎       | 68/289 [00:51<02:45,  1.33it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28446751832962036, logits - tensor([[-5.4903,  2.5471, -5.1543, -1.8380],
        [-5.8675,  1.1685, -5.5570, -1.0833],
        [-6.3421,  1.2271, -5.4088,  0.1248],
        [-6.4024,  1.1376, -5.0460, -1.9270],
        [-6.2685,  1.7671, -5.1179, -1.7425],
        [-5.8412,  0.8490, -5.2985, -0.8333],
        [-4.3590, -2.4719,  1.7518, -2.3187],
        [-4.7290, -3.1799,  2.8461, -3.6049]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:52<02:45,  1.33it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30211856961250305, logits - tensor([[-6.0415,  1.5010, -5.7800, -1.5937],
        [-5.3927,  1.2128, -4.9624, -0.8606],
        [-5.8846,  1.0733, -4.3704, -1.1286],
        [-5.9771,  1.7566, -4.7872, -1.5691],
        [-5.8615, -0.2064, -5.1937,  0.8215],
        [-5.9460,  1.6612, -6.1913, -0.9667],
        [-7.6518,  1.4024, -5.7491, -2.2514],
        [-5.4694,  1.2877, -4.6731, -0.8487]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:53<02:44,  1.33it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40166622400283813, logits - tensor([[-6.3229,  1.8649, -5.4695, -0.7577],
        [-5.8986,  1.8232, -5.3540, -0.9400],
        [-5.4545,  1.1113, -5.0362, -1.6777],
        [-6.5048,  0.9702, -5.2522, -1.6822],
        [-6.2887,  0.9718, -5.6294, -1.0271],
        [-5.3182, -3.5115,  3.1012, -3.3888],
        [-5.9655,  1.3700, -5.3623, -0.8790],
        [-7.0359,  1.7423, -5.4602, -2.2907]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:53<02:45,  1.32it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1414509415626526, logits - tensor([[-5.5282, -2.1033, -4.3154,  2.2206],
        [-5.4638, -3.4243,  3.8512, -2.9951],
        [-6.2449, -3.2021, -4.2822,  2.4356],
        [-5.0397, -2.7681,  2.7133, -2.7656],
        [-4.7492, -2.4665,  2.0055, -2.5618],
        [-6.0872,  1.6542, -5.5002, -1.7796],
        [-5.3368, -3.1201,  2.0999, -2.2405],
        [-6.1848,  1.2451, -5.4512, -1.1902]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:54<02:45,  1.31it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2850043773651123, logits - tensor([[-6.0598,  1.4530, -5.5537, -1.8218],
        [-7.3919,  0.5902, -6.1988, -0.3957],
        [-4.3689, -2.2056, -5.4862,  2.6761],
        [-5.3598,  1.2822, -4.2721, -1.9993],
        [-7.1254,  2.1932, -5.8091, -1.2923],
        [-4.9483, -2.7482,  2.5004, -3.3065],
        [-5.9012,  1.7728, -5.7655, -1.8172],
        [-6.1239,  1.0322, -5.2123, -1.5768]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:55<02:44,  1.31it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2080855816602707, logits - tensor([[-6.7078,  1.3431, -5.5408, -1.2680],
        [-6.8821,  1.6361, -5.9939, -1.8326],
        [-6.8214,  1.6571, -5.9292, -1.5380],
        [-5.4303,  0.5845, -4.9612, -1.1052],
        [-5.8646,  1.4939, -5.5498, -2.5757],
        [-6.2677,  1.6722, -5.1703, -2.2041],
        [-4.0435, -2.2886, -5.7306,  2.6240],
        [-5.9208,  1.6631, -5.5147, -2.0070]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 74/289 [00:56<02:43,  1.32it/s]

Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34588301181793213, logits - tensor([[-6.7261,  1.9747, -6.0566, -1.5479],
        [-6.5084,  0.9970, -5.0303, -1.0189],
        [-6.0743,  1.0796, -4.7829, -1.3707],
        [-5.8279,  1.9321, -5.9236, -2.3826],
        [-6.6157,  1.5739, -6.3096, -1.1929],
        [-5.4135,  1.4826, -5.4936, -1.5400],
        [-6.2585,  2.0922, -5.7694, -1.3945],
        [-6.3631,  2.0206, -5.0277, -2.3820]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 75/289 [00:56<02:42,  1.32it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18047548830509186, logits - tensor([[-4.5078, -2.6232,  1.0339, -2.4792],
        [-3.8533, -2.7686, -3.6200,  2.6985],
        [-6.5782,  1.4172, -6.2498, -1.2709],
        [-5.9392,  1.8205, -5.7125, -2.0228],
        [-5.6564,  0.9935, -4.6929, -1.7886],
        [-5.4412,  1.3603, -5.3635, -1.5340],
        [-6.7859,  2.2155, -5.9731, -1.2428],
        [-6.3332,  1.4569, -5.7910, -2.0950]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:57<02:40,  1.32it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17389552295207977, logits - tensor([[-6.9283,  1.8891, -5.6936, -1.4941],
        [-5.3688, -3.7349,  2.0711, -2.6971],
        [-7.4134,  1.8349, -5.9064, -2.0790],
        [-6.1179,  1.7601, -4.5870, -1.4042],
        [-5.3673, -3.2056,  2.2329, -2.5905],
        [-6.3316,  2.3012, -6.5467, -1.8706],
        [-6.3238,  2.6048, -5.8780, -2.7690],
        [-5.5127,  1.2508, -5.0925, -2.2249]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 77/289 [00:58<02:39,  1.33it/s]

Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19221742451190948, logits - tensor([[-6.1464,  1.3255, -4.5464, -1.9108],
        [-6.4018,  2.0605, -5.4095, -1.7016],
        [-6.5334,  1.1592, -4.9460, -2.4105],
        [-5.4131,  1.5138, -5.0174, -1.3967],
        [-6.4457,  1.8288, -6.2283, -1.5043],
        [-5.8492,  2.0889, -6.1681, -1.4101],
        [-5.3433,  1.3575, -4.2110, -1.2295],
        [-4.9804, -3.0842,  1.9913, -2.8083]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 78/289 [00:59<02:38,  1.33it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10464828461408615, logits - tensor([[-4.9747, -3.6352,  3.0466, -2.9262],
        [-4.6231, -3.2896, -4.5171,  2.9364],
        [-6.7491,  2.8299, -5.3135, -2.2672],
        [-5.7528,  1.4290, -5.3337, -2.0815],
        [-6.4319,  2.3937, -5.9027, -2.0060],
        [-5.8907,  2.2739, -5.1274, -1.7042],
        [-6.6197,  0.8302, -5.2661, -0.2403],
        [-4.8906, -3.4129, -4.3164,  3.1463]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [00:59<02:37,  1.33it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3340359330177307, logits - tensor([[-4.7452, -3.2202,  1.7309, -2.6384],
        [-5.5577,  1.7935, -4.9666, -1.9453],
        [-6.5322,  1.6233, -5.5545, -1.3511],
        [-5.1886, -3.2326,  2.1345, -2.7266],
        [-6.7623,  1.6202, -6.4317, -2.4126],
        [-6.2759,  2.2789, -5.4881, -0.8366],
        [-6.2817,  2.2262, -5.0673, -1.2121],
        [-6.6510,  1.9644, -5.0679, -1.7061]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [01:00<02:36,  1.33it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0830778032541275, logits - tensor([[-6.5463, -1.9806, -5.3459,  1.5405],
        [-4.9614, -3.1402,  2.2618, -2.5536],
        [-6.2879,  1.6546, -5.2956, -1.6278],
        [-6.3094,  1.4475, -5.2481, -1.2677],
        [-6.7781,  1.2831, -6.5110, -1.2162],
        [-5.1294,  1.4974, -4.5843, -1.8064],
        [-5.9266, -3.3333, -4.9959,  3.1816],
        [-5.7460,  1.5635, -6.0881, -2.0542]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [01:01<02:35,  1.33it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08208030462265015, logits - tensor([[-6.1878,  2.4202, -6.3114, -1.4461],
        [-6.1013,  1.1696, -4.9217, -1.7477],
        [-6.6028, -2.3758, -5.3807,  2.1586],
        [-5.2443, -3.3281, -4.1394,  1.9210],
        [-4.8507, -3.1139,  2.2210, -1.9120],
        [-6.3647, -2.0688, -5.2976,  2.2905],
        [-4.6522,  1.0549, -4.7271, -1.1251],
        [-5.8599,  1.4343, -6.4370, -1.7657]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:02<02:34,  1.34it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3045158088207245, logits - tensor([[-5.9195,  1.3653, -5.2505, -2.2209],
        [-6.1714,  1.3699, -6.0564, -2.1096],
        [-5.6041,  2.2324, -4.9041, -1.3646],
        [-5.0641, -2.2737, -4.4197,  2.1389],
        [-6.9257,  1.7393, -5.9477, -2.3747],
        [-6.3851,  2.1786, -5.2255, -2.1466],
        [-5.8998,  1.6117, -5.3027, -1.8468],
        [-6.6793,  1.7215, -6.2216, -2.2273]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▊       | 83/289 [01:02<02:34,  1.33it/s]

Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08712054044008255, logits - tensor([[-5.5147,  1.9859, -5.6319, -1.4136],
        [-5.1514, -2.6435,  1.5644, -2.7649],
        [-6.4317,  1.4303, -5.2418, -1.5572],
        [-5.6186,  1.8467, -5.6675, -1.8269],
        [-4.0982, -3.0191,  2.7218, -2.9938],
        [-5.9877,  1.0923, -5.8812, -1.6136],
        [-6.4719,  1.6397, -5.8029, -1.7357],
        [-5.8375,  1.1591, -5.0684, -2.1162]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 84/289 [01:03<02:33,  1.33it/s]

Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18314522504806519, logits - tensor([[-4.8744, -3.6583,  2.7246, -2.6759],
        [-7.0630,  1.8218, -5.6024, -1.4137],
        [-6.2018,  1.7194, -5.2460, -1.7618],
        [-5.9279,  1.6535, -5.7572, -1.4036],
        [-6.8904,  1.7006, -6.2608, -1.7798],
        [-5.7420,  1.3382, -5.6343, -1.7965],
        [-6.8566,  2.3235, -5.9861, -2.2619],
        [-6.4542,  2.4571, -5.8706, -2.1461]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 85/289 [01:04<02:33,  1.33it/s]

Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11602544039487839, logits - tensor([[-6.4053,  1.9018, -5.9276, -2.3870],
        [-5.0425, -3.0004,  2.3646, -2.0748],
        [-5.9711,  2.0623, -6.0871, -1.7826],
        [-4.7006, -2.6002, -5.7561,  2.8407],
        [-6.4046,  2.5142, -5.0146, -2.1559],
        [-5.8590,  0.7581, -5.5847, -0.1025],
        [-4.8877, -2.0779, -4.1188,  2.6291],
        [-5.5125,  1.3721, -5.3964, -1.2072]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|██▉       | 86/289 [01:05<02:32,  1.34it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24851644039154053, logits - tensor([[-7.3554,  2.1954, -6.3028, -2.3267],
        [-5.3730, -3.3286, -5.7227,  2.6059],
        [-7.1832,  1.4154, -6.9010, -1.9267],
        [-6.5216,  1.7771, -6.0755, -1.4041],
        [-6.1028,  2.4435, -7.4191, -1.8979],
        [-6.2525, -4.0610,  1.2686, -1.7149],
        [-5.4240, -3.3249,  2.6938, -2.9854],
        [-4.9887, -3.2937,  2.6526, -3.4227]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 87/289 [01:05<02:30,  1.34it/s]

Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4041560888290405, logits - tensor([[-5.0395, -3.3593,  2.2414, -2.5741],
        [-5.3379, -2.9933,  2.8072, -2.6953],
        [-6.0014,  1.3317, -5.9734, -1.8539],
        [-6.4952,  2.0183, -5.7017, -1.6918],
        [-4.9313, -2.3243, -4.8901,  1.5409],
        [-6.5951,  2.2187, -5.8880, -1.5540],
        [-5.6459,  1.5112, -4.7075, -1.6273],
        [-6.2578, -3.5626, -5.3548,  2.9061]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 88/289 [01:06<02:30,  1.34it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34477266669273376, logits - tensor([[-5.7626,  1.8925, -4.4514, -1.7063],
        [-6.4579,  1.9339, -6.5974, -2.6557],
        [-4.6294, -3.1256,  2.0451, -3.0002],
        [-6.3710,  0.9635, -4.2017, -2.0359],
        [-5.8502,  1.7004, -4.7775, -2.0108],
        [-4.9949, -3.7001,  2.6559, -3.0720],
        [-5.8651,  2.7261, -6.4303, -1.9627],
        [-6.4226,  0.5752, -4.7780, -1.3007]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:07<02:29,  1.34it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3893129229545593, logits - tensor([[-6.1158,  1.7131, -5.1477, -0.8838],
        [-6.3106,  1.6517, -6.0839, -2.2523],
        [-6.2001,  2.0925, -4.9221, -1.4016],
        [-5.6918,  1.5370, -5.9443, -1.5181],
        [-6.2214,  1.3284, -5.0964, -1.6913],
        [-7.4267,  2.1144, -6.3036, -1.4535],
        [-7.4391,  1.6725, -6.2564, -2.4310],
        [-6.9633,  2.3019, -6.5822, -1.6460]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:08<02:29,  1.33it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2114168405532837, logits - tensor([[-6.2694,  1.5939, -6.1577, -0.9838],
        [-6.0714,  2.0793, -5.3078, -1.3672],
        [-6.2548, -1.8504, -5.1349,  1.8466],
        [-3.9812, -2.8845,  2.0386, -2.4487],
        [-6.8531,  1.4638, -5.4176, -2.2041],
        [-6.5911,  1.3181, -5.2154, -0.4367],
        [-6.6007,  2.0277, -6.1051, -2.0772],
        [-6.7987,  1.8939, -5.7736, -1.8361]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:08<02:28,  1.33it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3470790684223175, logits - tensor([[-6.8300,  1.5464, -5.4597, -1.4492],
        [-7.7213, -1.0741, -6.4275,  0.5232],
        [-5.6388, -2.7791, -5.1501,  2.8836],
        [-6.3373,  0.9594, -5.7443, -1.1583],
        [-5.2781, -3.4681, -4.5585,  2.9577],
        [-4.9325, -1.7912,  2.0423, -2.7785],
        [-6.4324,  1.9492, -6.0270, -2.0639],
        [-6.9159,  1.0169, -6.1476, -0.1398]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:09<02:27,  1.33it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30241551995277405, logits - tensor([[-6.7808e+00, -1.7585e-03, -5.3040e+00, -7.3867e-01],
        [-6.4835e+00,  2.3603e+00, -6.4079e+00, -1.8123e+00],
        [-6.5656e+00,  1.6722e+00, -5.3181e+00, -1.4296e+00],
        [-7.0653e+00,  1.0974e+00, -5.3644e+00, -1.1289e+00],
        [-5.9655e+00, -1.3293e+00, -4.7901e+00,  1.7751e+00],
        [-5.2617e+00,  1.4595e+00, -5.7421e+00, -1.5607e+00],
        [-6.2368e+00,  2.2112e+00, -6.1163e+00, -1.2305e+00],
        [-6.4552e+00,  1.3454e+00, -5.3860e+00, -1.3714e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 93/289 [01:10<02:26,  1.33it/s]

Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32590651512145996, logits - tensor([[-6.8611,  1.9928, -5.4549, -2.2076],
        [-6.1490, -1.1998, -4.1778,  1.5058],
        [-4.0382, -2.6527,  2.6528, -2.8600],
        [-6.9759,  1.8993, -6.2871, -1.6215],
        [-6.6390,  1.2649, -5.3264, -1.6359],
        [-5.6863,  0.8488, -5.2817, -0.7353],
        [-6.2876,  1.9951, -4.6668, -1.9190],
        [-6.4748,  0.7960, -4.7904, -0.6335]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 94/289 [01:11<02:26,  1.33it/s]

Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2506203055381775, logits - tensor([[-7.5841,  0.5386, -5.5060, -0.4510],
        [-5.5436,  1.6533, -5.7800, -1.5711],
        [-5.7706,  1.5796, -5.0312, -1.7703],
        [-5.3481, -3.0088,  2.5479, -3.4602],
        [-5.6775,  0.7230, -5.6012, -2.0645],
        [-5.8362,  1.2412, -5.3966, -1.3511],
        [-6.0418,  1.3056, -5.3016, -1.0378],
        [-4.1710, -2.3445,  2.4151, -2.4572]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 95/289 [01:11<02:25,  1.33it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31668275594711304, logits - tensor([[-5.2913, -3.7413, -4.7954,  3.7530],
        [-5.3559, -3.1385, -4.2659,  2.8396],
        [-7.1434, -2.3260, -6.2423,  2.9288],
        [-4.8446, -3.0584,  1.6860, -2.0636],
        [-7.2848,  1.8104, -6.2665, -1.6380],
        [-5.6814,  1.2895, -4.9463, -1.2534],
        [-6.0036,  0.8408, -5.0030, -1.4097],
        [-5.9441,  0.9930, -3.9531, -1.5088]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:12<02:24,  1.33it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07378128916025162, logits - tensor([[-6.0924,  1.2336, -5.6547, -1.7119],
        [-5.4210,  1.8684, -4.9157, -1.8897],
        [-4.4206, -2.7715,  1.9992, -2.4190],
        [-4.6990, -2.6729, -4.7617,  2.9525],
        [-6.4747, -1.8741, -4.5754,  2.8394],
        [-6.5550,  1.8477, -5.8116, -2.1042],
        [-6.0300,  1.4208, -6.0117, -1.9709],
        [-6.3620,  1.5019, -5.5397, -1.6115]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▎      | 97/289 [01:13<02:24,  1.33it/s]

Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33472275733947754, logits - tensor([[-6.4477,  1.3752, -5.6663, -1.4664],
        [-5.8150,  1.6686, -5.0481, -1.1752],
        [-5.3523, -0.0344, -2.3892, -1.0373],
        [-6.6155,  0.8611, -5.3224, -1.1809],
        [-7.4564, -0.0903, -5.8220,  0.1557],
        [-6.6616,  1.1105, -5.8574, -1.5208],
        [-6.6472,  1.5706, -5.5208, -1.4674],
        [-6.0112, -2.0786, -4.9785,  2.0825]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 98/289 [01:14<02:23,  1.33it/s]

Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40688759088516235, logits - tensor([[-7.3133,  1.2587, -5.0578, -1.7161],
        [-5.9911, -2.9246,  1.2707, -2.2855],
        [-5.5772,  1.4336, -5.4731, -1.2369],
        [-6.0118, -2.5672, -5.4271,  1.7635],
        [-7.4184,  2.1104, -7.3783, -1.6052],
        [-6.0240,  0.8480, -5.2958, -1.7996],
        [-5.9261,  2.0606, -5.6963, -1.1913],
        [-5.6388, -3.0809,  1.4397, -3.2137]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 99/289 [01:14<02:22,  1.33it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3147687315940857, logits - tensor([[-6.6801,  0.9068, -5.8653, -1.5849],
        [-7.2842, -1.2402, -5.9738,  1.7883],
        [-6.1570,  1.1188, -5.1368, -1.4941],
        [-5.0451, -2.7850,  1.9835, -3.7557],
        [-6.2784,  1.2246, -6.1971, -1.4991],
        [-6.6158, -2.1108, -5.1173,  1.8073],
        [-6.5590,  1.1151, -6.3830, -1.4918],
        [-6.4397,  2.0114, -5.6718, -2.0973]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:15<02:21,  1.34it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18006880581378937, logits - tensor([[-6.0681,  1.5671, -6.1467, -1.8527],
        [-4.0648, -2.6078,  2.1265, -2.3448],
        [-4.8071, -2.5620,  2.3894, -2.3266],
        [-7.1230,  1.0086, -5.0099, -1.8775],
        [-5.5515, -3.6410,  1.9472, -2.7906],
        [-5.6243, -2.2548, -5.2788,  2.4663],
        [-7.0718,  1.3662, -6.1988, -1.7893],
        [-5.4925, -2.0059, -4.3934,  1.0090]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:16<02:20,  1.34it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43404698371887207, logits - tensor([[-6.0467,  0.9988, -5.0572, -1.6890],
        [-5.6946,  1.0455, -5.2133, -1.5283],
        [-6.0491,  1.3900, -4.8986, -1.3934],
        [-6.3342,  0.9372, -5.5993, -0.6327],
        [-5.7060,  2.0572, -4.6358, -1.7309],
        [-6.0093,  0.6322, -4.0221, -0.8223],
        [-6.8041,  0.5153, -4.9969, -0.9463],
        [-7.0810,  0.7702, -5.2806, -1.3396]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:17<02:20,  1.34it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1986880749464035, logits - tensor([[-6.3450, -0.0201, -4.4870, -1.0538],
        [-6.6100,  1.7331, -5.3180, -1.6260],
        [-5.5716, -1.6125, -4.4931,  1.0076],
        [-4.3682, -2.4164,  1.3563, -3.3407],
        [-6.2454,  1.4617, -5.3279, -0.6172],
        [-6.6739,  0.9273, -5.0487, -1.8940],
        [-6.8643,  1.4301, -5.7879, -1.0102],
        [-6.4660,  1.6724, -5.1699, -1.4594]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:17<02:19,  1.33it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1245257779955864, logits - tensor([[-6.5027,  0.8409, -5.4921, -1.0390],
        [-5.9506,  1.2405, -5.4552, -1.1295],
        [-6.4825,  0.0939, -5.6411, -1.2567],
        [-5.8979,  0.6415, -4.9293, -1.5617],
        [-4.4579, -2.5503,  2.3366, -3.0919],
        [-6.1695, -1.7734, -5.8180,  2.0157],
        [-6.4421,  1.3653, -6.4923, -1.3961],
        [-3.9846, -2.2814,  2.1801, -2.6875]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:18<02:19,  1.33it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0812133252620697, logits - tensor([[-5.3820, -2.1322, -4.8471,  3.3167],
        [-6.2920, -1.8045, -5.3004,  1.2798],
        [-6.6015,  1.5968, -5.7080, -1.5180],
        [-5.9806,  1.0475, -4.8968, -1.8781],
        [-6.0615, -3.0149, -5.4725,  3.1205],
        [-6.2817,  1.4325, -5.6296, -1.5405],
        [-7.5294,  1.7999, -6.3831, -1.6843],
        [-7.0860,  1.5714, -5.3910, -1.7304]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▋      | 105/289 [01:19<02:17,  1.33it/s]

Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3953462243080139, logits - tensor([[-4.4683, -2.6340,  1.9719, -2.9711],
        [-5.8275,  1.4518, -4.5309, -1.7903],
        [-5.3045, -3.1104,  2.1030, -2.3150],
        [-6.2330,  1.5594, -5.7257, -1.3196],
        [-6.9464,  1.3776, -5.5173, -0.9165],
        [-6.3779,  1.8946, -5.7520, -0.6176],
        [-6.0361,  1.7344, -6.0429, -1.0519],
        [-5.3666, -3.5766,  2.8329, -3.4599]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 106/289 [01:20<02:17,  1.33it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08139072358608246, logits - tensor([[-5.6858, -3.7717,  2.9517, -2.4798],
        [-6.4327,  2.0082, -5.8595, -1.4993],
        [-6.9655,  1.9738, -5.5383, -1.6567],
        [-5.0751, -3.1055,  2.5673, -3.6319],
        [-5.8529, -3.0411,  2.2691, -2.6136],
        [-6.6419,  1.6233, -5.7942, -1.4208],
        [-6.5661,  1.4679, -5.4469, -1.2619],
        [-6.7639,  1.5939, -6.0769, -0.7549]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:20<02:16,  1.33it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32497355341911316, logits - tensor([[-4.8473, -2.3596,  1.5599, -2.2867],
        [-5.8194,  0.5550, -5.4031, -1.7855],
        [-7.1923,  1.9278, -6.4320, -1.2217],
        [-6.5424,  1.6303, -5.9136, -1.5805],
        [-5.3990, -2.3320, -5.2482,  2.6556],
        [-5.1386,  1.4904, -4.9189, -1.1352],
        [-6.0088,  2.1467, -6.1770, -1.7909],
        [-6.1992,  2.5217, -5.2077, -1.1167]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 108/289 [01:21<02:15,  1.33it/s]

Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 38%|███▊      | 109/289 [01:22<02:14,  1.33it/s]

loss - 0.33200976252555847, logits - tensor([[-5.9094, -3.1519,  2.0593, -3.0836],
        [-4.8730,  1.2467, -5.0048, -1.1545],
        [-6.3124,  2.3289, -5.5644, -1.3218],
        [-5.6795,  1.2569, -5.4760, -1.5420],
        [-6.1717,  1.8131, -5.7819, -2.1961],
        [-6.1702,  1.8906, -5.3010, -1.3070],
        [-6.7237,  1.1309, -5.7261, -1.1653],
        [-4.9072, -2.7099,  1.4313, -2.9223]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3647926449775696, logits - tensor([[-5.7279,  1.5324, -5.1265, -1.2540],
        [-6.2831, -0.4503, -3.4434, -0.8806],
        [-6.3482,  1.6956, -5.9750, -1.3671],
        [-6.0588,  1.6260, -5.4362, -1.5842],
        [-5.6883, -2.1027, -5.0081,  1.6775],
        [-6.4319,  2.7439, -5.7592, -1.8931],
        [-6.0994,  1.5295, -6.4867, -1.5888],
        [-4.6304, -2.7488,  2.4590, -2.

 38%|███▊      | 110/289 [01:23<02:14,  1.34it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2265135794878006, logits - tensor([[-6.0354,  1.5368, -5.3246, -1.0566],
        [-6.4213,  1.8809, -6.3269, -1.9824],
        [-6.9993,  1.2403, -5.7391, -1.7505],
        [-7.9016,  2.1045, -6.8555, -1.5986],
        [-6.5486,  2.0938, -5.9280, -2.0959],
        [-6.5619,  2.0397, -5.5150, -2.1864],
        [-5.5807, -2.5702,  1.5234, -3.0956],
        [-6.1651,  1.7534, -6.0158, -1.6106]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:23<02:13,  1.33it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47282910346984863, logits - tensor([[-4.4166, -1.8452,  1.5498, -2.7563],
        [-6.5451,  2.5713, -6.2343, -2.0491],
        [-5.9759,  1.8748, -4.9448, -1.1700],
        [-6.0393,  1.4948, -5.1719, -1.8642],
        [-5.3566, -3.3272,  2.2849, -2.6573],
        [-6.4784, -1.7611, -6.2292,  2.5423],
        [-5.6272, -1.7395, -5.5723,  1.9767],
        [-6.1968,  1.5094, -5.5750, -1.5759]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:24<02:12,  1.33it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3684905469417572, logits - tensor([[-6.1588,  1.3755, -4.9439, -1.2438],
        [-5.9748,  1.1811, -5.1825, -0.3870],
        [-6.1345,  1.3844, -6.0156, -1.5331],
        [-5.9755,  1.5710, -6.9727, -0.8947],
        [-5.4516,  0.9361, -5.3654, -1.0087],
        [-5.6741, -0.9477, -5.3695,  1.6063],
        [-7.1470,  1.9064, -5.5821, -0.8915],
        [-6.1803, -0.3101, -5.7863,  1.3274]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:25<02:12,  1.33it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10867849737405777, logits - tensor([[-6.5698,  0.9862, -6.0262, -2.1997],
        [-6.3444,  1.8653, -5.6902, -1.2199],
        [-6.6982,  1.1791, -6.0766, -0.8984],
        [-6.5667,  1.6033, -7.0683, -1.4751],
        [-6.2089,  1.6761, -5.7487, -0.9443],
        [-6.8014,  0.7262, -6.1758, -2.1599],
        [-5.4311, -3.7348,  2.0069, -2.1854],
        [-6.3382,  1.6421, -5.9260, -1.5939]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:26<02:11,  1.33it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2472834587097168, logits - tensor([[-6.9586,  0.1043, -5.5246, -0.6709],
        [-5.4960,  2.1659, -6.5177, -1.3902],
        [-6.1659,  2.0694, -5.5110, -1.3430],
        [-5.0162, -2.7308,  1.4055, -2.5285],
        [-5.7095,  0.8050, -5.0171, -1.0588],
        [-4.5920, -2.7642,  1.2538, -2.3269],
        [-6.1814,  0.9568, -4.1539, -1.1416],
        [-5.8199,  1.5637, -5.6232, -1.0361]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:26<02:11,  1.33it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21229146420955658, logits - tensor([[-6.8402,  1.4948, -5.3316, -0.7539],
        [-6.8091,  1.0665, -5.6614, -1.4444],
        [-7.6147,  1.1590, -6.1750, -0.9621],
        [-6.3956,  1.8454, -6.0155, -1.4374],
        [-5.1272, -1.7118, -5.4516,  1.8420],
        [-6.4266,  2.0663, -5.4035, -1.2196],
        [-5.4422,  1.1159, -4.5500, -1.7553],
        [-5.6853,  1.3635, -5.8742, -1.8102]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:27<02:10,  1.33it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23591409623622894, logits - tensor([[-6.8066, -2.1744, -5.4579,  1.6910],
        [-5.7039,  1.4683, -5.4976, -1.4858],
        [-4.5363, -2.3073,  0.8769, -2.3791],
        [-6.8466,  2.3294, -6.6017, -1.8437],
        [-6.4751, -1.6770, -5.7764,  0.4350],
        [-6.4212, -0.7994, -5.1074,  1.3379],
        [-7.3345,  2.2110, -6.5848, -1.9569],
        [-4.9308, -2.2554,  1.6480, -2.3860]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:28<02:09,  1.33it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2623536288738251, logits - tensor([[-4.4027, -2.3929,  1.9381, -1.6458],
        [-6.3372,  1.0480, -6.1610, -0.8332],
        [-5.8317,  1.4416, -5.6266, -1.9959],
        [-6.3957,  1.6052, -5.9418, -1.5365],
        [-6.5725, -1.5870, -5.1600,  0.9957],
        [-5.5637,  1.9450, -5.4694, -1.6565],
        [-5.9684,  1.3854, -5.0880, -1.6960],
        [-5.3145, -2.6750,  1.5747, -2.4339]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:29<02:08,  1.33it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18135607242584229, logits - tensor([[-6.2423,  0.9313, -4.6690, -1.3853],
        [-5.8490,  1.8542, -4.8350, -0.9320],
        [-5.3912, -2.7936,  1.0497, -2.6560],
        [-6.4294,  1.5432, -5.4316, -1.5285],
        [-5.8849, -2.6541, -4.8935,  2.2891],
        [-6.4989,  1.3797, -5.5172, -1.2356],
        [-6.3163,  1.5766, -4.8002, -1.7044],
        [-6.8555,  2.1624, -5.5624, -1.7086]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:29<02:07,  1.33it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3490563929080963, logits - tensor([[-6.3900,  1.9348, -6.0181, -2.0152],
        [-6.9512, -2.5485, -5.9418,  1.2119],
        [-6.5957,  0.3428, -5.5233, -1.1962],
        [-5.7861,  0.6439, -5.8558, -1.6644],
        [-4.7789, -2.5834,  1.5938, -2.0825],
        [-5.7937,  1.4346, -6.0548, -1.7784],
        [-5.4763, -1.8087, -4.4910,  2.0041],
        [-5.2995,  0.5914, -5.3269, -1.7034]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:30<02:07,  1.33it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12037147581577301, logits - tensor([[-6.1567,  0.9610, -5.0490, -0.8767],
        [-5.9090,  1.2307, -5.5076, -0.8511],
        [-5.5990,  1.8441, -5.4750, -1.9541],
        [-6.3396,  2.1744, -6.2034, -1.1193],
        [-5.9407, -2.6007,  0.6842, -2.1756],
        [-7.0511,  2.4517, -5.7087, -1.9965],
        [-5.8130, -2.1401,  0.7403, -2.0656],
        [-5.8739,  1.1939, -5.7272, -1.6663]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:31<02:06,  1.33it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2463505119085312, logits - tensor([[-6.7503,  1.5333, -5.6878, -1.8542],
        [-5.9647,  0.9840, -4.4356, -0.8922],
        [-6.0638,  0.7710, -4.7716, -1.8376],
        [-6.3614,  1.9017, -6.0132, -2.0897],
        [-5.8004, -2.0269, -0.1312, -1.7921],
        [-6.2754,  1.6292, -6.1433, -1.6067],
        [-5.8942, -2.5329,  0.8010, -1.5093],
        [-5.4777,  2.0150, -5.2621, -1.6872]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 122/289 [01:32<02:05,  1.33it/s]

Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09741715341806412, logits - tensor([[-6.0503,  1.9047, -5.9291, -0.7645],
        [-6.3112,  1.6200, -5.5660, -1.2346],
        [-5.5181,  1.8264, -4.8438, -2.5679],
        [-6.6789,  1.4314, -6.2124, -1.7159],
        [-6.1352, -1.4706, -4.9201,  1.9343],
        [-6.0956,  1.2990, -5.0411, -1.3191],
        [-7.0662,  1.7137, -5.7427, -2.1645],
        [-6.7174,  1.8087, -5.1811, -1.2287]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 123/289 [01:32<02:04,  1.33it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1719791293144226, logits - tensor([[-5.5140, -3.0744,  1.1983, -2.3968],
        [-6.3460,  1.8102, -6.0391, -1.0873],
        [-4.6220, -1.6796,  0.4282, -1.9928],
        [-5.7959,  1.9811, -5.2714, -1.2166],
        [-6.0071, -1.2162, -4.9208,  1.6050],
        [-6.2591,  0.7439, -5.5576, -0.5189],
        [-5.9151,  1.0109, -4.6268, -1.7729],
        [-6.2822,  1.4810, -5.4190, -1.6087]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:33<02:04,  1.32it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.225660040974617, logits - tensor([[-5.8788,  0.9803, -5.8276, -2.1860],
        [-5.9074, -1.8080, -4.5486,  1.9370],
        [-6.5353,  1.9115, -5.6514, -1.9630],
        [-6.5691,  1.6150, -6.1486, -1.5042],
        [-6.0703,  1.5053, -4.7025, -2.1967],
        [-6.4628,  2.0836, -5.3495, -1.9796],
        [-5.6763, -0.4463, -5.0445,  0.3188],
        [-6.4216,  1.6017, -5.5131, -2.3759]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:34<02:03,  1.33it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17121067643165588, logits - tensor([[-6.4774, -0.8326, -5.2210,  1.1737],
        [-5.0110, -2.7228,  0.7974, -2.5213],
        [-6.6813,  1.2677, -6.2411, -1.0772],
        [-6.2198, -1.7239, -6.5144,  2.3107],
        [-6.3883,  1.4326, -5.1548, -1.6036],
        [-5.3334, -1.7991, -5.1480,  2.5163],
        [-6.0546,  2.4461, -5.7353, -1.9926],
        [-5.8142,  1.6698, -4.8758, -2.0186]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:35<02:02,  1.33it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22597776353359222, logits - tensor([[-6.5833,  1.0651, -5.7772, -1.7692],
        [-7.1124,  2.2209, -6.8786, -1.6409],
        [-5.8795, -2.9112,  1.0024, -1.8865],
        [-6.5795, -2.0408, -5.9141,  2.5010],
        [-6.0151,  0.8911, -5.5584, -1.4958],
        [-5.8372, -2.7448, -4.0787,  2.2851],
        [-5.1129, -2.6852,  0.7720, -2.6362],
        [-4.7028, -2.6179,  0.5990, -1.8549]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:35<02:01,  1.33it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4331666827201843, logits - tensor([[-6.7706,  2.2449, -5.9538, -1.6616],
        [-6.7896,  1.6790, -5.8907, -1.9623],
        [-7.3342,  2.1910, -6.8364, -2.4211],
        [-6.1939,  1.2069, -5.7092, -1.4105],
        [-7.3229,  2.5207, -6.0601, -3.1074],
        [-6.9496,  1.6338, -6.4484, -1.8910],
        [-6.9136,  2.0506, -5.8888, -1.0841],
        [-6.2906, -0.5443, -4.9897, -0.0312]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:36<02:00,  1.33it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1147080585360527, logits - tensor([[-4.8032, -1.8985,  0.7919, -1.3492],
        [-5.8997,  1.4618, -5.0809, -1.1262],
        [-5.9418,  1.7343, -4.9245, -1.9956],
        [-6.2240,  1.2191, -5.6323, -1.6999],
        [-3.8512, -2.0486,  1.2354, -1.6621],
        [-6.0141,  1.9495, -4.5786, -1.9855],
        [-5.5486,  1.9700, -4.9177, -1.9821],
        [-4.9602, -2.4454,  1.5504, -1.0924]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:37<02:00,  1.33it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27341216802597046, logits - tensor([[-7.3434,  0.5099, -3.6278, -1.0819],
        [-5.4048,  1.4168, -5.4856, -2.0756],
        [-7.7238,  2.7408, -6.9427, -2.3916],
        [-6.4489,  2.1261, -5.4822, -1.1666],
        [-6.5665, -0.3105, -5.4782,  0.4209],
        [-6.9445, -1.3644, -5.5737,  2.2042],
        [-5.2950, -2.6899,  1.6778, -2.2767],
        [-6.1740,  2.9130, -5.4516, -2.0706]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 130/289 [01:38<01:59,  1.33it/s]

Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22322820127010345, logits - tensor([[-6.3864,  1.5795, -5.4427, -2.2945],
        [-7.1998,  2.6754, -6.2979, -2.6483],
        [-4.6307, -2.6404,  1.2798, -2.0891],
        [-6.5406,  2.2170, -5.3671, -2.1416],
        [-6.7931,  1.8286, -5.9186, -1.3859],
        [-6.0607, -2.0120, -5.0284,  1.7192],
        [-6.2463,  0.9706, -5.0953, -1.7910],
        [-5.8864,  1.8678, -5.5914, -0.7479]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▌     | 131/289 [01:38<01:59,  1.32it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21655301749706268, logits - tensor([[-7.1457,  0.9919, -6.1019, -1.9156],
        [-5.8854,  1.4571, -5.9540, -2.2210],
        [-5.2639,  1.8517, -5.7596, -1.4777],
        [-6.8367,  2.1720, -5.9667, -1.6935],
        [-5.8745, -1.8680, -5.7303,  1.6337],
        [-7.0472,  1.3309, -5.4539, -1.4418],
        [-5.3442, -2.5145,  0.8030, -1.8275],
        [-5.7554,  1.7886, -5.6241, -2.1749]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:39<01:59,  1.31it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.304667592048645, logits - tensor([[-6.8320,  0.1672, -5.2318,  0.1964],
        [-4.5513, -3.0187,  1.4178, -2.1841],
        [-5.5503,  0.8393, -5.0844, -1.1981],
        [-6.7288,  2.0601, -6.5849, -1.8075],
        [-5.6442, -1.5051, -5.1075,  0.6109],
        [-4.0234, -1.9719,  1.3521, -1.4702],
        [-7.3121, -1.3471, -4.7201,  1.6960],
        [-6.4639,  1.6155, -6.2423, -2.3385]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 133/289 [01:40<01:59,  1.31it/s]

Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 46%|████▋     | 134/289 [01:41<01:58,  1.31it/s]

loss - 0.3011123836040497, logits - tensor([[-6.2452,  1.5612, -5.4329, -2.0926],
        [-6.6986,  1.5680, -6.1200, -1.5255],
        [-6.6806,  1.6618, -6.2259, -1.4877],
        [-5.1651,  1.5413, -4.4717, -2.3594],
        [-5.1957,  0.9633, -4.8173, -0.9520],
        [-6.0622,  1.8819, -5.3002, -1.7664],
        [-7.0257,  0.3187, -5.1339,  0.4025],
        [-5.6272, -0.9915, -5.0437,  1.1201]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45776307582855225, logits - tensor([[-5.3458,  2.0883, -5.7535, -1.4739],
        [-5.9983,  1.1421, -5.3631, -0.2454],
        [-5.8793, -1.8462, -5.5539,  2.0125],
        [-5.7500,  1.8918, -4.9462, -1.0956],
        [-6.4875,  1.8622, -5.6705, -1.9275],
        [-6.1091, -3.0856,  1.5932, -2.5683],
        [-5.9500, -1.9616, -5.3768,  1.1622],
        [-5.9010,  0.6244, -5.5744, -0.

 47%|████▋     | 135/289 [01:42<01:57,  1.31it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3193418085575104, logits - tensor([[-6.2614, -1.6404, -4.8135,  1.1060],
        [-5.8702,  1.7452, -5.6723, -2.4084],
        [-5.6973, -2.4291, -6.0455,  3.2916],
        [-6.5252,  0.5724, -3.9848, -2.2232],
        [-5.8710,  1.8953, -5.2966, -1.1232],
        [-5.7737, -2.7064,  1.9864, -2.4934],
        [-6.3232,  0.9443, -4.7797, -1.2299],
        [-5.9724,  2.0572, -5.9408, -1.5186]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:42<01:56,  1.32it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10761116445064545, logits - tensor([[-6.6116,  2.1676, -6.2519, -1.3589],
        [-6.1951,  1.9724, -5.6731, -1.6130],
        [-6.7294,  0.7504, -5.1631, -1.8414],
        [-6.5181,  1.6366, -5.6723, -2.4488],
        [-5.3556,  2.7273, -5.6195, -1.3964],
        [-5.7250,  2.0938, -6.0258, -0.7303],
        [-7.0378,  1.0111, -5.3748, -1.1289],
        [-6.8157,  0.8326, -5.3861, -1.4609]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 137/289 [01:43<01:55,  1.31it/s]

Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3079051375389099, logits - tensor([[-6.5969,  2.2624, -5.6083, -2.0247],
        [-5.0614, -3.5175,  1.9102, -1.9599],
        [-6.5712,  1.6172, -6.1238, -1.5113],
        [-5.4073,  1.2159, -5.0325, -2.3736],
        [-6.8783,  1.3431, -5.4554, -2.8115],
        [-5.9782,  1.4981, -5.8807, -1.9509],
        [-4.5119, -2.5030,  2.2708, -2.7214],
        [-6.1830,  1.9295, -5.2426, -0.4144]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 138/289 [01:44<01:54,  1.32it/s]

Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22310197353363037, logits - tensor([[-6.1331,  0.8098, -4.8592, -1.9506],
        [-6.9580,  0.2348, -4.3672, -1.3930],
        [-4.7446, -2.5857,  1.6795, -2.1401],
        [-6.6333, -1.2508, -5.0687,  1.9972],
        [-6.5242,  1.7935, -5.6565, -0.9461],
        [-4.9465, -2.0971,  1.6629, -1.7426],
        [-6.5925, -0.2224, -6.1225,  0.4486],
        [-6.1914,  2.1946, -6.0470, -1.6214]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 139/289 [01:45<01:54,  1.31it/s]

Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35437747836112976, logits - tensor([[-5.6051,  0.9848, -5.3172, -1.3929],
        [-7.6946,  2.3753, -6.0253, -1.5833],
        [-6.5686,  2.7139, -5.5281, -1.4737],
        [-5.3614, -3.2041,  1.5714, -2.5117],
        [-6.5373,  1.2013, -6.4791, -1.7690],
        [-5.7763, -3.1610,  1.9453, -1.9060],
        [-5.9337, -3.8554,  2.6527, -2.6842],
        [-5.7462,  1.8694, -5.4730, -2.1197]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 140/289 [01:45<01:53,  1.32it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3175086975097656, logits - tensor([[-7.3617,  2.0822, -6.3074, -2.7663],
        [-5.8775, -3.4448,  2.1786, -2.0871],
        [-5.1572, -3.4901,  2.4687, -2.1818],
        [-6.5168,  0.4642, -4.9691, -1.5136],
        [-6.3890,  1.4201, -4.9132, -1.2775],
        [-7.0651,  1.4886, -5.0167, -1.7500],
        [-5.3288, -3.6730,  2.1987, -2.4089],
        [-5.2117, -3.5231,  1.9988, -2.5915]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:46<01:52,  1.32it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26582908630371094, logits - tensor([[-5.1218, -2.4311,  1.9644, -1.9999],
        [-6.6537,  1.7805, -6.2155, -1.6450],
        [-6.1275,  1.7573, -5.3267, -1.3635],
        [-6.1131,  0.8612, -5.3186, -0.6184],
        [-5.6914,  1.7349, -5.3013, -2.1529],
        [-6.4386,  1.7742, -5.3232, -1.9069],
        [-5.7178,  0.7007, -4.6013, -1.8739],
        [-6.6346, -1.6786, -4.9799,  2.0365]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:47<01:51,  1.32it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13873574137687683, logits - tensor([[-6.6915,  0.6297, -5.2915, -0.5313],
        [-6.0109,  1.8430, -5.2302, -1.7776],
        [-6.4196, -0.2376, -5.3059,  0.7404],
        [-6.5850, -0.9707, -5.1864,  1.2270],
        [-5.1662, -3.1079,  2.3074, -1.0696],
        [-6.5108,  1.8375, -5.2049, -1.8070],
        [-6.7984,  0.7085, -5.6986, -1.6049],
        [-6.2438,  1.5464, -4.8418, -1.9516]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 143/289 [01:48<01:50,  1.32it/s]

Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.279949426651001, logits - tensor([[-5.9795,  1.5058, -4.6281, -2.4325],
        [-6.3327,  1.5113, -5.9435, -1.6357],
        [-6.4246,  2.1043, -5.5247, -2.1491],
        [-7.8413,  2.0785, -6.2755, -1.4178],
        [-5.0237,  1.0920, -5.0954, -1.6809],
        [-6.8244,  2.5089, -5.7291, -2.4801],
        [-6.5697,  1.6855, -5.2070, -1.2751],
        [-7.0068, -1.8495, -5.1256,  3.1241]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|████▉     | 144/289 [01:48<01:49,  1.33it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 50%|█████     | 145/289 [01:49<01:48,  1.32it/s]

loss - 0.15638236701488495, logits - tensor([[-7.1062,  0.9948, -4.5250, -1.5409],
        [-6.3669,  1.1528, -5.2518, -1.5685],
        [-5.8243, -3.9672,  2.5357, -2.8466],
        [-6.8331,  1.0673, -5.3573, -0.0605],
        [-6.4542,  1.7374, -5.5603, -1.6515],
        [-6.6841, -0.2314, -5.3220, -0.8254],
        [-6.7388, -1.0770, -5.8686,  1.6380],
        [-6.7403,  2.2835, -5.5637, -1.5327]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27170729637145996, logits - tensor([[-6.6361, -2.1196, -4.8984,  1.5844],
        [-4.7814,  2.6686, -5.0380, -1.9632],
        [-6.1631,  1.9084, -5.4266, -1.5051],
        [-6.6194,  1.4980, -4.9487, -1.1315],
        [-7.0945, -0.3728, -5.6801,  0.1046],
        [-6.2455,  1.4824, -6.0776, -1.3941],
        [-6.4596,  1.2293, -5.6083, -1.9338],
        [-7.3190, -0.5597, -5.9338,  0

 51%|█████     | 146/289 [01:50<01:48,  1.32it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16780120134353638, logits - tensor([[-5.1015,  1.0279, -4.9996, -0.9392],
        [-5.8662,  2.1964, -4.9417, -1.8860],
        [-6.7862,  1.9051, -5.6179, -2.4908],
        [-6.3150,  0.0962, -5.1815, -0.1845],
        [-7.0608, -0.3155, -5.5937,  0.8138],
        [-5.9718,  1.6458, -5.7689, -1.2210],
        [-7.1633,  0.2326, -5.4182, -0.4593],
        [-6.1628,  2.3699, -5.6467, -1.6935]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 147/289 [01:51<01:48,  1.31it/s]

Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30366790294647217, logits - tensor([[-6.7337,  1.6451, -5.8480, -1.3145],
        [-6.0301,  0.4296, -5.4613, -1.5981],
        [-6.3073,  1.5329, -5.9798, -2.0063],
        [-6.7802,  2.1903, -5.0764, -1.9016],
        [-4.5544, -2.6030,  2.8269, -2.2723],
        [-6.3319,  1.5672, -5.0280, -1.1109],
        [-4.9771, -1.2590, -5.0038,  0.7340],
        [-6.4135,  1.3864, -6.2989, -1.2482]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 148/289 [01:51<01:47,  1.32it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4182130694389343, logits - tensor([[-5.0533, -2.9268,  2.6731, -2.0617],
        [-5.5079,  1.6808, -4.4709, -1.5769],
        [-4.8061, -3.1663,  1.8222, -2.1734],
        [-5.9199,  1.9470, -5.6180, -1.5865],
        [-6.1303,  2.4869, -5.8018, -2.2517],
        [-6.0252,  1.3497, -5.4980, -0.6515],
        [-6.4858,  0.2084, -5.6694,  0.0089],
        [-7.0888,  2.0288, -6.4633, -1.5453]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:52<01:46,  1.32it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36897701025009155, logits - tensor([[-6.2861,  0.7532, -5.1292, -0.9122],
        [-6.3502,  0.8979, -4.9475, -1.6936],
        [-5.7612,  1.9273, -5.1329, -1.2172],
        [-5.9983,  1.7010, -4.9446, -2.1358],
        [-6.0791, -0.3555, -5.3775,  0.9051],
        [-6.3690,  2.0927, -4.7521, -1.9087],
        [-6.7334,  2.1890, -5.7302, -1.7804],
        [-6.4621,  1.1186, -5.9651, -0.9964]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:53<01:45,  1.32it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2820247411727905, logits - tensor([[-7.0661,  2.0839, -6.1719, -2.1187],
        [-6.7184,  1.0478, -4.9737, -1.5534],
        [-5.7592,  2.1339, -4.8632, -1.8886],
        [-5.2682,  1.7364, -5.5124, -1.3768],
        [-6.0547,  0.5346, -4.7470, -0.1435],
        [-6.1087,  1.1431, -5.0190, -0.9962],
        [-6.6350,  0.5604, -4.5723, -1.0584],
        [-5.9680,  1.8227, -6.0042, -2.0562]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:54<01:44,  1.32it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18195264041423798, logits - tensor([[-6.1526,  1.2253, -5.8832, -0.8460],
        [-6.5734,  1.5601, -5.7964, -2.1600],
        [-6.1734,  1.4756, -4.9103, -1.0866],
        [-6.0810,  1.7082, -5.4076, -1.6456],
        [-6.8359,  1.6406, -5.5984, -0.6731],
        [-3.7978, -1.9171,  2.3872, -1.8921],
        [-6.7458,  1.6617, -5.4852, -1.5607],
        [-6.3414, -1.1529, -4.6132,  0.9246]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:54<01:44,  1.32it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23919448256492615, logits - tensor([[-5.7424,  1.7209, -5.3536, -0.8834],
        [-6.5772,  1.5948, -5.2162, -2.0961],
        [-6.5650,  0.8741, -5.5341, -1.6971],
        [-6.2476,  2.1661, -5.0921, -1.3016],
        [-6.2002,  1.8155, -5.2068, -1.0828],
        [-6.0922, -2.1331, -5.2720,  2.3448],
        [-7.0451, -0.0551, -6.1791,  0.1833],
        [-7.1884, -2.1633, -5.4627,  1.5449]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:55<01:43,  1.31it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17913083732128143, logits - tensor([[-6.8127, -1.8226, -6.0815,  1.4143],
        [-6.4917,  0.0781, -5.2826, -0.6450],
        [-5.5586,  1.0481, -5.1522, -0.7275],
        [-6.7176, -1.3652, -5.3155,  0.4440],
        [-6.3908,  1.3081, -5.1191, -1.1524],
        [-7.2204,  0.1900, -6.0349, -0.2871],
        [-6.0116,  0.9721, -5.5552, -0.6437],
        [-5.7981,  1.3459, -4.9658, -1.7048]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 154/289 [01:56<01:42,  1.32it/s]

Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2753070294857025, logits - tensor([[-6.4687,  1.6182, -5.6074, -1.8479],
        [-6.1845,  2.3475, -5.6988, -1.4140],
        [-6.3157,  1.8559, -5.9472, -1.4607],
        [-5.4299, -3.3439,  1.9003, -1.7893],
        [-6.2266,  0.7500, -5.6386, -1.3340],
        [-6.4236,  0.6368, -4.8301, -0.4986],
        [-6.4116,  1.5459, -4.5973, -1.5187],
        [-5.7169,  1.1324, -5.8065, -0.6405]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▎    | 155/289 [01:57<01:41,  1.32it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11447648704051971, logits - tensor([[-5.0008, -2.6220,  2.3067, -1.8386],
        [-6.8241,  0.8975, -5.5447, -0.7581],
        [-6.8085,  0.9191, -4.2847, -0.4089],
        [-6.1752,  1.5544, -4.6982, -1.6029],
        [-6.4687,  1.5082, -5.2239, -1.6054],
        [-7.1127,  1.6779, -6.0910, -1.5909],
        [-7.2316,  2.1350, -6.4038, -1.3634],
        [-6.6690,  1.5987, -5.3819, -2.4211]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:57<01:40,  1.32it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17323194444179535, logits - tensor([[-4.9102, -2.8370,  1.4343, -2.3070],
        [-4.6543, -2.7895,  2.0362, -2.2978],
        [-5.9349,  1.6005, -5.2980, -1.4827],
        [-7.7016,  1.9813, -6.2686, -2.0689],
        [-6.7815,  0.0166, -5.5869,  0.5728],
        [-6.0957,  1.0535, -4.7536, -0.6431],
        [-6.2044,  1.2759, -5.0913, -1.1722],
        [-5.3969, -3.8256,  3.1558, -2.7517]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:58<01:39,  1.32it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16304443776607513, logits - tensor([[-6.0459,  1.4940, -5.2605, -1.7495],
        [-4.8317, -2.3473,  2.0275, -1.5313],
        [-5.4531, -1.8000, -4.6071,  1.4376],
        [-6.4977, -2.0000, -5.6033,  2.4192],
        [-5.5843,  0.9581, -4.5599, -1.4886],
        [-6.7443, -0.9834, -5.5478,  0.7210],
        [-6.0021,  0.3415, -5.3573, -0.6773],
        [-6.3402,  0.0228, -5.4073,  0.4222]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:59<01:39,  1.32it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30122968554496765, logits - tensor([[-5.8395,  1.9779, -5.0101, -1.2036],
        [-6.6706, -4.2319,  2.5120, -2.6261],
        [-6.6920,  1.1134, -5.4809, -1.9894],
        [-7.3117,  1.6940, -6.0310, -1.7675],
        [-4.4364, -1.9293,  1.4567, -1.6671],
        [-7.2242,  1.1526, -5.2803, -1.6777],
        [-7.2693,  1.4255, -6.4247, -1.7111],
        [-7.0267,  1.6591, -5.3317, -1.5985]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [02:00<01:38,  1.32it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19618722796440125, logits - tensor([[-5.8661,  0.9621, -5.2383, -1.3636],
        [-5.8677,  1.6959, -5.9942, -2.6356],
        [-6.3352,  1.2007, -4.8833, -1.2684],
        [-6.1339, -2.5915, -5.1185,  2.3864],
        [-5.8502, -0.8639, -5.1385,  0.9554],
        [-6.7836,  0.5540, -4.7557, -1.4760],
        [-6.4924,  1.3039, -5.8087, -0.9451],
        [-6.8509,  0.9593, -5.4069, -2.7919]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [02:00<01:37,  1.32it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28523266315460205, logits - tensor([[-4.5997, -3.3258,  1.9592, -1.6851],
        [-5.9467,  1.3834, -5.1488, -1.6401],
        [-6.7042,  2.0179, -6.3101, -1.4913],
        [-5.3133, -2.2453, -5.0793,  2.0402],
        [-6.5136,  1.5904, -5.1415, -1.6176],
        [-4.9770, -2.8307,  1.6754, -1.7186],
        [-6.7596,  1.5181, -5.1705, -1.6722],
        [-6.1887,  1.4026, -4.4607, -1.4695]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:01<01:36,  1.33it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22464925050735474, logits - tensor([[-6.2987,  1.3208, -5.6106, -1.6438],
        [-7.2255,  0.6442, -5.6738, -2.1848],
        [-6.7543,  1.8622, -5.5025, -2.0191],
        [-5.9892,  0.9885, -5.2674, -1.3027],
        [-5.9377,  0.9383, -4.4867, -0.7475],
        [-6.2783,  0.6213, -4.4915, -0.4660],
        [-5.4379,  0.9428, -4.7049, -1.2997],
        [-6.6738,  1.2040, -5.0354, -2.1505]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:02<01:35,  1.33it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1812233179807663, logits - tensor([[-6.5704,  1.2867, -4.6737, -1.1761],
        [-6.7762,  2.4845, -5.9257, -2.0902],
        [-7.0624, -1.8256, -5.1834,  0.0776],
        [-7.0748,  0.9555, -6.2921, -0.8388],
        [-5.0091,  1.8869, -5.1560, -1.1330],
        [-7.6431, -1.3896, -5.5175,  1.0873],
        [-4.5738, -2.9604,  2.4724, -1.7078],
        [-6.0628,  0.9418, -6.2062, -1.5611]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▋    | 163/289 [02:03<01:34,  1.33it/s]

Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13639536499977112, logits - tensor([[-5.9129,  1.8462, -5.4201, -1.7743],
        [-6.1893,  1.2777, -5.0527, -1.5313],
        [-6.7548,  0.9835, -5.8951, -1.3168],
        [-6.8877,  1.0381, -5.2954, -1.2657],
        [-6.7939, -0.0753, -5.5088, -0.2920],
        [-4.7923, -3.5157,  2.4225, -2.8573],
        [-6.4376,  1.6888, -5.6211, -1.9078],
        [-6.2775,  1.5072, -4.3886, -1.1686]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 164/289 [02:03<01:34,  1.33it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18294160068035126, logits - tensor([[-4.7104,  0.6012, -4.7156, -1.0171],
        [-6.7875,  1.4557, -4.9052, -1.0694],
        [-6.9075,  1.0781, -4.9497, -1.9724],
        [-6.2261,  1.8617, -6.5462, -1.4339],
        [-5.2380,  1.0710, -4.5098, -1.2387],
        [-6.2153, -2.2193, -5.4017,  2.7421],
        [-6.6173,  2.0193, -5.9479, -1.4423],
        [-6.4776,  1.4384, -5.1872, -1.6448]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 165/289 [02:04<01:33,  1.32it/s]

Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0889408141374588, logits - tensor([[-5.5566,  1.8936, -4.8985, -1.7974],
        [-6.8474, -1.9786, -5.3039,  2.5429],
        [-6.3912, -1.9873, -5.6462,  2.1415],
        [-5.4002, -3.6462,  1.8775, -2.1085],
        [-6.8241,  1.4115, -5.2469, -1.5661],
        [-6.3657,  1.0806, -4.6442, -0.8095],
        [-7.1611,  1.7172, -5.1694, -1.1191],
        [-5.7040, -1.8697, -4.2263,  2.2857]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 166/289 [02:05<01:32,  1.33it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3593803346157074, logits - tensor([[-7.7907,  1.3139, -6.6288, -2.0551],
        [-5.9756,  1.1666, -5.1990, -1.8922],
        [-4.5493, -3.5712,  1.8414, -2.8283],
        [-6.1136,  1.9633, -5.8078, -1.8576],
        [-6.1779, -1.9644, -6.3081,  1.4407],
        [-6.6165,  1.9016, -4.6552, -1.8649],
        [-4.4138, -3.3289,  1.7035, -2.0555],
        [-5.6300,  0.6927, -4.7700, -1.0294]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:06<01:31,  1.33it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2446346879005432, logits - tensor([[-4.8098, -2.3587,  1.8367, -1.9536],
        [-6.7815,  1.7222, -5.7423, -1.4009],
        [-5.8504,  2.0485, -5.9839, -1.5645],
        [-4.8623, -2.7680,  2.0736, -1.6648],
        [-6.4074,  1.2938, -5.6314, -1.4956],
        [-7.0829,  1.4333, -6.1140, -1.7885],
        [-4.3257, -2.9851,  2.0138, -1.7678],
        [-5.7015,  2.1593, -5.2487, -2.2504]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:06<01:31,  1.33it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19316759705543518, logits - tensor([[-6.7024,  1.1378, -5.1864, -0.6278],
        [-6.4175,  1.3262, -5.5163, -1.6998],
        [-6.3708,  1.7783, -5.0025, -1.2723],
        [-7.2182,  0.6648, -4.4694, -0.6807],
        [-6.3593,  1.2217, -5.4255, -1.2800],
        [-6.3272,  1.2086, -5.3955, -1.7812],
        [-5.7084,  0.9744, -4.9253, -0.3356],
        [-5.0221, -2.5099, -5.0408,  1.8091]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:07<01:30,  1.33it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26309362053871155, logits - tensor([[-6.5039, -0.3560, -5.0534,  0.3128],
        [-6.2876,  1.6155, -4.9479, -2.0520],
        [-5.5162, -3.2976,  2.5201, -2.4298],
        [-6.3475,  0.8773, -4.9900, -1.4605],
        [-4.5677, -2.4468,  1.1796, -1.1640],
        [-5.6541, -3.5662,  1.2784, -2.1906],
        [-5.7404, -2.0582, -5.1037,  2.4969],
        [-6.9022, -2.1133, -6.2221,  2.6760]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:08<01:30,  1.32it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11990414559841156, logits - tensor([[-5.3504,  1.5740, -3.8529, -1.1175],
        [-6.0788,  0.8873, -4.6426, -1.5894],
        [-6.0710,  2.1375, -5.2537, -1.0262],
        [-6.1311,  1.4416, -4.8659, -0.9056],
        [-6.4277,  1.3636, -5.1417, -0.7976],
        [-6.3701,  1.0649, -4.5437, -0.9834],
        [-6.2926,  1.6107, -5.2018, -2.0098],
        [-5.1134,  2.0959, -5.1709, -1.9392]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:09<01:29,  1.31it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17957384884357452, logits - tensor([[-6.7833,  2.8582, -5.8349, -2.1100],
        [-5.0797, -3.0411, -5.0596,  2.8247],
        [-6.4374,  0.8501, -4.6175, -1.8786],
        [-6.5382,  1.9971, -4.7966, -1.7794],
        [-5.9590,  1.9258, -5.3326, -1.4965],
        [-5.4564, -3.2646,  2.2085, -2.4373],
        [-7.0579,  0.2603, -4.9575, -1.0536],
        [-5.7896,  0.9279, -5.9353, -1.9615]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 172/289 [02:10<01:29,  1.31it/s]

Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36664992570877075, logits - tensor([[-5.3794, -3.3948,  2.6900, -2.7492],
        [-6.1788,  1.3927, -4.8684, -1.3541],
        [-4.1906, -2.4208,  2.6276, -2.6198],
        [-6.4483,  0.6049, -4.8817, -1.6338],
        [-6.3357,  1.4398, -5.3421, -0.1944],
        [-6.7933,  1.1252, -5.7628, -1.1996],
        [-4.5348, -2.5134,  1.7976, -2.1115],
        [-7.4720,  1.9849, -6.6818, -1.4992]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 173/289 [02:10<01:28,  1.31it/s]

Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3045121729373932, logits - tensor([[-5.9896,  1.7935, -5.8719, -1.7772],
        [-5.9351,  1.7395, -5.9985, -1.4240],
        [-6.3291,  1.6667, -4.9784, -1.1537],
        [-6.2862,  1.5225, -4.9496, -1.6199],
        [-6.1389,  1.5267, -5.3642, -2.2400],
        [-5.9262,  1.1478, -3.9544, -1.5109],
        [-7.8238,  1.5141, -5.5832, -0.8262],
        [-5.1044,  2.5219, -5.5091, -0.9624]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|██████    | 174/289 [02:11<01:27,  1.31it/s]

Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2677294909954071, logits - tensor([[-6.2663,  1.6905, -5.0266, -1.8451],
        [-5.7379,  1.7857, -5.3648, -1.9460],
        [-6.8067,  1.9817, -5.6765, -1.4340],
        [-7.0596,  1.3058, -5.3975, -0.8389],
        [-5.6650,  2.0448, -5.2096, -1.8257],
        [-5.9269,  1.8477, -5.1618, -1.8958],
        [-6.5024,  1.8859, -5.9727, -1.9913],
        [-6.7677,  2.5029, -5.5149, -3.0150]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 175/289 [02:12<01:26,  1.31it/s]

Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.363292932510376, logits - tensor([[-6.3555,  1.0524, -4.9149, -1.5848],
        [-7.3207,  1.5602, -5.5553, -1.1888],
        [-6.5132, -2.3749, -6.0533,  2.4515],
        [-5.8629, -3.0275,  2.6853, -2.1263],
        [-6.2089,  2.1787, -5.7465, -1.9000],
        [-6.7558,  1.7169, -5.5420, -0.5962],
        [-5.8438,  0.8873, -4.6555, -0.7884],
        [-6.7435,  1.7611, -5.4446, -1.0645]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 176/289 [02:13<01:26,  1.31it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19993805885314941, logits - tensor([[-7.1310,  1.7047, -5.8029, -1.5701],
        [-5.7637,  0.8393, -4.5103, -1.0674],
        [-6.5512,  1.7913, -5.3620, -1.8874],
        [-6.0006,  2.3702, -5.9653, -1.3586],
        [-4.8058, -3.2999,  1.7536, -1.8991],
        [-6.5571,  2.1605, -5.4551, -2.1303],
        [-7.1116,  2.6178, -5.4837, -1.6283],
        [-5.6791, -4.1805,  2.4032, -1.9269]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:13<01:25,  1.32it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08950801193714142, logits - tensor([[-5.3141,  1.1938, -4.8256, -2.2341],
        [-4.9530, -2.8570,  1.2455, -1.9402],
        [-6.0987,  1.5182, -5.3796, -1.4691],
        [-5.8577,  2.2966, -5.5726, -1.2728],
        [-6.2001,  2.0465, -5.4131, -1.7702],
        [-4.5681, -2.9627,  1.9737, -2.1990],
        [-6.0554, -2.4422, -4.9339,  2.8441],
        [-6.0181,  1.1575, -4.6235, -1.2267]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:14<01:24,  1.32it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27109795808792114, logits - tensor([[-5.6142,  1.0858, -5.0193, -1.1391],
        [-6.1906, -2.0887, -5.2356,  2.2837],
        [-7.1907,  1.3493, -5.5960, -2.4169],
        [-6.1231, -3.6767,  1.3134, -1.8494],
        [-6.8891,  1.4598, -5.3699, -1.9665],
        [-6.4657,  0.6790, -5.6906, -1.2144],
        [-4.6933, -2.8042,  1.3779, -2.0708],
        [-5.5951,  1.2229, -4.4485, -0.7671]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 179/289 [02:15<01:23,  1.32it/s]

Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20895515382289886, logits - tensor([[-5.9970, -1.7658, -5.2402,  1.3770],
        [-5.4435,  1.9769, -5.2140, -0.6816],
        [-5.7394,  1.7338, -5.0545, -1.9162],
        [-6.5994,  2.0566, -6.2106, -1.6201],
        [-6.1449, -1.9462, -5.3106,  1.6304],
        [-7.1199,  2.0710, -6.0577, -2.4949],
        [-4.5139, -2.7838,  1.6703, -2.2465],
        [-6.9889,  1.9831, -5.6140, -1.4093]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 180/289 [02:16<01:22,  1.33it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3736492693424225, logits - tensor([[-7.8916,  2.2962, -6.5269, -1.9110],
        [-6.8151,  1.1650, -5.5600, -1.7034],
        [-6.2807,  1.3254, -4.7581, -1.9133],
        [-6.8047,  0.9319, -5.5655, -0.7147],
        [-6.9371,  1.5092, -5.5484, -1.8312],
        [-6.8153,  2.0794, -6.8312, -1.7602],
        [-7.1208,  1.9603, -5.6401, -2.0725],
        [-7.5251,  2.3206, -5.8793, -2.4552]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:16<01:21,  1.33it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1564771682024002, logits - tensor([[-6.2277,  1.7489, -5.2292, -1.4336],
        [-5.5590,  2.0611, -4.6734, -2.0704],
        [-5.9187,  2.1479, -5.6039, -1.3842],
        [-6.1516,  1.6644, -5.6693, -1.8856],
        [-7.8610, -1.3362, -5.9849,  1.2128],
        [-6.2561,  2.0310, -5.9740, -2.1751],
        [-6.2584,  1.9950, -5.5551, -2.1848],
        [-6.8926,  2.1234, -6.6585, -2.5224]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:17<01:20,  1.33it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30291157960891724, logits - tensor([[-6.7354,  0.5052, -4.4724, -0.7704],
        [-6.7830,  1.1825, -5.0339, -0.4713],
        [-6.2424, -3.1622, -5.6838,  2.5062],
        [-6.1016,  1.6788, -4.8838, -1.3193],
        [-6.7579,  1.6369, -5.1382, -2.2398],
        [-6.3317, -1.0409, -5.1732,  1.8715],
        [-5.9587, -2.0770, -5.3684,  1.8099],
        [-6.7762,  1.1940, -5.2462, -0.3032]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:18<01:19,  1.33it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3003166913986206, logits - tensor([[-7.1834,  2.2126, -6.2345, -1.5002],
        [-5.3521,  0.2973, -4.4526, -0.6150],
        [-6.2379, -3.3862,  2.0158, -2.0530],
        [-5.7035,  0.8376, -5.7677, -0.4385],
        [-5.5946,  0.8029, -4.5858, -0.7329],
        [-5.3183, -3.1553,  1.2530, -1.9014],
        [-6.2587,  1.2812, -4.9530, -1.0121],
        [-5.9263,  0.5877, -4.9128, -1.6085]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:19<01:19,  1.33it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1488274782896042, logits - tensor([[-7.4024,  0.0365, -5.3655, -0.2889],
        [-7.1310,  2.5279, -6.0533, -2.2552],
        [-6.3452,  0.0560, -5.4176, -0.8032],
        [-5.0071,  1.8742, -5.3123, -1.6754],
        [-6.1325,  1.6361, -5.0168, -1.2136],
        [-5.4988,  1.8977, -5.6695, -1.8591],
        [-6.0122,  2.1880, -5.7265, -2.2823],
        [-6.5086,  1.4067, -5.0661, -0.4970]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:19<01:18,  1.33it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34652960300445557, logits - tensor([[-6.4153,  2.4505, -5.9422, -1.8916],
        [-6.3559,  2.0018, -5.0265, -1.7270],
        [-5.4786, -3.4919,  1.5811, -2.0301],
        [-5.1015, -2.8715,  2.3128, -2.0203],
        [-4.0400, -3.0670,  1.5972, -2.0157],
        [-6.7510,  2.2213, -5.3997, -2.0059],
        [-6.5077,  1.6324, -4.7976, -1.7045],
        [-6.5032,  1.5233, -5.6932, -1.4374]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:20<01:17,  1.33it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1311851292848587, logits - tensor([[-6.8837,  1.3444, -5.3026, -1.0041],
        [-7.2355,  1.4942, -5.8763, -1.8711],
        [-6.7096,  0.0175, -5.4522,  0.1629],
        [-4.5814, -2.9489, -5.4050,  1.9600],
        [-6.8182,  1.9432, -5.9914, -2.0157],
        [-6.8516,  0.8677, -4.2896, -0.4418],
        [-6.4302,  2.1757, -4.7637, -2.3543],
        [-5.0502, -3.2642,  2.0761, -1.9039]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:21<01:16,  1.33it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3884749710559845, logits - tensor([[-6.1831, -1.4578, -4.7622,  1.7512],
        [-4.8351, -3.0606,  1.0707, -1.5198],
        [-5.2748, -1.7480, -0.9216, -0.8361],
        [-5.9772, -2.7661,  0.9675, -1.5791],
        [-5.6991,  0.5163, -3.9101, -0.2926],
        [-6.5835,  2.1308, -5.7146, -2.4163],
        [-7.6892, -0.0726, -5.2937, -0.8755],
        [-6.4346,  1.7061, -5.5670, -1.6880]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 188/289 [02:22<01:16,  1.33it/s]

Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11265620589256287, logits - tensor([[-5.6512,  1.4090, -4.9973, -1.1566],
        [-6.1888,  2.1447, -5.5223, -1.3643],
        [-6.3178,  0.9090, -5.0423, -1.2622],
        [-5.3146, -2.6252,  1.2137, -1.9509],
        [-5.7285, -2.2249, -4.9697,  1.7539],
        [-6.1807, -3.6420,  2.0075, -2.1990],
        [-6.1524,  0.7994, -4.9037, -0.6217],
        [-6.2701, -1.7176, -5.1577,  1.6271]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 189/289 [02:22<01:15,  1.32it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16119268536567688, logits - tensor([[-6.5056,  1.4462, -6.2829, -1.4602],
        [-6.4253,  1.6454, -5.5212, -2.4385],
        [-5.3070,  1.4991, -5.0207, -1.9928],
        [-5.8086,  0.0429, -4.2281, -0.6877],
        [-5.5018, -2.9659,  1.8907, -1.5709],
        [-5.7920,  0.5411, -5.3591,  0.4900],
        [-7.7669,  1.4753, -5.9206, -0.2109],
        [-7.5260,  1.8560, -5.9282, -1.2037]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:23<01:14,  1.32it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15989653766155243, logits - tensor([[-5.9921, -3.2985,  1.5167, -2.0886],
        [-6.4172,  1.7848, -5.9610, -2.4578],
        [-6.3870,  2.0748, -5.7725, -2.2385],
        [-6.8348,  0.7926, -5.3324, -1.5725],
        [-7.2305,  1.2757, -5.1506, -1.0398],
        [-7.6257,  1.5918, -6.1137, -1.6718],
        [-6.1139,  2.4004, -5.1387, -2.7388],
        [-5.8064, -3.7079,  1.3550, -2.4392]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 191/289 [02:24<01:14,  1.32it/s]

Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35346144437789917, logits - tensor([[-5.9362,  1.5660, -4.8688, -1.8412],
        [-5.4021,  1.0511, -4.6821, -1.3140],
        [-5.0093, -3.4525,  1.7996, -2.1587],
        [-6.9589,  1.5973, -5.1857, -1.8162],
        [-5.4701,  1.4524, -4.9604, -2.6961],
        [-6.1780,  2.3938, -6.0841, -1.8958],
        [-5.2677, -2.8704,  0.7054, -1.8389],
        [-6.0886, -2.4642, -5.0241,  2.1130]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▋   | 192/289 [02:25<01:13,  1.32it/s]

Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17587833106517792, logits - tensor([[-5.2574, -2.8513,  1.5584, -1.9368],
        [-5.8909,  1.9841, -5.3906, -1.7597],
        [-4.6138, -2.5001,  1.3656, -1.8182],
        [-5.4770, -2.2347, -5.1419,  2.9235],
        [-6.2076, -1.4508, -4.9935,  1.6339],
        [-5.9724,  1.9460, -4.8678, -1.8446],
        [-5.9255,  1.2709, -4.9318, -1.2984],
        [-5.1739, -2.9798,  0.9117, -1.7073]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 193/289 [02:25<01:12,  1.32it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09364783018827438, logits - tensor([[-5.3203, -3.0881,  1.4248, -2.0791],
        [-5.9316, -1.9951, -5.9185,  2.3052],
        [-5.2168, -3.1163,  2.3323, -1.7844],
        [-6.4690,  0.9199, -5.3041, -1.0087],
        [-6.4345,  2.1578, -5.6652, -1.5067],
        [-6.1361,  1.6996, -5.3768, -1.9532],
        [-6.2631,  0.7797, -4.1728, -1.1039],
        [-7.0424,  2.9428, -6.0014, -2.5003]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 194/289 [02:26<01:11,  1.32it/s]

Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.331284761428833, logits - tensor([[-5.8848,  1.9760, -4.9790, -1.8113],
        [-5.4128,  1.3860, -4.8594, -1.9526],
        [-5.3625,  1.1786, -4.5562, -1.8152],
        [-6.2115,  0.3312, -4.5220, -0.9010],
        [-6.8581,  2.2018, -6.6412, -2.0096],
        [-6.3451,  0.9938, -5.6648, -1.4906],
        [-6.7269,  2.3821, -6.0150, -1.5830],
        [-5.7103,  1.8614, -4.3840, -2.1131]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 195/289 [02:27<01:11,  1.32it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20879331231117249, logits - tensor([[-6.3863,  1.8481, -4.6910, -2.5350],
        [-4.7019, -2.9707,  1.6397, -1.8569],
        [-5.8674, -2.6498,  1.9005, -1.9604],
        [-6.4070,  1.9447, -5.2699, -1.9731],
        [-5.6858,  0.6287, -5.0861, -0.7871],
        [-7.4272,  2.4074, -6.3946, -2.8121],
        [-5.7841,  2.2348, -5.6489, -1.7649],
        [-5.5284, -3.4678,  1.5758, -1.8579]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:28<01:10,  1.32it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3060433268547058, logits - tensor([[-5.8347,  2.1990, -5.4147, -1.6737],
        [-6.9437,  1.7684, -5.4892, -1.6364],
        [-5.3537,  1.7205, -4.8312, -1.9288],
        [-5.8609, -2.0905, -4.4125,  2.2959],
        [-5.4356,  2.2657, -5.2018, -1.7792],
        [-6.6464,  1.9947, -4.4965, -2.3750],
        [-6.4759,  2.7828, -5.3288, -2.4881],
        [-5.7371, -3.9685,  1.0764, -1.8120]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 197/289 [02:28<01:09,  1.33it/s]

Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09160687029361725, logits - tensor([[-5.7355,  2.3421, -5.9779, -1.5939],
        [-6.3904,  1.2952, -5.3331, -0.9765],
        [-5.9330,  2.1197, -5.5689, -1.7299],
        [-5.8123,  1.9456, -5.2282, -2.0950],
        [-7.0944,  2.7847, -5.4078, -2.7287],
        [-5.1424,  1.1666, -3.7624, -1.4507],
        [-6.8566,  0.8071, -4.5855, -1.1611],
        [-6.2329,  2.2875, -5.4974, -1.8540]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▊   | 198/289 [02:29<01:08,  1.32it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15858155488967896, logits - tensor([[-5.7080,  1.8105, -4.8017, -2.2075],
        [-5.2050,  1.4596, -4.6614, -1.0677],
        [-6.8532,  1.8641, -5.9564, -1.7087],
        [-5.2744, -3.2906,  2.1500, -1.8981],
        [-5.8042, -3.5648,  1.8301, -2.1188],
        [-5.9102,  1.6315, -4.9695, -2.2687],
        [-5.5197, -2.2509, -4.8499,  2.6749],
        [-7.2016,  1.7285, -5.2226, -1.3672]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:30<01:07,  1.33it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14962346851825714, logits - tensor([[-4.7883, -2.9191,  1.3200, -1.3611],
        [-5.9302,  1.1606, -4.8407, -0.8028],
        [-5.8970,  2.2790, -5.7239, -3.2767],
        [-5.0018,  1.7673, -4.8527, -2.5371],
        [-5.6015, -3.7964,  1.3242, -2.2166],
        [-6.9785,  1.4259, -5.2169, -1.8662],
        [-4.9529,  1.7469, -4.7775, -2.2760],
        [-6.7422,  2.3365, -6.0164, -1.7787]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 200/289 [02:31<01:07,  1.32it/s]

Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20064418017864227, logits - tensor([[-6.8004,  1.6766, -5.9283, -2.2844],
        [-6.2777,  1.3953, -5.6033, -0.8680],
        [-5.0267,  1.3965, -4.0352, -2.5535],
        [-4.7336, -2.7907,  0.8863, -1.1533],
        [-6.3161,  2.3408, -5.8576, -1.9093],
        [-6.8830,  2.2353, -5.8039, -2.3737],
        [-5.6677, -3.4250, -4.5871,  3.1682],
        [-5.4981,  2.9420, -5.5262, -3.2652]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 201/289 [02:31<01:06,  1.33it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2326422780752182, logits - tensor([[-6.9331,  1.6049, -5.0058, -1.4135],
        [-6.8614,  2.4371, -5.7700, -2.5307],
        [-4.9914, -3.9393,  2.1398, -2.0806],
        [-5.0721, -3.2189,  1.2607, -2.7610],
        [-6.6408,  2.4220, -5.1829, -2.4064],
        [-6.0724,  2.5420, -6.0871, -2.0567],
        [-5.8753,  1.9897, -6.1257, -1.2295],
        [-4.1517, -1.6539,  1.7705, -1.6482]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:32<01:05,  1.33it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07832120358943939, logits - tensor([[-6.2646,  2.2648, -5.6904, -2.2925],
        [-6.3106, -2.5074, -5.8363,  2.5903],
        [-6.2899, -0.9517, -4.6314,  1.3345],
        [-6.9668,  2.0054, -5.2862, -1.6789],
        [-5.3553, -3.3194,  1.3555, -1.7148],
        [-4.4195, -2.5841,  1.3226, -2.4930],
        [-6.2252,  2.1521, -5.6190, -2.1446],
        [-5.5776,  2.0084, -4.7717, -2.5451]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:33<01:04,  1.33it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22991560399532318, logits - tensor([[-5.6425,  2.9740, -5.3961, -3.0106],
        [-4.8985, -2.5027,  1.0143, -1.6635],
        [-7.1325,  1.8566, -5.6234, -1.9061],
        [-5.5426, -3.3563,  2.4957, -2.2149],
        [-6.3757,  2.3335, -5.8399, -2.8199],
        [-6.4994,  1.9566, -4.7652, -2.2844],
        [-6.5523,  2.4503, -5.5374, -2.2246],
        [-6.6135,  1.2966, -6.4981, -1.5632]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:34<01:03,  1.33it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07465149462223053, logits - tensor([[-6.5958,  2.2136, -5.0093, -2.6102],
        [-5.6606,  1.9714, -5.1683, -1.9550],
        [-5.8681,  2.1528, -4.4998, -1.3113],
        [-6.5923,  1.9463, -5.4887, -2.0794],
        [-5.3060, -3.2225,  2.0276, -1.6302],
        [-6.8435,  1.7141, -5.5648, -1.5430],
        [-5.9770,  1.6969, -5.3669, -2.1733],
        [-5.7825,  1.7344, -5.0176, -1.8296]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:34<01:03,  1.33it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5294865369796753, logits - tensor([[-5.9207,  2.1109, -4.8225, -1.7344],
        [-4.8775, -3.8842,  2.2538, -2.4544],
        [-7.1547,  3.1023, -6.7870, -2.3844],
        [-7.1768,  2.0353, -6.1408, -2.1907],
        [-5.6803,  1.5858, -4.7413, -2.0176],
        [-5.8592,  2.3444, -6.2537, -1.9607],
        [-5.5711, -3.6187,  1.9912, -1.7731],
        [-7.1034,  0.5429, -5.9464, -1.3650]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:35<01:02,  1.33it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19870010018348694, logits - tensor([[-6.3411,  2.0274, -4.9068, -1.4322],
        [-6.4722,  1.7369, -6.2121, -2.4188],
        [-5.7462,  1.8782, -5.6681, -1.9024],
        [-5.4170, -3.7049,  2.2323, -2.3793],
        [-6.0932,  1.0777, -5.0772, -1.5317],
        [-4.1480, -2.7684,  2.5211, -1.9789],
        [-6.7673,  2.2073, -5.3574, -1.7495],
        [-5.2424,  1.1126, -4.9741, -2.0132]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:36<01:01,  1.33it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13340194523334503, logits - tensor([[-5.3881, -2.2518, -4.1209,  1.8865],
        [-6.1099,  2.3796, -5.1172, -1.9178],
        [-6.2534,  1.1997, -4.7642, -1.1526],
        [-5.8540,  0.9723, -5.0869, -2.1942],
        [-6.9100,  1.6999, -5.6415, -1.0958],
        [-6.3369,  2.1074, -4.7676, -1.3635],
        [-7.2518,  0.0387, -5.1943, -0.2809],
        [-6.0882,  1.6604, -5.5646, -1.3348]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:37<01:00,  1.34it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17114579677581787, logits - tensor([[-6.0485, -1.1814, -4.5317,  1.6068],
        [-5.0606,  1.7348, -4.9149, -1.6840],
        [-7.7904,  2.3321, -6.2392, -1.5385],
        [-6.7503,  2.8945, -5.0819, -1.7472],
        [-6.5603,  1.7908, -4.9487, -0.9407],
        [-4.6857, -3.0755, -3.7760,  2.7496],
        [-5.0287, -2.3951, -4.4339,  2.0525],
        [-5.7102,  1.2428, -4.6999, -1.0486]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:37<00:59,  1.34it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34683865308761597, logits - tensor([[-6.4839,  1.8720, -5.7529, -2.2683],
        [-6.8847,  2.2520, -4.7016, -2.0380],
        [-6.6076,  1.8442, -4.7258, -1.7961],
        [-6.7276, -0.2411, -4.8620,  0.0386],
        [-7.2144,  2.1155, -6.3377, -1.7375],
        [-6.3590,  1.9515, -5.6568, -3.0108],
        [-5.8834,  2.2962, -5.2530, -1.9339],
        [-5.9798, -1.6477, -5.4140,  1.7057]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 210/289 [02:38<00:59,  1.33it/s]

Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.48206716775894165, logits - tensor([[-6.6955,  1.6022, -5.5400, -1.7532],
        [-6.0738,  1.9605, -5.3506, -1.6130],
        [-6.0410,  2.2952, -5.5429, -2.9480],
        [-5.6589,  2.0484, -5.0911, -1.2554],
        [-6.5611,  0.6349, -4.3140, -1.4940],
        [-5.8608,  0.9869, -4.2481, -1.3683],
        [-7.3385,  2.3616, -5.8950, -2.5020],
        [-6.1499,  1.3950, -5.8364, -3.0647]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 211/289 [02:39<00:58,  1.33it/s]

Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 73%|███████▎  | 212/289 [02:40<00:58,  1.32it/s]

loss - 0.20684552192687988, logits - tensor([[-6.7530, -1.6891, -5.6516,  1.2685],
        [-6.3973, -0.2471, -5.2111, -0.8137],
        [-5.7762, -3.3376,  1.9872, -2.5415],
        [-5.7655,  1.7645, -5.8420, -2.6419],
        [-7.5810,  2.2626, -5.3867, -1.6551],
        [-5.6440, -3.5249,  2.0985, -2.9584],
        [-5.4150, -2.9994,  2.0740, -2.3385],
        [-6.4496,  1.8760, -5.3988, -2.1566]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33596938848495483, logits - tensor([[-5.6456, -3.9397,  3.2045, -2.7977],
        [-5.5796,  1.8834, -4.9965, -1.7652],
        [-6.5089,  0.7644, -4.8303, -1.3087],
        [-6.3029,  1.1690, -4.7412, -0.6955],
        [-5.4329, -0.7525, -0.6206, -0.9713],
        [-4.4366, -3.8129,  1.2969, -2.1448],
        [-4.9206, -3.8212,  1.8905, -2.3011],
        [-6.2040,  0.9884, -4.9378, -0

 74%|███████▎  | 213/289 [02:40<00:57,  1.32it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 74%|███████▍  | 214/289 [02:41<00:56,  1.32it/s]

loss - 0.12821796536445618, logits - tensor([[-6.1914,  0.4086, -3.6768, -0.1875],
        [-5.2979,  1.4326, -4.5033, -1.5956],
        [-5.8939,  1.0479, -4.3605, -1.0419],
        [-7.5133,  1.1913, -5.2549, -0.6084],
        [-5.9872,  1.5042, -5.7957, -1.9696],
        [-4.3559, -3.1728,  2.3926, -1.9789],
        [-5.2537, -3.7623,  1.7767, -2.1049],
        [-5.5667,  1.6705, -4.8790, -1.9476]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10860354453325272, logits - tensor([[-6.0079, -2.7885, -4.5823,  2.6199],
        [-5.7761,  1.5523, -4.6815, -1.3118],
        [-5.6123,  1.2686, -4.8222, -1.6596],
        [-6.8826,  0.0105, -3.7089, -0.7302],
        [-6.4586,  2.0900, -5.2642, -1.6851],
        [-6.2978,  2.0548, -4.6524, -1.7233],
        [-4.7667, -3.3435, -3.9796,  2.9304],
        [-6.0569,  0.6346, -5.2586, -1

 74%|███████▍  | 215/289 [02:42<00:56,  1.32it/s]

Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31263411045074463, logits - tensor([[-5.6993,  1.9224, -5.6642, -1.1014],
        [-6.0543,  1.2113, -4.3975, -0.9102],
        [-7.0495, -0.2331, -4.8830,  0.2881],
        [-6.0102,  1.4990, -4.5227, -1.7155],
        [-6.2248,  1.5487, -5.6787, -2.0470],
        [-6.9161,  1.6761, -5.6181, -2.5089],
        [-6.4409,  1.0333, -5.3934, -1.8223],
        [-6.6964,  1.0293, -4.4100, -1.5269]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▍  | 216/289 [02:43<00:55,  1.32it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23253200948238373, logits - tensor([[-5.7900,  0.9824, -4.6667, -1.4610],
        [-6.6701,  1.6727, -5.2096, -1.1908],
        [-6.3395,  1.4088, -4.7989, -1.4173],
        [-6.8618, -1.1110, -4.5837,  0.1716],
        [-6.7734,  1.6184, -4.6378, -1.8971],
        [-4.6003, -4.0895,  2.6760, -1.9162],
        [-6.0636,  0.9179, -4.8705, -1.8104],
        [-6.9596,  1.3910, -5.0556, -2.3576]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 217/289 [02:43<00:54,  1.33it/s]

Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24020007252693176, logits - tensor([[-5.9356,  0.1314, -3.9345, -1.2502],
        [-7.7854,  1.2657, -5.8151, -1.4581],
        [-5.4686,  1.5482, -4.5231, -1.6077],
        [-6.3353,  1.4191, -4.9730, -1.9058],
        [-7.0853,  2.2189, -5.6213, -1.7851],
        [-6.0775,  1.0538, -5.1325, -1.9128],
        [-6.5194,  0.7478, -5.0187, -1.0493],
        [-5.6243, -3.9315,  2.6580, -2.6376]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 218/289 [02:44<00:53,  1.32it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21925635635852814, logits - tensor([[-5.0221, -3.8061,  2.3980, -1.7195],
        [-6.7685,  1.9245, -5.7245, -1.7429],
        [-6.6986,  1.0368, -4.7122, -1.1588],
        [-5.5797,  1.4145, -5.2345, -1.3398],
        [-5.2638,  1.3827, -4.6413, -1.6018],
        [-5.8817,  0.7991, -4.7349, -1.8100],
        [-4.7394, -3.8833,  2.2532, -1.9631],
        [-6.5989,  0.8385, -4.5675, -1.9540]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:45<00:52,  1.32it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23752239346504211, logits - tensor([[-5.1402,  1.2395, -4.5307, -1.3989],
        [-6.4463, -2.3118, -4.9332,  1.7543],
        [-5.7509,  1.0563, -4.0653, -0.7538],
        [-6.2413, -0.3129, -4.1218, -0.2591],
        [-5.6034,  0.8093, -3.6848, -1.0072],
        [-6.4775, -2.0886, -5.0300,  2.4115],
        [-7.4813, -0.5315, -5.8014, -0.0330],
        [-6.4889,  1.8244, -4.7864, -1.6131]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:46<00:51,  1.33it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39234459400177, logits - tensor([[-6.1169,  0.3475, -4.4221, -0.6187],
        [-6.0791,  0.0950, -4.4894, -0.7646],
        [-4.7515, -3.9104,  2.2289, -2.0070],
        [-6.5093,  2.1097, -6.1680, -2.2259],
        [-5.6177,  1.4897, -4.5985, -1.3488],
        [-5.6823, -2.6458, -5.0316,  2.0155],
        [-6.1946, -1.2328, -5.2472, -0.3852],
        [-6.0394,  1.2644, -4.7390, -1.2704]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:46<00:51,  1.33it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20786148309707642, logits - tensor([[-5.9530,  2.1588, -5.4075, -1.6362],
        [-6.4399,  2.3517, -5.1567, -1.5374],
        [-6.3714,  1.8398, -4.7949, -1.4456],
        [-6.8006, -1.7321, -5.0983,  2.5454],
        [-3.5903, -3.5819,  1.5988, -1.6766],
        [-5.3672, -3.3349,  1.8723, -2.5629],
        [-5.6467, -3.3063,  2.7619, -3.2076],
        [-4.1783, -3.7795,  1.8837, -1.5911]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:47<00:50,  1.33it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28117385506629944, logits - tensor([[-5.7346e+00,  5.7673e-01, -3.7919e+00, -8.8304e-01],
        [-5.6488e+00,  9.5283e-01, -3.7658e+00,  1.6458e-03],
        [-6.0235e+00,  8.7673e-03, -4.0028e+00, -8.4278e-01],
        [-6.3131e+00,  1.6529e+00, -5.5725e+00, -1.1188e+00],
        [-4.7522e+00, -2.8403e+00, -4.9256e+00,  2.9507e+00],
        [-6.8568e+00,  1.4957e+00, -4.5942e+00, -1.5077e+00],
        [-5.2579e+00, -3.7799e+00,  2.3499e+00, -2.8293e+00],
        [-5.8350e+00,  1.2075e+00, -4.6580e+00, -1.8024e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:48<00:49,  1.33it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3028091490268707, logits - tensor([[-5.5228, -3.2515,  3.2615, -2.0620],
        [-5.8118,  0.0267, -4.3327, -0.4992],
        [-5.5947,  1.7239, -4.5684, -1.8954],
        [-5.7559,  0.2899, -4.9973, -0.6452],
        [-4.6842, -3.6774,  2.4646, -2.3916],
        [-6.3918,  0.8601, -4.0081, -0.6186],
        [-5.8112,  0.2173, -4.1824, -0.2781],
        [-6.5960,  1.0328, -4.9226, -0.6948]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:49<00:48,  1.33it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2643091380596161, logits - tensor([[-7.0045,  1.2299, -4.8603, -1.4606],
        [-6.0886,  0.7406, -4.8356, -1.2796],
        [-6.3302,  1.1324, -4.3707, -1.1352],
        [-5.5802,  0.8974, -4.0547, -0.2741],
        [-6.5475,  1.3204, -4.8273, -1.8191],
        [-6.8713,  0.5019, -5.0942, -0.0262],
        [-3.4940, -3.2505,  3.3399, -2.5676],
        [-5.3497,  1.7070, -4.3101, -1.2507]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:49<00:48,  1.33it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3220938742160797, logits - tensor([[-5.2765,  0.9287, -3.8916, -1.5257],
        [-5.0169, -3.3455,  2.6470, -1.7995],
        [-5.6988,  0.2496, -4.1990, -0.9475],
        [-6.0617,  0.2362, -3.7208, -1.2816],
        [-6.5307, -2.0458, -4.8507,  2.5734],
        [-6.3717, -0.6894, -4.3523,  0.8329],
        [-6.8771,  0.6718, -4.2930, -0.8882],
        [-5.7298, -4.6870,  2.3487, -2.8756]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:50<00:47,  1.33it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21583318710327148, logits - tensor([[-4.6794,  0.8055, -3.9417, -1.1278],
        [-6.3950,  0.8319, -4.6485, -1.3687],
        [-6.3003,  0.9940, -4.7966, -0.8659],
        [-6.0947, -1.5341, -5.6936,  1.1146],
        [-5.8527,  1.5235, -4.9547, -1.6207],
        [-6.5281, -2.7593, -5.0918,  2.8441],
        [-5.8623,  0.4388, -4.1188, -0.9775],
        [-6.3567,  1.8410, -4.5490, -1.0077]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:51<00:46,  1.33it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2272428572177887, logits - tensor([[-6.2109,  0.9480, -3.9765, -0.8020],
        [-5.9380, -3.0666, -4.8076,  3.1305],
        [-7.3328,  1.3725, -5.5011, -0.5620],
        [-5.9028,  1.4279, -4.6054, -1.3369],
        [-5.8911, -3.1656, -4.6417,  2.4437],
        [-5.9692,  0.4683, -3.7866, -0.6082],
        [-5.4747,  0.3097, -3.0977, -0.4825],
        [-6.1334,  1.0526, -4.1277, -1.0903]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:52<00:45,  1.33it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27538132667541504, logits - tensor([[-5.3132, -3.4109,  2.0917, -2.6218],
        [-6.2833,  0.6729, -4.5219, -1.5613],
        [-6.9259,  0.4251, -5.1585, -0.0764],
        [-5.4691, -1.9280, -4.8181,  2.8513],
        [-5.8939,  0.7124, -4.0138, -1.0589],
        [-6.5424,  0.9092, -5.5008, -1.5021],
        [-4.7949, -3.2761,  2.6577, -2.4470],
        [-6.9459,  1.3254, -5.4469, -1.5638]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:53<00:45,  1.33it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28675636649131775, logits - tensor([[-5.7989,  0.6388, -4.7862, -0.7234],
        [-5.4566, -0.0795, -3.7438, -0.7540],
        [-7.2821,  0.9283, -4.7040, -0.5858],
        [-7.4283,  1.4356, -6.0645, -0.6435],
        [-5.4633,  1.0440, -3.9277, -1.8892],
        [-5.0910,  0.0094, -3.9322, -0.0140],
        [-6.4703,  0.9350, -5.1087, -1.3778],
        [-6.2780,  1.5387, -5.1280, -1.5892]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:53<00:44,  1.32it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2592964768409729, logits - tensor([[-6.1368, -1.7381, -5.2629,  2.4522],
        [-5.7285,  0.4719, -3.9780, -0.7540],
        [-7.0298, -0.2460, -5.3414, -0.1001],
        [-5.6863,  0.4146, -3.8490, -0.3184],
        [-7.1266,  0.6855, -5.4338, -0.7548],
        [-5.7662,  0.0485, -4.7423, -0.2370],
        [-5.3363, -3.5688,  1.9004, -2.4865],
        [-5.8334,  1.0569, -4.2173, -0.4046]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:54<00:44,  1.32it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15483085811138153, logits - tensor([[-6.5504,  1.3703, -5.8898, -1.7822],
        [-5.6419,  1.0482, -4.2917, -1.6123],
        [-5.2217,  1.6783, -4.8652, -1.4816],
        [-6.1309, -0.3182, -3.8708, -0.3183],
        [-5.5799, -3.5827,  2.9896, -2.3220],
        [-6.4430,  0.7805, -4.4772, -1.0054],
        [-6.5328,  0.9266, -5.3522, -0.8016],
        [-5.9180,  1.2994, -5.0366, -0.6814]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:55<00:43,  1.31it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17276564240455627, logits - tensor([[-6.2444, -0.5794, -4.5861,  0.2280],
        [-7.7056,  0.2459, -5.8726,  0.0642],
        [-6.6249,  1.0055, -5.4235, -0.8534],
        [-6.6782,  2.3624, -5.7636, -1.3509],
        [-4.9878,  0.7065, -4.0746, -0.7845],
        [-6.3293,  1.7034, -4.2774, -1.8457],
        [-4.2197, -3.9227,  2.7339, -3.0771],
        [-5.0905, -4.5069,  3.4561, -2.8865]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:56<00:42,  1.31it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2485176920890808, logits - tensor([[-4.5491, -3.7079,  2.0367, -2.3727],
        [-6.4341, -2.3440, -4.5455,  1.8808],
        [-6.1520, -1.5530, -5.2175,  2.8509],
        [-5.9903,  0.9401, -4.7148, -1.0128],
        [-6.1592,  2.0482, -5.0214, -0.5429],
        [-6.0346,  0.9466, -4.9547, -2.4426],
        [-6.2054,  1.4922, -5.4533, -1.3987],
        [-5.9419, -4.0386,  1.6301, -2.5900]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:56<00:42,  1.31it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20280055701732635, logits - tensor([[-5.8222,  1.5798, -4.3261, -0.8223],
        [-5.1927,  2.2123, -5.0040, -2.1098],
        [-6.4964,  1.3085, -6.3076, -2.4151],
        [-5.9351,  2.6659, -4.8984, -1.7724],
        [-6.7683,  0.4893, -5.4003,  0.5362],
        [-4.0342, -3.1814,  3.2061, -2.3003],
        [-6.1702,  1.6737, -5.7459, -1.9622],
        [-6.3048,  1.7043, -5.2258, -1.0019]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:57<00:41,  1.31it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2995889484882355, logits - tensor([[-6.5773,  1.2796, -5.1983, -1.9991],
        [-6.9237,  1.9988, -5.7487, -1.5322],
        [-5.4565, -4.0550,  2.4329, -2.8668],
        [-6.4054,  2.2487, -5.4169, -1.0777],
        [-5.9572,  1.8722, -5.4912, -1.4567],
        [-6.0609,  1.9362, -4.3448, -1.6567],
        [-4.9676, -3.1771,  2.8117, -2.2354],
        [-5.6491,  1.5130, -4.0946, -0.8619]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 236/289 [02:58<00:40,  1.32it/s]

Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22028616070747375, logits - tensor([[-5.0706,  1.2531, -4.9258, -1.6581],
        [-5.8018,  1.4470, -4.6494, -1.2953],
        [-6.4777,  1.3047, -4.8223, -1.3222],
        [-6.6770,  2.1906, -5.2963, -1.9252],
        [-5.5607,  0.9074, -4.3551, -1.0145],
        [-6.7937,  1.8254, -4.7251, -2.0840],
        [-6.2616,  2.3370, -5.0982, -1.8020],
        [-5.6668,  1.3886, -5.7218, -2.5319]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 237/289 [02:59<00:39,  1.32it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31398218870162964, logits - tensor([[-6.1620,  1.7036, -4.8485, -1.3426],
        [-6.1193,  1.4173, -5.4705, -1.3543],
        [-6.4390,  2.2196, -5.7693, -1.4613],
        [-5.9535,  1.7909, -4.9242, -1.6210],
        [-6.1603, -2.1908, -5.5835,  2.1702],
        [-5.8357,  1.2666, -4.5741, -1.2705],
        [-6.4176,  1.6793, -5.4651, -1.0710],
        [-7.0514,  1.8262, -4.7225, -1.1846]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 238/289 [02:59<00:38,  1.33it/s]

Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2642441689968109, logits - tensor([[-6.6017,  1.0780, -5.2933, -1.4019],
        [-5.2596,  1.8491, -4.5032, -2.0943],
        [-5.7788,  1.1232, -4.6940, -1.5093],
        [-5.8928,  1.6310, -5.0565, -1.8267],
        [-5.6313,  2.0931, -5.2936, -2.4744],
        [-6.0348,  1.5397, -5.3258, -1.8431],
        [-5.3244,  1.1789, -5.0036, -1.6674],
        [-5.2152,  0.9370, -5.0269, -1.9249]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 239/289 [03:00<00:37,  1.33it/s]

Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29759812355041504, logits - tensor([[-5.6515,  1.8737, -4.1807, -1.7948],
        [-5.5749, -2.1068, -4.4139,  2.5260],
        [-6.0969,  2.6237, -5.3483, -2.3844],
        [-6.6850, -1.1388, -5.6676,  1.2878],
        [-5.4847,  2.2504, -3.8165, -1.9492],
        [-6.2029, -0.8599, -4.5695,  0.9331],
        [-6.5895,  1.1045, -5.2668, -1.8064],
        [-6.0976,  1.7633, -5.5099, -1.7232]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 240/289 [03:01<00:36,  1.33it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19405004382133484, logits - tensor([[-6.3353,  1.5539, -5.0677, -1.7786],
        [-7.4425,  0.5401, -5.7422, -0.0507],
        [-5.3534, -4.2872,  2.7357, -3.2416],
        [-6.1558,  1.6399, -4.9698, -2.0522],
        [-4.3129, -4.2951,  3.7441, -3.1438],
        [-6.8322, -2.0210, -6.4004,  1.9185],
        [-5.4082,  2.0952, -4.8312, -1.4767],
        [-5.3155,  2.0747, -4.7884, -1.8864]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:02<00:36,  1.33it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07077592611312866, logits - tensor([[-5.8942, -4.0659,  2.8987, -2.8140],
        [-6.5740,  2.1097, -5.8559, -2.0043],
        [-6.9594, -0.6254, -5.0451,  0.7607],
        [-6.4476, -2.2261, -5.7702,  2.6752],
        [-4.6194, -3.2960,  1.9489, -2.2393],
        [-5.1051, -3.5141,  3.8804, -2.1100],
        [-6.1073,  1.9914, -4.8529, -2.4015],
        [-5.9908,  2.0681, -5.8690, -2.3059]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:02<00:35,  1.33it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2708275616168976, logits - tensor([[-5.6217,  2.4876, -5.4463, -1.7216],
        [-5.4962,  1.4902, -5.2230, -1.4831],
        [-4.7659,  1.7253, -5.0065, -2.3459],
        [-6.9625,  2.2231, -5.2863, -2.0449],
        [-5.8177, -3.7024,  3.0396, -3.1389],
        [-5.8054,  1.9986, -4.8499, -1.3986],
        [-5.3198,  1.6609, -4.8743, -1.3826],
        [-4.8482,  0.7117, -4.8373, -1.6308]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:03<00:34,  1.33it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29262304306030273, logits - tensor([[-5.3562,  1.7309, -4.3620, -1.7215],
        [-6.5464,  2.1982, -5.2428, -1.5415],
        [-5.3002,  1.0700, -5.1650, -1.4079],
        [-5.7039,  0.6178, -4.7686, -0.3858],
        [-5.9773,  1.5791, -4.6492, -0.6616],
        [-5.7156,  0.8247, -4.9437, -1.0612],
        [-6.9469,  2.7950, -5.9316, -2.3107],
        [-5.2045,  1.8893, -4.0725, -1.2489]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:04<00:33,  1.34it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36772093176841736, logits - tensor([[-5.4204,  1.4591, -4.1381, -1.4504],
        [-6.2858,  1.1161, -5.4960, -1.0858],
        [-6.8847,  0.7199, -4.5756, -0.8381],
        [-5.9383,  1.0245, -4.1887, -1.2572],
        [-5.8266,  1.4326, -4.1591, -1.5377],
        [-4.6177, -3.9945,  3.5263, -3.2222],
        [-6.0981, -2.4524, -4.5721,  2.1380],
        [-6.4515,  1.8304, -5.9779, -1.4239]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:05<00:33,  1.33it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2508186101913452, logits - tensor([[-5.5864,  2.0082, -5.3736, -2.3020],
        [-5.6746,  1.0091, -3.8201, -2.0307],
        [-5.6532, -1.2663, -5.3685,  1.2759],
        [-5.6377,  1.3585, -4.5383, -1.7938],
        [-5.1586,  1.9684, -4.8832, -1.3922],
        [-5.3210,  0.5112, -4.3393, -0.6546],
        [-6.0586,  2.1670, -5.4909, -1.7730],
        [-6.3473,  1.3253, -5.5180, -2.0079]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:05<00:32,  1.33it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1926143914461136, logits - tensor([[-5.7467,  2.4055, -5.1072, -2.9536],
        [-5.8842,  1.5593, -4.4636, -1.5012],
        [-5.7120,  1.3781, -4.2014, -2.1252],
        [-5.6889,  1.8261, -5.6377, -1.2110],
        [-6.0688,  1.2293, -4.6402, -1.8049],
        [-5.5478,  1.3654, -4.5124, -1.4497],
        [-5.8224,  1.5551, -4.6916, -1.8857],
        [-6.1808, -4.0924,  2.8813, -3.1230]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:06<00:31,  1.33it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5916719436645508, logits - tensor([[-6.2507, -3.6464,  3.1710, -2.5127],
        [-5.5803,  0.8708, -3.8786, -1.6479],
        [-4.9848,  1.1470, -4.0448, -1.6253],
        [-5.7888,  0.9160, -4.8807, -1.3294],
        [-6.5870,  1.1385, -5.4424, -1.0103],
        [-5.8211, -3.6319,  1.5591, -1.8909],
        [-5.4687,  1.5257, -5.0973, -1.8809],
        [-5.9869,  2.0122, -4.2767, -1.6333]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:07<00:30,  1.33it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18035349249839783, logits - tensor([[-7.1393,  0.9868, -4.9690, -1.6622],
        [-5.6604,  1.8228, -4.9008, -1.6131],
        [-6.2580,  2.1329, -4.7930, -1.6730],
        [-5.7514,  1.0547, -5.0832, -1.6482],
        [-6.0075, -0.9142, -5.6058,  0.5298],
        [-5.7550, -3.6984,  2.4794, -2.4922],
        [-6.7514, -0.0631, -4.7252, -0.0596],
        [-6.0859,  1.6800, -5.6790, -1.8755]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:08<00:30,  1.33it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3783453106880188, logits - tensor([[-6.8911,  2.2730, -6.1278, -1.9582],
        [-6.2110,  0.5295, -4.8337, -0.8777],
        [-5.5634, -3.9259,  3.0288, -1.9763],
        [-5.1007,  1.6322, -4.6252, -1.6000],
        [-5.9949,  1.2828, -4.6574, -0.5197],
        [-5.9700,  2.0329, -5.8192, -1.5389],
        [-6.2989,  1.2057, -5.1274, -1.2861],
        [-6.0303,  0.6334, -3.7190, -0.7040]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:08<00:29,  1.33it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22306740283966064, logits - tensor([[-6.8055,  1.4687, -5.8600, -1.4116],
        [-6.0233,  1.9941, -5.4467, -0.8415],
        [-6.4529,  1.8908, -4.6826, -0.9720],
        [-6.5602,  1.1049, -4.7066, -1.3123],
        [-5.9085,  1.1634, -5.4393, -1.5198],
        [-5.3581,  1.9101, -5.3660, -1.1368],
        [-5.7152,  0.6794, -4.6240, -0.0452],
        [-6.2778,  1.4199, -5.6962, -1.7843]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:09<00:28,  1.33it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19287192821502686, logits - tensor([[-6.8916,  2.4870, -6.2049, -2.4235],
        [-6.1926,  1.1701, -4.7384, -0.6722],
        [-7.0941,  0.8537, -6.2519, -1.5350],
        [-7.1687, -1.9557, -5.9945,  2.8436],
        [-5.3375,  1.3320, -4.6197, -1.8342],
        [-6.2531,  1.7017, -6.3929, -1.1209],
        [-4.6051, -3.8888,  2.8444, -2.2898],
        [-6.0166,  2.1100, -4.7701, -1.3015]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:10<00:27,  1.33it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19000014662742615, logits - tensor([[-7.1389, -1.3073, -5.2385,  0.8919],
        [-5.9615,  0.8925, -4.6225, -0.5975],
        [-6.3915,  1.7697, -5.9339, -1.3517],
        [-6.0203, -4.2308,  2.7851, -2.5949],
        [-4.9429,  1.2734, -4.3390, -0.9837],
        [-6.3079,  1.4033, -5.2497, -1.1629],
        [-6.0155,  1.3550, -5.1233, -2.1074],
        [-6.1023,  1.2575, -4.9079, -1.4889]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 253/289 [03:11<00:27,  1.33it/s]

Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2347719818353653, logits - tensor([[-7.0129, -2.2790, -5.9350,  2.5052],
        [-5.5680,  0.9665, -4.7461, -1.0299],
        [-7.3361, -0.6248, -5.4370,  1.4359],
        [-7.0138,  1.5805, -4.7728, -1.8146],
        [-6.3739,  1.1702, -4.8957, -1.3792],
        [-5.5967,  0.8950, -4.9751, -0.6184],
        [-5.2548, -1.2156, -3.6059,  1.2431],
        [-5.5467,  1.3109, -5.1505, -1.3939]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 254/289 [03:11<00:26,  1.33it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24111227691173553, logits - tensor([[-6.5518, -1.0400, -5.1513,  2.3736],
        [-5.1067,  1.8206, -4.9038, -1.6749],
        [-5.9418,  1.1628, -4.7969, -1.1939],
        [-7.6749,  0.8542, -5.4468, -0.1513],
        [-6.2899,  1.2142, -5.7619, -1.3998],
        [-6.4961,  1.1852, -5.1769, -1.0678],
        [-6.9159,  1.9436, -5.8781, -1.1061],
        [-5.5934,  1.4156, -4.5546, -1.2635]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:12<00:25,  1.33it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11351687461137772, logits - tensor([[-7.3702,  2.0168, -6.1170, -1.4745],
        [-6.3439,  1.5566, -4.9745, -1.8219],
        [-6.8403,  0.6719, -4.6364, -0.5211],
        [-5.8421,  0.5782, -4.6831, -1.1539],
        [-6.4541, -2.3985, -4.8934,  2.4810],
        [-5.0734,  1.2003, -4.8824, -1.6064],
        [-5.5950,  1.7813, -4.9643, -1.5665],
        [-5.7700, -1.9053, -5.6545,  1.5621]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:13<00:24,  1.33it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20278982818126678, logits - tensor([[-6.5368,  2.9977, -6.3097, -0.9956],
        [-6.0802,  1.5286, -5.8716, -1.4733],
        [-5.7710,  1.2131, -4.9255, -0.6824],
        [-5.4840,  1.3951, -4.5036, -1.2843],
        [-5.7998,  1.2877, -4.4845, -0.6163],
        [-5.6442, -4.1617,  2.4064, -1.9527],
        [-5.6467,  2.0010, -4.9761, -1.1891],
        [-5.6728,  1.4365, -4.1571, -1.0037]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:14<00:24,  1.33it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22370842099189758, logits - tensor([[-6.8872,  1.2126, -5.3188, -1.0781],
        [-5.6584,  0.5913, -4.2921, -0.4631],
        [-6.3901,  1.3903, -5.5931, -1.6468],
        [-5.5587, -4.7693,  2.1322, -2.9730],
        [-5.8429, -1.4574, -4.3970,  1.6164],
        [-5.9288,  1.3496, -4.7330, -2.1409],
        [-7.1035,  1.2851, -5.8650, -1.9487],
        [-6.2541,  2.0668, -5.9873, -1.5534]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 258/289 [03:14<00:23,  1.33it/s]

Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09967828541994095, logits - tensor([[-5.9931,  2.0069, -5.3662, -1.8611],
        [-5.4787, -3.6954,  2.1448, -1.9548],
        [-5.8156,  1.1004, -4.4473, -1.1707],
        [-6.3600,  2.0690, -5.7146, -0.8704],
        [-5.5191,  1.0499, -4.6619, -1.6429],
        [-5.1976,  1.2771, -4.3687, -1.2478],
        [-5.9991,  1.3343, -4.7624, -1.5802],
        [-6.3420, -2.6378, -4.2213,  2.6233]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 259/289 [03:15<00:22,  1.33it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12234631180763245, logits - tensor([[-6.8229,  2.0562, -5.5787, -2.2463],
        [-5.3628, -3.0965,  1.8271, -2.6724],
        [-6.9514,  0.3010, -4.4884, -0.3115],
        [-5.4971,  1.7400, -5.3717, -1.1124],
        [-5.0140,  1.8714, -4.4962, -1.7654],
        [-5.6407,  1.9840, -5.3605, -1.7989],
        [-5.9225,  1.4700, -5.3069, -1.2977],
        [-6.3937, -4.2739,  2.5592, -2.4036]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 260/289 [03:16<00:21,  1.33it/s]

Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2954995632171631, logits - tensor([[-5.5146, -4.3693,  3.1750, -3.3844],
        [-5.9551,  1.0358, -4.8631, -0.7142],
        [-6.7078,  1.5212, -4.8092, -0.8360],
        [-5.4227,  1.0249, -5.1725, -0.8614],
        [-5.8308,  1.7316, -4.8576, -1.5285],
        [-6.9878,  2.5299, -5.5259, -1.3508],
        [-4.9178, -4.0763,  2.3947, -1.9086],
        [-6.3652, -3.0249,  0.9980, -0.8525]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|█████████ | 261/289 [03:17<00:20,  1.34it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3595326542854309, logits - tensor([[-6.3184,  0.8850, -4.2067, -1.2411],
        [-5.4390,  1.4541, -4.5734, -2.1995],
        [-6.4358,  1.0565, -5.8399, -1.3928],
        [-5.2823,  1.8003, -4.7370, -1.4620],
        [-5.3829,  1.2611, -4.8233, -1.4480],
        [-5.4032, -3.7165,  2.0716, -2.3327],
        [-5.7875, -4.5142,  2.4439, -2.4614],
        [-6.2002,  2.3554, -5.4987, -1.6638]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:17<00:20,  1.33it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20529797673225403, logits - tensor([[-6.6050,  2.2545, -5.6332, -1.7905],
        [-7.0559,  1.7320, -6.0870, -1.9013],
        [-6.4481, -2.8142, -4.8285,  2.7418],
        [-6.9594,  2.0513, -5.8352, -2.1456],
        [-6.1431, -3.5659, -4.7752,  2.9379],
        [-5.6756,  1.2438, -5.2187, -1.1619],
        [-6.5582,  1.3244, -6.1499, -1.6664],
        [-5.5492,  0.6835, -5.0895, -1.9031]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:18<00:19,  1.33it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17232196033000946, logits - tensor([[-4.8986,  0.3272, -5.0144, -1.1253],
        [-5.6862,  0.7884, -5.7967, -0.7766],
        [-6.2150,  1.2209, -5.4444, -1.4881],
        [-5.8694,  1.1395, -4.8262, -1.7532],
        [-6.9759, -2.7154, -4.7237,  1.4079],
        [-6.5141, -2.9610, -5.1410,  3.6279],
        [-6.7777, -0.4005, -5.9831,  0.9201],
        [-6.6399,  1.9857, -5.8332, -1.5861]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████▏| 264/289 [03:19<00:18,  1.33it/s]

Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2398686558008194, logits - tensor([[-6.6719, -1.0847, -5.2343,  0.7674],
        [-5.7428, -1.4882, -4.7032,  1.0352],
        [-6.0399,  1.7060, -4.9723, -1.8725],
        [-6.4577,  1.8408, -5.8425, -2.0742],
        [-5.9206,  1.6964, -5.7466, -1.6038],
        [-5.9986, -3.7239,  1.8714, -1.9168],
        [-6.6749, -4.0515,  1.7777, -1.4896],
        [-5.6638,  0.8502, -5.4814, -2.1845]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 265/289 [03:20<00:18,  1.33it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23358401656150818, logits - tensor([[-6.0158,  2.4984, -5.3985, -1.6303],
        [-6.4218,  1.1424, -4.9328, -0.6945],
        [-5.1475,  1.3104, -4.8135, -0.9673],
        [-5.2677, -3.4292, -6.0659,  3.3011],
        [-5.5478, -1.4842, -5.0073,  1.8963],
        [-6.0675,  2.2353, -5.6681, -2.1083],
        [-5.6632,  0.3039, -4.4170, -0.8902],
        [-5.8395, -4.1404,  1.6617, -2.8061]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 266/289 [03:20<00:17,  1.33it/s]

Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08264148235321045, logits - tensor([[-5.4175, -3.6554,  2.0403, -2.3009],
        [-6.4542, -2.2312, -5.2908,  1.3693],
        [-5.7946, -3.3099, -4.2765,  3.1472],
        [-5.0007,  1.9497, -5.0434, -1.7550],
        [-6.4438,  1.3607, -5.4330, -1.4454],
        [-6.6696,  0.5369, -4.5203, -2.0515],
        [-5.9032,  1.6210, -5.0726, -1.3098],
        [-5.3234, -2.2922, -4.7134,  2.4381]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 267/289 [03:21<00:16,  1.33it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3918852210044861, logits - tensor([[-7.0047,  2.0583, -5.9469, -2.3675],
        [-6.0194, -4.0584,  1.5930, -2.7649],
        [-6.6014,  1.4216, -4.8407, -0.9987],
        [-5.5831,  1.2075, -4.7849, -1.1478],
        [-6.1710, -4.5499,  2.5400, -2.5331],
        [-6.5321,  1.8964, -6.1134, -1.3118],
        [-5.6207, -3.2990,  1.6115, -2.1081],
        [-4.7384, -3.5389,  2.0517, -2.7442]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:22<00:15,  1.33it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08201515674591064, logits - tensor([[-6.9071,  0.9741, -5.9056, -1.9687],
        [-6.0369,  2.0973, -5.3062, -1.4742],
        [-5.6592,  2.0199, -5.2910, -1.9719],
        [-5.7677,  1.9117, -4.7945, -2.1674],
        [-4.6521, -3.8637,  2.4463, -1.9356],
        [-6.1136,  2.0130, -5.2779, -1.3410],
        [-6.9472, -1.4121, -6.2765,  1.5009],
        [-5.9936,  1.7038, -5.1660, -2.1642]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:23<00:15,  1.32it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31908273696899414, logits - tensor([[-5.7362, -3.4737, -5.2100,  2.7230],
        [-5.7076,  2.0901, -4.7081, -1.8537],
        [-5.6176,  1.9660, -5.0057, -1.9790],
        [-6.9252,  1.7076, -5.2064, -1.3577],
        [-7.3877,  2.1097, -5.9954, -1.8807],
        [-5.8463,  1.9630, -5.3439, -1.9230],
        [-6.9825, -1.1255, -5.1701,  1.1796],
        [-6.6618,  2.4805, -6.3033, -2.9509]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 270/289 [03:23<00:14,  1.32it/s]

Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22835299372673035, logits - tensor([[-7.0723, -1.2746, -5.7143,  1.4421],
        [-6.0615,  1.4542, -5.0346, -1.7360],
        [-6.6399, -3.5204, -5.1926,  2.5293],
        [-6.9582,  2.1859, -6.3933, -1.8623],
        [-5.8437,  2.0960, -5.8127, -2.6291],
        [-5.7877,  1.4961, -4.4500, -1.6580],
        [-5.3444, -3.4650,  1.9020, -2.7104],
        [-6.0199, -1.5538, -4.7267,  0.5378]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 271/289 [03:24<00:13,  1.31it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22166967391967773, logits - tensor([[-6.4524,  1.9785, -6.2793, -2.2568],
        [-5.0371, -3.1379,  2.0012, -1.9182],
        [-6.8459,  2.1278, -6.0046, -2.1708],
        [-6.3875,  2.9763, -6.1916, -3.1097],
        [-6.6984,  2.0368, -5.5233, -1.9334],
        [-6.2604,  1.5175, -5.2558, -1.5103],
        [-6.4769,  2.8740, -5.3740, -2.5877],
        [-7.4299,  0.2345, -6.0998,  0.3265]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 272/289 [03:25<00:12,  1.32it/s]

Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5394953489303589, logits - tensor([[-4.9875, -2.7402,  1.3504, -2.0242],
        [-6.4010, -0.0307, -5.3740,  0.5779],
        [-5.5623, -2.9478,  1.2323, -1.5399],
        [-5.9117,  1.6215, -5.7703, -2.2907],
        [-5.9031,  2.0459, -5.9299, -1.6279],
        [-5.2449,  1.8031, -5.3518, -1.8066],
        [-5.8285,  1.4656, -5.2155, -1.0940],
        [-6.4000, -3.0402, -4.6545,  2.6173]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 273/289 [03:26<00:12,  1.32it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3542519509792328, logits - tensor([[-6.1238,  2.6791, -5.6630, -2.1469],
        [-5.9018,  2.5403, -5.7362, -2.5258],
        [-6.7068,  2.7364, -6.0917, -2.2529],
        [-6.6436,  1.5629, -6.2376, -0.2816],
        [-5.7688,  1.6550, -5.7520, -1.9313],
        [-6.2717, -2.4594, -5.9094,  2.5687],
        [-5.4425,  1.9348, -5.3126, -0.7339],
        [-6.6658,  1.7483, -4.8171, -2.2116]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▍| 274/289 [03:26<00:11,  1.32it/s]

Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3837448060512543, logits - tensor([[-6.0973,  2.5491, -5.2493, -2.4990],
        [-6.9228, -1.3743, -4.7610,  0.6257],
        [-6.2703, -1.4840, -5.6362,  2.5574],
        [-6.3009,  1.7441, -5.1966, -1.7419],
        [-6.7092,  1.2874, -5.8627, -1.4860],
        [-5.2783,  1.7917, -5.3265, -1.9506],
        [-5.5200,  2.0110, -6.0101, -1.8665],
        [-6.0834,  1.6310, -5.1917, -2.2350]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▌| 275/289 [03:27<00:10,  1.32it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20470747351646423, logits - tensor([[-5.4845,  1.4119, -6.0220, -2.1470],
        [-4.7423, -2.7015,  1.0789, -1.1951],
        [-6.4907,  1.9902, -5.8797, -2.1276],
        [-5.9097,  1.1170, -5.8356, -2.1519],
        [-5.9963,  1.2927, -4.4409, -1.5011],
        [-5.6424,  1.6572, -5.3847, -1.7057],
        [-5.0527,  0.6481, -4.5577, -1.0587],
        [-6.1517,  2.7718, -5.3869, -2.7500]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:28<00:09,  1.32it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09051301330327988, logits - tensor([[-6.5845,  2.2518, -5.5948, -1.4907],
        [-5.4015,  2.1244, -5.3268, -2.1090],
        [-6.5912,  1.6702, -5.5382, -1.6081],
        [-6.3294,  1.5114, -5.5257, -1.2011],
        [-6.3715,  2.4821, -6.2461, -2.4819],
        [-6.2154, -2.7190,  0.7631, -1.4838],
        [-5.3420, -3.3760,  1.7620, -1.3896],
        [-5.5623,  2.3095, -5.2321, -1.6072]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:29<00:09,  1.32it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18756315112113953, logits - tensor([[-6.0607, -3.8297,  2.1028, -1.8739],
        [-5.2140, -3.6037,  1.9692, -2.1629],
        [-5.7992,  1.6633, -5.0788, -1.7189],
        [-5.7194, -3.2170, -5.2748,  2.6014],
        [-5.7041,  1.7933, -4.7723, -1.1156],
        [-6.0704,  1.1677, -5.5871, -1.0362],
        [-6.1910,  2.0863, -5.2964, -1.8136],
        [-6.4996,  1.6213, -5.7122, -2.1584]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:29<00:08,  1.32it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4263611435890198, logits - tensor([[-5.0444, -1.7798, -0.6382, -1.8072],
        [-6.5315,  1.8682, -5.1473, -1.5863],
        [-5.4315,  1.9404, -5.5053, -2.0954],
        [-5.6183,  1.3770, -5.3434, -1.8725],
        [-6.5694,  1.7892, -5.4384, -1.7623],
        [-5.6302,  1.8473, -5.8453, -1.8018],
        [-4.9221,  1.6427, -4.8565, -1.5647],
        [-5.1199,  1.6570, -5.3121, -1.8424]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 279/289 [03:30<00:07,  1.32it/s]

Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3816957473754883, logits - tensor([[-6.1471, -3.3648,  1.4882, -2.0946],
        [-4.9048, -2.1132,  0.6863, -0.7215],
        [-6.4201,  2.4652, -5.4102, -1.7178],
        [-5.4202,  2.1282, -4.7929, -1.9398],
        [-6.0952, -2.3472, -5.7097,  1.7288],
        [-6.3223,  1.8571, -5.4172, -1.4896],
        [-6.0839,  1.8322, -5.1238, -1.1337],
        [-6.6051,  1.6531, -5.2569, -1.1815]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 280/289 [03:31<00:06,  1.32it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19584958255290985, logits - tensor([[-4.7130, -1.7876,  0.3827, -1.4721],
        [-5.5138,  2.4435, -5.2399, -1.8167],
        [-6.1607,  2.3460, -5.5378, -1.5108],
        [-5.8610,  1.3571, -4.8924, -1.1742],
        [-6.0611,  2.8272, -5.3709, -2.6011],
        [-6.2309,  2.4612, -5.9779, -2.5913],
        [-6.5086, -0.1731, -5.4255,  1.1938],
        [-5.1611, -3.5431,  0.8887, -1.2927]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 281/289 [03:32<00:06,  1.32it/s]

Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1779300719499588, logits - tensor([[-5.9447,  1.5539, -4.8632, -1.9177],
        [-5.5454, -3.3362,  1.1101, -2.2992],
        [-6.0809,  1.4644, -4.5821, -0.6225],
        [-5.7076,  1.4800, -4.8243, -1.2629],
        [-6.9336, -3.1924,  1.4448, -2.1216],
        [-5.7050,  0.5571, -5.2627, -1.5218],
        [-4.7484, -3.1038,  1.2430, -1.7318],
        [-5.8126,  2.5585, -5.2670, -1.7233]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 282/289 [03:33<00:05,  1.33it/s]

Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3807362914085388, logits - tensor([[-5.6268,  2.1765, -6.2998, -2.1808],
        [-5.6858, -2.7787,  0.5924, -2.0831],
        [-6.3612,  1.6049, -6.0211, -1.9352],
        [-4.9052,  0.6344, -5.0009,  0.0512],
        [-6.2614,  1.1376, -4.0310, -1.4258],
        [-5.9929,  1.4698, -6.1868, -1.5019],
        [-5.5770,  2.6500, -6.0215, -2.1862],
        [-5.8691,  2.1950, -5.8249, -1.7296]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 283/289 [03:33<00:04,  1.33it/s]

Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17396186292171478, logits - tensor([[-6.0085, -3.7877,  0.9514, -2.0062],
        [-6.0898,  2.0133, -5.4143, -1.0543],
        [-6.0079,  2.1723, -5.8380, -1.9892],
        [-5.8244,  1.6965, -5.0640, -2.2778],
        [-5.5913, -2.7516, -4.6886,  2.9060],
        [-6.0467,  1.2951, -5.4005, -2.3107],
        [-5.2690, -2.8334,  0.7402, -1.4278],
        [-5.9772,  1.2952, -5.1353, -1.1920]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 284/289 [03:34<00:03,  1.33it/s]

Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21854174137115479, logits - tensor([[-6.3290,  2.3438, -5.8399, -1.9459],
        [-5.8389, -4.0305,  1.5387, -1.6111],
        [-6.2050,  1.6010, -5.2018, -1.5413],
        [-6.0238, -3.3582,  0.9950, -1.4618],
        [-6.1913,  0.7045, -5.2098, -0.9389],
        [-5.3519, -2.7496,  0.6137, -1.8773],
        [-6.7545,  0.9653, -5.7866, -1.6749],
        [-6.2556, -2.3524, -4.9086,  2.5076]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▊| 285/289 [03:35<00:03,  1.33it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3958345055580139, logits - tensor([[-5.1913,  1.3183, -4.9301, -0.9459],
        [-5.3712,  1.6208, -4.6138, -1.3982],
        [-6.2082,  1.8245, -5.6477, -1.9061],
        [-5.2821,  0.9930, -4.1405, -1.5457],
        [-5.2300, -3.1786,  1.9772, -1.6573],
        [-5.8188, -1.7426, -0.4579, -1.6126],
        [-6.0190,  1.2778, -5.0614, -1.7490],
        [-6.6341,  2.0583, -5.7259, -1.9545]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:36<00:02,  1.33it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16729223728179932, logits - tensor([[-5.3571, -3.6000,  1.7916, -1.9579],
        [-6.0769,  1.4984, -5.1285, -2.1422],
        [-5.9247,  1.6989, -4.2425, -0.2833],
        [-5.3005, -2.9834,  1.0252, -1.7860],
        [-6.3060, -2.9470,  1.2241, -1.1756],
        [-7.0851, -2.3617, -4.8426,  1.6341],
        [-6.8213, -2.1241, -4.8561,  2.0966],
        [-6.0056,  1.9815, -5.4904, -1.3049]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:36<00:01,  1.33it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09980662912130356, logits - tensor([[-6.1284,  1.3999, -5.1153, -1.7753],
        [-5.2421, -2.6556,  1.3833, -1.6200],
        [-5.4630, -3.1431, -4.3466,  2.8248],
        [-5.4758,  1.3715, -5.0270, -1.4413],
        [-6.3815,  1.6766, -4.8738, -1.9332],
        [-5.4647,  0.8155, -4.4419, -1.5700],
        [-5.7992, -3.9376,  2.1403, -2.0999],
        [-5.4280,  0.7420, -4.8750, -1.3651]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:37<00:00,  1.33it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3172880709171295, logits - tensor([[-6.4554,  1.3075, -5.7168, -1.1827],
        [-6.4124,  1.9630, -5.9195, -1.3304],
        [-6.5749,  1.5680, -4.2565, -1.1035],
        [-6.7368,  1.4482, -5.5530, -1.5417]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:37<00:00,  1.33it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:52,  3.70it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:48,  3.93it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:47,  4.03it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:48,  3.92it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:47,  4.00it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:47,  4.00it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:46,  4.02it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.96it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:46,  3.97it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:45,  4.01it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:45,  4.01it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:45,  3.96it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  3.97it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:45,  3.96it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:44,  4.01it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:44,  3.97it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:44,  3.98it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:44,  3.97it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:43,  3.98it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:05<00:43,  3.97it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:43,  3.95it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:43,  3.95it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:43,  3.96it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:42,  4.00it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:42,  3.98it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  4.00it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:41,  4.00it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:41,  4.00it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.97it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.97it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:41,  3.94it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:41,  3.92it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:40,  3.93it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:41,  3.89it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:40,  3.92it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:40,  3.93it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:39,  3.97it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.97it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.94it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:39,  3.90it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:39,  3.91it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.91it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:39,  3.86it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:39,  3.84it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:38,  3.89it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:37,  3.91it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.91it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:37,  3.94it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.95it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.97it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:36,  3.95it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:35,  3.95it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:35,  3.97it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:35,  3.93it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:35,  3.93it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:34,  3.96it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:34,  3.92it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  3.93it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:34,  3.92it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:33,  3.96it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:33,  3.96it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.97it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:15<00:32,  4.00it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  4.02it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  3.98it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  3.97it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:16<00:31,  3.97it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:32,  3.93it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  3.93it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:31,  3.94it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:17<00:31,  3.97it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  3.97it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.99it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:29,  4.01it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:18<00:29,  3.99it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:29,  4.01it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.98it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.99it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:19<00:28,  3.97it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  3.97it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  3.98it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:28,  3.98it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:20<00:27,  4.01it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  4.03it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  4.02it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:26,  4.00it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:21<00:26,  4.01it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  4.01it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  4.02it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:25,  4.00it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:22<00:25,  4.01it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  3.98it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  3.98it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  3.96it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:23<00:25,  3.92it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.94it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.94it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.95it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:24<00:24,  3.94it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  3.96it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  3.96it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.91it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:26<00:23,  3.93it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:23,  3.91it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.93it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.94it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:27<00:22,  3.95it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.94it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  3.94it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:21,  3.94it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:28<00:21,  3.90it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:21,  3.88it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.93it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:20,  3.91it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:29<00:20,  3.92it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.95it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.91it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.94it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:30<00:19,  3.93it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  3.95it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  3.97it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:18,  3.97it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:31<00:17,  3.95it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  3.95it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  4.00it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:17,  3.97it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:32<00:16,  4.00it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  3.97it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  3.97it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:16,  3.99it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:33<00:15,  3.97it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  3.99it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.99it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:33<00:15,  4.00it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:34<00:14,  3.96it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.95it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.94it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:34<00:14,  3.97it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:35<00:13,  3.97it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  4.00it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.97it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:35<00:13,  3.95it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:36<00:12,  3.94it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.98it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.96it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:36<00:12,  3.97it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:37<00:11,  4.01it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  3.99it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  4.00it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:37<00:11,  3.97it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:38<00:10,  3.97it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  3.96it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  3.97it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:38<00:10,  3.98it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:39<00:09,  3.97it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  3.98it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.98it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:39<00:09,  4.00it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:40<00:08,  3.97it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  3.99it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  3.94it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:40<00:08,  3.94it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:41<00:07,  3.94it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  3.96it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.95it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:41<00:07,  3.98it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:42<00:06,  3.96it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.98it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.97it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:42<00:06,  3.97it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:43<00:05,  3.98it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.97it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.94it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:43<00:05,  3.96it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:44<00:04,  3.95it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.95it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.98it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:44<00:04,  3.97it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:45<00:03,  4.00it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.99it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  3.99it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:45<00:03,  3.93it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:46<00:02,  3.94it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.96it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  3.96it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:46<00:02,  3.95it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:47<00:01,  3.96it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  3.96it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  3.95it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:47<00:01,  3.94it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:48<00:00,  3.97it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  3.98it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  3.99it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:49<00:00,  3.96it/s]

[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 918, 'tn': 338, 'fp': 43, 'fn': 253}, {'tp': 156, 'tn': 1367, 'fp': 4, 'fn': 25}, {'tp': 153, 'tn': 1097, 'fp': 278, 'fn': 24}]
Detailed accuracy after 2 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.8092783505154639
yes_no accuarcy: 0.9813144329896907
abstractive accuarcy: 0.8054123711340206
Overall accuarcy: 0.8990012886597938
Best accuarcy: 0.899645618556701



  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23928213119506836, logits - tensor([[-5.9743,  0.8514, -5.1597, -0.2781],
        [-6.2167, -3.4856,  1.0874, -1.7203],
        [-6.2512, -3.4295, -4.6890,  3.2699],
        [-5.7551, -3.0406, -4.6222,  3.0015],
        [-6.0031,  1.1971, -4.9622, -1.1874],
        [-5.7795,  1.2887, -5.2304, -1.4828],
        [-5.0986,  1.4606, -4.5092, -0.8944],
        [-5.7142,  1.1219, -5.2559, -1.2075]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:00<03:52,  1.24it/s]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16922159492969513, logits - tensor([[-6.1658,  0.7671, -4.7465, -1.4002],
        [-5.4467,  2.2562, -5.1742, -2.0119],
        [-6.3001,  1.8964, -5.5380, -1.5339],
        [-6.4472, -3.8137,  0.9811, -2.3025],
        [-4.9851, -2.5482,  1.4414, -1.6313],
        [-6.0992,  1.8860, -5.1522, -1.4824],
        [-5.0455,  1.1254, -5.5249, -1.4659],
        [-5.6722, -3.5620,  1.6906, -2.3033]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:01<03:44,  1.28it/s]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44427597522735596, logits - tensor([[-6.6881,  1.7481, -5.4637, -1.9769],
        [-6.5631,  1.9392, -5.7663, -2.1813],
        [-5.9447, -4.1526,  1.9498, -2.0183],
        [-6.0237,  0.8107, -4.5854, -1.6411],
        [-5.9177, -2.4637,  0.5402, -1.2670],
        [-6.0182,  1.0330, -4.6553, -1.2605],
        [-6.6698,  1.7814, -5.7062, -2.0606],
        [-6.0998,  0.8531, -5.2538, -1.6942]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:02<03:39,  1.30it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1516699492931366, logits - tensor([[-5.0092, -3.4647,  1.5683, -1.3733],
        [-6.7651, -3.1787, -4.8107,  2.5063],
        [-6.5180,  0.3894, -4.5025, -0.6688],
        [-5.6706,  0.7915, -4.2103, -1.6610],
        [-6.0945,  1.5768, -5.1347, -1.3371],
        [-5.5919,  1.1267, -4.6605, -1.8981],
        [-5.7022, -2.9953, -5.0562,  2.0431],
        [-5.9822,  1.0726, -5.0798, -0.8394]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:03<03:36,  1.31it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2451099157333374, logits - tensor([[-6.0188,  0.7765, -4.5243, -1.3362],
        [-5.8327, -2.9726,  0.8814, -2.2312],
        [-5.8375,  1.0486, -4.4193, -1.1408],
        [-5.4545, -2.9106,  1.1749, -2.1074],
        [-5.6168,  1.4575, -4.6931, -1.0781],
        [-6.9317,  0.5745, -5.6114, -0.8418],
        [-5.6299, -3.8035,  1.0609, -2.1042],
        [-5.1626,  1.2847, -4.5427, -1.9634]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:03<03:35,  1.32it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15955345332622528, logits - tensor([[-4.8409, -3.1262,  2.0314, -2.1592],
        [-5.4924,  0.2955, -3.7676, -0.8567],
        [-5.7025,  0.9361, -4.4957, -1.6499],
        [-6.3333,  1.6047, -5.5365, -1.4779],
        [-5.8018,  1.6697, -5.4454, -0.9698],
        [-5.5257,  0.9810, -4.8052, -1.2055],
        [-4.8667, -3.6475,  1.4171, -1.5716],
        [-4.6485, -3.8343,  1.8873, -2.1245]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:04<03:33,  1.33it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30622202157974243, logits - tensor([[-4.9080, -3.1715, -4.4757,  2.8148],
        [-5.9884,  1.8768, -4.8975, -1.5581],
        [-6.5077,  1.9613, -5.7977, -1.6875],
        [-7.9008, -0.0149, -5.5733,  0.0289],
        [-6.3166, -0.0291, -5.1808,  1.2749],
        [-6.6473,  1.8341, -6.0822, -1.7448],
        [-4.9531,  0.6229, -4.3879, -1.1438],
        [-6.6811, -1.7960, -4.8516,  2.6151]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:05<03:32,  1.33it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23629042506217957, logits - tensor([[-6.4638,  0.9692, -4.9817, -0.9638],
        [-5.1502, -4.6649,  2.4015, -1.9057],
        [-6.2399,  1.5486, -5.0359, -0.2372],
        [-5.1792,  1.1016, -4.3523, -0.8131],
        [-5.8782,  0.6667, -4.8116, -0.8925],
        [-5.9386,  0.8387, -5.4335, -1.2025],
        [-5.9426,  0.2443, -4.5680, -0.8592],
        [-5.4386,  0.1101, -4.4468, -0.5433]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 8/289 [00:06<03:32,  1.32it/s]

Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17619512975215912, logits - tensor([[-5.8748, -4.0430,  1.5619, -2.4703],
        [-4.6024,  0.8123, -4.6585, -1.7500],
        [-6.3613,  0.9261, -5.0897, -0.3398],
        [-5.7478,  1.7909, -4.5859, -1.3852],
        [-6.0587,  0.1332, -4.0740, -1.0193],
        [-4.8392, -3.4606,  2.0177, -1.8698],
        [-5.2438,  1.0118, -4.3594, -1.0609],
        [-5.3221, -3.3504,  2.0049, -1.4567]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 9/289 [00:06<03:31,  1.32it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22749021649360657, logits - tensor([[-6.1110,  0.0274, -4.0195, -0.8105],
        [-6.6498,  0.2950, -5.3638,  0.0596],
        [-4.9207, -2.9247,  0.9010, -1.3248],
        [-5.9333,  1.6431, -5.5982, -1.3111],
        [-7.1569, -0.9217, -5.5925,  0.7570],
        [-5.6143, -3.6379, -4.4029,  3.3391],
        [-5.8200,  0.9698, -4.4373, -0.5576],
        [-6.2369,  0.9957, -5.3707, -0.4807]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 10/289 [00:07<03:31,  1.32it/s]

Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17820127308368683, logits - tensor([[-5.5840,  1.4133, -5.0161, -1.9076],
        [-6.0937,  1.0895, -4.8959, -1.5956],
        [-5.9957,  0.9312, -5.2597, -0.8936],
        [-5.7094,  1.2790, -4.8953, -1.7825],
        [-5.8337,  0.5384, -4.5902, -0.7419],
        [-6.6659,  1.1743, -5.2963, -1.8249],
        [-6.5747, -0.3538, -4.4460,  0.4101],
        [-5.5791,  0.6952, -4.9578, -1.3223]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 11/289 [00:08<03:30,  1.32it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1231798306107521, logits - tensor([[-6.6644,  0.9879, -5.1257, -0.3379],
        [-5.1894, -4.4016,  2.3429, -2.1446],
        [-5.1607, -3.4162,  2.3731, -2.3199],
        [-5.6706,  1.3726, -4.7981, -0.8502],
        [-5.1196,  1.1364, -4.4819, -0.9659],
        [-5.8884,  0.3114, -4.0796, -0.4521],
        [-5.0715, -3.0647, -4.5013,  3.4066],
        [-4.4333, -3.7624,  2.2666, -1.9211]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 12/289 [00:09<03:29,  1.32it/s]

Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14712077379226685, logits - tensor([[-6.7493,  1.9060, -5.6745, -0.7843],
        [-6.0567,  0.7085, -4.9998,  0.1106],
        [-5.6399,  0.2070, -5.2087, -0.5421],
        [-5.6755,  1.3037, -5.4625, -1.4973],
        [-5.3764, -3.6728,  2.8940, -2.7124],
        [-5.5934, -4.4550,  2.3430, -3.0213],
        [-5.4281,  0.8002, -4.4376, -1.1694],
        [-6.4298,  1.5992, -5.5804, -0.8112]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 13/289 [00:09<03:28,  1.33it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.363309383392334, logits - tensor([[-5.8706, -1.4399, -4.5360,  1.4709],
        [-5.3411,  1.1349, -4.5156, -0.3323],
        [-5.3976, -2.9791,  1.8036, -2.0374],
        [-5.2408,  0.0609, -3.4071, -0.6046],
        [-7.0103, -1.8520, -5.3724,  2.3710],
        [-6.2913,  0.5377, -5.7532, -0.7609],
        [-6.5879, -3.2274, -5.1387,  3.5772],
        [-6.6318, -0.1396, -4.6087, -0.3868]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:10<03:28,  1.32it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12136349081993103, logits - tensor([[-5.5925,  0.8661, -4.5360, -0.9431],
        [-5.2141,  0.8035, -4.5403, -1.2557],
        [-5.1335, -3.7171,  2.3919, -2.0608],
        [-5.7953, -3.2612,  2.5256, -2.0475],
        [-6.0244,  1.5261, -4.7527, -1.1573],
        [-5.5243,  0.9206, -4.6787, -1.5461],
        [-5.2328,  0.7426, -4.2820, -0.6387],
        [-5.1618, -3.3243,  2.2861, -2.4413]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▌         | 15/289 [00:11<03:28,  1.32it/s]

Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14429756999015808, logits - tensor([[-5.8281,  0.6332, -4.4013, -1.2480],
        [-6.7968,  0.7338, -5.2231, -1.0194],
        [-5.9311,  1.0755, -5.4002, -1.1623],
        [-5.6617,  1.4493, -4.2954, -0.8530],
        [-5.2509, -2.8014,  1.4624, -3.1041],
        [-6.8416,  1.2037, -5.8562, -0.6827],
        [-5.7664,  0.9609, -4.6083, -0.5972],
        [-6.4881,  1.9180, -5.5523, -1.8055]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 16/289 [00:12<03:26,  1.32it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2081533968448639, logits - tensor([[-6.9858, -3.4956, -5.2009,  3.1717],
        [-6.8238,  0.9608, -5.0571, -1.3067],
        [-6.6557,  1.1622, -5.1189, -0.8017],
        [-5.3054,  0.6038, -4.2686, -0.6755],
        [-7.5587, -0.6333, -6.0099,  1.0201],
        [-6.2786,  1.6568, -4.9439, -1.4836],
        [-6.3964,  0.4784, -4.8625, -0.6916],
        [-4.5787, -3.6169,  2.5572, -2.3063]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:12<03:25,  1.32it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2923620045185089, logits - tensor([[-6.4791, -0.4556, -5.2865,  0.1167],
        [-5.8941,  1.1876, -4.9212, -2.0525],
        [-7.4170, -2.7692, -5.4054,  3.0504],
        [-6.7791, -1.4332, -5.0225,  1.7890],
        [-5.8904, -3.7983,  2.3682, -2.7084],
        [-6.0302, -3.0017, -4.3958,  2.4663],
        [-6.3553,  1.7013, -5.6576, -1.2501],
        [-5.3222, -3.3953,  2.6402, -2.3733]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:24,  1.32it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31665968894958496, logits - tensor([[-5.1844, -3.6033,  2.3184, -2.2678],
        [-5.9970, -2.7567, -4.4807,  2.5116],
        [-6.3758,  1.4246, -5.0608, -1.9561],
        [-6.1328, -3.6742,  2.2940, -2.5758],
        [-7.9998, -0.5332, -4.7883,  1.6310],
        [-6.4367,  1.5164, -4.9859, -1.2838],
        [-6.8404,  2.7683, -7.4649, -2.3615],
        [-5.8585,  0.4360, -4.4125, -1.5366]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:23,  1.33it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08780839294195175, logits - tensor([[-7.0456,  1.6900, -5.8228, -2.2378],
        [-6.2669,  1.5941, -5.4038, -1.7803],
        [-5.0033, -4.2389,  2.6157, -2.3657],
        [-5.9876, -4.6143,  2.5374, -2.6082],
        [-5.9567,  1.3435, -4.9559, -1.2132],
        [-6.8877,  1.3120, -5.9481, -0.9899],
        [-4.9720, -3.9909,  2.4554, -1.6706],
        [-5.3020,  1.4882, -5.1698, -1.0757]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:22,  1.33it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30076098442077637, logits - tensor([[-5.9820,  1.8223, -5.8477, -1.6705],
        [-6.1261, -2.5919, -5.4719,  2.4156],
        [-5.8435,  1.0962, -4.9218, -0.9322],
        [-6.5825,  2.1304, -5.6357, -1.2156],
        [-6.3311,  1.1764, -5.2320, -1.3796],
        [-6.0328,  1.7198, -6.2203, -1.3208],
        [-6.1350,  2.0915, -5.8499, -1.4190],
        [-7.2302,  2.5686, -5.3151, -1.1795]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:15<03:22,  1.32it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34110403060913086, logits - tensor([[-6.5202,  1.3003, -5.5797, -1.3558],
        [-6.0972,  1.4014, -4.8810, -1.5096],
        [-6.1928,  1.6234, -5.2559, -1.2296],
        [-6.4211,  1.5649, -5.9753, -2.0968],
        [-7.5061, -1.9611, -4.9323,  1.1186],
        [-6.8594,  0.2895, -5.6292,  0.8522],
        [-7.5946,  2.5597, -7.0694, -2.4561],
        [-6.1568,  0.8878, -5.9507, -1.3074]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:21,  1.33it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28534501791000366, logits - tensor([[-6.0158,  1.7189, -5.3308, -1.7110],
        [-6.0576,  1.7568, -5.1546, -1.5725],
        [-5.5668, -3.4561,  2.3953, -2.7331],
        [-5.8893, -4.0858,  2.9855, -2.3171],
        [-6.4837,  0.7088, -5.6002, -0.8363],
        [-5.4663, -3.5028, -5.5203,  3.5617],
        [-6.3966,  1.7801, -4.9093, -0.9312],
        [-6.5009, -1.0391, -5.6346,  0.6733]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:20,  1.33it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2727511525154114, logits - tensor([[-6.8911,  1.6711, -5.8421, -1.7810],
        [-5.3930, -3.5075,  2.4139, -2.0696],
        [-6.1452,  1.4942, -5.1193, -1.1288],
        [-6.2893,  0.5148, -4.7148, -0.7191],
        [-5.8933,  1.8065, -4.5159, -1.7280],
        [-7.4645,  2.1603, -6.6749, -1.7850],
        [-5.0178,  1.7528, -4.5050, -0.7363],
        [-6.6116, -1.5425, -5.2553,  1.2724]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:19,  1.33it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2013552486896515, logits - tensor([[-6.5468,  1.9631, -6.2667, -2.3465],
        [-6.1489,  0.9295, -5.3858, -1.8420],
        [-6.8482, -0.0837, -5.8235, -0.1399],
        [-6.9785,  2.3337, -6.6214, -1.8989],
        [-4.9433, -3.1151,  2.7664, -3.1050],
        [-6.5368,  2.1134, -6.2513, -2.2394],
        [-6.8914,  2.6326, -7.1636, -1.9418],
        [-6.0267,  1.6453, -5.0560, -1.4551]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:18<03:19,  1.33it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34655454754829407, logits - tensor([[-7.5684, -2.4613, -4.6484,  2.4862],
        [-4.1308, -3.3041,  3.1566, -2.6389],
        [-5.8032,  1.7452, -5.8299, -1.5398],
        [-5.2917, -3.9099,  2.7140, -3.1398],
        [-6.3706,  1.9098, -4.5502, -1.2311],
        [-5.8889, -3.9032,  2.9934, -3.2566],
        [-5.7074,  0.9616, -4.5984, -1.4867],
        [-5.3405,  1.4306, -4.7961, -1.5617]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:18,  1.33it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3520069122314453, logits - tensor([[-6.4981,  1.8798, -5.5120, -3.1087],
        [-6.1159,  2.4472, -5.4916, -1.1781],
        [-6.4685,  1.2069, -5.8869, -1.2666],
        [-5.9012,  1.1837, -5.4751, -1.0334],
        [-6.1370,  1.8015, -4.9021, -1.9401],
        [-6.8794,  1.5895, -5.9616, -1.9768],
        [-4.8811, -3.5385,  2.8954, -3.4699],
        [-5.7225,  1.4428, -5.0301, -2.4583]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 27/289 [00:20<03:16,  1.33it/s]

Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07308919727802277, logits - tensor([[-7.0600,  1.9750, -5.7713, -2.0153],
        [-6.3603,  1.2811, -5.4472, -1.5581],
        [-5.7847, -2.2985, -4.5780,  1.4020],
        [-5.6425, -4.1712,  3.4807, -3.7035],
        [-5.3517, -3.4869,  3.0182, -3.6185],
        [-5.6040,  1.7509, -4.9936, -1.5200],
        [-5.9614,  1.3875, -4.3859, -1.0846],
        [-5.9106,  1.8769, -5.1072, -2.5180]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|▉         | 28/289 [00:21<03:16,  1.33it/s]

Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39922821521759033, logits - tensor([[-5.9394,  1.2943, -4.9784, -1.2260],
        [-6.3364,  1.4320, -4.5998, -1.3343],
        [-6.5682,  0.8737, -5.5357, -0.4883],
        [-6.5739,  0.9091, -6.4814, -0.3903],
        [-6.3849,  1.7811, -4.9105, -1.5629],
        [-6.6997,  2.3343, -5.5756, -2.2243],
        [-5.4627,  1.7921, -6.4973, -2.3808],
        [-6.8631, -0.6126, -4.9206, -0.3410]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 29/289 [00:21<03:15,  1.33it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12399814277887344, logits - tensor([[-6.9006,  1.0778, -5.1823, -1.6199],
        [-6.6867,  1.3380, -5.7158, -1.8802],
        [-6.4567,  1.3226, -5.0366, -1.5265],
        [-5.3819, -4.1817,  2.2560, -2.9640],
        [-6.7436,  1.5338, -5.7466, -1.4988],
        [-6.2136, -0.1989, -4.4071, -0.5593],
        [-6.6746,  1.5396, -5.6559, -2.1881],
        [-5.1465, -3.4363,  2.6701, -2.7419]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:15,  1.33it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 11%|█         | 31/289 [00:23<03:14,  1.32it/s]

loss - 0.0826055258512497, logits - tensor([[-5.2489, -4.2521,  3.3978, -2.7426],
        [-6.6414,  2.4321, -6.5725, -2.9942],
        [-5.8164,  1.5824, -5.2440, -2.0069],
        [-6.1579,  2.2324, -6.1044, -2.3842],
        [-4.8237, -4.3362,  2.2135, -1.6898],
        [-6.6817,  1.2995, -6.3820, -1.3552],
        [-5.8539, -0.7639, -4.3562,  0.8392],
        [-5.8083,  2.1595, -4.7978, -1.3422]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22522303462028503, logits - tensor([[-5.6499, -4.5507,  2.3926, -3.2275],
        [-6.0091, -4.6994,  2.8012, -3.1241],
        [-6.8844,  2.0839, -6.2993, -2.8867],
        [-5.5556, -4.4386,  2.6050, -3.2973],
        [-8.1279,  1.5475, -6.6448, -1.6421],
        [-5.5897,  1.3900, -5.1056, -1.7318],
        [-6.7404,  1.1099, -5.2268, -0.7111],
        [-6.2618,  2.0294, -5.2135, -2.0

 11%|█         | 32/289 [00:24<03:13,  1.33it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19279628992080688, logits - tensor([[-5.2177, -3.2195,  2.2738, -3.7000],
        [-7.2187,  0.1343, -4.9201, -0.2068],
        [-5.3879,  0.9572, -4.7154, -1.0081],
        [-6.4198,  1.2674, -5.2023, -1.6160],
        [-7.7580, -1.2767, -5.5291,  1.2819],
        [-6.3977,  0.4746, -4.7268, -1.3087],
        [-5.3032, -2.4393, -4.3581,  3.6227],
        [-6.8082,  1.8471, -5.8402, -1.4379]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:24<03:13,  1.32it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3101920485496521, logits - tensor([[-6.6214, -1.4463, -5.1824,  0.9010],
        [-5.8410, -1.9359, -4.7610,  2.2156],
        [-5.5322, -3.4187,  2.2208, -1.7880],
        [-7.1900, -1.1758, -5.0431,  1.2258],
        [-6.0878,  0.9604, -5.4382, -1.2389],
        [-5.9976, -0.4821, -5.5686,  0.8713],
        [-5.1148, -3.5849,  2.3571, -2.2908],
        [-6.1380,  1.1324, -6.4043, -1.7754]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:13,  1.32it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47205713391304016, logits - tensor([[-6.7776,  1.7156, -5.8837, -1.9950],
        [-7.1333,  1.1317, -5.6191, -1.4685],
        [-6.2978,  1.4060, -4.7033, -1.0249],
        [-6.4648,  2.1023, -6.0210, -2.4464],
        [-7.1243, -1.1384, -5.6740,  0.9102],
        [-4.8310, -3.7080,  2.7683, -3.2760],
        [-6.4246,  0.1798, -4.8712, -1.1429],
        [-7.2042,  1.0195, -5.6466,  0.0534]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 35/289 [00:26<03:12,  1.32it/s]

Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2950701117515564, logits - tensor([[-6.5322,  2.4671, -5.7761, -2.2249],
        [-5.9881,  0.7065, -4.9743, -0.6031],
        [-5.7555, -4.2397,  2.5763, -2.7419],
        [-6.7212,  0.9756, -5.2880, -1.0961],
        [-7.0094,  2.1669, -7.0666, -2.1052],
        [-6.7320,  1.0447, -4.9026, -1.2938],
        [-7.8802,  0.2668, -5.6762,  0.1852],
        [-6.0337, -2.3924, -4.6410,  2.4658]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 36/289 [00:27<03:10,  1.33it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09284782409667969, logits - tensor([[-7.4106, -3.4874, -3.8291,  2.3405],
        [-7.2490,  1.6297, -5.9457, -2.3155],
        [-6.2277,  1.2701, -5.0308, -0.6936],
        [-6.6632,  1.6107, -5.3436, -1.0111],
        [-6.9716,  1.4661, -5.3498, -2.2999],
        [-7.3062, -2.6949, -5.1682,  2.3403],
        [-5.9973,  0.1925, -4.1122, -2.0772],
        [-6.2164, -4.1255,  2.3922, -2.5605]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:27<03:09,  1.33it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22235426306724548, logits - tensor([[-5.7727e+00, -2.8737e+00, -4.9452e+00,  2.7907e+00],
        [-6.9828e+00, -1.2392e+00, -5.3175e+00,  6.5411e-01],
        [-7.0551e+00,  1.1901e+00, -5.4471e+00, -6.9425e-01],
        [-6.3084e+00, -2.2812e+00, -4.7939e+00,  2.4755e+00],
        [-6.6422e+00,  9.7281e-01, -5.4546e+00, -1.1756e+00],
        [-6.8648e+00, -3.9674e-01, -5.2421e+00, -4.3586e-01],
        [-7.1779e+00, -3.2833e-03, -5.1461e+00, -4.7773e-01],
        [-6.4243e+00,  1.6883e+00, -5.4228e+00, -1.6016e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 38/289 [00:28<03:08,  1.33it/s]

Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3027224540710449, logits - tensor([[-7.4650,  2.2586, -6.9085, -1.9340],
        [-4.8691, -2.8208, -3.6980,  3.6460],
        [-6.4223,  0.4032, -4.9291, -2.1745],
        [-6.1354, -1.3490, -4.6079,  1.9702],
        [-6.6651,  1.0984, -5.7826, -1.5764],
        [-4.6385, -3.6174,  2.1137, -3.0318],
        [-5.7353,  1.1203, -4.7092, -1.1689],
        [-6.0601,  1.0112, -4.3882, -1.3276]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 39/289 [00:29<03:08,  1.33it/s]

Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15983884036540985, logits - tensor([[-7.1412,  0.7432, -5.2001, -1.1702],
        [-5.6895, -3.3118, -4.1230,  3.8514],
        [-6.1772,  0.4170, -5.0291, -0.4764],
        [-5.0237, -3.9974,  2.1805, -2.0727],
        [-6.4618, -2.3237, -4.4868,  2.5828],
        [-6.4371,  0.5638, -6.1105, -1.4790],
        [-5.3990,  0.7825, -4.2828, -0.3578],
        [-6.9969,  2.0170, -6.2317, -2.3271]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 40/289 [00:30<03:07,  1.33it/s]

Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1368524581193924, logits - tensor([[-6.8228,  2.4628, -6.2175, -1.4897],
        [-6.2085,  1.3026, -5.6745, -1.5404],
        [-5.7633,  1.0190, -5.7356, -0.9398],
        [-5.6557, -4.9815,  2.9659, -3.0534],
        [-6.7635,  2.0978, -5.4874, -1.3566],
        [-5.4733, -4.3295,  3.0347, -1.8982],
        [-5.7456,  0.0329, -4.2926, -0.0107],
        [-6.2362,  0.6667, -5.2064, -0.3274]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 41/289 [00:30<03:06,  1.33it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20876793563365936, logits - tensor([[-6.5988,  1.5470, -5.4321, -1.5551],
        [-5.7673, -1.9271, -4.6641,  1.9910],
        [-4.9868, -3.4902,  2.6583, -1.9812],
        [-6.3627,  0.7859, -5.8041,  0.0110],
        [-6.4995, -2.9992, -4.0490,  2.0629],
        [-7.0753,  2.1938, -6.1534, -2.9703],
        [-5.5008,  0.9669, -3.8946, -0.6691],
        [-5.8965,  0.2390, -4.1081, -0.7260]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:31<03:05,  1.33it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27217432856559753, logits - tensor([[-6.1863, -4.0171,  2.3576, -2.3467],
        [-6.8920,  1.4258, -5.9911, -1.1699],
        [-6.5240,  1.5079, -5.7208, -2.2562],
        [-5.2385, -3.6251,  1.8097, -3.5865],
        [-5.5227,  1.5151, -5.8365, -2.2505],
        [-6.6180,  1.7467, -5.8249, -2.1604],
        [-6.2987, -3.9317,  2.4696, -2.1917],
        [-6.6136,  1.8525, -5.6354, -1.7269]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:32<03:04,  1.33it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4146767854690552, logits - tensor([[-5.4207, -2.8657,  1.8667, -2.1810],
        [-5.1375, -3.6899,  2.6304, -2.8890],
        [-5.2675,  1.5357, -4.5983, -1.2981],
        [-5.5141,  0.9170, -3.8917, -0.3798],
        [-6.3049, -3.8154,  2.5680, -3.0950],
        [-6.2699, -0.7190, -4.7934, -0.2589],
        [-5.7709, -3.3580,  2.7766, -2.1367],
        [-5.3509, -3.7314,  2.3100, -2.4579]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:33<03:03,  1.34it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06761248409748077, logits - tensor([[-5.5962, -3.9904,  2.2777, -3.0525],
        [-5.8552, -3.7785,  1.7434, -3.1699],
        [-6.0844, -4.0697,  2.8055, -3.2123],
        [-6.3217,  1.0905, -5.5144, -2.4298],
        [-4.7975, -3.6299,  1.8572, -1.9815],
        [-5.2952, -3.5720,  2.5544, -2.3264],
        [-6.8653,  1.8759, -6.1651, -1.8419],
        [-5.3254,  1.3792, -5.6349, -1.3166]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:33<03:02,  1.34it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37842637300491333, logits - tensor([[-6.5290, -3.5118,  2.0616, -2.8536],
        [-5.5105, -3.9334,  1.8434, -2.6137],
        [-6.2400,  1.7418, -4.8885, -1.8881],
        [-5.5356,  2.4873, -4.9150, -1.7045],
        [-5.9961,  2.0644, -5.4587, -1.7441],
        [-5.6865,  1.0060, -4.6384, -1.0681],
        [-5.9788,  0.6782, -4.4003, -0.1422],
        [-6.0745,  2.3378, -5.6431, -1.4218]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 46/289 [00:34<03:02,  1.33it/s]

Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19680926203727722, logits - tensor([[-7.6559,  1.6483, -6.4363, -2.1766],
        [-6.5037,  1.8577, -5.4116, -1.1378],
        [-7.3440,  1.4026, -6.2386, -1.1507],
        [-6.1650, -1.1470, -4.3913,  1.4966],
        [-6.8658,  1.4919, -5.1446, -0.8170],
        [-5.9962,  1.6093, -4.9344, -1.5652],
        [-5.3809, -4.0481,  1.7944, -2.0925],
        [-4.9120, -2.3818, -4.5878,  2.7163]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▋        | 47/289 [00:35<03:01,  1.34it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23086875677108765, logits - tensor([[-5.2567, -3.1152, -4.6557,  3.4136],
        [-7.2449,  1.4401, -5.6420, -0.7550],
        [-6.1142,  1.2000, -4.6543, -1.4165],
        [-6.7115,  0.1849, -4.2276, -0.1631],
        [-4.8954, -3.9755,  3.4457, -3.0032],
        [-7.3877, -0.2117, -5.9981, -0.1488],
        [-6.0287,  1.4050, -4.1333, -0.9837],
        [-6.3964,  2.1544, -5.0277, -2.4193]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:36<03:00,  1.33it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31214120984077454, logits - tensor([[-6.9138,  0.6927, -5.0639, -1.1987],
        [-6.9594,  1.3578, -6.1060, -2.3008],
        [-5.7841, -3.6952,  2.2705, -1.9847],
        [-5.8432, -4.1655,  2.5828, -2.5473],
        [-6.0074,  1.7698, -5.7832, -1.6293],
        [-5.6808,  1.2790, -5.3098, -2.0002],
        [-5.7064, -0.1662, -3.8183,  1.1565],
        [-5.8405,  1.1204, -5.4460, -1.5856]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:36<02:59,  1.33it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2608006000518799, logits - tensor([[-6.0620,  1.3814, -5.4443, -1.2666],
        [-6.8232,  1.9143, -5.9906, -1.3936],
        [-6.7761, -1.0927, -5.0733,  0.3456],
        [-5.4507,  1.2255, -3.9852, -0.7308],
        [-6.2924,  2.0314, -5.3786, -2.1961],
        [-5.5195, -3.9127,  2.7888, -2.4347],
        [-5.4712,  1.8552, -4.9071, -1.7146],
        [-6.7102,  1.0570, -6.4791, -1.3699]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 50/289 [00:37<02:59,  1.33it/s]

Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 18%|█▊        | 51/289 [00:38<02:59,  1.33it/s]

loss - 0.29028013348579407, logits - tensor([[-6.2331, -1.3003, -5.1184,  2.2162],
        [-5.8919,  0.1212, -5.0799, -1.2556],
        [-6.5989,  1.9135, -5.7924, -2.5643],
        [-6.9048,  2.0436, -5.3584, -1.9219],
        [-6.8698, -0.9135, -5.3966,  1.0808],
        [-5.9677,  1.3603, -4.6701, -1.4608],
        [-5.4321, -3.6707, -4.9357,  3.5103],
        [-6.6190,  1.1209, -5.3692, -1.4146]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16726167500019073, logits - tensor([[-6.0977,  1.8088, -5.3014, -1.8888],
        [-6.0723,  1.6908, -5.1698, -1.3870],
        [-6.0201, -1.6986, -4.0648,  1.7076],
        [-6.3827, -0.7098, -4.3326,  1.3007],
        [-5.9507,  0.5984, -4.6442, -0.7768],
        [-6.0975,  1.6611, -5.3334, -1.7822],
        [-6.7599,  0.3275, -5.3794, -0.8533],
        [-6.5954,  1.9308, -5.9717, -1.

 18%|█▊        | 52/289 [00:39<02:58,  1.32it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1654396802186966, logits - tensor([[-7.0289,  1.6972, -6.2987, -2.8141],
        [-6.7469,  1.0430, -5.1298, -1.6784],
        [-4.8262, -2.6990,  1.9269, -1.7000],
        [-5.2448,  0.8526, -3.7657, -0.5537],
        [-6.7293,  1.0780, -5.5020, -0.6642],
        [-6.8949,  1.1261, -5.0337, -1.0856],
        [-7.5838,  1.2357, -6.0165, -2.1254],
        [-6.4753,  1.5821, -5.5640, -2.1660]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:39<02:58,  1.32it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38858386874198914, logits - tensor([[-5.7791,  1.7534, -5.6042, -1.6025],
        [-5.0319, -3.2722,  2.3070, -1.7778],
        [-6.8452,  1.4132, -5.5163, -1.9893],
        [-6.4964,  1.5598, -6.0779, -1.4225],
        [-4.7624,  1.0537, -5.3295, -1.3278],
        [-6.4695, -1.3349, -4.5420,  1.4216],
        [-5.0286,  0.5202, -4.8874, -0.6267],
        [-6.1195,  1.1956, -5.4388, -1.2875]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:40<02:58,  1.32it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11966602504253387, logits - tensor([[-6.2525, -1.2428, -5.8867,  0.7039],
        [-6.7682,  1.6600, -5.7956, -1.6152],
        [-6.8409, -2.0910, -4.3045,  1.4629],
        [-7.4061,  1.0187, -5.8742, -1.3359],
        [-6.7999, -0.1835, -4.4807,  0.6455],
        [-5.4831,  1.6997, -5.1867, -1.4435],
        [-5.8147,  1.8183, -4.8212, -2.2254],
        [-6.9836,  1.9215, -5.6540, -2.3502]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:41<02:57,  1.32it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10422804951667786, logits - tensor([[-5.4346, -4.1058,  2.6672, -2.3706],
        [-5.9430,  0.5234, -5.0340, -0.6891],
        [-4.6457, -3.5499,  2.5391, -3.1991],
        [-6.7056,  1.6943, -5.7750, -1.0265],
        [-5.8306,  1.4275, -5.4013, -1.7005],
        [-6.4239,  1.6353, -4.8474, -1.0338],
        [-6.5037,  1.5465, -6.1476, -1.1064],
        [-6.5148,  1.9665, -6.0550, -1.8603]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:42<02:57,  1.31it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.325619637966156, logits - tensor([[-7.6030,  0.8843, -5.8333, -1.6939],
        [-5.2099,  0.8260, -3.7384, -0.6004],
        [-5.7766, -4.1988,  2.4945, -3.2082],
        [-6.8602,  1.8083, -5.2083, -1.5718],
        [-5.8120,  3.0065, -5.4394, -2.0200],
        [-5.7305,  1.4782, -5.5139, -1.7971],
        [-5.9715,  1.7706, -4.6450, -1.4530],
        [-7.0184,  1.4127, -5.4064, -1.4465]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:43<02:57,  1.31it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17413055896759033, logits - tensor([[-6.2429, -1.0740, -4.4301,  0.8968],
        [-7.0737,  0.8434, -5.4999, -0.8466],
        [-5.7211,  0.9800, -4.7404, -0.7055],
        [-5.5376,  1.3288, -5.5410, -1.5411],
        [-5.8005, -3.9619,  2.3160, -2.3400],
        [-6.6252,  2.1739, -5.8108, -2.4447],
        [-6.1662,  1.3637, -4.9247, -1.7626],
        [-6.5263,  0.8531, -4.7439, -1.5945]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 58/289 [00:43<02:55,  1.31it/s]

Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4291326403617859, logits - tensor([[-5.9188,  0.7652, -5.3332, -0.5646],
        [-6.4814,  1.6325, -5.8211, -1.7397],
        [-6.2829,  1.9422, -6.0399, -1.7578],
        [-5.7945,  0.4935, -4.8774, -0.4346],
        [-5.9664, -3.5387,  1.8960, -2.3748],
        [-5.9891,  1.0539, -4.7151, -2.1072],
        [-6.4480,  1.7394, -5.8714, -2.2000],
        [-6.6359,  0.5517, -4.2000, -0.3487]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 59/289 [00:44<02:54,  1.32it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10469512641429901, logits - tensor([[-5.2982, -3.4396,  2.7736, -2.7979],
        [-5.8688, -3.3990,  2.5289, -2.1834],
        [-6.8407,  1.4880, -5.2162, -0.9429],
        [-6.3317, -0.4713, -5.1446,  0.3024],
        [-6.6343,  1.5152, -4.8231, -1.8724],
        [-5.2236, -3.9073,  1.6373, -2.8493],
        [-6.2143, -2.7279, -4.3781,  2.1109],
        [-5.3198,  1.0107, -4.9814, -1.1911]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:45<02:53,  1.32it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3547474145889282, logits - tensor([[-6.0134, -0.6550, -4.4072,  1.5568],
        [-6.0019,  1.4037, -5.1889, -1.5546],
        [-5.4971,  2.1147, -5.8250, -1.2930],
        [-5.0082, -3.5300,  2.6514, -3.0395],
        [-5.1548, -3.8393,  1.9795, -2.9375],
        [-5.6434, -3.6067,  2.2792, -3.0956],
        [-6.2845, -0.6021, -5.0525,  0.1376],
        [-5.1965, -3.7192,  2.1371, -2.2119]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:46<02:52,  1.32it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1666191816329956, logits - tensor([[-6.8188,  1.7047, -5.4174, -1.5341],
        [-6.6194,  0.9237, -4.7687, -0.5582],
        [-6.9181,  1.9907, -6.1185, -1.9936],
        [-7.9048,  1.7382, -6.4201, -2.8744],
        [-6.2360,  0.1707, -4.8926, -0.1147],
        [-6.4495,  1.5889, -5.5306, -1.3941],
        [-6.6335,  1.9797, -5.9268, -1.3428],
        [-7.0166,  2.5023, -6.9194, -2.3569]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██▏       | 62/289 [00:46<02:51,  1.33it/s]

Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.281078964471817, logits - tensor([[-6.5601,  0.8592, -6.4854, -0.9441],
        [-6.0272, -4.7687,  2.9769, -2.9656],
        [-5.3217,  0.6787, -3.8445, -1.3326],
        [-7.2906,  1.3652, -6.0056, -0.2687],
        [-6.3052,  1.2928, -5.9658, -1.2622],
        [-6.2818,  1.7276, -5.1925, -1.0630],
        [-6.7935,  0.6729, -4.6829, -0.8822],
        [-5.7186, -3.8481,  2.2273, -2.6416]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 63/289 [00:47<02:50,  1.33it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5021809935569763, logits - tensor([[-6.6771,  1.8674, -6.1504, -1.4545],
        [-6.6787, -3.6015, -5.3958,  3.7589],
        [-5.5039,  0.9465, -5.3918, -0.7723],
        [-5.8205, -4.3485,  2.3429, -2.5782],
        [-6.7830,  0.4411, -5.4901, -1.0934],
        [-6.1011,  1.7380, -5.1278, -1.0233],
        [-6.4683,  1.7193, -6.5689, -2.0679],
        [-6.2213, -0.7454, -3.9937,  0.1979]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:48<02:49,  1.33it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21929331123828888, logits - tensor([[-6.2171, -3.5392,  2.4415, -2.7615],
        [-5.5844,  1.4122, -4.8490, -1.3756],
        [-5.9085,  1.6820, -5.4033, -2.0324],
        [-5.7445,  1.3906, -5.9831, -1.0101],
        [-5.3453, -3.4210, -4.3189,  2.1947],
        [-6.9028,  0.7988, -5.2255, -0.6271],
        [-6.3007,  1.6201, -5.6110, -1.7569],
        [-7.1274,  0.2697, -5.8782, -1.0776]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:49<02:48,  1.33it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1466849148273468, logits - tensor([[-6.3835,  0.8887, -4.6401, -1.5057],
        [-5.0873,  0.8857, -4.2196, -1.7306],
        [-5.9586,  1.2807, -5.3361, -1.5682],
        [-6.5755,  2.1043, -5.6994, -2.2386],
        [-5.2010, -3.0576, -4.5858,  2.7609],
        [-5.6587,  1.3370, -5.0483, -1.4820],
        [-7.2494, -0.3133, -4.4772, -0.0432],
        [-5.6733,  0.7405, -4.6753, -0.6311]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 66/289 [00:49<02:47,  1.33it/s]

Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2080337256193161, logits - tensor([[-5.3050,  1.3277, -4.9199, -1.2349],
        [-6.3716, -1.4303, -4.0802,  0.6875],
        [-5.1778,  1.5968, -5.6527, -0.9462],
        [-7.1851,  0.8898, -5.6616, -1.1950],
        [-5.8352,  1.3967, -4.6959, -1.2705],
        [-6.1231,  1.9579, -4.6934, -0.9748],
        [-5.0016, -3.4442,  2.4637, -3.1686],
        [-5.4558, -3.4546,  1.4879, -2.2203]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 67/289 [00:50<02:46,  1.33it/s]

Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14616167545318604, logits - tensor([[-5.8853,  1.2596, -4.9184, -0.6540],
        [-5.9172,  2.8674, -6.2014, -1.8622],
        [-6.7341, -0.3037, -4.8143, -0.0399],
        [-7.0629,  1.0926, -4.8949, -0.9293],
        [-6.0737,  2.1983, -6.5970, -1.0442],
        [-5.5344,  0.7299, -4.2133, -0.9954],
        [-7.1330,  2.0593, -6.2879, -1.1511],
        [-5.8137,  1.9317, -5.0160, -1.4580]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▎       | 68/289 [00:51<02:46,  1.33it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22097712755203247, logits - tensor([[-5.9615,  0.7217, -4.7854, -1.6626],
        [-6.4713,  0.7620, -5.6402, -1.4463],
        [-6.4956,  1.2410, -5.1044, -0.6978],
        [-6.3968, -2.7962, -4.1868,  1.5783],
        [-6.6820,  1.5177, -6.2146, -1.1064],
        [-6.4151,  2.5987, -5.9252, -1.5923],
        [-5.4122,  1.6918, -4.9059, -1.8424],
        [-5.8164,  1.8502, -5.3069, -1.5208]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:52<02:45,  1.33it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44102317094802856, logits - tensor([[-6.4817,  1.3294, -4.9333, -1.2482],
        [-5.8371,  1.1478, -5.0078, -1.3071],
        [-5.0368, -3.2354, -4.1898,  3.5739],
        [-6.5627,  2.7159, -5.7264, -2.4800],
        [-6.3531, -0.7497, -5.0305,  0.7399],
        [-7.4245,  0.8278, -6.2471,  0.4807],
        [-6.2888,  1.3780, -6.2683, -2.1242],
        [-6.7364,  0.6374, -5.0803, -0.8385]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:52<02:45,  1.33it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22029420733451843, logits - tensor([[-6.7855,  2.0313, -5.7009, -1.5435],
        [-5.3406,  0.2956, -3.6393, -0.9822],
        [-7.3312,  1.7453, -6.6170, -1.5584],
        [-6.9658,  1.7121, -5.0514, -1.8835],
        [-6.2168,  1.5301, -4.8450, -1.7711],
        [-5.8782,  1.4499, -5.6173, -1.6046],
        [-6.5565, -1.5638, -4.0493,  1.2291],
        [-6.6513,  1.8439, -6.3160, -1.5026]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:53<02:43,  1.33it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14978286623954773, logits - tensor([[-6.7087,  2.0188, -5.7827, -0.7446],
        [-6.6265,  1.7647, -5.5685, -1.3190],
        [-5.5943, -3.0146, -4.7627,  2.4997],
        [-5.6572,  0.4945, -4.6398, -1.0281],
        [-6.5857, -0.0088, -5.4187,  0.3935],
        [-6.4871,  1.5016, -5.3370, -1.3617],
        [-5.7436,  1.2766, -5.0340, -1.0101],
        [-6.2430,  1.9984, -4.7227, -1.5702]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:54<02:43,  1.33it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2000892609357834, logits - tensor([[-7.0020, -2.4007, -2.7475,  0.5724],
        [-5.2996,  1.3997, -5.1893, -0.8711],
        [-6.2116,  1.5278, -4.7574, -0.7144],
        [-6.7720,  1.7603, -5.9585, -1.6856],
        [-6.9995, -1.1569, -6.2305,  1.5657],
        [-6.0100,  2.9512, -5.6934, -1.5399],
        [-6.4748,  1.0683, -5.2909, -1.0325],
        [-5.5564, -3.0736,  2.0529, -2.6068]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:55<02:42,  1.33it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15996913611888885, logits - tensor([[-6.5895, -3.8676,  1.8284, -2.2409],
        [-6.0132, -0.3526, -4.4683,  0.4359],
        [-6.6733,  2.2144, -6.3662, -2.2156],
        [-7.0276,  0.7623, -4.9189, -1.1607],
        [-5.7678, -3.3988,  2.2390, -3.0874],
        [-5.5622,  1.2856, -5.1220, -2.1531],
        [-5.9501,  2.3899, -5.0491, -2.0256],
        [-6.9729, -1.9858, -5.0480,  2.2212]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 74/289 [00:55<02:42,  1.32it/s]

Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16178032755851746, logits - tensor([[-6.6482,  1.8366, -5.6898, -2.1139],
        [-5.9141, -3.3437,  1.4943, -2.0269],
        [-6.2558,  1.6351, -5.0460, -1.6728],
        [-5.6752,  0.6159, -4.6387, -1.9983],
        [-5.9575,  2.7364, -6.0103, -2.2010],
        [-6.5081,  0.9855, -5.6244, -1.7950],
        [-7.2514,  1.9913, -6.0286, -2.8699],
        [-5.7785, -2.2768, -5.3988,  2.7522]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 75/289 [00:56<02:41,  1.32it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.051793940365314484, logits - tensor([[-7.0213,  2.1523, -6.5815, -2.9192],
        [-5.4847, -2.9793, -4.8474,  2.4478],
        [-5.8561,  2.1435, -6.5525, -2.0917],
        [-5.9397,  2.0406, -5.4737, -1.5837],
        [-7.2642,  2.2465, -6.3155, -2.4155],
        [-6.0453,  2.8618, -6.0781, -2.1998],
        [-6.5291,  1.1414, -5.4568, -2.3802],
        [-5.5245, -3.3822, -4.8841,  3.4692]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:57<02:41,  1.32it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44714322686195374, logits - tensor([[-6.6522,  1.4457, -5.9369, -1.9656],
        [-7.5666,  1.1674, -6.3969, -1.4709],
        [-5.7468,  2.0750, -5.3571, -2.0868],
        [-5.8817, -3.2997, -4.7832,  3.7216],
        [-5.7691, -0.4222, -4.2080,  1.2958],
        [-6.6241,  1.5701, -6.0793, -1.8596],
        [-6.4619,  2.0567, -5.4904, -2.0317],
        [-5.6893,  2.8512, -6.4791, -2.3103]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 77/289 [00:58<02:40,  1.32it/s]

Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24687671661376953, logits - tensor([[-6.2366,  1.2221, -5.2810, -2.2451],
        [-6.5600,  1.1252, -4.9370, -2.0699],
        [-5.8200,  0.7921, -5.2585, -1.2998],
        [-5.5266, -3.4526,  1.9260, -2.8417],
        [-5.5393,  2.3596, -5.4741, -2.7772],
        [-5.4942,  1.2514, -4.9508, -1.8299],
        [-5.8388,  2.3583, -6.0843, -1.6935],
        [-6.5372,  2.5229, -5.6977, -2.6021]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 78/289 [00:58<02:40,  1.32it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13723605871200562, logits - tensor([[-5.9727,  0.1929, -3.9242, -0.7008],
        [-5.4216, -3.0139,  1.7308, -3.2620],
        [-5.3270,  1.5824, -5.2113, -1.7391],
        [-6.0890,  1.3264, -5.6388, -1.7862],
        [-5.1179,  0.5779, -5.0141, -1.9747],
        [-7.2603,  1.7114, -6.0027, -1.4991],
        [-6.0646,  1.7023, -5.3079, -2.4401],
        [-6.2861,  2.4131, -5.4582, -1.7899]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [00:59<02:39,  1.32it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2777705192565918, logits - tensor([[-6.0733,  2.0794, -5.9606, -1.1679],
        [-6.4269,  1.9033, -6.2438, -1.6447],
        [-6.0776,  2.0379, -6.2977, -2.5277],
        [-7.2316,  1.8619, -5.7685, -2.1779],
        [-5.8487,  1.4348, -5.7454, -1.7505],
        [-5.8933,  1.4378, -5.6888, -1.2093],
        [-6.5423,  1.4405, -5.3877, -1.6191],
        [-5.6757,  1.9590, -6.0839, -2.0162]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [01:00<02:37,  1.32it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07382185012102127, logits - tensor([[-6.0782,  0.8488, -5.4882, -1.2365],
        [-5.1919, -3.1373,  2.6878, -3.4309],
        [-6.8483,  2.0359, -5.8979, -2.4589],
        [-6.5122,  1.3996, -5.7434, -1.4950],
        [-7.4953, -4.3924,  2.6570, -3.0006],
        [-6.2227,  1.9683, -6.5046, -2.3293],
        [-6.9007,  1.4530, -5.6179, -1.7793],
        [-5.9812,  2.0742, -5.6599, -2.1376]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [01:01<02:36,  1.33it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23268181085586548, logits - tensor([[-6.2992,  1.7081, -5.9383, -1.5458],
        [-7.3136,  1.8795, -6.2604, -1.8159],
        [-6.3291,  1.8488, -6.0763, -1.4989],
        [-6.0572,  1.9708, -5.4843, -2.8188],
        [-5.6628,  1.9016, -5.3446, -1.6594],
        [-5.4041,  1.4262, -5.4740, -1.6390],
        [-6.1539,  1.7334, -5.2524, -1.7841],
        [-5.8516,  1.9475, -4.8250, -1.2259]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:01<02:35,  1.33it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0805811807513237, logits - tensor([[-6.0194,  0.9417, -5.4057, -1.6008],
        [-6.3485,  1.3006, -5.3177, -1.5425],
        [-5.5278, -3.0706,  1.9681, -2.2645],
        [-6.3220, -2.0890, -5.6202,  3.2617],
        [-6.1409, -3.6052, -4.8393,  3.3939],
        [-6.2090, -3.0082,  2.2037, -2.1610],
        [-6.4170,  1.8313, -5.9527, -1.2419],
        [-6.6065,  1.8876, -4.9649, -1.0936]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▊       | 83/289 [01:02<02:34,  1.33it/s]

Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10105651617050171, logits - tensor([[-6.4593,  1.5832, -6.0936, -1.2047],
        [-5.1844,  1.1842, -4.4341, -1.3480],
        [-6.5628,  2.4820, -6.2570, -2.3125],
        [-5.1999,  0.8197, -4.5941, -1.4666],
        [-6.0336, -3.1507,  2.7017, -2.2164],
        [-5.9692, -3.6037,  1.9626, -2.9348],
        [-4.7330, -2.4811,  1.6451, -2.3399],
        [-5.6449,  0.9504, -5.1079, -0.7707]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 84/289 [01:03<02:34,  1.33it/s]

Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15957729518413544, logits - tensor([[-6.6208,  1.4565, -6.0795, -1.1403],
        [-6.3839,  1.4370, -5.7464, -2.1056],
        [-6.1619,  1.0585, -5.3169, -0.7231],
        [-5.6111,  0.6024, -4.5505, -0.3710],
        [-5.9414, -3.5821,  2.1069, -3.2821],
        [-5.4798,  0.8331, -5.6150, -1.2721],
        [-6.0655, -1.1316, -5.2637,  1.1599],
        [-6.2888,  1.8949, -5.6336, -1.9675]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 85/289 [01:04<02:33,  1.33it/s]

Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1728738248348236, logits - tensor([[-7.3576, -0.5373, -5.8727,  0.9120],
        [-6.1686,  1.6441, -5.6157, -1.5908],
        [-5.0751, -3.2796,  2.0575, -3.1132],
        [-6.0823,  1.7257, -5.4096, -2.2950],
        [-7.0535,  1.6535, -5.9622, -1.3485],
        [-5.4230,  2.4823, -5.9614, -2.2733],
        [-5.3664,  1.4499, -5.9163, -1.8304],
        [-6.5076,  1.2853, -5.2570, -1.1059]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|██▉       | 86/289 [01:04<02:33,  1.33it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30817538499832153, logits - tensor([[-5.7421,  1.7002, -5.0927, -1.6931],
        [-6.9013,  0.8014, -5.8865, -0.8187],
        [-6.0501,  2.2406, -5.4806, -2.6615],
        [-6.9785,  1.2469, -5.2438, -2.0084],
        [-6.4691,  0.7465, -5.0765, -0.8376],
        [-7.3626, -1.0824, -5.2362,  1.7475],
        [-5.5006,  2.0152, -5.7250, -1.7900],
        [-6.3923,  1.3377, -5.2366, -1.2486]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 87/289 [01:05<02:32,  1.33it/s]

Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06752590835094452, logits - tensor([[-7.1043,  1.5434, -5.7652, -1.9643],
        [-5.6246,  0.9064, -5.6572, -1.7146],
        [-5.1799,  2.0644, -5.4844, -2.4237],
        [-6.8574,  1.3159, -6.3098, -2.2392],
        [-6.0249,  2.2978, -5.1828, -2.4545],
        [-6.9739,  2.2448, -6.5791, -1.9879],
        [-6.4414,  1.6648, -5.8774, -2.5560],
        [-7.0438,  3.3332, -6.3700, -2.8441]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 88/289 [01:06<02:31,  1.33it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08553672581911087, logits - tensor([[-6.0841,  2.6999, -5.5789, -1.6559],
        [-5.0835, -3.4307, -5.9654,  2.3597],
        [-6.4636, -3.4956,  2.0954, -3.3299],
        [-6.1054, -3.1763,  1.8403, -2.2448],
        [-5.9352,  2.0411, -5.3895, -1.4961],
        [-7.3167,  2.3231, -6.3691, -2.4625],
        [-6.6770, -0.3897, -4.4974,  0.1255],
        [-5.1329, -3.4260,  1.9995, -2.9529]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:07<02:31,  1.32it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06430042535066605, logits - tensor([[-7.2433,  2.1952, -6.3201, -2.4414],
        [-6.1456, -4.1202,  2.4321, -3.0005],
        [-6.5981, -4.1918, -4.8140,  3.8770],
        [-6.8378, -3.4299,  2.5781, -2.9624],
        [-6.1265, -0.6310, -4.9927,  0.8534],
        [-5.1616, -3.2742,  2.1581, -2.5660],
        [-6.6736, -1.9268, -4.9358,  1.5614],
        [-6.8983,  3.3729, -6.6676, -2.0072]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:07<02:30,  1.32it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1764705777168274, logits - tensor([[-6.8208,  2.1583, -6.0791, -2.5431],
        [-6.5101,  2.0520, -5.2561, -1.8733],
        [-6.0696,  1.8107, -4.8508, -2.1502],
        [-5.4209, -2.5061, -5.1319,  3.0323],
        [-6.0053, -3.2218,  2.1223, -2.1048],
        [-5.8940, -3.1610, -4.2487,  4.4674],
        [-6.3124, -3.8469,  2.6711, -3.3891],
        [-6.4233, -3.0576, -5.4694,  3.2160]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:08<02:29,  1.33it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24883972108364105, logits - tensor([[-7.7371,  0.2840, -6.1247, -0.3041],
        [-7.8146, -0.3686, -5.7167,  0.7323],
        [-6.5307,  1.8785, -4.9065, -1.9015],
        [-6.2641,  2.0826, -6.5636, -1.6468],
        [-6.4575, -2.7662, -5.3058,  2.6473],
        [-5.1311,  1.7893, -5.0091, -1.4000],
        [-5.8672, -3.9684,  1.7340, -2.8913],
        [-6.7556,  2.2291, -6.3375, -2.2271]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:09<02:28,  1.33it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1897147297859192, logits - tensor([[-6.7807,  2.3422, -6.5933, -2.4984],
        [-5.9536, -3.4237,  2.4329, -2.3228],
        [-6.2157,  1.5292, -5.7257, -1.7260],
        [-7.2838,  2.7708, -6.3103, -1.6675],
        [-6.7398,  2.8401, -6.1025, -2.3623],
        [-6.2141, -3.7953, -5.1882,  2.3377],
        [-5.4413, -3.8061,  2.3674, -2.1753],
        [-5.9959, -3.4498,  2.5943, -3.0952]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 93/289 [01:10<02:27,  1.33it/s]

Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1979409158229828, logits - tensor([[-5.5864, -3.9270,  2.6181, -2.9490],
        [-7.3763, -0.6362, -6.0551, -0.3017],
        [-7.0056,  2.4023, -5.8957, -2.7264],
        [-5.9641, -2.1180, -4.6034,  3.2539],
        [-6.4798,  1.8066, -5.5433, -2.2423],
        [-6.4735,  0.8208, -4.3850, -1.2472],
        [-7.1252,  0.1606, -5.7950, -0.2260],
        [-6.6841,  2.9077, -6.3051, -3.3719]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 94/289 [01:10<02:28,  1.32it/s]

Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1437913477420807, logits - tensor([[-6.9913,  1.0704, -4.9184, -1.1133],
        [-6.0047,  1.7516, -5.5014, -1.9925],
        [-6.4884,  1.5908, -6.1430, -1.3894],
        [-7.1244,  2.3284, -5.7751, -2.0555],
        [-6.4831,  1.3847, -5.0775, -1.3979],
        [-6.2756, -3.2523, -5.0035,  2.9846],
        [-6.2824, -3.3997,  2.3650, -2.9046],
        [-5.6083, -3.0374,  2.5078, -2.9154]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 95/289 [01:11<02:27,  1.31it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20972678065299988, logits - tensor([[-6.2888,  1.4864, -5.0806, -1.3501],
        [-5.2143, -2.2201, -4.4964,  2.3212],
        [-8.1928,  1.3049, -5.6401, -0.2962],
        [-6.4714,  1.3580, -5.9131, -0.8819],
        [-5.7117,  1.6756, -5.2395, -1.2639],
        [-5.6031, -3.8285,  2.4611, -2.9356],
        [-6.9122, -0.3561, -5.3209,  0.7623],
        [-6.0889,  2.2249, -5.5197, -0.9028]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:12<02:26,  1.31it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2455860674381256, logits - tensor([[-4.7128, -3.5540,  2.6859, -3.0631],
        [-6.5329,  2.5577, -5.7535, -2.1447],
        [-5.7224,  2.4331, -5.2607, -1.4376],
        [-5.1517, -1.2160, -3.4111,  1.3435],
        [-7.0743,  2.6124, -5.6397, -1.9895],
        [-6.3459,  1.7017, -5.6783, -1.6370],
        [-7.1135,  3.1494, -6.7412, -2.4107],
        [-6.5810,  2.6496, -6.3355, -3.2415]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▎      | 97/289 [01:13<02:26,  1.31it/s]

Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2005673348903656, logits - tensor([[-6.2941,  2.7527, -4.9287, -2.0511],
        [-6.1865,  1.7712, -5.8445, -2.9974],
        [-5.3056,  1.7297, -4.8122, -1.7441],
        [-5.6831, -3.7573, -4.9797,  3.0268],
        [-6.4587,  2.2387, -6.2015, -2.6631],
        [-6.8962, -0.0698, -5.2715,  0.1264],
        [-5.2637, -3.1866,  1.8566, -2.9622],
        [-5.5864, -3.4508,  2.3238, -2.3779]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 98/289 [01:14<02:25,  1.31it/s]

Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0809563547372818, logits - tensor([[-5.3685,  2.8608, -4.9963, -2.0439],
        [-7.0529,  1.6956, -5.6348, -1.7417],
        [-7.9068,  1.3447, -6.5691, -0.9879],
        [-7.3185,  1.0137, -6.3679, -0.6949],
        [-5.8378, -3.1966, -4.9820,  2.6842],
        [-5.2934, -3.3933,  1.9044, -3.3985],
        [-6.1702,  1.9096, -5.5037, -1.5787],
        [-6.1431, -4.3505,  2.3519, -3.2689]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 99/289 [01:14<02:24,  1.32it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4479885697364807, logits - tensor([[-5.4497, -3.4697,  2.2618, -2.8707],
        [-6.7563,  0.1730, -4.2391, -1.0090],
        [-6.6350, -1.2842, -5.5530,  0.7468],
        [-6.9977,  1.9732, -6.7272, -2.6268],
        [-7.2697,  2.3612, -6.2898, -1.7641],
        [-6.7259, -4.0637,  1.8101, -3.5138],
        [-6.8675,  1.5914, -5.8231, -1.6362],
        [-5.5718,  2.1759, -5.9711, -1.7673]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:15<02:23,  1.32it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19123941659927368, logits - tensor([[-6.6281,  2.5438, -5.4108, -1.9033],
        [-5.9512,  1.7832, -5.5762, -1.8071],
        [-6.2418,  1.8859, -5.4860, -1.8869],
        [-6.9621,  2.2623, -5.8197, -2.7297],
        [-5.2178,  1.9257, -5.4389, -2.0998],
        [-7.0887,  0.8035, -5.8450, -0.9685],
        [-7.2163,  1.8907, -6.4635, -1.5563],
        [-7.4417,  2.0897, -6.0351, -1.9760]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:16<02:22,  1.32it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0614813007414341, logits - tensor([[-4.2452, -3.1385,  2.6867, -2.9827],
        [-5.9874,  1.7817, -5.0578, -1.2858],
        [-6.4376,  2.0918, -5.4256, -2.1086],
        [-6.1916,  2.2940, -5.5382, -2.3319],
        [-6.5089,  1.2767, -5.1449, -1.7526],
        [-5.7223,  2.2414, -4.7033, -2.5195],
        [-5.8488,  2.2952, -5.1281, -2.0314],
        [-5.9962, -3.4058,  2.8721, -3.3923]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:17<02:21,  1.32it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1688528209924698, logits - tensor([[-5.2772, -3.7274,  2.1074, -3.2623],
        [-5.6930,  2.6855, -5.9167, -2.4555],
        [-5.9597, -3.6249,  2.8595, -2.7999],
        [-7.0051,  3.6733, -6.6871, -1.9513],
        [-6.1572,  1.4994, -4.5617, -1.7918],
        [-6.1671,  1.5113, -5.8070, -2.1454],
        [-6.3084, -1.7930, -4.5882,  1.6627],
        [-7.5335,  1.5556, -6.0057, -2.3507]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:17<02:20,  1.33it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06037747114896774, logits - tensor([[-6.7891,  2.5816, -6.3844, -2.7970],
        [-6.1950,  1.9956, -6.0089, -2.0737],
        [-5.2942,  1.8037, -4.5797, -2.1039],
        [-5.7023,  2.5258, -5.9952, -2.7649],
        [-4.8790, -3.4370, -5.2545,  3.3696],
        [-5.2079, -3.4384,  1.9682, -3.0877],
        [-6.3811,  2.2020, -5.8511, -1.4566],
        [-8.5917,  1.8942, -6.9860, -0.8059]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:18<02:19,  1.33it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4362030625343323, logits - tensor([[-6.0720,  1.9084, -6.1070, -2.8503],
        [-6.7673,  1.0074, -5.2471, -1.8278],
        [-5.9683,  3.0283, -6.5069, -1.5102],
        [-6.4297,  1.5342, -4.6015, -1.7152],
        [-6.3601,  2.3076, -6.4202, -1.8664],
        [-6.8350,  1.7396, -6.0875, -1.7625],
        [-5.4648,  2.3635, -4.9948, -2.1959],
        [-7.3087,  2.3687, -6.4869, -1.8705]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▋      | 105/289 [01:19<02:18,  1.33it/s]

Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.050035618245601654, logits - tensor([[-6.3188,  2.7197, -6.4600, -2.8062],
        [-6.7637, -1.4472, -4.8704,  1.7301],
        [-7.0343,  2.3923, -6.5849, -2.8387],
        [-6.3924, -4.3194,  2.4085, -3.2757],
        [-6.7800,  1.7164, -5.7208, -1.7707],
        [-5.0000,  2.5178, -5.3077, -2.0715],
        [-6.8038,  2.6447, -6.7555, -2.4182],
        [-6.8642,  2.8864, -6.6234, -2.7882]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 106/289 [01:20<02:18,  1.33it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0686139389872551, logits - tensor([[-5.7109, -3.4683,  1.9419, -2.2447],
        [-7.8251, -2.5964, -2.4895,  0.1901],
        [-7.0283,  1.7163, -6.4753, -2.6133],
        [-6.4049,  1.8955, -5.8595, -2.3673],
        [-7.3852,  2.2071, -6.3889, -3.4281],
        [-7.9242,  2.8229, -6.7447, -2.5032],
        [-7.4191,  3.6031, -6.5365, -2.6665],
        [-5.9177,  2.1692, -5.3392, -1.4460]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:20<02:16,  1.33it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08088831603527069, logits - tensor([[-5.5346,  1.9478, -5.9883, -2.1645],
        [-4.6803, -2.4497, -4.1968,  2.7332],
        [-5.5231,  1.2029, -4.6176, -2.2321],
        [-6.4332,  1.8739, -6.5867, -2.5674],
        [-6.2481,  2.2093, -5.8905, -2.6345],
        [-6.2330,  1.4636, -4.9176, -1.6406],
        [-6.1159, -1.1965, -4.9101,  0.8970],
        [-6.6385, -2.3312, -5.3815,  1.1098]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 108/289 [01:21<02:16,  1.33it/s]

Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.405608594417572, logits - tensor([[-5.5149, -2.5773, -4.8485,  3.0744],
        [-6.6802,  1.8685, -6.4026, -2.2219],
        [-6.2051,  1.9799, -5.9053, -2.2876],
        [-5.9561, -3.7521,  2.7185, -3.1197],
        [-7.2954,  1.6572, -5.5863, -1.8273],
        [-6.1098, -4.2061,  2.7941, -3.4602],
        [-6.6129,  1.3932, -6.0321, -1.5348],
        [-7.1533,  2.2162, -6.5265, -2.7414]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 109/289 [01:22<02:15,  1.33it/s]

Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3450567126274109, logits - tensor([[-5.6818, -3.7409,  1.7270, -3.1487],
        [-6.6741,  1.9186, -6.3382, -2.3246],
        [-4.6537, -3.7463,  2.3967, -2.5851],
        [-4.9368, -3.6991,  2.3969, -3.1811],
        [-6.5075,  2.5266, -5.5087, -2.6998],
        [-6.0164,  1.4909, -4.3186, -0.8070],
        [-6.8692,  2.9838, -5.8240, -2.7411],
        [-6.2179,  2.2410, -4.6922, -1.8813]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 110/289 [01:23<02:15,  1.32it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2548092007637024, logits - tensor([[-5.8057, -3.6489,  3.1888, -2.4453],
        [-6.7811, -1.4912, -5.8477,  1.2881],
        [-5.9173, -3.6845,  2.6091, -2.2986],
        [-7.3894,  2.5537, -6.3377, -2.1957],
        [-6.1009,  1.7270, -5.7796, -2.7106],
        [-5.9555,  0.9589, -4.5166, -2.1185],
        [-6.3115, -1.0269, -4.3043,  1.3937],
        [-6.4444, -1.8738, -5.3895,  1.7308]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:23<02:15,  1.32it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04368528351187706, logits - tensor([[-6.3800,  2.1581, -5.7525, -2.3687],
        [-5.7919, -3.2815,  2.2313, -2.9028],
        [-7.2640,  3.0118, -6.5673, -3.3220],
        [-6.8502,  2.7720, -5.9233, -1.7239],
        [-6.2951, -4.3851,  3.0556, -3.3962],
        [-5.9233,  1.5725, -5.9005, -2.0815],
        [-5.6909, -2.9147, -4.6937,  3.0540],
        [-5.3403,  2.3193, -5.5074, -2.6842]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:24<02:14,  1.32it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22412793338298798, logits - tensor([[-5.9592, -3.5170,  2.0153, -2.5030],
        [-7.1475,  1.5529, -5.5365, -2.2610],
        [-6.3932, -1.4104, -1.2361, -0.6259],
        [-7.5926,  2.3040, -6.4110, -1.7578],
        [-5.8745,  1.9814, -5.2140, -1.5303],
        [-6.0504, -2.8498, -4.4680,  2.4440],
        [-6.1233,  0.8645, -4.5849, -1.3775],
        [-5.2389,  1.9493, -5.1716, -2.0281]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:25<02:13,  1.32it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.47615140676498413, logits - tensor([[-7.4771,  3.0248, -6.3405, -2.3712],
        [-5.5169, -3.8722,  3.1439, -2.5448],
        [-5.9586,  1.6210, -5.0457, -1.4383],
        [-7.1908,  2.5648, -5.4902, -1.1987],
        [-5.2840, -3.1320,  1.3587, -2.2409],
        [-5.6386,  2.3488, -5.2131, -2.7723],
        [-6.6097,  2.4278, -6.2354, -2.1502],
        [-6.3550,  2.2361, -6.6708, -2.2890]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:26<02:13,  1.31it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07001761347055435, logits - tensor([[-7.7546,  1.5888, -6.5066, -1.4574],
        [-7.1788,  1.9939, -5.4946, -2.4280],
        [-5.7724, -2.3314, -4.6954,  3.1783],
        [-6.0459, -0.6988, -4.2943,  0.7704],
        [-5.5498, -4.0503,  3.5688, -3.0530],
        [-6.9733,  1.9702, -6.2901, -2.1290],
        [-6.3660,  2.8665, -6.0678, -2.0017],
        [-6.0308, -3.9607,  2.5739, -2.7964]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:26<02:13,  1.30it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21307644248008728, logits - tensor([[-7.2919,  2.2061, -6.5867, -1.7962],
        [-5.0980, -2.9397,  2.4422, -2.4579],
        [-6.8176,  2.6709, -5.9237, -2.1038],
        [-6.9958,  2.5918, -6.6325, -1.8766],
        [-6.1033,  2.4795, -5.4892, -2.6884],
        [-6.0662,  2.4246, -5.4493, -1.8931],
        [-5.8867,  1.2533, -4.9876, -1.6005],
        [-6.7181,  2.2706, -5.6487, -2.2457]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:27<02:12,  1.30it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1758219599723816, logits - tensor([[-6.1230,  1.5667, -5.7642, -1.6477],
        [-6.6960,  1.4373, -5.7962, -2.4461],
        [-7.2975,  1.7581, -5.8899, -1.7502],
        [-6.8899,  2.3601, -6.3883, -2.6022],
        [-5.7663, -2.6890, -4.8703,  3.2528],
        [-6.2534,  1.6040, -5.2382, -0.9100],
        [-6.0534,  2.6104, -6.0999, -2.4960],
        [-6.8492,  2.4023, -5.6270, -2.6185]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:28<02:11,  1.30it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07014322280883789, logits - tensor([[-5.8440,  2.8210, -5.3330, -2.2935],
        [-5.7371, -3.5889,  2.3097, -2.0751],
        [-6.2202, -2.6960,  1.3300, -1.3857],
        [-6.6816,  2.4789, -6.0861, -2.5477],
        [-6.8951, -3.8277,  2.1113, -2.5185],
        [-6.0480,  2.3868, -4.8930, -2.0216],
        [-6.7271,  1.9729, -6.2824, -2.2362],
        [-4.9937, -2.3721,  1.6989, -1.4034]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:29<02:10,  1.31it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17351999878883362, logits - tensor([[-6.3616,  2.2344, -6.3692, -2.5442],
        [-6.1535,  2.0665, -5.6478, -2.0650],
        [-6.9186,  1.0948, -5.5530, -1.0546],
        [-4.7763, -2.7165, -4.5999,  3.6857],
        [-6.7052,  0.9911, -5.7641, -2.3195],
        [-6.5951,  2.0635, -6.0681, -1.3196],
        [-6.9872,  2.5647, -5.2858, -2.8767],
        [-6.7100,  0.2498, -5.6582,  0.1836]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:29<02:09,  1.31it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3766106963157654, logits - tensor([[-6.6655,  2.3521, -5.3261, -1.7446],
        [-7.1065,  2.6265, -6.9791, -2.6373],
        [-8.2423,  2.4612, -5.4650, -3.0138],
        [-6.7639,  1.3537, -6.3353, -2.4468],
        [-6.1669, -3.9298,  2.7941, -2.6101],
        [-4.9669,  2.1242, -5.2536, -2.9949],
        [-6.5829, -4.5171,  2.0213, -2.9422],
        [-5.8664, -2.1456, -5.1756,  1.9633]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:30<02:08,  1.31it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07220792770385742, logits - tensor([[-5.0381,  1.8981, -4.3108, -1.6734],
        [-6.6272,  0.3828, -5.0091, -1.2467],
        [-5.9479, -3.7850,  2.2651, -2.3448],
        [-6.0002,  2.4298, -5.2376, -1.9931],
        [-5.9911, -1.7322, -4.4963,  1.5639],
        [-6.2471,  2.3914, -5.6134, -3.1375],
        [-5.7220,  2.6952, -5.7274, -2.2745],
        [-4.6318, -3.1704, -4.8168,  3.4071]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:31<02:08,  1.31it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23456957936286926, logits - tensor([[-6.4879,  0.4966, -5.4902, -0.7745],
        [-6.7563, -1.3787, -4.1948,  1.1163],
        [-6.6419,  2.1733, -5.8534, -2.3224],
        [-5.5518, -3.2048,  1.7170, -1.7701],
        [-7.0373, -1.1222, -4.5370,  1.7199],
        [-6.1732, -3.0465,  2.2148, -2.0002],
        [-7.5031,  2.8358, -6.9515, -2.3453],
        [-7.0466,  1.9033, -6.1355, -2.5423]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 122/289 [01:32<02:07,  1.31it/s]

Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18612509965896606, logits - tensor([[-5.9209, -3.2169,  1.3479, -1.8713],
        [-6.0166,  0.1982, -5.1242,  0.3943],
        [-5.9042,  2.0145, -5.0696, -1.1490],
        [-6.9173,  1.7697, -5.7838, -1.4171],
        [-5.8578,  2.8801, -5.4388, -1.7827],
        [-6.6123,  0.0774, -5.4336,  0.5722],
        [-7.3571,  0.6865, -4.7351, -0.1904],
        [-6.3558,  1.5860, -5.1221, -2.0265]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 123/289 [01:33<02:06,  1.31it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1404775083065033, logits - tensor([[-5.6671,  2.4642, -5.8608, -2.1727],
        [-6.1847,  1.2270, -5.4393, -0.8940],
        [-5.7519,  2.4730, -5.8809, -2.1282],
        [-7.0168,  2.6540, -5.7483, -1.9368],
        [-6.2280,  1.1305, -5.1445, -0.7813],
        [-5.8572, -2.9046,  1.2263, -1.7418],
        [-6.6928,  2.1623, -6.4337, -2.7239],
        [-7.0481, -3.6023, -5.8641,  3.4983]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:33<02:05,  1.32it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19476214051246643, logits - tensor([[-5.3526, -3.0048,  1.9651, -2.2762],
        [-7.2468,  2.6151, -6.2349, -2.0324],
        [-6.9418,  1.6840, -5.0322, -1.4469],
        [-6.3323, -2.2058, -5.7336,  3.3217],
        [-5.8245,  3.1617, -6.3187, -2.8011],
        [-7.0630,  2.6137, -6.3682, -2.7341],
        [-6.6126,  2.6123, -6.2963, -2.2481],
        [-7.1482,  2.2939, -6.0816, -2.8267]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:34<02:04,  1.32it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18955637514591217, logits - tensor([[-6.6691, -1.2629, -4.7006,  1.9292],
        [-6.2361, -1.0688, -5.4130,  1.5489],
        [-7.6209,  1.3412, -6.4364, -1.7938],
        [-6.2537, -1.1113, -4.9570,  2.3864],
        [-5.8110, -3.8492,  2.7878, -2.6541],
        [-6.8689,  1.6165, -5.5023, -1.6876],
        [-7.2752,  1.5557, -4.9102, -1.8003],
        [-6.0464,  2.2300, -6.1502, -2.4348]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:35<02:03,  1.32it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08386200666427612, logits - tensor([[-7.2463,  0.8377, -5.7872, -0.4455],
        [-5.4934,  2.0453, -5.1631, -1.5027],
        [-6.2036, -1.6793, -5.5636,  2.0024],
        [-5.1079, -2.7685,  1.3660, -1.8109],
        [-6.4362, -3.7379,  2.1177, -2.3092],
        [-5.8949,  2.5624, -5.6468, -2.6965],
        [-6.4185,  1.7097, -5.6118, -2.4857],
        [-7.1386,  2.8560, -6.7701, -2.8704]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:36<02:02,  1.32it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14742228388786316, logits - tensor([[-7.2661,  2.9136, -6.8050, -2.5915],
        [-6.0045, -4.3163, -5.1542,  3.4205],
        [-5.9998,  1.1390, -4.4230, -0.8604],
        [-5.8459, -0.1203, -6.0655,  1.3972],
        [-5.4096,  2.8160, -5.9641, -2.3151],
        [-6.6996,  1.8192, -6.3210, -1.4990],
        [-6.2380,  1.6933, -5.0923, -2.1667],
        [-6.4803,  2.0046, -4.8599, -2.2907]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:36<02:01,  1.33it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41354167461395264, logits - tensor([[-5.2080,  2.2220, -5.4883, -3.0575],
        [-6.0907,  2.2895, -5.8323, -2.2498],
        [-5.4941,  2.3691, -5.2809, -2.2995],
        [-7.1425,  1.9253, -6.0056, -1.6056],
        [-6.7676, -3.6443,  1.7297, -1.7903],
        [-5.1059,  0.8581, -4.7177, -2.0345],
        [-6.2620,  2.0112, -4.1675, -1.7981],
        [-6.6874,  3.3592, -6.5594, -2.2992]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:37<02:00,  1.33it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33231446146965027, logits - tensor([[-5.7247,  1.4651, -5.7099, -2.3586],
        [-6.3263,  2.3666, -6.5034, -2.5032],
        [-6.0471, -3.6781, -4.8282,  3.4996],
        [-6.5845,  1.8927, -5.9914, -1.4265],
        [-5.9736,  2.5047, -6.0099, -3.2479],
        [-7.4223,  1.8090, -6.0152, -2.0102],
        [-6.7710,  1.8632, -4.5629, -1.1658],
        [-5.8724, -4.0679,  2.3596, -2.2472]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 130/289 [01:38<01:59,  1.33it/s]

Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2553636431694031, logits - tensor([[-5.4335, -1.3479, -0.1421, -1.1630],
        [-6.4038,  2.4368, -5.4568, -2.4897],
        [-6.4233,  2.2100, -6.3233, -2.0864],
        [-6.6758,  2.7867, -6.3782, -2.5092],
        [-5.6142, -2.6307,  0.9101, -1.3773],
        [-6.5925,  1.8992, -6.0015, -1.9544],
        [-6.6702,  2.0059, -6.1665, -1.7859],
        [-6.5401,  1.1804, -5.6285, -1.8468]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▌     | 131/289 [01:39<01:59,  1.33it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5121700763702393, logits - tensor([[-7.2252,  0.6452, -6.2971, -0.2264],
        [-6.7025,  2.3886, -6.6024, -2.3714],
        [-5.4604, -3.3126,  1.2771, -2.2177],
        [-6.7401,  2.3025, -5.2599, -1.4917],
        [-6.1057,  2.0337, -5.9041, -2.4802],
        [-6.1637,  2.9052, -5.9831, -2.9148],
        [-6.0962, -4.4160,  1.9326, -3.0156],
        [-6.0035,  2.0516, -5.0440, -1.4517]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:39<01:58,  1.33it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 46%|████▌     | 133/289 [01:40<01:57,  1.33it/s]

loss - 0.32487350702285767, logits - tensor([[-6.5097,  1.6596, -5.7738, -2.4661],
        [-6.5319, -3.2949, -5.8004,  2.8088],
        [-7.2169,  1.8440, -5.4571, -1.9926],
        [-5.4451,  1.9927, -5.8701, -2.3578],
        [-6.4322,  1.9203, -5.5756, -2.2892],
        [-6.0966,  0.7371, -5.0693, -1.5718],
        [-5.8408, -2.9250,  0.7709, -2.0798],
        [-5.6664, -2.7286, -5.5105,  3.3502]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.41303879022598267, logits - tensor([[-6.2649,  1.3909, -5.6349, -1.3396],
        [-6.3038,  1.8552, -5.3821, -2.8759],
        [-6.1389, -3.1053,  0.6422, -1.4642],
        [-6.0933,  0.7734, -3.3418, -1.7184],
        [-6.3982,  1.5168, -6.3267, -1.8221],
        [-6.4718,  1.8610, -5.2221, -1.5295],
        [-7.1841,  2.3121, -6.3413, -2.1419],
        [-5.7614,  2.1708, -5.5142, -2

 46%|████▋     | 134/289 [01:41<01:56,  1.33it/s]

Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08457167446613312, logits - tensor([[-6.9805,  1.4294, -5.8464, -1.8504],
        [-6.2323, -3.5092,  1.5069, -2.8260],
        [-6.4714, -1.6160, -4.9422,  0.8621],
        [-6.1410,  1.3950, -5.9475, -1.7716],
        [-6.3184, -3.3953, -5.3694,  2.8614],
        [-6.5705,  1.4492, -4.8443, -1.9250],
        [-5.7082,  1.5757, -5.8452, -2.1469],
        [-5.8765, -3.1664,  1.6842, -1.7727]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 135/289 [01:42<01:56,  1.32it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33842897415161133, logits - tensor([[-6.8214,  1.6165, -4.9921, -2.2061],
        [-6.6966,  1.9042, -5.7004, -2.4774],
        [-5.8203, -0.4482, -4.2016,  0.2403],
        [-6.0722,  0.1643, -3.5215, -0.2142],
        [-5.7909,  1.0349, -5.3214, -0.6671],
        [-6.2731,  1.6419, -5.6350, -1.8397],
        [-6.0297,  1.3904, -5.5603, -1.2205],
        [-6.3691,  1.5291, -5.7428, -1.7868]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:42<01:56,  1.32it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 47%|████▋     | 137/289 [01:43<01:55,  1.31it/s]

loss - 0.18874478340148926, logits - tensor([[-5.7581,  1.8555, -4.9842, -1.5061],
        [-6.3269, -0.6929, -4.2383,  0.1351],
        [-6.4989,  1.6466, -5.3657, -1.3542],
        [-5.8991, -1.2150, -1.4301, -1.6138],
        [-5.8554,  0.7709, -4.7628, -1.0328],
        [-6.5738, -4.1123,  3.1103, -2.0609],
        [-6.1583, -3.0865, -5.3410,  3.2941],
        [-7.6949,  1.0758, -6.4091, -1.2135]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1297142058610916, logits - tensor([[-5.8497,  1.1579, -5.8430, -1.6914],
        [-6.7302,  1.3847, -4.9736, -1.7269],
        [-5.3652,  0.3501, -4.0596,  0.0223],
        [-5.9036,  1.4585, -4.6747, -1.0041],
        [-6.2964,  0.9717, -5.3865, -2.0871],
        [-6.6935,  1.6908, -5.6120, -1.7177],
        [-6.5395, -2.8323,  1.6163, -1.8504],
        [-6.1199,  1.6240, -6.2277, -1.

 48%|████▊     | 138/289 [01:44<01:54,  1.32it/s]

Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 48%|████▊     | 139/289 [01:45<01:53,  1.32it/s]

loss - 0.13744698464870453, logits - tensor([[-7.1643,  1.5901, -5.6300, -1.5873],
        [-6.6823,  1.5303, -5.1644, -1.2596],
        [-7.3414,  1.0008, -5.4293, -1.8140],
        [-6.8012,  1.9078, -5.7465, -1.8283],
        [-7.6513, -0.0144, -5.5234, -0.2832],
        [-6.6336, -1.2958, -4.4128,  0.2274],
        [-5.4342, -2.6648, -4.7000,  1.7285],
        [-6.1762,  1.2103, -4.9418, -1.5816]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20772060751914978, logits - tensor([[-5.9963,  0.7904, -4.3638, -0.1221],
        [-4.5545, -2.9957,  1.5042, -1.6803],
        [-6.2235, -1.7625, -5.2863,  1.8378],
        [-6.5326, -2.8669, -4.6771,  2.4013],
        [-4.5912, -3.3657, -4.5169,  3.1316],
        [-5.4921,  1.9512, -5.2994, -2.0213],
        [-5.8650,  0.5204, -5.0589, -0.9434],
        [-6.6117,  0.6483, -4.8761, -1

 48%|████▊     | 140/289 [01:45<01:53,  1.31it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.253834068775177, logits - tensor([[-6.7400, -0.4626, -4.6337,  0.2519],
        [-7.1924,  0.5335, -5.7782, -1.2357],
        [-5.9841, -4.0324, -3.7970,  4.3261],
        [-6.7327, -1.0990, -4.6027,  0.2465],
        [-5.5771, -3.2464,  2.0829, -2.3650],
        [-6.1898,  1.7367, -5.6224, -1.9026],
        [-6.6152, -1.4208, -3.9492,  1.2034],
        [-5.9830, -3.3824,  1.3113, -2.2883]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:46<01:52,  1.32it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.37033337354660034, logits - tensor([[-6.3343,  0.9692, -5.2950,  0.1391],
        [-6.1528, -4.1555, -5.4427,  3.6233],
        [-6.9054,  2.0568, -5.6160, -1.9115],
        [-6.7445,  1.3253, -6.2502, -1.8800],
        [-4.8122, -3.5476, -5.1794,  2.8777],
        [-6.8309, -1.2671, -4.7785,  0.8739],
        [-6.4170,  1.8658, -6.0543, -1.8294],
        [-5.3900,  1.3691, -4.7125, -1.0456]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:47<01:51,  1.32it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.186900332570076, logits - tensor([[-6.2517,  0.3497, -4.3441, -0.2467],
        [-6.6855,  1.4441, -5.5575, -1.2711],
        [-6.8676,  1.8211, -5.8087, -0.8469],
        [-4.7853, -3.1154, -4.6685,  3.2892],
        [-7.4042,  1.4046, -5.2684, -1.0882],
        [-5.8277, -2.1571,  0.3214, -1.7274],
        [-7.4254, -0.0440, -5.5696,  0.4540],
        [-5.7147, -3.5318,  2.3206, -2.8205]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 143/289 [01:48<01:50,  1.32it/s]

Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2767976224422455, logits - tensor([[-7.2470,  0.9850, -4.9268, -1.3706],
        [-5.0499, -3.3539,  2.2058, -1.5224],
        [-7.1491,  1.2924, -5.1740, -1.3810],
        [-5.7272, -2.3476, -4.5339,  2.3278],
        [-6.2957,  1.7637, -5.5454, -1.7082],
        [-6.4598,  0.8562, -5.1617, -0.8725],
        [-5.2106,  0.6737, -3.9773, -0.7389],
        [-6.8884, -2.5527, -5.2028,  2.7268]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|████▉     | 144/289 [01:48<01:49,  1.32it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27542343735694885, logits - tensor([[-5.5636, -4.4087, -5.1061,  4.2541],
        [-4.4289, -3.5361,  1.6345, -2.8762],
        [-5.8772,  1.3279, -4.6013, -1.3489],
        [-5.7941,  0.8544, -5.0885, -1.1561],
        [-4.9561, -4.2734, -4.3983,  3.4683],
        [-5.5764,  0.8440, -4.1907, -0.1201],
        [-5.7573,  0.7381, -4.6555, -0.1498],
        [-7.1881,  2.3215, -6.4530, -1.8351]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|█████     | 145/289 [01:49<01:48,  1.32it/s]

Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4867623448371887, logits - tensor([[-7.1265,  1.1406, -5.8329, -1.4943],
        [-6.3818, -4.3861,  2.4850, -2.9774],
        [-5.4328,  1.1101, -4.3849, -0.6223],
        [-6.9438,  0.3999, -4.8900, -0.5307],
        [-5.5487, -4.0410,  2.3614, -2.6793],
        [-4.7222, -3.2319,  2.0340, -2.5554],
        [-7.2489,  1.9168, -5.7518, -1.7323],
        [-5.9743, -0.3860, -4.2343, -0.1898]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 146/289 [01:50<01:48,  1.32it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09937479346990585, logits - tensor([[-5.9089, -4.0025,  2.0897, -2.2073],
        [-5.2438, -3.9666, -4.6602,  3.7721],
        [-6.8323,  2.3307, -6.5821, -1.5278],
        [-5.2341,  1.2312, -5.3588, -1.0320],
        [-5.8833,  0.8148, -4.1043, -1.2793],
        [-6.1303,  0.9495, -4.8658, -1.3466],
        [-5.8371, -3.6540, -4.3288,  3.2714],
        [-5.0316, -2.8472,  0.6261, -1.2584]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 147/289 [01:51<01:47,  1.32it/s]

Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23215851187705994, logits - tensor([[-6.1887e+00,  2.2399e+00, -5.2850e+00, -2.2008e+00],
        [-6.3284e+00,  1.6244e+00, -5.0207e+00, -1.3414e+00],
        [-4.5306e+00, -2.3981e+00,  1.9949e+00, -2.1407e+00],
        [-5.3761e+00,  9.3493e-01, -4.5310e+00, -9.0770e-01],
        [-7.4300e+00, -3.8358e-01, -5.1174e+00, -7.2226e-03],
        [-6.8773e+00,  9.5096e-01, -4.7416e+00, -1.1236e+00],
        [-6.5319e+00,  1.3413e+00, -5.2936e+00, -1.4764e+00],
        [-5.4639e+00,  1.2191e+00, -5.1687e+00, -1.8547e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 148/289 [01:51<01:46,  1.32it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14766673743724823, logits - tensor([[-5.3863,  1.3472, -4.9958, -2.0442],
        [-5.6501, -3.4238,  2.4151, -2.6246],
        [-4.4030, -3.8875, -4.4502,  2.6930],
        [-7.1157,  2.7332, -6.8644, -1.1305],
        [-7.3276,  1.4285, -6.0363, -1.2084],
        [-6.8610,  1.1246, -4.7005, -0.7197],
        [-6.7656, -2.8840, -3.1950,  1.0715],
        [-6.6408,  1.7137, -5.2787, -1.8803]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:52<01:45,  1.33it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1323147416114807, logits - tensor([[-6.0300,  2.3583, -5.4508, -1.9854],
        [-4.9189, -0.9661, -3.3580,  0.6957],
        [-5.5144,  0.5486, -4.4350, -1.2246],
        [-6.0044,  1.6462, -5.1466, -1.1483],
        [-6.0663,  0.6223, -4.5762, -0.7333],
        [-5.7912,  1.5943, -4.0847, -0.9562],
        [-6.2259,  1.8155, -5.2074, -1.8066],
        [-4.8768,  1.6045, -4.8135, -1.6428]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:53<01:44,  1.32it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26861050724983215, logits - tensor([[-5.8037,  0.9807, -4.0307, -0.8590],
        [-5.1680, -3.7505,  2.3603, -2.3289],
        [-4.6160, -0.7601, -0.9117, -1.0116],
        [-5.6011,  1.3731, -5.2867, -1.8693],
        [-7.0652,  0.7626, -5.4755, -0.8594],
        [-5.2818, -3.9592,  2.4861, -2.3352],
        [-6.4895,  1.3471, -4.6915, -1.7490],
        [-6.5551,  1.2107, -4.5421, -1.5861]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:54<01:43,  1.33it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11482366174459457, logits - tensor([[-6.3898,  1.9655, -4.7933, -1.9346],
        [-7.0120,  0.5543, -5.6172, -1.0125],
        [-5.1286, -2.6538, -4.0755,  2.5278],
        [-6.5958,  1.8275, -5.2382, -1.6154],
        [-6.2776,  1.6473, -5.4037, -1.8970],
        [-6.6619,  1.3023, -4.5025, -2.3694],
        [-6.7842,  0.6450, -4.7030, -0.7878],
        [-5.8116, -2.1798,  0.6744, -1.9569]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:54<01:43,  1.33it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09708691388368607, logits - tensor([[-7.5302,  1.5514, -5.5534, -1.8512],
        [-6.0659,  1.6446, -5.2025, -1.2697],
        [-6.9166,  0.5856, -4.7209, -1.6238],
        [-4.7014, -3.5993, -4.8189,  4.3568],
        [-7.0903,  1.1934, -5.5153, -2.0042],
        [-6.0993,  1.1532, -5.8477, -1.6787],
        [-6.9527,  1.5114, -5.2118, -1.0348],
        [-6.5108,  1.6917, -5.6874, -2.1251]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:55<01:43,  1.32it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 53%|█████▎    | 154/289 [01:56<01:42,  1.32it/s]

loss - 0.356086790561676, logits - tensor([[-6.4623, -1.9971, -1.6654, -0.2701],
        [-5.4695, -3.6817,  1.7210, -2.6281],
        [-6.2118,  0.8709, -4.8503, -1.3939],
        [-4.3938, -2.9735,  2.1147, -2.3213],
        [-5.9588,  1.7749, -5.8693, -1.5759],
        [-5.8700, -3.0941,  1.9756, -2.3649],
        [-6.8759,  0.8369, -5.1917, -1.3463],
        [-5.2196,  1.7689, -4.4703, -1.7069]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09993124008178711, logits - tensor([[-5.7688, -1.8285,  0.4167, -1.6658],
        [-6.4104,  1.1472, -5.2510, -2.1517],
        [-6.4661,  1.6925, -4.9153, -1.7649],
        [-6.1664, -4.1145,  2.5713, -2.4090],
        [-7.4021, -2.8008, -5.1607,  2.0612],
        [-5.5118,  2.5595, -4.9371, -2.6756],
        [-6.5517,  1.0505, -5.7466, -1.6267],
        [-5.4023,  1.3745, -5.0072, -0.7

 54%|█████▎    | 155/289 [01:57<01:42,  1.31it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1112210601568222, logits - tensor([[-6.6345,  2.2133, -6.8633, -2.5760],
        [-6.1182,  2.1465, -5.6503, -2.2577],
        [-6.1070,  0.0152, -3.9839,  0.0919],
        [-5.5441,  1.7897, -4.5108, -1.5911],
        [-6.7816,  0.9639, -5.0842, -1.0951],
        [-5.9095, -3.1885,  1.5779, -2.4296],
        [-7.0052,  2.1883, -5.2170, -1.9191],
        [-6.8868,  2.4874, -5.7601, -2.3039]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:57<01:41,  1.31it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4099670648574829, logits - tensor([[-6.8415,  1.3279, -5.7584, -1.6280],
        [-5.9405, -0.1565, -5.1154, -0.0209],
        [-5.8582,  2.2148, -5.4533, -1.9749],
        [-5.7485,  1.3161, -4.7711, -1.0503],
        [-6.6464,  1.7418, -5.6476, -1.2051],
        [-6.4993,  1.9638, -5.7649, -2.2962],
        [-6.9395,  1.7042, -5.3034, -1.3243],
        [-6.3238, -3.9005,  1.1517, -1.1650]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:58<01:41,  1.31it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1381549835205078, logits - tensor([[-6.4688,  1.5803, -5.1409, -1.8801],
        [-5.0073, -3.4110,  2.3865, -2.0985],
        [-6.6306,  2.2023, -6.0379, -1.9912],
        [-7.3499,  1.9819, -5.6821, -1.3279],
        [-6.8000,  1.7153, -5.0876, -1.7493],
        [-6.3025, -0.9013, -5.5175,  0.7603],
        [-6.8933,  1.4491, -5.3606, -2.2818],
        [-5.7711,  2.0741, -5.6126, -2.5809]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:59<01:40,  1.31it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25737208127975464, logits - tensor([[-7.0603,  2.3024, -5.6720, -1.4007],
        [-5.9732,  2.3137, -5.5223, -2.7585],
        [-5.8023,  1.6098, -5.8073, -1.9661],
        [-6.3066,  1.6642, -5.1442, -2.3992],
        [-5.5647, -1.0480, -3.3950,  1.8348],
        [-5.5162,  1.2677, -5.5323, -1.6148],
        [-4.9775, -3.4124,  1.4891, -2.0576],
        [-5.8503,  1.0395, -4.7873, -0.9153]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [02:00<01:39,  1.31it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28477680683135986, logits - tensor([[-5.8195,  1.4323, -4.8592, -1.5524],
        [-6.3905,  1.2817, -4.8886, -1.9595],
        [-6.0054,  2.7188, -5.3505, -1.4625],
        [-5.2218, -3.4947,  2.0721, -2.2686],
        [-4.7363,  2.3418, -5.3664, -2.5692],
        [-5.9181,  1.6759, -5.1395, -1.4843],
        [-6.4872,  1.5434, -5.1902, -1.9906],
        [-5.9635,  1.9799, -5.4153, -1.6014]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [02:01<01:38,  1.32it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19260139763355255, logits - tensor([[-6.9970,  2.4081, -6.1493, -2.0703],
        [-6.4524,  1.5226, -5.6312, -2.0564],
        [-6.5611,  1.6582, -5.2574, -1.5968],
        [-6.1566, -0.4226, -4.8894,  0.5098],
        [-5.1320,  2.2135, -4.3086, -2.3287],
        [-6.4073,  2.6245, -5.2304, -2.1946],
        [-6.6155,  2.4194, -5.9497, -2.3014],
        [-6.6028, -1.2876, -4.5199,  1.5699]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:01<01:37,  1.31it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4202355444431305, logits - tensor([[-6.3139,  2.0018, -5.7311, -1.6898],
        [-6.4371,  1.5937, -5.4434, -2.2276],
        [-5.6500, -3.3876,  2.2882, -1.9020],
        [-5.3738,  2.2313, -6.1202, -1.8331],
        [-7.1999, -2.6627, -2.6756,  1.4800],
        [-5.6361, -3.2512, -5.6852,  2.4104],
        [-6.4336,  2.7615, -5.2856, -2.1194],
        [-5.9033,  0.0617, -5.6514, -0.7301]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:02<01:36,  1.32it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19284164905548096, logits - tensor([[-5.5258, -3.4836,  2.0384, -2.3740],
        [-5.8698, -3.2214, -5.3690,  1.8387],
        [-6.8705,  1.9748, -5.4457, -1.8519],
        [-6.3480,  1.5259, -5.8432, -2.6898],
        [-6.3674,  2.0695, -5.2180, -2.2439],
        [-6.2123,  2.0945, -5.3236, -2.1980],
        [-5.7551,  1.9415, -4.8461, -2.2440],
        [-5.6505, -2.6483, -4.0049,  2.2561]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▋    | 163/289 [02:03<01:35,  1.32it/s]

Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1827782392501831, logits - tensor([[-5.8303,  2.5886, -5.5247, -2.6561],
        [-5.6641,  1.8907, -5.2158, -2.3745],
        [-5.9488,  1.9924, -4.6423, -2.0843],
        [-6.0433,  2.3871, -5.4844, -2.2638],
        [-6.2971,  2.5661, -5.9059, -2.1633],
        [-5.5084, -2.9435, -5.2880,  3.5223],
        [-5.7025,  1.7668, -5.2508, -1.3259],
        [-5.5755,  1.5693, -5.3413, -2.7908]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 164/289 [02:04<01:34,  1.32it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4473733901977539, logits - tensor([[-4.7335,  2.0113, -4.6963, -1.2830],
        [-5.5635,  1.5813, -5.0012, -2.2371],
        [-6.1139,  2.6818, -6.3374, -2.1998],
        [-6.4544, -0.3997, -5.5908,  0.9763],
        [-5.7907,  2.0447, -5.5140, -2.1655],
        [-6.2649,  1.8062, -5.7408, -1.8331],
        [-6.6587,  1.8927, -4.9262, -2.1183],
        [-6.5435,  2.3232, -5.9382, -1.5244]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 165/289 [02:04<01:33,  1.32it/s]

Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05405397713184357, logits - tensor([[-5.7265, -2.1089, -5.7170,  2.2295],
        [-6.4306,  2.4286, -5.8809, -2.4931],
        [-6.2182, -3.7236,  2.3931, -2.3988],
        [-6.2348,  1.2162, -4.6032, -2.6253],
        [-5.6828,  2.8132, -6.2776, -3.3312],
        [-6.9109, -3.2735, -5.4447,  3.2239],
        [-4.2590, -2.2551,  1.7658, -2.1861],
        [-5.5076,  2.0098, -4.2953, -2.3074]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 166/289 [02:05<01:32,  1.33it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08262734115123749, logits - tensor([[-5.5385,  2.4111, -5.2478, -1.8273],
        [-6.8991,  2.0659, -5.8523, -2.0299],
        [-6.1640,  2.2122, -5.0519, -2.4831],
        [-5.3816,  2.8889, -4.9197, -1.6715],
        [-6.9435,  1.8527, -5.5615, -1.9645],
        [-6.2837, -0.9050, -4.8228,  0.7768],
        [-5.7685,  1.6401, -6.1270, -1.9196],
        [-6.1579,  1.5830, -5.1489, -1.5148]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:06<01:31,  1.33it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3797803521156311, logits - tensor([[-4.4163, -2.3013,  1.3005, -1.5405],
        [-7.4340, -1.2557, -5.4788,  1.2894],
        [-7.2258, -2.9049, -5.7056,  2.4884],
        [-6.0978,  2.0890, -4.5203, -2.7839],
        [-5.7208, -0.7974, -1.7206, -1.2643],
        [-6.2470,  1.9883, -4.7852, -1.2488],
        [-4.8428, -2.2475,  2.3268, -2.4323],
        [-5.9676,  1.8680, -6.2661, -2.2930]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:07<01:30,  1.33it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.314202219247818, logits - tensor([[-6.4661, -3.3361, -4.0032,  3.1803],
        [-6.5954,  2.3692, -5.0181, -2.1874],
        [-5.3993, -3.6414,  2.2377, -2.8226],
        [-5.3415,  2.4727, -5.7266, -2.2654],
        [-6.2100,  2.0341, -5.8345, -2.4686],
        [-6.6671, -2.0296, -4.6716,  1.1663],
        [-5.9424,  1.7813, -5.9127, -2.1763],
        [-6.2786,  2.1321, -6.4500, -2.1833]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:07<01:30,  1.33it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.046068571507930756, logits - tensor([[-6.2892,  2.5037, -5.7516, -2.5345],
        [-6.2307,  1.6654, -4.4158, -1.7878],
        [-6.1343,  2.5293, -6.4456, -2.7886],
        [-5.9207,  2.5025, -5.5437, -2.5595],
        [-6.0341,  1.9370, -6.1420, -2.4396],
        [-5.8037, -3.7208,  2.9470, -2.8143],
        [-6.6218, -3.4151,  2.5741, -2.1003],
        [-5.4492, -3.4899, -4.8306,  3.0039]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:08<01:29,  1.33it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17094922065734863, logits - tensor([[-5.6210, -4.1479, -4.4007,  3.3482],
        [-5.6155,  2.3638, -4.8770, -1.5299],
        [-6.3722,  1.4084, -5.8782, -1.5593],
        [-6.1690,  1.7769, -5.2716, -2.3645],
        [-6.4853,  2.2796, -6.3876, -2.9376],
        [-5.6929,  1.7897, -5.7341, -2.1229],
        [-5.5742,  0.8250, -4.5992, -1.3367],
        [-6.0731,  1.6811, -4.7778, -1.2416]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:09<01:28,  1.33it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.342239648103714, logits - tensor([[-6.0834, -0.5045, -4.2933, -0.0798],
        [-5.6239,  2.3540, -4.9879, -2.6828],
        [-6.6932,  1.6021, -5.6302, -1.5502],
        [-5.2787, -3.4841,  2.7783, -2.5492],
        [-5.9746,  2.0131, -5.1923, -2.0977],
        [-6.0520,  0.7692, -4.7234,  0.0388],
        [-5.3425,  1.4162, -5.1686, -2.0945],
        [-6.0014,  1.0355, -4.8296, -1.5253]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 172/289 [02:10<01:28,  1.32it/s]

Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2079641968011856, logits - tensor([[-5.6336,  1.8717, -5.0903, -1.4676],
        [-7.1229,  0.4359, -5.2136, -0.5290],
        [-5.5900, -4.1330, -3.8390,  3.9133],
        [-6.1488,  1.5660, -5.9525, -1.9829],
        [-5.4154,  1.0617, -4.7303, -1.4688],
        [-4.7911,  1.4358, -5.1006, -1.8800],
        [-6.5232,  1.6397, -5.5294, -1.6268],
        [-5.4596, -3.8678,  2.8564, -2.0899]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 173/289 [02:10<01:27,  1.33it/s]

Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16119807958602905, logits - tensor([[-5.8565,  1.1265, -4.9022, -1.7064],
        [-4.9970,  0.7125, -4.0897, -1.3119],
        [-6.6690,  1.9480, -5.3152, -2.0195],
        [-5.6842, -3.7422,  2.6951, -2.3729],
        [-6.0308, -3.8058,  1.7819, -3.0132],
        [-5.8477,  1.3965, -4.1444, -1.6154],
        [-5.3685,  1.0026, -4.8137, -1.3362],
        [-5.9226, -2.3049, -4.4394,  1.2462]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|██████    | 174/289 [02:11<01:27,  1.32it/s]

Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18489322066307068, logits - tensor([[-4.8167,  1.4747, -3.7853, -1.1055],
        [-6.2079, -3.6899,  3.0752, -2.5425],
        [-5.9967,  1.9212, -4.5916, -1.4360],
        [-5.8530,  1.5777, -5.0521, -1.6515],
        [-4.8525,  2.3259, -4.9882, -1.6288],
        [-5.9842,  0.7494, -4.4766, -0.7208],
        [-5.5578,  1.3280, -4.2254, -1.6616],
        [-5.0678,  1.6508, -4.6666, -1.3586]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 175/289 [02:12<01:26,  1.32it/s]

Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17723152041435242, logits - tensor([[-5.5266,  1.2110, -5.0747, -1.8601],
        [-6.5233, -3.3828,  2.0940, -2.6345],
        [-4.9707, -2.6779, -4.4344,  2.7073],
        [-6.3308, -2.9888, -4.3222,  1.4600],
        [-5.7924,  1.5138, -4.9615, -1.5250],
        [-6.6100,  2.0773, -5.3351, -1.6215],
        [-6.3859,  1.7361, -4.8758, -1.0264],
        [-6.0273,  2.1087, -5.3339, -1.3442]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 176/289 [02:13<01:26,  1.31it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06688497960567474, logits - tensor([[-5.9291, -3.1432,  2.8887, -1.8143],
        [-7.3125,  1.9533, -5.9542, -2.4926],
        [-5.5662, -3.9385,  3.6274, -3.0430],
        [-6.0834,  1.9936, -6.0601, -1.9258],
        [-6.2970,  1.7414, -5.3191, -2.1019],
        [-6.6171,  1.6614, -5.3152, -1.8442],
        [-5.1845,  1.3302, -4.3708, -1.9353],
        [-6.9512,  2.0079, -6.1541, -1.6021]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:13<01:25,  1.31it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17171348631381989, logits - tensor([[-6.5979, -2.2805, -4.3865,  1.7799],
        [-7.7805, -3.2121, -4.8081,  2.3222],
        [-6.4517,  2.3480, -5.3949, -2.3676],
        [-5.6669,  1.4933, -5.2000, -1.3654],
        [-7.2452,  1.4457, -5.7717, -1.9697],
        [-5.6983, -1.8677, -5.2118,  1.3485],
        [-5.5534,  2.1283, -5.1327, -2.3620],
        [-6.3154, -3.1279, -4.0040,  3.1776]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:14<01:24,  1.31it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2162097990512848, logits - tensor([[-6.0189,  0.6044, -4.4329, -1.0756],
        [-5.9547,  1.9280, -6.0028, -1.6877],
        [-5.4941,  1.1378, -4.5078, -1.6660],
        [-5.4439,  2.0897, -5.1783, -2.1987],
        [-6.0366,  1.1939, -4.5664, -1.7529],
        [-5.3551,  0.5501, -4.5227, -1.5002],
        [-5.4970,  1.3086, -4.9542, -2.0261],
        [-6.7942,  1.6715, -5.4281, -1.6836]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 179/289 [02:15<01:23,  1.32it/s]

Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3502885699272156, logits - tensor([[-5.4431,  0.8229, -4.7500, -1.0280],
        [-6.7497, -1.0134, -4.4505,  0.7020],
        [-5.3188, -3.6759,  1.8723, -2.4747],
        [-5.6115,  0.8961, -4.5105, -1.0481],
        [-6.3521, -3.8963,  1.6933, -1.5320],
        [-6.4038,  1.0860, -4.9665, -1.0476],
        [-5.7246, -3.2258,  1.7517, -2.0054],
        [-5.1099, -3.1506,  1.3047, -2.0077]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 180/289 [02:16<01:22,  1.32it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2807270884513855, logits - tensor([[-5.1352,  1.1694, -4.4586, -1.0855],
        [-6.8080,  1.5460, -6.0328, -2.3221],
        [-4.4517,  1.7576, -4.7971, -1.0456],
        [-6.6009,  0.6338, -5.4269, -1.1999],
        [-5.8882,  1.3117, -5.1799, -1.1122],
        [-6.3712,  0.1112, -4.7430,  0.2137],
        [-5.6546,  1.7771, -4.4292, -0.7641],
        [-6.2555,  1.9472, -4.0798, -1.2105]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:16<01:21,  1.32it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28796443343162537, logits - tensor([[-5.5803,  0.6965, -4.5538, -0.5358],
        [-5.7126,  0.8207, -4.3446, -0.9012],
        [-6.7679,  1.8014, -5.2125, -1.7562],
        [-5.8646,  1.0554, -4.6425, -1.2455],
        [-4.9480, -3.9820, -4.8082,  4.1992],
        [-5.9520, -2.5038, -4.0266,  1.8813],
        [-5.2802,  1.1889, -4.3541, -0.5488],
        [-6.4406, -2.9549, -4.7639,  3.7212]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:17<01:20,  1.32it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0816393718123436, logits - tensor([[-6.8244,  0.4185, -5.1779, -1.0860],
        [-5.0170, -3.8247,  2.1527, -2.3195],
        [-6.4027, -2.4362, -4.4666,  2.4394],
        [-6.1303,  1.7106, -5.7325, -1.6730],
        [-6.4666,  1.7335, -5.3336, -1.6330],
        [-5.5586, -3.0512, -3.9039,  4.2748],
        [-5.4614,  2.5180, -5.9210, -2.0163],
        [-6.4592,  2.0805, -4.5183, -1.1200]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:18<01:19,  1.33it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09926408529281616, logits - tensor([[-6.2108,  0.8218, -4.7681, -1.2179],
        [-6.5415,  1.5144, -5.7436, -1.9379],
        [-6.1016, -2.5931, -3.5878,  2.1182],
        [-6.6477,  2.1207, -5.3046, -1.1740],
        [-6.0199,  1.4633, -4.6309, -1.0586],
        [-6.3007, -3.2359,  1.6369, -2.0273],
        [-6.4933,  2.2933, -5.9331, -1.8711],
        [-5.9366,  1.3282, -5.3692, -1.2388]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:19<01:18,  1.33it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13340559601783752, logits - tensor([[-4.4330, -3.0441,  1.8582, -2.6068],
        [-7.0709,  1.9009, -6.2732, -2.3066],
        [-5.0554, -2.6025,  1.7254, -2.0831],
        [-6.8147, -0.4205, -4.2199, -0.1989],
        [-6.6024,  0.8571, -4.7792, -0.6201],
        [-5.9883,  1.2693, -5.2757, -0.9653],
        [-6.7670, -3.5918, -4.7827,  2.2859],
        [-5.6603,  0.8521, -4.4791, -1.4805]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:19<01:18,  1.33it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20300330221652985, logits - tensor([[-6.2453, -4.0220,  2.3593, -2.1341],
        [-5.9908,  1.4687, -4.2859, -1.0764],
        [-5.9830,  0.9935, -3.9737, -0.7232],
        [-5.0850, -3.9939, -3.5433,  3.0668],
        [-6.3426,  1.2141, -4.6956, -0.2609],
        [-5.4726,  1.4808, -5.0937, -1.4590],
        [-6.0752,  1.6108, -5.1256, -1.1144],
        [-5.3605,  1.3558, -4.7628, -1.3604]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:20<01:17,  1.33it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2987127900123596, logits - tensor([[-7.0529, -3.8387,  2.3424, -1.6815],
        [-5.8170,  1.6290, -4.4484, -1.5229],
        [-5.9099,  0.8389, -4.9080, -0.6410],
        [-5.1859,  1.0016, -4.3184, -0.6974],
        [-6.1357, -3.3802, -4.0195,  2.4646],
        [-5.8420,  1.2000, -4.9286, -0.4977],
        [-5.8295,  1.9976, -5.2085, -1.8257],
        [-5.6733,  0.8987, -4.2149, -0.7030]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:21<01:17,  1.32it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09830029308795929, logits - tensor([[-6.2218,  1.8014, -4.6527, -2.0029],
        [-6.2380,  1.0064, -4.6729, -0.6968],
        [-6.4920,  0.8682, -4.9522, -0.5012],
        [-6.8590,  1.6909, -6.1088, -1.6986],
        [-6.1869,  2.0658, -5.4422, -2.0311],
        [-7.3272, -2.1077, -3.7865,  1.5965],
        [-5.9136, -3.5376,  2.4410, -2.4423],
        [-5.3161,  2.3454, -6.1411, -2.4025]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 188/289 [02:22<01:16,  1.32it/s]

Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08676141500473022, logits - tensor([[-4.8249,  1.1023, -4.4372, -1.3461],
        [-4.8449, -3.5684, -3.5026,  4.0125],
        [-5.7456,  1.7414, -5.7478, -1.1755],
        [-6.1367, -4.2233, -3.4469,  4.0881],
        [-6.7138, -3.6083,  3.1126, -2.6486],
        [-5.2913,  1.6891, -4.9540, -1.9707],
        [-6.6102,  1.0097, -4.3651, -0.9798],
        [-6.6503,  0.7926, -5.7127, -1.6202]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 189/289 [02:22<01:15,  1.33it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07415702193975449, logits - tensor([[-5.8046,  2.4727, -5.3826, -3.0799],
        [-7.2881, -2.2110, -4.5710,  1.9872],
        [-5.1513, -4.3363, -3.6958,  3.9157],
        [-6.5820,  1.4916, -5.1966, -1.4506],
        [-6.1235,  2.4910, -5.3468, -1.5875],
        [-5.8549,  1.7262, -5.2136, -1.6475],
        [-5.8572,  0.7703, -4.7623, -1.0825],
        [-6.8957,  2.4858, -6.8735, -2.0045]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:23<01:14,  1.33it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35158881545066833, logits - tensor([[-6.4488,  0.3172, -4.9435, -0.7962],
        [-6.6307,  0.9345, -4.6725, -1.0207],
        [-6.3015,  1.0345, -5.3349, -0.4992],
        [-5.9253,  1.9501, -5.1662, -1.8578],
        [-4.8661,  0.3101, -3.3046, -0.4540],
        [-6.6039,  2.3657, -5.9351, -2.8539],
        [-4.6532,  1.1221, -4.6777, -1.3689],
        [-5.7810, -3.1982, -4.6318,  3.5602]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 191/289 [02:24<01:13,  1.32it/s]

Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3068682849407196, logits - tensor([[-6.1271,  1.5572, -5.4065, -1.1039],
        [-4.8998, -3.5819,  1.7240, -1.8682],
        [-5.4316, -4.2202, -4.5764,  3.9015],
        [-5.8153,  1.6839, -5.1285, -1.7172],
        [-6.3914,  2.2961, -5.4654, -1.1381],
        [-6.7025,  1.6876, -6.0105, -2.5761],
        [-5.3647,  1.5687, -5.4047, -1.5831],
        [-5.8570, -2.2369,  0.6813, -1.8261]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▋   | 192/289 [02:25<01:13,  1.32it/s]

Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18909063935279846, logits - tensor([[-6.6578,  2.2648, -6.7713, -2.0267],
        [-6.4925,  2.3024, -5.5285, -2.2490],
        [-4.9359, -3.3589,  2.6347, -2.4634],
        [-6.9431,  2.0018, -6.3870, -2.1118],
        [-4.9836, -4.2085,  2.5892, -2.9133],
        [-5.9353,  0.9211, -4.4945, -1.2445],
        [-5.7749,  2.5637, -6.2098, -2.2927],
        [-5.6939, -2.5251, -4.7973,  2.7446]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 193/289 [02:25<01:12,  1.32it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 67%|██████▋   | 194/289 [02:26<01:11,  1.33it/s]

loss - 0.20691928267478943, logits - tensor([[-6.0333,  2.2497, -5.3762, -2.3858],
        [-4.9223, -2.6497, -4.1212,  2.8371],
        [-6.0791, -4.3594, -4.2032,  3.9448],
        [-6.4867,  2.3768, -5.6950, -2.4473],
        [-5.7995,  1.7856, -5.9998, -1.7841],
        [-6.5485,  2.2478, -5.5104, -0.9736],
        [-5.5184,  1.8432, -5.2510, -2.1228],
        [-5.5431, -3.0166,  2.5010, -2.4112]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38435137271881104, logits - tensor([[-6.4752,  1.7651, -5.0557, -1.1679],
        [-4.8840, -2.9301,  2.3441, -1.8098],
        [-5.7353,  2.3597, -5.4808, -2.3226],
        [-5.1163, -3.7037,  2.3562, -2.2444],
        [-6.0550, -2.5567, -4.5998,  2.7649],
        [-5.9010,  2.3901, -5.3639, -2.4868],
        [-5.7366, -3.8108,  2.6062, -2.6280],
        [-5.7184,  2.2223, -5.1434, -0

 67%|██████▋   | 195/289 [02:27<01:11,  1.32it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28250885009765625, logits - tensor([[-5.5460, -2.5686,  2.0097, -2.2131],
        [-6.7880,  2.2487, -5.5214, -2.0006],
        [-6.1604,  1.2084, -4.9346, -1.8839],
        [-5.8501,  1.8764, -5.9228, -2.0376],
        [-7.0071,  2.0733, -5.1676, -1.5541],
        [-6.1625, -3.4384,  2.6398, -2.7547],
        [-5.9359,  2.2749, -5.9553, -1.7489],
        [-5.9488,  1.4976, -6.2535, -1.8405]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:28<01:10,  1.33it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21343880891799927, logits - tensor([[-6.0646,  1.3240, -5.6334, -2.0345],
        [-7.1798, -2.7487, -4.6426,  2.5911],
        [-6.2072, -3.8627, -4.1853,  3.1734],
        [-6.1502,  2.5844, -5.8568, -2.1621],
        [-6.1609,  2.1844, -5.4872, -1.5071],
        [-5.8694,  1.8825, -5.4843, -1.4517],
        [-5.6609,  2.1227, -4.8468, -1.0941],
        [-6.4551,  1.9330, -5.3170, -2.4629]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 197/289 [02:28<01:09,  1.32it/s]

Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06252981722354889, logits - tensor([[-6.0854,  2.0617, -5.7774, -2.0647],
        [-6.3194,  2.0573, -5.0968, -2.3836],
        [-6.2803, -1.9871, -4.7177,  1.6900],
        [-6.3889,  1.9646, -6.3498, -2.4513],
        [-6.6047,  1.7626, -5.5772, -1.0427],
        [-7.4212,  2.1408, -6.3848, -2.0638],
        [-6.1207, -3.7595,  3.1479, -3.1102],
        [-5.6222, -3.6063,  2.5351, -2.2720]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▊   | 198/289 [02:29<01:09,  1.31it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23385480046272278, logits - tensor([[-6.2659, -2.7499,  1.8004, -1.7052],
        [-5.1895, -3.7092,  1.8556, -2.9192],
        [-5.6310, -3.1345,  2.0462, -2.8487],
        [-5.4036,  2.1870, -5.6456, -2.1810],
        [-5.5734, -4.1238, -4.2805,  3.5772],
        [-6.9895,  1.7782, -6.0009, -1.4494],
        [-7.2436,  0.8699, -5.4737, -0.4883],
        [-6.4102,  1.3379, -5.2288, -1.8438]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:30<01:08,  1.31it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05363418906927109, logits - tensor([[-6.2270,  2.2491, -5.2625, -1.7190],
        [-6.0927, -2.9345, -5.6041,  3.4836],
        [-6.7758,  2.3418, -6.7018, -2.5527],
        [-5.8908, -4.4778, -4.6607,  3.8472],
        [-6.2328,  1.8179, -6.5046, -1.7227],
        [-6.5144,  2.8662, -5.6543, -2.4654],
        [-5.6588,  1.0289, -5.5181, -1.6777],
        [-6.2403,  2.2431, -5.3972, -2.3567]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 200/289 [02:31<01:07,  1.31it/s]

Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1970655918121338, logits - tensor([[-6.5496,  1.3484, -5.1035, -1.2756],
        [-6.1948, -3.8028, -4.1849,  2.7024],
        [-5.5525,  2.1878, -5.1614, -2.3111],
        [-6.5652,  2.2144, -6.0990, -2.6736],
        [-6.4085,  1.9351, -6.3267, -2.1923],
        [-7.0691, -2.7526, -4.6688,  2.2817],
        [-5.9713,  2.7398, -6.1162, -2.3929],
        [-6.0954,  1.9926, -4.9907, -1.9829]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 201/289 [02:32<01:06,  1.32it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06407694518566132, logits - tensor([[-5.2575,  2.7116, -5.9680, -2.2406],
        [-6.9422,  1.9764, -5.8943, -1.9371],
        [-6.6030,  2.1296, -5.8172, -2.1935],
        [-6.5387,  1.9803, -5.5250, -2.1251],
        [-4.8518,  1.7529, -4.4283, -1.4974],
        [-5.8010,  1.6898, -5.6940, -2.1692],
        [-6.2291, -3.6420,  2.5007, -2.8444],
        [-6.7803,  1.4368, -5.8246, -2.3234]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:32<01:06,  1.32it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0456058606505394, logits - tensor([[-7.1048,  2.8740, -5.9995, -2.3775],
        [-6.7634, -4.3261,  2.7986, -2.6996],
        [-4.9774, -3.7811, -4.1037,  2.8946],
        [-6.3416, -2.3631, -5.0145,  2.7471],
        [-7.0039,  3.5124, -6.6611, -2.9815],
        [-6.3007,  1.5837, -5.8512, -2.1243],
        [-6.4678,  2.5415, -5.3215, -1.8592],
        [-6.5496,  1.7401, -5.7230, -1.9332]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:33<01:05,  1.32it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34155723452568054, logits - tensor([[-5.9244, -1.8693, -4.5101,  1.8248],
        [-5.4576, -2.9505,  1.9903, -2.1727],
        [-6.4245,  2.4993, -6.6775, -1.9797],
        [-6.9831,  2.3435, -5.4568, -1.6136],
        [-6.1727,  0.9227, -5.2554, -1.4809],
        [-6.3616,  2.8944, -6.0290, -1.7972],
        [-5.5054, -2.9206,  2.2861, -2.2236],
        [-5.8004, -2.6712, -4.6745,  2.1132]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:34<01:04,  1.32it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3563894033432007, logits - tensor([[-6.0010, -2.8112, -3.8760,  2.6849],
        [-5.0163, -3.7288, -4.3360,  2.5697],
        [-5.8713, -1.9480, -4.6705,  1.1281],
        [-6.1509,  2.4603, -6.0985, -2.3042],
        [-6.3709,  1.3460, -5.3714, -1.3772],
        [-8.0472,  2.2859, -6.9116, -1.4195],
        [-6.2914, -2.3892, -5.3972,  1.7793],
        [-6.0806,  3.2041, -6.1688, -1.9562]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:35<01:03,  1.32it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20823098719120026, logits - tensor([[-5.7908,  2.5474, -6.0149, -2.1855],
        [-6.1142,  0.9394, -4.9549, -2.9091],
        [-4.8214, -2.2560,  1.1389, -2.4604],
        [-6.8211,  1.5629, -6.0239, -1.3743],
        [-6.5432,  1.8812, -5.5622, -1.3650],
        [-5.4042, -4.2397, -5.1384,  4.1908],
        [-5.1889,  1.6344, -5.4671, -2.0092],
        [-6.5744, -0.9033, -5.0044,  1.1617]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:35<01:02,  1.32it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05409855395555496, logits - tensor([[-5.6310,  1.8431, -4.9037, -2.0089],
        [-5.5576,  2.7143, -5.8788, -1.9050],
        [-5.4105, -3.8399, -3.7912,  4.3425],
        [-5.7869,  1.3603, -5.1127, -2.2931],
        [-5.7585,  3.3072, -5.9147, -1.7580],
        [-4.8007, -2.7681,  2.1054, -2.4784],
        [-5.7411,  2.1443, -5.7787, -2.0154],
        [-5.7098,  3.0169, -6.0926, -2.4566]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:36<01:02,  1.32it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19211608171463013, logits - tensor([[-6.2014, -2.9674,  1.4305, -2.2811],
        [-5.6770,  1.7918, -5.5850, -1.8057],
        [-5.8299,  2.0753, -6.0130, -2.3845],
        [-7.2549,  2.5446, -6.7674, -3.0206],
        [-7.8685,  2.4859, -5.7352, -3.0961],
        [-7.0500,  2.3204, -6.5961, -2.4819],
        [-5.3209,  1.9786, -4.9348, -2.4418],
        [-6.2060,  2.5193, -5.4618, -2.0344]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:37<01:01,  1.32it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03667442500591278, logits - tensor([[-6.0702, -3.2083,  2.2490, -2.6576],
        [-6.5598,  2.4172, -6.2485, -3.0706],
        [-6.0418,  2.3849, -5.9335, -2.1988],
        [-5.7946,  2.6992, -5.5502, -3.4462],
        [-6.8442, -2.5357, -5.2584,  2.2850],
        [-4.5509, -3.6040, -4.8360,  3.6785],
        [-7.0229,  2.9691, -6.8235, -3.3526],
        [-6.7037, -2.2436, -4.7540,  2.4716]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:38<01:00,  1.32it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3338549733161926, logits - tensor([[-5.8993,  2.6256, -5.4866, -1.4492],
        [-7.3572,  2.6852, -6.0675, -2.1607],
        [-6.5864,  2.3236, -6.0497, -2.6404],
        [-6.7608,  2.2309, -6.0848, -3.1076],
        [-6.0338,  0.9156, -3.6672, -1.7427],
        [-5.9356,  2.1675, -6.4408, -2.4081],
        [-6.5339,  0.3487, -4.9013,  0.2517],
        [-6.6169,  2.2271, -6.0612, -2.4854]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 210/289 [02:38<00:59,  1.32it/s]

Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3716621994972229, logits - tensor([[-5.9045,  2.7376, -6.2264, -2.0921],
        [-6.4328, -2.6463, -5.8283,  2.5410],
        [-6.3420,  1.7527, -5.4736, -2.3691],
        [-5.9371,  1.9313, -5.8818, -1.7362],
        [-6.5035, -0.9429, -5.0207,  0.6527],
        [-4.3509, -2.4862,  2.4017, -2.2197],
        [-5.0544, -3.0979,  1.7550, -2.4283],
        [-5.0749, -2.8405,  1.5279, -2.2462]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 211/289 [02:39<00:59,  1.31it/s]

Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0623818039894104, logits - tensor([[-5.2845, -3.1148,  1.7658, -2.8574],
        [-6.0260,  2.1136, -5.6374, -2.2118],
        [-7.4923, -1.7741, -5.9380,  2.0306],
        [-5.2686,  2.8161, -5.6798, -3.2184],
        [-6.0762,  2.1587, -5.3781, -2.8377],
        [-5.6583,  2.0729, -5.5926, -2.8364],
        [-6.0036,  2.4922, -5.0332, -3.0415],
        [-6.5017, -0.8670, -5.4472,  1.1567]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 212/289 [02:40<00:58,  1.32it/s]

Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32942885160446167, logits - tensor([[-5.9138,  2.3620, -5.9558, -2.3743],
        [-6.0342,  1.8019, -6.3957, -1.8200],
        [-5.9687, -3.5049,  1.9795, -3.4731],
        [-5.9501,  2.7107, -5.4183, -2.2597],
        [-5.9072,  2.5605, -5.9232, -2.2836],
        [-6.3143,  2.1492, -4.8798, -2.5766],
        [-6.9586, -0.9610, -4.9066,  0.6011],
        [-6.2079,  2.0411, -6.2418, -2.5551]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▎  | 213/289 [02:41<00:57,  1.32it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04744450002908707, logits - tensor([[-6.6837,  1.7456, -5.8160, -2.8029],
        [-5.3030, -3.5496, -4.8331,  3.3656],
        [-6.3434,  1.6355, -5.8336, -1.5709],
        [-6.3280,  2.5594, -6.7220, -3.1499],
        [-4.9287,  2.2634, -4.7890, -1.9250],
        [-5.6673,  1.8144, -5.9134, -2.5150],
        [-5.8917, -3.7026, -5.7861,  3.1735],
        [-6.4700,  2.3208, -6.8908, -2.5498]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 214/289 [02:41<00:56,  1.32it/s]

Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1035931259393692, logits - tensor([[-5.9143,  2.7288, -5.2415, -2.5248],
        [-5.5334,  2.0051, -4.8300, -2.7933],
        [-4.8421, -2.9063,  2.1873, -1.7746],
        [-5.4403, -0.9261, -1.4074, -0.8850],
        [-6.0266,  2.1354, -4.6199, -2.0032],
        [-5.7036,  2.2296, -4.5270, -1.4009],
        [-6.2750,  2.7494, -5.4480, -2.2055],
        [-5.9669,  2.9063, -6.4243, -3.1875]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 215/289 [02:42<00:55,  1.33it/s]

Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05893678963184357, logits - tensor([[-6.3060, -1.7333, -4.5149,  2.1009],
        [-5.9786, -1.8560, -4.3612,  1.4962],
        [-6.0629, -3.5702,  2.0178, -2.0991],
        [-5.4373, -3.3252,  2.0383, -2.9726],
        [-5.8525, -2.4465, -4.8591,  1.9655],
        [-5.6580,  1.9357, -5.7070, -2.3674],
        [-6.7676,  2.5784, -6.0080, -2.6234],
        [-6.8044,  2.7357, -6.1647, -2.4694]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▍  | 216/289 [02:43<00:54,  1.33it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3115832805633545, logits - tensor([[-5.8608,  2.0493, -6.1459, -2.0789],
        [-6.4327,  2.3028, -6.4215, -2.1666],
        [-5.6294,  2.3149, -5.3309, -1.7519],
        [-5.9687, -3.7000, -5.1207,  3.9773],
        [-6.4437,  2.4459, -5.7324, -1.3694],
        [-5.8236,  2.6108, -5.7584, -2.0018],
        [-6.9463,  2.6263, -5.8774, -2.5256],
        [-5.8938,  2.0929, -5.4991, -1.9824]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 217/289 [02:44<00:54,  1.32it/s]

Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1447456032037735, logits - tensor([[-5.6635,  2.0389, -5.9179, -2.0056],
        [-5.8838,  2.8783, -5.3395, -2.4405],
        [-5.8367, -3.7383, -5.9220,  2.7450],
        [-4.7340, -3.0228,  1.8373, -2.5030],
        [-5.9200, -1.4399, -4.6078,  1.1899],
        [-6.9163,  2.0036, -5.7278, -2.1794],
        [-5.7425, -1.8807, -4.9289,  2.1255],
        [-4.7766, -2.5301,  2.3158, -2.8831]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 218/289 [02:44<00:53,  1.32it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20697636902332306, logits - tensor([[-7.4009,  2.4383, -6.8724, -2.9663],
        [-5.1376,  2.6400, -5.1140, -2.7328],
        [-5.6795, -2.5115, -4.5944,  2.7610],
        [-5.6430, -3.1937,  1.4672, -3.1368],
        [-6.6505,  1.1440, -5.6838, -1.4332],
        [-5.9675,  2.1785, -5.1637, -2.7558],
        [-6.1459,  2.8902, -6.5738, -2.2156],
        [-7.4062,  2.7721, -6.5298, -2.6703]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:45<00:53,  1.31it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23925243318080902, logits - tensor([[-6.3371,  2.2500, -6.3570, -2.2382],
        [-5.4175, -1.8538, -3.4782,  1.3187],
        [-5.7424, -3.5009,  1.9033, -2.8775],
        [-6.0324,  1.3741, -4.8115, -2.0432],
        [-6.9901,  2.6229, -6.3590, -3.2375],
        [-5.7274,  2.6827, -5.8347, -2.7168],
        [-5.9535, -3.5680,  2.2400, -2.5298],
        [-5.3485, -2.3426,  0.9091, -1.7601]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:46<00:52,  1.31it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16730043292045593, logits - tensor([[-5.6360, -2.5257, -4.4458,  2.0717],
        [-5.8553,  2.9408, -6.4925, -1.9570],
        [-5.9335,  2.4435, -5.7074, -3.2888],
        [-5.8129,  2.1488, -5.2155, -2.2009],
        [-6.1504,  2.0302, -6.1894, -1.7952],
        [-6.3303,  3.1262, -5.8443, -2.1322],
        [-6.3081,  1.8215, -5.4320, -2.6693],
        [-5.8037,  3.0014, -4.9107, -2.8380]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:47<00:52,  1.30it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14291170239448547, logits - tensor([[-6.2084,  2.6008, -5.8545, -2.3038],
        [-5.4734,  2.3501, -5.8693, -2.2045],
        [-6.7268, -0.9463, -5.1983,  1.9348],
        [-5.1280, -3.0505, -3.9545,  3.0274],
        [-6.0303,  2.4897, -5.3165, -2.1850],
        [-5.2608,  2.6463, -5.7306, -2.4148],
        [-6.5199,  2.9652, -5.5477, -2.7721],
        [-6.1002,  2.1974, -6.3558, -1.8362]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:47<00:51,  1.31it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17545196413993835, logits - tensor([[-7.5820,  2.0898, -6.1584, -1.5744],
        [-5.4398, -2.5367,  1.0303, -2.7944],
        [-6.1203,  2.1156, -5.1465, -1.9893],
        [-5.4815,  2.3309, -5.9102, -2.5719],
        [-5.2659,  2.1412, -4.7456, -2.3077],
        [-7.2693,  1.8831, -4.9368, -2.6104],
        [-7.0561, -2.9143, -6.0055,  3.4054],
        [-6.7501,  1.7024, -5.4021, -2.5890]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:48<00:50,  1.31it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24123543500900269, logits - tensor([[-6.5684,  2.2488, -5.9265, -2.1642],
        [-6.9454,  2.7882, -6.4323, -2.0959],
        [-6.8702,  2.0489, -6.4035, -3.0274],
        [-5.7430,  1.9372, -5.0272, -2.2077],
        [-6.4323,  2.7906, -5.5914, -2.9574],
        [-5.8566, -3.0072,  2.2639, -2.5573],
        [-5.6467,  2.0038, -5.5704, -1.7672],
        [-6.8961,  3.3964, -6.7801, -2.7852]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:49<00:49,  1.32it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2305625081062317, logits - tensor([[-6.6097,  1.4383, -5.7591, -1.8665],
        [-5.8211,  2.4233, -5.8015, -2.2198],
        [-5.8988,  2.2664, -5.2642, -2.2327],
        [-7.3079, -3.0989,  1.2130, -1.0236],
        [-5.9760,  2.1654, -5.8689, -2.6101],
        [-5.0337, -3.1571,  1.7918, -2.2281],
        [-5.4339,  1.6324, -4.9663, -1.8149],
        [-6.4380,  0.4478, -5.3702, -0.4101]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:50<00:48,  1.32it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40407872200012207, logits - tensor([[-5.5242, -3.4086, -4.2142,  4.1635],
        [-6.2508,  2.3691, -5.6715, -2.3453],
        [-5.9414,  2.1425, -5.3978, -2.3846],
        [-6.4010,  1.2797, -4.3250, -2.0097],
        [-6.3478,  2.9053, -5.3836, -2.7165],
        [-6.0633,  1.9744, -4.6086, -1.0266],
        [-6.3873,  1.9698, -6.6696, -2.1140],
        [-5.5887, -3.2415,  1.6184, -3.2495]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:50<00:47,  1.32it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19484815001487732, logits - tensor([[-6.3836,  2.0707, -5.6186, -2.4768],
        [-4.8677,  1.3548, -4.2598, -1.6137],
        [-6.5132, -1.2877, -5.7378,  0.6081],
        [-5.7361,  1.6038, -5.3493, -1.9166],
        [-6.4847, -0.8396, -5.2168,  1.3895],
        [-6.4634,  2.4612, -5.6815, -2.9737],
        [-6.1865,  2.7694, -6.7576, -2.3012],
        [-5.6790,  2.4042, -5.7594, -2.3854]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:51<00:46,  1.32it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42058777809143066, logits - tensor([[-6.2312,  2.1534, -5.2831, -1.7364],
        [-6.7488,  2.7585, -5.8655, -2.4280],
        [-7.1251,  2.4886, -5.8475, -2.4734],
        [-6.8812,  1.9757, -6.4021, -2.3968],
        [-5.9967,  1.9702, -5.6677, -2.4820],
        [-6.3630,  1.4236, -4.5425, -1.6426],
        [-7.2809,  2.6559, -6.5959, -1.7751],
        [-5.7803,  1.3291, -6.1941, -1.6821]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:52<00:46,  1.33it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2356157898902893, logits - tensor([[-6.7607, -3.1621, -4.5857,  2.6941],
        [-5.7395,  1.8219, -5.8643, -1.3718],
        [-5.6118,  2.4575, -5.4534, -2.9360],
        [-5.8583,  2.4133, -4.8444, -1.9067],
        [-6.6280, -2.2680, -7.1973,  1.3069],
        [-6.0632,  2.2615, -5.5647, -1.2108],
        [-5.9598,  1.8697, -4.9812, -1.9202],
        [-6.0836,  2.1985, -5.5655, -1.7369]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:53<00:45,  1.33it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24333816766738892, logits - tensor([[-7.1349,  1.8749, -5.9705, -1.3780],
        [-5.9492, -3.3408,  2.4965, -2.0286],
        [-6.1537, -1.4280, -6.0041,  2.0859],
        [-7.0092,  2.3363, -5.9169, -1.7763],
        [-6.4203,  1.2843, -4.8662, -0.7357],
        [-6.3293, -3.1858,  2.1074, -2.8155],
        [-6.4196,  2.1274, -5.2211, -1.6292],
        [-6.1329,  2.4624, -5.4858, -2.6692]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:54<00:44,  1.32it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07261216640472412, logits - tensor([[-5.8506,  2.5570, -5.5936, -1.5997],
        [-6.4825,  1.3771, -5.5852, -1.8483],
        [-5.4598, -3.6767,  2.1710, -2.7492],
        [-7.0055,  1.7387, -5.8546, -1.8570],
        [-6.7708,  1.6069, -5.6675, -1.2784],
        [-7.0878,  1.7296, -6.0762, -2.0064],
        [-5.4312,  2.0764, -5.6874, -2.2174],
        [-6.0904,  2.2970, -6.5395, -2.0824]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:54<00:43,  1.32it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.43800514936447144, logits - tensor([[-6.3178,  1.5470, -6.0339, -2.1578],
        [-6.6166,  2.2695, -5.4707, -2.1529],
        [-6.0437,  1.3868, -5.0839, -2.2142],
        [-6.3202, -0.6423, -5.0841, -0.0370],
        [-6.3233,  1.0065, -6.1213, -2.2771],
        [-5.7773,  1.5165, -4.5547, -1.4760],
        [-6.3700,  1.5469, -5.6662, -1.9325],
        [-7.3115, -2.3374, -5.3546,  2.8265]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:55<00:43,  1.33it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19049040973186493, logits - tensor([[-6.3064,  1.9394, -5.4444, -2.0996],
        [-6.7225, -1.6732, -5.2928,  2.1422],
        [-6.2084,  1.5571, -5.7711, -2.0811],
        [-6.3188,  2.1480, -6.0728, -2.0967],
        [-5.5117,  2.0757, -5.3178, -1.7822],
        [-6.6166,  1.3501, -6.2068, -1.1649],
        [-6.2133,  1.6408, -5.1732, -2.3350],
        [-4.5768, -3.7954, -4.2749,  3.2859]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:56<00:42,  1.33it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2352330982685089, logits - tensor([[-6.3784,  0.7862, -5.3256, -1.0847],
        [-5.9426, -3.3525,  2.7638, -2.7151],
        [-7.6007,  0.2522, -5.1415, -0.7917],
        [-6.6099,  1.1458, -5.4305, -1.7448],
        [-6.5645,  2.3229, -6.0149, -2.0688],
        [-6.6698,  0.7957, -5.5007, -1.5777],
        [-6.3860, -4.5504,  2.0509, -2.6537],
        [-5.9434,  1.8592, -6.1278, -1.3662]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:57<00:41,  1.33it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4821532368659973, logits - tensor([[-6.3184,  2.5102, -5.8388, -1.9087],
        [-5.5539,  1.4429, -5.3939, -1.4025],
        [-6.3111,  1.8946, -5.6324, -1.8031],
        [-7.0452, -0.1981, -6.3254,  1.1484],
        [-5.9599,  0.9500, -5.6712, -1.6818],
        [-6.7053, -3.5273,  3.1128, -2.5911],
        [-5.5620, -3.0970,  2.0347, -2.7898],
        [-5.5725, -3.8558,  3.0424, -3.2295]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:57<00:40,  1.33it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1915258914232254, logits - tensor([[-6.7286,  2.3111, -6.6608, -0.9363],
        [-6.7561,  1.8327, -5.2420, -1.7958],
        [-6.0893,  1.8213, -5.6299, -1.3663],
        [-6.0087,  1.9215, -5.0935, -1.8435],
        [-6.3159,  1.4583, -5.9440, -1.8561],
        [-6.4613,  1.7595, -5.7999, -1.7870],
        [-6.5289,  1.1137, -4.9893, -1.5711],
        [-5.5664,  1.3856, -5.9361, -1.8793]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 236/289 [02:58<00:39,  1.33it/s]

Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2618269622325897, logits - tensor([[-6.1721,  1.1570, -5.8901, -1.1842],
        [-5.6174, -2.6001,  1.2970, -2.3822],
        [-6.2593,  1.4425, -6.0255, -1.5202],
        [-5.5812,  1.4745, -5.7368, -1.7129],
        [-5.8481,  0.5210, -5.1560, -0.8156],
        [-7.1201,  1.4572, -5.7784, -0.9466],
        [-5.8194,  0.1336, -5.0373, -0.9346],
        [-6.6549,  1.1172, -5.3913, -1.6301]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 237/289 [02:59<00:39,  1.32it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09815892577171326, logits - tensor([[-6.2886,  1.0610, -5.0694, -1.5009],
        [-7.6941,  1.9293, -7.0943, -1.7630],
        [-6.5485,  1.2389, -5.8431, -1.5682],
        [-5.8361, -2.0805, -5.4274,  1.3710],
        [-5.6174,  2.4542, -5.2503, -2.4274],
        [-6.0658, -3.8271, -4.7270,  3.2523],
        [-6.5265,  0.5750, -5.7423, -1.0811],
        [-6.4911,  1.3540, -5.5755, -0.9922]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 238/289 [03:00<00:38,  1.32it/s]

Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.173213392496109, logits - tensor([[-6.6375,  1.5759, -5.5343, -0.8595],
        [-6.1042,  1.2596, -5.2605, -0.8011],
        [-7.1297,  1.7646, -6.5507, -1.9667],
        [-7.1029,  1.0450, -5.8453, -1.7460],
        [-6.1663,  1.7376, -5.8306, -1.8208],
        [-5.5249, -3.2134,  2.0759, -2.4659],
        [-5.8812,  1.0072, -5.1204, -0.8559],
        [-5.9247,  2.1729, -5.0291, -1.5117]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 239/289 [03:00<00:38,  1.32it/s]

Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3845699429512024, logits - tensor([[-7.7419,  1.1099, -6.4806, -1.1075],
        [-5.5064,  2.7706, -5.2987, -1.6701],
        [-5.9267, -4.1044,  2.3323, -3.3202],
        [-6.3661,  1.0726, -5.3869, -1.0891],
        [-5.4846, -3.5253,  2.7692, -2.5609],
        [-6.2939, -3.3646, -4.6462,  2.9724],
        [-4.7672, -2.6457,  1.9066, -1.9157],
        [-5.5385,  0.9907, -5.8524, -0.9040]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 240/289 [03:01<00:37,  1.31it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15593455731868744, logits - tensor([[-6.5858,  1.2680, -5.4678, -1.6260],
        [-5.6835, -2.9584,  0.8987, -2.2085],
        [-7.0106,  1.1507, -6.2148, -1.5583],
        [-6.8645, -2.8150,  2.7709, -2.7665],
        [-5.3625,  0.8353, -4.8644, -0.9946],
        [-5.5687,  1.7645, -5.5194, -1.0701],
        [-7.4512,  1.8554, -7.0613, -1.3059],
        [-6.2697,  0.5359, -5.6757, -0.3283]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:02<00:36,  1.31it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2659967839717865, logits - tensor([[-6.7996,  1.1881, -5.4340, -1.4429],
        [-6.3407,  0.9996, -5.4168, -1.2142],
        [-6.5007,  1.1492, -5.4852, -1.7033],
        [-6.5361, -0.9686, -5.0476,  1.3668],
        [-6.3725,  1.3361, -5.7319, -2.4049],
        [-5.5908, -3.1594, -4.6560,  3.6002],
        [-5.6704,  1.5871, -6.1259, -1.7030],
        [-6.5211, -2.6899, -4.9072,  2.2537]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:03<00:35,  1.31it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0958101749420166, logits - tensor([[-7.0171,  1.4330, -5.6652, -0.8020],
        [-5.2266, -2.7350, -4.0955,  2.4172],
        [-4.8917, -3.2286, -4.5865,  3.5338],
        [-6.1558,  2.0341, -6.5812, -2.2937],
        [-6.4376,  0.9120, -4.8962, -1.0320],
        [-6.0667,  1.2342, -5.2890, -1.6027],
        [-6.9756,  1.4559, -6.1505, -1.7222],
        [-5.1920,  1.5229, -5.1346, -0.9911]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:03<00:34,  1.32it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06995929777622223, logits - tensor([[-5.2133, -2.8829,  1.9405, -2.0653],
        [-6.1011,  2.1265, -6.3733, -2.5635],
        [-6.3173, -3.3971, -5.3053,  3.8186],
        [-6.7896,  1.6097, -5.0122, -0.2462],
        [-5.5602, -3.1577,  2.6757, -2.7829],
        [-7.4791,  2.0529, -6.6378, -1.8565],
        [-5.6760,  1.5165, -5.6134, -1.4074],
        [-5.6850, -3.8652, -4.7636,  4.7758]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:04<00:34,  1.32it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.075586698949337, logits - tensor([[-6.1816, -3.5425,  2.1477, -2.7478],
        [-4.6556, -2.9007,  2.1426, -1.8390],
        [-6.5353,  0.7816, -5.6518, -1.2944],
        [-6.9165, -2.8264, -5.6842,  3.2360],
        [-6.2140,  1.8157, -5.9869, -0.9291],
        [-5.5161, -3.0854, -4.2468,  3.1991],
        [-6.1918,  2.4818, -5.9298, -1.9009],
        [-5.5869,  1.7055, -5.2847, -1.6316]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:05<00:33,  1.32it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08578497171401978, logits - tensor([[-5.7758,  1.7876, -4.8222, -1.2651],
        [-5.9283, -4.2513,  2.0646, -1.8682],
        [-6.3319, -2.0734, -4.6895,  2.9296],
        [-6.5063,  1.3078, -6.0405, -1.4740],
        [-7.0835,  2.1371, -6.8788, -0.8773],
        [-6.5426, -3.3194, -4.9502,  2.2495],
        [-5.4172,  0.7480, -5.5087, -2.2684],
        [-6.7838,  1.2601, -5.9207, -2.6752]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:06<00:32,  1.32it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1635882705450058, logits - tensor([[-6.1483,  1.1799, -6.2359, -1.3478],
        [-4.4286, -3.5507,  2.2178, -1.9820],
        [-5.6142,  1.7733, -4.7848, -1.4059],
        [-5.4165,  0.6569, -4.3954, -1.1793],
        [-6.6654,  2.0107, -6.8476, -2.0849],
        [-5.3116, -4.1375, -5.4708,  4.3792],
        [-5.8560, -3.8954, -5.6526,  3.8025],
        [-6.8595,  1.0550, -5.2774, -1.4808]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:06<00:31,  1.32it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3434993624687195, logits - tensor([[-5.1939, -2.6540,  1.8788, -2.1665],
        [-7.2195, -0.7825, -5.8665,  0.3123],
        [-7.1491,  1.0785, -6.2082, -0.7479],
        [-6.4228,  1.1104, -6.0679, -0.8101],
        [-4.7664, -3.3453, -4.6473,  3.2118],
        [-7.0814,  1.0908, -5.8472, -1.0084],
        [-6.2230,  1.6988, -6.3168, -1.7162],
        [-6.2322, -2.4863, -4.8951,  3.2925]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:07<00:31,  1.31it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23070457577705383, logits - tensor([[-6.6537, -0.0739, -5.0331,  0.0264],
        [-6.3651,  1.9641, -6.5468, -1.7105],
        [-6.5264,  1.8433, -6.1912, -0.3022],
        [-5.2662, -2.7603, -5.3267,  2.1720],
        [-5.6675,  1.8877, -5.4263, -1.5089],
        [-5.9983,  1.5023, -6.7871, -1.8738],
        [-5.6928,  0.0290, -4.4561,  0.2551],
        [-5.5613,  1.5601, -5.1209, -1.5686]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:08<00:30,  1.31it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16996294260025024, logits - tensor([[-5.2570,  0.9030, -4.8368, -1.0576],
        [-7.1340,  1.5528, -6.3165, -0.5003],
        [-7.2398,  1.5789, -6.1997, -1.2700],
        [-6.9393,  0.9673, -5.1077, -0.8723],
        [-6.2194, -4.0494, -4.5926,  3.4674],
        [-6.1537,  1.7018, -6.2027, -1.5779],
        [-6.6513,  0.9158, -5.5072, -1.7295],
        [-7.2566, -4.4965, -5.1805,  4.8827]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:09<00:29,  1.32it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17104433476924896, logits - tensor([[-6.7192,  1.9045, -6.0531, -0.8216],
        [-7.3561,  1.5667, -6.3553, -1.7999],
        [-6.6296,  1.4192, -5.8134, -1.1020],
        [-6.1078, -4.0277,  2.0580, -3.0492],
        [-4.5514, -2.5252,  1.2984, -2.0969],
        [-6.8509,  1.6814, -5.1322, -1.6698],
        [-5.8860, -3.5673,  2.2999, -2.3563],
        [-6.4079,  1.4431, -5.3997, -1.6446]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:09<00:28,  1.32it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31425389647483826, logits - tensor([[-5.0778, -2.7064,  1.7158, -1.9149],
        [-6.9697, -1.5662, -5.2562,  1.9798],
        [-7.7610, -1.2794, -5.0761,  0.5945],
        [-6.9229, -3.7990, -0.7011,  0.1213],
        [-6.0716, -3.3175,  1.6003, -2.1579],
        [-6.0239,  0.8051, -6.3743, -1.1576],
        [-5.6641,  1.8009, -6.4866, -2.1769],
        [-6.0091, -4.6137, -5.1071,  4.3539]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:10<00:28,  1.32it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10562414675951004, logits - tensor([[-5.5494,  1.9517, -5.5393, -1.9670],
        [-6.5248, -3.6557, -5.7730,  3.4734],
        [-6.4514,  2.5301, -5.7466, -2.1514],
        [-5.4724, -3.7027,  2.2603, -2.8152],
        [-6.8690,  0.9458, -5.4637,  0.0797],
        [-6.7016,  2.0253, -6.1463, -2.1960],
        [-6.6412,  2.1420, -6.6054, -2.0388],
        [-6.8223,  2.3572, -6.5073, -1.6526]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 253/289 [03:11<00:27,  1.32it/s]

Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23672941327095032, logits - tensor([[-5.4965, -3.1290,  1.9822, -1.9322],
        [-5.7648,  1.3583, -5.3840, -2.3516],
        [-6.4442,  0.4909, -4.3659, -0.3919],
        [-5.4341,  1.6250, -5.3184, -1.1614],
        [-5.9633, -2.5452,  0.1528, -1.7871],
        [-6.9470, -3.1646, -4.8345,  3.1464],
        [-7.2490,  1.7858, -7.2532, -1.1811],
        [-5.9394,  1.2578, -6.4816, -1.2007]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 254/289 [03:12<00:26,  1.32it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2507251501083374, logits - tensor([[-5.8923,  1.2893, -5.9418, -1.8128],
        [-5.9526,  1.2665, -4.7649, -1.9868],
        [-6.3438,  2.0950, -6.3592, -2.3868],
        [-6.8619,  1.5365, -6.2472, -1.5910],
        [-5.2234, -2.6389,  1.5905, -1.7873],
        [-5.8494, -1.3519, -2.8943, -0.3656],
        [-5.9314, -3.4408,  1.8285, -2.7954],
        [-6.3622,  2.5261, -6.2582, -2.2138]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:12<00:25,  1.32it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16391968727111816, logits - tensor([[-7.1871,  2.7083, -6.8510, -2.5610],
        [-6.5463,  2.2599, -5.8008, -2.3641],
        [-6.6197,  0.8095, -4.5024, -1.2923],
        [-7.0942,  1.3979, -6.1684, -1.7522],
        [-5.7270, -3.1074,  2.0107, -2.6411],
        [-6.5675,  0.9218, -5.9334, -0.2669],
        [-6.6248,  1.5874, -5.5670, -1.3737],
        [-6.2673,  2.6938, -6.4414, -1.7010]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:13<00:25,  1.32it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36382052302360535, logits - tensor([[-6.3437,  1.5052, -5.5568, -1.9886],
        [-6.1077,  1.1038, -5.2295, -1.0293],
        [-6.7749,  2.5613, -6.6814, -1.2200],
        [-6.2802,  0.0354, -5.0322, -0.8666],
        [-6.8918,  1.3357, -6.8072, -1.0530],
        [-5.6834,  1.3565, -5.0215, -1.1631],
        [-6.9913,  1.3574, -5.8765, -1.3479],
        [-6.2315, -2.8032,  2.3819, -2.6907]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:14<00:24,  1.32it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 89%|████████▉ | 258/289 [03:15<00:23,  1.32it/s]

loss - 0.39779040217399597, logits - tensor([[-6.6116,  1.6232, -5.6662, -1.6241],
        [-7.0890,  1.2700, -5.4530, -0.3233],
        [-6.9227,  1.3682, -5.9694, -1.4865],
        [-6.6265,  1.7443, -6.5768, -1.6414],
        [-5.3170, -3.1464,  1.5784, -1.9512],
        [-6.2189, -2.9016, -5.6955,  3.4558],
        [-5.6853,  0.7304, -4.4976, -0.9151],
        [-6.9728,  1.6052, -5.7246, -1.5039]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08125558495521545, logits - tensor([[-6.8039,  0.8934, -5.8449, -0.5955],
        [-6.6885,  2.0891, -6.2415, -3.2734],
        [-7.0643, -2.5517, -5.3502,  2.0945],
        [-6.1111,  0.7574, -5.1584, -1.7407],
        [-6.4067,  2.6015, -6.0089, -1.5679],
        [-6.3295, -2.6956, -5.0246,  2.0818],
        [-7.1467, -3.0371, -5.1505,  1.4240],
        [-6.1336, -2.4944, -5.0900,  2

 90%|████████▉ | 259/289 [03:15<00:22,  1.32it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32999712228775024, logits - tensor([[-7.4177,  1.7094, -6.3824, -1.0680],
        [-5.1814, -2.3311,  1.6107, -1.9764],
        [-6.0388, -3.2153,  1.4843, -2.3769],
        [-7.1743,  1.8507, -6.0237, -1.8839],
        [-4.7182, -3.5560,  1.9079, -1.9225],
        [-6.6683,  0.7238, -5.8991, -1.2308],
        [-5.4873,  1.5554, -5.8353, -1.4552],
        [-6.0093,  2.1514, -5.6435, -2.0589]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 260/289 [03:16<00:21,  1.32it/s]

Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4268297553062439, logits - tensor([[-5.4996, -2.8122,  1.9181, -2.2290],
        [-6.5130,  1.5981, -6.0980, -2.0499],
        [-6.0100,  1.4743, -6.2369, -2.0894],
        [-7.6782, -1.8240, -5.2585,  2.1213],
        [-6.5001,  2.0758, -6.2860, -2.4438],
        [-5.9663,  1.8450, -5.0549, -1.4915],
        [-6.9118,  1.6223, -6.0136, -0.6234],
        [-6.2033,  1.0205, -4.6866, -1.2106]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|█████████ | 261/289 [03:17<00:21,  1.32it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09426098316907883, logits - tensor([[-6.1043,  1.1068, -5.0826, -1.3608],
        [-7.1216,  1.1321, -5.3649, -1.3269],
        [-6.6591,  2.9192, -6.5093, -2.1933],
        [-6.2547, -4.0339,  1.7440, -3.0038],
        [-5.8000, -3.3745,  1.8166, -2.8208],
        [-5.4567, -3.3374,  1.1560, -2.8826],
        [-6.6193,  1.7457, -5.6773, -0.5699],
        [-5.6550, -3.0704,  1.4211, -2.3365]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:18<00:20,  1.32it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08244270086288452, logits - tensor([[-5.4116,  2.0623, -5.8814, -1.9584],
        [-6.4273,  1.7113, -6.7300, -1.9172],
        [-6.1211,  1.0835, -4.7051, -1.5194],
        [-5.9581,  1.7346, -6.3338, -1.4637],
        [-5.7443, -3.4904, -5.0595,  3.3813],
        [-5.3264,  1.1020, -4.0775, -1.5338],
        [-6.4042,  2.4695, -6.1629, -2.4605],
        [-6.4150, -1.6093, -5.1870,  1.1955]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:19<00:19,  1.32it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25166261196136475, logits - tensor([[-4.6960, -2.9136,  0.9719, -1.5720],
        [-7.9760,  1.6206, -6.5560, -1.2594],
        [-5.9131, -3.3096, -5.1622,  3.2631],
        [-6.5469,  1.9287, -6.9994, -1.4663],
        [-4.8202, -3.7246,  2.2026, -2.3462],
        [-6.4269,  0.9849, -6.1817, -1.6799],
        [-7.1196,  1.6416, -6.2389, -1.5384],
        [-6.7348,  1.1188, -5.8059, -1.1921]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████▏| 264/289 [03:19<00:18,  1.33it/s]

Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2961083948612213, logits - tensor([[-5.5668,  2.8201, -6.3687, -1.5785],
        [-6.7427,  0.7251, -5.2461, -1.0304],
        [-6.6678,  1.3868, -5.9613, -1.2616],
        [-6.6437,  1.8413, -5.6174, -2.3474],
        [-6.3079, -3.1966, -5.4343,  2.5978],
        [-6.6613,  1.4289, -6.3567, -1.9273],
        [-5.5198,  0.9812, -4.5399, -1.2088],
        [-7.3694,  3.0154, -7.4146, -2.3096]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 265/289 [03:20<00:18,  1.33it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2223350703716278, logits - tensor([[-7.3997,  1.0698, -5.0045, -1.6182],
        [-6.5982,  1.6103, -6.2803, -1.6175],
        [-5.9157, -2.5295, -4.4670,  2.1899],
        [-6.7281,  1.4988, -6.5341, -1.8575],
        [-7.4328,  2.7833, -7.2556, -2.7567],
        [-6.5333,  1.7192, -5.8809, -1.5538],
        [-6.7135,  2.6359, -6.1373, -0.9722],
        [-5.4728, -1.2585, -0.5768, -1.5334]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 266/289 [03:21<00:17,  1.32it/s]

Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1487213373184204, logits - tensor([[-6.0560,  0.7387, -5.1914, -0.9382],
        [-6.9128,  1.5905, -5.4282, -1.9995],
        [-6.5067,  2.0134, -6.7183, -2.0793],
        [-6.2789,  2.0214, -6.5716, -1.9189],
        [-6.2165, -1.9738, -4.6430,  3.2325],
        [-6.9298,  3.2149, -6.9680, -2.2144],
        [-6.8322,  1.0222, -5.0265, -1.2024],
        [-6.2755, -3.1472, -4.7177,  3.8201]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 267/289 [03:22<00:16,  1.32it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15848329663276672, logits - tensor([[-6.8176,  0.3564, -5.2723, -1.3343],
        [-6.1662,  0.0251, -2.0492, -1.6525],
        [-7.0733,  1.2362, -6.2680, -1.5360],
        [-6.0715, -1.7354, -5.3694,  2.0285],
        [-6.3946,  1.8613, -6.0255, -2.9838],
        [-5.1874, -3.0432,  2.6869, -1.7055],
        [-5.9316, -3.8422,  3.0372, -2.2949],
        [-6.3507,  2.5935, -6.0385, -1.7563]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:22<00:15,  1.33it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08734256029129028, logits - tensor([[-6.2289,  1.7219, -5.7070, -1.7246],
        [-5.2310, -2.4174,  1.5133, -1.9097],
        [-7.1879,  1.3512, -6.9718, -1.6851],
        [-6.4565, -3.9270,  2.0279, -3.0114],
        [-6.0973,  1.6721, -5.3801, -1.1010],
        [-5.6160, -3.4765, -4.0834,  4.0181],
        [-6.5018,  1.4226, -5.6348, -2.2279],
        [-6.7872,  0.7530, -5.2631, -1.5745]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:23<00:15,  1.33it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18101917207241058, logits - tensor([[-7.2289,  0.9763, -6.3517, -1.1202],
        [-6.8166, -2.7201, -5.2975,  2.6222],
        [-7.0640,  1.2916, -6.9536, -2.0128],
        [-6.6139,  1.9859, -7.5896, -1.5827],
        [-6.8533,  0.7825, -6.1789, -1.1729],
        [-6.2323,  1.2794, -5.4624, -1.4154],
        [-7.5975,  1.9653, -7.7119, -1.8411],
        [-6.6175,  1.8651, -6.2490, -2.0504]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 270/289 [03:24<00:14,  1.33it/s]

Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25197839736938477, logits - tensor([[-6.4803,  1.5282, -6.6629, -1.1702],
        [-4.6553, -2.6165,  1.8482, -1.0709],
        [-7.9010,  0.2878, -5.5126,  0.0482],
        [-6.3064,  1.4506, -6.1668, -1.4568],
        [-6.1226,  1.4871, -6.2026, -2.1045],
        [-6.6845,  1.0442, -6.5982, -2.2670],
        [-5.1370, -3.2904,  1.3118, -2.2157],
        [-7.0769,  1.7693, -6.4678, -1.1869]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 271/289 [03:25<00:13,  1.32it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4543222486972809, logits - tensor([[-6.3925,  2.2794, -6.6289, -1.3707],
        [-6.2189,  1.2255, -5.6561, -1.6510],
        [-6.0868, -3.0668,  1.7031, -2.3532],
        [-7.0492,  2.6189, -7.2883, -1.9658],
        [-6.7267,  1.5257, -5.7550, -1.5978],
        [-6.1837,  1.5929, -5.5106, -1.3378],
        [-6.5668,  1.0297, -6.0709, -1.0684],
        [-5.7670, -3.4739,  1.4464, -2.7545]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 272/289 [03:25<00:12,  1.32it/s]

Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4592638611793518, logits - tensor([[-7.3668,  2.4357, -7.1930, -1.7742],
        [-6.2081,  0.7566, -5.0105, -0.3425],
        [-5.3647, -3.0236,  1.7527, -2.3362],
        [-7.9972,  1.7081, -7.6558, -1.2500],
        [-6.8076, -2.2847, -4.8410,  1.4117],
        [-6.0257,  1.8630, -5.9377, -1.5098],
        [-6.1754,  1.5277, -5.6199, -0.6294],
        [-7.4051,  2.8293, -6.9289, -1.8034]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 273/289 [03:26<00:12,  1.32it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24454256892204285, logits - tensor([[-6.5375,  1.3261, -5.6963, -0.8837],
        [-5.8343, -3.3289,  1.6347, -2.0710],
        [-5.6067,  0.9732, -5.0435, -0.9962],
        [-6.8622, -0.5274, -4.5038, -0.0220],
        [-6.0854,  1.5357, -5.5924, -0.7110],
        [-6.1094, -3.3311, -4.6816,  3.6951],
        [-6.9278,  0.8313, -5.1124, -0.1361],
        [-6.0531,  0.6473, -4.3975, -0.3027]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▍| 274/289 [03:27<00:11,  1.32it/s]

Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3317198157310486, logits - tensor([[-7.2889,  0.3051, -5.3381, -0.1912],
        [-6.6847, -3.9348,  2.3218, -2.7123],
        [-5.4891, -2.9656,  1.6899, -1.5592],
        [-5.7400,  1.0139, -4.9965, -0.9742],
        [-5.9924,  0.7924, -6.1591,  0.5212],
        [-7.2024,  1.2619, -5.2358, -0.8734],
        [-6.6716,  1.2936, -6.0258, -1.5183],
        [-6.0611,  0.0304, -4.4857, -0.6501]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▌| 275/289 [03:28<00:10,  1.32it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1975349634885788, logits - tensor([[-6.8997,  1.5035, -6.6542, -1.4138],
        [-7.0261,  2.5757, -7.2091, -2.8155],
        [-6.5657,  2.5858, -6.6152, -2.4864],
        [-7.5565,  1.1181, -5.1881, -0.5140],
        [-7.1184,  1.5289, -6.6082, -1.1372],
        [-7.4185, -1.7295, -4.7115,  1.0515],
        [-5.0423, -2.4679,  1.8502, -2.1020],
        [-6.1951, -2.3477,  0.9741, -1.3766]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:28<00:09,  1.32it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20267857611179352, logits - tensor([[-6.1106, -3.8223,  2.2487, -2.5053],
        [-7.6212, -0.5473, -4.5911,  0.9563],
        [-5.4967, -3.6459,  2.1701, -1.5988],
        [-6.6891,  1.0489, -7.0508, -0.8654],
        [-7.0674, -3.2769, -5.3515,  3.7537],
        [-5.9827,  2.1020, -6.4156, -1.6363],
        [-7.4501,  1.0814, -6.7128, -0.6584],
        [-5.6572, -3.1384,  2.6640, -2.4328]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:29<00:09,  1.32it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4184289574623108, logits - tensor([[-5.3023, -2.5365,  1.6557, -2.1753],
        [-6.3568,  0.1710, -5.0018, -0.0614],
        [-5.5744, -2.9743,  1.9075, -2.1007],
        [-6.6727,  1.3817, -6.1988, -1.3056],
        [-6.6888, -0.2634, -5.9155,  0.3457],
        [-7.7963,  1.0180, -5.7784, -1.0390],
        [-6.0071,  1.1883, -5.4655, -1.2792],
        [-8.0989,  1.4490, -6.7926, -2.4630]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:30<00:08,  1.33it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 97%|█████████▋| 279/289 [03:31<00:07,  1.32it/s]

loss - 0.2844076454639435, logits - tensor([[-6.7456,  1.5416, -6.0008, -0.5659],
        [-5.1693,  1.7817, -5.6061, -1.8135],
        [-6.5500,  1.2057, -6.6639, -1.0866],
        [-6.3704,  0.7356, -6.1418, -0.8679],
        [-7.3664,  0.3392, -5.2331, -0.3168],
        [-6.8935,  1.4025, -6.3222, -1.9645],
        [-7.7783, -0.9938, -5.7004,  1.0577],
        [-5.8219,  1.6417, -5.5830, -1.1302]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16308140754699707, logits - tensor([[-6.3683, -4.1604, -4.8821,  3.7401],
        [-5.5102, -2.6127,  1.9657, -1.8927],
        [-5.7525, -3.9376, -5.5015,  4.1792],
        [-6.6889,  0.3309, -4.6934, -1.0375],
        [-6.0620, -3.0535,  1.4546, -2.5599],
        [-7.0053,  2.4049, -6.5785, -1.8958],
        [-6.8922,  1.9147, -6.0188, -1.2407],
        [-6.8751,  0.1536, -5.9490, -0.

 97%|█████████▋| 280/289 [03:31<00:06,  1.32it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.49193626642227173, logits - tensor([[-6.1113,  0.9581, -5.5843, -1.3416],
        [-5.2736, -3.6644, -4.3754,  3.6705],
        [-4.8227, -2.1654,  1.4750, -1.9085],
        [-6.8558,  1.9630, -7.2780, -1.0441],
        [-5.8749,  1.3340, -4.8698, -1.7676],
        [-6.4772, -4.0538,  2.2770, -2.2852],
        [-6.8455,  1.3510, -5.3738, -0.4940],
        [-6.0834, -3.0468,  1.7077, -2.1608]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 281/289 [03:32<00:06,  1.32it/s]

Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13651630282402039, logits - tensor([[-5.8901,  1.0716, -4.9362, -0.7333],
        [-6.9985,  1.6633, -5.9912, -1.7678],
        [-5.3298,  2.3371, -5.7123, -1.7070],
        [-5.6915, -0.0491, -4.9279, -0.5449],
        [-6.9100, -2.7132, -4.9984,  2.9939],
        [-4.9800, -2.6367,  0.9519, -1.9998],
        [-5.7496,  1.3411, -5.6190, -0.2577],
        [-6.0129,  1.6704, -5.6707, -1.2858]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 282/289 [03:33<00:05,  1.33it/s]

Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 98%|█████████▊| 283/289 [03:34<00:04,  1.32it/s]

loss - 0.26395750045776367, logits - tensor([[-5.7754,  1.5646, -5.4244, -2.0421],
        [-5.6775,  0.7574, -5.6338, -0.9049],
        [-6.6135, -2.8212, -5.7435,  1.4392],
        [-4.9970, -2.5217,  1.4137, -1.8412],
        [-5.8584,  0.0868, -5.4667,  0.3156],
        [-7.0253, -0.7629, -5.1724,  1.0567],
        [-5.7566,  1.8541, -5.9776, -1.1330],
        [-7.5417,  1.5155, -6.2723, -0.8702]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.252409428358078, logits - tensor([[-7.5645,  1.0741, -6.5324, -1.2467],
        [-6.9943, -2.6591, -4.8546,  2.6091],
        [-5.7872,  1.8599, -5.8412, -2.1524],
        [-6.6699,  1.8495, -6.2771, -1.4265],
        [-5.9012, -2.8913,  0.9934, -1.2479],
        [-7.2984,  2.0743, -6.1162, -1.5149],
        [-6.6159,  1.6358, -5.7430, -0.9972],
        [-6.6130, -0.9299, -5.5923,  1.1

 98%|█████████▊| 284/289 [03:34<00:03,  1.32it/s]

Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2740924060344696, logits - tensor([[-5.5986,  0.6978, -5.0464, -0.9302],
        [-4.7449, -2.3622,  1.8144, -1.5157],
        [-6.5117,  1.7208, -6.5935, -1.3192],
        [-5.6399,  1.6937, -5.3966, -1.2099],
        [-6.3567,  1.3897, -5.5371, -1.4220],
        [-7.5062, -0.9633, -5.2186,  0.3577],
        [-6.0938,  0.6873, -5.3979, -0.6160],
        [-5.3414, -3.1425,  1.4420, -1.5509]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▊| 285/289 [03:35<00:03,  1.32it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2656286358833313, logits - tensor([[-6.8151,  1.6810, -7.0290, -1.8960],
        [-6.8203,  1.5443, -6.6356, -1.3007],
        [-6.4609,  1.5300, -5.8069, -2.1953],
        [-6.8297, -2.6864, -4.3640,  2.4385],
        [-7.3846,  1.8796, -6.7918, -1.4250],
        [-7.5729, -2.5535, -5.6112,  2.9455],
        [-5.3888,  1.7877, -6.1167, -1.5507],
        [-6.6657,  0.2827, -5.6091, -0.7968]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:36<00:02,  1.33it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21160557866096497, logits - tensor([[-4.4647, -3.1044,  1.4697, -1.7279],
        [-6.3750,  1.8990, -6.8771, -1.1770],
        [-4.8764, -2.9516,  1.8552, -2.6065],
        [-6.9970,  1.1545, -5.7781, -1.0162],
        [-6.1192,  0.8781, -5.1403, -2.0110],
        [-6.7712,  1.9475, -6.2146, -1.7826],
        [-7.0098,  1.6448, -6.8573, -0.9939],
        [-5.9815,  0.6765, -5.4845, -1.0032]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:37<00:01,  1.33it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23663800954818726, logits - tensor([[-6.1625,  0.3278, -5.1405, -1.4622],
        [-7.5588,  2.4388, -6.4673, -2.2883],
        [-7.6640,  1.2578, -7.0595, -1.2961],
        [-6.4502,  2.4003, -6.5277, -2.2741],
        [-7.0093,  1.0744, -6.0681, -1.3467],
        [-5.5908, -3.1052,  2.1326, -1.9954],
        [-7.1771, -1.0840, -5.9985,  0.3392],
        [-6.6765, -0.1354, -5.6625,  0.5470]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:37<00:00,  1.33it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5069805383682251, logits - tensor([[-5.9475,  1.6240, -5.3492, -1.4675],
        [-6.4094,  1.6259, -7.1302, -1.8913],
        [-6.3870, -1.4105, -4.3562,  2.6090],
        [-5.2253,  1.3096, -5.5175, -2.3501]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:38<00:00,  1.32it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:57,  3.34it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:51,  3.74it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:49,  3.87it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:50,  3.79it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:49,  3.84it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:47,  3.92it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:47,  3.97it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.90it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:47,  3.91it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  3.97it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:46,  3.98it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:46,  3.94it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  3.94it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:45,  3.99it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:44,  3.99it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:45,  3.95it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:44,  3.95it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:44,  3.95it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:44,  3.96it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:05<00:44,  3.93it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:43,  3.93it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:43,  3.98it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:42,  3.99it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:42,  3.99it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:42,  4.01it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  3.99it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:41,  3.99it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:41,  3.96it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.96it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.96it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:41,  3.96it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:40,  3.99it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:40,  3.98it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:39,  4.00it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:39,  3.99it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:39,  3.97it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:39,  3.95it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.93it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.94it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:39,  3.90it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:38,  3.94it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.92it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:38,  3.90it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:38,  3.91it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:38,  3.87it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:38,  3.89it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.93it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:37,  3.92it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.94it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.91it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:36,  3.92it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:36,  3.94it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:35,  3.94it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:35,  3.95it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:35,  3.94it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:35,  3.93it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:34,  3.96it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  3.98it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:34,  3.94it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:33,  3.95it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:33,  3.97it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.96it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:15<00:33,  3.94it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  3.97it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  3.97it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  4.00it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:16<00:31,  3.98it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  4.01it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  4.00it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:31,  3.99it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:17<00:31,  3.96it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  3.99it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.96it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  3.93it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:19<00:30,  3.95it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:29,  3.96it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.97it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.98it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:20<00:28,  3.97it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  3.97it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  3.95it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:28,  3.96it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:21<00:28,  3.96it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  3.95it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  3.98it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:27,  3.95it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:22<00:26,  3.98it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  3.96it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  3.98it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:26,  3.96it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:23<00:25,  3.97it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  3.98it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  3.99it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  3.95it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:24<00:24,  3.97it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.95it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.90it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.89it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:25<00:24,  3.87it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:24,  3.83it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:24,  3.85it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.85it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:26<00:23,  3.85it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:23,  3.90it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.92it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.94it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:27<00:22,  3.95it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.98it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  3.96it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:21,  3.94it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:28<00:21,  3.95it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  3.97it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.94it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:20,  3.98it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:29<00:19,  3.97it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.93it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.96it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.96it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:30<00:18,  3.96it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  3.95it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  3.98it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:18,  3.95it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:31<00:18,  3.94it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  3.94it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  3.96it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:17,  3.97it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:32<00:16,  3.95it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  3.98it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  3.96it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:16,  3.95it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:33<00:15,  3.95it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  3.93it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.97it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:33<00:15,  3.97it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:34<00:14,  3.96it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.99it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.97it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:34<00:14,  4.00it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:35<00:13,  4.00it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  4.00it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.98it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:35<00:13,  3.98it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:36<00:12,  3.97it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.98it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  4.01it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:36<00:12,  3.99it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:37<00:11,  4.00it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  4.02it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  4.03it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:37<00:10,  4.02it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:38<00:10,  4.01it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  4.01it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  4.02it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:38<00:09,  4.02it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:39<00:09,  4.02it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  4.00it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  4.01it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:39<00:08,  4.01it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:40<00:08,  4.00it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  4.02it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  4.02it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:40<00:07,  4.00it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:41<00:07,  3.99it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  3.99it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.97it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:41<00:07,  3.95it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:42<00:06,  3.93it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.92it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.88it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:42<00:06,  3.90it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:43<00:05,  3.92it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.94it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.93it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:44<00:05,  3.89it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:44<00:04,  3.92it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.90it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.94it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:45<00:04,  3.93it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:45<00:03,  3.95it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.93it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  3.89it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:46<00:03,  3.95it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:46<00:02,  3.92it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.93it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  3.91it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:47<00:02,  3.95it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:47<00:01,  3.97it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  3.94it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  3.98it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:48<00:01,  3.98it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:48<00:00,  4.00it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  4.00it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  4.00it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:49<00:00,  3.95it/s]

[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 902, 'tn': 353, 'fp': 59, 'fn': 238}, {'tp': 156, 'tn': 1365, 'fp': 4, 'fn': 27}, {'tp': 158, 'tn': 1087, 'fp': 273, 'fn': 34}]
Detailed accuracy after 3 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.8086340206185567
yes_no accuarcy: 0.9800257731958762
abstractive accuarcy: 0.8021907216494846
Overall accuarcy: 0.8977126288659794
Best accuarcy: 0.899645618556701



  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22852785885334015, logits - tensor([[-5.8459,  1.8384, -6.3109, -1.7298],
        [-5.9567, -2.0622, -4.1607,  2.1398],
        [-7.1895,  1.6392, -6.6809, -1.6067],
        [-5.9443,  1.1574, -5.8133, -1.1804],
        [-5.5456,  1.0246, -5.0029, -0.8818],
        [-6.1692,  2.3866, -6.3865, -1.7144],
        [-7.4927,  2.3965, -7.6149, -2.1975],
        [-5.0439, -2.8380,  1.2842, -1.6683]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:00<03:46,  1.27it/s]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0929800420999527, logits - tensor([[-7.3270, -1.6815, -4.1597,  1.1851],
        [-5.8343,  2.1566, -6.0041, -1.2871],
        [-7.2173,  2.0813, -6.2285, -2.2925],
        [-5.2077, -3.0335,  2.1693, -2.0671],
        [-6.3528,  1.3999, -6.1009, -1.4342],
        [-6.1977,  0.9075, -5.6271, -1.2985],
        [-4.5651, -3.3124,  1.5830, -1.8437],
        [-5.6941,  2.0493, -5.6132, -1.9369]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:01<03:42,  1.29it/s]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08777051419019699, logits - tensor([[-6.2338,  1.4851, -6.7309, -1.4817],
        [-6.7805,  0.9590, -6.1662, -1.5252],
        [-4.7274,  1.5658, -4.8685, -2.2043],
        [-6.2408,  1.4762, -6.2349, -1.5522],
        [-7.1108, -2.3453, -5.0855,  1.6671],
        [-7.0239,  1.7550, -6.5935, -1.5133],
        [-6.7128,  1.9595, -6.5030, -2.2953],
        [-5.5186,  1.6002, -5.9074, -2.1899]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:02<03:40,  1.30it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42781856656074524, logits - tensor([[-5.3328e+00,  1.8020e+00, -5.2935e+00, -1.1739e+00],
        [-4.3258e+00, -1.4994e+00,  2.8485e-03, -1.6820e+00],
        [-6.2274e+00,  1.7575e+00, -5.8610e+00, -2.1095e+00],
        [-6.6396e+00,  1.5635e+00, -6.6308e+00, -2.0747e+00],
        [-7.6793e+00,  1.3250e-01, -6.2202e+00,  2.3400e-01],
        [-5.9648e+00,  1.4671e+00, -5.9658e+00, -1.7996e+00],
        [-6.5467e+00,  1.6551e+00, -6.1562e+00, -2.0252e+00],
        [-7.0537e+00,  1.5015e+00, -6.3649e+00, -8.0428e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:03<03:38,  1.31it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5202329158782959, logits - tensor([[-7.0241e+00, -2.4200e-01, -5.6749e+00, -1.8482e-01],
        [-6.2209e+00,  1.2819e+00, -5.1315e+00, -9.3361e-01],
        [-6.4082e+00, -2.8293e+00,  1.9978e+00, -2.0508e+00],
        [-7.0087e+00,  1.1479e+00, -6.1406e+00, -9.8343e-01],
        [-7.2462e+00,  6.2626e-01, -5.7499e+00, -1.1150e+00],
        [-6.4972e+00,  2.0775e+00, -6.6415e+00, -2.5460e+00],
        [-6.8304e+00, -6.2986e-02, -4.8814e+00, -1.4954e-03],
        [-7.1550e+00, -2.3622e+00, -5.0666e+00,  2.5529e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:03<03:36,  1.31it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14039789140224457, logits - tensor([[-6.5441,  2.2261, -7.3230, -2.8442],
        [-5.8851,  1.9829, -5.8730, -2.6042],
        [-6.0847,  1.6984, -5.3824, -1.3845],
        [-5.8666,  0.6197, -5.4165, -1.8027],
        [-5.5953,  1.1536, -5.7116, -1.9187],
        [-5.8897,  0.0190, -4.4827, -0.6605],
        [-5.9269,  1.1066, -4.8197, -0.6987],
        [-6.4646,  2.3910, -6.2331, -1.9433]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:04<03:35,  1.32it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2026904821395874, logits - tensor([[-6.3118,  2.1365, -5.8096, -2.0652],
        [-6.2131,  1.3004, -4.9668, -1.2502],
        [-7.0236,  1.1495, -5.3151, -1.5566],
        [-4.4848, -2.7086,  1.6927, -1.9771],
        [-5.9951,  1.0188, -5.2044, -1.7018],
        [-6.3633, -1.6137, -4.7507, -0.2708],
        [-7.0120,  1.5608, -5.8374, -2.3572],
        [-6.5060,  1.3598, -5.5931, -1.7257]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:05<03:34,  1.31it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18879306316375732, logits - tensor([[-6.6387, -2.9718, -5.4197,  1.8712],
        [-6.2363,  1.2960, -5.5735, -1.5153],
        [-6.8638,  2.0231, -5.8844, -1.3898],
        [-6.7818,  0.3731, -5.2653, -0.6960],
        [-7.1259,  1.7599, -5.5390, -2.0129],
        [-6.3909,  1.0284, -6.4941, -1.4438],
        [-7.1678,  1.2826, -6.3444, -1.3451],
        [-6.2178,  1.6699, -5.4206, -1.6756]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 8/289 [00:06<03:33,  1.32it/s]

Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11313005536794662, logits - tensor([[-5.7384,  1.1600, -5.6386, -1.0422],
        [-8.6473, -2.3419, -5.7713,  1.8043],
        [-5.9732, -4.3471, -5.6499,  3.8409],
        [-5.6692,  1.2155, -5.3028, -1.4413],
        [-7.1070,  0.8412, -5.6359, -1.1379],
        [-4.7664,  0.4674, -3.8431, -0.5945],
        [-6.1675,  2.2952, -4.7641, -1.1948],
        [-6.2422,  1.6585, -6.0494, -2.0220]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 9/289 [00:06<03:32,  1.32it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18611419200897217, logits - tensor([[-6.1864,  1.4300, -6.2047, -1.6841],
        [-5.3943, -2.5908,  0.3759, -0.9093],
        [-6.4171,  2.1922, -5.7243, -2.0070],
        [-6.0899,  1.1033, -5.8681, -1.0945],
        [-6.1151, -0.7020, -5.3493, -0.1485],
        [-6.9305,  2.5741, -6.6356, -2.2849],
        [-5.8562,  0.4848, -4.9240, -1.0760],
        [-6.6282,  1.3196, -6.2488, -1.8163]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 10/289 [00:07<03:30,  1.32it/s]

Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2976617217063904, logits - tensor([[-6.3106,  1.5167, -5.4840, -1.3220],
        [-5.7336, -1.5176, -4.8289,  2.1581],
        [-5.2109,  0.7199, -4.5354, -1.1951],
        [-6.1993,  1.5349, -6.1425, -1.6665],
        [-7.5142,  2.1893, -7.1385, -2.5394],
        [-5.8111, -3.2064,  1.8051, -2.5644],
        [-6.7361,  1.8342, -5.6864, -1.5390],
        [-6.3575,  2.4663, -5.8464, -1.5830]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 11/289 [00:08<03:29,  1.32it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1888211965560913, logits - tensor([[-5.4589,  0.9746, -4.5151, -0.6234],
        [-5.5727, -2.9812,  1.4334, -2.0913],
        [-5.7390,  0.6787, -5.2995, -1.5033],
        [-6.3225,  2.0071, -5.4786, -2.0459],
        [-7.0519,  2.1188, -6.2369, -2.1658],
        [-6.2388,  2.0658, -5.8457, -1.9165],
        [-5.4499, -1.9853,  0.2564, -1.5368],
        [-6.5536,  1.1547, -5.5299, -2.1238]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 12/289 [00:09<03:29,  1.33it/s]

Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.058310989290475845, logits - tensor([[-6.7759, -3.0392, -5.2156,  2.7502],
        [-6.5728,  2.5061, -6.0546, -3.0754],
        [-5.8780,  2.0074, -5.3567, -1.8619],
        [-6.7206, -3.5601, -4.7003,  3.0538],
        [-6.7212,  1.3699, -6.5347, -2.3135],
        [-6.5066,  0.9881, -5.9019, -1.5038],
        [-7.1251,  2.7822, -6.5300, -2.8102],
        [-6.8770,  1.7981, -5.6036, -1.9153]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 13/289 [00:09<03:28,  1.33it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1570054143667221, logits - tensor([[-5.9231, -3.2898, -4.7160,  3.9392],
        [-7.0024,  1.7420, -5.6404, -0.6416],
        [-5.5038,  1.8187, -5.5583, -2.1088],
        [-5.5244, -3.5869,  1.9079, -2.3697],
        [-6.6161,  0.6609, -5.9363, -1.2787],
        [-6.7756,  2.3921, -5.4026, -2.6642],
        [-6.0563,  1.6031, -5.7648, -2.0244],
        [-5.7896,  1.9410, -5.4747, -1.6744]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:10<03:27,  1.33it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28159400820732117, logits - tensor([[-6.9604, -2.6569, -4.1352,  2.8577],
        [-5.5169,  1.4465, -6.2898, -1.0277],
        [-6.6952, -1.2206, -4.5922,  1.3865],
        [-5.7490,  2.2960, -6.1763, -3.1196],
        [-6.3468,  1.5591, -6.1392, -1.6257],
        [-6.8603,  0.1800, -5.7206, -0.1847],
        [-6.3388,  1.5811, -5.8940, -1.5022],
        [-5.8210,  1.1016, -4.6872, -1.5210]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▌         | 15/289 [00:11<03:27,  1.32it/s]

Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3715900778770447, logits - tensor([[-6.2089,  2.1884, -6.0293, -0.9474],
        [-6.2492,  0.9977, -4.9757, -0.9894],
        [-5.6102,  0.6549, -4.7299, -1.7114],
        [-7.0032,  1.7958, -6.0616, -1.8106],
        [-6.2903, -0.1423, -5.7046,  0.4490],
        [-5.8271, -2.9282,  1.7189, -2.6756],
        [-6.4330, -1.0282, -5.0101,  0.9923],
        [-5.0070, -2.2000,  0.6520, -1.5369]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 16/289 [00:12<03:27,  1.32it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19616129994392395, logits - tensor([[-5.6305,  1.5535, -4.8980, -1.8300],
        [-5.9981,  2.9677, -6.1250, -2.3472],
        [-6.8997, -4.1531, -5.0215,  3.2362],
        [-6.7169,  1.0228, -5.9010, -1.0920],
        [-6.6900,  2.3550, -5.4131, -1.7675],
        [-6.5439,  2.8964, -5.5953, -2.1753],
        [-5.9769,  2.6012, -6.3491, -2.2401],
        [-5.4707,  1.2375, -5.4201, -1.9428]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:12<03:26,  1.32it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1056390106678009, logits - tensor([[-6.1776,  0.1924, -3.7595, -1.3355],
        [-6.3997,  2.4474, -6.1484, -2.3423],
        [-5.3500,  1.4527, -4.0904, -1.1798],
        [-4.7620, -2.1735,  1.3871, -1.8317],
        [-6.6264,  2.1489, -5.7441, -2.8779],
        [-6.1282,  0.8651, -4.9352, -1.3945],
        [-6.5528,  1.5812, -6.3563, -1.8086],
        [-5.2052, -2.9820,  2.0016, -2.4992]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:24,  1.32it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14859411120414734, logits - tensor([[-5.4789, -3.1998,  1.4957, -2.0727],
        [-6.2240,  0.7401, -5.5187, -1.4589],
        [-5.8483,  1.1333, -5.3241, -1.2505],
        [-6.3358,  1.3820, -5.8300, -2.0427],
        [-6.1533,  2.0937, -6.4211, -2.4431],
        [-7.1487, -0.0947, -6.0126, -1.2900],
        [-7.4451, -3.6302, -5.5952,  3.4803],
        [-5.9590,  1.1851, -4.9545, -1.6732]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:24,  1.32it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21572613716125488, logits - tensor([[-6.6134,  2.4121, -6.4631, -2.6628],
        [-6.0018,  1.6079, -5.4131, -2.4339],
        [-6.1044,  1.7189, -5.7035, -2.0996],
        [-5.9246,  2.5840, -6.0145, -2.0642],
        [-5.5543, -2.3913, -3.9228,  2.8271],
        [-7.4224,  1.2538, -5.8047, -1.9864],
        [-7.1607, -4.3106, -5.7079,  3.5319],
        [-6.1354,  2.7431, -4.0234, -2.3452]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:22,  1.33it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1872154325246811, logits - tensor([[-4.9274, -2.6379,  2.2388, -2.2033],
        [-4.7202, -2.5446,  0.5774, -1.9130],
        [-6.6632,  2.0423, -5.9053, -2.3056],
        [-6.6131,  2.1815, -5.7223, -1.6231],
        [-5.2500, -2.0219,  0.4997, -1.3416],
        [-6.8933, -2.3068, -5.3168,  2.7696],
        [-6.5735,  1.0699, -5.2890, -1.7897],
        [-5.3651, -3.2752,  2.2062, -2.6397]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:15<03:21,  1.33it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2619002163410187, logits - tensor([[-6.5192,  1.5195, -5.9777, -1.2928],
        [-6.9540, -3.7787,  2.1486, -2.5255],
        [-6.1656,  1.2945, -5.7198, -2.1296],
        [-6.6603,  1.4974, -6.4230, -1.7090],
        [-6.2382,  1.8265, -5.2782, -1.3199],
        [-6.5427,  2.0568, -5.4998, -2.1570],
        [-6.1574,  2.2454, -5.7442, -2.4263],
        [-6.4553,  2.2452, -5.6760, -1.8767]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:20,  1.33it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29651299118995667, logits - tensor([[-5.2204, -2.2022,  2.0806, -2.3558],
        [-6.0167,  2.1109, -5.9855, -1.0839],
        [-5.6616,  1.3531, -4.6913, -1.5461],
        [-6.2055, -0.8575, -4.2667,  0.6712],
        [-7.0080,  2.2064, -5.9394, -1.9530],
        [-5.7008,  1.4924, -4.7289, -1.8732],
        [-6.9566,  0.8731, -5.3479, -1.3543],
        [-6.3614, -3.1569,  1.1845, -2.4460]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:20,  1.33it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13238094747066498, logits - tensor([[-5.7221,  1.1473, -4.7685, -1.3022],
        [-6.3800,  2.7713, -6.5058, -2.8755],
        [-5.5217,  1.3366, -5.1530, -2.1231],
        [-4.6026,  1.5564, -4.4597, -1.6057],
        [-6.5481,  2.4607, -6.0033, -2.0489],
        [-7.1418,  1.0147, -6.0643, -1.5864],
        [-7.0867,  0.4951, -5.5095, -0.4664],
        [-5.7225,  2.2455, -6.5732, -2.2799]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:20,  1.32it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1344485878944397, logits - tensor([[-6.3101,  2.1282, -6.2896, -2.2866],
        [-5.7111,  1.6744, -5.7238, -2.0323],
        [-7.6736, -1.4527, -5.8364,  0.7875],
        [-5.7925,  2.0270, -4.9264, -1.7711],
        [-6.0845,  3.0354, -5.1204, -2.3250],
        [-5.8705,  2.5344, -5.1120, -2.6255],
        [-5.2717,  0.9232, -4.6386, -0.1915],
        [-5.7787, -3.1120,  0.9277, -1.7590]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:18<03:19,  1.33it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3054802417755127, logits - tensor([[-7.5826,  2.4349, -7.0593, -2.7174],
        [-7.1722,  2.5913, -6.2231, -1.6865],
        [-5.8109,  1.2128, -5.2268, -1.5848],
        [-6.7567,  2.1006, -6.1383, -2.5199],
        [-6.0995,  2.2547, -5.8766, -1.6280],
        [-5.8130, -2.7002,  1.0360, -1.9468],
        [-6.2129,  2.5758, -6.5505, -2.1801],
        [-6.9945,  1.4327, -5.9284, -2.0503]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:18,  1.32it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.061892807483673096, logits - tensor([[-6.2362,  2.1682, -5.3433, -1.6171],
        [-5.7589,  1.9523, -5.7592, -2.2776],
        [-6.0633,  1.7338, -5.7596, -1.4248],
        [-7.5928, -2.0576, -6.3897,  1.9288],
        [-5.2337,  1.8071, -6.6199, -2.5038],
        [-6.1398,  2.9797, -6.4519, -2.5058],
        [-6.0835,  2.3330, -5.9179, -2.9869],
        [-5.1111,  1.8712, -5.2116, -1.9283]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 27/289 [00:20<03:16,  1.33it/s]

Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1572631448507309, logits - tensor([[-6.1675,  1.8442, -5.4986, -2.2900],
        [-7.1352,  0.5791, -5.7828, -1.1652],
        [-5.9703, -0.8751, -4.8435, -0.0203],
        [-5.8183, -3.1196,  2.1934, -2.7495],
        [-6.3484,  2.9960, -6.4538, -2.8132],
        [-6.2514,  1.6566, -6.0163, -2.2241],
        [-5.0541, -2.1635,  1.9949, -2.1771],
        [-5.9212, -3.2203,  1.8468, -2.1342]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|▉         | 28/289 [00:21<03:15,  1.33it/s]

Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2522571086883545, logits - tensor([[-5.8119, -0.0389, -5.1995,  0.3398],
        [-6.3522, -3.1392,  1.6337, -1.9964],
        [-4.6300,  2.2509, -5.0238, -2.0440],
        [-5.6621, -2.1658, -4.8177,  1.8658],
        [-6.9447, -0.2466, -5.3262, -1.2776],
        [-6.4266, -2.2970, -5.9317,  1.7272],
        [-5.4690, -2.9875,  1.7908, -2.2180],
        [-7.4143,  2.1181, -5.4557, -1.9845]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 29/289 [00:21<03:15,  1.33it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21550998091697693, logits - tensor([[-6.4757, -2.7189, -5.1340,  2.1880],
        [-5.2362,  1.3189, -4.7158, -0.8010],
        [-6.0844, -0.4343, -1.8444, -1.7552],
        [-5.5531, -3.7321,  1.6682, -2.7073],
        [-6.6738,  1.4006, -5.5685, -0.3805],
        [-4.9553,  2.3272, -4.6498, -2.5553],
        [-6.8615,  1.5717, -6.1001, -2.0610],
        [-7.4879,  2.4590, -7.0854, -2.5878]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:15,  1.33it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28626832365989685, logits - tensor([[-6.1297, -2.2266, -4.6431,  2.6368],
        [-5.4394,  2.2578, -6.2458, -1.9185],
        [-7.6888,  1.8727, -6.4887, -2.6310],
        [-6.6640,  2.3651, -5.6517, -1.9404],
        [-5.0031, -2.4937,  0.8445, -2.2224],
        [-7.0725,  0.9769, -4.9076, -0.8656],
        [-8.3583, -1.3186, -5.7945,  1.3014],
        [-7.4078,  2.1777, -6.9023, -1.9776]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 31/289 [00:23<03:13,  1.33it/s]

Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0721442773938179, logits - tensor([[-7.1619,  2.9318, -5.8787, -2.6213],
        [-6.3027,  1.9472, -5.9459, -1.7852],
        [-7.6853,  2.1209, -6.1587, -1.7027],
        [-6.1127, -1.7678, -5.2481,  2.5305],
        [-5.3023,  1.7989, -5.3998, -1.7063],
        [-6.9083,  2.2815, -6.4212, -2.5738],
        [-7.5676, -0.7839, -5.5664,  0.9777],
        [-6.8454,  2.7041, -5.9314, -2.3159]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 32/289 [00:24<03:14,  1.32it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07450907677412033, logits - tensor([[-5.0364, -3.2248,  1.3774, -2.4778],
        [-6.8664,  2.2668, -5.8711, -1.5829],
        [-7.1069,  2.0737, -6.1170, -1.8375],
        [-6.2669,  1.9738, -5.4983, -1.6804],
        [-6.2264,  2.5744, -5.9826, -2.6073],
        [-7.4851,  0.9007, -6.3853, -1.7026],
        [-7.0583,  2.3903, -6.0071, -1.7689],
        [-7.3606,  2.1456, -6.9931, -1.8379]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:24<03:15,  1.31it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26875004172325134, logits - tensor([[-7.2452, -3.0515, -4.7337,  1.9267],
        [-5.8297, -2.7791,  1.4517, -1.5116],
        [-7.2308,  2.3239, -6.5446, -2.6955],
        [-5.4451, -2.5859,  1.7536, -1.7194],
        [-6.8512,  2.3975, -5.6441, -1.7446],
        [-7.2992,  2.1186, -6.4555, -2.2142],
        [-6.6390, -0.4926, -1.9204, -0.8619],
        [-6.8611, -0.8778, -4.4035,  0.7981]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:13,  1.31it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 12%|█▏        | 35/289 [00:26<03:13,  1.31it/s]

loss - 0.3018626570701599, logits - tensor([[-7.4195,  2.1700, -6.3938, -2.3681],
        [-6.6374,  2.2992, -5.9893, -1.7307],
        [-6.1510,  2.2483, -6.4181, -2.4571],
        [-5.2264, -1.8330,  0.2174, -1.3315],
        [-5.3979,  0.3135, -4.7671, -1.2448],
        [-7.4711,  3.3083, -7.1390, -2.8369],
        [-5.8876, -3.6065,  2.1154, -2.2503],
        [-7.7117, -1.0338, -4.9140,  0.6424]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17851050198078156, logits - tensor([[-5.8618, -3.1652,  2.5859, -2.0839],
        [-6.7447,  2.4174, -5.5185, -1.6360],
        [-7.4383, -0.0835, -4.6442, -0.0669],
        [-6.3841, -1.7052, -4.8832,  1.9331],
        [-5.6527,  1.6484, -5.2468, -1.5245],
        [-5.4188, -3.5912,  2.5473, -2.3158],
        [-7.0127,  2.5232, -5.8480, -1.8930],
        [-6.4560,  0.6477, -4.9462, -1.4

 12%|█▏        | 36/289 [00:27<03:11,  1.32it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07183084636926651, logits - tensor([[-6.6825,  1.5477, -5.9977, -2.0700],
        [-5.5572, -2.4971, -5.8329,  3.0576],
        [-6.2062,  2.7566, -6.3682, -2.5367],
        [-4.8174, -2.4097,  0.9026, -2.5638],
        [-5.8496,  1.5674, -5.3480, -2.0069],
        [-5.4509, -3.1499,  2.1095, -2.0748],
        [-5.7182,  2.7113, -6.7637, -2.6775],
        [-5.9941, -3.1583,  0.8818, -2.5785]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:28<03:11,  1.32it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 13%|█▎        | 38/289 [00:28<03:10,  1.32it/s]

loss - 0.1457362174987793, logits - tensor([[-4.7180, -1.7091,  0.9543, -1.2090],
        [-7.2590,  1.6106, -5.7446, -1.6594],
        [-6.7806, -1.9124, -6.3721,  1.3805],
        [-5.7296, -3.0241,  1.5167, -1.9749],
        [-5.1820, -2.3562,  0.7635, -1.3582],
        [-5.1009, -2.8440,  2.1003, -2.8093],
        [-6.2671, -0.1589, -4.5327, -0.2959],
        [-6.5192,  1.2779, -5.2460, -2.0690]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1666957437992096, logits - tensor([[-7.2418, -0.6106, -5.1687,  0.7867],
        [-7.4481,  1.6226, -6.1748, -0.3626],
        [-6.1502,  1.5226, -4.5264, -0.7680],
        [-5.2323, -3.1232,  1.1426, -2.0314],
        [-6.5580,  2.4943, -6.2067, -1.6927],
        [-5.4164,  2.3803, -5.6161, -1.5765],
        [-7.2762, -2.5825, -4.2949,  2.5298],
        [-6.3975,  1.0348, -5.5336, -0.83

 13%|█▎        | 39/289 [00:29<03:08,  1.32it/s]

Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5034265518188477, logits - tensor([[-6.7749,  2.6457, -6.3307, -1.5551],
        [-6.3705,  1.7131, -5.8119, -1.4127],
        [-5.8491,  2.6627, -5.4350, -2.2347],
        [-6.5185,  0.3349, -5.1197, -1.1023],
        [-5.3583, -3.6705,  2.2493, -2.1753],
        [-5.7911,  2.7566, -5.4784, -1.8201],
        [-5.4743, -2.3871,  1.3380, -1.5932],
        [-7.3639, -2.1955, -5.3820,  2.0242]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 40/289 [00:30<03:08,  1.32it/s]

Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12204695492982864, logits - tensor([[-6.4180, -3.4434, -4.6838,  3.7277],
        [-5.7933,  2.4069, -6.1377, -2.3161],
        [-5.6364,  0.4861, -5.8312, -0.8099],
        [-7.1049,  2.1972, -6.0660, -1.4201],
        [-6.4188,  2.1257, -6.1725, -1.9148],
        [-7.3433,  2.0804, -5.7929, -1.1804],
        [-7.9738,  2.0138, -6.6594, -2.4439],
        [-6.8328,  0.0557, -5.2203,  0.4603]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 41/289 [00:31<03:07,  1.32it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1966760754585266, logits - tensor([[-7.7729,  2.6985, -5.7969, -2.8813],
        [-6.3327,  2.0601, -5.2029, -1.8648],
        [-6.1667,  2.0742, -6.0198, -2.0280],
        [-6.8175,  1.3735, -5.9728, -1.6434],
        [-7.1201,  2.2868, -6.7370, -2.1174],
        [-7.2264,  2.5924, -5.7659, -2.2496],
        [-5.6607,  1.8240, -5.8737, -2.2875],
        [-6.9044,  2.3370, -6.4389, -2.1871]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:31<03:06,  1.32it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.44259878993034363, logits - tensor([[-7.1271,  1.2035, -6.0821, -0.8507],
        [-6.2275,  0.5390, -4.5968, -0.0169],
        [-5.7785,  2.6581, -5.5448, -1.9583],
        [-6.4461,  3.3961, -6.6976, -3.2558],
        [-5.6374,  2.4993, -5.4900, -2.7779],
        [-6.2578, -3.6052, -4.3968,  2.8645],
        [-6.0657,  1.3886, -5.1905, -1.1561],
        [-6.4066, -2.9233, -4.9518,  3.5398]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:32<03:06,  1.32it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0640001967549324, logits - tensor([[-4.5445, -2.8602,  2.4633, -2.1444],
        [-6.4862,  1.5310, -4.9350, -1.3934],
        [-7.6294,  2.7371, -6.9576, -3.1467],
        [-6.8486,  2.1137, -6.1887, -1.7815],
        [-6.7289,  2.8165, -6.6572, -2.5667],
        [-7.8018,  2.1162, -6.5234, -2.0298],
        [-6.2886,  1.6371, -5.8876, -2.2360],
        [-6.1127,  1.6337, -5.3749, -1.8855]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:33<03:05,  1.32it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19420045614242554, logits - tensor([[-5.7641, -3.7915, -4.7091,  3.7174],
        [-6.8289, -3.4991, -5.2466,  3.3821],
        [-6.9118, -3.4910,  2.1545, -2.7449],
        [-6.8490,  0.9567, -6.3008, -1.1673],
        [-6.6242,  3.1598, -6.0268, -2.0181],
        [-6.6037,  2.7273, -5.5605, -1.7745],
        [-6.5333, -0.0300, -5.9457,  1.3922],
        [-6.6377, -1.3879, -5.8536,  2.2720]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:34<03:04,  1.32it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0531916543841362, logits - tensor([[-6.5586,  2.0508, -5.8947, -1.7366],
        [-6.6843,  2.4822, -6.5167, -2.7649],
        [-6.8352,  2.7024, -5.5150, -3.0402],
        [-5.2281,  1.5166, -5.1579, -2.0264],
        [-6.6952,  3.1475, -6.6995, -2.2876],
        [-6.2888,  2.5220, -6.0471, -2.2517],
        [-6.6350,  1.8576, -5.0017, -1.3066],
        [-6.8765,  3.2730, -7.0274, -2.6817]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 46/289 [00:34<03:03,  1.32it/s]

Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20928159356117249, logits - tensor([[-6.2468,  2.4639, -5.7650, -2.1304],
        [-5.4019, -2.1808,  1.2511, -1.4580],
        [-5.7124, -2.8414,  2.2160, -2.4236],
        [-6.4768,  2.5040, -5.9933, -1.7689],
        [-7.0078,  2.2205, -7.3751, -2.8074],
        [-7.5170,  1.6588, -5.7521, -1.6295],
        [-6.4952, -1.7926, -4.7645,  2.5396],
        [-6.0806, -2.5108, -5.2295,  1.9471]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▋        | 47/289 [00:35<03:02,  1.32it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27000555396080017, logits - tensor([[-5.0616, -2.9511,  1.9128, -2.1532],
        [-6.8199,  0.8558, -4.7495, -1.0893],
        [-6.7385,  2.6367, -5.8524, -1.0095],
        [-6.6786, -0.8422, -5.2689,  0.9212],
        [-7.2706,  2.2717, -5.6507, -2.3766],
        [-6.2273, -3.6705,  2.7677, -2.3327],
        [-6.1479,  1.5838, -5.6169, -2.2075],
        [-6.8735,  1.7345, -5.4474, -1.6851]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:36<03:02,  1.32it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30952826142311096, logits - tensor([[-7.0505,  2.0865, -6.5394, -2.6139],
        [-7.5703,  2.0261, -6.8928, -1.9157],
        [-6.2554,  2.8046, -6.5796, -2.3115],
        [-6.8868,  2.1781, -6.3222, -1.5733],
        [-6.0663,  1.7004, -6.5931, -1.7848],
        [-5.9917, -2.8282,  1.9461, -1.7256],
        [-5.3522,  1.2670, -4.7488, -1.7391],
        [-6.6224,  2.5687, -6.0521, -2.1246]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:37<03:02,  1.31it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19253093004226685, logits - tensor([[-7.6083,  2.2120, -6.9801, -1.6862],
        [-6.0383,  2.6097, -5.3905, -2.3723],
        [-6.4380, -1.8301, -5.3383,  1.3477],
        [-6.9972,  1.5501, -5.0942, -1.7529],
        [-6.8534, -1.3458, -4.8996,  0.9788],
        [-6.0614, -3.1748,  1.1191, -2.0684],
        [-5.3488,  1.7754, -4.6392, -1.3137],
        [-6.8431,  2.0169, -6.4226, -1.9754]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 50/289 [00:37<03:01,  1.31it/s]

Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2043006718158722, logits - tensor([[-6.6828,  1.8041, -6.0106, -1.7734],
        [-6.7915, -0.7414, -5.2815,  1.7716],
        [-6.7388,  2.2879, -7.5427, -2.1915],
        [-6.9265,  2.8843, -7.0837, -2.7144],
        [-6.9989,  1.1358, -5.4782, -1.0403],
        [-7.3166, -0.6908, -5.1924,  0.6965],
        [-6.6125,  2.4985, -6.6702, -2.5024],
        [-6.5325,  1.9519, -5.4319, -2.6579]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 51/289 [00:38<03:00,  1.32it/s]

Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3026377260684967, logits - tensor([[-6.5407, -1.2381, -5.2848,  1.2971],
        [-6.3197, -2.5980, -4.0725,  1.8588],
        [-4.8759,  0.9585, -4.5204, -1.6184],
        [-5.1047, -3.4336,  2.1473, -2.3963],
        [-4.7821, -2.5915,  1.4451, -1.8761],
        [-5.8649,  2.5047, -5.3340, -2.2181],
        [-6.2549,  2.3424, -5.2257, -2.1960],
        [-6.1139, -3.3352,  1.9417, -2.5977]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 52/289 [00:39<03:00,  1.32it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08681870251893997, logits - tensor([[-5.3410, -0.4596, -4.5490,  0.6299],
        [-7.7264,  2.5607, -6.1315, -2.5669],
        [-6.5146,  2.3403, -6.2385, -2.4826],
        [-6.1682,  1.7140, -5.4641, -2.4687],
        [-7.7363,  2.6446, -7.1573, -3.1603],
        [-7.4412,  0.7910, -4.8493, -0.8869],
        [-6.5144,  2.2430, -6.5101, -1.6114],
        [-6.7128,  2.8026, -6.7049, -2.4756]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:40<02:59,  1.31it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17213425040245056, logits - tensor([[-6.5428,  2.6332, -6.3080, -2.6611],
        [-7.0912,  0.2155, -5.0708, -0.5902],
        [-6.1722,  1.3751, -5.5946, -1.5207],
        [-6.5563,  1.9374, -6.5963, -2.5113],
        [-7.1158,  1.8719, -6.5292, -2.0904],
        [-7.3114, -0.5756, -5.4978,  0.4584],
        [-5.9643,  2.1623, -5.7340, -1.8397],
        [-6.4252,  1.4001, -4.4582, -1.8475]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:40<02:59,  1.31it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1278381645679474, logits - tensor([[-6.3213e+00,  2.2194e+00, -6.6030e+00, -2.8721e+00],
        [-6.1185e+00, -3.5882e+00,  1.7927e+00, -2.2962e+00],
        [-6.1385e+00,  1.9447e+00, -5.2603e+00, -3.8086e-01],
        [-5.2281e+00, -2.3933e+00,  2.5753e+00, -2.4515e+00],
        [-6.6342e+00,  1.9698e+00, -5.9576e+00, -2.3420e+00],
        [-5.4860e+00,  5.3536e-03, -2.8058e+00, -6.2452e-01],
        [-6.3133e+00,  1.8654e+00, -4.8556e+00, -1.1786e+00],
        [-6.4506e+00,  2.0785e+00, -5.7929e+00, -1.7726e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:41<02:58,  1.31it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06467219442129135, logits - tensor([[-6.7484,  2.1590, -7.3717, -1.7898],
        [-6.9257,  1.2807, -5.5766, -2.4765],
        [-5.2327, -3.7197,  2.2276, -2.8076],
        [-5.2833, -3.7368,  2.0916, -2.1920],
        [-4.9810, -2.4489,  2.5179, -2.7453],
        [-7.2478,  1.6618, -6.6024, -1.7658],
        [-6.0927,  2.4628, -5.8576, -2.3782],
        [-7.0413,  1.7861, -5.6579, -1.9410]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:42<02:57,  1.32it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26399314403533936, logits - tensor([[-7.0587,  2.0276, -5.7273, -1.1945],
        [-6.3020,  1.6162, -6.6361, -2.0000],
        [-6.5444, -2.1305, -4.5925,  2.4207],
        [-5.3048, -2.9898,  1.8682, -2.4200],
        [-6.5327,  1.6195, -5.8326, -1.6757],
        [-6.3710,  1.9660, -5.9560, -2.8371],
        [-5.6904, -0.1313, -3.2813, -0.9851],
        [-6.9738,  0.0702, -4.5558, -0.1463]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:43<02:56,  1.31it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1576804369688034, logits - tensor([[-5.1824,  2.1356, -5.4049, -2.1482],
        [-5.9642,  1.0795, -4.5030, -1.1340],
        [-6.4729,  0.9693, -5.1304, -1.2559],
        [-6.9676,  1.1890, -5.2907, -1.1722],
        [-6.1088,  2.4295, -5.9492, -2.4382],
        [-5.4634, -4.3813,  3.0437, -2.2587],
        [-5.4712, -3.5384,  1.6475, -2.1336],
        [-6.1036,  2.1953, -5.6335, -2.0067]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 58/289 [00:43<02:56,  1.31it/s]

Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5503106117248535, logits - tensor([[-5.0278, -3.2031,  1.9581, -1.8373],
        [-5.2092, -2.3621,  2.0776, -2.2157],
        [-5.8332,  1.6187, -5.7207, -1.5501],
        [-6.6516,  1.8510, -5.8435, -0.7512],
        [-7.0231, -1.8191, -4.1235,  1.7966],
        [-7.0195,  2.6089, -6.8802, -2.2129],
        [-6.6066,  2.5003, -5.3695, -2.5600],
        [-6.2093,  1.0325, -5.4664, -1.5237]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 59/289 [00:44<02:54,  1.32it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10163947194814682, logits - tensor([[-6.1499,  2.1674, -5.9353, -1.2716],
        [-6.6117, -2.7950, -6.1159,  2.7474],
        [-6.0511,  1.3051, -4.6344, -1.5601],
        [-5.7720,  2.4086, -5.2155, -1.6358],
        [-7.3428,  0.6104, -4.8628, -0.7728],
        [-5.6570,  3.0041, -5.5681, -2.2581],
        [-6.5314,  1.2808, -4.2437, -0.3854],
        [-6.5153,  1.4951, -6.1631, -2.3707]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:45<02:53,  1.32it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32735902070999146, logits - tensor([[-7.2765, -1.4189, -5.3716,  0.2139],
        [-6.5476, -2.9031, -5.0739,  3.2945],
        [-6.2027, -1.4160, -3.0613,  0.3771],
        [-6.1918,  2.2111, -6.0010, -1.8412],
        [-5.2896, -3.3306,  2.3938, -2.6835],
        [-6.7459,  1.9818, -5.9735, -1.9279],
        [-5.7192,  0.7266, -4.2092, -0.2983],
        [-6.6970,  1.2049, -4.9238, -1.2444]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:46<02:52,  1.32it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25087517499923706, logits - tensor([[-5.9677, -3.7636,  2.6786, -2.1263],
        [-6.7460,  2.3764, -5.9297, -1.9976],
        [-7.2245,  1.3843, -5.6388, -1.8778],
        [-6.1558, -1.8895, -4.2976,  2.1327],
        [-6.7695, -1.4179, -3.8654,  1.0937],
        [-7.3550, -0.3655, -4.8667, -0.0156],
        [-5.4365,  0.1459, -3.4600, -0.7229],
        [-7.1102,  0.6799, -6.0517, -0.8568]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██▏       | 62/289 [00:46<02:52,  1.32it/s]

Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30293434858322144, logits - tensor([[-7.0244e+00, -1.6000e+00, -4.7782e+00,  1.7782e+00],
        [-5.0416e+00, -3.4374e+00,  2.5161e+00, -2.5470e+00],
        [-5.1148e+00, -3.1805e+00, -4.8271e+00,  3.2031e+00],
        [-7.9649e+00,  2.7147e-03, -5.4867e+00, -4.8400e-01],
        [-6.8227e+00, -1.7086e-01, -3.7716e+00, -1.0525e+00],
        [-7.0966e+00, -2.5616e-01, -4.8377e+00, -1.0917e+00],
        [-7.8285e+00, -3.5424e+00, -4.8864e+00,  2.8533e+00],
        [-6.9376e+00,  1.5160e+00, -6.4168e+00, -1.0785e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 63/289 [00:47<02:52,  1.31it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1252441555261612, logits - tensor([[-7.4759,  0.6229, -5.5781, -0.8847],
        [-6.3926,  2.5047, -5.0748, -1.9359],
        [-5.4547, -2.8615,  2.5113, -2.2474],
        [-6.6015,  0.2178, -4.1027,  0.1280],
        [-6.8679, -4.1845, -4.6064,  3.4326],
        [-5.8932,  0.7278, -4.8939, -0.9692],
        [-6.8442,  1.6298, -5.4869, -1.7142],
        [-6.6600,  2.5733, -6.4539, -1.5537]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:48<02:51,  1.31it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07369166612625122, logits - tensor([[-6.4909,  1.8590, -6.6369, -1.4336],
        [-5.7829, -3.4285,  2.3172, -2.4556],
        [-6.9253,  1.5575, -6.4318, -1.9815],
        [-7.1838, -5.0889, -5.0408,  3.6946],
        [-5.6629, -2.5200, -4.2029,  2.2906],
        [-6.4320,  2.0945, -5.7645, -2.8227],
        [-7.3827,  0.3693, -5.0687, -0.8690],
        [-6.7891,  2.7629, -6.0275, -2.1856]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:49<02:50,  1.32it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2411431074142456, logits - tensor([[-5.8961, -2.2587, -4.5679,  3.1876],
        [-7.1411,  0.4406, -5.0120, -0.0916],
        [-6.5357, -1.4330, -4.4945,  1.4014],
        [-7.5681,  1.7689, -6.2301, -1.6430],
        [-5.1836, -2.8079,  2.2014, -2.6627],
        [-6.7441,  2.1404, -5.6865, -2.7288],
        [-7.8937, -2.2835, -4.3941,  1.6648],
        [-5.3748, -3.3927,  1.9457, -2.1719]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 66/289 [00:50<02:49,  1.32it/s]

Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3080257773399353, logits - tensor([[-7.3387,  1.2558, -6.0267, -1.0095],
        [-7.7825,  1.1387, -5.4507, -1.5262],
        [-5.1572, -3.6447,  2.5974, -2.2612],
        [-7.2140,  2.1842, -6.5143, -1.7825],
        [-6.2632, -0.7450, -4.3901,  0.9861],
        [-7.6344,  0.2698, -5.4908,  0.0985],
        [-7.4986,  1.7171, -6.2656, -1.2303],
        [-6.4187,  1.6514, -5.2868, -2.3307]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 67/289 [00:50<02:48,  1.32it/s]

Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40627986192703247, logits - tensor([[-6.5191, -1.4262, -4.8364,  1.8867],
        [-6.0703,  2.3135, -5.7637, -2.5092],
        [-6.9537,  0.2290, -5.5619, -0.4616],
        [-5.2790, -3.8746,  2.1185, -2.9224],
        [-5.4622,  3.0286, -5.9188, -3.3657],
        [-5.2520, -3.3563,  2.7572, -2.0343],
        [-5.8530,  1.8371, -5.0558, -1.4484],
        [-7.3622,  2.1498, -6.7655, -2.3611]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▎       | 68/289 [00:51<02:47,  1.32it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1254235953092575, logits - tensor([[-7.0932, -2.3961, -4.5878,  2.6679],
        [-7.5921,  0.0559, -4.7524,  0.3286],
        [-4.7599, -3.3116,  2.2841, -2.7943],
        [-6.7710,  2.2094, -5.3826, -2.2332],
        [-6.6141,  1.1924, -5.5543, -1.6248],
        [-7.0966, -0.2558, -3.9280, -1.3283],
        [-6.7852,  2.2268, -6.5974, -1.8862],
        [-5.4013,  2.9223, -6.0456, -2.9627]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:52<02:46,  1.32it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14104340970516205, logits - tensor([[-7.1720,  2.1273, -5.6533, -1.0870],
        [-6.4378,  1.2863, -5.5636, -1.9480],
        [-6.4706,  0.6347, -3.9260, -0.4459],
        [-5.4172, -2.9420, -4.2541,  2.2987],
        [-5.0152, -3.8631,  2.3441, -1.8390],
        [-4.8349, -3.4561,  2.3394, -2.4619],
        [-6.5918, -0.0161, -4.7452, -0.8604],
        [-7.2474,  2.2786, -6.8056, -1.9795]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:53<02:45,  1.32it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2738575339317322, logits - tensor([[-6.5180,  0.4360, -4.7337, -0.3279],
        [-5.9680, -1.5695, -4.9333,  1.7679],
        [-5.9739, -3.5938,  1.9778, -2.0577],
        [-7.3026,  1.2207, -6.1420, -1.2194],
        [-5.8723, -1.9499, -4.1362,  1.2990],
        [-6.3717,  0.4647, -5.7581, -0.6253],
        [-6.1611,  0.9089, -5.8721, -1.0779],
        [-5.7952,  1.2789, -5.4342, -0.9861]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:53<02:44,  1.33it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3574146628379822, logits - tensor([[-6.7210,  1.1091, -5.1645, -0.8050],
        [-5.8802, -3.2396,  2.2122, -1.5481],
        [-5.6242,  1.7905, -4.8459, -1.1690],
        [-5.7598,  0.6204, -4.5535, -0.7694],
        [-6.2629,  2.0357, -4.7813, -1.7794],
        [-6.3899,  0.7485, -5.0874, -0.6834],
        [-5.8438,  1.0672, -5.5638, -0.9352],
        [-6.7680, -3.0257, -5.6275,  3.4087]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:54<02:44,  1.32it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09965457022190094, logits - tensor([[-6.4617,  1.0373, -5.5694, -2.1408],
        [-5.7812,  2.4046, -5.9171, -2.1940],
        [-6.1142, -3.5241,  2.3823, -2.8826],
        [-6.0020,  0.6998, -4.4387,  0.9138],
        [-6.3772,  2.2257, -6.4868, -2.3875],
        [-8.0751, -3.6456, -5.8549,  3.7566],
        [-7.1365,  2.2625, -7.7252, -3.0250],
        [-5.6359, -3.0671,  1.9102, -1.8345]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:55<02:43,  1.32it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 26%|██▌       | 74/289 [00:56<02:43,  1.32it/s]

loss - 0.09161368012428284, logits - tensor([[-6.8851,  1.2770, -5.9416, -1.0439],
        [-5.4323, -3.4178,  2.1356, -3.3426],
        [-7.1896,  2.5532, -6.8942, -2.8350],
        [-6.2157, -2.8501, -5.1401,  2.5021],
        [-6.8507,  1.8567, -5.9122, -1.6035],
        [-7.1879,  0.2334, -4.6109,  0.9498],
        [-5.2544, -3.3371,  2.6543, -2.6600],
        [-7.0440,  1.7485, -6.7738, -2.2944]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06370247155427933, logits - tensor([[-6.4616, -1.3483, -4.3129,  1.8242],
        [-4.8492, -3.0769,  2.0999, -1.9924],
        [-6.6297, -2.7831, -5.7177,  3.4050],
        [-6.5055,  1.1043, -6.1391, -2.4875],
        [-7.0222,  2.9384, -6.4524, -2.3962],
        [-6.4741,  2.7113, -6.2219, -2.7846],
        [-7.2792,  1.6837, -6.6672, -2.3775],
        [-6.0579,  1.7131, -5.5130, -1.

 26%|██▌       | 75/289 [00:56<02:43,  1.31it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1694030612707138, logits - tensor([[-5.4053, -3.5138,  2.3468, -2.9119],
        [-6.7694, -3.8037, -5.7899,  4.0995],
        [-6.6460,  2.2546, -6.5641, -1.1614],
        [-5.5062,  2.3246, -5.5264, -2.0823],
        [-7.0804,  1.0821, -6.1903, -1.0643],
        [-6.3942,  1.9213, -6.6924, -2.3417],
        [-6.1891,  2.1309, -5.7963, -2.2317],
        [-6.3539, -2.3638, -5.1578,  2.9435]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:57<02:42,  1.31it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2636846899986267, logits - tensor([[-7.0194, -0.6544, -4.6017,  1.5300],
        [-6.3134,  0.7222, -4.8418, -0.5292],
        [-7.3518, -2.2234, -5.5835,  1.3457],
        [-6.0862, -3.0451, -4.9484,  2.5891],
        [-4.8152, -2.5449,  2.4679, -1.5151],
        [-5.8793, -4.3196,  2.2897, -2.1003],
        [-6.1651, -3.7332,  2.6517, -2.7959],
        [-6.6583, -3.5812, -5.3310,  2.8325]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 77/289 [00:58<02:41,  1.32it/s]

Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22621527314186096, logits - tensor([[-6.2097,  1.9014, -5.9147, -1.4690],
        [-7.7285,  2.8822, -6.6526, -2.6279],
        [-5.5504,  2.3135, -7.1360, -2.8773],
        [-6.5507,  2.8508, -6.0846, -2.4924],
        [-6.3817,  2.1637, -5.8890, -2.5647],
        [-5.6246, -2.9544,  2.5700, -2.0000],
        [-6.3232,  3.1460, -6.9517, -1.8489],
        [-5.4381, -3.5005,  1.9717, -1.7463]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 78/289 [00:59<02:40,  1.32it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19043776392936707, logits - tensor([[-6.2368,  2.0025, -6.6619, -2.6617],
        [-6.7787,  2.1725, -6.0436, -1.6432],
        [-6.5824, -2.4641, -4.7268,  3.2749],
        [-6.1886, -4.2256,  1.7776, -2.7916],
        [-6.7115,  2.4269, -6.5028, -2.5215],
        [-5.8358, -3.0871,  2.0791, -2.4171],
        [-5.9806,  3.0346, -6.3746, -1.9383],
        [-7.1089, -3.1261, -6.0353,  2.2126]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [00:59<02:38,  1.32it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04831736534833908, logits - tensor([[-5.8637,  2.4607, -5.6616, -2.1248],
        [-5.8342,  3.5005, -6.4336, -2.5996],
        [-7.1434,  3.9147, -6.1853, -2.7636],
        [-6.4890,  2.3183, -5.9491, -2.4325],
        [-7.2524,  2.6683, -6.5250, -1.9773],
        [-6.1076, -2.5364, -4.5478,  1.9457],
        [-6.1350,  1.5666, -5.6712, -1.5497],
        [-5.7068,  2.4065, -6.2633, -2.5868]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [01:00<02:38,  1.32it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3739798963069916, logits - tensor([[-6.2086, -1.7386, -4.8580,  1.7200],
        [-7.1203,  2.1687, -6.9095, -1.5862],
        [-6.6158,  1.1286, -5.6565, -0.3755],
        [-6.2851,  1.9621, -5.4731, -1.5904],
        [-6.6119,  2.5977, -6.6365, -3.2790],
        [-7.0278,  2.2856, -5.5705, -3.5016],
        [-6.7150,  1.7301, -6.1159, -1.8568],
        [-6.9691,  2.5401, -6.3241, -2.1816]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [01:01<02:37,  1.32it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28199684619903564, logits - tensor([[-6.9817,  2.1588, -6.9474, -2.8637],
        [-7.7223,  3.4641, -7.4653, -3.0201],
        [-5.5910,  2.4684, -5.4960, -2.0925],
        [-4.7442, -3.1232,  1.6299, -2.3366],
        [-6.4078,  2.7255, -6.8937, -2.2795],
        [-6.7298,  2.6124, -6.8236, -2.5707],
        [-6.4840, -3.3040, -4.8991,  2.1426],
        [-7.3122,  0.0875, -4.9862,  0.3020]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:02<02:36,  1.32it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23711828887462616, logits - tensor([[-7.0197,  2.0439, -6.3358, -1.6193],
        [-6.4290, -0.1184, -5.5899,  1.2536],
        [-6.4651,  2.9918, -6.5992, -2.9602],
        [-5.5507, -1.6012, -5.4452,  0.9654],
        [-5.6544, -2.2427, -5.8530,  2.6795],
        [-6.8418,  2.0286, -6.0164, -1.7082],
        [-6.8104,  2.6701, -6.6254, -3.1525],
        [-5.6344,  2.5944, -5.8790, -2.4667]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▊       | 83/289 [01:02<02:35,  1.32it/s]

Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07205425947904587, logits - tensor([[-5.4164,  2.1697, -5.7016, -3.3834],
        [-5.1961, -3.0017,  1.5524, -1.9917],
        [-6.3113,  1.7077, -6.5578, -1.3530],
        [-6.4950, -1.2312, -5.0768,  0.4857],
        [-6.3197,  3.4180, -6.1628, -3.4502],
        [-6.4379,  3.3614, -6.6626, -2.4923],
        [-5.4470, -1.3700, -4.9830,  2.3586],
        [-6.0837,  2.6095, -6.0914, -2.9291]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 84/289 [01:03<02:34,  1.33it/s]

Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24012020230293274, logits - tensor([[-6.4219,  2.4969, -6.1156, -2.0517],
        [-5.0678, -3.0390,  2.0636, -1.9700],
        [-7.2069,  1.9116, -6.2424, -1.1017],
        [-5.9020,  0.5239, -5.3535, -0.4531],
        [-6.8724,  2.3007, -7.0771, -2.7899],
        [-6.5665,  1.8400, -6.1852, -2.0204],
        [-6.3653,  2.9057, -6.7765, -2.4158],
        [-7.3463,  2.8510, -6.9441, -2.7431]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 85/289 [01:04<02:33,  1.33it/s]

Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2042635977268219, logits - tensor([[-7.3241,  1.9566, -6.6985, -2.9011],
        [-7.1843, -3.4531, -5.6039,  2.9947],
        [-7.8343, -1.3412, -6.3673,  2.2174],
        [-5.1995, -2.7089,  1.2273, -1.6383],
        [-4.8624, -3.4852,  2.2059, -2.2583],
        [-6.0599,  1.9524, -5.7787, -1.4060],
        [-7.0982,  3.0205, -7.4581, -3.0236],
        [-5.3099, -2.9482,  2.2673, -2.1757]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|██▉       | 86/289 [01:05<02:33,  1.33it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1931154876947403, logits - tensor([[-6.6117,  2.4005, -6.5203, -3.0832],
        [-6.7864,  2.6793, -6.3979, -2.7583],
        [-5.0145,  1.7585, -5.3377, -2.7013],
        [-5.7818, -3.6732,  2.6615, -2.8419],
        [-6.1656,  1.6314, -5.0835, -1.3831],
        [-6.1343, -2.7268,  1.4638, -1.0166],
        [-6.2906, -4.2253,  2.7550, -2.4581],
        [-6.2703,  2.3346, -6.5599, -2.3951]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 87/289 [01:05<02:31,  1.33it/s]

Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23143459856510162, logits - tensor([[-6.0786,  0.6281, -4.8694, -0.4273],
        [-6.2042,  0.9171, -5.4063, -1.2660],
        [-5.8647,  1.7332, -6.1320, -0.5718],
        [-5.0763, -3.0931,  2.6109, -2.6132],
        [-6.9295,  2.5088, -6.1718, -3.0403],
        [-5.4807, -3.4816, -4.7105,  3.3110],
        [-5.7009,  1.6765, -5.5223, -2.5926],
        [-5.5274,  2.6429, -5.7421, -3.0698]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 88/289 [01:06<02:31,  1.33it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.039210569113492966, logits - tensor([[-8.2665, -1.9763, -6.3823,  2.7169],
        [-6.1370,  2.8402, -5.3662, -2.7255],
        [-5.5723,  2.9736, -5.6701, -2.6782],
        [-6.9345,  3.2638, -6.4348, -2.9054],
        [-6.0745,  1.3505, -6.2529, -2.4429],
        [-6.0872,  2.4734, -6.3067, -2.8389],
        [-6.0120,  2.4674, -6.3092, -2.1190],
        [-4.8858, -3.5433, -4.9571,  4.3312]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:07<02:30,  1.33it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0749158188700676, logits - tensor([[-7.2023,  2.2193, -7.2673, -2.2941],
        [-7.2492, -2.3806, -5.6528,  2.4407],
        [-6.0493,  1.7456, -5.5203, -2.3047],
        [-7.1390,  2.6163, -6.3940, -2.2621],
        [-6.6948,  1.5607, -5.8105, -1.5098],
        [-7.4225, -0.4895, -5.3012,  0.8925],
        [-6.4882,  1.6574, -5.2724, -2.3339],
        [-5.8390,  3.5177, -6.8497, -2.8228]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:08<02:29,  1.33it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12369484454393387, logits - tensor([[-5.7697, -1.7554, -5.3166,  1.8820],
        [-6.6677, -1.9512, -4.9140,  1.1202],
        [-6.2569,  2.7232, -6.3150, -2.8603],
        [-6.7149,  1.9742, -6.0410, -2.4159],
        [-5.9598,  2.8679, -5.7231, -1.8568],
        [-5.8291,  3.2287, -6.1843, -3.1066],
        [-4.7283, -2.7820, -4.5914,  2.8392],
        [-7.4890, -0.6660, -6.1963,  1.0772]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:08<02:28,  1.33it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17208288609981537, logits - tensor([[-7.2144,  1.0383, -6.4872, -0.7123],
        [-6.4208,  2.4053, -6.9652, -2.6745],
        [-5.4537, -1.1788, -6.1614,  2.0463],
        [-6.0735,  2.8301, -6.1348, -2.7027],
        [-5.8462,  1.8568, -6.6257, -1.8939],
        [-6.2257,  3.0781, -5.6499, -2.9777],
        [-5.3155, -3.6856,  1.8504, -2.2099],
        [-6.8849,  2.3502, -6.9402, -1.9250]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:09<02:28,  1.33it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 32%|███▏      | 93/289 [01:10<02:28,  1.32it/s]

loss - 0.26916003227233887, logits - tensor([[-5.5384,  2.0014, -5.7329, -2.2860],
        [-7.0212,  1.1091, -6.8551, -0.9949],
        [-6.8012,  2.3422, -6.3833, -1.5450],
        [-5.3433, -3.4356,  1.4635, -1.7762],
        [-5.6094,  2.6196, -6.1880, -3.0918],
        [-4.9836, -3.4878, -4.8687,  3.9833],
        [-6.9324,  0.6940, -6.2423, -0.7147],
        [-6.0808,  2.8060, -5.9300, -3.0321]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08166573196649551, logits - tensor([[-5.3351, -2.6955,  1.7556, -2.0836],
        [-5.7062, -2.3726,  0.7489, -1.6502],
        [-6.0945,  1.7174, -5.6545, -1.7581],
        [-7.3655, -2.2384, -5.0726,  2.1038],
        [-5.7457,  0.6465, -5.8273, -0.6794],
        [-6.1672, -3.8661, -5.0980,  4.1601],
        [-5.8368,  3.0110, -6.2415, -3.1822],
        [-6.5017,  3.1642, -5.8439, -3.

 33%|███▎      | 94/289 [01:11<02:28,  1.32it/s]

Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10177120566368103, logits - tensor([[-4.2488,  2.6288, -5.0008, -3.1612],
        [-5.9500,  2.3675, -6.5013, -1.7838],
        [-6.5111,  1.0286, -6.4744,  0.3311],
        [-6.3373,  1.6554, -6.0790, -1.9732],
        [-5.6885, -3.2910,  1.6358, -2.6618],
        [-6.1300,  1.4128, -4.9369, -1.6612],
        [-6.2590,  2.0099, -6.0197, -3.0192],
        [-5.6764, -2.8432,  1.7355, -1.0546]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 95/289 [01:11<02:27,  1.31it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30501657724380493, logits - tensor([[-6.2678,  2.2816, -5.5972, -2.1162],
        [-6.2824, -0.7225, -5.1509,  0.7324],
        [-5.6436,  2.4874, -5.0257, -3.1295],
        [-5.5525, -3.3488,  1.7505, -2.0190],
        [-6.6979,  0.6308, -5.4734, -0.3212],
        [-6.8986,  2.4106, -5.8074, -2.2454],
        [-5.2230, -3.2167,  2.0310, -2.1245],
        [-6.1486,  0.2308, -4.4616,  0.0583]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:12<02:27,  1.31it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 34%|███▎      | 97/289 [01:13<02:26,  1.31it/s]

loss - 0.19274075329303741, logits - tensor([[-5.9950,  2.5044, -6.0574, -2.7477],
        [-7.3246,  1.4850, -6.1017, -0.7160],
        [-6.0741, -1.0684, -5.6527,  1.4074],
        [-7.9339,  2.7976, -6.9217, -2.5764],
        [-7.6707,  1.5378, -6.0084, -1.4235],
        [-6.2890,  2.0987, -6.9322, -1.6372],
        [-6.7284,  0.9441, -5.9065, -1.6013],
        [-6.2779,  1.1877, -5.6669, -1.9447]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22309449315071106, logits - tensor([[-4.9786, -2.4437,  0.9613, -1.7515],
        [-5.9867,  2.8503, -6.7909, -2.4302],
        [-6.6350,  2.6689, -6.1881, -2.7573],
        [-7.3681,  2.1890, -6.3144, -2.5101],
        [-6.7568,  1.9907, -6.2553, -1.5364],
        [-6.5773,  2.6923, -6.3036, -2.7326],
        [-6.0252,  3.0772, -5.8459, -2.7346],
        [-6.7194,  2.2445, -5.8709, -1.

 34%|███▍      | 98/289 [01:14<02:25,  1.31it/s]

Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1462804228067398, logits - tensor([[-6.5258,  0.8796, -5.9773, -1.7200],
        [-7.3713,  2.1222, -7.2045, -2.7098],
        [-6.4221,  2.4099, -6.3912, -2.6102],
        [-7.8962, -1.5080, -6.6666,  1.6006],
        [-7.1372,  2.4256, -6.6569, -2.6249],
        [-6.3455,  1.9521, -5.6631, -2.0311],
        [-6.1373, -3.8896, -5.0214,  4.2017],
        [-4.9826, -3.5744,  1.2503, -2.3305]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 99/289 [01:15<02:24,  1.32it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1546141356229782, logits - tensor([[-6.2452,  1.9603, -5.7035, -2.1876],
        [-7.1860, -1.9593, -5.1579,  2.4441],
        [-6.3090,  2.2735, -6.6883, -3.2515],
        [-5.6554, -3.1802,  1.6654, -2.6511],
        [-4.5897, -2.7772,  1.6497, -2.0806],
        [-7.1206,  1.5113, -5.5997, -0.7963],
        [-6.4537,  2.5712, -6.5920, -3.0207],
        [-6.9608,  0.8553, -5.6586, -0.9481]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:15<02:22,  1.32it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0660506933927536, logits - tensor([[-5.9704,  2.7466, -5.3151, -3.3165],
        [-6.8501,  1.2794, -6.1621, -1.8035],
        [-7.0029, -2.3969, -5.0588,  2.4676],
        [-5.7015, -2.7525,  1.2533, -1.9777],
        [-6.4564,  1.7164, -5.9187, -1.6805],
        [-5.5904,  2.3585, -5.0193, -2.6120],
        [-7.0803,  1.9357, -5.8223, -1.9163],
        [-6.3436,  1.9561, -6.0534, -2.7581]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:16<02:21,  1.32it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07189753651618958, logits - tensor([[-7.0220,  2.5140, -6.4421, -2.7967],
        [-7.3865, -1.4229, -4.7958,  1.6729],
        [-6.9720,  1.5289, -6.4308, -2.3391],
        [-7.2934,  1.4404, -7.1383, -1.5862],
        [-6.5208,  1.8549, -5.9597, -1.2802],
        [-7.2609, -2.8434, -6.5098,  2.8733],
        [-7.1448, -2.0369, -5.2869,  2.6885],
        [-4.8079,  1.6057, -5.3154, -1.6561]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:17<02:21,  1.32it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2227858304977417, logits - tensor([[-6.3171,  2.3712, -6.2019, -2.1813],
        [-6.2316,  2.0248, -6.6493, -2.0956],
        [-4.7820, -2.6763,  1.9059, -2.3890],
        [-6.9741, -2.4275, -5.0330,  2.7769],
        [-6.7864,  0.7841, -5.4699, -0.8783],
        [-6.5890,  1.1516, -6.9215, -0.2118],
        [-5.8632, -1.2716, -3.9305,  1.5092],
        [-5.8522, -3.7811, -5.3265,  4.2933]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:18<02:20,  1.32it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07219253480434418, logits - tensor([[-6.4843,  1.7607, -7.2989, -1.3943],
        [-6.0846, -4.0031,  2.0164, -3.0624],
        [-7.7965,  1.6057, -6.9549, -1.5693],
        [-7.5080,  1.8892, -6.8919, -1.7071],
        [-6.0126, -3.2764,  2.1355, -2.8358],
        [-6.4775, -3.8783,  1.5867, -2.7524],
        [-7.2894,  1.3567, -6.3818, -1.8900],
        [-7.3172,  1.9221, -6.5036, -2.6992]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:18<02:19,  1.33it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1228826493024826, logits - tensor([[-7.8237,  1.6600, -5.7119, -1.4971],
        [-5.0275, -3.6757, -4.8125,  4.0706],
        [-7.3724,  2.9317, -7.0720, -2.8023],
        [-6.2329,  1.6752, -5.7307, -2.3794],
        [-6.9485,  1.4792, -5.4573, -1.5514],
        [-7.9903,  1.0129, -6.2378, -0.7156],
        [-7.6632, -3.0160, -6.5310,  3.2757],
        [-7.5014,  2.2236, -7.4345, -2.4081]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▋      | 105/289 [01:19<02:18,  1.33it/s]

Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3066157102584839, logits - tensor([[-6.5452,  1.2836, -6.3729, -0.6096],
        [-5.9253,  1.2123, -5.9749, -1.3095],
        [-6.5321,  1.1053, -5.4971, -1.8234],
        [-5.6749, -2.7814,  1.2956, -2.4268],
        [-6.0623,  2.2589, -6.1240, -2.4043],
        [-4.7105, -2.7812,  1.8728, -1.8277],
        [-7.3615,  2.9272, -6.5211, -2.0506],
        [-5.6073, -4.0512,  1.9626, -3.1068]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 106/289 [01:20<02:18,  1.32it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17591050267219543, logits - tensor([[-6.2460, -1.2635, -4.5825,  1.6130],
        [-6.9584,  2.8339, -6.9585, -2.1537],
        [-7.0578,  2.1488, -5.5700, -1.4375],
        [-5.3199, -2.1803, -4.8923,  3.1020],
        [-7.0986,  0.4803, -6.4078, -0.4180],
        [-6.8904,  0.6769, -5.8898, -0.2702],
        [-6.5237,  2.3527, -6.6117, -2.0874],
        [-7.5913, -1.5604, -6.6086,  1.0276]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:21<02:17,  1.32it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06132041662931442, logits - tensor([[-6.0772,  1.7708, -6.6461, -2.1126],
        [-6.5598, -3.5118,  1.6830, -2.3684],
        [-6.4850,  3.1663, -6.5655, -3.2270],
        [-6.1802, -3.6107, -4.2840,  2.7827],
        [-6.4586,  1.5275, -5.8640, -1.8223],
        [-7.6861, -1.9095, -5.0602,  2.2976],
        [-6.2931,  1.5393, -6.3247, -1.1815],
        [-5.9679,  2.3808, -6.1061, -2.7706]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 108/289 [01:21<02:17,  1.32it/s]

Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26327845454216003, logits - tensor([[-6.8665,  0.2762, -6.8279,  0.8518],
        [-8.0101,  1.2626, -6.9610, -0.5624],
        [-7.0767,  1.5566, -5.7786, -2.4037],
        [-7.0998,  0.8762, -6.5644, -1.2329],
        [-7.6005, -0.5771, -6.3331,  0.7933],
        [-7.3780, -1.1922, -4.8681,  0.6570],
        [-6.7458,  2.1873, -5.8931, -2.3037],
        [-7.4946, -0.2873, -5.3842,  0.0465]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 109/289 [01:22<02:16,  1.31it/s]

Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32772010564804077, logits - tensor([[-5.2821, -3.5863,  1.4777, -1.1969],
        [-5.0286, -3.4911, -5.5048,  3.3320],
        [-7.1999,  1.2108, -6.9345, -2.9129],
        [-6.7332, -0.6416, -5.9363,  1.0797],
        [-5.6940,  0.0438, -5.2084,  0.1327],
        [-6.4814,  2.3089, -6.3475, -1.8659],
        [-5.7126, -3.6016,  2.6961, -3.0907],
        [-6.7813,  2.0171, -6.7015, -2.2474]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 110/289 [01:23<02:16,  1.31it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22142869234085083, logits - tensor([[-7.3306,  1.0711, -7.6667, -1.4702],
        [-7.1173, -2.5690, -6.1412,  2.7535],
        [-5.4378,  3.3163, -6.2304, -3.2037],
        [-6.3410,  0.6079, -5.1295, -0.9723],
        [-7.5312, -2.9226, -4.2929,  1.0829],
        [-7.1997, -0.0558, -6.4507, -0.5145],
        [-6.3956, -0.0366, -6.1411,  0.0490],
        [-5.2807, -4.4578, -4.2299,  3.1173]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:24<02:15,  1.31it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2221565544605255, logits - tensor([[-5.6111, -3.6586, -4.8187,  3.8689],
        [-7.6121,  1.1146, -6.8103, -1.9752],
        [-7.9715, -0.6960, -6.6498,  0.5011],
        [-8.1155,  0.9914, -7.1836, -1.4392],
        [-6.4016,  2.0533, -6.0184, -2.9737],
        [-6.1149,  2.7424, -5.6635, -2.2343],
        [-6.9389,  0.8788, -6.8111, -1.0850],
        [-6.8159,  1.0518, -6.8164, -1.0038]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:24<02:14,  1.32it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24683746695518494, logits - tensor([[-6.7418,  0.0230, -5.5191,  1.5238],
        [-7.0768,  0.6164, -6.4499, -0.4970],
        [-5.9670, -3.4142, -4.5603,  3.7235],
        [-6.5480, -3.7765,  1.0658, -2.4561],
        [-6.9364,  2.6450, -6.3150, -1.7312],
        [-8.6244, -1.5055, -7.0345,  1.4105],
        [-5.5381, -3.1336,  1.2419, -2.4939],
        [-6.7410,  2.6655, -6.3454, -2.3622]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:25<02:14,  1.31it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20725251734256744, logits - tensor([[-5.9976,  1.9229, -5.8951, -2.4161],
        [-6.1842,  2.7263, -5.9018, -2.5133],
        [-7.3455, -0.4382, -6.3350,  0.6719],
        [-6.4021, -2.5937, -4.2316,  3.2009],
        [-6.2696,  1.5093, -4.9552, -2.1163],
        [-5.6589, -3.3977, -4.4864,  3.8973],
        [-5.2180, -3.3756,  1.7117, -1.8275],
        [-5.1431, -3.3423,  3.0830, -2.9843]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:26<02:12,  1.32it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16587693989276886, logits - tensor([[-6.3024,  1.7449, -5.9024, -1.5750],
        [-7.1336, -1.0572, -5.6993,  0.6558],
        [-5.1264, -3.2871,  2.4920, -2.5786],
        [-7.7393,  0.2471, -6.6150, -0.1627],
        [-6.3804, -3.6118,  1.9902, -2.8566],
        [-5.8991, -2.8500, -5.1000,  3.3134],
        [-5.9046,  1.2575, -6.2171, -1.2480],
        [-7.6826,  1.5031, -6.9075, -2.1625]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:27<02:11,  1.32it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3871815800666809, logits - tensor([[-6.4433,  1.2090, -5.6509, -1.1073],
        [-6.3890,  1.2897, -6.7117, -1.1317],
        [-7.3956,  0.2982, -6.3736,  0.7464],
        [-7.6153,  1.7437, -6.6140, -2.4964],
        [-7.8817, -2.8686, -6.4197,  3.2077],
        [-5.3912, -3.9709,  2.2759, -2.3581],
        [-6.6905, -2.7978,  1.5009, -2.2702],
        [-7.8116,  0.8970, -6.4885, -0.8529]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:27<02:11,  1.32it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15661051869392395, logits - tensor([[-5.4576, -4.0199,  2.1051, -2.3669],
        [-6.9022,  2.2963, -6.5893, -2.2091],
        [-6.9144,  1.2214, -7.3246, -1.0494],
        [-5.9446, -1.3577, -5.2079,  1.7422],
        [-4.5323,  2.1500, -6.4841, -1.8679],
        [-6.5152, -2.5428, -5.0745,  1.8642],
        [-7.0317,  1.5473, -7.2304, -1.4887],
        [-7.8435,  0.5419, -7.1895, -1.5357]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:28<02:10,  1.32it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21966180205345154, logits - tensor([[-6.9094,  2.2092, -6.8015, -1.9995],
        [-4.9779, -3.3972,  2.5485, -2.7798],
        [-6.0726,  1.9577, -5.8231, -2.9714],
        [-7.1647,  1.4846, -6.2819, -2.4604],
        [-6.1731, -4.1565,  1.4630, -2.2042],
        [-6.1514,  2.6837, -6.5352, -2.5211],
        [-6.3735,  2.3402, -6.2858, -3.1405],
        [-6.3323,  1.4735, -6.3596, -2.1090]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:29<02:09,  1.32it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24035419523715973, logits - tensor([[-5.5579,  2.0170, -6.5024, -2.6124],
        [-5.6862, -3.8832, -4.3718,  3.4171],
        [-7.4852,  0.8100, -6.3012, -1.5136],
        [-5.5261,  2.6312, -6.8897, -3.0053],
        [-7.1442,  0.4284, -6.6920,  0.0992],
        [-6.5400,  2.9517, -6.2101, -3.8509],
        [-6.2752, -3.5451,  1.5329, -2.7118],
        [-6.6011,  2.3197, -6.5784, -2.7642]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:30<02:09,  1.32it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1576755940914154, logits - tensor([[-5.9911, -3.9722,  2.1395, -3.0432],
        [-5.9291,  1.0671, -6.0236, -2.1518],
        [-7.2602, -1.8266, -4.6313,  1.3568],
        [-6.7489,  0.9269, -6.2656, -0.5788],
        [-7.3388,  2.0372, -7.1713, -2.2886],
        [-7.5192, -2.4264, -4.0350,  1.4937],
        [-8.0798, -0.7030, -5.6691,  1.1241],
        [-6.3039, -4.0215,  1.9518, -2.3297]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:30<02:08,  1.32it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25111445784568787, logits - tensor([[-6.8843,  1.8500, -6.5750, -2.0597],
        [-5.2067, -3.4963,  2.9467, -2.6030],
        [-6.6901,  2.7094, -6.3361, -2.5560],
        [-6.4899,  2.8844, -6.7270, -3.5893],
        [-5.5619,  2.6881, -5.6463, -2.4668],
        [-6.5922,  3.0452, -6.6292, -2.0579],
        [-5.4381, -3.8229,  2.2887, -2.6129],
        [-7.1269,  1.9297, -6.6373, -1.2408]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:31<02:07,  1.32it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34209394454956055, logits - tensor([[-7.8602,  2.5801, -7.2717, -3.0124],
        [-7.2689,  2.7850, -7.1510, -2.4761],
        [-6.9193,  2.6206, -7.1842, -2.7898],
        [-6.7397, -2.3823, -6.3520,  2.8331],
        [-6.2760,  2.1989, -6.6296, -1.3683],
        [-5.4789,  1.6723, -5.6542, -2.7439],
        [-6.4026,  2.3184, -7.5611, -2.5519],
        [-5.1427,  1.5773, -5.8524, -2.5440]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 122/289 [01:32<02:06,  1.32it/s]

Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.052888184785842896, logits - tensor([[-6.0832,  2.4775, -6.6374, -2.2334],
        [-6.2990,  2.1799, -5.6864, -2.2549],
        [-5.5212, -3.9952,  1.7686, -2.5243],
        [-7.0453,  1.2381, -6.2955, -1.0638],
        [-6.3090,  3.1565, -6.3689, -3.0828],
        [-6.0098, -3.3681,  2.5670, -2.6738],
        [-7.0313, -3.0043, -5.6273,  3.4264],
        [-5.6893,  2.3360, -5.3147, -3.4495]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 123/289 [01:33<02:05,  1.32it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1967693567276001, logits - tensor([[-6.6798,  2.1800, -6.5504, -2.3273],
        [-6.4927, -2.9853, -6.6299,  3.3794],
        [-6.8109,  1.3904, -6.8531, -1.6968],
        [-7.2603,  2.0562, -6.4746, -2.3275],
        [-5.9166, -2.7255,  1.5323, -2.3234],
        [-5.5426, -3.9034,  1.6385, -3.0259],
        [-7.3155,  2.4994, -7.2521, -2.0148],
        [-5.1304, -3.8711,  2.3907, -2.3356]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:33<02:04,  1.32it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2646321654319763, logits - tensor([[-6.1305,  2.1586, -7.0130, -1.6137],
        [-6.0744,  2.3070, -6.0091, -1.9330],
        [-4.8115, -3.4882,  1.9403, -2.6408],
        [-5.8971,  2.5143, -6.9050, -1.7914],
        [-6.3496,  3.2363, -6.7067, -3.2084],
        [-6.5740, -1.0028, -5.8818,  0.9824],
        [-5.7486,  2.1702, -5.7061, -1.7187],
        [-6.1647,  1.7943, -5.9076, -2.4458]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:34<02:04,  1.32it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06254620850086212, logits - tensor([[-6.0582,  2.6192, -6.2520, -2.4988],
        [-7.0324,  1.1895, -5.6275, -1.3282],
        [-7.2599,  3.4835, -6.6067, -2.8806],
        [-5.6460, -3.8132, -4.7912,  3.1544],
        [-5.5972,  2.0698, -6.3681, -2.4483],
        [-7.2297,  2.7214, -6.4322, -2.9529],
        [-5.3922,  2.1736, -6.6134, -2.4382],
        [-5.6861,  0.8135, -4.8832, -1.1252]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:35<02:03,  1.32it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07806149125099182, logits - tensor([[-6.3836, -3.3379, -5.9406,  3.1568],
        [-5.6720,  1.7266, -4.8584, -1.9802],
        [-4.8297, -3.5250, -4.0593,  3.6136],
        [-6.9239,  2.9952, -7.5052, -1.9958],
        [-5.9635,  2.6734, -6.4320, -3.0955],
        [-6.7645,  1.4068, -7.9144, -1.7012],
        [-7.6140,  0.3320, -6.5377, -0.3307],
        [-6.3087, -3.3593,  1.8924, -2.3575]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:36<02:02,  1.32it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05880920961499214, logits - tensor([[-5.9976, -3.0111, -5.1966,  3.2961],
        [-6.1577,  2.4840, -5.6672, -2.5749],
        [-6.8149,  1.8119, -6.4518, -2.5636],
        [-5.3235,  1.7987, -5.5623, -2.8264],
        [-6.2291,  1.4600, -6.1566, -1.4747],
        [-6.2553,  1.9172, -5.9584, -1.4648],
        [-6.1741,  2.6373, -6.5422, -1.9772],
        [-6.3401,  2.0971, -5.6701, -2.3522]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:36<02:01,  1.32it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05941460654139519, logits - tensor([[-5.9833,  2.1351, -5.7673, -2.6024],
        [-5.7820,  2.9593, -6.2972, -3.4597],
        [-7.2479, -1.6924, -5.8050,  1.2432],
        [-4.8074, -3.6593,  2.3033, -2.4069],
        [-6.4191,  2.4601, -6.6036, -2.1584],
        [-7.6145,  2.2236, -6.7931, -1.8960],
        [-5.9412,  2.4471, -6.2899, -1.8148],
        [-6.5029,  1.8135, -6.5588, -1.7938]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:37<02:01,  1.32it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2002197504043579, logits - tensor([[-6.2964,  2.9897, -6.5551, -2.4949],
        [-6.8742,  1.9041, -5.9173, -1.7805],
        [-6.0069,  2.1767, -6.2567, -2.7063],
        [-6.0872,  1.8848, -6.0777, -2.7689],
        [-6.1233,  1.5545, -6.5599, -2.7981],
        [-7.4569, -2.5160, -6.1331,  3.0108],
        [-6.8900,  3.4241, -6.7332, -2.4812],
        [-6.0236,  1.8870, -5.9629, -2.6653]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 130/289 [01:38<02:00,  1.32it/s]

Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08757330477237701, logits - tensor([[-6.1525, -3.7648,  2.7513, -2.2989],
        [-7.3888, -1.1568, -6.1226,  1.2019],
        [-6.5297,  2.6664, -5.9390, -2.6475],
        [-5.7027,  2.6057, -5.1586, -2.7185],
        [-6.9098,  2.6755, -6.7116, -2.7065],
        [-6.3708, -0.4655, -5.7105,  0.1078],
        [-5.5396, -3.6335,  2.3539, -2.5247],
        [-6.7438,  1.6673, -5.8903, -1.7807]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▌     | 131/289 [01:39<02:00,  1.31it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1850101798772812, logits - tensor([[-6.5862,  3.0946, -6.9350, -3.5440],
        [-6.9266,  3.1587, -6.6584, -3.0717],
        [-6.0734,  2.2105, -6.1548, -2.6734],
        [-5.9470, -3.2178,  2.9079, -2.7630],
        [-5.5585, -3.2415,  2.4394, -2.3411],
        [-6.4051, -2.5188, -5.2946,  2.5048],
        [-6.6964,  2.5699, -7.2585, -2.1706],
        [-6.1191, -3.0855, -5.3013,  2.6516]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:40<01:59,  1.32it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2848370671272278, logits - tensor([[-5.7379,  1.9652, -6.7838, -1.8384],
        [-5.5383, -2.5685,  1.4234, -2.0520],
        [-6.4647, -1.5809, -4.9024,  1.4364],
        [-7.6278,  2.0322, -6.5153, -2.0572],
        [-6.6530, -1.1764, -6.7728,  0.8197],
        [-5.5921, -2.2058, -5.3333,  1.8800],
        [-6.0860,  2.5781, -5.9750, -2.0835],
        [-7.2548,  1.8710, -7.0441, -2.5916]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 133/289 [01:40<01:58,  1.32it/s]

Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3879174590110779, logits - tensor([[-5.7194,  2.7695, -6.8419, -2.5102],
        [-6.4885, -1.3944, -5.4089,  0.9086],
        [-7.1071,  2.5616, -6.7245, -1.9402],
        [-6.7515,  2.1620, -5.8446, -1.8969],
        [-6.1767,  3.3871, -6.1687, -3.1017],
        [-5.4961, -1.9227, -4.9491,  1.6248],
        [-6.2537,  0.7505, -5.1452, -1.1766],
        [-5.9517, -3.0116,  2.4991, -2.9889]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▋     | 134/289 [01:41<01:57,  1.32it/s]

Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.155513197183609, logits - tensor([[-6.7318,  1.4159, -5.7152, -1.6670],
        [-6.8718,  1.6329, -6.3727, -1.1879],
        [-5.0706, -3.5051,  2.6483, -3.7036],
        [-5.6267,  2.5986, -5.2419, -2.2680],
        [-5.3937,  2.3503, -5.8738, -2.4697],
        [-6.6777,  2.4651, -7.1095, -2.6581],
        [-6.3149, -2.0071, -4.9243,  2.2671],
        [-5.7923,  2.6159, -6.9093, -2.2065]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 135/289 [01:42<01:56,  1.33it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34644901752471924, logits - tensor([[-5.9291,  2.0304, -6.4326, -2.0267],
        [-5.3538,  1.8437, -5.2099, -2.2243],
        [-5.5838, -3.6940,  2.2513, -3.3325],
        [-6.8524,  2.4798, -7.3415, -2.4944],
        [-6.1926, -3.8338, -3.3167,  2.8201],
        [-7.4765,  2.4125, -6.3469, -2.3109],
        [-6.9616, -0.3648, -6.1059,  0.7801],
        [-6.1638,  1.2116, -5.8623, -2.5137]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:43<01:55,  1.33it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.053610604256391525, logits - tensor([[-6.5742,  2.5796, -6.8008, -2.1286],
        [-6.7831,  2.2747, -6.9905, -1.8585],
        [-7.5971,  1.5863, -6.6359, -1.3507],
        [-6.3082,  2.1192, -6.2712, -1.7300],
        [-6.4654, -4.4744,  2.9133, -2.6262],
        [-5.8415,  1.9955, -5.6983, -2.8959],
        [-6.1723,  3.3615, -6.5770, -3.0265],
        [-5.3834,  2.4507, -6.7833, -2.3921]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 137/289 [01:43<01:54,  1.33it/s]

Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 48%|████▊     | 138/289 [01:44<01:53,  1.33it/s]

loss - 0.07066968083381653, logits - tensor([[-6.1847, -0.6506, -5.2545,  0.4013],
        [-5.8212,  1.5261, -6.2095, -2.4593],
        [-6.4189,  3.1591, -6.7559, -2.8517],
        [-5.8577,  2.0931, -5.6182, -2.3817],
        [-6.8746,  2.5397, -6.4697, -2.5252],
        [-5.9599, -2.6792, -4.7126,  1.7723],
        [-5.4059,  2.8162, -5.1537, -2.7189],
        [-6.2935,  1.9379, -6.5571, -2.6834]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 48%|████▊     | 139/289 [01:45<01:53,  1.32it/s]

loss - 0.221485435962677, logits - tensor([[-7.0623,  2.4093, -5.9519, -2.4077],
        [-6.6484,  1.8938, -6.9620, -1.1509],
        [-5.4367, -1.7025, -4.5141,  1.6351],
        [-6.6162,  0.9327, -6.0031, -1.5454],
        [-6.1879, -3.7287,  2.5425, -2.4000],
        [-7.5544, -1.1493, -6.0833,  1.3555],
        [-6.7184,  2.4813, -5.9977, -2.0621],
        [-6.1284,  2.3669, -6.7746, -2.6397]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08982427418231964, logits - tensor([[-6.9004,  2.1872, -7.0224, -2.2962],
        [-7.4983, -0.0964, -6.5738,  0.2871],
        [-6.2231,  1.4905, -6.0885, -1.5955],
        [-6.5349,  1.3748, -6.6733, -2.4951],
        [-5.0740, -4.0884,  2.7320, -2.9456],
        [-6.3005,  1.5449, -6.0002, -2.3118],
        [-6.2725,  2.6530, -6.2518, -3.2465],
        [-6.3918,  1.9194, -6.7839, -2.4

 48%|████▊     | 140/289 [01:46<01:53,  1.32it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07510295510292053, logits - tensor([[-6.5701, -4.1066, -5.8294,  3.1705],
        [-6.4963,  1.1491, -6.1069, -0.6201],
        [-6.9668, -3.7086, -5.9660,  3.4141],
        [-6.4957,  1.0333, -7.2005, -1.0129],
        [-6.2940,  1.9389, -6.4090, -2.5909],
        [-7.8349,  1.2910, -5.9302, -1.7647],
        [-5.7444,  2.2313, -6.5716, -2.2187],
        [-6.7864,  2.6376, -6.2632, -2.7953]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:46<01:53,  1.31it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3935947120189667, logits - tensor([[-7.4067,  2.5837, -7.4755, -2.1865],
        [-7.2488,  2.5289, -6.9447, -2.6093],
        [-6.0760,  1.9465, -6.2893, -2.2208],
        [-7.9813,  2.5360, -8.2980, -2.5976],
        [-7.1111,  2.0382, -6.5499, -2.7261],
        [-5.9785,  0.6568, -4.7721, -1.2071],
        [-6.2820,  2.9409, -6.6426, -2.9023],
        [-6.0808, -4.0906,  3.1720, -3.5602]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:47<01:52,  1.31it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 49%|████▉     | 143/289 [01:48<01:51,  1.31it/s]

loss - 0.262265682220459, logits - tensor([[-6.9871, -1.2153, -5.3669,  1.4200],
        [-5.8905, -3.2834,  2.4727, -2.1813],
        [-6.1335, -2.5820,  2.1124, -2.5469],
        [-7.5419, -0.3317, -6.3907,  0.7620],
        [-5.5498, -0.9123, -5.1305,  0.6743],
        [-6.0295, -3.8338,  1.9267, -2.8504],
        [-6.1150,  1.1213, -5.9329, -1.9905],
        [-5.9258,  1.7596, -6.5608, -1.2823]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23998181521892548, logits - tensor([[-5.7344, -0.6796, -4.1362,  0.0950],
        [-5.8919, -2.3850, -5.8284,  3.0948],
        [-7.0537, -2.2876, -5.6865,  1.6407],
        [-7.5946,  1.1721, -6.0854, -2.0972],
        [-5.3861, -3.2203,  2.1979, -2.7871],
        [-6.0746,  2.5083, -5.8147, -1.8790],
        [-6.1267,  2.2988, -6.0581, -2.2091],
        [-6.6881,  2.6797, -6.6111, -2.1

 50%|████▉     | 144/289 [01:49<01:49,  1.32it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16168534755706787, logits - tensor([[-4.6484, -3.1496,  2.3084, -2.6547],
        [-7.2555,  0.9522, -6.8448, -1.3427],
        [-7.1987,  2.3474, -6.6573, -3.1222],
        [-5.8300, -3.6674, -5.4659,  3.3247],
        [-8.2819,  1.7525, -6.7034, -2.4039],
        [-6.2643, -2.9810, -5.8352,  2.0605],
        [-6.6058,  0.7808, -6.4077, -1.2858],
        [-6.1128,  1.0457, -5.5220, -1.6146]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|█████     | 145/289 [01:49<01:48,  1.32it/s]

Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1279657483100891, logits - tensor([[-5.6594,  2.3757, -5.4241, -2.4592],
        [-5.6731, -2.4091, -6.2401,  2.8628],
        [-7.2533,  1.8944, -6.7420, -2.3237],
        [-6.6450,  2.4071, -6.7945, -2.3499],
        [-6.6889,  2.4682, -5.5132, -1.9670],
        [-6.1858, -0.3964, -6.1072,  1.7217],
        [-5.5377,  2.2137, -5.7809, -2.3362],
        [-5.5042, -3.7965,  2.9886, -2.8412]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 146/289 [01:50<01:48,  1.32it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07566796243190765, logits - tensor([[-5.6415,  1.6281, -5.4051, -1.9137],
        [-6.5463,  1.9293, -6.5320, -2.0210],
        [-7.3777,  1.6792, -7.2152, -2.8885],
        [-6.7571,  2.4967, -7.0639, -1.8449],
        [-5.4246,  1.1953, -5.9270, -2.7575],
        [-5.9268,  2.4834, -5.8296, -2.1137],
        [-6.2634,  0.8071, -6.6036, -0.7399],
        [-7.8718,  3.0370, -7.6857, -3.2955]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 147/289 [01:51<01:47,  1.33it/s]

Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07067526131868362, logits - tensor([[-6.1365,  1.7875, -6.8974, -2.0857],
        [-5.9112,  1.9785, -6.1475, -1.6315],
        [-6.6927,  2.0974, -6.3004, -2.3082],
        [-6.3796,  2.0703, -6.6382, -2.1417],
        [-5.1158,  2.2963, -5.9921, -2.6963],
        [-6.9258, -0.3459, -5.3824,  1.7315],
        [-6.9595,  2.4875, -7.5726, -3.0272],
        [-6.5749,  2.0777, -6.3639, -2.2105]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 148/289 [01:52<01:46,  1.33it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2365541011095047, logits - tensor([[-5.4893,  2.2617, -6.3714, -2.2787],
        [-6.0722,  1.8190, -5.9348, -1.7254],
        [-6.4278,  2.5850, -7.8935, -2.5771],
        [-5.4329,  2.2510, -6.5244, -2.6878],
        [-6.6916,  2.8997, -5.7909, -2.6407],
        [-5.8530,  1.7378, -6.3493, -2.5451],
        [-5.1326, -3.7669,  2.8126, -2.8965],
        [-6.1642, -1.0379, -0.7755, -0.9149]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:52<01:45,  1.33it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26498478651046753, logits - tensor([[-6.4049, -2.9322, -5.5346,  3.3594],
        [-5.4981, -2.5106, -6.2104,  2.6671],
        [-6.3373,  2.5859, -6.4675, -2.5445],
        [-6.6920,  2.1190, -6.7812, -2.5899],
        [-6.0057,  1.3229, -5.6826, -1.1217],
        [-6.5833,  1.9858, -6.4279, -1.9052],
        [-5.4710,  1.9758, -6.1176, -2.4234],
        [-6.2278,  0.6076, -6.4559, -0.7347]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:53<01:44,  1.33it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1567874252796173, logits - tensor([[-5.7125,  2.5283, -5.9064, -1.8137],
        [-6.8953,  1.7670, -6.1645, -1.5139],
        [-6.9991, -2.7340, -6.3394,  2.6226],
        [-6.5506,  1.1872, -5.5455, -1.6552],
        [-6.8170,  2.2441, -6.4118, -2.6723],
        [-5.5967, -3.9584,  2.8981, -3.2367],
        [-6.5029,  1.7966, -6.5701, -1.5425],
        [-8.2042,  1.9945, -7.4344, -1.3265]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:54<01:43,  1.33it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07563342154026031, logits - tensor([[-5.2277, -3.8890, -5.1444,  3.8142],
        [-6.9996, -1.5973, -5.4440,  2.2540],
        [-7.1562, -0.3241, -6.8849,  0.7766],
        [-6.2440,  2.3163, -6.6034, -1.9899],
        [-7.6997,  1.5980, -6.8014, -1.2548],
        [-5.4113, -3.6935,  2.1368, -3.0550],
        [-6.2695,  2.6405, -6.3026, -2.7899],
        [-6.3398,  2.2854, -6.4439, -2.5956]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:55<01:43,  1.33it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05747615545988083, logits - tensor([[-6.3291,  2.5313, -6.5598, -1.5868],
        [-6.7360, -2.6597, -6.4361,  2.9322],
        [-6.8017,  2.2892, -6.3222, -2.5055],
        [-5.7261,  2.6015, -6.1955, -2.2608],
        [-5.3503,  1.8139, -6.3090, -2.4705],
        [-6.1774, -2.2426, -4.9291,  2.2328],
        [-6.5707,  1.4734, -5.9457, -0.9430],
        [-6.0964, -4.0893,  2.7916, -3.5362]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:55<01:42,  1.33it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24543645977973938, logits - tensor([[-6.2989,  1.9933, -6.0487, -1.8284],
        [-5.5975, -3.6206,  2.1737, -2.4005],
        [-6.9671,  2.0211, -6.2887, -1.5925],
        [-6.9061, -1.9017, -6.3385,  1.6971],
        [-6.5980,  0.8840, -6.2803, -1.1615],
        [-5.9281,  1.7899, -6.4019, -3.6696],
        [-6.2882,  2.4597, -6.5934, -2.2535],
        [-6.3609,  2.9326, -6.1374, -3.3225]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 154/289 [01:56<01:41,  1.33it/s]

Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38047921657562256, logits - tensor([[-6.7894,  2.1986, -5.6093, -1.1943],
        [-6.4028,  2.2654, -6.9303, -2.4955],
        [-5.6108, -2.9862,  2.7933, -2.0463],
        [-5.2245, -3.6763,  1.9386, -1.7726],
        [-6.6418,  1.9411, -7.2690, -2.2733],
        [-6.8826, -3.2361,  1.4623, -1.6872],
        [-6.3311,  0.8601, -5.9698, -0.5351],
        [-7.2351,  2.0201, -6.5812, -1.0678]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▎    | 155/289 [01:57<01:41,  1.33it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08147899806499481, logits - tensor([[-6.6746,  1.4777, -6.2642, -2.0587],
        [-7.3381,  2.1210, -6.9826, -1.8881],
        [-6.0376, -4.6023, -5.8409,  4.3142],
        [-6.8319,  2.2707, -6.6786, -2.3240],
        [-6.7524,  1.4850, -6.8770, -2.6931],
        [-7.2281,  1.3390, -6.3248, -1.8902],
        [-7.9949,  1.3000, -6.5027, -1.8645],
        [-6.5427,  0.5381, -6.6090, -1.0392]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:58<01:40,  1.33it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09439868479967117, logits - tensor([[-7.7186,  2.4247, -6.6324, -1.9842],
        [-6.4946, -4.0518, -5.8735,  3.7612],
        [-7.2715,  1.3442, -6.4795, -1.4510],
        [-7.0242, -2.1923, -6.5027,  2.7996],
        [-6.1229, -3.7546,  2.4865, -3.2170],
        [-6.3885,  2.7463, -6.1174, -2.6116],
        [-6.8043,  1.4236, -6.3030, -2.1319],
        [-7.0796,  0.3391, -5.4599,  0.0785]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:58<01:39,  1.33it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10430200397968292, logits - tensor([[-6.9791,  1.3573, -6.6244, -1.6846],
        [-5.4189,  1.8874, -5.1932, -2.0511],
        [-7.7721,  3.1335, -7.1885, -3.0840],
        [-6.6506,  2.4234, -6.3396, -1.7103],
        [-6.0713, -1.8263, -4.8913,  1.5952],
        [-7.2581,  3.3696, -6.4029, -2.6682],
        [-4.8371, -3.2737,  1.8918, -2.5509],
        [-7.8889,  0.7468, -6.1717,  0.4932]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:59<01:38,  1.33it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11326978355646133, logits - tensor([[-5.3727, -2.9344, -4.4916,  2.9874],
        [-6.0999,  2.2164, -6.4158, -1.9938],
        [-6.7763, -1.0672, -5.2046,  1.0720],
        [-6.2147, -3.0728, -5.0599,  2.0623],
        [-7.0061,  2.7860, -7.0205, -2.1075],
        [-6.3729,  2.6683, -6.3423, -2.3020],
        [-8.0116,  0.8191, -5.8689, -0.0645],
        [-4.6297, -2.9347,  2.3297, -2.3288]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [02:00<01:38,  1.32it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2849605083465576, logits - tensor([[-7.4588, -0.7487, -6.2776,  1.7330],
        [-6.4483,  0.6859, -6.2105, -0.4898],
        [-8.4237, -2.0536, -5.8261,  2.0010],
        [-7.4656, -1.6853, -6.9154,  1.2371],
        [-6.7646, -3.8562, -6.5626,  3.5330],
        [-7.8482,  2.5357, -6.7213, -2.6090],
        [-7.4518,  2.7006, -6.8591, -2.4849],
        [-6.8779,  2.2125, -6.7454, -1.6809]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [02:01<01:38,  1.31it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2328026294708252, logits - tensor([[-5.8111, -3.8598,  2.6003, -1.8837],
        [-6.4280, -3.5220, -5.5031,  3.1335],
        [-5.9012,  2.2432, -6.3066, -0.9766],
        [-6.7991,  2.8055, -6.4539, -2.2236],
        [-5.3042, -3.0333,  2.0390, -1.9860],
        [-6.9425,  1.7286, -6.2817, -1.0754],
        [-6.4117,  1.7813, -6.4648, -1.3135],
        [-7.2315,  1.6258, -7.1314, -1.9511]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:01<01:37,  1.31it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40865224599838257, logits - tensor([[-7.0291,  3.3783, -7.4709, -2.2494],
        [-6.9623,  0.1550, -5.8134, -1.1458],
        [-5.6828, -4.3856,  2.7438, -2.4224],
        [-7.5705,  2.6462, -6.6397, -2.4155],
        [-7.0974,  0.7130, -6.5225, -0.8279],
        [-6.6153,  1.4751, -6.5670, -1.7849],
        [-5.6798,  1.9309, -5.9805, -1.8706],
        [-7.0281, -2.7482, -6.6085,  2.8026]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:02<01:36,  1.31it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 56%|█████▋    | 163/289 [02:03<01:36,  1.31it/s]

loss - 0.13469654321670532, logits - tensor([[-6.5170,  2.1351, -6.7296, -1.3963],
        [-6.9678,  1.1609, -6.0296, -0.5623],
        [-6.5867, -2.9695, -4.6062,  4.0170],
        [-5.3731, -3.6515,  2.0483, -2.9159],
        [-7.5361,  1.8876, -6.5841, -1.8825],
        [-5.4456,  1.9199, -5.8855, -2.5868],
        [-6.4772,  1.3807, -6.4543, -1.5768],
        [-5.7440, -3.3402,  1.4367, -2.5708]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2269584834575653, logits - tensor([[-8.7652,  1.5876, -6.4134, -1.4138],
        [-6.4532,  2.2579, -6.9858, -2.3892],
        [-6.5572,  1.5913, -7.0120, -1.3965],
        [-6.8887,  1.7043, -6.3959, -1.3739],
        [-7.2268, -3.3576, -0.0316,  0.0838],
        [-6.9474,  0.9687, -5.7838, -1.1267],
        [-6.6416,  2.1856, -6.6261, -2.5858],
        [-5.7531, -1.5242, -2.7349,  1.

 57%|█████▋    | 164/289 [02:04<01:35,  1.30it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 57%|█████▋    | 165/289 [02:05<01:34,  1.31it/s]

loss - 0.05351511761546135, logits - tensor([[-6.3074, -1.7349, -5.3530,  2.1649],
        [-6.9142,  1.8820, -7.3483, -2.2972],
        [-5.6450,  3.2536, -6.8759, -3.1252],
        [-4.8783, -3.1253,  1.6758, -3.3953],
        [-7.0378, -2.8394, -5.3748,  2.1476],
        [-5.5123, -3.0740,  2.0883, -2.5569],
        [-5.8211, -3.1115, -4.6077,  3.4293],
        [-6.6139,  1.8815, -6.0262, -1.4660]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05336771160364151, logits - tensor([[-7.2122,  2.6835, -6.4825, -2.8393],
        [-7.0513,  2.4887, -6.6824, -3.0579],
        [-5.8103, -3.5403,  1.6790, -2.6870],
        [-7.3449,  1.8224, -6.4159, -1.9696],
        [-5.6184,  1.6465, -5.5330, -1.4226],
        [-5.9095, -3.2373, -5.0890,  3.2544],
        [-5.7519,  2.0571, -6.4858, -2.5366],
        [-5.6408, -3.9919,  2.1601, -2

 57%|█████▋    | 166/289 [02:05<01:33,  1.31it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07110776752233505, logits - tensor([[-7.8321,  1.7031, -7.7518, -1.4572],
        [-6.0194,  2.4803, -7.0798, -2.4058],
        [-6.6019,  3.1818, -6.8007, -2.2201],
        [-6.8768,  0.5139, -5.7517, -1.1613],
        [-6.6398, -3.1592, -4.1869,  3.7303],
        [-5.5583,  2.6581, -6.7768, -2.0289],
        [-5.1543,  2.3488, -6.3669, -1.5423],
        [-5.2484, -3.3249,  1.8252, -2.5413]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:06<01:32,  1.32it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24203018844127655, logits - tensor([[-5.7536, -3.2002,  3.0505, -2.6468],
        [-5.6218,  2.2557, -6.6163, -2.5862],
        [-7.0698, -2.7231, -6.5234,  3.4160],
        [-4.7140, -3.7193,  2.3462, -3.0239],
        [-6.6937,  2.3174, -7.1233, -2.5130],
        [-5.8429, -2.2303, -4.5208,  1.8234],
        [-7.5939,  1.0407, -6.2578, -1.9308],
        [-7.6562,  0.4630, -5.9312, -0.7137]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:07<01:32,  1.31it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17861820757389069, logits - tensor([[-5.4842,  2.5321, -5.8882, -1.9174],
        [-6.0047,  1.9558, -5.8874, -3.3989],
        [-6.9453,  2.4373, -7.5306, -2.1589],
        [-7.0268,  0.8479, -6.2847, -2.2452],
        [-6.3799, -2.9036, -5.1087,  2.9559],
        [-7.4352,  1.2845, -6.1330, -1.5308],
        [-6.6723,  0.6260, -6.9297, -0.9404],
        [-8.4729,  1.8367, -7.8066, -2.1161]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:08<01:31,  1.31it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10075358301401138, logits - tensor([[-5.4264,  2.5200, -6.1472, -2.1962],
        [-6.8895,  2.0858, -6.1463, -1.9309],
        [-6.7254,  1.6627, -6.5022, -1.0208],
        [-6.7499,  1.7481, -5.7324, -2.1504],
        [-6.9366,  1.9193, -5.8564, -2.3577],
        [-6.9167,  1.0337, -6.3302, -1.2643],
        [-5.7589,  0.6969, -5.8176, -1.3218],
        [-5.7033,  1.4781, -6.0635, -0.7869]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:08<01:30,  1.32it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28188183903694153, logits - tensor([[-5.0175, -2.2194,  1.1713, -1.4701],
        [-7.3278, -2.1403, -4.9137,  2.4737],
        [-7.4807,  2.5409, -7.1849, -2.5651],
        [-6.9280,  0.7681, -7.2018, -1.1611],
        [-7.5794, -1.0967, -5.5470,  2.0015],
        [-5.4692, -3.4408,  2.5043, -2.8948],
        [-7.0647, -3.7039,  1.9871, -2.7001],
        [-6.4040, -2.6546, -5.5310,  2.8122]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:09<01:29,  1.32it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18256372213363647, logits - tensor([[-6.0106,  1.5414, -5.9431, -2.2344],
        [-6.5908,  1.5384, -6.8739, -0.9560],
        [-7.6782,  3.0722, -7.2639, -2.7917],
        [-4.9141,  2.0013, -5.1977, -2.2789],
        [-5.0247, -2.9466,  1.5152, -2.3267],
        [-7.1322,  2.6463, -6.7668, -2.5659],
        [-6.4515,  2.6610, -6.5272, -2.6156],
        [-6.4788,  2.4017, -6.8675, -2.1548]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 172/289 [02:10<01:28,  1.32it/s]

Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19810345768928528, logits - tensor([[-7.7339,  2.4198, -6.8908, -2.4640],
        [-5.8441, -3.3741,  2.0334, -1.9274],
        [-6.1234, -3.7546,  2.2880, -2.6076],
        [-5.5382, -3.0797,  2.1536, -2.1170],
        [-7.1253, -3.3578, -5.1800,  2.1450],
        [-6.6100,  1.9853, -6.7127, -2.5804],
        [-6.8625,  1.5532, -6.9670, -1.7271],
        [-6.4863,  2.7355, -6.9070, -2.6863]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 173/289 [02:11<01:27,  1.33it/s]

Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07727228105068207, logits - tensor([[-6.3162, -3.7063, -5.0504,  3.9694],
        [-4.9567, -2.7963,  2.1361, -2.4993],
        [-6.4325,  2.0295, -5.9558, -1.5444],
        [-5.2416, -3.9399, -4.7509,  3.4858],
        [-6.1998,  1.9189, -5.8024, -1.9901],
        [-7.5361,  1.9633, -7.2881, -2.5431],
        [-6.8295, -3.5453, -5.4494,  3.9287],
        [-6.9238,  0.2164, -5.2408,  0.6067]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|██████    | 174/289 [02:11<01:26,  1.33it/s]

Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19613221287727356, logits - tensor([[-6.1798,  2.6236, -6.1334, -2.4699],
        [-4.7274, -2.3978,  1.4570, -1.9190],
        [-6.6585,  1.6189, -5.8470, -1.1257],
        [-6.0181, -3.6749, -6.4434,  4.2788],
        [-4.7816, -3.0641,  2.0641, -2.4958],
        [-4.6248, -2.8042,  1.8844, -1.7295],
        [-5.7773,  1.3832, -4.8291, -1.4003],
        [-6.7753,  2.5346, -6.0037, -1.7923]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 175/289 [02:12<01:25,  1.33it/s]

Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09187231212854385, logits - tensor([[-6.6183,  0.2324, -6.3569,  0.0772],
        [-4.4909, -2.9986,  1.7922, -1.8689],
        [-6.8598,  1.6217, -6.9291, -2.4375],
        [-6.7482, -3.0711, -5.7575,  2.7421],
        [-6.3443,  2.4366, -6.8988, -2.2592],
        [-6.4046,  2.7122, -7.2057, -2.3878],
        [-6.9302,  2.4729, -6.3462, -2.1856],
        [-6.9419,  2.6070, -6.7743, -2.0081]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 176/289 [02:13<01:25,  1.33it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04972351714968681, logits - tensor([[-8.1720,  2.6513, -7.7732, -2.3891],
        [-6.9072,  2.2881, -7.2190, -1.9895],
        [-6.4325,  1.6962, -5.8700, -2.0391],
        [-7.8339,  3.1028, -6.9041, -2.6867],
        [-5.6616, -3.0572,  1.7648, -1.8321],
        [-5.2738, -3.4105, -4.9051,  3.8531],
        [-6.5320,  1.9678, -6.9228, -1.7813],
        [-6.1655,  3.0038, -6.2514, -3.2427]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:14<01:24,  1.33it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22632630169391632, logits - tensor([[-5.6909, -3.7609,  1.9926, -2.8760],
        [-7.1486,  2.7730, -7.0238, -2.2955],
        [-7.2492,  0.8783, -6.2622, -0.7456],
        [-6.7246, -2.7120, -4.8909,  2.5644],
        [-6.8789,  1.5122, -6.0874, -1.4891],
        [-5.6053,  1.9578, -5.5654, -1.7526],
        [-7.3481,  2.6226, -6.7252, -2.3111],
        [-5.3990, -2.5512,  1.3371, -1.6236]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:14<01:23,  1.33it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14635004103183746, logits - tensor([[-5.0835, -2.5251,  2.0641, -2.2705],
        [-6.4659,  2.5940, -6.3329, -1.7596],
        [-5.9850,  2.3657, -6.4123, -2.2818],
        [-4.7999, -3.3617,  2.2379, -2.1439],
        [-6.9452,  1.2320, -6.6598, -1.4637],
        [-7.4215,  2.0003, -6.8710, -2.5677],
        [-7.3675,  2.6945, -7.2260, -2.4669],
        [-5.2870,  2.3916, -5.9311, -2.3103]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 179/289 [02:15<01:23,  1.32it/s]

Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2526494860649109, logits - tensor([[-7.0590,  1.2640, -6.0131, -1.3786],
        [-7.5385,  0.0170, -6.4118,  1.3534],
        [-6.0878,  0.9313, -5.8600, -1.5867],
        [-4.9069, -2.6999,  1.8460, -2.2473],
        [-7.1401,  3.1347, -6.3759, -2.8318],
        [-6.4867,  2.6733, -6.4208, -1.5863],
        [-7.0008,  2.8300, -7.5724, -2.7357],
        [-5.3236, -3.4792,  2.0946, -3.2136]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 180/289 [02:16<01:22,  1.32it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1703013777732849, logits - tensor([[-7.6088,  2.7938, -6.8448, -1.9451],
        [-5.9679, -3.7983, -6.2432,  4.0437],
        [-6.0495, -3.5378,  2.1924, -3.2053],
        [-6.6046,  1.3827, -6.6207, -2.0324],
        [-6.0596,  3.1773, -6.9476, -2.4773],
        [-6.4198, -2.3880, -5.4645,  3.4280],
        [-5.3979, -3.0343,  1.9925, -2.0813],
        [-4.6851, -3.5595, -5.0821,  3.8908]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:17<01:22,  1.31it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2043946534395218, logits - tensor([[-6.1130,  1.5031, -5.5758, -2.0736],
        [-6.6222,  1.9685, -6.5488, -1.8317],
        [-6.1116,  2.4928, -6.2032, -2.3924],
        [-6.7371,  1.6091, -6.1067, -1.9532],
        [-6.3135, -2.5513, -5.9516,  2.3917],
        [-6.8094, -2.6692, -4.8573,  3.8329],
        [-6.6323, -3.1833, -4.9742,  3.3116],
        [-5.5194, -3.8894,  2.1731, -2.5170]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:17<01:21,  1.31it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3541352152824402, logits - tensor([[-6.7212,  2.6856, -6.2222, -2.6073],
        [-7.6947,  1.3846, -6.4822, -1.6867],
        [-7.5043,  2.8309, -6.0841, -2.0432],
        [-7.2320, -2.5023, -5.8361,  3.6217],
        [-7.1791,  2.6692, -6.7391, -2.5789],
        [-6.7503,  0.7690, -6.6482, -0.4021],
        [-6.0794, -3.2238, -4.5265,  3.0942],
        [-6.4096,  2.1569, -6.8074, -2.7088]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:18<01:21,  1.31it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23247423768043518, logits - tensor([[-7.0733,  3.1879, -7.9760, -2.4650],
        [-5.3042, -2.8149,  2.3392, -2.3010],
        [-6.9053,  1.9565, -6.7082, -2.8226],
        [-7.0229,  2.2517, -6.9320, -2.3841],
        [-6.2461, -4.3429,  1.8188, -2.4690],
        [-5.5696, -3.7200,  2.6522, -2.8860],
        [-6.1156,  2.2247, -6.3298, -1.8598],
        [-7.2614,  0.9996, -6.9568, -1.6120]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:19<01:20,  1.31it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35307782888412476, logits - tensor([[-7.3058,  3.0124, -6.1118, -2.5191],
        [-6.2281,  2.1059, -6.7362, -1.8276],
        [-6.1755,  1.7707, -6.4852, -2.9153],
        [-7.0930,  1.7018, -6.2145, -2.0347],
        [-6.5709,  2.1715, -5.9457, -2.5508],
        [-6.1310, -2.3991,  1.1141, -2.2799],
        [-5.9852, -3.8743,  2.1020, -1.8457],
        [-7.4107,  1.0902, -5.8684, -1.3279]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:20<01:18,  1.32it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05783652886748314, logits - tensor([[-8.0637,  2.8797, -7.3473, -3.3974],
        [-6.5233, -3.2177,  2.2821, -3.0125],
        [-6.6411,  1.0381, -5.7090, -2.1603],
        [-5.7421, -3.7311,  2.2421, -2.7650],
        [-7.2466,  1.6122, -6.1437, -1.6989],
        [-7.5725,  2.9214, -7.6676, -3.5125],
        [-6.3753,  2.6377, -5.3249, -2.8594],
        [-6.3500,  1.5525, -7.2264, -1.4582]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:20<01:18,  1.32it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07043728232383728, logits - tensor([[-6.8715,  2.7278, -6.7027, -2.5684],
        [-5.2379, -1.9493,  1.6404, -2.2966],
        [-6.8718,  2.3961, -6.5475, -2.8415],
        [-6.7339,  0.9382, -6.4892, -0.3551],
        [-6.7661, -4.0404, -6.0059,  3.5979],
        [-6.3939, -4.1472,  2.1340, -2.7731],
        [-6.4446,  2.2828, -7.0052, -2.2618],
        [-7.1632,  2.2011, -7.1765, -1.8255]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:21<01:17,  1.32it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21458418667316437, logits - tensor([[-8.0358, -0.9662, -5.9731,  1.2823],
        [-6.9066,  1.8363, -5.9515, -1.9486],
        [-7.4381,  2.5149, -6.3500, -2.6430],
        [-6.8000, -3.7564,  2.8803, -3.1157],
        [-6.9398, -3.3826, -5.3101,  3.3045],
        [-6.6409,  2.4067, -7.3613, -2.5455],
        [-5.8162,  2.4454, -7.2681, -2.8936],
        [-7.2682,  1.7887, -6.8775, -3.0377]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 188/289 [02:22<01:16,  1.32it/s]

Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06640810519456863, logits - tensor([[-5.3412, -3.8250,  2.5680, -2.6092],
        [-6.0106, -3.6935,  2.4799, -2.4441],
        [-6.5184,  2.2356, -6.2004, -1.9968],
        [-4.8286, -2.1630,  0.9219, -1.7370],
        [-6.0858, -3.2942, -4.4451,  3.1410],
        [-5.6150, -3.0731,  1.7028, -2.2994],
        [-5.9082,  2.9312, -6.8670, -2.7209],
        [-6.9532,  1.5733, -7.0441, -1.4933]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 189/289 [02:23<01:15,  1.33it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3129923939704895, logits - tensor([[-6.4968, -2.8679,  1.9794, -3.0074],
        [-5.7704,  3.4944, -6.7325, -2.2025],
        [-6.7070,  1.3732, -5.8525, -1.3648],
        [-6.8558,  1.7541, -6.5700, -1.8195],
        [-7.2381,  2.1304, -6.3826, -1.8563],
        [-5.9762,  2.4069, -6.4266, -2.9353],
        [-6.5726,  2.5068, -6.6910, -2.6459],
        [-6.3488,  2.5358, -6.0499, -1.7478]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:23<01:14,  1.33it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10149082541465759, logits - tensor([[-6.9959,  2.4789, -7.4155, -1.9272],
        [-7.9591, -0.2098, -6.8870,  0.1235],
        [-6.1968, -3.7548,  1.5732, -2.6274],
        [-6.7902,  0.4201, -6.9306, -0.6763],
        [-6.1302,  2.0520, -5.7994, -2.4203],
        [-5.3119, -2.9716,  2.7955, -3.0395],
        [-7.0918, -2.8656, -6.5702,  2.9755],
        [-5.7726,  2.3693, -7.0013, -3.6051]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 191/289 [02:24<01:13,  1.33it/s]

Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3257148861885071, logits - tensor([[-5.7125, -4.0711,  1.1922, -2.7154],
        [-5.6636, -4.1667, -5.2209,  3.8822],
        [-7.2264,  1.9744, -6.2872, -1.0466],
        [-6.7887,  1.4313, -5.6798, -1.9605],
        [-5.4666, -3.7837, -4.6763,  3.3672],
        [-7.4392, -4.3737, -5.0995,  4.3330],
        [-4.6363, -2.9293,  2.0758, -2.5731],
        [-6.8674,  2.3704, -6.5831, -2.3541]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▋   | 192/289 [02:25<01:12,  1.33it/s]

Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1397816240787506, logits - tensor([[-5.6898, -3.3960, -4.8315,  2.9595],
        [-6.6392,  2.8039, -6.9655, -2.8287],
        [-6.5888,  3.1333, -6.8788, -1.7384],
        [-7.1786,  2.6812, -6.9506, -3.2223],
        [-6.6327,  2.0849, -6.2618, -2.6325],
        [-6.5356,  0.8067, -6.3248, -0.5352],
        [-6.2809,  0.7209, -4.1803, -1.3392],
        [-5.9909,  2.5189, -6.4738, -1.9346]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 193/289 [02:26<01:12,  1.32it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15727859735488892, logits - tensor([[-6.5557,  2.2877, -6.0600, -1.1334],
        [-6.1235, -3.2623, -4.7770,  3.3779],
        [-5.6782, -2.7105, -5.4236,  3.8868],
        [-6.8991,  1.9761, -5.5515, -2.5321],
        [-7.2134,  3.0291, -6.8957, -3.2397],
        [-6.3493,  2.0531, -6.3135, -1.5019],
        [-6.9332,  2.9449, -6.7088, -3.1212],
        [-6.8504,  2.6905, -6.4371, -1.9317]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 194/289 [02:26<01:11,  1.32it/s]

Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19824042916297913, logits - tensor([[-6.1478,  2.5241, -6.2706, -1.4463],
        [-6.3014,  1.4510, -5.9881, -0.1559],
        [-7.5001,  1.3915, -6.3798, -1.9944],
        [-6.8214,  2.1643, -6.7186, -2.4833],
        [-7.3411,  1.9251, -6.7877, -1.7021],
        [-6.1468,  1.8403, -6.5514, -1.9861],
        [-7.2007,  2.9215, -6.3925, -3.0074],
        [-5.7288,  1.9845, -5.5455, -1.6985]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 195/289 [02:27<01:10,  1.33it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12101301550865173, logits - tensor([[-6.5679,  2.5912, -7.0297, -2.1566],
        [-6.4889,  3.4088, -6.7805, -2.6598],
        [-6.0537,  2.9645, -6.0677, -1.7736],
        [-6.0650,  2.5405, -5.7024, -1.8271],
        [-5.8081,  2.5056, -5.9499, -1.9891],
        [-5.8199,  0.8336, -5.9577, -0.7260],
        [-7.2787,  2.1943, -6.5200, -1.3197],
        [-6.7142, -3.6947,  1.5836, -2.9081]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:28<01:10,  1.33it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3842257857322693, logits - tensor([[-5.8426,  2.8311, -6.6188, -1.5914],
        [-7.1906,  1.5389, -6.6124, -2.3080],
        [-5.5376, -3.0591,  1.8824, -1.7218],
        [-7.0887,  0.0656, -6.7513, -0.0607],
        [-6.8464,  2.7681, -5.8363, -1.9046],
        [-6.6687,  2.8011, -6.5924, -2.2038],
        [-5.5877, -3.1574,  2.5383, -2.4960],
        [-6.0608,  2.4211, -6.0162, -2.0763]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 197/289 [02:29<01:09,  1.32it/s]

Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1710408478975296, logits - tensor([[-6.4454,  2.1123, -5.5871, -1.6112],
        [-5.4518,  2.9815, -6.1591, -2.8530],
        [-5.4049,  3.2528, -5.6327, -2.8269],
        [-5.7255, -2.8910,  2.0324, -2.1544],
        [-7.0074,  2.4505, -7.2103, -2.4709],
        [-5.6371, -3.5483,  1.5949, -2.2892],
        [-7.4057,  2.0464, -6.6557, -2.2038],
        [-6.3930, -3.3496, -4.1210,  3.7586]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▊   | 198/289 [02:29<01:08,  1.33it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08488556742668152, logits - tensor([[-6.6027, -2.9548, -5.2062,  3.5860],
        [-7.1581,  1.7375, -6.0732, -1.9494],
        [-6.2678, -3.5012,  1.7555, -2.2596],
        [-6.8436,  1.3075, -6.5397, -1.0044],
        [-5.8154, -2.5487, -4.0996,  2.7460],
        [-7.7132,  1.8970, -6.9198, -2.6153],
        [-7.1086,  1.5842, -6.6746, -0.9945],
        [-5.2154, -2.4509,  0.7941, -1.7812]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:30<01:07,  1.33it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 69%|██████▉   | 200/289 [02:31<01:07,  1.32it/s]

loss - 0.2989107072353363, logits - tensor([[-5.8505e+00, -3.6354e+00, -4.5566e+00,  3.4394e+00],
        [-6.2521e+00,  2.8448e+00, -5.6905e+00, -3.0764e+00],
        [-6.6439e+00,  2.5247e+00, -6.8699e+00, -2.7402e+00],
        [-7.3447e+00,  6.3891e-04, -6.4186e+00,  1.0634e+00],
        [-6.7222e+00,  2.5362e+00, -6.6725e+00, -2.3978e+00],
        [-5.6914e+00, -3.0147e+00,  1.9214e+00, -2.4184e+00],
        [-7.2217e+00, -4.9905e-01, -6.2875e+00,  1.1403e-01],
        [-6.8390e+00,  7.8403e-01, -6.0527e+00, -9.6391e-01]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36931413412094116, logits - tensor([[-6.6065,  1.4339, -6.1759, -1.3489],
        [-7.2900,  2.2762, -6.6011, -2.1177],
        [-7.5468,  1.3013, -6.5831, -1.1634],
        [-6.0354, -3.6529,  1.2844, -1.5210],
        [-5.1601, -3.0386,  0.6371, -1.3684],
   

 70%|██████▉   | 201/289 [02:32<01:07,  1.31it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2766210436820984, logits - tensor([[-5.8848,  1.4710, -5.6001, -2.8150],
        [-6.6691,  0.8123, -5.4408, -1.4705],
        [-4.5880, -2.8718,  2.1269, -1.6110],
        [-7.8683, -2.0386, -6.8970,  1.8304],
        [-7.4695,  2.8921, -7.5884, -2.8153],
        [-8.2171,  0.9923, -6.5781, -0.2561],
        [-7.5451,  1.4068, -6.4974, -0.8749],
        [-6.7199,  2.1603, -7.0201, -2.1669]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:32<01:06,  1.32it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12328226864337921, logits - tensor([[-7.0779,  2.5271, -6.0959, -2.2396],
        [-7.0806,  1.7561, -7.3098, -1.8955],
        [-5.2550, -3.4646,  1.6329, -2.0118],
        [-6.3848, -3.9982, -5.9042,  3.9785],
        [-4.9506, -3.1813,  1.6950, -2.5688],
        [-7.7747,  0.3423, -6.3139,  0.4834],
        [-5.1747,  1.1811, -6.1004, -1.8885],
        [-7.0043,  0.5520, -6.2724, -0.6641]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:33<01:05,  1.31it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19726523756980896, logits - tensor([[-6.6495, -4.1746, -6.0713,  5.2499],
        [-5.7451, -3.2978,  2.2997, -2.3380],
        [-8.3488,  1.3132, -7.2417, -1.3463],
        [-5.8692, -3.5049, -6.1755,  3.1253],
        [-7.4107,  1.4069, -6.6575, -0.1505],
        [-5.6870, -3.2211,  2.7839, -2.4455],
        [-7.1500,  0.7337, -5.9531, -1.0729],
        [-5.9208,  1.3582, -5.4906, -2.0853]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:34<01:04,  1.32it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3659333288669586, logits - tensor([[-8.3427, -1.1711, -5.6754,  1.7596],
        [-7.0123, -2.5139, -5.0107,  2.5645],
        [-6.5728,  2.0589, -5.7450, -1.4252],
        [-6.9218,  1.5081, -6.9888, -0.6063],
        [-7.1036,  1.9612, -7.8693, -2.2937],
        [-6.3487, -2.9930,  1.9489, -1.9981],
        [-7.1518,  0.0840, -5.8197, -0.3423],
        [-6.6844,  0.9709, -5.8287, -1.3749]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:35<01:03,  1.32it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08094078302383423, logits - tensor([[-5.8201, -3.3094, -5.7815,  2.2323],
        [-6.5365,  1.2720, -5.9514, -0.9469],
        [-5.6029, -3.2924,  2.0623, -1.8347],
        [-7.3454,  2.3514, -6.8099, -1.8377],
        [-6.5466, -1.6772, -6.6260,  2.6008],
        [-7.7678, -2.2647, -5.8130,  3.2043],
        [-4.6308, -2.0840,  0.7711, -1.5837],
        [-6.3883,  2.4604, -6.4934, -1.8048]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:36<01:02,  1.32it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09409020841121674, logits - tensor([[-6.7233,  4.0073, -6.6685, -2.2695],
        [-7.1612,  1.2139, -6.2507, -1.3076],
        [-7.6125,  2.5890, -7.3435, -2.1925],
        [-7.3901,  3.1920, -7.2545, -3.2059],
        [-6.0308,  1.4241, -5.3808, -2.2008],
        [-7.4732,  1.9678, -6.3535, -2.1054],
        [-5.6249, -0.2639, -4.8031, -0.2957],
        [-6.1128,  2.4545, -7.0489, -2.6827]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:36<01:02,  1.32it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06817497313022614, logits - tensor([[-6.6026,  0.6703, -5.9127, -1.4085],
        [-5.4036, -2.8429, -5.4853,  3.3674],
        [-6.1356, -3.4755, -5.3400,  3.3900],
        [-8.1238,  2.2611, -6.9160, -2.0841],
        [-7.3339,  1.8293, -6.5019, -1.7290],
        [-6.4828, -3.8085,  2.0424, -2.4373],
        [-6.4249, -1.6129, -3.8451,  0.7932],
        [-5.9742, -4.2593, -5.4665,  4.1201]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:37<01:01,  1.33it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04550909250974655, logits - tensor([[-6.6828, -2.2549, -6.3691,  3.0273],
        [-8.3001,  2.4478, -6.6417, -1.6517],
        [-5.9972, -3.3786, -5.2729,  3.5755],
        [-6.4129, -3.0305, -6.2305,  3.2369],
        [-6.6155, -4.0122, -5.8785,  4.2368],
        [-5.3485, -3.9777, -4.8429,  2.5951],
        [-6.4761,  1.2414, -5.5823, -1.7598],
        [-5.3288, -2.9000,  1.6648, -2.2612]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:38<01:00,  1.33it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12684357166290283, logits - tensor([[-7.4169,  2.3439, -6.4938, -3.1733],
        [-5.6287, -3.7572,  1.6373, -2.4876],
        [-7.3172, -0.2672, -6.7315, -0.2206],
        [-6.2279, -3.7047,  1.4825, -0.9711],
        [-6.2376, -2.2991,  0.5023, -1.6235],
        [-6.5855, -3.9875, -5.1772,  3.1236],
        [-6.2168,  1.2926, -5.8216, -1.8145],
        [-5.8314, -2.8237,  1.1392, -1.8059]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 210/289 [02:39<00:59,  1.33it/s]

Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.32954737544059753, logits - tensor([[-5.8990,  1.6408, -6.2086, -1.9368],
        [-4.3631, -3.1880,  2.0732, -2.1724],
        [-7.5632,  1.5505, -6.5117, -1.4884],
        [-6.3082, -2.3008,  0.8088, -1.8949],
        [-7.3814,  1.4678, -6.2038, -2.3002],
        [-6.6935, -3.0656, -4.7968,  3.7034],
        [-7.2220,  0.5036, -6.3073, -0.5296],
        [-7.0867,  0.7048, -7.5991, -0.5631]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 211/289 [02:39<00:58,  1.32it/s]

Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09291607141494751, logits - tensor([[-6.4204e+00,  2.6634e+00, -5.9169e+00, -2.1365e+00],
        [-5.8152e+00,  1.3170e+00, -5.5985e+00, -1.7149e+00],
        [-7.1714e+00,  2.1991e+00, -5.8748e+00, -2.2196e+00],
        [-7.0655e+00, -3.5734e-01, -5.3199e+00,  4.8619e-03],
        [-7.7702e+00,  7.9849e-01, -6.8223e+00, -1.0870e+00],
        [-6.0699e+00, -3.4898e+00, -5.6372e+00,  4.2855e+00],
        [-4.9306e+00, -3.7297e+00, -5.6127e+00,  3.0704e+00],
        [-6.7157e+00,  2.4599e+00, -6.5769e+00, -2.7176e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 212/289 [02:40<00:58,  1.33it/s]

Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05294771492481232, logits - tensor([[-6.8145,  2.2105, -6.6181, -1.8892],
        [-7.3856,  2.6625, -6.7583, -2.4156],
        [-8.0896, -2.7164, -6.8369,  2.5681],
        [-5.8637, -3.7315,  1.8148, -2.7507],
        [-6.7289,  1.8394, -6.7925, -2.2807],
        [-6.0027,  2.7481, -6.2281, -2.4949],
        [-6.2363,  2.0204, -5.9477, -2.4484],
        [-5.5813,  2.2925, -6.1096, -1.4720]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▎  | 213/289 [02:41<00:57,  1.33it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06010633334517479, logits - tensor([[-7.2897,  1.8437, -7.1643, -2.9341],
        [-6.0937,  3.1821, -5.9012, -2.9812],
        [-6.5372,  1.1360, -5.6289, -1.6872],
        [-6.9876, -3.1328, -5.7307,  3.0768],
        [-5.3693, -2.9528,  1.0006, -1.4582],
        [-7.6415, -3.4670, -5.1390,  3.9933],
        [-5.9778,  2.7722, -6.2485, -2.4956],
        [-6.8813,  1.7370, -6.5052, -1.8893]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 214/289 [02:42<00:56,  1.33it/s]

Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3377860188484192, logits - tensor([[-6.6547,  1.8773, -6.4099, -1.2636],
        [-5.4810, -3.1910,  1.0987, -1.3450],
        [-6.9852,  1.8085, -6.7335, -1.5701],
        [-6.5945,  0.7419, -6.0959, -1.4700],
        [-6.4844,  0.6703, -5.8113, -0.6352],
        [-6.3103, -1.2708, -5.4863,  1.1189],
        [-6.7231,  2.3373, -7.0566, -2.2574],
        [-5.1341, -2.9044,  0.8699, -2.0592]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 215/289 [02:42<00:55,  1.32it/s]

Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08655378222465515, logits - tensor([[-6.5019,  2.6782, -6.0623, -2.8922],
        [-6.0805, -4.7793, -6.6068,  4.0581],
        [-5.6070, -3.4171,  1.7580, -2.4171],
        [-6.9521, -3.4525, -0.2268, -1.5521],
        [-7.2451,  2.2488, -7.8772, -2.1913],
        [-6.4984, -0.4771, -5.5217,  1.4943],
        [-7.0737,  2.2640, -6.3606, -2.1008],
        [-6.7543,  3.1070, -6.5211, -1.9157]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▍  | 216/289 [02:43<00:55,  1.32it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2577826976776123, logits - tensor([[-8.3525,  1.8418, -6.9228, -1.4308],
        [-6.2394,  0.7181, -6.5766, -1.7691],
        [-5.9676,  1.2569, -5.7786, -1.6365],
        [-6.0287,  2.6945, -6.3930, -3.0313],
        [-6.6428,  2.3682, -6.8518, -2.5647],
        [-6.7901, -3.5794,  1.7092, -2.2724],
        [-7.3084, -1.3661, -6.7108,  0.9994],
        [-6.4028,  1.0156, -6.1386, -1.4841]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 217/289 [02:44<00:54,  1.32it/s]

Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4295673072338104, logits - tensor([[-7.0711,  2.2559, -6.8450, -2.8645],
        [-6.7906,  2.7001, -7.9042, -2.8625],
        [-4.9721, -3.5876,  2.1635, -2.6786],
        [-5.3257, -2.9162,  2.4672, -2.4182],
        [-7.4409,  1.3398, -5.5164, -1.3307],
        [-7.0864,  2.8854, -6.5178, -3.0778],
        [-6.4549,  1.8597, -6.1758, -1.7318],
        [-7.9440,  1.3648, -6.8165, -1.1536]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 218/289 [02:45<00:53,  1.32it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12072129547595978, logits - tensor([[-6.4119,  2.8819, -6.7510, -2.6762],
        [-6.2757, -3.2242, -5.1817,  3.0966],
        [-5.3442,  0.5613, -5.1407, -0.7861],
        [-7.6369,  0.9474, -6.3315, -0.8893],
        [-6.7981,  2.0913, -5.8336, -2.4598],
        [-6.0079, -3.2855,  1.7802, -3.0604],
        [-5.2693, -3.5599,  2.4283, -3.1045],
        [-5.8590, -3.6375,  2.4707, -2.5574]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:45<00:52,  1.32it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.132040336728096, logits - tensor([[-6.2683,  2.7777, -6.8825, -2.8719],
        [-7.2334,  2.2618, -7.5373, -1.5352],
        [-6.8392, -1.5219, -6.6560,  2.6862],
        [-7.0960,  2.7407, -7.1344, -1.8944],
        [-8.3332, -2.5221, -5.9683,  2.3513],
        [-6.0546,  0.6656, -6.2270, -1.2279],
        [-6.5222,  0.9192, -6.9279, -0.1814],
        [-8.0521,  1.6483, -6.3548, -1.1654]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:46<00:52,  1.33it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.147325336933136, logits - tensor([[-6.1750, -2.0804,  0.3783, -1.6668],
        [-6.6272,  1.5888, -6.3895, -1.6603],
        [-6.5730,  2.0521, -6.7557, -1.9441],
        [-5.6949, -3.8668,  2.7614, -2.6733],
        [-6.9955, -2.2581, -4.2638,  1.1011],
        [-7.2101, -1.9873, -5.0439,  1.3745],
        [-6.5825,  2.6649, -6.8199, -2.9573],
        [-6.4729,  2.2162, -5.9649, -2.3374]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:47<00:51,  1.32it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21945694088935852, logits - tensor([[-5.2639, -2.9508,  2.1312, -1.8048],
        [-6.6934,  2.8671, -7.6098, -2.7774],
        [-6.0545,  2.6234, -6.5637, -2.7564],
        [-7.0020,  2.4158, -6.6772, -2.9522],
        [-7.8278,  2.2129, -6.7254, -1.7048],
        [-4.8044, -3.5721,  1.7294, -2.0626],
        [-7.2068,  1.8575, -6.9421, -2.2891],
        [-5.5579, -3.3153, -5.2032,  2.9439]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:48<00:50,  1.32it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19364002346992493, logits - tensor([[-6.5828,  2.6862, -6.3021, -1.3550],
        [-6.9526,  1.9110, -7.2352, -2.6149],
        [-6.4719,  0.1976, -5.6273,  0.0649],
        [-4.8239, -2.4050,  1.2221, -1.8361],
        [-7.4883,  2.0468, -6.8016, -0.8904],
        [-5.9304, -3.1469, -4.6771,  3.5811],
        [-5.7737,  2.3575, -7.2315, -3.0281],
        [-7.0772, -3.6677,  3.1004, -3.6190]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:48<00:50,  1.32it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0838095024228096, logits - tensor([[-7.5453,  2.1664, -7.1455, -2.1258],
        [-5.9545,  2.2722, -5.9895, -1.8422],
        [-6.5610,  1.4508, -6.6893, -2.1721],
        [-8.0485,  1.4381, -6.8058, -0.3431],
        [-6.2655, -2.8334,  1.3171, -2.2263],
        [-5.5183, -3.0980,  1.7737, -1.9827],
        [-6.5828, -2.7711, -4.4391,  3.9228],
        [-6.7195,  2.1310, -6.7041, -1.5277]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:49<00:49,  1.32it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2497413158416748, logits - tensor([[-7.1638,  3.2938, -7.0732, -2.5071],
        [-4.9596, -2.9863,  2.5129, -2.2802],
        [-6.5703,  2.2760, -6.4355, -2.5560],
        [-8.3334, -0.5725, -6.9027,  0.9144],
        [-7.1082,  2.1226, -7.2711, -2.0396],
        [-6.0166, -3.1243, -6.5886,  2.9499],
        [-7.1980,  0.1592, -6.0849, -0.7966],
        [-4.9210, -3.3636,  2.6798, -1.9714]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:50<00:49,  1.31it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22897514700889587, logits - tensor([[-7.0697,  1.8510, -6.5316, -1.5026],
        [-6.8017, -0.1860, -5.7109, -0.2890],
        [-8.4001,  0.9883, -7.5995, -1.7187],
        [-7.2310,  0.8962, -6.7848, -1.2930],
        [-7.2554,  0.9797, -6.6375, -1.9674],
        [-6.8119, -3.1380, -5.2984,  3.1451],
        [-6.7542,  2.1555, -6.3875, -2.5732],
        [-6.6939,  2.4682, -6.5527, -1.1034]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:51<00:48,  1.31it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3399614095687866, logits - tensor([[-6.8081,  1.8078, -5.9106, -1.1172],
        [-7.2110,  2.5898, -7.0639, -2.3048],
        [-6.1082,  0.9624, -6.7582, -1.8219],
        [-7.3586,  2.5699, -7.2855, -2.9165],
        [-7.5046,  1.5850, -8.1444, -1.4514],
        [-5.6951,  2.0841, -6.9275, -2.4919],
        [-6.9451,  1.4293, -6.5734, -2.0058],
        [-6.0243,  2.7604, -6.8600, -2.9216]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:51<00:47,  1.32it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3774569034576416, logits - tensor([[-6.6444,  1.8605, -6.5995, -2.3128],
        [-7.2304,  2.8555, -6.7226, -2.5233],
        [-5.0547, -2.9603,  1.5963, -1.2847],
        [-6.6891,  0.0867, -5.5452,  0.5363],
        [-6.5834, -3.3323, -4.7870,  2.1549],
        [-6.6949,  1.5266, -6.4363, -1.9723],
        [-5.2852, -1.7365, -0.1488, -0.9400],
        [-7.7314, -3.2947, -6.2424,  2.8337]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:52<00:46,  1.32it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04231739789247513, logits - tensor([[-7.9884,  1.9262, -7.0937, -2.5065],
        [-6.2872,  2.8006, -6.4506, -3.4021],
        [-7.4237, -2.7043, -5.7465,  3.9489],
        [-6.8171, -4.5669, -5.6155,  4.4098],
        [-6.6541, -3.4444,  2.7771, -2.1440],
        [-8.1499, -1.4299, -5.9008,  2.0462],
        [-6.6279,  2.5811, -7.2616, -2.7688],
        [-6.2215,  1.8515, -6.4456, -2.2486]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:53<00:45,  1.32it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12986654043197632, logits - tensor([[-5.7127, -3.8302,  1.8419, -2.5909],
        [-7.3258,  2.0177, -5.9413, -2.6418],
        [-7.8692,  0.9482, -6.3376, -1.0392],
        [-6.8229,  1.4204, -6.8242, -1.5970],
        [-6.1164,  2.5962, -6.9179, -2.7841],
        [-8.3176,  0.2082, -7.0121,  0.6039],
        [-7.0827,  0.8412, -5.8677, -1.0568],
        [-7.6696,  2.5923, -7.3013, -1.6562]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:54<00:44,  1.32it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24579858779907227, logits - tensor([[-5.4726, -3.6683, -5.8124,  4.1805],
        [-6.8953,  2.0908, -6.7601, -1.3100],
        [-5.3044, -3.0211,  1.9599, -2.3746],
        [-6.3415,  0.2140, -5.9918,  0.5256],
        [-6.5990, -3.1085,  0.8055, -0.5218],
        [-8.0359,  1.3441, -6.7879, -0.7823],
        [-7.3278,  1.8254, -7.6411, -2.5846],
        [-6.6735,  1.5503, -6.2811, -2.3633]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:54<00:43,  1.32it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2035207450389862, logits - tensor([[-6.4527, -4.1694, -5.4033,  4.3626],
        [-5.7497, -2.6790,  0.9229, -1.9471],
        [-5.3309, -3.0393,  2.1381, -2.0293],
        [-6.1460, -2.7027, -5.0275,  2.3559],
        [-7.5288,  1.4960, -6.7006, -1.8885],
        [-6.6110,  2.0046, -6.1995, -2.5429],
        [-6.3928,  2.2764, -6.8645, -2.5806],
        [-7.1557, -1.9453, -5.0818,  2.6099]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:55<00:43,  1.32it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2180517315864563, logits - tensor([[-6.3061,  1.4907, -6.2841, -1.9877],
        [-6.3071, -2.6188, -5.6013,  2.3228],
        [-6.0905,  2.4003, -6.5521, -1.6897],
        [-7.8149,  2.3471, -6.9006, -2.0541],
        [-5.9856,  2.1801, -7.2434, -3.1199],
        [-7.4098, -0.6316, -6.0092,  0.3291],
        [-7.5272,  1.8324, -6.7239, -2.2534],
        [-6.7922, -4.6696,  2.5592, -2.2459]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:56<00:42,  1.32it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07668022811412811, logits - tensor([[-7.4291,  1.1233, -7.3607, -1.0708],
        [-7.2859, -1.3523, -6.7712,  1.2916],
        [-7.1837,  1.4501, -6.2498, -1.7168],
        [-6.8344,  3.5477, -6.9648, -3.9065],
        [-5.7844,  2.4094, -6.6482, -2.7427],
        [-6.9331,  2.6379, -7.6875, -2.6049],
        [-6.8231,  1.4669, -6.2098, -1.4677],
        [-6.0679,  1.6549, -6.0315, -2.4799]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:57<00:41,  1.33it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11624392867088318, logits - tensor([[-6.8188,  0.7643, -5.9616, -2.1271],
        [-6.4085,  1.9287, -6.7926, -2.6944],
        [-4.7994, -3.6547,  2.3059, -1.8710],
        [-6.2059,  2.2691, -5.8767, -2.1138],
        [-6.1529,  1.4615, -6.9364, -2.3744],
        [-8.8125, -0.0161, -7.5050,  0.7695],
        [-5.5947, -3.6329,  2.9458, -2.5300],
        [-6.6185,  2.0101, -5.6339, -2.3968]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:57<00:40,  1.33it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10014575719833374, logits - tensor([[-8.0539,  2.2235, -7.0453, -2.5880],
        [-5.6597, -4.0174,  2.8495, -3.2870],
        [-6.3753,  2.1188, -6.0199, -2.6105],
        [-6.0552,  2.9542, -6.6014, -2.5364],
        [-7.2981,  2.1741, -7.1762, -2.4387],
        [-6.6862,  0.8713, -6.3945, -0.2757],
        [-5.1971, -3.6560,  3.2415, -3.2601],
        [-5.6254, -3.1295,  2.4190, -2.2631]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 236/289 [02:58<00:39,  1.33it/s]

Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.155257910490036, logits - tensor([[-5.4926,  2.1597, -5.4200, -1.9792],
        [-6.1804,  2.7971, -6.0518, -2.0375],
        [-6.3402, -3.8568,  2.5365, -1.4461],
        [-7.1053, -3.4092, -8.0112,  2.8857],
        [-6.6710,  3.1226, -6.5771, -3.3382],
        [-6.6054,  2.0847, -6.5807, -2.4738],
        [-5.8329,  2.9633, -5.6276, -2.8731],
        [-6.3231,  1.9609, -6.3205, -1.4332]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 237/289 [02:59<00:39,  1.33it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22672972083091736, logits - tensor([[-8.3639, -0.9126, -6.5902,  0.1960],
        [-6.7225,  2.2565, -7.0635, -1.6201],
        [-6.1237,  2.2474, -7.2153, -1.9383],
        [-5.3092,  1.2575, -5.9658, -2.4435],
        [-6.5774,  1.4837, -6.3715, -1.7251],
        [-6.4606,  2.5011, -6.5149, -2.5045],
        [-7.4650,  1.3223, -7.8020, -1.1279],
        [-7.6343,  0.0359, -6.2505, -0.9009]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 238/289 [03:00<00:38,  1.33it/s]

Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28763407468795776, logits - tensor([[-6.4605, -2.7534, -4.9930,  3.2261],
        [-6.9931,  2.4553, -6.6896, -2.2510],
        [-6.9410,  3.0265, -6.3661, -2.9935],
        [-5.5708, -3.3819,  2.6375, -2.8699],
        [-7.3397,  1.6852, -6.0069, -1.7719],
        [-5.3434, -3.2043,  2.0729, -2.3614],
        [-7.2409, -0.5791, -6.2937,  0.3970],
        [-7.6541,  2.8671, -7.0811, -2.9225]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 239/289 [03:00<00:37,  1.33it/s]

Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05693131685256958, logits - tensor([[-6.2163,  2.6968, -6.3450, -4.1332],
        [-6.7740,  2.1209, -7.0025, -2.9232],
        [-6.8613,  2.5351, -6.5368, -2.5517],
        [-6.8606,  1.8333, -5.5854, -1.0904],
        [-5.7361,  1.2586, -5.8773, -2.2509],
        [-7.1079,  3.1304, -6.8068, -2.4324],
        [-6.8553,  2.3393, -6.9024, -1.7514],
        [-6.8325,  2.5029, -6.6682, -1.8042]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 240/289 [03:01<00:36,  1.33it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18133088946342468, logits - tensor([[-5.7100, -3.4787,  2.2553, -2.1395],
        [-5.5223,  1.9920, -5.2882, -2.3744],
        [-6.8898,  2.2382, -6.3161, -2.5724],
        [-5.4129, -3.3457,  3.1776, -2.7158],
        [-6.9062,  2.2297, -6.7154, -2.7158],
        [-7.5202,  2.2769, -6.6757, -2.6578],
        [-5.9085, -3.9919,  2.9404, -3.1977],
        [-6.9730,  2.4233, -6.9312, -2.4790]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:02<00:36,  1.32it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29935747385025024, logits - tensor([[-5.5735, -2.3874,  1.3518, -1.9651],
        [-6.5017, -3.9172,  1.6997, -2.8687],
        [-7.7555,  1.3685, -7.2784, -2.3441],
        [-5.0896,  2.2999, -5.5423, -1.8776],
        [-7.2849,  2.9172, -7.3857, -3.1185],
        [-6.2558,  1.6155, -7.1262, -2.6984],
        [-5.7008,  1.2640, -6.2225, -1.5609],
        [-7.2200,  1.8247, -6.3327, -1.3384]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:03<00:35,  1.32it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08725974708795547, logits - tensor([[-6.6939,  2.7513, -6.2703, -2.7659],
        [-6.3167,  3.3059, -6.5917, -3.4672],
        [-6.0146,  3.1006, -5.7460, -3.2339],
        [-6.7888,  2.9847, -6.2209, -3.0344],
        [-6.7700,  2.7624, -6.4605, -2.2524],
        [-7.1581, -1.4070, -6.4678,  1.9017],
        [-7.2645,  0.7505, -6.7776,  0.1984],
        [-6.5162, -2.7361, -4.3903,  2.6293]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:04<00:34,  1.32it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13889718055725098, logits - tensor([[-6.7410,  1.7506, -6.9501, -1.2068],
        [-6.0511,  1.8629, -5.9527, -2.3306],
        [-5.8113,  2.3170, -6.3489, -3.0233],
        [-5.3731, -3.5959,  2.9621, -3.3323],
        [-5.7233, -3.5710,  2.1630, -3.0236],
        [-7.0117,  3.0731, -6.2810, -2.8731],
        [-5.7757, -3.0693,  2.5857, -3.3867],
        [-6.3830,  2.8448, -6.1287, -2.9156]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:04<00:34,  1.32it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05153997242450714, logits - tensor([[-6.6492,  1.6827, -7.1310, -1.9013],
        [-6.8209, -1.7050, -6.3238,  2.9072],
        [-6.3807,  2.3547, -6.6925, -2.8445],
        [-5.6576, -3.2514, -5.2219,  3.2702],
        [-7.3452,  1.6704, -6.1396, -2.5465],
        [-7.2744,  1.9692, -6.9684, -2.4382],
        [-7.3549,  2.2327, -6.5419, -1.8989],
        [-5.7220, -3.8534,  2.9625, -2.3025]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:05<00:33,  1.32it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.509939432144165, logits - tensor([[-5.9158, -3.2311,  2.6484, -2.4018],
        [-6.6115,  2.4157, -6.3389, -1.2578],
        [-5.1444,  2.8906, -6.0242, -2.5895],
        [-6.8314, -3.5531, -5.3023,  2.6795],
        [-5.0929, -3.8989, -4.5971,  4.3072],
        [-5.8152,  1.6022, -5.9091, -1.0012],
        [-6.4408,  2.8591, -6.9990, -2.8977],
        [-7.8049,  1.7872, -7.7738, -1.9321]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:06<00:32,  1.32it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.059740111231803894, logits - tensor([[-5.5828, -4.1884, -4.3995,  3.5410],
        [-6.8768,  1.3451, -6.3445, -1.9587],
        [-7.5096, -2.7970, -6.7023,  2.2986],
        [-8.0249,  1.2031, -8.2438, -0.1159],
        [-6.6697,  2.7541, -6.1500, -3.2488],
        [-6.9275, -3.8299,  2.7603, -3.1077],
        [-6.3610, -3.7814, -5.8701,  3.7916],
        [-6.5970, -2.6325, -5.1376,  2.5897]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:07<00:31,  1.32it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21158184111118317, logits - tensor([[-7.4530,  3.0462, -7.6397, -2.8129],
        [-6.9307,  2.7631, -6.6498, -2.1860],
        [-6.0940,  2.5207, -5.6099, -2.8311],
        [-7.9847,  2.8777, -7.3175, -3.3312],
        [-5.6613, -4.2396,  3.3665, -3.4015],
        [-6.3854,  2.4825, -6.1151, -3.2414],
        [-6.4317,  2.0900, -6.6615, -2.3969],
        [-8.7896,  1.2648, -5.9729, -1.2735]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:07<00:31,  1.32it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28326672315597534, logits - tensor([[-5.6985, -2.0914,  0.5864, -2.1641],
        [-6.4839,  3.2705, -6.3322, -3.4045],
        [-7.7560, -1.4269, -6.1649,  2.2860],
        [-7.5327, -0.3270, -5.8287, -0.4934],
        [-6.6895, -1.6851, -5.8992,  2.3967],
        [-6.6078,  0.2797, -7.2133, -1.1741],
        [-8.1512, -0.4350, -6.2030,  0.0466],
        [-4.9928, -2.7835,  2.4013, -2.2145]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:08<00:30,  1.32it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08450151979923248, logits - tensor([[-6.9449,  2.2242, -7.1624, -2.2718],
        [-6.4000,  2.1324, -6.7939, -3.0807],
        [-6.5132,  2.6233, -7.0377, -1.9519],
        [-6.9659,  2.1202, -6.6720, -2.3154],
        [-6.4369,  1.8217, -7.2911, -1.4173],
        [-7.3132,  2.2481, -7.0806, -2.0579],
        [-7.5432, -0.9446, -6.0060,  1.2949],
        [-8.4264,  0.8661, -6.3558, -0.6827]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:09<00:29,  1.32it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07163120806217194, logits - tensor([[-5.6432, -3.2836,  3.0377, -3.4798],
        [-6.1660, -3.5232, -4.8210,  3.2856],
        [-7.4449,  1.0252, -6.6037, -1.2464],
        [-7.4156,  0.4991, -6.5579, -0.3019],
        [-5.5988, -3.0699,  2.8969, -3.1111],
        [-6.4793, -3.8397,  2.2657, -2.8546],
        [-6.7625,  2.6856, -6.6704, -2.9592],
        [-6.7550, -3.5560, -5.1271,  3.5105]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:10<00:28,  1.32it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15864352881908417, logits - tensor([[-6.8435,  1.7033, -6.8404, -1.4927],
        [-7.1227,  0.2921, -6.2829,  0.0210],
        [-6.1755,  2.3021, -6.5284, -2.8750],
        [-7.5254, -0.0710, -6.8386,  0.4786],
        [-6.7999,  1.5690, -6.2355, -1.3375],
        [-6.5130,  1.3367, -6.6763, -1.4463],
        [-6.9671,  0.9437, -5.9086, -1.3602],
        [-6.8137, -2.9913, -6.1070,  2.1615]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:10<00:28,  1.32it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.177194744348526, logits - tensor([[-7.7579,  2.7115, -7.1476, -2.6569],
        [-4.7105, -2.5782,  1.1457, -2.5629],
        [-7.2709,  1.1839, -6.4699, -1.0870],
        [-6.4595, -1.3885, -5.0695,  1.5882],
        [-6.7389, -4.4685,  2.9126, -3.2765],
        [-7.5455,  1.4032, -6.9530, -0.9498],
        [-6.3565,  1.9014, -6.5174, -3.3300],
        [-6.6245,  2.0133, -6.2394, -1.5396]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 253/289 [03:11<00:27,  1.32it/s]

Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1088009625673294, logits - tensor([[-5.3931, -3.3643, -4.5963,  3.7303],
        [-6.1871,  1.7395, -5.8055, -1.6378],
        [-6.9376,  0.3220, -6.7266,  0.5719],
        [-6.5797,  1.0772, -5.9740, -0.9146],
        [-6.8533,  1.6373, -6.9374, -2.0660],
        [-6.8349,  1.8271, -6.4430, -1.9951],
        [-7.4474, -2.8740, -5.8252,  2.7250],
        [-5.5624,  2.8376, -6.1608, -2.2410]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 254/289 [03:12<00:26,  1.32it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1632617563009262, logits - tensor([[-6.1471, -3.5511,  2.4170, -2.5012],
        [-6.6441,  1.4416, -5.5839, -1.8072],
        [-6.9717, -1.5038, -5.7943,  2.3718],
        [-7.6947,  2.1857, -6.8961, -2.7106],
        [-8.6600,  1.4784, -7.1426, -1.6881],
        [-5.6070, -2.5067, -4.4304,  2.3299],
        [-7.3895,  3.4407, -7.2812, -2.6014],
        [-6.5982,  1.6647, -6.8650, -2.2262]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:13<00:25,  1.33it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04221027344465256, logits - tensor([[-6.9760, -3.1743, -6.3807,  2.4578],
        [-7.5716,  2.4954, -7.7435, -2.6416],
        [-7.8173, -2.2278, -5.9146,  2.4267],
        [-6.0637,  2.6967, -5.7259, -2.8451],
        [-6.3950,  2.0008, -6.1292, -2.6218],
        [-5.1000, -2.8640,  2.1820, -2.3597],
        [-7.5842, -2.3957, -7.0465,  2.7792],
        [-6.4130,  2.6986, -6.5250, -2.4600]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:13<00:24,  1.33it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20360448956489563, logits - tensor([[-7.7322,  2.1172, -6.9908, -1.6314],
        [-7.4630,  2.2027, -6.6062, -3.0527],
        [-6.2202,  1.9224, -5.7702, -1.8712],
        [-8.2998,  2.3562, -7.3747, -2.4677],
        [-6.6996, -3.2737, -5.7033,  4.3263],
        [-7.1243,  1.3891, -7.2319, -2.0034],
        [-7.0246,  1.7060, -6.2249, -2.1815],
        [-5.9963, -4.0339,  2.9241, -3.4026]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:14<00:24,  1.32it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.041727278381586075, logits - tensor([[-7.2194, -4.2022, -5.4542,  2.9652],
        [-6.4185, -3.3872,  2.4140, -2.2731],
        [-6.7288,  2.4103, -6.3392, -2.3879],
        [-7.0033,  2.3217, -6.2706, -2.1688],
        [-5.6304, -3.8613,  2.5553, -3.1174],
        [-7.3561,  2.0597, -6.9362, -2.1505],
        [-5.5702,  2.5130, -6.2820, -2.1511],
        [-6.9148, -4.2093,  2.9596, -3.4600]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 258/289 [03:15<00:23,  1.33it/s]

Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23453107476234436, logits - tensor([[-6.9543,  2.6943, -6.2558, -1.8008],
        [-4.8056,  2.4015, -5.4552, -2.6737],
        [-7.6198,  1.3080, -6.4589, -0.5647],
        [-8.8734,  2.8353, -7.7718, -3.1877],
        [-5.1988,  1.4701, -5.9816, -2.1879],
        [-6.4773, -1.4385, -5.0741,  2.0688],
        [-6.3339, -4.3222, -5.0741,  4.1137],
        [-6.5770,  3.4101, -7.1088, -2.7052]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 259/289 [03:16<00:22,  1.33it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05297191068530083, logits - tensor([[-7.7626,  1.9857, -7.2601, -2.2431],
        [-7.5649,  3.1805, -6.6931, -2.1163],
        [-6.9804,  2.6418, -6.8475, -2.8149],
        [-6.7036,  1.9447, -6.2680, -1.2090],
        [-5.8028, -3.7367,  1.6931, -3.2474],
        [-6.2495,  2.5698, -6.3411, -2.1605],
        [-5.1733,  2.4833, -6.0846, -2.8887],
        [-5.7305, -3.4841,  2.3771, -2.2878]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 260/289 [03:16<00:21,  1.33it/s]

Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04970991984009743, logits - tensor([[-5.4195, -3.7456,  3.2304, -2.8296],
        [-7.9862, -3.2028, -6.4904,  2.4505],
        [-6.2741,  2.3732, -6.8270, -2.1760],
        [-6.1114,  1.7572, -6.3539, -2.0287],
        [-5.1212, -3.4028,  2.1868, -2.4574],
        [-6.5173,  2.3218, -7.5888, -1.9454],
        [-6.5158,  2.1492, -6.0705, -2.8659],
        [-5.2302, -3.8394,  2.4165, -2.0757]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|█████████ | 261/289 [03:17<00:21,  1.32it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03909682482481003, logits - tensor([[-5.3940, -3.7883,  3.1023, -3.1396],
        [-5.0045,  2.6628, -5.6665, -2.4511],
        [-5.6734,  2.4117, -6.0155, -2.2888],
        [-7.5924,  1.8371, -6.4514, -1.4294],
        [-6.9116, -3.3744, -5.7204,  2.9872],
        [-6.4215, -4.2521, -5.7413,  3.1849],
        [-4.9135, -3.6838,  3.8120, -3.1300],
        [-6.5954,  2.9871, -6.4904, -1.9811]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:18<00:20,  1.32it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20426253974437714, logits - tensor([[-6.3203,  2.6432, -7.3779, -2.4942],
        [-5.7273,  2.4833, -6.0724, -2.2035],
        [-5.6740, -3.6991,  3.2379, -3.7280],
        [-6.2138, -2.1823, -6.0363,  2.6265],
        [-6.4066,  2.4530, -5.9400, -2.9314],
        [-7.1125,  3.0696, -7.9148, -2.2859],
        [-6.8795,  2.0559, -6.7968, -1.9141],
        [-6.2724,  1.8876, -6.3746, -2.2527]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:19<00:19,  1.32it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 91%|█████████▏| 264/289 [03:19<00:18,  1.33it/s]

loss - 0.14846652746200562, logits - tensor([[-7.9610, -2.9808, -6.0229,  3.6161],
        [-7.0077, -0.3904, -5.4600,  0.4037],
        [-5.9751, -3.9196,  3.2141, -3.3647],
        [-6.7505,  1.3600, -6.9770, -1.2730],
        [-5.8218, -3.2798,  2.8867, -2.5023],
        [-5.8742, -3.9037, -5.4981,  3.5814],
        [-4.8405, -4.0801,  2.6289, -3.3440],
        [-6.0601, -3.3663, -5.6919,  3.2859]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19498607516288757, logits - tensor([[-7.4459,  2.5092, -7.5960, -3.5833],
        [-5.4550,  2.9017, -5.2699, -1.7944],
        [-7.1809,  1.8625, -6.0987, -3.0348],
        [-5.5226, -3.0577,  2.2232, -1.5350],
        [-7.5587,  3.0024, -6.6065, -2.0500],
        [-5.4363, -4.3014, -5.1323,  4.2681],
        [-6.0633,  2.7416, -6.3660, -3.1463],
        [-5.3120, -3.1278,  3.0501, -2

 92%|█████████▏| 265/289 [03:20<00:18,  1.33it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38965266942977905, logits - tensor([[-6.8269,  1.7758, -6.0768, -2.1695],
        [-6.0212, -4.9880,  3.0170, -3.3132],
        [-6.2688,  3.3252, -6.2033, -2.4164],
        [-5.8769,  3.0384, -6.5305, -2.8359],
        [-6.4783, -0.6838, -5.9227,  1.1113],
        [-5.9778,  1.3049, -5.4141, -1.2391],
        [-6.8288,  2.4933, -6.5099, -1.9190],
        [-8.0068, -1.1299, -6.8545,  1.2603]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 266/289 [03:21<00:17,  1.33it/s]

Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11986597627401352, logits - tensor([[-7.4073, -0.4698, -6.4196,  0.6215],
        [-6.5542,  0.5771, -6.6248, -0.4555],
        [-6.1101, -3.9249, -4.7974,  4.3985],
        [-7.3234,  3.2160, -7.1732, -3.1415],
        [-6.1586,  1.6058, -5.7184, -2.5153],
        [-7.2401, -1.9168, -6.3420,  2.4798],
        [-6.3840,  3.1019, -6.0885, -2.5256],
        [-6.9721,  2.5834, -6.2214, -2.5517]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 267/289 [03:22<00:16,  1.33it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0592028833925724, logits - tensor([[-6.3829,  3.3007, -6.5438, -2.7680],
        [-7.5017, -1.2666, -4.4656,  0.9427],
        [-6.4333,  2.0097, -6.7387, -2.4846],
        [-6.9709,  2.4867, -6.4231, -2.5466],
        [-6.6197, -2.3558, -5.3910,  1.4837],
        [-5.3941,  2.1766, -6.0069, -2.4766],
        [-5.3833,  2.3494, -6.4551, -2.6302],
        [-7.1704,  2.9763, -7.0949, -2.0224]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:22<00:15,  1.33it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3801976442337036, logits - tensor([[-5.9230,  3.0120, -6.7379, -3.3251],
        [-7.4715,  1.5218, -5.9809, -1.1529],
        [-7.3037,  1.9953, -6.3498, -1.9876],
        [-6.1621, -1.9725, -6.8190,  2.6117],
        [-6.0794, -4.1638,  3.6269, -3.8855],
        [-7.4022,  1.9184, -6.5168, -2.0160],
        [-4.9351, -3.6267,  2.8915, -3.9427],
        [-6.6764, -4.6346,  2.6379, -3.3501]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:23<00:15,  1.32it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07568389177322388, logits - tensor([[-5.0463,  3.0566, -6.2533, -2.6950],
        [-7.4712,  1.0939, -6.7796, -1.5657],
        [-7.0284,  2.7176, -7.4031, -3.5653],
        [-5.5243, -3.3513,  3.0364, -3.6431],
        [-7.3722,  1.8892, -6.7829, -1.8434],
        [-7.5290, -0.2905, -5.8502,  0.8264],
        [-6.3156,  1.9232, -5.5058, -2.2131],
        [-6.7339,  3.2937, -6.5194, -2.0775]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 270/289 [03:24<00:14,  1.32it/s]

Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07844221591949463, logits - tensor([[-5.9817, -1.5789, -3.5777, -0.5063],
        [-6.8260,  1.4245, -6.8884, -1.1019],
        [-6.0508,  2.4731, -5.9749, -2.6284],
        [-5.0096, -4.0396,  3.7523, -3.5728],
        [-6.7334,  3.6379, -7.1579, -2.3069],
        [-6.8694,  2.3938, -6.1334, -2.1828],
        [-7.2594,  2.2980, -7.4183, -2.9933],
        [-7.0750,  3.2437, -7.1211, -2.5751]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 271/289 [03:25<00:13,  1.33it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05259270220994949, logits - tensor([[-5.3094, -3.0829, -5.3332,  3.5113],
        [-6.8560,  3.0998, -6.8939, -2.4582],
        [-5.6689,  2.2125, -5.7710, -2.1988],
        [-6.8315,  2.6617, -6.8111, -2.0048],
        [-6.6375,  2.5118, -6.7610, -2.3602],
        [-6.0111,  2.3531, -6.3136, -3.0871],
        [-6.4253, -1.9686, -6.4741,  2.8997],
        [-6.9317, -1.6125, -6.7815,  0.7800]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 272/289 [03:25<00:12,  1.33it/s]

Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19480794668197632, logits - tensor([[-7.6820,  2.8968, -6.5132, -3.2052],
        [-7.8354,  2.6518, -7.1817, -2.3428],
        [-6.5813, -2.6509, -6.4689,  1.3843],
        [-5.0877, -3.6855,  2.9529, -3.5305],
        [-7.1518,  2.3505, -6.9674, -3.0995],
        [-7.5971, -1.9485, -5.6184,  3.0924],
        [-5.8409, -3.6495,  3.4136, -3.2028],
        [-6.5289,  2.4028, -6.3225, -2.6504]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 273/289 [03:26<00:12,  1.33it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11571402847766876, logits - tensor([[-7.3949,  0.0980, -6.0468, -0.3347],
        [-5.8539, -4.2433, -5.6038,  3.9924],
        [-6.5665,  2.6462, -6.8269, -2.5110],
        [-6.3173,  2.2269, -6.4148, -2.3547],
        [-7.2157,  0.7775, -6.7239, -0.1525],
        [-5.9766,  2.3302, -5.9723, -2.9795],
        [-5.3568, -4.2251, -5.8523,  4.2936],
        [-6.9739,  1.4797, -6.8676, -1.0225]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▍| 274/289 [03:27<00:11,  1.33it/s]

Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21801504492759705, logits - tensor([[-5.4816, -3.4806,  2.9426, -3.2897],
        [-6.3987,  2.4382, -6.6446, -2.6911],
        [-6.5145, -4.2248,  3.5384, -3.6215],
        [-6.2218, -2.3282, -5.0773,  2.4123],
        [-6.2488,  2.2486, -5.9898, -2.0723],
        [-6.4049, -3.6411,  1.6259, -1.5567],
        [-6.8359,  2.8949, -6.5002, -2.6998],
        [-7.5180,  2.8147, -7.5736, -3.0457]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▌| 275/289 [03:28<00:10,  1.32it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4437185823917389, logits - tensor([[-4.8666, -2.7252,  2.3078, -2.4174],
        [-6.4424,  2.1299, -6.5703, -1.8457],
        [-5.8807, -3.7375,  3.6143, -3.1910],
        [-6.7491,  2.8859, -7.0511, -2.6472],
        [-6.4243,  1.5715, -5.3122, -2.4554],
        [-5.8233, -4.8763,  4.3430, -3.8263],
        [-7.6062, -3.2015, -7.4604,  3.2657],
        [-7.3222,  2.0398, -7.3545, -3.0011]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:28<00:09,  1.33it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15288056433200836, logits - tensor([[-5.5203, -3.2749, -4.7795,  3.3925],
        [-5.6774, -3.5908,  1.4291, -1.6255],
        [-7.1584,  0.6175, -5.2216, -1.5552],
        [-5.1236, -3.5733,  2.8762, -2.8650],
        [-6.6584, -3.0737, -5.7846,  2.9498],
        [-6.5145,  2.5070, -6.8045, -1.7528],
        [-7.1486,  2.4099, -7.1841, -3.2026],
        [-7.5251,  3.0980, -7.0405, -3.1299]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:29<00:09,  1.33it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03801707923412323, logits - tensor([[-7.2101, -4.2517, -6.1705,  4.7173],
        [-5.8704, -3.4340, -6.3873,  3.7157],
        [-8.2394,  2.8694, -6.7227, -2.5025],
        [-6.7422,  2.5934, -6.9840, -3.3779],
        [-6.5698, -3.3824, -5.4329,  3.7036],
        [-6.7759,  2.3180, -6.2171, -1.6036],
        [-7.1175,  2.9355, -6.2732, -2.7802],
        [-6.2467,  1.1496, -5.9460, -1.8277]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:30<00:08,  1.33it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2366102784872055, logits - tensor([[-6.5419,  2.1618, -6.2601, -1.8082],
        [-5.9247, -4.8123,  2.7291, -3.0413],
        [-6.7784,  2.5377, -7.1585, -3.8466],
        [-6.9528, -3.3058, -5.9189,  2.5934],
        [-7.5935, -2.4625, -6.4599,  2.9261],
        [-6.6221,  1.6806, -6.2226, -2.1329],
        [-6.5808,  3.0614, -5.8998, -2.7478],
        [-6.1904, -4.0570, -6.0741,  3.3084]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 279/289 [03:31<00:07,  1.33it/s]

Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20596396923065186, logits - tensor([[-7.1839,  2.3343, -6.5266, -2.6989],
        [-5.0211, -2.6815,  2.4678, -2.4901],
        [-7.3996, -2.5625, -6.1464,  1.9582],
        [-6.9868, -2.1755, -5.4903,  3.0271],
        [-6.6475,  2.5904, -6.1096, -2.0759],
        [-5.5337, -3.6401,  3.2052, -3.8008],
        [-6.9297,  1.6901, -6.1846, -2.0070],
        [-6.0717, -4.5038,  3.4466, -2.9679]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 280/289 [03:31<00:06,  1.33it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25242871046066284, logits - tensor([[-7.3687,  2.8543, -6.8963, -2.4845],
        [-5.1783, -2.8975,  2.4555, -2.7894],
        [-6.2061, -1.7585, -5.5285,  1.6507],
        [-7.0628,  2.2864, -7.3980, -2.7998],
        [-5.0960, -3.8304,  2.8611, -2.8780],
        [-7.2364,  0.1032, -5.9777, -0.0424],
        [-7.5667,  1.9288, -7.7995, -0.9053],
        [-7.9675,  1.8454, -7.1416, -2.5612]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 281/289 [03:32<00:06,  1.33it/s]

Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33018073439598083, logits - tensor([[-7.4618,  2.8458, -7.1037, -2.6970],
        [-5.7934, -3.1596,  2.4314, -2.8953],
        [-6.7579,  2.7249, -6.2609, -2.7966],
        [-7.0378,  2.4470, -6.7515, -2.6643],
        [-6.1621,  1.5587, -6.3852, -1.4572],
        [-7.6021, -0.4223, -6.1561,  0.5088],
        [-6.3973,  2.7081, -7.3858, -2.9473],
        [-7.2824,  2.9644, -5.9257, -2.4105]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 282/289 [03:33<00:05,  1.33it/s]

Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18858852982521057, logits - tensor([[-6.2043, -3.7140, -3.8681,  4.0034],
        [-7.2383, -1.9703, -5.4393,  2.3191],
        [-6.0760,  2.1925, -6.8597, -1.6844],
        [-6.8015, -2.5776, -5.9019,  2.1535],
        [-7.4386,  1.7518, -6.6445, -1.5712],
        [-5.6204,  2.0692, -5.8038, -2.5853],
        [-6.8912,  2.4260, -5.7737, -1.9135],
        [-6.5579,  1.9902, -6.6634, -2.5017]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 283/289 [03:34<00:04,  1.33it/s]

Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 98%|█████████▊| 284/289 [03:34<00:03,  1.33it/s]

loss - 0.05772098898887634, logits - tensor([[-6.4068,  3.5632, -6.4231, -3.2762],
        [-4.8387, -2.7184,  1.3414, -1.8301],
        [-5.6720, -3.3676,  2.2615, -1.9816],
        [-7.7653,  3.0668, -7.5113, -2.9910],
        [-6.9508,  2.8748, -5.7846, -3.0495],
        [-7.4330,  1.8313, -6.5993, -2.6393],
        [-6.9321,  2.6775, -6.1407, -1.5288],
        [-6.5428,  2.2012, -6.9261, -1.1844]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2364448755979538, logits - tensor([[-5.9446,  2.0341, -5.5221, -2.6323],
        [-6.7926, -0.2563, -5.1158,  0.7620],
        [-7.0088,  2.1641, -6.5481, -2.7177],
        [-6.0424,  1.7884, -5.8583, -2.4174],
        [-5.4528, -3.3598,  1.6998, -1.8325],
        [-7.3284,  2.1484, -6.8855, -2.1189],
        [-8.4469,  1.1263, -7.7171, -0.1267],
        [-6.1356,  2.0972, -6.9833, -2.

 99%|█████████▊| 285/289 [03:35<00:03,  1.33it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19786128401756287, logits - tensor([[-7.1303,  2.6577, -6.5664, -1.7724],
        [-6.7198,  1.2248, -7.0788, -0.8802],
        [-6.2084,  2.5025, -6.8012, -2.8366],
        [-5.9664,  2.2283, -6.1886, -1.7981],
        [-6.0863, -3.6825,  2.3438, -2.8592],
        [-6.2785,  1.8749, -6.4780, -2.6157],
        [-5.0229, -3.9194, -4.9112,  4.2183],
        [-6.5164,  2.5142, -6.5453, -2.3812]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:36<00:02,  1.33it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08242768049240112, logits - tensor([[-6.8398,  0.5864, -6.1923, -1.3266],
        [-7.0034,  3.1254, -6.7400, -2.7922],
        [-6.8487,  0.4228, -6.0309, -0.1273],
        [-5.4534,  2.5760, -6.4405, -2.1315],
        [-6.3347,  2.7682, -7.2354, -2.0088],
        [-6.6405, -4.5896,  3.7407, -3.2839],
        [-5.9017, -3.6000, -6.1354,  3.8337],
        [-5.8119,  1.9525, -5.3485, -2.6621]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:37<00:01,  1.33it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4239519238471985, logits - tensor([[-6.7291,  2.8061, -6.6333, -1.7041],
        [-6.6440,  2.7294, -6.6714, -2.8630],
        [-7.2803,  2.7813, -7.1952, -2.0359],
        [-6.6653,  2.8221, -7.0048, -2.6071],
        [-6.1221,  1.7221, -6.6581, -2.3232],
        [-6.6676, -3.2897, -5.9984,  2.5542],
        [-6.3005, -2.8035, -5.7654,  1.7932],
        [-6.2577,  1.7724, -6.2959, -1.2623]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:37<00:00,  1.33it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33757033944129944, logits - tensor([[-6.8412,  2.0982, -7.4283, -2.7150],
        [-6.9884,  2.0242, -6.2202, -1.7511],
        [-4.9251, -2.7492,  2.7255, -1.7359],
        [-6.4047,  2.6699, -6.3580, -2.1926]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:38<00:00,  1.32it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:55,  3.48it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:50,  3.82it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:49,  3.89it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:49,  3.81it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:48,  3.89it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:47,  3.95it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:47,  3.96it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.92it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:46,  3.97it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  3.96it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:46,  3.97it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:45,  3.96it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  3.98it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:45,  3.97it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:45,  3.95it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:44,  3.96it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:44,  3.98it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:44,  3.95it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:43,  3.98it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:05<00:43,  3.97it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:43,  3.96it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:43,  3.93it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:43,  3.91it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:43,  3.95it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:43,  3.92it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  3.94it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:42,  3.94it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:41,  3.96it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.99it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.95it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:41,  3.93it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:40,  3.96it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:40,  3.94it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:40,  3.95it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:40,  3.95it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:39,  3.96it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:39,  3.98it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.96it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.97it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:38,  3.96it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:38,  3.97it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.96it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:38,  3.92it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:38,  3.90it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:37,  3.94it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:37,  3.90it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.92it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:36,  3.96it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.95it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.94it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:36,  3.89it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:36,  3.90it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:36,  3.85it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:35,  3.89it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:35,  3.88it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:35,  3.89it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:35,  3.91it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  3.94it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:34,  3.95it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:33,  3.97it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:33,  3.93it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.91it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:16<00:33,  3.87it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:33,  3.86it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:33,  3.84it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  3.90it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:17<00:32,  3.90it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:32,  3.88it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  3.92it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:31,  3.94it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:18<00:31,  3.96it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  3.97it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.97it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  3.97it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:19<00:30,  3.96it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:29,  3.94it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.94it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.93it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:20<00:29,  3.95it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  3.94it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  3.96it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:28,  3.98it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:21<00:28,  3.95it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  3.98it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  3.96it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:27,  4.00it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:22<00:26,  3.97it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  3.98it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  4.02it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:25,  4.01it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:23<00:25,  4.01it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  4.02it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  4.00it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:24,  4.00it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:24<00:24,  3.99it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.98it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.99it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.98it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:25<00:23,  4.01it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  4.00it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  4.00it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.96it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:26<00:22,  3.98it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:22,  4.01it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  4.00it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.95it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:27<00:22,  3.94it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.94it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  3.93it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:21,  3.92it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:28<00:20,  3.96it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  3.93it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.91it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:20,  3.96it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:29<00:20,  3.95it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:20,  3.90it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.92it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.90it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:30<00:19,  3.90it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:19,  3.88it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  3.87it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:18,  3.92it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:31<00:18,  3.91it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  3.93it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  3.93it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:17,  3.91it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:32<00:17,  3.92it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  3.89it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  3.86it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:16,  3.91it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:33<00:16,  3.87it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  3.93it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.95it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:34<00:15,  3.90it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:34<00:15,  3.93it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.95it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.94it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:35<00:14,  3.96it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:35<00:13,  3.97it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  3.99it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.96it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:36<00:13,  3.97it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:36<00:12,  3.96it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.96it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.94it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:37<00:12,  3.94it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:37<00:11,  3.96it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  3.91it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  3.95it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:38<00:11,  3.96it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:38<00:10,  3.97it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  3.95it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  3.96it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:39<00:09,  4.01it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:39<00:09,  4.00it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  4.01it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.96it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:40<00:09,  3.98it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:40<00:08,  3.94it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  3.97it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  3.99it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:41<00:08,  3.95it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:41<00:07,  4.01it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  3.98it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.97it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:42<00:07,  3.97it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:42<00:06,  3.98it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.96it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.98it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:43<00:06,  3.97it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:43<00:05,  3.97it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.98it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.98it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:44<00:05,  3.96it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:44<00:04,  3.89it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.93it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.92it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:45<00:04,  3.93it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:45<00:03,  3.95it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.95it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  3.92it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:46<00:03,  3.96it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:46<00:02,  3.99it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.97it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  3.95it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:47<00:02,  3.97it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:47<00:01,  3.96it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  3.94it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  3.93it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:48<00:01,  3.94it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:48<00:00,  3.98it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  3.94it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  3.96it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:49<00:00,  3.94it/s]

[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 886, 'tn': 355, 'fp': 75, 'fn': 236}, {'tp': 156, 'tn': 1365, 'fp': 4, 'fn': 27}, {'tp': 164, 'tn': 1076, 'fp': 267, 'fn': 45}]
Detailed accuracy after 4 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.7996134020618557
yes_no accuarcy: 0.9800257731958762
abstractive accuarcy: 0.7989690721649485
Overall accuarcy: 0.8946520618556701
Best accuarcy: 0.899645618556701



  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08748622238636017, logits - tensor([[-6.2039,  3.3167, -6.6497, -2.2802],
        [-6.7624,  1.2786, -6.6687, -0.8572],
        [-7.5079, -0.3890, -6.6906,  1.2222],
        [-7.7248,  1.6839, -7.1045, -1.6759],
        [-6.7816,  1.2923, -6.8972, -1.1125],
        [-5.9899,  2.2719, -6.3237, -2.2526],
        [-6.8511, -3.8316, -6.4174,  4.3682],
        [-6.1984,  2.7879, -5.9793, -2.1655]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:00<03:58,  1.21it/s]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06313817203044891, logits - tensor([[-7.2186, -0.8509, -5.8750,  1.4739],
        [-5.5802, -3.9280,  2.1545, -3.4935],
        [-6.4743,  2.9992, -6.2926, -3.0377],
        [-6.2531, -3.4389,  2.5569, -3.4461],
        [-5.9326, -3.4234,  2.0635, -2.5580],
        [-6.8226,  1.7272, -6.1061, -1.3693],
        [-6.7521,  1.7524, -5.3429, -2.3127],
        [-6.5229,  2.4953, -6.4449, -2.3767]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:01<03:46,  1.27it/s]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04745064675807953, logits - tensor([[-6.8978,  2.7194, -6.9837, -3.1937],
        [-5.6170, -2.5590,  1.6328, -2.0370],
        [-7.4491,  1.3162, -6.8469, -1.7544],
        [-6.7526,  2.7228, -7.4546, -2.8459],
        [-6.6134,  1.9406, -6.3388, -2.1643],
        [-5.4535, -4.8779, -5.7133,  3.9689],
        [-6.8619,  2.3691, -6.8234, -2.5468],
        [-6.5912, -3.3100, -5.2552,  3.7470]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:02<03:41,  1.29it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20330199599266052, logits - tensor([[-6.6835,  2.3430, -5.9055, -2.5497],
        [-5.3489, -3.6453,  2.7640, -2.8185],
        [-7.1567,  2.4192, -6.4128, -1.8427],
        [-5.4491,  1.9096, -4.8673, -1.6737],
        [-6.4458,  3.0323, -6.7010, -2.8230],
        [-6.3128,  2.1581, -6.5923, -2.5239],
        [-7.9142,  1.3645, -6.5253, -0.5872],
        [-6.6569, -1.3187, -6.6645,  1.7633]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:03<03:38,  1.30it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11850882321596146, logits - tensor([[-6.9366,  2.7660, -6.7465, -2.5805],
        [-6.4561,  2.4566, -7.2109, -2.7110],
        [-5.6682,  2.4760, -6.9442, -3.1507],
        [-7.9779,  3.8331, -7.0959, -3.0679],
        [-8.0349,  2.2600, -7.1814, -3.0191],
        [-7.1372,  2.9140, -7.0069, -2.3869],
        [-5.8228, -2.9681,  1.3236, -2.4214],
        [-6.4854,  1.3565, -6.1914, -0.6319]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:03<03:36,  1.31it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.041106756776571274, logits - tensor([[-5.8591, -4.3506,  2.7467, -3.0479],
        [-4.8808, -3.0845,  2.0103, -3.0617],
        [-7.1864,  2.0103, -6.2228, -2.6822],
        [-6.7695,  2.2369, -7.8119, -2.0328],
        [-6.0800,  2.0161, -5.8590, -3.0281],
        [-7.2952,  3.0148, -6.5622, -3.1457],
        [-5.7875, -2.8017, -4.9436,  2.7602],
        [-6.4252,  2.4278, -6.2486, -2.8064]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:04<03:34,  1.32it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.492886483669281, logits - tensor([[-6.8616,  2.0336, -6.1555, -1.7699],
        [-5.0932,  2.0304, -5.7442, -1.9789],
        [-6.6285,  3.2244, -6.7683, -2.6713],
        [-6.1848, -2.6993, -6.3389,  3.0952],
        [-7.9033,  1.0399, -7.1304, -1.3715],
        [-6.5318,  2.9451, -6.4101, -2.4635],
        [-7.0912, -3.8381, -6.1260,  3.1802],
        [-6.0505,  2.4689, -6.6659, -2.1840]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:05<03:33,  1.32it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0527825802564621, logits - tensor([[-7.7002,  1.6689, -7.2304, -0.4894],
        [-7.2300,  2.2429, -7.1400, -2.3900],
        [-6.8496,  2.7190, -6.7390, -3.0495],
        [-5.1846, -3.7222,  2.2494, -3.0045],
        [-5.2930, -4.0398,  1.7501, -2.7086],
        [-6.1466,  2.7082, -6.9421, -2.1113],
        [-6.2874, -4.2039, -5.3092,  3.1828],
        [-5.4317,  3.3256, -7.1503, -3.6883]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 8/289 [00:06<03:31,  1.33it/s]

Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20104394853115082, logits - tensor([[-6.8833,  2.6983, -6.3464, -2.0347],
        [-6.3827,  1.7451, -5.9305, -1.8519],
        [-6.6327,  2.0933, -6.4455, -2.3671],
        [-6.8362,  1.2195, -6.9098, -2.0923],
        [-6.4236,  2.3008, -6.2217, -3.0742],
        [-6.1321, -0.2579, -4.6940, -0.4378],
        [-7.5731, -3.8368, -6.1360,  3.8566],
        [-7.0147,  1.8638, -6.7652, -1.6313]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 9/289 [00:06<03:31,  1.32it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17310307919979095, logits - tensor([[-5.8667, -3.3444,  2.7082, -3.4938],
        [-5.3010, -2.2933,  2.1562, -2.0897],
        [-8.4548,  1.9155, -7.6074, -1.6117],
        [-7.5263,  2.5059, -6.8683, -2.7087],
        [-5.3111, -3.3360,  1.5221, -2.3846],
        [-5.6948, -3.4942,  2.2747, -3.0420],
        [-8.2810, -2.7368, -7.1202,  3.6170],
        [-6.8229,  2.8715, -6.8363, -2.8197]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 10/289 [00:07<03:30,  1.33it/s]

Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.045494355261325836, logits - tensor([[-7.4935,  1.6316, -6.3875, -2.0801],
        [-6.5679,  2.1490, -6.5737, -2.3082],
        [-5.1951, -3.2858,  2.4255, -2.8709],
        [-5.6360,  1.5109, -6.3023, -2.2286],
        [-6.6545,  2.1615, -6.0152, -2.8050],
        [-7.6194, -4.1254, -5.4459,  4.1044],
        [-6.2544,  2.8253, -7.2413, -2.7783],
        [-6.9399,  2.8195, -7.0779, -2.6061]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 11/289 [00:08<03:29,  1.33it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06222166866064072, logits - tensor([[-6.6018,  1.8710, -6.6241, -1.5494],
        [-6.4076,  1.9597, -6.7842, -2.1389],
        [-6.9207,  2.5989, -6.0018, -2.0634],
        [-7.1833,  3.2112, -6.2725, -2.3517],
        [-6.4256, -3.0495,  1.5385, -1.8844],
        [-6.7090, -3.2080, -5.6126,  2.7287],
        [-5.7891,  1.7313, -6.0277, -2.0843],
        [-6.5207,  1.5829, -6.2319, -2.1400]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 12/289 [00:09<03:28,  1.33it/s]

Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0480341799557209, logits - tensor([[-7.2844,  2.4221, -7.2601, -3.3005],
        [-6.3029,  2.2517, -6.8492, -2.4606],
        [-8.1611,  1.6581, -6.4588, -2.4759],
        [-7.5436,  2.6036, -6.1298, -2.7939],
        [-6.4033,  1.8381, -6.0718, -2.0306],
        [-7.4829, -2.2633, -5.9006,  3.0147],
        [-6.4463,  1.7070, -5.9912, -2.0423],
        [-6.9295, -3.3294, -6.7779,  2.4342]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 13/289 [00:09<03:28,  1.32it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04587366431951523, logits - tensor([[-7.4747,  2.5858, -6.7613, -1.3666],
        [-6.5343,  2.7329, -6.0150, -2.4175],
        [-6.5027,  2.7221, -5.6314, -2.4490],
        [-5.5886,  2.4343, -6.9242, -1.9195],
        [-7.3452,  2.5322, -7.2747, -2.8588],
        [-6.2537, -3.1879, -5.5847,  3.0241],
        [-5.6955, -3.3830,  2.4696, -2.8490],
        [-8.0705, -2.4832, -4.9599,  1.8347]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:10<03:28,  1.32it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05202343687415123, logits - tensor([[-6.6431,  2.1188, -6.3111, -2.5021],
        [-6.4889,  1.2948, -6.2178, -2.3337],
        [-5.1329, -4.0968,  2.4859, -3.3120],
        [-5.5895, -2.0726, -5.6000,  2.5209],
        [-5.8804, -3.8970,  2.5460, -2.2844],
        [-6.1807,  1.8672, -6.4193, -2.3923],
        [-6.1907, -4.1826,  2.2821, -3.0953],
        [-6.3662,  2.2711, -6.0119, -2.2671]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▌         | 15/289 [00:11<03:27,  1.32it/s]

Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.415230393409729, logits - tensor([[-5.7858,  2.0698, -5.8880, -2.6794],
        [-7.3903,  1.2866, -7.1186, -1.9192],
        [-6.0239,  1.4486, -6.2052, -1.7710],
        [-7.3095,  3.1496, -7.8285, -2.6642],
        [-6.5819,  3.0075, -6.2328, -2.6729],
        [-6.5101,  2.1007, -5.8585, -1.4997],
        [-7.7262,  1.7418, -7.2324, -1.2867],
        [-5.1964,  2.6338, -5.8721, -1.8770]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 16/289 [00:12<03:26,  1.32it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.058783065527677536, logits - tensor([[-6.2537, -3.1490, -4.8743,  2.9922],
        [-7.3860, -0.8008, -6.3686,  1.1494],
        [-7.3524,  2.2474, -7.2039, -2.9069],
        [-5.9559,  1.9022, -6.1690, -1.7673],
        [-6.3109, -2.3527, -6.0104,  2.3629],
        [-5.8260,  2.5894, -5.5531, -2.3209],
        [-6.3129, -3.5872,  2.3352, -3.6347],
        [-5.8929,  2.2512, -6.0876, -2.7001]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:12<03:25,  1.33it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05824960023164749, logits - tensor([[-7.5696,  0.8306, -7.3527, -1.2142],
        [-6.0736, -3.5807,  2.9056, -2.8599],
        [-6.8696,  2.3978, -6.8391, -3.2097],
        [-7.1435,  2.0390, -7.4465, -2.0962],
        [-5.5567, -3.5992,  2.3743, -2.8688],
        [-5.9788,  2.2559, -6.8678, -2.4012],
        [-6.5341, -2.8857, -6.3367,  2.7264],
        [-6.3023,  1.8681, -6.5155, -2.2402]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:25,  1.32it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17885763943195343, logits - tensor([[-5.1150, -2.8188,  1.4860, -1.7372],
        [-7.3313, -2.5199, -6.3520,  3.5177],
        [-7.9481,  2.0102, -7.7591, -1.7541],
        [-5.3304, -3.4294, -4.4860,  3.4243],
        [-6.8199,  2.9967, -6.0553, -2.3797],
        [-6.1801, -2.4813,  1.7424, -1.8767],
        [-6.7518,  1.5746, -7.0706, -2.3268],
        [-7.6112,  2.9110, -6.8768, -2.9294]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:25,  1.32it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.038263753056526184, logits - tensor([[-6.3276,  1.7829, -6.1459, -1.9468],
        [-8.5515, -3.0995, -5.2141,  2.9597],
        [-6.2167,  2.2213, -6.1579, -2.6095],
        [-5.9704,  2.9637, -5.8924, -2.4073],
        [-7.5088,  2.6498, -6.2411, -1.8942],
        [-6.3709,  2.7523, -6.5039, -2.8731],
        [-7.1107, -2.7421, -6.2475,  3.4596],
        [-6.0284,  3.2357, -6.8227, -3.1413]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:23,  1.32it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04422871023416519, logits - tensor([[-6.5441,  2.3299, -6.9417, -2.3275],
        [-5.6459,  2.6960, -6.6830, -2.8550],
        [-6.3856,  3.5490, -6.1691, -2.0098],
        [-6.1008,  1.8271, -6.0514, -1.8273],
        [-6.4816,  1.8894, -6.4039, -2.4992],
        [-6.9336,  2.1709, -7.6121, -2.9623],
        [-7.1199,  2.5467, -6.5201, -3.1820],
        [-6.2480,  2.8120, -6.0227, -2.5352]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:15<03:23,  1.31it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11332891881465912, logits - tensor([[-7.1017, -4.2861,  2.8278, -3.2556],
        [-5.3094, -3.2366,  0.6514, -2.2783],
        [-7.8572, -0.3391, -6.9162, -0.0531],
        [-6.8078,  2.2143, -6.4019, -2.9008],
        [-6.5892, -2.4817, -5.7435,  2.8207],
        [-8.2242, -0.3817, -7.5201,  0.8207],
        [-6.8049,  2.9195, -6.1574, -2.8516],
        [-6.7848, -1.3971, -6.5305,  1.6684]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:22,  1.32it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04506073147058487, logits - tensor([[-6.4558, -2.2744, -5.4905,  3.0019],
        [-5.5885,  1.7084, -5.5411, -1.4253],
        [-6.9345,  2.4395, -6.0700, -3.1147],
        [-6.1585,  1.8077, -5.6267, -2.0206],
        [-7.3721,  2.3256, -7.1707, -2.0804],
        [-6.9318,  3.2014, -7.4520, -2.8032],
        [-5.9663,  3.0820, -5.3047, -2.6681],
        [-5.9701, -3.8987, -5.4692,  3.5278]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:21,  1.32it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09715063869953156, logits - tensor([[-6.0478,  2.6799, -5.7996, -3.3123],
        [-6.7577, -2.2924, -5.5232,  1.7767],
        [-6.3590,  2.0093, -5.9004, -2.2527],
        [-5.6858, -3.3239, -4.8934,  3.9457],
        [-7.6066,  0.7596, -6.6367, -0.4928],
        [-6.3749, -4.0354, -5.9745,  4.6790],
        [-6.6233,  2.8320, -6.8825, -3.5529],
        [-8.1916,  2.3532, -7.3319, -2.0369]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:20,  1.32it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15965566039085388, logits - tensor([[-6.9616,  1.9624, -5.9934, -1.4746],
        [-5.8269, -3.6673,  2.6603, -2.7572],
        [-5.3089, -3.5591,  1.9872, -1.5555],
        [-7.7952,  1.8855, -6.9611, -2.8357],
        [-6.7732,  2.8523, -6.5060, -3.5236],
        [-6.9189,  3.0896, -5.7201, -1.7009],
        [-7.2690,  2.6625, -6.2129, -3.5393],
        [-6.6935, -4.6718,  2.7198, -3.2513]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:18<03:20,  1.32it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1850421279668808, logits - tensor([[-6.0857,  1.3994, -6.2461, -2.5995],
        [-6.7303,  1.8924, -6.2548, -2.3970],
        [-6.3159,  2.4306, -5.7065, -1.4847],
        [-6.4594, -3.8395,  2.2905, -3.4894],
        [-5.5847, -3.6018,  1.8312, -2.1314],
        [-5.3007, -4.1369,  2.6077, -2.7472],
        [-6.4459, -3.8663,  2.3740, -3.0654],
        [-6.2476, -3.6892, -6.0404,  3.4722]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:20,  1.31it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0582612119615078, logits - tensor([[-8.0113, -1.8013, -5.4477,  1.4528],
        [-5.7737,  3.2495, -5.0117, -2.8143],
        [-6.8200, -4.4346,  2.3365, -3.2687],
        [-6.3526,  0.8039, -5.6423, -1.8887],
        [-6.3269,  2.7539, -6.3324, -2.6497],
        [-7.8012,  3.4859, -6.9935, -2.9609],
        [-6.0295, -3.3695, -5.6795,  2.2866],
        [-6.3770,  1.6662, -6.0881, -1.4904]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 27/289 [00:20<03:18,  1.32it/s]

Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3485742509365082, logits - tensor([[-5.7453,  2.0791, -5.8577, -2.0866],
        [-6.2710,  2.0065, -6.3614, -2.2016],
        [-6.0330,  2.3269, -5.5452, -1.9720],
        [-7.0268, -2.9459, -5.1374,  3.5971],
        [-6.3206,  2.2713, -6.0731, -2.8877],
        [-7.3432,  2.0659, -6.4971, -2.0940],
        [-6.5223, -1.3385, -6.0088,  1.6195],
        [-6.4919, -2.8912, -5.1838,  3.3795]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|▉         | 28/289 [00:21<03:18,  1.32it/s]

Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2787858843803406, logits - tensor([[-6.6394,  2.9804, -5.0726, -2.9695],
        [-6.1679,  1.8081, -6.7944, -2.0116],
        [-7.4358,  1.6079, -5.9889, -2.0837],
        [-5.5150, -3.0247,  0.9237, -2.4252],
        [-6.7608,  1.7260, -5.8837, -1.1509],
        [-6.8651,  2.9762, -6.0969, -2.3201],
        [-6.9249,  1.7613, -5.9957, -2.4362],
        [-6.1239, -3.1704,  1.7497, -1.8035]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 29/289 [00:22<03:17,  1.32it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19675767421722412, logits - tensor([[-5.8568,  2.2690, -6.1310, -1.7737],
        [-6.3069,  2.1528, -5.4701, -2.6432],
        [-6.1974, -1.3045, -5.8875,  1.2589],
        [-7.2302, -0.5050, -5.2926,  0.0670],
        [-7.3748,  2.1466, -7.2151, -2.6950],
        [-6.0853,  2.1586, -6.1624, -2.8316],
        [-6.5609,  1.9194, -6.4162, -2.5857],
        [-6.5833, -2.1246, -5.7517,  1.1377]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:16,  1.32it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04457578808069229, logits - tensor([[-6.0616,  2.4326, -6.3006, -2.6301],
        [-5.8330, -3.3729,  2.1633, -2.5970],
        [-7.5207,  2.5884, -8.0782, -2.4387],
        [-6.9614,  1.5882, -5.8532, -2.7822],
        [-5.7934, -3.9129,  2.4222, -3.4251],
        [-7.8626,  2.1823, -6.6027, -2.5854],
        [-6.2917, -3.4802,  2.4930, -2.3391],
        [-6.5734,  2.6256, -6.6155, -3.0328]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 31/289 [00:23<03:16,  1.32it/s]

Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04977066069841385, logits - tensor([[-6.0125, -1.4193, -5.9703,  1.3641],
        [-5.8709,  1.4792, -5.5958, -2.1710],
        [-6.8561,  2.7442, -6.0125, -2.4399],
        [-5.1182, -4.1698, -5.8113,  4.2636],
        [-6.3337,  3.5845, -6.9152, -3.2616],
        [-7.3511,  3.1993, -6.3308, -2.9855],
        [-5.8715, -3.9229,  2.3611, -2.4013],
        [-8.1467,  1.6439, -7.1054, -2.2573]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 32/289 [00:24<03:14,  1.32it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10106503963470459, logits - tensor([[-6.6934,  1.7223, -6.8806, -2.1249],
        [-5.6911, -4.3759,  2.8499, -3.3197],
        [-7.6993, -1.3997, -6.9558,  1.6988],
        [-7.3622,  1.4297, -6.2266, -1.7151],
        [-6.3268,  2.2302, -6.0060, -2.9312],
        [-7.6095,  1.4965, -7.0429, -1.5138],
        [-6.6879, -4.2035, -5.6386,  3.9437],
        [-7.3603,  0.0439, -6.5291,  0.2060]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:25<03:14,  1.32it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11096597462892532, logits - tensor([[-6.8459,  2.6087, -7.2705, -2.8072],
        [-5.8931, -3.3843,  2.0652, -2.2039],
        [-7.0663,  0.6668, -5.9918, -0.8910],
        [-4.8722, -3.8689,  2.2536, -2.5058],
        [-7.2027,  2.2405, -5.9636, -1.8733],
        [-6.2857,  0.1476, -5.6470, -0.3089],
        [-6.5033, -3.2500, -5.0895,  3.8326],
        [-6.4681,  2.2595, -6.8038, -1.8928]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:12,  1.32it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19994612038135529, logits - tensor([[-5.6420, -3.7951,  2.3817, -2.3393],
        [-6.7979,  1.2412, -6.0340, -0.9808],
        [-5.9039,  2.0965, -6.1479, -2.4871],
        [-6.6272, -3.4185, -6.0113,  3.7348],
        [-6.2526, -4.2464,  2.0570, -2.4907],
        [-6.3457,  1.2637, -5.9242, -1.7227],
        [-5.9662,  2.8877, -5.4377, -3.1038],
        [-6.2238, -4.4417, -4.8331,  4.4061]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 35/289 [00:26<03:11,  1.33it/s]

Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0770985335111618, logits - tensor([[-6.2147, -3.4987,  2.4472, -1.8063],
        [-5.4272, -3.7639,  2.6280, -1.9098],
        [-6.1679,  1.8676, -6.9836, -2.5587],
        [-6.3348, -4.3492, -4.4851,  4.2977],
        [-6.7148,  2.1305, -6.0145, -2.1482],
        [-5.2165, -2.8259,  1.9321, -1.7798],
        [-6.1000, -3.9223,  2.0988, -2.2497],
        [-6.7500,  0.8041, -5.5499, -0.4037]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 36/289 [00:27<03:10,  1.33it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1644279956817627, logits - tensor([[-5.4741, -3.9014, -5.9436,  4.0198],
        [-7.3860,  2.8880, -7.1450, -3.3845],
        [-6.3820,  3.3420, -6.4525, -3.0403],
        [-6.7232,  2.3883, -6.7664, -2.7924],
        [-7.6760,  1.9207, -6.9740, -1.8333],
        [-6.4207,  0.7557, -5.6117, -1.8864],
        [-6.8751,  2.0720, -5.9783, -2.3018],
        [-5.5589, -3.6314,  3.2101, -3.3408]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:28<03:09,  1.33it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04586603865027428, logits - tensor([[-5.7447, -3.4487,  2.7002, -2.0919],
        [-6.1845, -4.6038,  2.8379, -2.2475],
        [-5.6255, -3.8829, -5.8472,  4.3718],
        [-6.1713,  1.9595, -6.5470, -2.0379],
        [-5.9870, -4.0862,  2.5017, -2.2054],
        [-5.7444,  2.6728, -5.2742, -2.9267],
        [-7.3042,  2.1271, -6.1944, -2.0177],
        [-5.9687, -4.3876,  2.2801, -2.2213]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 38/289 [00:28<03:09,  1.32it/s]

Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3904072344303131, logits - tensor([[-6.2429,  2.5178, -6.3419, -1.5576],
        [-6.1293, -3.6294,  2.2072, -2.2950],
        [-6.6594, -0.3335, -2.1355, -1.0121],
        [-6.4363,  2.5198, -7.4038, -2.3834],
        [-5.5023,  2.3758, -5.5674, -2.4384],
        [-6.3464, -1.0714, -4.5121,  0.7287],
        [-6.9550,  2.7812, -6.5826, -2.6510],
        [-6.8968,  1.8576, -6.2044, -1.7706]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 39/289 [00:29<03:09,  1.32it/s]

Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21335814893245697, logits - tensor([[-6.4792, -3.7807,  1.1570, -1.5506],
        [-5.0400, -3.4320, -4.9569,  3.4634],
        [-7.6833,  2.0431, -6.6397, -1.5318],
        [-6.4325, -4.1991,  2.4940, -2.4735],
        [-6.5984, -3.8487,  1.7835, -1.9711],
        [-6.3149, -4.4532,  2.8054, -3.2631],
        [-7.3751,  1.0735, -6.9827, -2.5224],
        [-6.0109, -4.2107, -5.3061,  4.1931]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 40/289 [00:30<03:07,  1.33it/s]

Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.028117666020989418, logits - tensor([[-6.9213,  2.7393, -6.8944, -2.6282],
        [-6.4668, -2.6843, -5.5021,  2.3669],
        [-5.5080,  2.9723, -5.7984, -2.8774],
        [-5.3953, -4.5402, -5.7943,  4.3216],
        [-6.3465, -3.8706, -4.7247,  3.3232],
        [-6.2254, -4.5159, -4.6940,  5.0510],
        [-6.8294,  3.0049, -6.1746, -2.6955],
        [-7.0146,  1.8525, -6.0112, -2.2504]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 41/289 [00:31<03:06,  1.33it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16352078318595886, logits - tensor([[-6.0076,  3.3766, -6.5797, -2.6319],
        [-7.4229,  2.6884, -6.8910, -2.1766],
        [-6.4983,  0.3190, -5.8046, -0.7432],
        [-7.2158, -3.2531, -6.1868,  2.9575],
        [-6.3751, -0.5294, -5.3365, -0.0926],
        [-6.7752,  1.4489, -5.0875, -1.4987],
        [-6.5529, -1.7680, -4.8260,  0.9670],
        [-6.1048, -2.0098, -4.8902,  1.7112]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:31<03:05,  1.33it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16643556952476501, logits - tensor([[-5.3120, -4.5489, -4.5143,  3.7897],
        [-6.3595, -3.8179, -5.5007,  3.7879],
        [-6.5422, -3.9926, -5.3804,  4.2887],
        [-6.0429,  3.5198, -6.0849, -3.0923],
        [-7.5932,  2.3969, -7.5074, -2.1564],
        [-5.7432, -4.7939,  2.9448, -2.9883],
        [-5.8637, -4.0069,  2.5468, -2.7329],
        [-7.5438,  2.7400, -6.8307, -3.6835]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:32<03:05,  1.32it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21756033599376678, logits - tensor([[-6.3008,  1.5389, -5.1692, -1.5274],
        [-6.3552,  1.9756, -5.4626, -2.1516],
        [-5.6954, -4.4380,  3.0521, -3.2885],
        [-6.1236, -3.1566, -5.5445,  3.0636],
        [-6.1712,  2.4389, -6.2418, -2.3735],
        [-7.4831, -3.1561, -5.4565,  3.7904],
        [-6.2756, -5.1286, -5.5976,  4.6758],
        [-7.1433,  3.1122, -7.0915, -2.6331]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:33<03:06,  1.31it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08481130748987198, logits - tensor([[-7.8090, -0.5695, -5.2404,  1.2468],
        [-7.0778,  2.7588, -6.5267, -2.9471],
        [-6.4183,  1.2770, -5.8209, -1.7937],
        [-5.2894, -2.7373,  1.6406, -2.8354],
        [-6.4944, -4.0478,  2.8855, -2.8710],
        [-5.3865, -3.7039,  3.5831, -2.9618],
        [-7.7515,  1.0849, -6.6841, -1.5791],
        [-6.5684,  1.6544, -6.3508, -1.0795]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:34<03:05,  1.32it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.34699541330337524, logits - tensor([[-6.5412,  0.8004, -5.9318, -1.1671],
        [-6.8339,  2.2525, -6.1516, -1.4036],
        [-5.8228, -3.4501, -5.3721,  2.7127],
        [-6.4934,  2.2066, -6.3489, -2.4526],
        [-6.8272, -0.3380, -4.7199,  0.3723],
        [-6.9268,  1.4427, -5.6188, -2.6006],
        [-6.3087,  1.9531, -6.6313, -2.6334],
        [-6.0479,  2.6820, -6.6144, -2.5475]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 46/289 [00:34<03:04,  1.32it/s]

Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10885216295719147, logits - tensor([[-5.3999, -4.3030,  2.6390, -3.3005],
        [-6.5506,  2.0758, -6.0354, -2.8422],
        [-6.5292, -4.4603,  2.8925, -3.4299],
        [-7.2570,  2.2741, -6.3923, -2.4140],
        [-6.9456,  0.4906, -6.7526, -1.1012],
        [-6.5338, -3.9605,  2.2942, -2.8376],
        [-6.8858,  1.4040, -6.3784, -1.3533],
        [-6.5213, -0.5368, -3.9344, -0.4676]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▋        | 47/289 [00:35<03:03,  1.32it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0521121472120285, logits - tensor([[-5.5693,  3.1986, -6.8378, -2.4368],
        [-7.3697,  1.9351, -6.4218, -1.6740],
        [-5.8758, -4.4845, -4.9375,  4.2617],
        [-5.2078, -3.5601,  2.6536, -3.1986],
        [-7.5804,  0.4793, -6.2447, -0.9074],
        [-4.9094, -4.9173, -5.8279,  4.3500],
        [-5.4551,  2.4767, -6.6287, -2.7181],
        [-6.1812, -4.4876, -5.7621,  3.4013]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:36<03:03,  1.32it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16226103901863098, logits - tensor([[-6.1325,  2.4587, -6.3900, -2.5229],
        [-6.6783,  2.8046, -7.0083, -1.8996],
        [-7.0371, -1.8657, -0.6967, -1.2337],
        [-7.0654, -0.0368, -5.7276, -0.9033],
        [-6.9049, -2.5804, -5.1377,  2.1102],
        [-6.9984, -2.1376, -5.6952,  2.9288],
        [-6.9389,  2.2000, -6.3796, -1.9785],
        [-7.3109,  1.3680, -6.8470, -1.1521]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:37<03:01,  1.32it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 17%|█▋        | 50/289 [00:37<03:01,  1.32it/s]

loss - 0.19440820813179016, logits - tensor([[-5.7433,  2.7782, -6.4279, -2.8620],
        [-6.8911, -4.4513,  1.8303, -2.9393],
        [-7.3300,  2.5500, -6.3504, -2.4297],
        [-6.7859,  2.7162, -6.9790, -3.2749],
        [-6.5274, -4.3598,  2.5496, -2.8270],
        [-6.7166, -3.6946, -6.2754,  4.1217],
        [-6.0660,  2.2660, -6.5209, -2.8054],
        [-5.9249,  1.6538, -5.7008, -2.1548]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04686916247010231, logits - tensor([[-7.1172,  1.7639, -5.6916, -2.2474],
        [-6.1845, -3.7457,  2.1276, -2.5570],
        [-5.7370,  2.4732, -5.7545, -1.5717],
        [-6.8965,  2.1625, -6.7468, -2.0833],
        [-7.4232, -3.7975, -5.9226,  3.7613],
        [-6.6721, -4.4353, -5.4615,  4.0563],
        [-6.0313,  2.4522, -6.7455, -2.6048],
        [-6.3215,  1.8446, -5.8027, -1.

 18%|█▊        | 51/289 [00:38<03:00,  1.32it/s]

Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.042439766228199005, logits - tensor([[-5.3619, -3.4610,  2.2047, -2.0268],
        [-5.5028, -4.1108,  2.5925, -3.8384],
        [-6.7122,  2.8158, -6.9563, -2.5454],
        [-6.3876,  2.9545, -5.8239, -2.5366],
        [-6.8421,  2.7597, -6.6402, -2.2913],
        [-6.4642,  2.8424, -5.6662, -2.0037],
        [-7.4496,  2.1534, -7.2830, -2.0300],
        [-6.4798,  2.9210, -6.8768, -2.5508]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 52/289 [00:39<02:58,  1.32it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1921611875295639, logits - tensor([[-4.5712, -3.3600, -4.8668,  3.2550],
        [-7.1337, -3.0113, -4.7330,  4.1683],
        [-7.9475,  2.6818, -6.9713, -2.3530],
        [-6.5153,  3.0078, -6.9974, -2.3071],
        [-6.7895,  3.2329, -7.1069, -3.1650],
        [-5.5995, -4.3956, -5.2062,  3.6364],
        [-7.3352,  1.3080, -6.2795, -1.4724],
        [-4.9980, -4.4095,  4.1938, -3.3056]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:40<02:57,  1.33it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07669392228126526, logits - tensor([[-6.4321,  1.6243, -6.1648, -2.3708],
        [-8.3248,  1.6706, -6.8036, -0.5588],
        [-7.5486,  2.0938, -7.3263, -2.8707],
        [-6.0915,  2.6873, -6.2991, -2.2429],
        [-6.1899,  1.9330, -6.1099, -2.5831],
        [-7.1004,  2.6026, -6.5804, -3.0656],
        [-5.9687, -4.5523,  2.9656, -3.6347],
        [-7.2116,  0.5299, -6.7545, -0.9521]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:40<02:57,  1.33it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1803656369447708, logits - tensor([[-6.6990,  1.7717, -7.0193, -1.7415],
        [-5.9634,  2.7200, -6.4181, -2.9889],
        [-6.8147, -1.7132, -4.7382,  1.9066],
        [-5.2360, -4.0945,  2.7240, -3.2717],
        [-7.1903, -1.5044, -6.3079,  1.3183],
        [-6.7589,  2.0724, -5.7872, -1.3205],
        [-7.0569, -2.7703, -6.0297,  2.5679],
        [-7.5651,  2.2136, -6.6001, -1.4248]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:41<02:56,  1.33it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.27211323380470276, logits - tensor([[-7.2180, -1.5367, -5.8349,  2.0153],
        [-6.2720,  1.7480, -6.0220, -2.4729],
        [-5.6494,  2.3958, -5.4594, -2.9174],
        [-6.0052, -3.8704,  3.2703, -3.0913],
        [-7.0534,  2.7387, -6.5780, -3.6225],
        [-6.9320,  1.0647, -6.3451, -1.3868],
        [-7.1570,  1.5263, -5.4061, -1.6527],
        [-6.8032, -2.9738,  2.6156, -3.1293]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:42<02:55,  1.33it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17290234565734863, logits - tensor([[-6.9134,  1.9822, -6.1155, -1.5442],
        [-6.7909, -4.0227, -6.0389,  4.1691],
        [-7.4330, -4.6026, -5.2442,  3.5701],
        [-6.0229,  1.8291, -6.6894, -2.1889],
        [-6.8035,  2.6986, -6.4360, -3.2155],
        [-7.0168,  2.3928, -6.3489, -2.7747],
        [-6.4336,  1.8386, -6.2100, -1.3989],
        [-6.3239,  2.6840, -6.9313, -1.8091]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:43<02:54,  1.33it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08905776590108871, logits - tensor([[-6.4394,  3.1772, -7.2518, -2.5835],
        [-6.8700,  2.1157, -6.2710, -2.8277],
        [-5.6947, -3.1927,  2.1439, -2.6033],
        [-7.3506, -0.3553, -5.9514, -0.2870],
        [-6.4994, -1.5642, -5.0211,  1.7680],
        [-7.3025, -4.0804, -4.8892,  3.4199],
        [-7.2287, -2.4147, -6.1007,  1.8850],
        [-6.6164,  1.4355, -6.4483, -2.3241]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 58/289 [00:43<02:54,  1.33it/s]

Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03360290825366974, logits - tensor([[-7.5090,  2.2660, -6.2739, -2.1308],
        [-6.6801, -3.3583, -5.3627,  3.0725],
        [-7.2079,  3.0895, -6.6880, -3.0691],
        [-6.5736,  2.4248, -5.6797, -3.3183],
        [-6.0085,  2.5001, -6.1944, -2.3041],
        [-7.2066,  3.2541, -7.0338, -2.8118],
        [-5.7383,  2.9305, -5.9489, -2.6670],
        [-6.3451,  3.1581, -6.3407, -2.1239]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 59/289 [00:44<02:53,  1.32it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09322609007358551, logits - tensor([[-7.2543,  1.7097, -5.5632, -2.4183],
        [-7.7747, -3.9893,  2.7078, -3.1082],
        [-7.5798,  2.4413, -7.2240, -2.8028],
        [-4.5526, -3.8927, -6.2605,  3.6141],
        [-6.6995, -3.3276, -5.2595,  2.8470],
        [-7.6756,  2.3518, -7.3334, -2.7240],
        [-7.2102,  2.6520, -7.6458, -1.9804],
        [-6.5286,  0.3952, -5.6759, -0.5883]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:45<02:52,  1.33it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15191882848739624, logits - tensor([[-6.4297, -3.1932, -5.9305,  3.1176],
        [-6.0032, -3.2247, -5.7619,  3.1990],
        [-7.3401,  2.7494, -6.5446, -2.1331],
        [-8.2431,  2.0062, -7.9010, -1.7403],
        [-5.5080, -4.0107, -4.8936,  4.3207],
        [-6.4232,  2.1918, -6.6846, -2.8191],
        [-5.7146, -3.5861, -4.5303,  3.6944],
        [-6.7852,  1.8795, -6.4161, -2.8221]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:46<02:51,  1.33it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04936826974153519, logits - tensor([[-5.8944,  2.1475, -6.0336, -2.3331],
        [-6.9361, -4.6112,  3.2654, -4.1143],
        [-6.1710,  2.0023, -4.2188, -1.7099],
        [-5.1819, -3.8274,  3.7930, -3.3097],
        [-6.2517,  1.2246, -5.8346, -1.4768],
        [-6.5024, -3.6880, -4.9975,  3.2364],
        [-6.4268,  1.9668, -5.9569, -2.4472],
        [-6.5051,  2.6206, -6.6288, -2.4686]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██▏       | 62/289 [00:46<02:50,  1.33it/s]

Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02686832845211029, logits - tensor([[-7.0805,  2.7276, -7.1123, -2.8988],
        [-5.7584, -4.1179,  2.9235, -3.5606],
        [-6.5584,  2.5524, -6.9360, -3.0792],
        [-7.6061, -3.5422, -5.6649,  3.0693],
        [-6.9055, -3.3563, -6.7229,  3.8584],
        [-6.4369,  2.2857, -6.2378, -1.7437],
        [-6.2955,  3.5497, -6.7780, -2.9271],
        [-5.4648, -4.0106, -5.8108,  4.2860]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 63/289 [00:47<02:50,  1.33it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03025512769818306, logits - tensor([[-5.2510, -3.3145,  3.9587, -2.6985],
        [-6.4849,  2.4450, -6.3785, -2.6091],
        [-6.3886, -3.1320, -5.3490,  3.1011],
        [-5.5398, -3.6192, -4.8830,  3.3574],
        [-6.8682, -3.4251, -5.0452,  4.5051],
        [-7.3617,  2.8329, -6.7633, -2.9031],
        [-6.9012,  2.5304, -7.5175, -2.5471],
        [-6.0023,  2.6181, -6.4897, -1.9771]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:48<02:49,  1.33it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.043543968349695206, logits - tensor([[-7.4831,  2.0301, -6.7577, -2.8580],
        [-6.6024, -3.5795, -5.3355,  3.2114],
        [-7.1112, -4.0470, -4.8148,  3.8312],
        [-5.9917, -4.1036, -5.4102,  4.7595],
        [-6.8191,  1.6261, -6.4827, -1.5188],
        [-7.5131, -1.8255, -5.3083,  1.5780],
        [-5.9201,  3.1760, -5.9719, -3.3442],
        [-6.7033, -4.1835,  2.4402, -1.7764]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:49<02:48,  1.33it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 23%|██▎       | 66/289 [00:49<02:48,  1.33it/s]

loss - 0.1698853075504303, logits - tensor([[-6.3377,  1.5268, -6.5599, -2.0935],
        [-5.0852, -2.5360,  1.2791, -2.1139],
        [-7.5508, -2.8859, -5.1194,  2.5690],
        [-6.1059,  1.6816, -6.3658, -2.2715],
        [-6.0268, -3.3143, -4.7310,  2.8933],
        [-5.5922, -1.7198, -4.7772,  2.4037],
        [-7.6003,  2.4195, -7.1491, -2.5234],
        [-6.9386,  1.8354, -7.0186, -1.5062]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1442224532365799, logits - tensor([[-7.2101,  1.8399, -6.9591, -1.7459],
        [-5.4219, -4.0098, -4.3262,  4.2547],
        [-7.0830,  2.7524, -6.4966, -2.3879],
        [-5.9841, -1.0015, -1.6362, -1.2517],
        [-5.9534, -4.1252, -5.5086,  3.6179],
        [-6.9542, -2.6512, -6.7262,  2.9950],
        [-7.0385,  0.5667, -5.7199, -0.1833],
        [-6.3407,  3.0090, -6.2158, -2.51

 23%|██▎       | 67/289 [00:50<02:46,  1.33it/s]

Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19598695635795593, logits - tensor([[-6.7261,  2.3723, -7.5481, -2.4500],
        [-5.8937,  1.6468, -5.4011, -2.2473],
        [-7.6272,  2.5209, -7.5901, -1.2403],
        [-6.2285,  1.9930, -5.8623, -1.8859],
        [-7.2643, -0.6411, -6.1212,  0.6121],
        [-6.3413, -3.1857,  2.1176, -2.3480],
        [-6.8366,  2.0350, -6.3980, -3.5344],
        [-6.6849,  2.3185, -7.1159, -2.3634]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▎       | 68/289 [00:51<02:46,  1.33it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0699618011713028, logits - tensor([[-7.1308,  2.4099, -5.5719, -2.5021],
        [-6.8294, -3.6340, -6.1983,  4.7526],
        [-7.6761,  0.9676, -6.6269, -0.0684],
        [-6.9182,  1.7054, -6.8755, -1.6298],
        [-7.3394,  2.3783, -7.6239, -3.5087],
        [-7.1430,  2.4284, -7.3732, -2.8318],
        [-7.1549,  1.7135, -6.8274, -2.1370],
        [-6.3710,  2.7798, -6.1995, -2.3076]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:52<02:45,  1.33it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.055652495473623276, logits - tensor([[-7.5038,  2.6818, -7.6010, -2.4398],
        [-7.1273,  2.7757, -7.2717, -2.7011],
        [-7.1817,  2.6347, -7.3794, -2.8748],
        [-7.6199,  1.2712, -6.9931, -1.2917],
        [-6.1129,  2.3741, -6.1714, -2.3140],
        [-6.3140,  2.1192, -6.4113, -2.7740],
        [-6.8665,  2.4425, -6.8400, -1.5033],
        [-6.9533,  1.9542, -6.8140, -2.2496]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:52<02:45,  1.32it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2247178852558136, logits - tensor([[-5.9324, -3.2145, -5.1313,  2.5560],
        [-7.4827,  1.4065, -6.7328, -1.9272],
        [-5.5618, -3.4219, -6.0653,  2.7669],
        [-5.3184, -3.5828,  2.2372, -3.4517],
        [-8.0685,  1.8120, -7.2369, -1.4602],
        [-8.1944,  0.0419, -7.0112,  0.6633],
        [-7.6534,  1.5462, -6.1712, -0.8992],
        [-7.3390,  1.9081, -6.4422, -0.9664]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:53<02:45,  1.32it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19248530268669128, logits - tensor([[-6.6274,  1.1356, -6.0733, -1.9396],
        [-5.8497,  2.3605, -7.6215, -2.3339],
        [-6.8346, -0.6142, -5.5393,  0.6377],
        [-8.0033, -1.5378, -6.0098,  2.1931],
        [-6.8474, -2.7212, -6.5919,  2.4154],
        [-5.4243, -3.7321, -4.4104,  2.8330],
        [-6.5709,  1.3928, -5.3646, -1.6340],
        [-6.7804, -1.0032, -5.6378,  0.7906]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:54<02:44,  1.32it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16385029256343842, logits - tensor([[-5.9731, -1.0363, -1.2517, -0.9035],
        [-5.1811,  2.9093, -5.0494, -3.0941],
        [-5.9796,  2.4671, -6.3399, -2.0000],
        [-6.7299, -3.6689,  2.5897, -3.3011],
        [-7.2259, -2.8549, -3.1264,  1.7827],
        [-5.5697, -1.1709, -4.3550,  1.1681],
        [-7.1434,  0.7643, -7.1675, -0.1292],
        [-6.6648,  2.5366, -7.4743, -2.8989]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:55<02:43,  1.32it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08700799942016602, logits - tensor([[-7.5303,  2.6151, -7.2436, -1.9215],
        [-6.2929, -4.2460, -4.9926,  4.7089],
        [-5.4338, -4.2167,  2.5501, -3.4506],
        [-6.2023, -2.7971, -5.0285,  2.6150],
        [-6.1752, -0.3985, -5.2277,  0.4323],
        [-6.6848,  3.0288, -6.6810, -2.3348],
        [-6.3571,  2.5023, -7.0497, -2.4039],
        [-6.9078,  3.0131, -7.4710, -2.6441]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 74/289 [00:56<02:42,  1.32it/s]

Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04960436373949051, logits - tensor([[-4.7900, -3.3695,  2.7707, -2.9222],
        [-6.6347,  2.2453, -6.6362, -2.0687],
        [-5.7440, -2.4671,  1.9437, -2.0030],
        [-6.4436,  2.9671, -6.5163, -3.0712],
        [-5.6665,  2.2649, -6.3657, -2.6349],
        [-8.0044,  1.3975, -6.9987, -2.0217],
        [-5.0978, -4.0406, -5.8700,  3.8773],
        [-6.5694,  1.9354, -6.6012, -2.7488]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 75/289 [00:56<02:42,  1.32it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.137787863612175, logits - tensor([[-6.4549,  2.6952, -7.1228, -2.7301],
        [-6.0675,  1.4165, -5.3490, -1.7366],
        [-7.0166, -1.8995, -2.3860, -0.1422],
        [-6.4066,  3.4151, -6.6731, -3.7164],
        [-6.6719,  2.2114, -7.0172, -1.9865],
        [-6.7033,  3.4808, -6.6127, -3.9965],
        [-7.4106,  0.6269, -6.1460, -0.9137],
        [-6.9702,  2.0883, -6.4045, -2.0279]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:57<02:42,  1.31it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.039202019572257996, logits - tensor([[-6.6645,  2.1413, -6.0754, -1.4807],
        [-7.2398,  3.2117, -6.2999, -2.9684],
        [-6.9804,  3.3016, -6.8280, -3.2986],
        [-7.4947,  2.6841, -6.4636, -2.6897],
        [-6.2521,  3.5305, -5.9114, -3.2159],
        [-6.2653,  1.2774, -6.5038, -1.6174],
        [-6.0781, -3.9338, -5.2715,  4.2611],
        [-6.5718,  2.9320, -6.6913, -3.3087]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 77/289 [00:58<02:40,  1.32it/s]

Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03479249030351639, logits - tensor([[-6.5719,  3.4271, -6.0804, -2.4220],
        [-6.5689,  2.7906, -6.3208, -2.0702],
        [-6.1867,  2.3940, -6.4334, -2.4798],
        [-5.5280,  3.1577, -6.8108, -2.0142],
        [-6.0028,  2.8973, -6.9864, -2.6274],
        [-6.3536,  2.9176, -6.7474, -2.8024],
        [-6.5256, -4.3983,  2.7841, -3.4469],
        [-8.1910,  2.8867, -7.0453, -2.6852]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 78/289 [00:59<02:40,  1.31it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.028827382251620293, logits - tensor([[-4.7942, -3.5508,  3.0466, -3.3860],
        [-5.0980,  2.2788, -6.1989, -2.7277],
        [-6.0394, -3.4332, -4.5995,  2.7101],
        [-6.4309,  2.2261, -6.8964, -2.8151],
        [-5.8104, -4.0825,  3.1627, -3.0671],
        [-6.0374, -2.7249, -5.2854,  3.0701],
        [-6.0179, -3.8716,  3.0980, -2.9976],
        [-6.2446, -4.3141, -5.8232,  4.7798]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [00:59<02:39,  1.32it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09487824141979218, logits - tensor([[-6.4321,  3.6699, -6.5774, -2.7873],
        [-7.6575,  2.8748, -7.6302, -3.5346],
        [-7.1351,  2.8816, -6.9072, -2.5015],
        [-6.8425,  3.0291, -7.0820, -2.7754],
        [-6.5749, -4.0065, -0.6968,  0.2360],
        [-6.5904,  1.5315, -6.4372, -1.3626],
        [-6.2377,  3.0755, -7.2538, -3.4194],
        [-6.3148,  2.4026, -5.7310, -2.4597]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [01:00<02:38,  1.32it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3311918079853058, logits - tensor([[-7.5400, -1.5061, -5.9672,  2.2139],
        [-7.1207,  2.7539, -6.2796, -2.4816],
        [-6.7847,  3.1329, -6.8383, -2.9096],
        [-6.6279,  2.9439, -6.0914, -3.3157],
        [-6.8117,  2.5880, -7.3557, -2.4025],
        [-6.7406,  1.1640, -7.4888, -1.4929],
        [-6.2909,  2.4723, -6.8522, -2.3887],
        [-5.7002, -4.3025,  1.9548, -3.0935]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [01:01<02:37,  1.32it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04019171744585037, logits - tensor([[-6.7431, -4.2328, -5.6906,  3.6527],
        [-7.4283, -0.5524, -6.1410,  1.3306],
        [-6.2575, -3.4267, -5.4717,  3.6975],
        [-6.8359,  3.2825, -7.3589, -2.8157],
        [-5.5045, -3.7743, -5.0500,  5.0693],
        [-6.6832,  3.1115, -7.0643, -2.8023],
        [-5.8679, -4.9799,  3.0418, -2.8756],
        [-6.5214,  2.5578, -6.5334, -2.8229]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:02<02:36,  1.32it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2705156207084656, logits - tensor([[-7.6032,  1.4897, -7.1782, -1.2315],
        [-8.2530,  3.7509, -7.6577, -2.1514],
        [-4.5970, -3.3907,  3.0649, -3.1375],
        [-7.8445,  2.6820, -6.8489, -1.8054],
        [-6.4196,  3.5671, -7.1404, -3.7648],
        [-6.2114,  3.7371, -5.8774, -2.9011],
        [-7.2834,  2.5344, -7.5294, -1.6604],
        [-6.9709,  2.8150, -6.6806, -2.8682]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▊       | 83/289 [01:02<02:36,  1.32it/s]

Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1787794828414917, logits - tensor([[-6.7688,  3.4713, -7.1215, -3.4970],
        [-6.6267, -1.7466, -5.6600,  1.6074],
        [-6.0160,  3.3311, -5.9624, -2.3378],
        [-6.6165,  2.9135, -6.4735, -2.7602],
        [-5.5044, -2.6502,  1.8179, -2.6103],
        [-6.4441,  3.1382, -6.8249, -2.8859],
        [-6.0877,  3.0964, -6.8694, -2.8301],
        [-5.6425,  2.6215, -5.9366, -2.8106]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 84/289 [01:03<02:35,  1.32it/s]

Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18576738238334656, logits - tensor([[-7.1588,  1.9656, -6.6784, -2.9136],
        [-6.5728,  1.9964, -5.6215, -1.4493],
        [-6.4000, -3.5617, -5.6705,  5.3219],
        [-6.9237,  2.2148, -6.4796, -2.5366],
        [-7.4533, -4.4259, -5.9177,  4.5122],
        [-5.7651,  2.8616, -6.7656, -2.1562],
        [-7.1679,  2.4127, -6.4293, -2.8743],
        [-5.7743, -4.4783,  2.9812, -3.1674]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 85/289 [01:04<02:34,  1.32it/s]

Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1288541555404663, logits - tensor([[-6.6719,  2.4108, -6.4421, -1.8153],
        [-6.6382,  2.5908, -6.9296, -2.5397],
        [-7.4745,  2.8391, -6.8997, -3.5175],
        [-7.5362, -2.9330, -6.5469,  2.5581],
        [-6.5183, -1.2536, -6.4060,  1.3770],
        [-7.5571,  3.1930, -7.3035, -2.5367],
        [-5.9475,  2.7228, -6.4457, -1.9407],
        [-6.1394,  2.8194, -6.4920, -3.6319]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|██▉       | 86/289 [01:05<02:34,  1.32it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 30%|███       | 87/289 [01:05<02:34,  1.31it/s]

loss - 0.1806761771440506, logits - tensor([[-6.2003, -4.3008, -5.8716,  5.2977],
        [-6.5378,  2.8280, -5.5969, -3.0741],
        [-5.9876, -3.9235, -5.8933,  4.1299],
        [-6.2718, -3.4295, -5.7014,  2.7844],
        [-6.9419,  2.3615, -7.2385, -2.7099],
        [-6.0503,  3.6197, -6.4409, -3.2705],
        [-6.3952,  3.1187, -6.8772, -2.4417],
        [-7.2459,  3.3145, -7.2355, -2.9520]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24590513110160828, logits - tensor([[-7.4813,  3.4362, -7.4417, -2.4984],
        [-6.1697,  2.2345, -5.9911, -2.1996],
        [-7.1462,  1.8219, -6.7468, -1.4970],
        [-7.4228,  2.9156, -6.7894, -2.0422],
        [-6.3798,  2.9012, -6.2031, -2.2926],
        [-6.3873,  0.0900, -6.0295,  0.2612],
        [-6.4088, -4.2368,  2.2215, -2.5122],
        [-7.5782, -1.7842, -6.3399,  1.3

 30%|███       | 88/289 [01:06<02:33,  1.31it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07057855278253555, logits - tensor([[-7.3129e+00,  2.5356e+00, -7.2424e+00, -3.0742e+00],
        [-7.2960e+00,  2.6915e+00, -6.7752e+00, -3.0313e+00],
        [-6.0233e+00,  5.6279e-03, -5.2933e+00,  6.5595e-01],
        [-6.4503e+00,  3.6811e+00, -6.4521e+00, -2.8436e+00],
        [-6.7986e+00,  2.7423e+00, -7.9275e+00, -2.4792e+00],
        [-6.6973e+00,  1.8338e+00, -6.9566e+00, -1.1514e+00],
        [-6.4755e+00,  2.4601e+00, -5.7485e+00, -3.2926e+00],
        [-6.0137e+00,  2.9416e+00, -5.7439e+00, -2.6957e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:07<02:33,  1.30it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3096916377544403, logits - tensor([[-7.0876, -2.6865, -6.2860,  3.2799],
        [-7.0448,  3.0714, -6.8981, -3.0597],
        [-6.5735,  2.7276, -6.2554, -2.6372],
        [-6.2533, -1.7944, -5.5834,  0.8616],
        [-5.1296, -3.4977,  2.6976, -2.5005],
        [-6.8926,  2.8122, -7.3936, -2.0779],
        [-5.7093, -3.0024,  0.8655, -1.9630],
        [-5.4204,  2.7321, -6.2031, -2.5415]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:08<02:32,  1.30it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.040704868733882904, logits - tensor([[-6.8420, -2.3451, -6.1843,  2.6918],
        [-6.7005,  2.0323, -7.2181, -2.5156],
        [-4.7460, -3.1688,  3.2006, -4.0054],
        [-6.0682, -2.2883, -7.3401,  1.3766],
        [-6.7665,  2.7256, -7.4453, -3.4573],
        [-6.1710, -3.9477,  3.0432, -3.2668],
        [-6.5419, -3.2039, -5.5424,  4.2533],
        [-6.5070,  1.7733, -6.0132, -2.4147]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:08<02:31,  1.30it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06412400305271149, logits - tensor([[-5.6121, -4.2912,  2.6780, -3.4099],
        [-6.9332, -3.2593,  2.5993, -2.2664],
        [-6.6981,  2.0733, -6.2221, -2.0091],
        [-5.4848,  2.7271, -6.0574, -2.1610],
        [-7.2551,  2.4455, -7.2268, -1.9039],
        [-6.2653,  2.9352, -5.5801, -2.6039],
        [-6.4830, -0.6313, -5.7219,  1.3031],
        [-7.3246,  1.6894, -7.1725, -2.1952]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:09<02:30,  1.31it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08178886026144028, logits - tensor([[-7.5637, -0.0947, -6.0007, -0.5116],
        [-6.7656,  3.4567, -7.1237, -3.2296],
        [-6.5956, -1.3495, -5.7882,  1.5657],
        [-4.9990, -4.6455,  4.0464, -3.5290],
        [-6.3616,  2.1556, -6.5389, -2.8248],
        [-7.3387,  2.8715, -6.7476, -3.0861],
        [-6.8452,  2.5358, -6.2091, -3.1435],
        [-6.1902, -4.6537, -5.0822,  4.1433]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 93/289 [01:10<02:29,  1.31it/s]

Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.049395717680454254, logits - tensor([[-6.0523, -3.0687, -5.6762,  2.7961],
        [-6.8115,  3.5410, -7.1163, -2.4017],
        [-6.1129,  1.7472, -6.0192, -1.9447],
        [-5.0569, -3.8704,  2.1040, -2.8951],
        [-7.6763,  2.0382, -6.3519, -1.3211],
        [-6.9811,  2.2315, -6.4561, -1.9782],
        [-5.1674,  2.3171, -6.3119, -2.5034],
        [-6.0319,  3.1545, -7.5015, -3.2488]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 94/289 [01:11<02:28,  1.31it/s]

Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.027534594759345055, logits - tensor([[-6.5025,  1.9111, -6.4549, -2.4662],
        [-5.0827, -3.4022, -5.8794,  4.2480],
        [-6.8492,  3.1222, -6.7590, -3.3378],
        [-6.1270,  2.7456, -5.5103, -2.7588],
        [-7.0766, -3.4837, -6.0662,  2.9583],
        [-6.6492,  2.9543, -6.2685, -2.5614],
        [-7.1697,  3.0921, -7.4687, -2.8506],
        [-6.2693,  3.2450, -6.8449, -3.2766]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 95/289 [01:12<02:26,  1.32it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03986694663763046, logits - tensor([[-5.5144, -4.4739,  2.7456, -3.3710],
        [-6.7195, -3.8483, -5.8894,  3.7905],
        [-6.5472,  2.5078, -5.4303, -2.5004],
        [-6.5668, -2.4639, -5.3160,  1.9437],
        [-6.1004, -4.1236,  1.9853, -2.9988],
        [-7.2793,  2.5512, -7.8485, -2.4296],
        [-6.6981, -3.5063,  2.3865, -2.1950],
        [-7.1544, -2.3660, -6.3476,  2.7936]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:12<02:26,  1.32it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03757766634225845, logits - tensor([[-6.3483,  1.9358, -6.8709, -2.9037],
        [-6.2347,  4.0100, -6.1241, -2.8146],
        [-6.3224,  2.2333, -5.5987, -2.9852],
        [-6.1813,  2.9540, -6.5281, -2.5556],
        [-5.1522, -3.6557,  2.4140, -2.8843],
        [-7.4317, -4.5433,  2.8122, -3.2065],
        [-6.0413, -3.6095,  2.1361, -2.1150],
        [-7.3064,  2.7613, -7.4912, -3.0169]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▎      | 97/289 [01:13<02:25,  1.32it/s]

Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17667770385742188, logits - tensor([[-5.7821,  2.2514, -6.2533, -2.0937],
        [-7.4417,  1.3653, -7.0040, -1.0963],
        [-6.2921,  2.6881, -5.8949, -2.8911],
        [-7.0952,  3.5667, -7.5022, -3.9368],
        [-6.6537,  0.6332, -6.3453, -0.2539],
        [-6.2192,  2.7838, -5.6195, -2.0562],
        [-6.3326, -4.4262,  3.1939, -2.9977],
        [-6.1400,  3.1811, -6.8105, -2.8190]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 98/289 [01:14<02:25,  1.32it/s]

Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29351747035980225, logits - tensor([[-7.1265,  0.0807, -6.3836,  0.0795],
        [-7.1103,  2.7760, -6.1575, -2.5632],
        [-7.1431, -0.5494, -6.9895,  2.0778],
        [-7.0412,  1.1983, -6.3333, -1.0186],
        [-5.5210, -3.7979,  2.7977, -3.4255],
        [-6.2228,  3.1387, -6.2923, -2.5986],
        [-6.3262,  2.5226, -6.7402, -3.5333],
        [-7.6620,  2.4812, -6.5923, -2.7120]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 99/289 [01:15<02:24,  1.31it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2435823380947113, logits - tensor([[-7.5977,  2.7049, -6.5526, -2.5492],
        [-5.0365, -3.5218, -5.9266,  3.9515],
        [-6.8738,  0.4792, -5.9226, -0.0207],
        [-6.2020, -3.6967, -5.9001,  3.2165],
        [-6.2891,  3.0208, -6.3089, -2.1496],
        [-4.6814, -2.8624, -6.1230,  2.5001],
        [-6.8269, -3.6748,  3.2568, -3.2251],
        [-6.7147, -3.0341, -5.0742,  3.4506]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:15<02:23,  1.32it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1071712002158165, logits - tensor([[-7.6523,  1.5502, -7.6813, -1.2947],
        [-6.1787,  2.0130, -6.7895, -2.1560],
        [-5.8067,  3.0002, -6.7091, -2.3277],
        [-6.3272, -3.8697, -5.7306,  3.4332],
        [-6.9303,  2.8045, -6.9446, -2.3284],
        [-7.0189,  3.6214, -7.3953, -3.6713],
        [-7.6813,  1.8670, -6.6873, -1.1544],
        [-6.9425, -1.0014, -1.0462, -0.9974]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:16<02:23,  1.31it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05009056255221367, logits - tensor([[-6.2115, -3.6497, -5.2724,  3.3193],
        [-8.0721,  1.7167, -7.4155, -1.3450],
        [-6.5642, -2.6531, -5.3361,  3.2382],
        [-6.1810, -4.1933, -5.5393,  3.3580],
        [-5.9950,  2.0088, -6.0369, -3.4065],
        [-7.6282,  2.5541, -7.7964, -1.9158],
        [-6.6686,  1.8639, -6.8371, -1.3929],
        [-6.5211,  1.7289, -6.5633, -2.8207]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:17<02:22,  1.32it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05217849463224411, logits - tensor([[-6.7935,  2.2582, -6.6449, -2.5461],
        [-5.4092,  3.1114, -6.0813, -3.1049],
        [-7.8639,  1.0094, -7.3704, -1.3874],
        [-6.6213, -2.5990, -4.7655,  3.2823],
        [-6.7209,  2.6154, -7.3555, -2.8644],
        [-7.6582, -1.9073, -6.7766,  1.0657],
        [-6.1937, -3.3411, -5.8872,  3.2248],
        [-5.3573,  2.9636, -6.1095, -3.0704]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:18<02:21,  1.32it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0537547767162323, logits - tensor([[-5.7506, -3.9674,  2.8898, -2.5458],
        [-6.4732, -3.6796, -6.1844,  2.8433],
        [-7.1537,  3.2342, -6.8641, -1.7315],
        [-6.7971,  2.9731, -6.9196, -2.1943],
        [-6.3606,  2.3136, -6.5633, -2.3162],
        [-7.2274,  1.0323, -6.4278, -0.9341],
        [-5.1989, -3.5113,  1.9904, -2.7585],
        [-6.7299, -3.1209, -5.5466,  3.9150]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:18<02:19,  1.32it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 36%|███▋      | 105/289 [01:19<02:19,  1.32it/s]

loss - 0.18889349699020386, logits - tensor([[-6.3837,  1.9741, -6.9353, -1.6030],
        [-5.8089,  2.2044, -6.3829, -2.3751],
        [-6.1875,  2.0636, -6.5114, -2.2907],
        [-6.6853,  2.0126, -6.5756, -2.2878],
        [-6.3595, -3.1351, -4.5473,  2.9131],
        [-6.4781,  2.6303, -6.5480, -2.4275],
        [-6.8440,  2.4677, -5.9548, -2.8333],
        [-6.9132,  2.7739, -6.8972, -3.2803]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09450718760490417, logits - tensor([[-7.3718,  3.9014, -6.6509, -3.5544],
        [-7.6308,  2.1508, -6.8920, -1.9480],
        [-6.4727,  2.6870, -6.4950, -2.5197],
        [-6.1144,  1.3168, -5.8149, -1.7776],
        [-5.0346, -4.1956,  2.7973, -3.1250],
        [-7.6626,  0.2821, -6.5037,  0.7758],
        [-6.7055,  3.0996, -6.3454, -2.7937],
        [-8.7987,  1.7578, -7.8184, -2

 37%|███▋      | 106/289 [01:20<02:18,  1.32it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.044306978583335876, logits - tensor([[-6.3646,  2.8407, -6.3616, -1.9059],
        [-6.2088, -3.3588,  2.2579, -2.2651],
        [-6.4068,  2.6756, -5.6959, -3.3847],
        [-7.3023,  1.7552, -7.1931, -1.8761],
        [-5.3458,  2.4773, -6.4225, -2.2928],
        [-7.3393,  2.5313, -7.7849, -2.1221],
        [-6.0744, -4.1987,  3.2753, -3.1924],
        [-6.4789, -4.0028,  3.2378, -2.9788]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:21<02:18,  1.32it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 37%|███▋      | 108/289 [01:21<02:17,  1.32it/s]

loss - 0.048992007970809937, logits - tensor([[-6.0052, -3.3722,  1.8646, -2.0636],
        [-6.0199,  2.6659, -5.7984, -2.6818],
        [-5.2755,  2.5507, -4.9374, -3.3679],
        [-6.7797, -3.9712, -5.9841,  2.7805],
        [-6.6310,  2.3476, -6.5169, -3.2800],
        [-5.3897,  1.8203, -5.3360, -1.9652],
        [-5.8691,  2.8885, -6.0251, -1.9877],
        [-6.7132,  1.8369, -6.6113, -1.7151]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3643651604652405, logits - tensor([[-6.5591,  1.0953, -6.6224, -0.5493],
        [-5.7262, -3.1724,  2.4418, -2.5351],
        [-6.9561,  0.1635, -6.2435,  0.3818],
        [-6.1686, -5.3300,  3.4142, -3.9109],
        [-6.4516, -4.4355, -5.1443,  4.4023],
        [-6.3101,  2.9752, -6.1331, -2.3722],
        [-7.1377,  3.1425, -7.4103, -2.5841],
        [-6.5221,  3.6386, -6.6662, -2

 38%|███▊      | 109/289 [01:22<02:16,  1.32it/s]

Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05022657662630081, logits - tensor([[-7.8305,  2.5304, -7.1654, -2.1351],
        [-6.2316,  2.0324, -5.6281, -2.8207],
        [-6.7199,  3.4073, -7.5960, -2.4715],
        [-6.0465,  2.8929, -6.1126, -2.7473],
        [-6.0363,  3.1666, -6.7372, -2.8660],
        [-6.9650,  3.3378, -6.8049, -3.4176],
        [-6.4726,  1.9853, -6.1829, -1.9446],
        [-6.3411, -1.0319, -5.4351,  1.2640]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 110/289 [01:23<02:15,  1.32it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2930125892162323, logits - tensor([[-6.3433, -3.7121, -4.2786,  3.2374],
        [-6.3597, -4.1752,  1.8462, -3.1687],
        [-6.9957,  3.0585, -6.6505, -2.6052],
        [-6.1266, -3.2442,  1.0590, -1.8556],
        [-7.2106, -3.2722, -6.7417,  3.3467],
        [-6.8156,  2.4674, -6.9686, -1.7368],
        [-5.9867, -3.9867,  3.0587, -2.8199],
        [-6.3525, -3.4652,  3.7786, -2.7499]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:24<02:14,  1.32it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23204655945301056, logits - tensor([[-6.3262,  3.0683, -6.5265, -3.9052],
        [-6.9887,  2.1145, -6.9981, -1.8350],
        [-6.5467,  2.9229, -6.8856, -3.0831],
        [-7.6641,  3.2042, -6.0717, -3.0847],
        [-5.4718,  3.4023, -7.0295, -3.1412],
        [-7.1125,  0.9405, -6.7740, -1.0470],
        [-7.8872, -0.2623, -7.4880,  0.9170],
        [-6.3355,  1.4760, -4.9976,  0.6254]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:24<02:14,  1.32it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04103967547416687, logits - tensor([[-5.6782, -2.5123, -6.1186,  3.0986],
        [-5.9138,  3.4095, -6.5670, -3.0668],
        [-5.5542, -2.4176,  1.7531, -1.8606],
        [-5.6365,  3.3350, -4.7449, -3.2450],
        [-7.2749, -2.0018, -6.6697,  1.6948],
        [-7.1401,  2.9292, -6.2564, -3.1801],
        [-7.2844,  2.5015, -7.1272, -2.4787],
        [-6.1857,  3.2798, -6.1137, -3.5053]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:25<02:13,  1.32it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4290229380130768, logits - tensor([[-7.0996, -2.7054, -5.8728,  2.6071],
        [-5.9328,  3.4472, -7.1824, -3.8512],
        [-6.1800, -3.5219, -5.0064,  3.5696],
        [-6.3911,  2.2103, -6.3428, -3.2740],
        [-6.0302,  2.0876, -7.2199, -1.5677],
        [-6.4079,  2.8592, -6.6562, -3.1214],
        [-6.1495,  2.6438, -5.1627, -2.9485],
        [-6.5934,  2.2035, -6.7797, -2.6802]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:26<02:12,  1.32it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02222273126244545, logits - tensor([[-7.4216, -3.6204, -5.8453,  3.4752],
        [-6.4030,  3.3330, -6.3269, -3.4217],
        [-5.1426,  3.7022, -6.0492, -3.3419],
        [-6.9966,  3.6977, -7.0492, -4.0384],
        [-6.8221,  3.1143, -5.6259, -2.6133],
        [-5.4940,  2.7554, -5.8066, -3.1004],
        [-6.0526,  3.3267, -5.9595, -3.4532],
        [-5.0399, -3.6042,  2.6808, -2.6778]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:27<02:11,  1.32it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5018028020858765, logits - tensor([[-6.2070,  1.8899, -5.0064, -1.9671],
        [-7.4809,  4.0584, -7.3366, -4.2656],
        [-6.9764, -2.7625, -6.1350,  2.8576],
        [-6.3391, -2.3966, -6.1739,  2.4199],
        [-6.1749,  3.7343, -6.0218, -3.0515],
        [-7.5619,  1.1104, -6.8372, -1.3709],
        [-7.1681,  3.2503, -6.8190, -3.3711],
        [-6.4934,  2.6155, -5.9861, -2.9424]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:27<02:11,  1.32it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.035752810537815094, logits - tensor([[-6.4957, -4.0117, -5.6156,  4.0443],
        [-6.5383,  2.8788, -7.0524, -3.1594],
        [-6.1226,  3.0123, -6.4940, -3.4695],
        [-6.5689,  2.6600, -6.7814, -3.3690],
        [-5.3490,  3.4084, -5.4188, -3.3978],
        [-6.5747,  1.2482, -5.7979, -1.1455],
        [-5.3787,  2.8769, -5.7727, -2.2724],
        [-7.0053,  3.7790, -7.1012, -3.6261]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:28<02:10,  1.32it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21307091414928436, logits - tensor([[-6.8767, -4.6684,  3.2871, -3.7286],
        [-6.7109,  2.9355, -6.6796, -2.9053],
        [-6.9100, -2.8018, -6.5026,  1.9551],
        [-7.8067,  3.0592, -6.6112, -2.5838],
        [-7.6798,  2.8517, -7.8904, -2.8820],
        [-5.2831,  2.5457, -5.3115, -3.5580],
        [-6.8493, -2.1592, -6.3503,  2.3223],
        [-6.4133,  3.6065, -6.4931, -3.7017]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:29<02:09,  1.32it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05095621943473816, logits - tensor([[-6.5586,  3.0668, -6.0687, -3.1643],
        [-6.4620,  2.6161, -5.9332, -3.1208],
        [-6.8591, -0.7041, -6.0875,  0.6863],
        [-6.3587,  3.8796, -6.3929, -3.7215],
        [-6.8080,  1.9973, -6.1223, -1.8617],
        [-6.7315,  3.4753, -5.3223, -3.4008],
        [-5.6386, -3.8269,  2.5165, -2.9433],
        [-5.9864, -3.4930, -5.3922,  3.6387]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:30<02:08,  1.32it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1189635619521141, logits - tensor([[-7.6082,  2.7220, -6.9583, -2.7661],
        [-6.9564,  0.7251, -7.0641, -1.0435],
        [-6.5390,  2.5895, -6.9894, -2.9596],
        [-7.3181,  2.5888, -6.8774, -2.5422],
        [-6.6726,  3.2946, -6.6843, -3.4086],
        [-6.5560, -2.4748,  0.5853, -1.9333],
        [-6.9450,  4.2284, -6.9262, -3.8337],
        [-6.5599,  2.3850, -7.2250, -2.5002]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:30<02:08,  1.32it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08077140152454376, logits - tensor([[-7.0775, -3.3015, -6.3911,  3.1053],
        [-5.6492,  2.9323, -6.0320, -2.9557],
        [-6.5178,  2.7065, -7.2009, -3.3577],
        [-6.8062,  2.6975, -6.7477, -3.2606],
        [-7.0224, -0.2202, -6.2652,  0.2469],
        [-5.2996,  3.0282, -5.5497, -3.4351],
        [-6.5299, -2.1945, -5.6068,  1.5270],
        [-6.8816,  2.4746, -6.7439, -2.5817]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:31<02:07,  1.31it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02267712913453579, logits - tensor([[-6.9683,  2.7508, -6.7074, -2.6154],
        [-6.1268, -3.5679,  2.7806, -2.5191],
        [-6.2150, -4.9339,  3.1296, -3.9871],
        [-5.6136,  4.3863, -6.2560, -3.3925],
        [-5.3319, -3.6286,  3.6604, -2.6166],
        [-6.4763,  3.5698, -6.5059, -3.4759],
        [-6.1425,  3.5872, -6.1345, -3.5414],
        [-6.3692,  3.7445, -5.7717, -3.6578]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 122/289 [01:32<02:06,  1.32it/s]

Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1537499725818634, logits - tensor([[-6.4483, -4.1785,  2.8266, -2.6256],
        [-6.4746, -3.0881, -6.0676,  2.6684],
        [-6.3470,  3.6841, -6.7009, -4.0218],
        [-6.5160, -2.9520,  0.7297, -0.9156],
        [-7.0605,  2.8514, -6.2585, -2.4072],
        [-7.2747,  3.2807, -6.3791, -3.8887],
        [-6.9177,  0.2000, -6.8381, -0.1786],
        [-5.6174,  1.4805, -5.4961, -1.6060]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 123/289 [01:33<02:05,  1.32it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17670930922031403, logits - tensor([[-5.3853,  3.1107, -5.7675, -2.1287],
        [-6.4554,  3.1087, -6.4622, -3.0986],
        [-5.8992, -4.6440, -5.3501,  4.2476],
        [-6.9810,  3.7444, -7.0878, -3.2902],
        [-6.6526,  2.5126, -6.7961, -2.4388],
        [-6.4598,  2.9636, -6.3969, -3.7010],
        [-5.5168,  3.7750, -6.4582, -3.7024],
        [-6.4586, -3.9994, -5.6986,  3.2503]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:33<02:04,  1.32it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2684940993785858, logits - tensor([[-5.1746e+00,  2.4754e+00, -5.0827e+00, -3.7641e+00],
        [-6.5803e+00, -3.6979e-01, -6.5908e+00,  2.7375e-03],
        [-6.1612e+00, -3.3660e+00,  2.0445e+00, -2.4893e+00],
        [-6.4401e+00,  3.3285e+00, -6.0243e+00, -2.7216e+00],
        [-7.3764e+00,  2.1596e+00, -7.0293e+00, -2.2861e+00],
        [-6.5385e+00, -5.1267e+00,  1.7201e+00, -2.0280e+00],
        [-6.0694e+00,  3.1665e+00, -6.2535e+00, -3.3773e+00],
        [-5.8384e+00,  3.0131e+00, -5.8612e+00, -3.1858e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:34<02:04,  1.32it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.053329918533563614, logits - tensor([[-6.7284,  1.3832, -7.1697, -0.8179],
        [-6.8567,  2.3165, -7.1263, -2.7337],
        [-6.0802, -3.9587, -5.6858,  4.5526],
        [-6.0344,  2.3158, -6.2154, -1.9679],
        [-5.9673,  2.6659, -6.4287, -2.7546],
        [-6.0963,  3.2240, -5.8307, -3.5663],
        [-7.2211,  2.5148, -7.5386, -2.3636],
        [-4.4436, -2.5802,  2.2344, -1.9624]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:35<02:03,  1.32it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21555978059768677, logits - tensor([[-6.5798, -2.5857, -5.3100,  2.1530],
        [-5.6790,  1.8592, -6.6867, -2.2538],
        [-7.9738,  3.1232, -7.9140, -2.3483],
        [-7.7052, -0.4135, -6.3728,  0.2097],
        [-6.9215, -3.1139, -6.4632,  4.5606],
        [-7.7590,  3.3422, -6.8798, -3.7736],
        [-5.6405, -3.6478,  2.9295, -1.9503],
        [-7.2310,  2.2982, -6.3184, -2.8320]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:36<02:02,  1.32it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29420313239097595, logits - tensor([[-6.3789,  2.3330, -5.3070, -2.3100],
        [-5.6717,  3.3362, -7.2832, -3.4172],
        [-6.4187,  2.6286, -6.9631, -2.4614],
        [-6.1263, -3.9763,  3.1391, -3.2708],
        [-6.1703,  2.7314, -7.4085, -3.5419],
        [-6.8925,  1.1128, -6.6875, -0.5365],
        [-5.5150, -3.6556, -5.3027,  3.1925],
        [-6.0937,  3.7399, -5.4214, -3.6364]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:37<02:02,  1.32it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20345915853977203, logits - tensor([[-7.4928,  2.5292, -8.2006, -3.2583],
        [-7.6929,  3.1775, -6.5803, -3.0120],
        [-6.2338,  2.1273, -6.5903, -2.3498],
        [-5.5577, -3.3554,  1.9440, -2.4829],
        [-6.8220, -2.3383, -6.7548,  2.5873],
        [-4.8526,  3.3522, -5.2835, -3.5648],
        [-6.0286, -2.4302, -5.1811,  2.6618],
        [-7.8220,  3.2625, -7.5074, -2.0178]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:37<02:01,  1.32it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08010943233966827, logits - tensor([[-7.7223,  2.4756, -6.7837, -2.9335],
        [-6.5138,  1.7617, -6.8222, -2.8752],
        [-6.4458,  3.1693, -7.0698, -3.8741],
        [-6.3966, -4.6369,  3.1206, -2.7733],
        [-6.1946,  2.7095, -6.6570, -2.7016],
        [-6.7037,  0.5417, -5.3388, -0.1198],
        [-7.5272, -3.8325, -5.3637,  3.2293],
        [-7.0623,  3.1824, -7.0111, -3.4102]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 130/289 [01:38<02:00,  1.32it/s]

Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0436319038271904, logits - tensor([[-7.5099,  3.1971, -7.7356, -3.0666],
        [-5.8449,  3.1471, -6.6373, -3.3176],
        [-7.4114,  1.8353, -6.4258, -0.2968],
        [-7.5420,  2.4232, -8.0809, -2.6366],
        [-6.6204, -4.0886,  2.9477, -3.0265],
        [-7.6403,  3.4279, -6.8354, -3.1204],
        [-6.8783, -4.2230, -5.2331,  3.9489],
        [-6.1001, -3.3880, -4.1165,  2.4584]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▌     | 131/289 [01:39<02:00,  1.32it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16114526987075806, logits - tensor([[-7.3612,  3.2199, -7.6786, -4.0590],
        [-6.9216,  1.9472, -6.8643, -2.4276],
        [-6.9943, -1.4336, -7.0713,  1.9532],
        [-7.3147,  3.0922, -6.8138, -1.9666],
        [-7.2293,  1.7497, -6.7100, -1.4306],
        [-5.6997, -4.4044,  2.3803, -3.4748],
        [-7.7736,  2.0253, -7.7667, -1.2685],
        [-6.2017, -3.9422,  2.5313, -2.5045]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:40<01:59,  1.31it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05668513476848602, logits - tensor([[-7.5577,  2.9635, -6.5703, -2.4314],
        [-6.4397, -1.9343, -6.7233,  2.8167],
        [-5.6796, -4.0308,  2.7866, -3.7447],
        [-8.1133,  1.8912, -7.3769, -1.5826],
        [-6.3219,  2.9069, -6.2986, -2.7790],
        [-5.8379,  3.1713, -6.6387, -3.4563],
        [-6.3235, -3.6437,  1.2577, -1.0741],
        [-7.4062,  1.5997, -6.8354, -2.2709]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 133/289 [01:40<01:58,  1.32it/s]

Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05968226119875908, logits - tensor([[-8.9560,  2.1959, -7.0832, -2.2877],
        [-6.9942,  0.7983, -7.0401, -0.9662],
        [-5.7921,  1.9097, -6.0826, -2.8879],
        [-7.0019, -2.5415, -5.6012,  1.8327],
        [-6.1840, -2.7229, -6.1873,  3.5395],
        [-6.6738, -2.3020, -5.2597,  2.5330],
        [-7.0968,  1.9221, -7.1006, -2.1493],
        [-6.6549, -3.5664, -5.5417,  3.5517]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▋     | 134/289 [01:41<01:57,  1.31it/s]

Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.045843400061130524, logits - tensor([[-6.5053,  2.9624, -6.4198, -2.7336],
        [-7.3853,  1.9457, -6.9282, -1.5095],
        [-6.2862, -2.9462, -5.3478,  3.6994],
        [-6.6116,  2.5657, -6.1561, -1.8887],
        [-7.0915,  1.8042, -6.7439, -1.9853],
        [-4.9705, -3.2085,  3.3965, -2.4728],
        [-6.2071,  2.4897, -5.6604, -3.6509],
        [-7.5201,  2.7764, -6.9449, -2.2651]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 135/289 [01:42<01:57,  1.31it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06522338092327118, logits - tensor([[-8.0930,  2.4340, -7.5763, -2.0234],
        [-6.6589, -3.1882,  1.6631, -2.1819],
        [-7.8744, -4.5379, -7.2159,  4.0413],
        [-8.0331,  3.0544, -7.1287, -2.1774],
        [-7.2244, -3.2583, -5.6685,  3.7341],
        [-6.1059, -4.0934,  2.4109, -3.1541],
        [-6.5727,  0.5272, -6.0625, -0.3385],
        [-5.8205, -3.6768,  2.7919, -2.6703]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:43<01:56,  1.31it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24036705493927002, logits - tensor([[-5.8526, -3.6664,  2.6991, -1.9783],
        [-6.9112, -4.1848,  0.0794, -1.0374],
        [-5.8095,  2.8523, -5.9734, -2.7404],
        [-6.4289,  3.3287, -7.0931, -2.6611],
        [-6.7782,  2.4719, -6.6887, -2.8267],
        [-8.5066,  2.5131, -7.2190, -1.7303],
        [-6.9221,  1.0076, -5.8245, -0.8520],
        [-7.1576, -1.0870, -5.6385,  1.3035]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 137/289 [01:43<01:55,  1.31it/s]

Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11494753509759903, logits - tensor([[-4.9962, -3.9608, -5.4929,  4.2479],
        [-6.4518, -4.4383,  2.5106, -4.3078],
        [-6.9140, -2.6206, -6.1184,  2.0558],
        [-6.8518,  2.9639, -7.1456, -2.7437],
        [-7.1951, -1.1917, -6.6882,  1.1766],
        [-6.5991,  3.6603, -6.5655, -4.1465],
        [-7.7024,  1.9416, -6.7846, -2.2463],
        [-6.3683, -4.6980, -4.4548,  4.2250]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 138/289 [01:44<01:54,  1.32it/s]

Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18923954665660858, logits - tensor([[-6.7894,  0.8843, -6.5495, -0.6702],
        [-6.9517,  1.8884, -7.8787, -1.9043],
        [-6.3369,  2.2587, -6.6222, -2.3034],
        [-7.0058, -2.9241, -6.1371,  2.9687],
        [-7.4256,  2.5664, -6.9900, -3.0518],
        [-6.4719,  1.2847, -5.6899, -1.5618],
        [-6.5336,  2.4165, -7.3796, -2.5196],
        [-7.4010,  2.1954, -7.0752, -2.4354]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 139/289 [01:45<01:53,  1.32it/s]

Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24168024957180023, logits - tensor([[-6.2184,  0.5705, -6.0883,  0.1112],
        [-6.2072,  2.9626, -5.6033, -2.6285],
        [-5.4676,  3.4618, -6.5428, -3.0213],
        [-7.0814,  2.9347, -6.6473, -2.3713],
        [-7.3172,  1.5750, -6.9294, -1.4848],
        [-6.6821,  2.7997, -7.5961, -3.1048],
        [-6.6407,  2.2090, -6.1214, -2.5328],
        [-6.7833, -2.7081, -6.1446,  2.0464]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 140/289 [01:46<01:52,  1.33it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2441815733909607, logits - tensor([[-6.1244, -3.5217,  1.9650, -2.5452],
        [-4.5379,  2.7288, -5.7909, -3.4080],
        [-6.5813, -3.7973, -4.6634,  3.6882],
        [-6.6789,  1.9692, -6.9496, -2.8341],
        [-6.8154,  0.0405, -6.1460, -0.3375],
        [-5.3529, -4.0385,  1.4018, -2.2442],
        [-6.4064, -3.4730, -6.0710,  3.0518],
        [-6.5057, -2.9304, -5.3304,  3.3357]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:46<01:51,  1.33it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04826267063617706, logits - tensor([[-7.3789,  1.4613, -6.0834, -1.9567],
        [-5.1844, -3.6812,  2.9941, -3.3930],
        [-6.6606,  3.6447, -6.9006, -2.6957],
        [-5.5109, -3.2712,  2.1329, -2.7686],
        [-5.8700,  1.9380, -5.8163, -1.8251],
        [-5.1646, -3.0882,  2.4151, -2.8478],
        [-7.6780,  3.8833, -7.9891, -3.1611],
        [-6.2951,  2.7466, -7.0730, -1.6775]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:47<01:51,  1.32it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05012813210487366, logits - tensor([[-5.6215,  1.9725, -5.7210, -2.6989],
        [-6.1395,  3.0634, -6.2560, -3.5626],
        [-7.9102,  3.1603, -6.6193, -2.8490],
        [-6.0127, -4.7659,  3.0685, -3.0347],
        [-6.6035, -1.5876, -5.8184,  1.0456],
        [-4.5519, -2.8938,  1.2605, -1.4553],
        [-6.5742,  3.3864, -6.6329, -3.4596],
        [-6.9734,  3.7164, -7.7319, -4.2810]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 143/289 [01:48<01:50,  1.32it/s]

Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06881701946258545, logits - tensor([[-7.0250, -0.3874, -6.7270, -0.0870],
        [-7.3953, -2.7752, -4.5464,  1.8473],
        [-6.8328, -3.7912, -6.2934,  4.2043],
        [-7.1766, -2.9041, -5.5404,  3.3535],
        [-7.4949, -3.4378, -6.1428,  3.2207],
        [-6.2781,  3.6364, -6.4099, -3.5544],
        [-6.6499,  2.8461, -6.8409, -2.8633],
        [-7.6767,  2.1422, -6.7267, -1.3250]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|████▉     | 144/289 [01:49<01:49,  1.32it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05360235646367073, logits - tensor([[-7.0428,  2.9264, -6.3734, -3.1213],
        [-6.0642, -3.7487, -5.8520,  3.8332],
        [-6.6085,  4.1842, -6.4838, -3.6425],
        [-6.6560,  2.4119, -5.6686, -3.4325],
        [-7.5269,  0.4833, -7.5683, -0.4711],
        [-6.4998, -2.1351, -5.8215,  2.8668],
        [-6.7005,  2.3599, -6.7918, -2.8813],
        [-5.6545, -4.2830,  3.0511, -2.9185]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|█████     | 145/289 [01:49<01:48,  1.33it/s]

Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21166425943374634, logits - tensor([[-5.5304, -4.7014,  3.7019, -2.1964],
        [-6.8525,  2.6726, -6.4558, -3.2262],
        [-7.6695,  3.0511, -6.6240, -2.8926],
        [-5.9697, -2.4627, -5.3795,  3.5110],
        [-6.9532,  3.0904, -7.3852, -3.6793],
        [-7.2350, -3.7515, -5.6516,  4.0319],
        [-6.1277,  2.5742, -5.8572, -2.4736],
        [-5.6852, -3.8905, -4.2922,  4.1598]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 146/289 [01:50<01:48,  1.32it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 51%|█████     | 147/289 [01:51<01:47,  1.32it/s]

loss - 0.32638493180274963, logits - tensor([[-7.8068,  1.2455, -6.9352, -0.3936],
        [-6.3151,  3.0708, -7.1221, -3.4755],
        [-7.3582,  3.1312, -6.8913, -2.4932],
        [-5.2351, -2.5335,  1.3327, -1.8920],
        [-6.2092,  3.6755, -6.2985, -2.2520],
        [-7.2628,  1.5992, -6.7962, -2.1138],
        [-6.9275, -2.5128, -5.5223,  2.3046],
        [-6.0660, -3.2913,  2.4805, -2.2055]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06424765288829803, logits - tensor([[-6.4477,  3.0505, -6.2249, -3.6144],
        [-6.9221,  3.6366, -7.6519, -3.7515],
        [-7.5311, -2.3069, -6.2332,  2.5517],
        [-6.3875, -2.8938, -4.6026,  2.0754],
        [-7.0434,  0.9946, -6.8584, -1.3425],
        [-7.0467,  2.5296, -7.4619, -2.6169],
        [-6.5711,  2.5593, -5.4748, -2.5324],
        [-8.6007,  0.8774, -8.4101, -0

 51%|█████     | 148/289 [01:52<01:46,  1.32it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.371712863445282, logits - tensor([[-6.3896,  2.4531, -6.2749, -3.6089],
        [-5.6430, -4.1987,  2.9280, -2.9826],
        [-5.7594, -2.9804,  1.6430, -1.8249],
        [-7.0596,  4.0443, -6.3054, -3.9117],
        [-6.8172,  3.1346, -6.5621, -2.9511],
        [-5.8314, -1.7556, -6.3793,  2.0032],
        [-7.0918,  2.8713, -6.9792, -3.2007],
        [-6.8721,  3.4112, -6.8740, -3.4982]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:52<01:46,  1.32it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04765748977661133, logits - tensor([[-5.9226, -2.6501, -5.9613,  3.6790],
        [-6.3754, -3.8217, -6.0759,  3.1498],
        [-6.4146, -1.1518, -5.6494,  0.9512],
        [-6.6842, -4.0585, -5.9224,  3.0562],
        [-6.3266,  3.3928, -5.5456, -3.5271],
        [-5.1983, -4.6494, -4.8174,  3.9690],
        [-7.6568,  1.5363, -7.1115, -2.3359],
        [-6.1728,  1.9513, -6.0292, -1.8202]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:53<01:45,  1.31it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19002360105514526, logits - tensor([[-6.8530,  2.6114, -7.5303, -2.8530],
        [-7.3475,  3.6832, -7.2001, -3.5263],
        [-6.4519, -3.4888, -5.6309,  4.0807],
        [-6.5275,  2.8138, -6.8226, -3.3648],
        [-6.1807, -2.6557,  1.5579, -2.2267],
        [-6.7336,  3.5530, -6.6766, -2.7380],
        [-6.3998,  2.3315, -6.1109, -2.7365],
        [-6.7778,  3.5166, -7.0199, -3.3452]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:54<01:45,  1.31it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.28662431240081787, logits - tensor([[-7.2018,  3.1190, -7.4522, -3.4427],
        [-7.1065,  3.6169, -6.8901, -2.5612],
        [-6.5120,  1.2966, -6.6250, -1.1572],
        [-6.6600,  2.5179, -6.6633, -3.0659],
        [-6.3343,  2.8292, -6.8570, -2.5393],
        [-7.3332,  2.6111, -8.1767, -3.1932],
        [-6.3720,  3.1133, -6.6224, -3.4346],
        [-6.3388,  2.2966, -7.1006, -2.2265]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:55<01:44,  1.31it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2060130387544632, logits - tensor([[-6.1645,  3.5153, -6.9808, -3.1351],
        [-6.7080, -1.7198, -6.3036,  2.4187],
        [-5.5817,  3.0340, -6.2676, -3.2804],
        [-6.8258, -3.1523, -5.6142,  2.4431],
        [-7.4086,  2.5311, -6.9671, -3.1847],
        [-7.2773, -3.2366, -5.5837,  4.0689],
        [-6.0610,  2.8375, -6.3022, -3.6681],
        [-7.5228,  2.3785, -7.2995, -2.3094]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:56<01:43,  1.31it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1691247820854187, logits - tensor([[-7.1889, -2.4753, -5.3611,  2.7704],
        [-6.6434, -1.1285, -5.7107,  1.6609],
        [-7.2194,  2.2519, -6.4294, -2.0466],
        [-6.9992, -0.1757, -6.3468,  0.8232],
        [-6.6924, -1.8117, -5.1382,  1.4403],
        [-6.2148, -3.0952,  1.3511, -1.2247],
        [-6.4674, -4.2676, -4.6214,  3.4753],
        [-6.9519, -4.2786,  3.4315, -3.7487]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 154/289 [01:56<01:42,  1.32it/s]

Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1744573712348938, logits - tensor([[-5.4249, -3.9388,  2.9609, -3.0170],
        [-7.5189,  3.4711, -7.1695, -3.3798],
        [-6.3094,  3.4935, -6.5865, -3.4542],
        [-6.7362, -2.3053, -4.9019,  2.5026],
        [-6.3024, -4.1439,  2.8493, -3.5610],
        [-6.7742,  3.1276, -7.5045, -3.5669],
        [-6.5876,  2.6312, -5.9708, -3.1236],
        [-6.1939,  3.5292, -6.9813, -4.0022]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▎    | 155/289 [01:57<01:41,  1.32it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04624404013156891, logits - tensor([[-7.7361,  3.3761, -6.8975, -3.1465],
        [-6.5421, -3.9309, -5.3464,  3.6781],
        [-6.3409, -3.6403,  2.6540, -2.0028],
        [-6.7096,  2.6389, -6.7261, -1.9567],
        [-6.7883,  1.9698, -6.6439, -3.7858],
        [-6.9248,  2.9589, -7.0526, -2.8376],
        [-5.5692,  2.5524, -6.1085, -3.2611],
        [-6.2040, -1.2731, -4.6029,  1.1138]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:58<01:40,  1.32it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05883168801665306, logits - tensor([[-5.4810, -3.9851,  3.2926, -2.6196],
        [-6.5478,  2.1657, -6.7205, -3.6170],
        [-6.7216, -2.7220, -6.8292,  2.7835],
        [-6.8403,  2.8797, -5.3692, -3.7746],
        [-6.5508, -2.1107, -0.3635, -1.8299],
        [-6.4670, -4.4133, -5.9567,  3.7385],
        [-5.7598, -3.6600, -4.7838,  4.2554],
        [-4.9989, -3.4345,  2.7895, -2.7506]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:59<01:39,  1.32it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04136281833052635, logits - tensor([[-6.0928,  3.0330, -5.6667, -3.2923],
        [-6.9588,  2.9392, -6.8169, -2.8629],
        [-7.0184,  2.5022, -6.8105, -3.5750],
        [-6.7647,  2.2425, -6.5637, -2.9142],
        [-6.1291,  3.4047, -5.5453, -2.7306],
        [-8.0601,  1.4155, -6.8812, -1.6024],
        [-6.5091,  3.0527, -5.4096, -1.9340],
        [-6.4768,  2.2826, -6.4261, -2.6384]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:59<01:39,  1.32it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.043995827436447144, logits - tensor([[-6.5199,  1.8191, -6.3967, -2.5521],
        [-6.2062,  2.6168, -6.0056, -2.7567],
        [-6.5558,  2.3311, -5.9866, -2.8480],
        [-7.2166, -1.6094, -5.6584,  1.6015],
        [-6.6455,  3.2401, -7.3810, -2.7879],
        [-7.7244,  3.2395, -6.9894, -3.7024],
        [-6.9706,  2.7912, -6.5525, -1.7934],
        [-5.9765,  2.2205, -6.1907, -3.4273]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [02:00<01:38,  1.32it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0645161122083664, logits - tensor([[-6.8928,  3.2022, -7.5597, -3.4486],
        [-6.6390,  1.7543, -5.9238, -3.0068],
        [-5.6790,  1.8504, -6.2568, -1.4136],
        [-6.3977,  2.8611, -6.4797, -2.9451],
        [-6.8479, -3.4658, -4.9686,  2.8621],
        [-5.7530, -2.7765,  0.8503, -1.2633],
        [-6.2959,  0.9791, -5.0467, -1.9924],
        [-6.3857,  3.0004, -6.1091, -3.4685]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [02:01<01:37,  1.32it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19694308936595917, logits - tensor([[-5.8258,  1.8092, -6.4011, -1.9848],
        [-7.5623,  2.8844, -7.0537, -2.5838],
        [-6.5346,  2.6509, -5.6640, -2.5040],
        [-6.8976,  3.1294, -6.9068, -2.8598],
        [-7.3521, -3.3353, -5.1584,  2.4032],
        [-6.4406,  2.2618, -6.7084, -2.7369],
        [-5.3392, -3.9350,  3.4333, -3.6138],
        [-6.0941, -3.2877,  1.8578, -2.5025]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:02<01:36,  1.33it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05314147472381592, logits - tensor([[-6.0434, -3.8959, -5.8109,  2.7216],
        [-5.7028, -1.9733,  0.3163, -1.5452],
        [-6.8448, -1.9712, -5.3602,  1.8802],
        [-7.2769,  3.7447, -6.8662, -3.0524],
        [-6.5872,  1.9432, -5.9030, -3.4800],
        [-7.0794,  2.6249, -6.5804, -2.6489],
        [-6.2625,  3.5692, -6.1934, -4.2533],
        [-6.2388, -4.5413, -5.4369,  4.1188]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:02<01:35,  1.33it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0373179167509079, logits - tensor([[-5.3355, -3.5908,  2.7483, -3.0656],
        [-4.9238,  2.4796, -6.2352, -3.4726],
        [-6.1679,  3.0755, -6.6228, -3.2330],
        [-5.6968, -3.9326,  3.0001, -3.0604],
        [-6.8310,  3.1780, -6.9451, -2.2444],
        [-7.0483,  2.2398, -6.8816, -3.1701],
        [-5.2223, -3.8963,  2.3803, -3.4760],
        [-6.2380,  1.6019, -6.2856, -2.1493]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▋    | 163/289 [02:03<01:34,  1.33it/s]

Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03586627170443535, logits - tensor([[-6.0964,  3.4676, -7.2992, -3.0112],
        [-6.3871,  2.8952, -6.6136, -2.2668],
        [-7.2309,  3.1322, -5.8664, -3.0578],
        [-7.8876,  1.9262, -6.8789, -1.8397],
        [-6.5143,  2.4848, -6.7048, -2.7557],
        [-5.4692, -4.3846, -5.3878,  3.6439],
        [-6.0187, -3.9343,  2.2691, -2.4285],
        [-5.8472, -3.6997,  3.1882, -2.6277]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 164/289 [02:04<01:33,  1.33it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.035602398216724396, logits - tensor([[-6.6974,  1.9234, -6.7105, -1.9803],
        [-7.1568,  3.5723, -7.0840, -3.0264],
        [-6.5340,  2.7687, -6.1436, -2.6347],
        [-7.3391,  2.1714, -6.9853, -2.1518],
        [-6.1872,  2.5842, -6.9490, -2.9514],
        [-5.6441,  2.5787, -6.6112, -3.1316],
        [-7.0411, -3.7469, -7.0703,  3.1563],
        [-6.5473,  2.6004, -5.5959, -2.9649]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 165/289 [02:05<01:33,  1.33it/s]

Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17345163226127625, logits - tensor([[-5.6353,  2.7003, -6.3887, -2.6075],
        [-7.2320, -2.3602, -6.6419,  1.9590],
        [-7.0265, -3.3931, -5.9952,  3.0478],
        [-7.1494,  2.1276, -6.7527, -2.2320],
        [-6.3939,  2.2760, -6.1077, -2.9004],
        [-6.1819, -4.2364,  2.7310, -3.1901],
        [-6.1078, -3.6886,  2.9826, -2.6878],
        [-6.9736,  3.2122, -6.4338, -2.1786]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 166/289 [02:05<01:32,  1.33it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1263505220413208, logits - tensor([[-6.3604,  2.5191, -7.3596, -3.4212],
        [-6.6235,  0.8235, -6.7029, -1.1063],
        [-7.2059,  2.7253, -7.2261, -3.4322],
        [-7.0917, -1.0863, -6.1679,  0.6791],
        [-5.6519, -4.3902, -5.9186,  4.4799],
        [-6.5675,  2.6472, -6.0862, -3.1067],
        [-6.9185,  3.0591, -6.7225, -2.1134],
        [-7.5484, -1.7742, -6.9276,  1.2722]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:06<01:31,  1.33it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19185562431812286, logits - tensor([[-7.2772,  2.4116, -7.4828, -2.5856],
        [-7.0510,  2.1443, -6.3864, -2.3520],
        [-5.3025,  2.5956, -6.2022, -3.1099],
        [-7.9328, -2.7052, -6.4206,  3.0747],
        [-6.5678,  2.6068, -6.5728, -3.5581],
        [-6.6475,  2.9463, -6.6143, -2.8473],
        [-5.3869,  2.5430, -4.8977, -2.5131],
        [-7.3718,  2.9415, -6.3497, -2.9098]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:07<01:31,  1.32it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07787300646305084, logits - tensor([[-5.8054, -2.9413,  2.3992, -2.9103],
        [-6.6382,  2.3858, -6.6396, -1.8460],
        [-7.5614, -2.3930, -5.3082,  3.0362],
        [-6.9428,  2.9845, -6.7246, -3.2310],
        [-6.2741,  0.1765, -6.2278, -0.3085],
        [-5.8295, -3.3777,  2.9315, -3.2537],
        [-6.3667,  2.0280, -6.1829, -1.8512],
        [-5.9598, -4.1665,  2.4706, -1.6974]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:08<01:30,  1.32it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0461391806602478, logits - tensor([[-6.1262, -3.5135, -4.3920,  3.6117],
        [-6.4007,  1.3586, -6.6323, -0.7455],
        [-7.7752,  2.2769, -6.9776, -1.6373],
        [-6.0034, -3.6993,  2.9166, -3.8160],
        [-6.7968,  3.0219, -7.4873, -2.4937],
        [-8.7764,  3.4567, -7.7703, -2.5110],
        [-6.5672,  2.6850, -6.0051, -3.5077],
        [-6.1463,  3.7693, -6.8140, -3.0999]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:08<01:30,  1.32it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05811682343482971, logits - tensor([[-6.3322,  1.7933, -6.3892, -1.2165],
        [-5.7217,  2.9296, -6.2851, -2.7563],
        [-6.5301,  3.3188, -6.1016, -3.2604],
        [-6.8673, -2.0800, -5.9012,  2.0784],
        [-7.5399, -0.7893, -4.5160,  0.7613],
        [-6.9948, -3.4218, -6.1642,  3.5249],
        [-6.3795,  2.3235, -7.1778, -3.5349],
        [-7.4736, -3.7466, -6.1022,  3.7267]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:09<01:29,  1.32it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3900807797908783, logits - tensor([[-7.2268,  1.5935, -6.0042, -2.0539],
        [-6.9092, -3.4410, -5.5847,  4.3987],
        [-6.1217,  3.3241, -5.7311, -3.5409],
        [-6.7753,  3.0916, -6.9727, -3.0349],
        [-7.0760,  3.0244, -6.8807, -2.3210],
        [-8.4759,  2.8403, -7.9753, -2.6154],
        [-6.5188, -3.3550, -5.2764,  2.3830],
        [-6.8361,  2.9767, -7.1007, -3.2282]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 172/289 [02:10<01:29,  1.31it/s]

Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20476436614990234, logits - tensor([[-6.8057,  2.7283, -6.1606, -2.8293],
        [-6.0456,  3.5159, -6.5123, -2.1925],
        [-6.6050,  2.0550, -6.5459, -2.1563],
        [-6.8276,  3.1285, -6.8292, -2.4170],
        [-7.6856,  2.3865, -6.7179, -2.7089],
        [-5.4702, -2.2081, -4.6349,  2.0410],
        [-5.8598,  2.4427, -6.6090, -2.4665],
        [-5.3872, -2.9287,  2.3751, -2.0424]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 173/289 [02:11<01:28,  1.31it/s]

Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 60%|██████    | 174/289 [02:11<01:27,  1.31it/s]

loss - 0.21487966179847717, logits - tensor([[-7.2031,  3.0147, -6.7556, -2.3850],
        [-6.3275,  3.0018, -7.0340, -3.0416],
        [-6.2093, -4.1036, -5.3241,  4.0077],
        [-7.6577,  2.6757, -6.5466, -3.3661],
        [-7.5646, -3.2860, -6.4795,  3.2408],
        [-5.0804, -3.5000,  2.7863, -2.4957],
        [-6.9816, -4.2416, -5.6150,  3.4870],
        [-8.3818,  3.3393, -7.4557, -2.7592]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22812621295452118, logits - tensor([[-6.2115, -1.9338, -5.6885,  3.4052],
        [-6.9145,  2.8446, -6.4490, -2.8528],
        [-5.1922,  2.9957, -5.6321, -2.7454],
        [-7.2253,  2.1764, -6.2479, -3.0716],
        [-6.9893,  2.9364, -6.6632, -2.4611],
        [-7.1882,  2.1515, -7.8625, -2.1231],
        [-5.8526, -3.5682,  2.4477, -3.2504],
        [-5.3386, -3.4178,  2.5052, -2

 61%|██████    | 175/289 [02:12<01:26,  1.31it/s]

Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14950507879257202, logits - tensor([[-6.6151,  2.2840, -6.8470, -2.5775],
        [-6.6432, -4.8617,  1.4130, -1.3844],
        [-5.5752, -3.9035, -5.6003,  4.4678],
        [-6.7385,  3.0853, -6.0092, -3.0236],
        [-5.8326, -2.5398, -5.7552,  2.5790],
        [-6.7606,  3.0838, -6.4007, -3.0130],
        [-7.4155,  2.8088, -7.0297, -2.2053],
        [-6.6664, -1.2115, -4.6721,  0.3278]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 176/289 [02:13<01:26,  1.31it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4160413146018982, logits - tensor([[-7.7952,  0.7116, -6.2124, -0.5561],
        [-6.7761,  2.2723, -7.0136, -2.9059],
        [-6.8478,  2.6772, -6.0368, -2.4015],
        [-6.2908,  2.7115, -6.9113, -2.3755],
        [-6.8765,  3.0551, -7.5100, -3.2277],
        [-5.9668,  2.6308, -6.5692, -3.3110],
        [-6.6270,  2.5702, -6.3555, -2.3883],
        [-7.6019,  1.5368, -6.3186, -2.0486]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:14<01:25,  1.32it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19366052746772766, logits - tensor([[-7.1286,  3.7857, -7.7845, -3.1498],
        [-6.4010, -4.5357, -5.8870,  4.8287],
        [-6.8494,  2.3044, -6.3001, -3.1063],
        [-5.6728, -3.9588,  2.4219, -2.8667],
        [-5.8158, -4.0686, -6.8005,  3.9257],
        [-6.7301,  3.1275, -6.6477, -3.2564],
        [-5.6337, -3.2826,  2.0077, -2.6280],
        [-7.4461, -4.1569, -6.7975,  4.1262]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:14<01:24,  1.32it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.020369630306959152, logits - tensor([[-6.5291,  2.7627, -5.4058, -3.2559],
        [-5.7308,  2.8650, -5.7701, -3.3839],
        [-4.7128,  3.0039, -5.6784, -3.3739],
        [-5.4885,  3.0776, -5.3267, -3.7664],
        [-6.1954, -4.0651, -5.6463,  3.9038],
        [-6.6714, -3.7975, -6.2769,  2.9182],
        [-6.7824,  3.1880, -6.2292, -2.4290],
        [-6.3285, -4.4279, -5.2418,  4.3355]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 179/289 [02:15<01:23,  1.32it/s]

Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5702356100082397, logits - tensor([[-6.8410,  2.2515, -6.6691, -2.3323],
        [-5.6041, -4.0868,  2.7314, -2.6012],
        [-7.4864,  0.1880, -6.6503, -0.2018],
        [-5.4339, -3.3022,  2.1246, -2.7514],
        [-7.7116,  2.1854, -7.2891, -4.0121],
        [-6.4636,  3.3283, -5.8415, -2.8868],
        [-6.3297,  2.3225, -5.6158, -2.9351],
        [-8.2877, -2.8013, -6.4223,  3.4471]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 180/289 [02:16<01:22,  1.32it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0440576896071434, logits - tensor([[-7.6428,  3.0939, -7.4120, -2.6602],
        [-6.6646,  2.4123, -6.7372, -3.1137],
        [-5.7626,  2.3090, -6.5289, -2.9605],
        [-6.9664, -1.4967, -5.8375,  2.5110],
        [-6.8715,  2.7998, -6.2400, -3.0131],
        [-7.5268,  1.9610, -7.7312, -2.2189],
        [-6.9550,  2.6139, -7.2222, -2.3243],
        [-7.1746,  2.0317, -6.1287, -2.3215]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:17<01:21,  1.32it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03313122317194939, logits - tensor([[-8.2178,  2.4221, -7.5341, -2.9806],
        [-6.4231,  2.7926, -6.1323, -2.9628],
        [-5.9565,  2.0968, -6.5387, -2.5451],
        [-6.2003, -4.1352, -5.8218,  3.9097],
        [-8.1665,  1.7890, -8.0958, -2.3376],
        [-7.7543,  2.5998, -6.1243, -3.3538],
        [-6.8083, -2.8527, -6.5203,  2.7633],
        [-5.3990,  2.9392, -5.2755, -3.2478]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:17<01:20,  1.33it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19465285539627075, logits - tensor([[-7.1736, -3.6780, -6.2322,  3.1427],
        [-6.5333,  2.8126, -6.4948, -1.5924],
        [-6.9410, -3.9931, -5.7582,  4.0608],
        [-7.7353,  2.6917, -7.4157, -3.1358],
        [-6.2153,  2.0655, -7.1197, -2.9912],
        [-6.8661,  2.9793, -6.3934, -3.5391],
        [-7.4776,  2.7560, -6.7828, -2.8359],
        [-7.4027, -2.5829, -6.3020,  2.6661]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:18<01:19,  1.33it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03635181486606598, logits - tensor([[-7.5309, -2.1714, -6.8226,  2.4651],
        [-7.3668,  3.3247, -7.3992, -2.5970],
        [-5.7958,  2.6042, -5.8214, -2.9718],
        [-7.1216,  3.1223, -5.3763, -2.2420],
        [-6.7986,  2.7595, -5.7377, -3.2195],
        [-6.2077, -3.7837,  2.2228, -1.7875],
        [-6.6547,  2.7167, -6.4794, -3.9123],
        [-5.5165,  2.4464, -6.4788, -3.6929]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:19<01:18,  1.33it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11006874591112137, logits - tensor([[-5.6382,  2.2475, -5.9739, -2.1979],
        [-6.8435,  2.4769, -7.2371, -2.6546],
        [-6.9845,  3.0550, -7.2955, -3.5807],
        [-6.0123, -3.2437, -4.7712,  3.4714],
        [-6.5197,  2.1313, -6.5646, -2.9991],
        [-7.3372,  1.4905, -6.2933, -1.7802],
        [-6.1256, -1.8396, -0.4064, -0.9950],
        [-7.5172,  1.2672, -6.4470, -1.2143]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:20<01:18,  1.33it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10020607709884644, logits - tensor([[-5.5499,  2.9194, -5.7703, -3.1311],
        [-6.5440,  2.6292, -6.3767, -3.0646],
        [-7.6357,  1.0070, -6.6815, -0.5330],
        [-7.2567,  3.0838, -6.4690, -3.6301],
        [-8.1758,  2.9728, -7.4282, -1.9287],
        [-6.3992, -3.6381,  2.5737, -2.7516],
        [-5.7841, -3.7127,  2.5121, -2.8376],
        [-7.1903,  2.9485, -7.1380, -3.3732]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:20<01:17,  1.33it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10908819735050201, logits - tensor([[-8.4452,  0.7747, -7.2146, -0.8754],
        [-7.4250,  2.9108, -7.1603, -2.8581],
        [-7.2618,  1.9464, -7.3262, -2.6982],
        [-6.2224, -3.3447,  2.0010, -2.3499],
        [-7.8352,  2.4906, -6.9500, -2.2437],
        [-6.2997, -3.7688,  2.9848, -2.7630],
        [-6.4590, -4.1399, -6.4781,  3.9711],
        [-6.9284,  2.4153, -6.1317, -2.2061]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:21<01:16,  1.33it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 65%|██████▌   | 188/289 [02:22<01:16,  1.33it/s]

loss - 0.21953658759593964, logits - tensor([[-6.4479, -3.7903, -5.4285,  4.2034],
        [-7.3377,  3.3796, -6.9120, -2.8238],
        [-7.5635, -2.3207, -7.3555,  2.6906],
        [-6.6708,  1.5056, -6.1630, -2.0197],
        [-6.9862, -3.7639, -5.7191,  4.2345],
        [-6.7390,  2.8461, -6.0440, -2.8707],
        [-6.0496,  1.9312, -6.8822, -1.4584],
        [-6.8138,  2.4839, -6.5594, -2.2905]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19632618129253387, logits - tensor([[-6.8622,  2.4148, -6.3674, -3.2296],
        [-6.2862, -3.5153, -5.6756,  3.4610],
        [-6.0466, -2.9694,  0.5601, -1.6758],
        [-6.7957,  2.6170, -6.6321, -1.8129],
        [-8.0609,  2.3927, -6.6088, -2.1361],
        [-6.9018, -3.6661, -6.1910,  3.7217],
        [-6.4279,  2.4032, -6.3982, -2.0778],
        [-7.0884,  2.0874, -5.6350, -2

 65%|██████▌   | 189/289 [02:23<01:15,  1.33it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0551995150744915, logits - tensor([[-7.1101,  2.9130, -6.6743, -2.7495],
        [-5.3115, -3.8817,  2.7454, -3.3922],
        [-7.0251,  2.6857, -6.8841, -2.0336],
        [-6.2277,  3.0105, -6.3585, -2.5746],
        [-7.0163,  1.4300, -6.6439, -2.5061],
        [-6.5596, -1.2128, -5.7817,  0.8586],
        [-6.9602,  2.6321, -6.9552, -2.9272],
        [-7.9469,  2.2918, -7.5689, -2.5023]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:23<01:14,  1.33it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 66%|██████▌   | 191/289 [02:24<01:14,  1.32it/s]

loss - 0.22257627546787262, logits - tensor([[-6.7891,  2.4509, -6.9208, -2.2599],
        [-6.3095,  3.7016, -6.2294, -2.8981],
        [-5.7017, -3.2573,  1.5421, -2.2267],
        [-7.1575,  3.2403, -6.4812, -2.5447],
        [-7.0441,  2.2852, -6.2827, -2.0031],
        [-7.6400,  2.1659, -7.2498, -2.2366],
        [-5.3947, -4.6348, -4.2324,  4.7432],
        [-5.8104,  3.1233, -6.5036, -2.2995]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 66%|██████▋   | 192/289 [02:25<01:13,  1.32it/s]

loss - 0.22603458166122437, logits - tensor([[-6.3841,  2.6514, -6.6498, -3.1357],
        [-7.3111,  3.2043, -6.4274, -2.4346],
        [-7.1422, -2.6540, -5.1815,  2.8082],
        [-6.5333,  2.0576, -6.2660, -2.7071],
        [-7.1363,  3.5661, -6.5763, -2.7385],
        [-6.9807,  2.5489, -7.4056, -3.5266],
        [-6.2265, -2.5912,  0.3192, -1.1412],
        [-7.6123,  3.2739, -6.2972, -2.5622]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08214469254016876, logits - tensor([[-7.0586, -3.8653, -4.8856,  3.8456],
        [-6.7996, -0.1726, -7.1914,  0.1829],
        [-6.6102, -3.3082,  2.4948, -3.2094],
        [-6.8726,  3.2437, -6.7245, -3.3439],
        [-6.3734, -4.3785, -5.5068,  3.4216],
        [-5.4471,  3.3491, -6.7236, -2.9217],
        [-7.3070, -2.6482, -5.8586,  2.1507],
        [-7.2401, -1.3531, -6.2869,  1

 67%|██████▋   | 193/289 [02:26<01:12,  1.33it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.159150630235672, logits - tensor([[-7.7412,  1.8910, -7.1367, -1.9606],
        [-7.4547, -4.2442, -4.8688,  5.8506],
        [-8.0670,  3.3490, -7.4471, -3.7256],
        [-7.4443,  3.0335, -7.0292, -2.4601],
        [-6.7968, -2.5288, -5.2876,  2.8417],
        [-6.6050,  2.1939, -4.9601, -2.6866],
        [-6.9503,  1.8368, -6.4862, -2.1397],
        [-5.8241, -4.6160, -5.0047,  3.5928]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 194/289 [02:26<01:11,  1.33it/s]

Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18919402360916138, logits - tensor([[-6.3764,  2.7403, -6.6888, -2.5041],
        [-7.4311,  1.7642, -6.2788, -2.2693],
        [-7.4693, -1.9824, -5.8312,  1.6113],
        [-6.6507,  2.6362, -5.9305, -2.9822],
        [-7.4744,  2.1983, -6.6450, -1.7285],
        [-7.3689, -2.5207, -5.4309,  3.5146],
        [-6.0812,  1.5206, -6.3105, -1.6502],
        [-6.8525,  1.3729, -7.0282, -1.5467]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 195/289 [02:27<01:10,  1.33it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23346984386444092, logits - tensor([[-7.5379, -2.6596, -6.5780,  2.2148],
        [-6.5020,  3.2983, -6.7861, -2.6249],
        [-6.7618,  3.9894, -6.8723, -3.0111],
        [-7.0767, -4.0707, -7.0062,  3.4307],
        [-6.2222,  2.6611, -6.0158, -3.6026],
        [-7.9178, -2.3337, -5.2587,  2.4775],
        [-7.7717,  0.5055, -7.2734, -0.8950],
        [-6.9438, -3.7928, -6.5110,  3.5381]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:28<01:09,  1.33it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06556089967489243, logits - tensor([[-5.6940, -3.4410,  1.0413, -2.7383],
        [-4.3589, -3.3584,  2.3018, -2.7967],
        [-6.0836,  2.6233, -7.0680, -3.5532],
        [-7.5737,  4.5184, -7.8158, -3.5851],
        [-5.3309, -2.7067,  1.2314, -1.9868],
        [-6.5405, -2.5305, -1.6984,  0.6548],
        [-5.6810, -3.6185,  2.5955, -3.1186],
        [-6.3835, -2.8873, -5.1476,  3.4700]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 197/289 [02:29<01:09,  1.33it/s]

Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18761268258094788, logits - tensor([[-5.9897, -4.2943, -4.9070,  3.7291],
        [-5.7039, -3.3625, -4.8581,  2.5808],
        [-7.8413,  2.4720, -7.5394, -2.4540],
        [-6.4517,  1.5039, -6.4428, -1.7910],
        [-7.0254,  3.1024, -7.6353, -2.7226],
        [-6.7323,  3.4538, -6.9616, -3.6727],
        [-5.6291,  3.7481, -6.2570, -2.9396],
        [-6.8929,  2.5912, -6.7806, -2.6593]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▊   | 198/289 [02:29<01:08,  1.33it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23291105031967163, logits - tensor([[-5.4572,  3.1143, -6.3718, -3.0182],
        [-6.4972, -4.5156,  2.7706, -2.8142],
        [-6.9935, -3.7454, -6.1195,  2.9827],
        [-6.6979, -1.5905, -6.1587,  2.0026],
        [-6.4384,  0.5745, -5.7214, -0.9466],
        [-7.0197, -3.0641, -5.5449,  3.5864],
        [-5.3368, -3.1699,  1.8120, -3.4048],
        [-7.1481, -4.2376,  2.6781, -2.4862]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:30<01:07,  1.33it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2038658708333969, logits - tensor([[-5.6986, -2.9493, -5.3740,  3.0999],
        [-6.8647,  2.8999, -6.5107, -2.6296],
        [-6.5320,  3.4657, -6.2359, -2.3088],
        [-6.6730, -3.9236, -5.8119,  3.6405],
        [-6.8389,  2.7745, -6.8962, -2.8639],
        [-6.4719,  2.7077, -6.4863, -1.7137],
        [-6.7861,  0.7057, -4.5319, -1.0892],
        [-8.1110,  1.0407, -7.2836, -1.7646]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 200/289 [02:31<01:06,  1.33it/s]

Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07891398668289185, logits - tensor([[-7.3042,  2.3623, -6.8015, -3.1616],
        [-6.5473, -3.6226,  1.6417, -2.1696],
        [-5.4061, -4.3221,  3.0208, -3.4404],
        [-7.2055,  1.8032, -7.0757, -2.4893],
        [-6.6579, -3.4184, -6.4291,  3.7019],
        [-6.9864,  1.8900, -5.1920, -1.9701],
        [-7.8884,  1.1008, -7.2479, -1.5339],
        [-8.0269,  0.2950, -7.0256, -0.8039]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 201/289 [02:32<01:05,  1.33it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10873787105083466, logits - tensor([[-8.5429, -1.6980, -7.0496,  2.5675],
        [-7.9810,  2.0257, -7.9823, -1.0063],
        [-7.6304, -0.3369, -6.1907,  0.6061],
        [-6.5375, -3.5561,  3.6772, -2.9625],
        [-6.9350,  1.9278, -7.3512, -2.8688],
        [-7.1331, -3.7624, -5.3140,  3.7418],
        [-7.1448,  1.5786, -6.7801, -2.1557],
        [-6.4448,  2.0565, -5.6678, -2.1856]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:32<01:05,  1.33it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1666100174188614, logits - tensor([[-6.3660, -4.6207,  3.0336, -3.3064],
        [-7.2181, -3.7833, -5.8593,  3.7705],
        [-7.4210, -4.6414, -6.2950,  4.3786],
        [-6.8487, -4.4276, -4.7360,  4.3961],
        [-6.3674, -2.6449, -5.0647,  3.7615],
        [-6.0818,  0.2010, -6.4853, -1.1373],
        [-7.0237, -3.2474, -5.2509,  3.6624],
        [-6.2924,  1.5909, -6.6852, -2.1960]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:33<01:04,  1.33it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04138585180044174, logits - tensor([[-6.7858,  2.4228, -5.6841, -2.4991],
        [-6.4599, -3.5206,  2.2866, -2.4975],
        [-6.1760,  3.3472, -6.5218, -3.2268],
        [-6.2887,  2.1975, -7.1875, -1.7861],
        [-6.8034,  3.3277, -5.6907, -2.4946],
        [-8.2879,  1.9712, -7.4502, -1.8000],
        [-6.8267, -3.0004, -5.2240,  2.8292],
        [-7.4153,  2.9755, -7.5253, -3.1287]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:34<01:03,  1.33it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.051436614245176315, logits - tensor([[-6.8184,  2.8308, -6.4415, -2.5734],
        [-7.4689,  3.5034, -6.7494, -3.1966],
        [-5.4104, -4.3950,  3.3704, -3.3616],
        [-6.6277,  1.8264, -6.9115, -2.3046],
        [-7.1607,  2.2075, -7.4406, -2.7490],
        [-7.7321,  1.7280, -6.9453, -1.9971],
        [-5.8965, -1.6853, -5.3257,  1.5154],
        [-5.2760, -3.1241,  2.0368, -2.2242]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:35<01:02,  1.34it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.38714221119880676, logits - tensor([[-6.9023,  3.2697, -6.3281, -3.6180],
        [-5.2139, -2.7685,  1.0822, -1.3028],
        [-6.1299,  2.7523, -6.7285, -2.6495],
        [-6.1655, -3.7072, -7.0599,  4.3048],
        [-7.3364,  1.2172, -6.8660, -1.1657],
        [-7.0597,  1.5501, -7.5790, -1.0715],
        [-8.2683, -3.5127, -5.5418,  2.9487],
        [-6.5406,  1.9626, -4.9167, -1.9419]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:35<01:02,  1.33it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.041440509259700775, logits - tensor([[-5.6649, -3.7943,  2.9332, -3.4447],
        [-6.4844, -3.8364, -5.4766,  3.7207],
        [-5.9495, -3.3397,  2.9451, -1.9514],
        [-8.2468,  2.6496, -8.8771, -2.4479],
        [-6.0092,  1.6334, -5.6711, -2.3402],
        [-7.2587,  2.8372, -7.6038, -2.6651],
        [-6.8663,  2.8538, -8.1480, -2.9161],
        [-7.0869,  2.3895, -6.0003, -1.5629]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:36<01:01,  1.33it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07579464465379715, logits - tensor([[-6.7546,  1.2589, -6.6834, -2.7187],
        [-6.0278,  1.3535, -5.4612, -2.5493],
        [-7.1626,  1.9604, -6.7487, -1.6032],
        [-6.9209, -1.0542, -6.0991,  1.0807],
        [-6.5705, -2.8061,  1.7015, -2.4904],
        [-6.5402,  2.6342, -6.2795, -2.8765],
        [-5.9546, -3.2877,  2.1659, -2.6818],
        [-7.7794,  1.9678, -7.5215, -2.1946]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:37<01:00,  1.33it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06780751049518585, logits - tensor([[-4.5814, -3.6278,  1.7545, -2.5363],
        [-6.4274,  2.8297, -6.8309, -2.8061],
        [-8.4955,  2.4973, -8.2064, -2.1256],
        [-6.6503,  0.6360, -6.3030, -0.6961],
        [-7.4454,  1.8430, -7.0771, -1.7562],
        [-6.2085, -4.0845, -4.6790,  3.3409],
        [-6.4580,  2.4331, -6.2388, -2.0482],
        [-7.2134,  2.3398, -6.6207, -2.4765]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:38<01:00,  1.32it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 73%|███████▎  | 210/289 [02:38<00:59,  1.32it/s]

loss - 0.18946188688278198, logits - tensor([[-6.5314,  3.6395, -6.5546, -3.8004],
        [-7.2345,  0.3484, -5.9184, -0.6131],
        [-8.5929,  0.3661, -8.1806, -0.0188],
        [-7.7981,  2.0528, -6.7122, -1.8119],
        [-5.8480,  2.8272, -6.0592, -3.0136],
        [-5.6893,  1.5689, -6.1217, -1.2227],
        [-6.6768,  3.1404, -7.2449, -3.2947],
        [-7.5322,  2.9130, -6.4804, -2.6843]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 73%|███████▎  | 211/289 [02:39<00:59,  1.32it/s]

loss - 0.22611486911773682, logits - tensor([[-5.8361, -3.4751,  1.9359, -2.9837],
        [-6.7437,  1.2742, -6.6361, -1.6537],
        [-4.5687, -3.2809,  3.0954, -2.6770],
        [-7.0977,  2.0817, -6.9005, -1.8739],
        [-5.9839, -4.7342, -5.3219,  5.2633],
        [-7.9347, -4.8287, -0.7917,  0.8461],
        [-8.0087,  2.9136, -7.5996, -2.9297],
        [-5.7615, -1.1921, -5.9442,  0.4064]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05211545154452324, logits - tensor([[-6.4028, -4.6093, -5.3501,  4.2255],
        [-8.6454, -0.6996, -6.4964,  1.0852],
        [-7.6225,  1.5534, -7.2274, -2.1834],
        [-6.7868, -4.0245, -5.0044,  3.9183],
        [-6.5161, -3.8409, -6.1563,  3.2474],
        [-6.3103, -3.1195, -4.7955,  2.9785],
        [-6.9921,  1.7148, -6.7650, -1.4911],
        [-6.7059,  3.3502, -6.4072, -3

 73%|███████▎  | 212/289 [02:40<00:58,  1.32it/s]

Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05996037274599075, logits - tensor([[-6.9540,  2.2032, -6.2258, -1.8753],
        [-6.7805,  1.8232, -5.8985, -2.2137],
        [-6.6319,  2.4095, -7.1245, -2.4810],
        [-6.7803,  3.5626, -7.3561, -3.0909],
        [-6.6250, -4.1014, -6.7001,  3.1785],
        [-6.1296,  1.9060, -5.8174, -1.2479],
        [-5.9584,  1.1011, -5.2250, -1.1433],
        [-6.8589,  2.9165, -6.5624, -2.4154]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▎  | 213/289 [02:41<00:57,  1.31it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29401126503944397, logits - tensor([[-7.9055,  2.5378, -8.0946, -2.1404],
        [-7.3267, -3.8271, -6.4233,  3.5810],
        [-7.4985, -2.5907, -6.1124,  2.0183],
        [-6.5901,  1.8543, -6.5995, -2.4591],
        [-7.4938, -0.4759, -6.3250,  0.4953],
        [-8.1822,  4.3816, -8.2096, -3.6841],
        [-6.0533, -3.6337,  1.6429, -2.2164],
        [-6.6329, -4.5912, -5.8229,  4.7386]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 214/289 [02:42<00:56,  1.32it/s]

Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16892899572849274, logits - tensor([[-6.4038,  2.7213, -6.0514, -2.0264],
        [-6.6342,  2.1782, -6.6094, -1.6885],
        [-7.1662,  2.2439, -6.9971, -2.2264],
        [-7.0159,  2.2749, -6.2710, -2.1307],
        [-6.7676,  2.5028, -6.3204, -2.4082],
        [-7.4013, -3.2800, -3.8816,  2.2652],
        [-5.5732, -3.7759,  2.6276, -2.7108],
        [-6.2679, -3.5272,  2.4842, -2.8886]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 215/289 [02:42<00:56,  1.31it/s]

Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15323887765407562, logits - tensor([[-7.0422,  2.8932, -6.8967, -3.0209],
        [-7.4459,  2.7824, -7.4183, -2.9343],
        [-6.1781, -2.5831,  0.9863, -1.9531],
        [-6.1189,  3.6016, -6.7124, -3.4910],
        [-7.0256,  2.2802, -7.1685, -2.4084],
        [-7.1834, -2.1163, -5.9734,  3.1025],
        [-5.3241, -4.4572,  3.7271, -2.9974],
        [-6.0213, -4.3349,  3.0860, -3.2583]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▍  | 216/289 [02:43<00:55,  1.31it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05378764867782593, logits - tensor([[-6.8065, -3.4873,  1.8379, -2.2215],
        [-7.4571, -0.6409, -6.0573,  0.8173],
        [-9.1334,  3.0566, -7.4537, -2.9425],
        [-6.8472, -4.4646, -6.0268,  4.3463],
        [-5.7104,  2.1820, -6.7713, -2.2802],
        [-6.8379, -4.6151, -5.6137,  3.1061],
        [-6.9601,  2.3251, -6.7246, -2.5401],
        [-6.9086, -2.9870, -5.9145,  3.4812]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 217/289 [02:44<00:54,  1.31it/s]

Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1570410430431366, logits - tensor([[-6.2196, -3.8851,  1.8208, -1.7321],
        [-6.5153,  2.3416, -6.5488, -3.1075],
        [-6.8932, -3.4197, -7.0257,  4.1993],
        [-6.8485,  3.4582, -6.2875, -3.0484],
        [-7.4430, -2.7707,  0.8038, -0.9457],
        [-7.5067,  0.4684, -5.9258,  0.0986],
        [-7.0834, -3.1438, -5.7833,  3.1214],
        [-5.8761, -3.6376,  1.0909, -1.1599]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 218/289 [02:45<00:53,  1.32it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03986111655831337, logits - tensor([[-6.3868, -3.0130, -6.6144,  1.6309],
        [-8.7259, -4.1608, -5.6079,  3.4507],
        [-6.1022,  2.3308, -5.8546, -3.2331],
        [-6.1060, -4.3134,  3.1295, -2.0806],
        [-6.3351, -3.2128, -6.5417,  2.2119],
        [-6.4327,  2.3434, -6.5658, -2.6559],
        [-7.3129,  2.3947, -7.0347, -1.3651],
        [-6.4962, -3.3385, -6.2578,  3.9524]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:45<00:53,  1.32it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3112676739692688, logits - tensor([[-7.2769,  2.4240, -7.5482, -2.3408],
        [-7.1211,  0.2889, -6.2381, -0.1832],
        [-5.2192, -2.8808,  1.6300, -1.8076],
        [-6.6423,  1.9413, -7.2059, -2.0488],
        [-8.1734,  3.0818, -7.6133, -2.5353],
        [-5.6628,  2.4533, -6.7090, -3.3543],
        [-8.0384,  0.1625, -6.3053, -0.2772],
        [-5.4147, -2.3602,  0.2426, -1.2542]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:46<00:52,  1.32it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04507660120725632, logits - tensor([[-5.6032,  2.8449, -5.6216, -3.1535],
        [-6.9974,  0.9982, -4.9702, -1.8773],
        [-6.7448,  2.6681, -7.3351, -2.2465],
        [-7.0289, -4.1771, -5.8203,  4.0898],
        [-7.1316, -2.9710, -5.9892,  2.1539],
        [-5.2026, -4.0458,  2.9108, -3.2304],
        [-5.4205, -3.2739,  1.8556, -2.2109],
        [-7.1886,  3.3161, -7.0441, -2.7502]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:47<00:51,  1.32it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06327558308839798, logits - tensor([[-6.7807,  0.4783, -6.9496, -0.5407],
        [-6.2699,  2.8408, -6.2676, -2.6864],
        [-6.0242, -4.9241, -6.7463,  4.2835],
        [-6.4761,  2.6602, -7.0537, -2.6470],
        [-6.5898,  1.7293, -5.3160, -1.6299],
        [-6.3969, -3.5150, -5.2774,  2.5230],
        [-5.5141, -3.7478,  2.6991, -2.6579],
        [-6.1451,  2.2432, -6.8555, -2.6830]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:48<00:50,  1.32it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08062014728784561, logits - tensor([[-7.6182, -1.9140, -5.3864,  1.1671],
        [-6.7668,  0.0466, -5.4136, -0.0162],
        [-7.3274,  2.8161, -6.3208, -2.9970],
        [-5.4202, -3.7164,  2.8117, -2.2819],
        [-6.9876, -3.8973, -5.1139,  4.0465],
        [-6.3806,  3.6199, -7.5745, -2.9314],
        [-6.0663, -3.4542,  2.6064, -3.2093],
        [-7.2510,  2.3727, -6.5177, -2.3770]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:48<00:50,  1.32it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02525240369141102, logits - tensor([[-7.6060,  3.2219, -7.8231, -3.1395],
        [-6.9943,  3.3214, -6.3782, -4.1857],
        [-5.9987, -4.2041,  2.6691, -2.4972],
        [-7.6565,  2.5621, -6.7809, -3.2722],
        [-6.8577,  2.6157, -7.1800, -2.6599],
        [-6.2495, -4.8119, -5.2865,  4.2903],
        [-6.7107, -4.5336, -6.0631,  3.4908],
        [-6.7355,  2.6591, -6.9204, -2.1218]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:49<00:48,  1.33it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03785596042871475, logits - tensor([[-7.3063,  2.6641, -7.0233, -2.5860],
        [-5.4848,  3.3303, -6.3175, -2.9733],
        [-5.5862,  2.3517, -6.3070, -2.7767],
        [-5.9465,  3.2705, -5.9889, -3.5302],
        [-6.3274,  2.4852, -7.1845, -3.3509],
        [-5.5282,  2.6290, -5.2354, -2.8733],
        [-6.0837, -3.8792,  2.4404, -2.7704],
        [-7.9072,  1.7231, -7.9077, -1.7000]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:50<00:48,  1.33it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4312969744205475, logits - tensor([[-5.8034, -3.9736, -5.2011,  4.4146],
        [-6.3023,  3.0353, -6.9394, -3.0443],
        [-6.6053,  1.7215, -6.4065, -2.4903],
        [-5.6395,  1.8510, -6.2650, -2.8755],
        [-6.9062, -0.1807, -6.0096,  1.0605],
        [-6.0980,  2.9208, -6.9919, -2.8875],
        [-7.2822,  2.7826, -6.7285, -2.3177],
        [-8.3350, -4.2616, -6.6161,  2.9864]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:51<00:47,  1.33it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3192339241504669, logits - tensor([[-6.4541,  3.3719, -7.8121, -3.0807],
        [-5.4028, -2.9702,  1.4422, -1.5926],
        [-7.2344,  2.8728, -6.4456, -2.5842],
        [-6.0751, -3.4249,  2.1486, -2.8485],
        [-7.0771, -2.0130, -5.4916,  1.0483],
        [-6.8195,  2.2798, -6.1662, -1.7724],
        [-6.9867,  2.1991, -5.8074, -3.3231],
        [-6.4607,  3.5428, -5.6973, -3.8552]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:51<00:46,  1.33it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0382046215236187, logits - tensor([[-6.6551,  2.7812, -7.2175, -2.2000],
        [-8.1458,  2.9605, -6.4096, -3.6717],
        [-6.4801, -3.9804,  2.6828, -2.7907],
        [-7.6566,  2.7510, -7.1722, -2.8035],
        [-5.9236,  2.2030, -6.2722, -1.4291],
        [-6.4189,  2.0302, -7.2310, -2.8021],
        [-5.8941,  3.2219, -6.6497, -2.7477],
        [-6.7813,  3.4224, -6.6228, -2.7574]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:52<00:45,  1.33it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03416213393211365, logits - tensor([[-7.6283,  2.7514, -7.1313, -2.5091],
        [-8.0092,  1.7465, -7.2206, -1.6703],
        [-6.6996,  2.9582, -6.1522, -3.8248],
        [-7.4402,  3.0240, -6.4670, -2.6818],
        [-5.8091, -2.7161, -4.8896,  3.1755],
        [-5.6772, -4.2037,  2.4755, -2.6768],
        [-5.9954,  3.0056, -5.8679, -3.6632],
        [-5.7891,  3.0860, -6.0761, -4.1124]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:53<00:45,  1.33it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03688428923487663, logits - tensor([[-6.1909,  3.3746, -7.0229, -2.7098],
        [-8.0457,  2.2534, -8.0935, -2.4426],
        [-7.9851, -1.4325, -6.2541,  2.6007],
        [-5.3134, -3.8146, -6.3184,  3.8148],
        [-6.0547, -4.8348,  3.5200, -3.7382],
        [-5.8361, -2.8180,  1.7410, -1.9762],
        [-5.8710,  3.0646, -6.5978, -3.2639],
        [-6.7839, -4.8123,  4.0916, -3.5331]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:54<00:44,  1.32it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23819731175899506, logits - tensor([[-7.3310,  1.8923, -7.3033, -1.6433],
        [-5.8070,  1.5974, -5.9830, -1.0409],
        [-5.7806, -3.4018,  2.3043, -2.6392],
        [-6.6828,  3.1263, -6.4619, -2.3314],
        [-5.5492,  3.4815, -5.8102, -2.2178],
        [-8.2147,  1.6718, -7.4951, -1.3601],
        [-5.6940,  3.1175, -6.3809, -2.4137],
        [-5.6109, -3.1990,  2.8544, -2.7830]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:54<00:44,  1.31it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0947045385837555, logits - tensor([[-6.4396, -3.9699,  2.7838, -2.6486],
        [-6.1860, -2.2662, -5.2259,  2.2085],
        [-6.1283,  2.8632, -6.2215, -2.5044],
        [-6.5951,  2.3901, -6.4732, -2.4613],
        [-6.7023,  1.7170, -6.6641, -2.7394],
        [-6.2316, -2.9984,  2.3525, -2.5079],
        [-7.0533,  0.0203, -6.6723, -0.2155],
        [-5.1266, -2.5988,  1.5206, -2.0305]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:55<00:43,  1.30it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03222721815109253, logits - tensor([[-5.9696,  2.7066, -5.5655, -2.2989],
        [-6.7849,  2.7937, -6.8462, -2.4890],
        [-6.5548,  2.4824, -6.4106, -2.4219],
        [-7.0900,  3.7169, -6.7383, -3.4845],
        [-6.0937, -4.5417, -6.0769,  3.7968],
        [-6.2984,  3.2038, -6.9881, -4.0165],
        [-6.8135, -3.1598,  1.7510, -2.0121],
        [-7.3320,  3.2440, -6.4161, -3.4200]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:56<00:42,  1.31it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20532995462417603, logits - tensor([[-5.6714,  3.5086, -6.1486, -2.8423],
        [-7.0011, -0.5593, -4.2038,  0.4605],
        [-6.3229,  1.9235, -6.3412, -3.2121],
        [-6.3429,  2.7755, -6.1435, -2.4551],
        [-5.5008, -3.0369,  1.6265, -1.7710],
        [-5.8955,  3.3806, -6.6160, -3.2260],
        [-4.9030, -4.5063, -4.9149,  3.5703],
        [-6.6648,  2.1697, -6.8881, -2.3027]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:57<00:41,  1.31it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36540085077285767, logits - tensor([[-5.6962, -3.8132,  1.8340, -2.7055],
        [-7.6816,  3.2876, -7.0888, -3.3846],
        [-6.4216, -4.2767, -5.4312,  3.7832],
        [-6.2017, -3.8976,  1.6872, -1.6062],
        [-8.1012,  2.8756, -7.4831, -2.8175],
        [-7.7457,  2.0046, -7.3279, -2.5458],
        [-6.9060,  3.5142, -7.0654, -3.2331],
        [-7.0745, -1.6299, -5.4781,  1.8045]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:57<00:41,  1.31it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04086720943450928, logits - tensor([[-6.4447,  2.5120, -6.3209, -2.9387],
        [-7.3004,  3.7390, -5.8823, -3.1425],
        [-4.4850, -4.1905, -4.2881,  4.5608],
        [-5.4381, -2.2117, -6.0192,  2.5721],
        [-6.4502,  1.8665, -7.0739, -2.5853],
        [-5.8696, -3.9462, -5.4418,  4.9554],
        [-6.9302,  2.4633, -6.5292, -1.7547],
        [-8.1308, -2.2140, -3.1633,  1.3539]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 236/289 [02:58<00:40,  1.31it/s]

Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02855311706662178, logits - tensor([[-5.8115,  2.3722, -5.7480, -2.4843],
        [-6.8376,  3.7528, -6.2549, -3.2778],
        [-6.7184,  2.0173, -6.1950, -2.0380],
        [-7.1578,  3.7585, -6.7790, -3.6216],
        [-6.3698,  3.1949, -5.6986, -2.9332],
        [-6.4301,  3.3943, -6.2958, -3.6413],
        [-7.0624, -4.0654, -5.5117,  3.2681],
        [-5.8555, -4.1321,  2.3645, -3.0603]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 237/289 [02:59<00:39,  1.31it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2135285884141922, logits - tensor([[-5.4634, -3.1201,  1.2084, -1.7816],
        [-7.3092,  1.3719, -7.4581, -0.9809],
        [-6.5376,  3.1182, -7.0444, -3.2913],
        [-6.0131, -2.7332,  0.5458, -1.5848],
        [-6.1255, -2.7917,  2.4009, -2.1588],
        [-7.7225,  2.9893, -8.1714, -2.9509],
        [-6.4211,  2.9945, -6.8675, -3.2384],
        [-7.6786,  2.7083, -8.0119, -3.3485]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 238/289 [03:00<00:38,  1.32it/s]

Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.047465406358242035, logits - tensor([[-7.7222, -5.1164,  3.2341, -2.5215],
        [-6.2915,  1.8987, -6.5008, -2.8067],
        [-7.4175,  3.0576, -6.5423, -2.4096],
        [-6.5007,  2.4242, -6.7720, -2.5373],
        [-7.8686,  3.0047, -8.0545, -1.9616],
        [-7.8148, -2.7795, -6.1429,  2.6158],
        [-8.4441,  1.5354, -7.0331, -1.6028],
        [-6.3033, -3.6318,  2.1489, -2.6913]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 239/289 [03:01<00:37,  1.32it/s]

Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16785883903503418, logits - tensor([[-6.9890,  3.2749, -6.9481, -3.5949],
        [-6.8986,  3.7328, -6.3957, -3.0526],
        [-6.5284,  2.1064, -5.9675, -1.4580],
        [-6.8831,  3.2955, -7.2079, -2.1442],
        [-6.6512,  2.4616, -6.6207, -2.0999],
        [-6.3903,  2.7647, -5.1945, -2.9312],
        [-7.5779,  2.0942, -6.6245, -2.5527],
        [-6.2620, -2.9417,  0.8120, -1.2249]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 240/289 [03:01<00:37,  1.32it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08758228272199631, logits - tensor([[-7.1685, -2.2990,  0.0686, -0.6889],
        [-5.5771, -4.5776, -4.2553,  2.9591],
        [-6.4832,  3.0572, -6.4198, -3.0078],
        [-6.6142,  2.4870, -6.7222, -2.4474],
        [-6.9527, -3.4929, -5.8752,  3.1798],
        [-6.6173,  1.9564, -5.9433, -2.2489],
        [-5.3948, -4.1882,  3.7722, -2.6738],
        [-7.1477,  3.0514, -7.9538, -2.5709]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:02<00:36,  1.32it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09959319233894348, logits - tensor([[-6.6635,  2.6615, -6.4046, -2.0164],
        [-6.8719,  0.2760, -5.9366,  0.6875],
        [-6.7200,  2.5425, -7.1337, -3.8201],
        [-6.7362,  2.4716, -6.3405, -2.7399],
        [-7.8319,  2.4532, -7.1664, -3.5464],
        [-6.7521, -4.4549, -5.7379,  4.6721],
        [-5.6714, -3.6347,  1.2552, -1.7236],
        [-6.3431, -3.1609,  1.1073, -1.6742]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:03<00:35,  1.32it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.42548999190330505, logits - tensor([[-7.5407,  3.1888, -6.9448, -2.8929],
        [-5.6649, -2.7411,  0.8582, -1.3342],
        [-4.5836, -4.0944, -4.5894,  4.4026],
        [-7.6976,  1.8542, -7.5890, -1.9415],
        [-7.5120,  2.6617, -7.1297, -2.7909],
        [-7.0759, -4.4070, -5.5212,  4.7324],
        [-7.3797,  1.7728, -6.3261, -2.2905],
        [-5.5010, -3.2028,  2.2161, -1.7436]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:04<00:34,  1.32it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17142820358276367, logits - tensor([[-6.6868,  2.0996, -5.9639, -1.2988],
        [-4.8864, -3.2903, -4.7353,  3.5116],
        [-6.3057,  3.6846, -6.8235, -3.4870],
        [-6.6240,  2.3415, -6.3720, -1.7274],
        [-5.1762,  2.5296, -5.7736, -2.8181],
        [-5.8253, -4.0944,  2.9436, -2.5260],
        [-5.7154,  2.7623, -6.0651, -2.0390],
        [-5.7859, -3.6694,  2.6150, -2.5071]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:04<00:34,  1.32it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2919997572898865, logits - tensor([[-6.6621, -2.0497, -6.0124,  1.8679],
        [-5.5863, -3.3686,  1.3629, -1.5090],
        [-7.3798, -2.5623, -6.0739,  2.2726],
        [-5.3714, -0.9868, -1.1993, -0.4884],
        [-6.0070, -3.8952,  2.4048, -1.8198],
        [-5.6242, -4.0104,  2.5758, -2.3397],
        [-7.0041, -2.1068, -7.1578,  1.3654],
        [-7.0215,  0.1153, -6.2736,  0.1629]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:05<00:33,  1.32it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16483578085899353, logits - tensor([[-6.1566, -4.1759,  2.5115, -2.2307],
        [-5.2446, -4.9471, -6.3803,  4.0485],
        [-6.8623,  2.9759, -6.6219, -2.8220],
        [-5.0514, -3.1914,  2.2982, -1.7884],
        [-7.2195,  1.9992, -6.6158, -1.6433],
        [-5.9066, -3.8896, -4.4342,  3.8595],
        [-7.4350,  2.3154, -7.3903, -1.5638],
        [-6.1990,  2.5755, -6.3558, -3.3100]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:06<00:32,  1.33it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13407644629478455, logits - tensor([[-6.8911,  2.5119, -6.9745, -3.2048],
        [-5.7323, -3.3969,  2.0523, -1.6233],
        [-6.3438, -4.0472,  2.7646, -3.1014],
        [-7.2553,  1.7231, -6.5736, -2.6298],
        [-7.2861,  1.2376, -6.7458, -1.6107],
        [-7.4527,  3.4021, -6.8711, -2.8632],
        [-7.7678,  1.2883, -6.3590, -0.9714],
        [-7.4252,  2.9789, -7.3549, -2.6575]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:07<00:31,  1.33it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06676719337701797, logits - tensor([[-5.8084,  2.4293, -6.3819, -2.8941],
        [-6.9691,  3.2993, -6.6119, -3.3573],
        [-6.7583, -3.2858, -5.8964,  3.8300],
        [-6.0702,  1.7744, -6.3166, -2.4698],
        [-6.6067,  1.9912, -7.7274, -1.2390],
        [-7.1157,  0.8741, -6.7598, -1.4293],
        [-7.1852,  1.6531, -6.1924, -1.6702],
        [-6.1993,  1.5443, -6.9119, -2.0573]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:07<00:30,  1.33it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2598792016506195, logits - tensor([[-8.2551, -1.8509, -5.8618,  2.2933],
        [-6.8426,  0.4153, -5.9891, -0.8139],
        [-5.9333, -4.4411,  2.9422, -2.2763],
        [-6.5088,  2.8918, -7.0206, -2.2624],
        [-6.7890,  1.1766, -6.5422, -1.6214],
        [-7.9149, -1.6309, -6.5597,  2.2509],
        [-6.2351, -4.2943,  2.5488, -2.6280],
        [-5.6831, -3.1059,  1.1895, -1.8621]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:08<00:30,  1.33it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12030866742134094, logits - tensor([[-6.3977,  1.5221, -6.1299, -2.5172],
        [-7.0510,  3.0599, -6.6271, -2.7729],
        [-7.5706,  2.7759, -8.0388, -2.8401],
        [-5.2660, -3.7009,  2.4648, -2.9805],
        [-7.9636,  3.0431, -7.9769, -3.3853],
        [-6.3462,  2.0466, -6.6707, -2.2030],
        [-8.2829,  1.4004, -7.0871, -0.5785],
        [-7.5076,  1.9685, -5.7154, -2.3199]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:09<00:29,  1.33it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21051552891731262, logits - tensor([[-5.7836, -3.4261, -5.8972,  3.9318],
        [-5.5132, -3.3918,  2.2105, -2.2730],
        [-8.6251,  2.3426, -7.1598, -2.2409],
        [-6.4788,  2.5481, -6.9363, -2.1272],
        [-7.8744, -1.4075, -6.4743,  1.1404],
        [-7.9453,  1.9462, -6.3976, -1.9681],
        [-7.0829, -1.9272, -5.5503,  1.0439],
        [-6.6134, -2.0441, -5.4043,  1.9110]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:10<00:28,  1.32it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13913601636886597, logits - tensor([[-7.1576,  2.1796, -6.6506, -2.8230],
        [-6.2568,  1.8475, -6.4553, -0.9532],
        [-7.5833,  2.3939, -6.8044, -1.9909],
        [-6.5521,  2.1568, -7.2103, -2.7853],
        [-7.0924,  1.4556, -7.1344, -0.4104],
        [-6.8233,  2.1032, -7.4711, -2.3600],
        [-9.1165, -0.4238, -7.8729,  1.2951],
        [-7.9746, -4.5486, -6.8005,  4.0781]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:10<00:28,  1.31it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2973725497722626, logits - tensor([[-7.1002, -1.2628, -6.4295,  1.4705],
        [-6.2357,  2.2597, -6.3953, -3.2136],
        [-7.8956,  2.4572, -6.6147, -2.6281],
        [-6.7267,  2.1547, -6.6809, -2.5693],
        [-7.0136,  2.6094, -6.7842, -2.2612],
        [-5.6595, -3.6588,  2.7507, -2.5686],
        [-7.0435, -3.5167, -6.2941,  3.6392],
        [-6.0266,  2.6607, -5.7718, -2.8706]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 253/289 [03:11<00:27,  1.32it/s]

Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09603533148765564, logits - tensor([[-7.7919, -0.0154, -5.8183, -0.0729],
        [-6.9180,  2.9163, -6.4716, -2.8399],
        [-6.2323, -3.0181, -5.1998,  2.4808],
        [-5.5721,  1.5130, -6.9116, -2.1721],
        [-5.5147,  1.6095, -5.3704, -1.6974],
        [-6.7315,  2.5553, -6.5263, -1.8562],
        [-5.7406,  2.7312, -5.5759, -2.7991],
        [-7.0480,  1.3695, -7.5934, -1.4366]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 254/289 [03:12<00:26,  1.32it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.5350524187088013, logits - tensor([[-6.1686,  1.1883, -5.9349, -1.5344],
        [-6.8621,  2.1315, -6.0853, -2.1228],
        [-6.5443,  2.0866, -6.8282, -1.6449],
        [-6.7868, -3.6246,  2.1346, -2.7843],
        [-5.8157,  2.4664, -6.8955, -2.1793],
        [-7.2436,  2.3054, -7.4278, -2.6681],
        [-5.3056, -3.1414,  1.5201, -2.0655],
        [-7.6533, -0.6497, -6.3653,  0.6966]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:13<00:25,  1.32it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2336803823709488, logits - tensor([[-7.0116,  2.4108, -6.8137, -2.2502],
        [-5.3665, -4.2157,  2.5525, -2.9561],
        [-6.9117,  2.3459, -7.3943, -3.2060],
        [-7.5236, -3.0285, -6.8196,  2.7716],
        [-4.9987, -2.9460,  1.9006, -1.4449],
        [-8.1329,  1.6645, -6.4918, -1.1658],
        [-8.3241, -0.1058, -7.8616, -0.6267],
        [-6.1274, -4.2647,  2.1759, -2.4097]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:13<00:24,  1.32it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18121615052223206, logits - tensor([[-6.8984, -2.8252, -6.0445,  1.6440],
        [-7.6667,  2.5412, -7.0323, -2.0231],
        [-7.3816,  2.1860, -6.9728, -2.3336],
        [-6.4725, -4.3947, -5.8087,  4.5569],
        [-6.3435,  2.0926, -6.2361, -2.6151],
        [-6.8443,  1.1603, -6.5484, -0.4153],
        [-5.7148, -3.3771,  1.6645, -2.1006],
        [-7.5162, -3.0688, -6.5566,  3.8407]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:14<00:24,  1.33it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23678326606750488, logits - tensor([[-6.0981, -3.6376, -5.7629,  3.4533],
        [-7.1849,  0.8476, -6.4555, -0.7808],
        [-6.7733,  1.7502, -5.8800, -2.2294],
        [-7.8629, -3.8725, -6.9969,  3.9012],
        [-5.4695, -4.1506,  3.0139, -2.6194],
        [-6.5623, -3.4794, -4.9777,  2.8651],
        [-5.9914, -3.4540,  1.6007, -1.8137],
        [-7.0542,  1.6529, -6.5011, -2.8022]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 258/289 [03:15<00:23,  1.33it/s]

Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21861305832862854, logits - tensor([[-6.2728,  1.7144, -5.6620, -1.8548],
        [-7.0095,  0.0510, -6.9380, -0.2833],
        [-7.6170,  2.4462, -7.2778, -2.0126],
        [-7.0583,  2.3857, -6.5868, -2.4643],
        [-5.8847,  2.2157, -5.7091, -1.7189],
        [-7.1935, -2.3284, -6.3258,  1.7937],
        [-5.3494, -2.7148,  2.1239, -1.6792],
        [-6.9344,  3.3502, -6.5994, -3.4665]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 259/289 [03:16<00:22,  1.32it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06849150359630585, logits - tensor([[-7.7242,  0.1407, -7.8520, -1.0010],
        [-6.5520,  2.7571, -6.4815, -3.3796],
        [-5.5191, -3.4916, -6.8812,  2.8320],
        [-6.9877,  2.4854, -7.3670, -2.9453],
        [-6.9724, -3.4554, -6.2346,  3.7171],
        [-6.1929, -2.3257,  0.7799, -1.2296],
        [-7.5192,  3.5368, -7.7299, -3.0574],
        [-6.8976,  3.2127, -6.9457, -3.6594]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 260/289 [03:16<00:21,  1.33it/s]

Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09702306985855103, logits - tensor([[-5.8665, -3.4682,  1.7051, -2.1461],
        [-7.5388,  2.5863, -6.8747, -2.1206],
        [-8.0334,  1.1097, -6.1653, -2.0150],
        [-5.2247, -2.7285,  1.5810, -2.7583],
        [-7.9422,  3.3403, -7.2346, -3.0740],
        [-7.1068, -5.2362, -0.3579,  0.0242],
        [-6.6433,  2.8322, -7.1592, -2.7985],
        [-6.7345,  3.3945, -6.9718, -3.1415]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|█████████ | 261/289 [03:17<00:21,  1.33it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03488358482718468, logits - tensor([[-7.8801,  2.9298, -7.2185, -2.8836],
        [-6.4300, -3.3313, -4.6585,  3.4844],
        [-7.1482, -4.4445,  1.9438, -2.2021],
        [-6.4715, -4.6541,  2.9895, -3.2323],
        [-6.5584,  3.5866, -6.3171, -3.4816],
        [-6.7228, -2.9449, -5.9553,  2.7552],
        [-7.1823,  1.8908, -6.5934, -1.9490],
        [-6.9195,  2.6472, -6.5664, -2.8156]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:18<00:20,  1.33it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19165253639221191, logits - tensor([[-5.2543,  1.4270, -5.6822, -2.5720],
        [-7.0454,  2.6416, -6.7583, -1.5341],
        [-6.1400,  2.3637, -6.8285, -2.5794],
        [-5.7128,  2.3420, -6.4504, -2.1540],
        [-7.1007,  3.8075, -6.8524, -3.9115],
        [-7.2948,  2.2565, -6.8800, -2.0637],
        [-6.0512, -2.8268,  1.8060, -2.5859],
        [-6.3885,  2.1942, -5.6395, -1.8586]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:19<00:19,  1.33it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2934619188308716, logits - tensor([[-6.1534e+00, -3.8186e+00,  2.5149e+00, -3.3059e+00],
        [-7.2864e+00,  2.2152e+00, -6.8732e+00, -2.4474e+00],
        [-6.7603e+00, -1.8534e-01, -7.0206e+00, -1.3018e-03],
        [-8.1416e+00,  3.3904e+00, -7.2733e+00, -2.4802e+00],
        [-6.4756e+00,  2.9358e+00, -6.6804e+00, -2.9324e+00],
        [-7.7740e+00,  4.7575e-01, -6.3804e+00, -3.5742e-01],
        [-5.5352e+00, -3.5692e+00,  1.4347e+00, -1.6380e+00],
        [-7.7983e+00,  2.1039e+00, -7.1176e+00, -2.0802e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████▏| 264/289 [03:19<00:18,  1.32it/s]

Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09385610371828079, logits - tensor([[-8.0389, -4.5244, -7.2685,  4.2906],
        [-6.2026,  1.8860, -6.6268, -2.4305],
        [-8.5198, -3.1777, -6.6420,  3.6816],
        [-7.1659,  2.2283, -6.9570, -2.3595],
        [-6.2465, -3.6829,  1.8062, -1.5857],
        [-5.9190, -2.3531,  0.5819, -0.7561],
        [-7.0763,  0.2822, -5.9312, -1.2173],
        [-6.0428,  1.2077, -6.2321, -2.1572]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 265/289 [03:20<00:18,  1.32it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21058876812458038, logits - tensor([[-7.6407,  3.2766, -6.8348, -2.7854],
        [-6.6382,  2.8505, -6.6859, -2.8584],
        [-5.6265,  1.7572, -6.4592, -1.9039],
        [-6.8746,  2.9611, -6.5909, -3.1673],
        [-6.9359,  1.9104, -6.3234, -2.5340],
        [-8.9643,  0.0776, -6.7589, -0.2922],
        [-6.7767,  1.7719, -6.8313, -1.7145],
        [-8.2161, -0.5701, -7.0925,  0.6021]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 266/289 [03:21<00:17,  1.32it/s]

Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21304236352443695, logits - tensor([[-5.6225, -3.1012,  0.9813, -1.2451],
        [-7.1423, -1.6059, -7.2346,  1.9664],
        [-5.7224, -2.5353,  0.1607, -2.2086],
        [-7.5963,  2.8588, -6.5424, -1.7667],
        [-6.7649, -1.3605, -5.7080,  1.9527],
        [-7.3127,  2.9659, -7.4688, -3.6539],
        [-6.3441, -2.3297,  1.4357, -2.4872],
        [-6.5494,  2.8734, -6.2100, -2.8555]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 267/289 [03:22<00:16,  1.32it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03187024965882301, logits - tensor([[-5.9877, -4.2120,  2.7776, -2.2091],
        [-6.3258,  2.8058, -6.1675, -3.5424],
        [-6.8949,  2.4235, -7.0617, -2.6278],
        [-6.4842, -3.0291, -5.2807,  2.9369],
        [-6.6220,  2.3546, -6.8637, -2.9796],
        [-7.9841,  3.5408, -8.1409, -3.8878],
        [-6.3739,  2.6936, -6.7924, -2.2351],
        [-6.5509,  2.5893, -5.8007, -3.0534]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:22<00:15,  1.32it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03653769940137863, logits - tensor([[-6.1720,  2.8628, -5.8154, -2.1640],
        [-6.8320,  3.5744, -7.2141, -2.7884],
        [-7.5462,  2.3017, -6.5836, -2.5047],
        [-6.6974,  2.1612, -6.4235, -2.9337],
        [-5.7782, -3.6871,  2.6486, -2.6017],
        [-6.2064,  3.0153, -5.8858, -2.1917],
        [-6.9025,  2.4106, -7.3827, -2.6815],
        [-7.6430,  3.0326, -6.9573, -3.1350]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:23<00:15,  1.32it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 93%|█████████▎| 270/289 [03:24<00:14,  1.33it/s]

loss - 0.030430437996983528, logits - tensor([[-6.7121, -3.9915,  3.2102, -3.6924],
        [-6.3008,  2.2311, -6.3068, -2.4790],
        [-6.3859, -1.9552, -5.8802,  2.7726],
        [-6.9253,  3.3103, -7.1054, -2.5631],
        [-6.6870,  2.4387, -7.1695, -2.8948],
        [-6.3001, -3.6594, -5.6619,  3.4758],
        [-5.8454,  2.7078, -5.5039, -2.3073],
        [-7.1378, -4.0504, -5.5112,  5.2280]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0401187427341938, logits - tensor([[-6.2101,  2.8101, -6.4285, -3.4764],
        [-7.4680, -2.5367, -6.5738,  3.3630],
        [-6.0291, -3.1136,  0.9121, -2.0918],
        [-7.7515, -3.5707, -6.0281,  3.1472],
        [-6.2778,  4.0129, -6.3336, -2.9252],
        [-5.9495, -3.2523,  1.9037, -2.5828],
        [-6.8866,  3.7526, -6.5547, -2.9469],
        [-5.9610,  2.8375, -6.4343, -3

 94%|█████████▍| 271/289 [03:25<00:13,  1.33it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07515392452478409, logits - tensor([[-7.7608, -4.0251, -6.2215,  3.7751],
        [-5.5443, -3.6734,  3.6809, -3.0848],
        [-6.4791, -4.0707,  1.9869, -3.2582],
        [-7.6480,  3.4328, -7.2450, -2.7314],
        [-7.4030,  3.1894, -7.5555, -3.0882],
        [-6.2973, -3.7421,  1.3073, -2.7426],
        [-6.1634,  2.8149, -5.6719, -3.2958],
        [-6.9294,  0.0933, -6.5052, -0.0649]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 272/289 [03:25<00:12,  1.32it/s]

Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07245004922151566, logits - tensor([[-6.1179,  0.6416, -5.5028, -0.2532],
        [-5.9206, -3.0023,  2.0610, -1.4779],
        [-6.6099,  2.3397, -6.1871, -2.5700],
        [-6.1514,  2.7765, -6.7232, -3.1740],
        [-7.2760,  2.3401, -7.9098, -1.9114],
        [-5.8158, -3.1867,  2.1294, -2.7706],
        [-8.1248,  2.8219, -7.2823, -3.5231],
        [-6.9382,  2.7944, -6.6482, -2.6641]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 273/289 [03:26<00:12,  1.32it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3355645537376404, logits - tensor([[-5.6486, -3.1435,  1.7078, -2.4567],
        [-6.9843,  2.9606, -7.6812, -3.0085],
        [-6.6838,  0.8978, -6.3047, -1.1153],
        [-7.1495,  1.7066, -6.7073, -1.5791],
        [-7.2269, -0.3839, -6.4133, -0.0604],
        [-6.7793,  2.9916, -6.8838, -2.6840],
        [-6.4242,  2.5666, -6.7419, -3.0364],
        [-6.7373,  2.2216, -7.5244, -2.4998]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▍| 274/289 [03:27<00:11,  1.32it/s]

Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15461035072803497, logits - tensor([[-6.1778,  3.8961, -6.4906, -4.0818],
        [-6.2719, -1.6018, -4.6171,  0.4437],
        [-7.1897, -3.1964, -1.8381,  0.2779],
        [-7.5012,  2.2428, -6.5119, -2.3276],
        [-6.6655, -4.2660, -5.3412,  3.6111],
        [-6.6453,  1.6685, -5.8980, -2.3678],
        [-6.8313,  0.5556, -4.3413, -1.6386],
        [-7.0359,  3.0679, -6.5025, -2.6578]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▌| 275/289 [03:28<00:10,  1.32it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05065963417291641, logits - tensor([[-7.7993, -4.3569,  1.8771, -2.4635],
        [-7.4406, -3.5373, -6.2579,  3.4054],
        [-7.4635,  3.4864, -7.3038, -2.6117],
        [-6.5389,  0.6946, -4.4779, -0.8170],
        [-7.1102,  3.0904, -6.9404, -3.9022],
        [-5.0224, -3.5239,  2.2400, -2.0398],
        [-7.0475,  3.6784, -6.8486, -3.3169],
        [-6.8023,  3.9621, -7.3575, -3.6441]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:28<00:09,  1.32it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13251084089279175, logits - tensor([[-7.0681,  2.9441, -5.8918, -2.8829],
        [-7.2938,  3.0769, -7.6632, -3.2946],
        [-7.5604,  3.1820, -7.2755, -3.3302],
        [-6.5829,  3.7667, -6.7771, -3.2716],
        [-6.3139,  2.4510, -6.0944, -3.7496],
        [-6.8552,  3.9296, -7.6409, -3.4725],
        [-8.1564,  1.6576, -7.1962, -1.5637],
        [-7.0812,  2.7001, -6.7664, -2.3257]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:29<00:09,  1.32it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13166430592536926, logits - tensor([[-5.8074,  3.3476, -6.1229, -2.9577],
        [-6.8962, -3.8228, -6.0644,  3.6572],
        [-6.7201, -2.7383, -5.2888,  3.0121],
        [-7.6863,  1.4267, -7.7883, -1.6539],
        [-7.0505,  2.6026, -7.0357, -2.8074],
        [-6.4273,  4.2212, -7.0258, -3.6826],
        [-6.3519, -1.6933, -5.6286,  2.1478],
        [-6.0909, -5.0628, -4.9386,  4.6236]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:30<00:08,  1.31it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06687984615564346, logits - tensor([[-6.9025,  2.6302, -6.3760, -3.1486],
        [-6.3235,  2.1973, -6.8749, -2.5404],
        [-5.5604,  2.2712, -6.7082, -1.9030],
        [-6.1610,  2.5564, -6.1464, -3.2614],
        [-7.5287, -4.5573,  1.9815, -2.8028],
        [-6.0042,  2.6764, -6.2158, -2.6450],
        [-7.1154,  3.3863, -6.7321, -3.5058],
        [-5.4384, -2.0828, -0.0360, -1.2750]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 279/289 [03:31<00:07,  1.32it/s]

Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04932059347629547, logits - tensor([[-6.1916, -3.5670,  2.1351, -2.0612],
        [-7.8571,  2.1983, -7.1679, -2.2615],
        [-6.1719, -3.4188,  1.4368, -1.6993],
        [-6.8218, -2.8920, -5.6459,  2.2919],
        [-6.6393,  3.4257, -7.7640, -3.6687],
        [-5.3936, -2.9033,  2.0672, -2.0428],
        [-8.1177,  3.0323, -7.4758, -2.7512],
        [-6.9042, -3.3132, -5.7794,  3.3846]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 280/289 [03:31<00:06,  1.32it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23951585590839386, logits - tensor([[-6.3992, -3.3221,  1.6405, -2.4941],
        [-7.6040, -3.1469, -6.6307,  3.0385],
        [-6.2120,  2.3801, -6.5011, -2.8912],
        [-4.7864, -1.9467,  0.8665, -1.7866],
        [-6.4127,  3.6608, -7.2460, -3.8067],
        [-6.9057,  2.4883, -6.5748, -2.2974],
        [-5.8057, -2.7425,  1.1104, -2.0660],
        [-6.2622, -2.9900,  0.8697, -2.0851]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 281/289 [03:32<00:06,  1.33it/s]

Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04254430904984474, logits - tensor([[-7.5692,  2.4477, -7.7866, -2.4144],
        [-6.1633, -3.4879,  3.0145, -2.7529],
        [-6.9907,  3.8240, -6.3861, -2.5951],
        [-6.6780, -2.8644,  0.8518, -2.0067],
        [-7.1787,  3.8738, -7.6969, -3.2016],
        [-6.9993,  3.1441, -6.7698, -2.5866],
        [-7.1347,  2.7250, -6.5564, -2.5349],
        [-6.8639,  2.7622, -7.0199, -3.4282]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 282/289 [03:33<00:05,  1.33it/s]

Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.40715497732162476, logits - tensor([[-7.2178,  3.0781, -7.5526, -2.5689],
        [-7.8313,  1.9805, -6.8720, -1.4611],
        [-6.1279,  2.5551, -6.3170, -2.4344],
        [-6.1501,  1.6435, -5.6107, -1.3273],
        [-6.6503, -3.9191,  2.0949, -2.6209],
        [-6.7945,  2.3305, -6.5390, -2.9422],
        [-5.3652,  3.1775, -6.2433, -3.1231],
        [-6.1864,  2.9714, -5.7039, -2.3966]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 283/289 [03:34<00:04,  1.33it/s]

Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1537366658449173, logits - tensor([[-7.4586,  1.8161, -7.2845, -1.3709],
        [-8.3489,  2.7695, -6.5156, -2.1365],
        [-6.2338, -4.4174,  2.6016, -3.4166],
        [-7.0626,  3.3771, -6.3296, -2.8598],
        [-7.4925,  3.1740, -6.7466, -2.4421],
        [-6.4859,  1.6602, -5.8002, -1.8873],
        [-7.5570, -1.7903, -6.8283,  1.4343],
        [-5.7930,  3.1336, -6.9228, -2.6124]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 284/289 [03:35<00:03,  1.33it/s]

Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09782851487398148, logits - tensor([[-7.0286, -1.8045, -6.0224,  1.6534],
        [-6.2390,  1.9442, -5.6370, -1.5582],
        [-8.0110,  1.7243, -7.2918, -1.8006],
        [-6.6962, -3.0086, -4.7317,  1.8511],
        [-6.4011,  1.5941, -5.9777, -1.4610],
        [-6.6564,  2.2535, -6.8021, -1.9130],
        [-7.5649, -2.4526, -5.0764,  2.0958],
        [-6.2627, -1.5978,  0.0765, -1.2031]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▊| 285/289 [03:35<00:03,  1.33it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05432002246379852, logits - tensor([[-6.8295,  1.6758, -6.6697, -0.9776],
        [-5.9245, -4.3020, -4.6803,  4.2853],
        [-8.0303,  2.8650, -7.2778, -3.5404],
        [-6.9049,  2.2991, -6.4579, -2.1103],
        [-7.3014,  1.8888, -8.1587, -1.2831],
        [-6.4040,  2.1871, -6.3452, -2.2624],
        [-5.3718,  3.6818, -5.9371, -2.5381],
        [-5.4891, -3.7491,  2.1396, -2.6814]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:36<00:02,  1.33it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21219605207443237, logits - tensor([[-7.4777,  2.1652, -6.3920, -1.0662],
        [-6.3535, -3.5545,  2.6364, -2.8394],
        [-5.7022, -3.3579,  1.6512, -2.2314],
        [-6.6305,  2.5953, -7.0680, -2.2562],
        [-6.8576,  2.5138, -6.0573, -2.7687],
        [-6.9322,  1.4879, -5.9901, -1.1833],
        [-7.7248,  2.7246, -6.6195, -3.1266],
        [-8.1168,  2.1101, -7.3297, -3.0247]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:37<00:01,  1.32it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.042232945561409, logits - tensor([[-5.7595, -4.4493,  3.0625, -3.7506],
        [-6.8561, -4.4156,  2.7524, -3.6543],
        [-7.3944,  2.6509, -6.9932, -2.5729],
        [-6.0969,  1.9166, -7.3908, -2.1261],
        [-6.6683, -4.1095,  2.0703, -3.0901],
        [-7.9742,  3.3027, -7.0509, -3.4807],
        [-6.6422,  3.1715, -6.1720, -2.5256],
        [-7.4431, -1.5181, -5.8458,  1.5096]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:38<00:00,  1.32it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04248876869678497, logits - tensor([[-6.2620,  3.1503, -6.4002, -3.2686],
        [-7.7351,  1.9381, -7.1521, -2.2094],
        [-6.3685,  2.4841, -5.6645, -2.5728],
        [-7.0991, -2.5706, -4.9840,  2.0874]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:38<00:00,  1.32it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:59,  3.22it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:52,  3.66it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:49,  3.84it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:50,  3.78it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:49,  3.85it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:48,  3.91it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:48,  3.89it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.90it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:47,  3.90it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  3.92it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:46,  3.93it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:46,  3.95it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  3.94it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:45,  3.94it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:45,  3.92it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:45,  3.92it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:45,  3.91it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:44,  3.92it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:44,  3.91it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:05<00:44,  3.94it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:43,  3.93it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:44,  3.89it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:43,  3.89it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:44,  3.86it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:43,  3.86it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:43,  3.82it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:43,  3.83it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:43,  3.82it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:42,  3.84it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:42,  3.89it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:08<00:42,  3.86it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:41,  3.88it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:41,  3.92it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:40,  3.92it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:09<00:40,  3.96it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:40,  3.94it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:40,  3.91it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:40,  3.89it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:10<00:39,  3.92it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:39,  3.89it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:39,  3.89it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:39,  3.88it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:11<00:39,  3.83it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:39,  3.82it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:38,  3.88it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:38,  3.87it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:12<00:37,  3.92it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:36,  3.96it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.93it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.97it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:13<00:35,  3.97it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:35,  4.00it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:35,  3.97it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:35,  3.95it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:14<00:35,  3.95it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:34,  3.97it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:34,  3.93it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  3.95it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:15<00:34,  3.92it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:34,  3.94it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:34,  3.89it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.91it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:16<00:33,  3.89it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:33,  3.91it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  3.93it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  3.91it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:17<00:32,  3.96it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  3.95it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  3.92it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:31,  3.93it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:18<00:31,  3.90it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:31,  3.93it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.96it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  3.93it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:19<00:30,  3.95it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:29,  3.97it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.99it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.99it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:20<00:28,  3.99it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  4.00it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  4.00it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:27,  4.01it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:21<00:27,  4.01it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  4.01it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  4.01it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:27,  4.00it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:22<00:26,  4.00it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  3.97it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  3.97it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:26,  3.99it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:23<00:26,  3.96it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  3.98it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  3.97it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  4.00it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:24<00:25,  3.95it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.97it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.94it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.92it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:25<00:24,  3.95it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  3.93it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  3.96it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.95it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:26<00:22,  3.98it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:22,  3.97it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.93it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:27<00:22,  3.95it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:27<00:21,  3.98it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.94it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  3.98it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:28<00:21,  3.96it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:28<00:20,  3.97it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  3.96it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.98it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:29<00:20,  3.96it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:29<00:19,  3.96it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.93it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.97it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:30<00:19,  3.95it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:30<00:18,  3.95it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  3.97it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  3.91it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:31<00:18,  3.94it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:31<00:18,  3.92it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  3.96it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  3.95it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:32<00:17,  3.91it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:32<00:16,  3.95it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  3.98it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  3.98it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:33<00:16,  3.94it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:33<00:15,  3.94it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  3.96it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.94it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:34<00:15,  3.96it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:34<00:14,  3.96it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.99it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.95it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:35<00:14,  3.96it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:35<00:13,  3.98it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  3.93it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.94it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:36<00:13,  3.97it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:36<00:12,  3.97it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.98it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.95it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:37<00:12,  3.97it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:37<00:11,  3.96it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  3.97it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  3.95it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:38<00:11,  3.94it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:38<00:10,  3.92it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  3.93it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  3.89it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:39<00:10,  3.93it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:39<00:09,  3.96it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  3.94it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.95it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:40<00:09,  3.97it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:40<00:08,  3.98it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  3.96it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  3.98it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:41<00:08,  3.98it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:41<00:07,  4.00it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  4.00it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.95it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:42<00:07,  3.91it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:42<00:06,  3.90it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.89it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.87it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:43<00:06,  3.85it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:43<00:05,  3.84it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.84it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.87it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:44<00:05,  3.86it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:44<00:04,  3.87it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.92it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:45<00:04,  3.93it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:45<00:04,  3.93it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:45<00:03,  3.97it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.96it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:46<00:03,  3.94it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:46<00:03,  3.94it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:46<00:02,  3.97it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.96it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:47<00:02,  3.96it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:47<00:02,  3.94it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:47<00:01,  3.97it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  3.98it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:48<00:01,  3.91it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:48<00:01,  3.91it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:48<00:00,  3.94it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  3.95it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:49<00:00,  3.94it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:49<00:00,  3.93it/s]

[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 827, 'tn': 376, 'fp': 134, 'fn': 215}, {'tp': 156, 'tn': 1367, 'fp': 4, 'fn': 25}, {'tp': 183, 'tn': 1025, 'fp': 248, 'fn': 96}]
Detailed accuracy after 5 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.7751288659793815
yes_no accuarcy: 0.9813144329896907
abstractive accuarcy: 0.7783505154639175
Overall accuarcy: 0.8836984536082474
Best accuarcy: 0.899645618556701



  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0282338447868824, logits - tensor([[-7.0308,  2.7746, -6.3399, -3.4442],
        [-6.4900,  3.0889, -7.0495, -2.8296],
        [-5.4552, -3.7996, -5.9068,  2.9476],
        [-7.4614,  1.8947, -6.4943, -2.0963],
        [-7.2866, -3.0905, -5.8409,  4.1542],
        [-6.7869,  3.0208, -8.0422, -3.5390],
        [-6.6229,  3.1417, -6.1087, -3.2635],
        [-6.8579, -2.4289, -5.8438,  2.9078]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:00<03:54,  1.23it/s]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22631922364234924, logits - tensor([[-6.7735,  3.8016, -6.8652, -3.2932],
        [-7.3676, -2.4951, -7.4101,  1.9504],
        [-7.9071,  2.5036, -7.4474, -2.1404],
        [-5.0869, -2.3092, -5.1387,  2.4914],
        [-7.9540,  2.5559, -7.2617, -2.2271],
        [-7.8546,  0.4361, -7.6598, -0.2467],
        [-7.2976,  3.8228, -6.5885, -3.1511],
        [-6.7818,  2.3759, -6.8938, -2.9520]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:01<03:45,  1.27it/s]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03532880172133446, logits - tensor([[-7.4806, -2.2901, -6.6976,  2.3767],
        [-7.5943, -3.0715, -6.6593,  2.5543],
        [-7.2138,  2.3868, -7.2547, -2.6356],
        [-5.9846,  3.6648, -6.5785, -2.3077],
        [-7.4582, -3.4833, -6.5582,  3.1520],
        [-7.6942,  2.1367, -6.6386, -2.2757],
        [-6.7402,  3.5450, -6.1545, -2.3838],
        [-5.8326,  2.5304, -5.5438, -2.9662]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:02<03:39,  1.30it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19428986310958862, logits - tensor([[-8.0195, -0.1043, -7.0780, -0.2113],
        [-5.9995, -3.9747,  1.8988, -1.8445],
        [-6.4767,  2.0963, -6.5610, -2.9934],
        [-6.9163,  2.5648, -7.7151, -2.9020],
        [-7.2924,  3.1940, -5.6029, -3.4317],
        [-7.1174,  3.0339, -6.0675, -2.8180],
        [-6.5153,  2.4325, -6.2786, -1.4244],
        [-5.4671, -4.3815, -5.2120,  3.9474]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:03<03:36,  1.32it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07972487062215805, logits - tensor([[-5.5759, -4.5553, -6.3022,  4.7088],
        [-6.6488,  2.5059, -7.2115, -2.2181],
        [-6.2755, -2.8326, -4.7685,  2.8494],
        [-7.5037, -3.7187, -5.9511,  2.8364],
        [-7.7359, -0.3454, -6.5142,  0.1877],
        [-5.3368,  3.3497, -6.3006, -2.5340],
        [-8.0134,  3.2755, -7.5242, -2.7588],
        [-6.7146,  2.3626, -7.2222, -1.8095]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:03<03:34,  1.33it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.33162009716033936, logits - tensor([[-6.4625,  2.5383, -7.0038, -2.6615],
        [-5.2046, -3.7973,  2.7775, -2.7944],
        [-5.8818, -2.9313, -3.8181,  3.6469],
        [-5.9383, -3.9102, -5.3891,  3.3946],
        [-7.0540, -3.0185,  0.6179, -2.1067],
        [-7.9229,  3.4649, -7.4620, -2.9617],
        [-7.0613,  3.0284, -6.2954, -3.5548],
        [-7.7660, -2.9006, -7.5624,  3.2230]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:04<03:33,  1.32it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03178366273641586, logits - tensor([[-6.4255, -4.1166,  2.9945, -2.9492],
        [-7.2291,  3.1581, -6.4696, -1.9753],
        [-6.2800, -4.6299,  2.6426, -3.4305],
        [-8.1807,  2.9602, -7.1387, -2.9549],
        [-6.7889, -4.8738, -6.0728,  4.5679],
        [-6.4676, -2.6489, -4.8577,  2.2458],
        [-6.7553,  2.8040, -6.5987, -2.9888],
        [-4.9748, -3.5569,  2.2776, -2.6487]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:05<03:32,  1.33it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.054517462849617004, logits - tensor([[-6.3656, -3.9118,  2.7998, -2.4385],
        [-5.1212, -3.4964,  1.9920, -2.2946],
        [-6.0567, -2.1950,  1.1813, -1.9287],
        [-6.2205, -1.6140, -4.3935,  1.4630],
        [-7.2458, -4.6277, -6.2797,  3.6601],
        [-6.7485, -3.7711,  2.6751, -3.2284],
        [-5.2052,  2.4498, -6.0576, -3.4785],
        [-6.4406,  2.9543, -6.1270, -2.7413]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 8/289 [00:06<03:31,  1.33it/s]

Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03109767660498619, logits - tensor([[-6.5847,  3.5401, -6.9688, -2.4155],
        [-7.1773,  2.5539, -6.1282, -2.4702],
        [-6.4350, -3.4034, -5.9170,  3.1311],
        [-6.6970, -4.2262,  2.9513, -3.1957],
        [-6.9696,  3.0991, -6.9126, -2.8610],
        [-6.0328,  2.2400, -6.2425, -2.5456],
        [-8.1426,  2.8825, -8.2738, -2.9280],
        [-7.4612,  2.4517, -6.0710, -2.8233]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 9/289 [00:06<03:29,  1.33it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23652023077011108, logits - tensor([[-7.9432, -0.0173, -6.8285, -0.3548],
        [-6.5511,  2.4179, -6.3858, -2.5093],
        [-6.5369,  3.3982, -7.0377, -3.9932],
        [-7.3353, -2.5948, -5.5999,  2.2834],
        [-6.3882, -5.2553, -5.1655,  2.9110],
        [-6.8205,  3.1212, -7.3496, -2.9718],
        [-5.7648, -2.4460,  1.5086, -1.7885],
        [-5.0840, -2.7925,  1.3270, -1.7501]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 10/289 [00:07<03:29,  1.33it/s]

Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07272152602672577, logits - tensor([[-6.7589,  1.8842, -6.4488, -2.3786],
        [-5.6728,  3.1693, -6.6004, -3.0590],
        [-7.5778,  2.1499, -6.9047, -2.2274],
        [-6.8056,  2.9899, -7.1366, -2.8767],
        [-8.1275, -1.1203, -7.0535,  0.6835],
        [-6.3808,  2.5046, -5.9561, -2.3697],
        [-5.7166, -2.9101,  1.0694, -1.3619],
        [-7.6271, -2.0711, -5.7949,  2.0909]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 11/289 [00:08<03:29,  1.32it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.035739876329898834, logits - tensor([[-6.1152,  3.6924, -7.1810, -4.0433],
        [-7.6349,  1.5036, -7.0817, -1.4251],
        [-5.8451, -4.6781, -5.8183,  4.5662],
        [-7.0751,  4.0018, -8.0882, -3.4909],
        [-5.8229,  3.0140, -7.0888, -2.8060],
        [-5.5899,  2.6577, -6.3671, -2.9083],
        [-6.4536,  2.2578, -6.2618, -2.3917],
        [-5.7822, -3.3990,  2.2915, -3.0357]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 12/289 [00:09<03:29,  1.32it/s]

Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22712258994579315, logits - tensor([[-6.9045,  1.7099, -6.5886, -1.3187],
        [-6.0368, -3.6213,  2.7405, -3.1205],
        [-6.4108,  2.8256, -7.3666, -2.8288],
        [-7.1499,  2.5447, -6.5407, -2.4925],
        [-5.6709, -3.7971,  2.2574, -2.9985],
        [-6.0746,  3.5163, -7.0259, -2.9683],
        [-6.6000,  1.5817, -5.7753, -2.2030],
        [-6.7812, -3.9790, -5.5215,  4.1304]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 13/289 [00:09<03:27,  1.33it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07489590346813202, logits - tensor([[-6.8032,  3.4699, -7.0029, -3.5329],
        [-6.0092,  2.3580, -5.6804, -2.7103],
        [-6.5646, -2.2151, -6.2142,  2.1613],
        [-6.3319, -2.3812, -5.6497,  3.3636],
        [-7.0430, -0.5834, -5.6753,  0.1869],
        [-6.2459,  0.9923, -4.0595, -1.5235],
        [-7.1196,  2.8950, -7.3488, -2.8247],
        [-5.9350,  2.4117, -5.6443, -3.1356]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:10<03:27,  1.33it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


  5%|▌         | 15/289 [00:11<03:26,  1.33it/s]

loss - 0.024079248309135437, logits - tensor([[-7.1649,  3.1127, -6.9741, -3.5585],
        [-7.4268,  2.9372, -8.2126, -3.2228],
        [-6.8275,  3.4595, -6.5746, -3.4880],
        [-7.5352, -1.9304, -5.0510,  1.9523],
        [-5.7177,  3.0874, -6.3055, -3.5919],
        [-6.9127,  3.2671, -6.7895, -3.2577],
        [-7.0146,  3.0484, -6.7901, -3.7139],
        [-7.0555, -4.3799, -6.7998,  3.6402]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.028973940759897232, logits - tensor([[-8.1812,  3.6934, -8.0181, -3.3297],
        [-8.8033, -2.8694, -5.9192,  3.6160],
        [-6.0494,  2.3196, -7.1855, -3.3992],
        [-6.9842, -4.1552,  2.1149, -3.0585],
        [-6.5816, -3.2471,  2.2353, -3.0411],
        [-7.0099,  3.5998, -7.0939, -3.2352],
        [-6.7270,  3.2586, -7.0725, -2.3964],
        [-7.0495,  3.5758, -6.7506, -

  6%|▌         | 16/289 [00:12<03:25,  1.33it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03543409705162048, logits - tensor([[-6.4393,  2.2900, -6.8298, -3.4065],
        [-7.3174,  3.0658, -6.9290, -3.0519],
        [-6.6905,  2.9996, -6.7892, -3.0556],
        [-7.0306,  2.7654, -7.0914, -3.2060],
        [-6.2728,  2.4206, -6.6649, -2.7562],
        [-6.0305,  2.8282, -6.8597, -2.2045],
        [-6.2441,  2.2094, -6.0828, -2.8744],
        [-5.9040, -3.6328,  2.0749, -2.4417]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:12<03:26,  1.32it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.032352298498153687, logits - tensor([[-6.4970,  3.4839, -6.7825, -3.0701],
        [-7.4001,  2.6646, -7.2811, -3.0935],
        [-6.2697,  3.5622, -7.6594, -2.8533],
        [-6.8367, -4.2389,  2.6997, -2.6483],
        [-6.2326, -3.4172,  2.2600, -1.9558],
        [-6.8890,  3.4531, -6.9796, -3.1088],
        [-6.5197,  3.1596, -7.1798, -2.7341],
        [-7.7980,  1.9824, -7.1337, -3.5158]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:25,  1.32it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.032455116510391235, logits - tensor([[-6.3216,  2.8761, -7.0223, -4.1186],
        [-5.1489,  3.2676, -5.8707, -2.7806],
        [-6.6707,  3.3097, -6.6994, -3.0816],
        [-6.0475, -3.5808, -5.2526,  3.5698],
        [-6.1327,  2.0992, -5.9652, -2.1491],
        [-6.5653,  3.3328, -6.7237, -3.0160],
        [-5.9208, -3.4646,  2.3984, -2.2832],
        [-6.9641,  2.6701, -6.2228, -2.1600]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:24,  1.32it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3198925256729126, logits - tensor([[-6.2081,  2.9622, -6.8425, -2.6455],
        [-6.8352, -3.0771,  0.9774, -1.1758],
        [-6.3130,  0.9020, -6.0423, -1.3670],
        [-7.6034,  0.6601, -6.4067, -1.5985],
        [-6.5769, -3.9420, -4.6777,  4.5992],
        [-6.3212, -2.9729, -3.0721,  1.5953],
        [-6.6682, -3.4115, -4.6791,  3.7066],
        [-5.4908, -3.9493, -5.6746,  3.7347]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:23,  1.32it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2101941704750061, logits - tensor([[-7.3928,  2.7242, -6.6603, -2.1254],
        [-6.3449, -2.9826,  1.3013, -1.9063],
        [-5.8082, -1.5135, -5.4036,  0.9982],
        [-6.3723,  2.2976, -6.2282, -3.6868],
        [-6.8462, -3.4637, -6.2861,  3.7592],
        [-6.4856,  0.2602, -6.4862,  0.6052],
        [-7.1472,  2.8909, -7.6173, -3.3476],
        [-5.3049,  1.8417, -5.7035, -1.9223]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:15<03:24,  1.31it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.027779031544923782, logits - tensor([[-6.4962,  3.0808, -6.2098, -2.6828],
        [-7.3775,  2.9295, -7.0233, -2.3909],
        [-8.3430, -2.9729, -5.9427,  2.9336],
        [-8.2395,  3.7238, -8.6450, -3.0395],
        [-7.4024,  2.9661, -7.6468, -2.5666],
        [-7.9414,  2.4239, -7.3844, -2.7123],
        [-6.5952,  3.5640, -6.8989, -2.4956],
        [-6.7754,  3.5933, -7.5294, -3.1054]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:22,  1.32it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20520532131195068, logits - tensor([[-7.1836,  2.8388, -7.2788, -2.5753],
        [-7.7083,  2.6606, -6.5511, -2.9388],
        [-7.1124, -1.8923, -6.1386,  2.0822],
        [-7.6565,  3.2083, -7.0165, -2.6253],
        [-5.8667, -4.0491,  1.9569, -2.0489],
        [-6.1756, -3.6371, -5.3565,  3.8790],
        [-5.6865,  2.5910, -5.8761, -3.0574],
        [-6.2179, -3.4601, -6.0689,  3.1560]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:22,  1.32it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03184613585472107, logits - tensor([[-7.3062,  3.3386, -7.6105, -3.4018],
        [-6.2249,  3.1341, -6.6310, -2.8640],
        [-7.3196,  2.7141, -6.8399, -2.6143],
        [-6.1753,  2.3863, -6.7643, -3.2107],
        [-7.7320,  1.9094, -6.7489, -2.0893],
        [-7.0342,  3.0847, -6.3598, -3.2272],
        [-7.4451,  3.8600, -7.7829, -2.8722],
        [-6.3325,  3.1689, -6.4613, -2.0823]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:21,  1.32it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07727371156215668, logits - tensor([[-6.9025,  3.2629, -8.0815, -2.9452],
        [-6.0929,  0.9813, -6.4981, -1.6431],
        [-7.7756,  2.8985, -7.0837, -2.7770],
        [-5.8245, -4.8378,  3.3633, -3.1403],
        [-6.5120,  4.0302, -7.2384, -3.0361],
        [-7.3335,  2.1729, -7.5298, -3.7385],
        [-5.5785, -2.4842,  0.3825, -1.5634],
        [-6.9530,  1.1784, -8.1162, -0.6550]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:18<03:20,  1.32it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.028175275772809982, logits - tensor([[-7.3527,  2.9949, -6.7759, -3.8139],
        [-6.7467,  3.7716, -6.4186, -3.8834],
        [-6.7155, -2.6746, -4.4998,  2.2522],
        [-7.1280,  3.2888, -6.2685, -3.5431],
        [-6.3942,  3.0859, -6.9639, -3.0863],
        [-6.6554,  3.8969, -6.9636, -4.0286],
        [-8.1357, -1.3970, -6.8352,  2.5123],
        [-6.7443, -2.8495, -5.9994,  3.0943]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:19,  1.32it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03496275469660759, logits - tensor([[-7.0863,  2.7911, -6.7602, -3.7100],
        [-6.7130,  3.0929, -6.9201, -4.1599],
        [-7.4068,  2.1084, -6.8909, -1.9257],
        [-6.2790,  2.0849, -6.5361, -2.7157],
        [-7.9768, -2.4179, -6.8869,  2.4123],
        [-6.5729,  2.7830, -6.8160, -2.9403],
        [-6.4605,  3.7251, -6.8649, -3.1298],
        [-5.3582,  3.0282, -5.0578, -2.0590]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 27/289 [00:20<03:18,  1.32it/s]

Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14409087598323822, logits - tensor([[-6.5767,  3.5608, -6.6472, -2.7274],
        [-7.0014,  2.9350, -6.8416, -2.9003],
        [-5.8917, -3.6915,  2.0411, -2.4474],
        [-5.2009, -3.7157,  2.5668, -2.3459],
        [-6.7646,  2.0337, -6.7090, -1.1281],
        [-5.8703,  2.0852, -5.8785, -2.8940],
        [-6.7337,  2.8198, -5.9322, -1.9736],
        [-6.9759,  3.4677, -7.6128, -4.2631]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|▉         | 28/289 [00:21<03:17,  1.32it/s]

Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20445384085178375, logits - tensor([[-6.0186,  2.3262, -6.2809, -3.4588],
        [-7.1750,  2.5507, -6.3030, -2.5214],
        [-5.2411, -2.1040, -5.2986,  2.4007],
        [-7.4157, -0.2220, -7.0040,  0.4409],
        [-5.9196,  1.6569, -5.5542, -2.5268],
        [-6.1698, -3.9117,  2.0838, -2.1549],
        [-6.0993,  4.0360, -6.1826, -2.6264],
        [-7.4035,  3.1454, -7.5382, -2.4880]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 29/289 [00:21<03:16,  1.32it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03646177053451538, logits - tensor([[-6.0706, -3.2515,  2.8632, -2.2641],
        [-6.7226,  3.9974, -7.7939, -3.7710],
        [-6.3470,  3.3238, -5.4625, -3.2082],
        [-6.3780,  3.3805, -7.1052, -3.0453],
        [-8.0275,  2.1242, -7.6191, -2.8153],
        [-6.9588,  3.8372, -6.7740, -3.5526],
        [-7.5192,  1.9484, -6.9420, -0.9597],
        [-6.9999,  2.9339, -6.6493, -3.5098]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:15,  1.32it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.035069823265075684, logits - tensor([[-5.7976, -4.0071,  2.2225, -2.3835],
        [-7.3262,  2.5418, -7.5861, -2.5786],
        [-6.3377, -3.6967,  2.1681, -2.7531],
        [-7.6642,  2.7191, -7.2251, -3.4313],
        [-6.2518,  3.0829, -7.4094, -2.5777],
        [-6.6931, -5.1629, -5.7609,  4.8170],
        [-5.8117,  2.7449, -5.6610, -2.7466],
        [-6.7439,  2.1480, -6.8767, -2.4276]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 31/289 [00:23<03:15,  1.32it/s]

Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05926353111863136, logits - tensor([[-6.6214,  2.4850, -6.3744, -1.7501],
        [-6.6868,  2.1011, -5.8901, -1.7773],
        [-7.8215, -1.9142, -2.3581,  0.3363],
        [-7.1062, -2.7851, -5.9410,  2.3546],
        [-6.1860,  3.5979, -5.7371, -3.6128],
        [-6.0214, -4.0601,  1.8297, -2.5142],
        [-7.8094,  3.7983, -7.5938, -2.8285],
        [-7.3195,  3.6425, -6.8649, -3.1261]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 32/289 [00:24<03:14,  1.32it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03834572806954384, logits - tensor([[-5.2361, -3.3665,  1.6728, -1.2049],
        [-6.5517,  2.7976, -6.2817, -3.6676],
        [-6.2171,  3.9636, -7.0466, -3.0580],
        [-6.1255, -3.9339,  2.9457, -2.8452],
        [-7.3878,  3.0091, -6.4446, -2.7867],
        [-7.2501,  3.3158, -7.3000, -2.7719],
        [-5.5916, -3.7987,  2.3546, -3.5227],
        [-6.6803,  2.3936, -6.4506, -3.6627]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:25<03:14,  1.32it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17213931679725647, logits - tensor([[-7.2609,  3.4301, -7.2639, -2.3489],
        [-6.8715,  3.3950, -6.8506, -3.1531],
        [-5.3791, -3.9455,  2.3919, -2.7885],
        [-6.9104,  2.0587, -6.9544, -1.9269],
        [-5.6824, -3.4972,  1.6929, -2.4281],
        [-7.4559, -3.0235, -6.2576,  2.4415],
        [-7.1413,  2.4555, -6.1961, -2.4869],
        [-6.7675,  2.2942, -6.5114, -2.5764]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:13,  1.32it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1978566199541092, logits - tensor([[-7.2879,  2.4620, -7.8684, -2.8738],
        [-8.2446,  3.1210, -7.6270, -3.3569],
        [-6.4012,  2.7665, -6.8582, -3.6015],
        [-7.0683, -3.8745, -6.2220,  4.0541],
        [-6.2169, -4.4190,  4.1455, -3.0059],
        [-5.2007,  2.5008, -5.8836, -1.9059],
        [-6.2685,  2.9988, -7.3768, -2.8312],
        [-7.9703,  2.4735, -7.5908, -2.9666]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 35/289 [00:26<03:12,  1.32it/s]

Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19253118336200714, logits - tensor([[-6.5025, -2.9099,  1.6663, -1.5375],
        [-7.5976,  2.8588, -7.0931, -1.5963],
        [-8.3724, -1.7831, -7.5466,  0.9856],
        [-5.6688, -2.3932, -5.5013,  3.0035],
        [-6.7328,  2.2276, -6.5001, -3.1670],
        [-6.7140, -2.4056, -4.8990,  2.5869],
        [-7.4132, -3.7970, -6.6508,  3.5514],
        [-5.4458,  3.6729, -5.6881, -3.6233]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 36/289 [00:27<03:12,  1.32it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14423716068267822, logits - tensor([[-6.2293, -3.8645,  1.2250, -1.4552],
        [-5.9240, -3.4508,  2.2463, -2.8691],
        [-6.4629, -3.9467,  2.9158, -2.3237],
        [-6.9954,  3.0100, -7.3428, -3.6904],
        [-7.2026, -0.7536, -6.6543,  1.4165],
        [-7.3188,  3.1237, -7.5120, -2.0410],
        [-6.9689,  3.7041, -7.5082, -3.2346],
        [-6.6101, -2.2789, -6.2262,  2.5117]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:28<03:11,  1.32it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11438663303852081, logits - tensor([[-6.0607,  3.0660, -6.6624, -3.2972],
        [-6.3898,  1.5078, -5.8203, -0.9454],
        [-6.1271,  3.5005, -7.5002, -3.2561],
        [-6.5082,  3.0532, -7.1379, -3.5256],
        [-6.1612, -4.0630, -6.6113,  3.4655],
        [-7.3360,  2.4381, -7.4163, -2.4315],
        [-5.8597,  2.5652, -7.1032, -2.5793],
        [-7.1681,  3.1972, -7.1378, -3.5573]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 38/289 [00:28<03:10,  1.32it/s]

Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03865253925323486, logits - tensor([[-6.9384,  2.9954, -6.4095, -3.5155],
        [-6.3657,  2.5073, -5.3451, -1.3761],
        [-7.2284, -2.3398, -6.4276,  2.9691],
        [-6.3910, -3.1751, -4.6429,  4.2279],
        [-6.7212,  2.3508, -6.1288, -2.6447],
        [-6.0453,  3.1834, -6.2934, -2.7209],
        [-7.6293,  2.6389, -7.2255, -3.1782],
        [-6.9207,  2.3763, -5.8884, -1.7523]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 39/289 [00:29<03:10,  1.31it/s]

Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1995585858821869, logits - tensor([[-8.1062, -2.6597, -5.1440,  2.9113],
        [-6.0843,  3.1332, -6.3824, -3.2224],
        [-5.8103,  3.2403, -6.3531, -3.4230],
        [-7.2553,  3.2797, -7.9134, -2.7285],
        [-7.9813,  2.9236, -7.3290, -2.5079],
        [-5.2669, -4.5688,  3.1802, -3.8062],
        [-7.1874,  3.7093, -7.0179, -2.9951],
        [-5.8397,  2.9306, -6.1136, -2.6569]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 40/289 [00:30<03:10,  1.31it/s]

Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.176949143409729, logits - tensor([[-6.2330,  2.3452, -6.4630, -3.8978],
        [-8.5949, -0.1724, -6.5722,  0.6365],
        [-4.7407, -4.1888,  2.3723, -1.9974],
        [-7.7671,  2.5522, -7.6820, -2.4978],
        [-7.6167,  3.5231, -7.7888, -3.0438],
        [-6.5382,  3.5546, -6.9224, -2.5375],
        [-6.0902, -3.2873, -5.1170,  3.9246],
        [-6.4002, -1.5732, -4.5983,  1.9627]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 41/289 [00:31<03:09,  1.31it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07428137958049774, logits - tensor([[-6.6732,  3.7067, -6.5939, -3.0998],
        [-6.0230, -1.0328, -5.3700,  1.5688],
        [-5.7577, -4.3461,  3.0411, -3.7013],
        [-5.5461, -4.2573,  3.3471, -3.3538],
        [-6.4181,  2.4996, -5.5199, -2.4073],
        [-8.3901,  0.6881, -7.5708, -1.0200],
        [-7.9363, -2.0507, -6.5789,  2.1513],
        [-8.2506,  1.2363, -6.7102, -1.2347]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:31<03:07,  1.32it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10053039342164993, logits - tensor([[-6.3043,  2.5857, -5.9135, -2.3413],
        [-6.5119,  2.8630, -6.2490, -3.6883],
        [-5.5333,  3.1750, -6.9728, -3.4895],
        [-7.5857,  3.1712, -7.0353, -2.4921],
        [-6.7912,  1.3740, -7.3492, -1.0070],
        [-7.2885,  2.7891, -6.4462, -3.1817],
        [-7.8511,  0.7378, -7.3967, -0.4193],
        [-6.7627,  3.3484, -6.3041, -3.3822]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:32<03:07,  1.31it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03427698835730553, logits - tensor([[-6.8520,  2.5810, -6.4776, -3.0017],
        [-6.6312,  2.9352, -6.6690, -2.9183],
        [-7.1062,  2.4601, -6.7987, -2.4781],
        [-6.7439, -4.4102, -6.0940,  3.5625],
        [-7.3476,  1.7063, -7.0410, -1.1859],
        [-7.4274,  2.7676, -7.5513, -2.7522],
        [-6.2163, -4.3374, -4.9306,  4.0903],
        [-6.8917,  3.5925, -7.5034, -3.4391]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:33<03:06,  1.32it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04893547296524048, logits - tensor([[-6.1877,  1.4869, -5.6764, -1.9064],
        [-4.9897, -3.5687,  2.6662, -2.4850],
        [-7.4687,  1.8546, -6.5712, -2.0890],
        [-5.8854, -3.5763,  2.9318, -2.2276],
        [-6.5380,  2.5701, -6.0489, -2.0513],
        [-6.8732, -5.3380,  3.4567, -3.1830],
        [-6.2922,  2.3401, -6.8003, -2.4234],
        [-6.7943,  3.1414, -6.6013, -2.3995]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:34<03:05,  1.32it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.031092830002307892, logits - tensor([[-6.7419,  3.5542, -6.7034, -2.8994],
        [-6.6040,  2.5389, -6.7359, -2.6863],
        [-7.6422,  2.9091, -7.5765, -2.9512],
        [-6.8049, -2.3714, -6.2424,  2.6760],
        [-7.1705, -3.5505, -5.7436,  3.2691],
        [-7.2526,  3.3916, -6.2956, -3.0713],
        [-6.8690, -4.2570,  1.9588, -2.7882],
        [-7.4943,  2.7745, -7.4980, -2.4413]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 46/289 [00:34<03:04,  1.32it/s]

Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05441088601946831, logits - tensor([[-6.0362,  2.9592, -6.1441, -3.0915],
        [-7.1016, -3.4085, -7.0207,  3.7938],
        [-7.4343,  2.9957, -7.3037, -3.4316],
        [-6.1139,  3.6663, -7.4118, -3.2073],
        [-6.1788, -2.0771, -6.2082,  2.4208],
        [-5.1813, -1.2475, -4.2329,  0.4357],
        [-5.7660, -2.1870, -6.3600,  1.2399],
        [-7.0374,  2.9900, -7.3229, -3.1765]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▋        | 47/289 [00:35<03:04,  1.31it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.025445587933063507, logits - tensor([[-7.4540,  3.6673, -7.3590, -2.2614],
        [-6.9047,  3.0931, -7.1757, -3.1579],
        [-6.8248,  2.4281, -7.1780, -3.4365],
        [-6.9017, -4.2123,  2.5445, -3.0118],
        [-7.3165, -3.4950, -6.3251,  3.4644],
        [-6.5445, -2.5567, -5.0998,  2.8914],
        [-5.7916,  2.8780, -5.3460, -3.3423],
        [-6.1672, -4.8036, -5.5902,  3.3976]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:36<03:03,  1.31it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12172900140285492, logits - tensor([[-5.0754, -4.2064,  2.8543, -3.0452],
        [-6.5412,  0.1155, -6.1552, -0.5570],
        [-6.0740, -2.9095, -5.6481,  2.8039],
        [-5.8104,  3.5304, -6.0592, -2.3546],
        [-6.5621,  2.8521, -6.3372, -1.9684],
        [-6.9188,  1.5510, -6.3946, -1.1237],
        [-6.4417,  0.0281, -6.2386, -1.0967],
        [-5.3795,  2.8184, -6.6055, -2.7515]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:37<03:02,  1.31it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08438161015510559, logits - tensor([[-6.8626, -2.8724, -0.1722, -0.5673],
        [-6.5945, -3.4768, -5.5291,  4.0852],
        [-6.8687, -2.8022, -6.0204,  3.0878],
        [-6.3835, -3.7294,  1.2938, -2.5204],
        [-7.9741,  2.7034, -7.1078, -2.7837],
        [-5.4494,  3.4293, -6.2193, -2.9276],
        [-6.9317,  3.1435, -6.8947, -3.1208],
        [-7.8344,  2.3774, -6.8041, -2.0731]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 50/289 [00:37<03:01,  1.32it/s]

Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04738955944776535, logits - tensor([[-6.8614, -4.1464, -6.1161,  3.7018],
        [-5.8771, -2.8926,  1.3842, -1.8318],
        [-6.1046,  2.5205, -6.9594, -2.6003],
        [-7.1506,  2.6004, -6.2851, -1.9369],
        [-6.7965,  2.7088, -6.5701, -3.1483],
        [-6.6242,  1.8443, -6.2231, -1.9375],
        [-6.6181,  2.6199, -5.8096, -3.2131],
        [-5.3595,  2.1306, -5.6838, -2.8709]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 51/289 [00:38<03:00,  1.32it/s]

Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.025806430727243423, logits - tensor([[-7.0463,  3.1008, -6.4856, -2.3620],
        [-7.0846,  4.3781, -6.3369, -4.2341],
        [-5.8359, -3.9865,  2.8686, -3.2550],
        [-7.5467,  2.0555, -7.5959, -1.8809],
        [-6.3023, -3.8914, -5.7312,  3.6995],
        [-6.7339,  3.0420, -6.2884, -2.6561],
        [-7.0458,  3.5950, -6.9907, -2.8563],
        [-6.7382, -4.4069, -5.5959,  4.3329]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 52/289 [00:39<02:59,  1.32it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.36302486062049866, logits - tensor([[-7.3150,  2.5409, -7.3874, -3.2421],
        [-6.4834,  1.6849, -5.5100, -1.7767],
        [-6.0748, -3.0205,  1.3377, -1.7599],
        [-8.2548,  3.2902, -7.1960, -3.4019],
        [-6.7486, -3.0282, -6.2569,  4.3481],
        [-6.4002,  3.1246, -6.5229, -3.4255],
        [-6.8857,  2.8507, -6.5391, -2.5702],
        [-6.5695, -3.0561, -5.9528,  3.1741]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:40<02:58,  1.32it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12903963029384613, logits - tensor([[-6.6961,  1.3218, -6.7597, -1.4012],
        [-5.8363,  3.0241, -5.9871, -2.9201],
        [-6.4139,  3.3083, -7.6335, -3.8300],
        [-6.2152, -3.1350,  0.9000, -1.8403],
        [-8.5711,  1.4757, -7.5209,  0.1616],
        [-5.8572, -3.2192,  1.4093, -1.5736],
        [-7.0618,  3.7127, -6.6760, -3.6779],
        [-7.4940,  2.5342, -6.6002, -2.2962]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:40<02:57,  1.32it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07569696009159088, logits - tensor([[-6.0123,  3.4126, -6.0442, -3.3383],
        [-6.8606,  0.2879, -6.4380, -0.2929],
        [-6.0306, -3.2832,  1.3530, -1.8864],
        [-6.0887,  2.5676, -6.3030, -2.6115],
        [-6.5839,  2.3341, -7.2093, -2.8224],
        [-6.9506,  2.8546, -7.1319, -3.7302],
        [-6.1412, -3.1842,  2.0720, -1.6726],
        [-7.2132,  3.3054, -7.0598, -2.6745]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:41<02:57,  1.32it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11827471852302551, logits - tensor([[-5.1376, -3.7915, -5.7600,  4.0743],
        [-6.1536, -3.4259,  1.8176, -2.0662],
        [-5.0946, -3.2976,  2.0505, -1.9169],
        [-7.4023,  2.6484, -6.3209, -3.7198],
        [-8.1083,  4.1583, -7.3285, -3.7100],
        [-6.7430,  1.3322, -5.6845, -0.8879],
        [-7.7629,  3.8524, -7.4623, -3.6882],
        [-6.1696, -4.0118,  2.7870, -2.6755]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:42<02:56,  1.32it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04554605111479759, logits - tensor([[-6.1246, -4.2548, -4.9830,  4.0289],
        [-6.1166,  3.3544, -7.3113, -3.3580],
        [-7.0196,  1.6417, -6.9991, -1.9464],
        [-7.1233,  2.5867, -6.2072, -2.3524],
        [-6.4581,  3.0821, -6.9000, -2.5628],
        [-6.9449, -2.1863, -6.3705,  2.7152],
        [-5.8097, -3.0672,  2.1212, -2.1748],
        [-6.6744, -2.4922, -5.3846,  1.4166]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:43<02:56,  1.32it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 20%|██        | 58/289 [00:43<02:55,  1.32it/s]

loss - 0.15344315767288208, logits - tensor([[-7.1591, -2.6144, -6.3062,  2.6148],
        [-5.8839,  2.3784, -5.7873, -2.1112],
        [-6.4193, -3.7857, -5.9951,  4.0494],
        [-5.2666, -2.5332,  0.8168, -1.6235],
        [-7.4557,  2.9741, -6.4305, -2.6471],
        [-7.3379, -2.7761, -6.1272,  3.0789],
        [-7.1691, -2.5040, -5.6861,  2.4991],
        [-8.1849,  2.5872, -7.1195, -2.5928]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.042601361870765686, logits - tensor([[-5.8773,  2.9756, -6.3013, -1.8610],
        [-6.2480,  3.4934, -6.7775, -2.5709],
        [-6.7729, -3.5995,  0.9204, -1.5108],
        [-6.2570,  3.2462, -5.9017, -2.7907],
        [-5.5541,  3.0200, -5.5548, -2.7187],
        [-6.8697,  3.1003, -6.5530, -3.3621],
        [-6.3464,  2.9456, -6.4387, -3.3043],
        [-6.0114,  3.1836, -5.7853, -2

 20%|██        | 59/289 [00:44<02:54,  1.32it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.021664194762706757, logits - tensor([[-5.5737,  2.7644, -7.2657, -2.5778],
        [-6.8216, -3.9824, -5.3319,  4.8795],
        [-7.1197,  2.7741, -8.1611, -3.2552],
        [-5.7544,  3.6668, -6.6476, -3.3748],
        [-7.4569,  3.0927, -7.1062, -2.8701],
        [-6.2145, -4.4493,  3.5208, -2.9582],
        [-6.5001, -3.9309,  3.2689, -2.8971],
        [-7.0069,  3.4943, -6.6092, -3.8423]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:45<02:53,  1.32it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.039761193096637726, logits - tensor([[-7.0267,  2.5212, -6.9680, -3.1962],
        [-6.1978,  3.1330, -5.5672, -3.2095],
        [-7.1583, -3.1624, -6.6125,  3.2014],
        [-4.4197, -3.6028,  2.6477, -2.5038],
        [-7.6329, -3.6678, -5.8674,  3.8320],
        [-7.0283,  0.9256, -6.3785, -1.3784],
        [-6.9316, -4.5671, -5.7080,  3.4649],
        [-5.5058,  2.8344, -5.8271, -2.5512]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:46<02:52,  1.32it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02835516631603241, logits - tensor([[-6.7013, -2.3454, -5.4076,  3.5143],
        [-5.9473, -3.7237,  3.6148, -2.8563],
        [-6.3785,  3.7151, -6.4307, -3.4681],
        [-7.0272,  2.6090, -7.2936, -2.3468],
        [-5.6792,  3.3094, -6.3766, -3.7462],
        [-6.6401, -3.6419, -5.8608,  4.4497],
        [-6.0844, -4.8086,  2.5926, -2.3579],
        [-6.0597, -2.5325, -6.2857,  2.3592]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██▏       | 62/289 [00:47<02:51,  1.32it/s]

Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17445620894432068, logits - tensor([[-6.8769, -4.5407,  2.3512, -2.2215],
        [-7.0215,  3.2430, -7.2068, -2.5770],
        [-6.0989, -3.7818,  2.9716, -2.5145],
        [-6.2442,  3.2123, -6.6298, -3.2340],
        [-8.4752,  1.5425, -7.3568, -2.3058],
        [-7.9646,  2.2608, -7.0937, -2.0834],
        [-6.2916, -4.5692,  3.2480, -3.0222],
        [-6.1424,  3.5502, -6.8612, -2.9890]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 63/289 [00:47<02:51,  1.32it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1674104928970337, logits - tensor([[-7.5417, -3.3129, -6.5852,  3.1062],
        [-6.5667, -3.5502,  2.2711, -2.6671],
        [-5.6053,  3.1403, -6.1972, -3.0594],
        [-7.9049, -1.6275, -6.8458,  2.1563],
        [-7.6470,  1.8810, -7.5739, -2.2647],
        [-7.3539,  2.2192, -7.1213, -3.0956],
        [-7.9181,  3.5129, -7.7303, -2.6551],
        [-6.4027,  3.2725, -5.8873, -3.8832]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:48<02:50,  1.32it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03760761395096779, logits - tensor([[-6.2220,  2.7797, -6.2618, -2.5427],
        [-6.1630, -4.0874,  1.8845, -2.2325],
        [-6.9229, -2.6245, -6.5329,  2.6351],
        [-5.9041,  3.0459, -5.9363, -2.3199],
        [-5.8889, -4.4778,  2.7897, -2.7684],
        [-5.7312, -4.0202, -5.6322,  3.0856],
        [-7.1223,  3.3128, -7.1889, -2.5060],
        [-7.7249,  2.0602, -7.0755, -2.5733]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:49<02:50,  1.32it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.027774479240179062, logits - tensor([[-6.7343, -4.3546,  3.2165, -3.0303],
        [-6.6073, -2.6730, -5.2101,  1.6880],
        [-8.0387,  2.4503, -7.6844, -2.5154],
        [-5.4598,  3.1854, -6.2667, -3.3578],
        [-7.0253,  2.8983, -7.8084, -3.4236],
        [-6.1672,  3.3598, -6.1286, -3.5509],
        [-6.5784,  3.4615, -5.3934, -3.9308],
        [-7.8791,  3.1624, -7.3150, -2.9606]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 66/289 [00:50<02:49,  1.32it/s]

Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13788416981697083, logits - tensor([[-6.5175,  2.9786, -6.6653, -2.7908],
        [-5.9947, -3.9312,  2.4649, -2.5536],
        [-5.5812, -5.1834,  2.7869, -3.7503],
        [-5.4975,  2.9191, -6.9245, -2.5645],
        [-7.2414,  1.5770, -6.7299, -1.6732],
        [-7.4202, -1.6870, -6.5896,  1.1867],
        [-7.5759,  2.7565, -6.9552, -3.5319],
        [-6.6472,  2.5641, -6.3706, -2.8693]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 67/289 [00:50<02:47,  1.32it/s]

Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03226909041404724, logits - tensor([[-5.9652,  3.2865, -7.2699, -2.9253],
        [-7.7958,  3.3022, -7.5773, -2.9975],
        [-6.7898,  1.9356, -6.6688, -2.7044],
        [-5.7412,  1.7922, -6.7652, -1.8537],
        [-6.5016,  3.1788, -6.3263, -3.3099],
        [-6.8718,  2.9638, -6.4010, -3.5235],
        [-7.5458, -3.3829, -6.3939,  3.2168],
        [-6.7158,  2.6735, -6.1321, -3.1586]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▎       | 68/289 [00:51<02:47,  1.32it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02823709324002266, logits - tensor([[-7.1870,  2.6329, -7.6119, -3.5560],
        [-7.6287,  3.3685, -6.8953, -2.9834],
        [-6.3894, -4.1449,  2.8268, -3.1262],
        [-7.1062,  3.3095, -6.4406, -2.5642],
        [-4.5883, -4.6760, -4.0025,  3.3404],
        [-7.2927,  2.5713, -6.5913, -1.7736],
        [-6.6436, -2.6046, -6.2268,  2.9807],
        [-6.0995, -3.2097, -5.1260,  4.3282]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:52<02:46,  1.32it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02655024081468582, logits - tensor([[-7.0615,  2.5009, -7.2206, -2.5098],
        [-7.6680, -3.7025, -5.3459,  2.6848],
        [-6.4672,  2.8433, -6.2038, -3.0279],
        [-6.4668, -3.7282, -5.8324,  3.3605],
        [-6.7433,  2.8011, -6.7708, -2.5440],
        [-6.2942, -4.1527,  3.3222, -2.7277],
        [-8.0542,  3.4771, -7.6123, -2.7758],
        [-5.2230, -4.7605,  3.9232, -3.0263]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:53<02:45,  1.33it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03429730609059334, logits - tensor([[-6.1665, -2.5258, -6.4691,  2.3919],
        [-5.9553, -3.9686, -5.2377,  3.2258],
        [-6.4835, -4.1044,  2.4811, -2.7021],
        [-7.4228,  2.6051, -7.0912, -3.4349],
        [-6.0753,  3.8325, -6.6551, -2.1709],
        [-6.2272, -4.8865,  2.7850, -3.5576],
        [-7.4353, -5.2106,  3.3707, -3.4813],
        [-6.8246, -3.0569, -3.2530,  1.4505]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:53<02:44,  1.33it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06087497994303703, logits - tensor([[-6.0194,  2.8015, -5.7204, -1.9846],
        [-5.1356, -4.4539,  2.9539, -2.5908],
        [-5.7214,  2.9278, -6.0800, -2.4960],
        [-5.7823,  2.9632, -6.2264, -3.1011],
        [-7.5205,  0.9883, -6.7133, -0.4952],
        [-6.4632, -4.0060,  2.3746, -2.6297],
        [-6.3846,  3.2096, -5.7678, -2.0808],
        [-7.9875,  2.0412, -7.9975, -2.0539]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:54<02:43,  1.33it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08410302549600601, logits - tensor([[-6.0333, -2.4708,  0.2139, -0.6776],
        [-6.3898, -4.4518,  4.1544, -3.8141],
        [-7.1770,  2.9605, -6.0075, -2.9567],
        [-6.1393, -4.2337,  4.1322, -3.3181],
        [-7.8329,  2.5793, -6.5156, -3.2249],
        [-6.9769,  2.4848, -6.5110, -1.8943],
        [-4.8546, -4.6838, -4.3494,  4.4475],
        [-6.9122,  2.6540, -6.1032, -3.1654]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:55<02:42,  1.33it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08496159315109253, logits - tensor([[-7.2306e+00,  2.2955e-03, -5.8347e+00, -2.2473e-01],
        [-5.3698e+00, -4.0222e+00,  2.1364e+00, -2.8524e+00],
        [-7.4593e+00,  1.8852e+00, -6.0869e+00, -1.3964e+00],
        [-6.8577e+00,  2.6513e+00, -7.1495e+00, -2.2680e+00],
        [-6.9632e+00,  3.6257e+00, -7.8633e+00, -2.9483e+00],
        [-6.0485e+00, -3.6594e+00, -5.3487e+00,  3.9670e+00],
        [-5.8209e+00, -2.3709e+00, -5.8865e+00,  2.7402e+00],
        [-7.3507e+00,  1.8144e+00, -6.8147e+00, -3.0827e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 74/289 [00:56<02:42,  1.33it/s]

Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.029473276808857918, logits - tensor([[-7.7632,  3.5587, -6.9969, -3.8051],
        [-5.6560, -3.8334,  3.0524, -3.4308],
        [-6.2821,  1.9649, -6.3178, -2.3050],
        [-6.3359,  3.0670, -6.5333, -2.7575],
        [-5.4645, -4.8012,  2.5777, -2.5256],
        [-6.6599,  2.3390, -6.7897, -2.9637],
        [-6.2138, -3.5538, -5.7894,  3.7945],
        [-5.8889, -4.5655,  2.8400, -4.1390]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 75/289 [00:56<02:41,  1.33it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35143208503723145, logits - tensor([[-7.0975,  2.2004, -7.7215, -3.1986],
        [-5.7581, -4.0783,  3.2130, -2.5919],
        [-6.3774, -4.2841, -6.2552,  3.4818],
        [-7.4777, -2.5483, -5.9703,  2.1427],
        [-6.6960,  2.8296, -6.9403, -3.3798],
        [-7.7410, -1.9174, -7.2512,  1.5298],
        [-6.7456,  2.5677, -7.1062, -1.0916],
        [-7.9296,  3.1809, -6.5069, -3.5984]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:57<02:40,  1.33it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 27%|██▋       | 77/289 [00:58<02:40,  1.32it/s]

loss - 0.024624895304441452, logits - tensor([[-6.1153, -4.5312,  3.3345, -3.1324],
        [-5.4373, -3.9531, -5.0745,  4.6690],
        [-5.3729, -4.2661,  2.0070, -2.3174],
        [-6.5648,  3.5789, -6.8190, -3.9921],
        [-7.3988, -3.6005, -6.2245,  2.9367],
        [-6.8659,  3.3526, -7.1571, -2.7016],
        [-7.2788, -2.2660, -6.0337,  3.8008],
        [-6.8414,  2.9128, -6.5001, -4.6237]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.048539258539676666, logits - tensor([[-7.8642, -2.9858, -6.9082,  2.5874],
        [-6.5904,  3.4188, -6.3786, -2.4056],
        [-7.3721,  2.7459, -7.0186, -3.1309],
        [-8.0020,  3.5198, -7.7237, -3.1868],
        [-6.4292,  2.5684, -6.8974, -1.4442],
        [-7.2314, -1.9487, -5.3804,  1.7962],
        [-5.9196,  3.4711, -6.9774, -3.5304],
        [-5.1693, -2.9158,  1.0138, -

 27%|██▋       | 78/289 [00:59<02:39,  1.32it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06425411999225616, logits - tensor([[-6.5122,  2.4764, -6.3986, -3.8170],
        [-6.8754,  3.9563, -6.2298, -3.7913],
        [-7.1183, -2.2678, -0.8044, -0.2689],
        [-7.5778, -2.6831, -6.2075,  3.2242],
        [-6.4263,  2.6749, -7.3504, -3.6385],
        [-5.8461, -4.5064,  2.7290, -1.4927],
        [-7.4827,  3.8965, -6.5584, -3.1955],
        [-7.1951, -3.8795, -6.0618,  3.4105]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [00:59<02:39,  1.32it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03743399307131767, logits - tensor([[-6.0274, -3.2001,  1.3040, -2.2617],
        [-7.3759,  2.5597, -6.9086, -1.9255],
        [-6.5551, -2.9179, -5.4512,  3.0500],
        [-5.6198,  3.0393, -6.3941, -3.4359],
        [-4.5788, -3.9443,  2.5552, -2.4533],
        [-6.9390,  2.8181, -7.9545, -4.2188],
        [-6.0622, -4.7785,  3.9238, -2.8565],
        [-6.9220, -3.2663, -5.3310,  3.6490]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [01:00<02:38,  1.32it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26848626136779785, logits - tensor([[-5.8081,  3.2434, -5.7610, -3.5149],
        [-7.4607,  2.9164, -6.8241, -3.1935],
        [-5.8445,  2.8342, -6.4925, -2.6119],
        [-5.7942, -4.3644,  2.5296, -2.3001],
        [-8.1964, -4.2650, -5.9410,  3.6800],
        [-6.0773, -3.1203, -5.9871,  3.7730],
        [-7.1957,  2.9590, -6.4258, -2.6461],
        [-5.7140, -0.2786, -4.7817,  0.3871]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [01:01<02:38,  1.31it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08979631960391998, logits - tensor([[-7.0008,  1.2653, -6.1262, -2.2498],
        [-7.6670,  2.8280, -7.2624, -3.1514],
        [-5.5750, -4.5269,  2.5851, -3.4677],
        [-5.5015, -4.5436, -4.0496,  3.1077],
        [-6.6688,  2.9737, -6.6611, -3.3802],
        [-6.1413, -2.5011, -0.7079, -0.4810],
        [-7.2989, -1.1393, -6.6543,  1.0845],
        [-6.5270,  2.5350, -6.0427, -3.0371]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:02<02:37,  1.31it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15083207190036774, logits - tensor([[-8.2192,  2.5679, -7.5852, -2.2252],
        [-6.2936, -2.1657, -0.1292, -1.3701],
        [-6.4797,  2.2541, -6.2922, -2.5597],
        [-8.3138,  1.0252, -8.5848, -1.3740],
        [-7.2278, -3.6341, -5.2818,  3.5711],
        [-5.9273, -4.1762,  2.4351, -3.0845],
        [-5.2867, -3.3749,  2.7991, -3.5202],
        [-6.9313,  3.9934, -6.9400, -2.3635]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▊       | 83/289 [01:02<02:36,  1.32it/s]

Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03497415781021118, logits - tensor([[-7.3848,  2.2286, -6.6787, -1.7766],
        [-6.7760,  2.6450, -6.5090, -2.5896],
        [-6.7566,  2.1718, -6.4269, -2.6205],
        [-6.5022,  3.0934, -6.4858, -2.9061],
        [-6.1909,  3.3835, -7.1229, -3.3492],
        [-7.1104,  2.2312, -7.5412, -3.9571],
        [-6.1612,  2.7497, -6.5684, -2.4186],
        [-7.4603, -2.9401, -6.1059,  3.3236]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 84/289 [01:03<02:35,  1.31it/s]

Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06421126425266266, logits - tensor([[-6.3849, -3.8084,  3.2204, -3.0714],
        [-7.5200, -3.4792, -6.2639,  3.7072],
        [-5.7562,  2.5783, -5.8950, -2.8605],
        [-6.5938,  2.3302, -6.1762, -2.9749],
        [-5.7724,  3.0965, -7.4189, -2.9349],
        [-7.0338, -0.6083, -6.3666,  1.0034],
        [-7.1447, -0.8448, -6.5400,  1.8957],
        [-7.3753, -2.2553, -7.6405,  1.7571]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▉       | 85/289 [01:04<02:34,  1.32it/s]

Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03959886357188225, logits - tensor([[-6.4149, -4.1463,  3.1818, -3.1339],
        [-5.4161,  3.2660, -5.8251, -2.7937],
        [-6.8210,  2.8627, -6.8765, -2.9699],
        [-5.7085, -3.4487,  2.9503, -2.4084],
        [-6.8493,  1.6186, -7.1006, -2.4960],
        [-7.2896, -2.7718, -6.1277,  3.6184],
        [-5.6756, -4.4393,  3.3691, -4.2543],
        [-5.3684, -2.4935,  2.1261, -1.6785]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|██▉       | 86/289 [01:05<02:33,  1.32it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06854266673326492, logits - tensor([[-7.9974,  2.7821, -8.3639, -3.1775],
        [-8.0247,  0.5246, -7.5414,  0.1379],
        [-6.5094,  3.0195, -7.1828, -3.6772],
        [-6.9683,  3.0363, -7.0901, -3.3784],
        [-7.4522, -2.6893, -8.0799,  3.1878],
        [-6.0307,  3.6452, -6.1314, -3.6846],
        [-6.9355, -3.6960, -5.4378,  4.1493],
        [-5.7453, -3.7828,  3.0835, -3.2048]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 87/289 [01:05<02:32,  1.32it/s]

Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.060275495052337646, logits - tensor([[-6.4603, -4.2158, -5.0032,  3.5651],
        [-5.5743, -0.6580, -5.0106,  0.7665],
        [-6.2726,  3.7198, -6.9795, -3.4362],
        [-6.2613,  2.7311, -6.6173, -2.0738],
        [-6.0103,  2.9400, -6.0725, -3.3195],
        [-6.5434, -4.7646,  3.2682, -3.1105],
        [-5.5525,  2.8407, -5.5770, -3.1630],
        [-6.8294,  1.1695, -6.4183, -1.1979]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 88/289 [01:06<02:31,  1.33it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.029424913227558136, logits - tensor([[-6.2962, -3.9116, -6.6915,  4.0649],
        [-7.2015,  2.9897, -6.8342, -2.6087],
        [-5.4048, -3.8314, -5.0543,  4.5531],
        [-5.6056,  2.0928, -5.0796, -2.3489],
        [-5.6043, -4.0038, -5.2443,  4.4648],
        [-7.4478,  2.5250, -7.6441, -2.9768],
        [-7.3360,  2.6017, -6.9896, -3.2610],
        [-6.5338,  2.1962, -6.1508, -1.9455]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:07<02:30,  1.33it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.062029991298913956, logits - tensor([[-6.8416,  1.7538, -6.7397, -2.0829],
        [-7.4486,  1.0358, -7.6125, -1.2241],
        [-6.2230,  2.3047, -5.8604, -2.6816],
        [-7.1886, -3.9756,  0.8763, -1.5903],
        [-5.9542, -4.4833,  3.6807, -3.3079],
        [-5.9241, -4.2279,  2.4645, -2.6539],
        [-5.9418,  2.8485, -5.6641, -3.8485],
        [-6.4625,  2.8829, -6.7491, -3.1531]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:08<02:29,  1.33it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02736954763531685, logits - tensor([[-6.1140, -3.9423,  2.3010, -1.9304],
        [-6.9294, -3.3988, -4.9230,  4.0768],
        [-6.3890,  2.6635, -5.1465, -3.1196],
        [-5.8427, -3.7238, -5.5484,  4.5667],
        [-6.8029, -3.9010, -5.6444,  3.5425],
        [-6.8622,  2.7625, -5.9479, -2.7587],
        [-6.8285, -4.4113,  3.1558, -3.4203],
        [-6.5153,  2.3137, -6.4738, -3.1345]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:08<02:29,  1.33it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23211514949798584, logits - tensor([[-6.6319, -3.8802,  1.7934, -2.2626],
        [-6.2151, -4.2310, -5.6561,  3.9840],
        [-6.2797,  2.4462, -6.9113, -2.8949],
        [-7.0671,  1.3855, -6.8154, -1.5586],
        [-6.0071,  2.2166, -7.5463, -2.2999],
        [-6.4621,  1.2922, -5.8412, -1.5690],
        [-6.3106, -2.4730, -5.4322,  2.4588],
        [-6.9101,  3.7416, -6.9850, -2.9264]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:09<02:28,  1.32it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05343420058488846, logits - tensor([[-5.4261, -3.4424, -5.0676,  3.1952],
        [-7.3921,  3.0306, -7.1145, -3.0241],
        [-6.3831, -4.6033,  2.3923, -2.8390],
        [-7.4961,  1.0746, -6.4158, -0.3748],
        [-8.2844,  1.8525, -7.8248, -2.4800],
        [-5.6272, -2.5578, -4.7004,  2.5489],
        [-7.4165,  3.8264, -7.1826, -3.6916],
        [-6.3534,  3.1145, -6.5554, -2.5805]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 93/289 [01:10<02:28,  1.32it/s]

Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01730794459581375, logits - tensor([[-9.0864,  3.2732, -7.7238, -2.6037],
        [-6.4645,  2.9302, -6.6617, -3.5392],
        [-7.5857,  3.4887, -7.3732, -3.1112],
        [-6.5188,  3.2946, -6.3203, -2.9016],
        [-6.4455,  3.7050, -6.6645, -3.1431],
        [-6.8529, -4.4221, -6.0203,  4.8952],
        [-6.8817,  3.3271, -7.6580, -4.0131],
        [-7.3236,  3.6460, -6.8017, -3.9649]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 94/289 [01:11<02:27,  1.32it/s]

Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08765286207199097, logits - tensor([[-6.7828,  0.5449, -6.4857, -0.0866],
        [-6.6436,  2.9029, -7.1743, -3.8533],
        [-7.1006,  2.9374, -6.8659, -2.6819],
        [-5.2731, -3.4845, -4.3992,  2.9358],
        [-6.8608,  2.6402, -7.4930, -2.5732],
        [-7.4053,  2.9673, -6.7584, -3.4615],
        [-7.0714,  2.9932, -6.8090, -3.0973],
        [-6.4149,  1.3902, -5.8194, -1.4108]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 95/289 [01:11<02:27,  1.32it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05077052116394043, logits - tensor([[-8.4533, -3.3925, -6.5437,  4.0404],
        [-4.9092, -3.5232,  2.6976, -2.2977],
        [-6.4465,  3.2931, -6.7902, -3.7492],
        [-8.0388,  2.7516, -7.4430, -2.5580],
        [-6.5090,  4.0180, -6.7737, -3.5343],
        [-7.7944, -0.8742, -6.9857,  0.9538],
        [-7.0138, -2.4044, -6.6075,  2.2250],
        [-7.0462, -3.1965,  2.1391, -2.1856]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:12<02:25,  1.32it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02429654635488987, logits - tensor([[-7.3719,  3.4661, -7.2655, -3.4620],
        [-6.8292,  3.9815, -7.0781, -3.8398],
        [-6.9858,  2.9482, -6.8030, -2.4840],
        [-7.5963,  3.2254, -7.6998, -3.4167],
        [-6.9615, -2.9843, -5.5474,  2.1614],
        [-7.3521,  3.2214, -6.7640, -3.5922],
        [-7.3472,  2.2157, -7.2564, -2.7938],
        [-7.4273,  3.8533, -6.8921, -2.9864]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▎      | 97/289 [01:13<02:24,  1.32it/s]

Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18353387713432312, logits - tensor([[-8.5492,  3.6296, -8.8832, -2.9524],
        [-7.3879,  3.3491, -6.6606, -2.7705],
        [-7.1062, -2.6963, -5.7078,  1.8808],
        [-7.7890,  3.0548, -7.0920, -3.1514],
        [-8.5706,  2.2470, -7.6077, -2.4050],
        [-6.7139,  2.0846, -7.0450, -2.2392],
        [-6.2491, -3.4109, -6.4111,  3.6388],
        [-6.4467, -3.1364,  2.3883, -1.8653]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 98/289 [01:14<02:24,  1.32it/s]

Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07325376570224762, logits - tensor([[-6.3510, -4.2187,  1.2751, -2.0489],
        [-8.0035,  0.6234, -7.2449, -0.4767],
        [-7.4021,  2.5311, -6.7553, -2.8880],
        [-5.4355,  3.2393, -6.3739, -3.3155],
        [-7.3025,  2.6517, -6.9093, -3.0164],
        [-6.7980, -1.6819, -5.3351,  1.5962],
        [-8.0379,  3.6924, -7.9233, -3.4848],
        [-5.4753, -2.7800,  1.6208, -2.8690]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▍      | 99/289 [01:15<02:24,  1.31it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04421532526612282, logits - tensor([[-6.8343,  3.5921, -6.9614, -2.8089],
        [-6.6809,  3.2734, -6.3927, -3.8323],
        [-7.2247, -2.1257, -5.6964,  1.9545],
        [-6.1150, -4.0445,  2.5671, -3.2180],
        [-7.1645, -2.6532, -6.1098,  3.2348],
        [-7.7838,  2.1174, -7.1585, -2.2053],
        [-7.0234,  1.2337, -6.6274, -2.0506],
        [-6.5817, -3.5782,  2.5809, -2.5695]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:15<02:24,  1.31it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0447363406419754, logits - tensor([[-6.1972, -3.3825, -5.6513,  3.3435],
        [-7.3515,  2.1898, -6.5177, -1.9354],
        [-6.4184, -3.6850, -5.7024,  3.7577],
        [-6.8841,  2.1990, -7.0061, -2.6967],
        [-8.0093,  3.3214, -6.8499, -1.9698],
        [-5.7424, -2.3897,  1.2896, -1.0262],
        [-6.9606,  3.8270, -6.5500, -3.7173],
        [-5.2443, -4.7512, -4.9072,  4.3635]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:16<02:23,  1.31it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19637474417686462, logits - tensor([[-7.2340, -3.2290, -5.3882,  3.3452],
        [-6.3970, -4.6583, -5.6126,  5.2193],
        [-5.2146, -3.9819,  3.2406, -2.8936],
        [-6.2830,  2.2271, -6.3483, -1.8320],
        [-6.9226,  3.3135, -6.4535, -3.4272],
        [-6.2678,  3.2088, -6.6751, -2.9956],
        [-5.1706,  2.7006, -6.7087, -2.5834],
        [-5.8932,  2.3110, -5.2070, -2.0500]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:17<02:23,  1.31it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04353681951761246, logits - tensor([[-6.7100,  1.1701, -6.6197, -1.2639],
        [-6.5541,  3.7065, -6.6477, -3.2846],
        [-7.1552, -3.5638, -6.0938,  3.5397],
        [-7.3848,  2.7205, -6.9509, -2.2021],
        [-5.8121, -4.0929, -5.6211,  3.6131],
        [-5.5379, -3.5793,  1.5201, -1.9601],
        [-6.4775,  2.7317, -6.5497, -3.1048],
        [-7.0238,  3.8278, -6.6824, -3.3755]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:18<02:21,  1.31it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2515391707420349, logits - tensor([[-5.5412, -2.6246, -5.3431,  3.0304],
        [-4.9920, -4.2512,  2.8710, -3.3307],
        [-6.3202, -3.8787,  2.1265, -2.3393],
        [-6.6945, -3.6522, -4.5605,  3.6017],
        [-7.3974,  2.8807, -6.9859, -2.8101],
        [-6.5566,  3.0955, -6.3527, -3.5938],
        [-5.9332, -4.3803,  2.3572, -3.0950],
        [-6.8861,  0.8483, -7.7967, -0.1461]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:18<02:20,  1.32it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25893115997314453, logits - tensor([[-5.7037, -3.8982,  3.2025, -2.5481],
        [-8.0238, -4.1207, -7.2404,  3.9348],
        [-5.7530, -4.2472,  3.6151, -3.3495],
        [-6.0838, -4.0225,  2.9132, -3.4279],
        [-7.1395,  2.6272, -6.9368, -3.7058],
        [-6.3892,  2.2138, -6.1522, -0.8733],
        [-7.5630,  3.1218, -7.8699, -2.7384],
        [-8.1939, -2.1973, -7.4387,  1.3698]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▋      | 105/289 [01:19<02:19,  1.32it/s]

Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02736140787601471, logits - tensor([[-6.3463, -3.9272, -5.6842,  3.7714],
        [-6.9496,  3.6464, -5.6028, -3.2495],
        [-5.8117, -3.9960,  2.5281, -3.2846],
        [-5.7782, -3.3490,  2.4270, -2.6756],
        [-5.9527,  3.4074, -6.6620, -3.4552],
        [-6.0022, -4.9050, -5.6593,  4.5962],
        [-5.9637, -4.1816,  2.9587, -3.4417],
        [-5.6286,  1.5757, -6.2217, -2.9350]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 106/289 [01:20<02:18,  1.32it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04461785778403282, logits - tensor([[-6.5475,  1.7192, -5.9172, -2.2131],
        [-5.3184, -4.1134, -5.1410,  4.1515],
        [-5.7966,  2.6250, -5.7308, -3.8264],
        [-6.8420,  2.3160, -5.9915, -1.9452],
        [-6.5955, -4.0378, -5.9650,  3.8215],
        [-6.8946,  2.5493, -7.2344, -3.6593],
        [-6.3674, -5.1087,  2.5078, -2.6253],
        [-7.6352,  1.5067, -7.0035, -1.1281]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:21<02:17,  1.32it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1465599238872528, logits - tensor([[-5.8278, -2.0448, -5.5752,  3.0865],
        [-7.4239,  0.2939, -6.3691, -0.2461],
        [-7.6530,  2.0628, -6.5740, -1.5055],
        [-7.1234,  3.5964, -6.6991, -3.1297],
        [-6.8442,  2.3905, -7.3813, -2.3531],
        [-7.3208, -1.3483, -6.1219,  0.1927],
        [-5.9341,  2.6802, -6.7684, -2.7306],
        [-7.2763,  0.5354, -5.7513, -0.2705]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 108/289 [01:21<02:16,  1.32it/s]

Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24176527559757233, logits - tensor([[-6.8074, -4.8802, -4.9241,  5.0625],
        [-7.1295, -4.6002,  1.0617, -2.1012],
        [-7.4455,  2.3104, -6.7187, -2.0685],
        [-5.4768, -3.4128,  3.1506, -3.6911],
        [-6.8642, -2.7978, -6.2755,  2.6840],
        [-7.8409,  3.2071, -7.7359, -3.4133],
        [-7.2510,  3.8223, -7.8951, -3.1321],
        [-6.4513,  2.9721, -6.4457, -3.4072]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 109/289 [01:22<02:15,  1.33it/s]

Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18753933906555176, logits - tensor([[-6.2778, -3.6231,  2.6021, -2.6300],
        [-6.6145,  2.2250, -6.2134, -3.1228],
        [-7.1810,  3.0858, -6.5577, -3.3704],
        [-6.7309,  3.2682, -6.1077, -3.4496],
        [-7.4517,  3.3382, -6.7866, -3.9829],
        [-6.9003,  2.9964, -6.2160, -3.0536],
        [-6.5779, -3.4961, -5.7440,  3.2820],
        [-6.8223,  3.4159, -6.5452, -3.2263]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 110/289 [01:23<02:14,  1.33it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.4107940196990967, logits - tensor([[-6.2647,  3.6375, -7.2468, -3.5085],
        [-7.5315,  2.6439, -7.1434, -1.9582],
        [-5.9720,  2.0595, -5.9032, -1.9887],
        [-7.0096,  2.5003, -6.7858, -2.7660],
        [-6.8646,  3.3027, -6.6073, -3.2145],
        [-6.8066,  1.9308, -6.0193, -1.4818],
        [-5.6711, -3.7980,  3.1050, -2.0248],
        [-6.8701, -4.0057,  2.7259, -3.1004]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:24<02:13,  1.33it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05628596246242523, logits - tensor([[-7.5063,  3.1123, -7.4193, -2.7869],
        [-6.7900, -0.9005, -7.2517,  1.9469],
        [-6.4632,  2.6552, -6.1068, -3.6773],
        [-7.0582,  1.9282, -7.2181, -1.3564],
        [-7.0049,  2.4014, -7.6376, -2.6825],
        [-6.8009, -3.5444, -5.5085,  4.0148],
        [-7.4395,  1.3499, -7.0435, -1.1021],
        [-6.9555, -4.3388, -4.5060,  4.8625]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:24<02:13,  1.33it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0441296212375164, logits - tensor([[-7.3393,  0.3269, -7.4943, -0.6119],
        [-6.6326, -2.9123, -5.7579,  2.5831],
        [-6.4970,  3.1649, -6.3934, -4.3991],
        [-8.2065,  4.1795, -7.6056, -4.0608],
        [-6.7527, -4.7185,  3.4342, -3.7687],
        [-5.8995, -4.4976,  3.5985, -3.7445],
        [-5.7542,  4.0341, -7.2398, -3.3746],
        [-6.9871, -4.5153, -5.0587,  4.3637]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:25<02:12,  1.33it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05899076908826828, logits - tensor([[-7.2243,  2.4708, -6.2685, -3.5297],
        [-5.9874,  2.0731, -5.6865, -1.6056],
        [-6.8975,  2.1821, -7.2399, -2.5990],
        [-5.6488, -3.5777,  1.8494, -2.6205],
        [-6.7561,  0.8213, -5.9288, -1.1368],
        [-7.5632,  2.7824, -7.3360, -3.4817],
        [-6.1935, -3.7603,  2.8602, -2.2591],
        [-5.9869, -2.7621, -5.4640,  2.8961]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:26<02:11,  1.33it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03673757612705231, logits - tensor([[-6.3570,  0.6969, -6.3932, -1.2168],
        [-6.8801,  3.8071, -6.3006, -3.1036],
        [-5.8381, -4.0719, -5.4207,  4.6602],
        [-5.8733,  2.9520, -5.7216, -3.5098],
        [-8.0415,  2.6875, -7.9329, -3.1706],
        [-6.7149, -3.1506, -6.1574,  4.2454],
        [-8.0029,  3.7380, -7.1043, -4.6756],
        [-7.8328,  3.2802, -8.0287, -2.4729]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:27<02:11,  1.33it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.033889465034008026, logits - tensor([[-6.1547,  3.8035, -7.0259, -3.7973],
        [-5.8146,  2.1656, -6.3563, -2.4529],
        [-6.4206, -2.3237, -6.1503,  2.5606],
        [-7.3995,  2.1135, -6.7456, -2.8827],
        [-7.2656,  2.0609, -6.9686, -2.0474],
        [-6.2954, -4.2066,  3.1504, -3.0287],
        [-5.3201, -4.2227, -5.0563,  5.3611],
        [-6.8070,  2.7936, -6.2791, -2.8191]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:27<02:10,  1.33it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21334561705589294, logits - tensor([[-6.4119,  2.9727, -6.8429, -3.3463],
        [-7.2820, -2.2298, -5.8614,  3.6325],
        [-5.9334, -3.4138,  2.7632, -2.9860],
        [-6.7079, -3.2002,  2.2533, -2.4661],
        [-7.5157, -3.0737, -6.3697,  1.5442],
        [-6.1726,  3.3020, -6.0040, -3.3956],
        [-7.6340, -1.5837, -6.0991,  2.5636],
        [-6.9388,  2.5260, -7.1469, -3.0090]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:28<02:09,  1.33it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25361302495002747, logits - tensor([[-7.1299,  3.0212, -6.4689, -3.5075],
        [-7.3955,  3.1835, -7.6299, -3.4862],
        [-7.0716,  4.0158, -6.1399, -4.2914],
        [-7.2866, -0.2286, -5.5503,  0.9346],
        [-5.8561,  3.6833, -6.5368, -2.7388],
        [-6.0979,  3.6203, -6.4234, -3.5477],
        [-6.1782,  3.5482, -6.7780, -4.0670],
        [-6.6958,  2.8615, -5.6359, -2.4915]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:29<02:08,  1.33it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.186624214053154, logits - tensor([[-6.2294,  2.6011, -7.6109, -2.4084],
        [-6.4494, -4.6797, -6.3523,  3.7332],
        [-6.5214, -3.7385, -5.1557,  4.2536],
        [-6.0039, -4.0234,  3.1513, -2.5346],
        [-4.7718, -2.9170,  1.7768, -1.7837],
        [-6.9617,  3.5486, -7.6743, -3.9737],
        [-8.0246, -3.1433, -5.6686,  2.9674],
        [-5.9023, -4.5291,  3.8085, -3.4871]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:30<02:08,  1.32it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02208382450044155, logits - tensor([[-7.1343,  2.9811, -7.0650, -3.2202],
        [-6.7845,  3.7284, -7.3822, -2.5991],
        [-6.6059,  3.0970, -6.7443, -2.9015],
        [-6.8817, -3.4432, -7.0205,  3.1953],
        [-6.4063,  3.1729, -7.1548, -2.8688],
        [-5.6251,  3.5924, -7.1103, -2.4926],
        [-6.0509,  3.1005, -6.4393, -2.8783],
        [-5.5644, -3.9507, -5.1946,  5.3755]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:30<02:07,  1.32it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18481238186359406, logits - tensor([[-6.5023,  1.0194, -6.0996, -1.1329],
        [-6.0304,  2.5950, -7.2349, -2.8101],
        [-6.2762, -3.3510,  1.5850, -1.4471],
        [-7.4555,  3.6705, -7.0019, -3.1212],
        [-6.1460,  2.7263, -7.2288, -3.0327],
        [-7.0993, -0.6113, -6.5904,  1.3164],
        [-6.7032,  3.6736, -6.9131, -4.2095],
        [-8.0169,  0.1385, -6.6508, -0.4418]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:31<02:07,  1.32it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.31208378076553345, logits - tensor([[-7.1557,  2.3951, -6.7712, -2.3722],
        [-6.3885, -3.2928,  2.3539, -2.2168],
        [-7.1390,  3.1557, -6.2374, -4.2072],
        [-7.6856,  2.4244, -6.3653, -1.5699],
        [-6.6537,  2.9358, -6.8517, -3.3097],
        [-7.6331,  2.8003, -7.2513, -3.6693],
        [-6.8910, -3.1559, -5.5324,  2.5492],
        [-7.4773, -2.2190, -6.9725,  2.5242]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 122/289 [01:32<02:07,  1.31it/s]

Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11066525429487228, logits - tensor([[-6.5767,  1.3934, -6.8911, -0.7577],
        [-5.5231, -4.0504,  2.4698, -2.5867],
        [-7.1395, -4.3093, -5.3959,  4.3725],
        [-6.8476,  2.9332, -7.0301, -3.6690],
        [-5.6425, -4.1305,  2.6471, -2.8601],
        [-5.4020, -3.9792,  3.1918, -2.4022],
        [-6.1843, -4.0541, -4.7752,  4.0635],
        [-6.4939,  3.1593, -5.9271, -2.1294]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 123/289 [01:33<02:06,  1.31it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.051506660878658295, logits - tensor([[-6.1309,  3.2989, -6.5174, -3.6544],
        [-7.4123, -0.6921, -6.0078,  1.1287],
        [-6.8752, -4.3079, -6.3229,  3.5607],
        [-7.8925,  1.6358, -6.6438, -1.8151],
        [-6.9167,  2.8831, -7.1734, -4.0473],
        [-5.3687, -4.1502,  2.6000, -2.4646],
        [-7.6743,  2.6685, -7.9242, -1.8895],
        [-7.6694,  3.4271, -7.5412, -3.4833]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:33<02:06,  1.31it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03122715651988983, logits - tensor([[-6.1299, -4.3362, -5.4066,  4.2693],
        [-7.8979,  2.0486, -6.6937, -1.6761],
        [-6.6946,  3.1659, -7.2731, -2.5326],
        [-7.8429, -2.6634, -6.2275,  4.0350],
        [-5.7462, -3.6552,  2.3767, -2.6053],
        [-6.8004,  2.6726, -7.1825, -3.6559],
        [-6.4334,  2.8325, -5.9073, -2.3482],
        [-5.5500, -4.3607, -5.6986,  4.7395]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:34<02:05,  1.31it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.040504902601242065, logits - tensor([[-6.7559,  3.5805, -7.6245, -3.3996],
        [-5.9662, -3.1127,  2.8187, -3.3519],
        [-7.1521, -4.9660,  2.8487, -2.2615],
        [-6.6899,  1.3403, -6.3046, -1.4882],
        [-7.1602, -4.3342, -5.6021,  4.5550],
        [-7.1745,  3.3188, -7.0988, -3.7776],
        [-7.2396, -1.8719, -4.6708,  1.8330],
        [-6.6371, -2.9700, -5.4392,  3.0011]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:35<02:03,  1.32it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.035744138062000275, logits - tensor([[-7.5667,  2.6257, -7.0858, -3.2776],
        [-5.8311,  1.9960, -6.2360, -1.9501],
        [-6.8431,  2.0440, -6.8634, -2.7795],
        [-6.1127,  3.4061, -5.9666, -3.7856],
        [-6.1737,  3.0449, -6.3338, -2.6728],
        [-6.8383, -4.4986, -4.5867,  4.1095],
        [-6.3009,  2.6877, -6.9677, -2.3196],
        [-6.9895,  1.7477, -6.2890, -3.0815]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:36<02:02,  1.32it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3966096341609955, logits - tensor([[-7.1979, -3.2285, -7.1358,  4.7140],
        [-7.7084,  1.5448, -7.3176, -1.2448],
        [-7.1977,  3.6191, -7.7826, -2.7342],
        [-6.3633, -3.6784,  0.3981, -0.4943],
        [-6.3008,  2.9705, -6.0029, -3.5316],
        [-7.9219,  1.3898, -6.8957, -1.5361],
        [-6.7189, -0.5097, -3.0932, -0.0951],
        [-7.4386,  3.1966, -6.9546, -3.2948]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:36<02:02,  1.32it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45833736658096313, logits - tensor([[-7.2508,  1.7027, -6.7195, -1.0446],
        [-7.5476,  1.1391, -7.1713, -1.5379],
        [-6.5516, -4.3080,  2.9835, -4.1744],
        [-7.1687,  2.1077, -6.6562, -1.5852],
        [-7.1925,  3.4339, -7.3357, -2.9110],
        [-5.2254, -3.4065,  2.8489, -3.1255],
        [-7.2650,  1.8292, -7.5069, -2.1709],
        [-5.6439, -3.2794, -5.5555,  3.1215]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:37<02:01,  1.31it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07556074857711792, logits - tensor([[-5.4484,  3.1769, -5.9811, -3.3146],
        [-6.8521,  2.8922, -5.5606, -2.8041],
        [-7.6130,  2.8760, -7.3046, -4.2862],
        [-6.4686,  2.5595, -6.7521, -2.9733],
        [-7.4714, -0.0384, -6.7389,  0.1183],
        [-7.3701,  2.4773, -7.0713, -1.6701],
        [-6.8756,  2.0867, -6.6063, -2.2719],
        [-6.9743,  1.9865, -6.6335, -2.1945]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 130/289 [01:38<02:00,  1.32it/s]

Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11275717616081238, logits - tensor([[-5.5364, -4.8440, -5.0024,  4.3649],
        [-7.0388,  2.3014, -6.6563, -2.6430],
        [-6.3111,  3.4685, -6.1125, -3.8331],
        [-7.0483, -4.2485, -5.4233,  3.7714],
        [-6.5076, -2.7766,  0.5317, -1.5619],
        [-6.5987, -5.0250, -5.9900,  4.9064],
        [-6.6068,  1.3772, -5.9565, -1.6236],
        [-7.2486, -3.1617, -5.5631,  3.4369]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▌     | 131/289 [01:39<02:00,  1.32it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.046423934400081635, logits - tensor([[-6.4020, -4.9001,  2.9734, -3.0145],
        [-7.3704,  3.2358, -7.2091, -3.3480],
        [-8.1492,  2.3839, -8.1072, -2.4984],
        [-7.2744, -4.5601, -6.3054,  4.4276],
        [-5.4941,  1.8679, -7.3261, -0.6894],
        [-6.5164,  1.4462, -5.7063, -1.7035],
        [-6.1798, -4.0855,  3.1645, -3.1226],
        [-6.3407, -3.2062, -6.0964,  3.6318]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:40<01:58,  1.32it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04786807671189308, logits - tensor([[-5.7954, -5.8161,  3.4834, -2.6524],
        [-6.4261,  2.0983, -6.9381, -2.3258],
        [-6.5950,  2.3580, -6.3191, -2.6196],
        [-6.3791,  4.4970, -6.7439, -3.4633],
        [-5.4884, -4.2074,  2.7388, -2.3690],
        [-7.6327,  1.3785, -6.3052, -1.3292],
        [-7.7734,  2.7194, -6.6488, -1.4857],
        [-7.4610,  3.1655, -7.3967, -2.7445]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 133/289 [01:40<01:57,  1.33it/s]

Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.029113132506608963, logits - tensor([[-7.3078,  1.7816, -7.6099, -1.5380],
        [-7.0888,  2.5892, -6.9804, -2.8535],
        [-6.7172,  3.6555, -6.0576, -3.5679],
        [-5.8428, -4.4786, -6.3638,  4.5122],
        [-8.1129,  2.7840, -7.6718, -3.1534],
        [-6.2926,  2.7200, -7.0103, -3.1304],
        [-6.6187,  2.6495, -7.1074, -2.9069],
        [-5.9278, -4.3362, -5.2769,  4.5937]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▋     | 134/289 [01:41<01:56,  1.33it/s]

Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04727540910243988, logits - tensor([[-7.3805,  2.4292, -8.4404, -1.7075],
        [-6.0665, -3.8728, -5.1250,  4.4787],
        [-7.0338,  1.3584, -7.2227, -1.5285],
        [-7.3752,  1.8933, -6.5310, -1.2921],
        [-6.8558, -4.0572, -6.0884,  2.3767],
        [-8.1696,  2.9083, -7.2381, -3.1287],
        [-8.2945, -3.7933, -7.1415,  3.3912],
        [-6.7510, -3.9426,  2.3700, -3.2874]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 135/289 [01:42<01:56,  1.33it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.042990393936634064, logits - tensor([[-7.4879,  1.3975, -7.3029, -2.8357],
        [-6.6321, -2.7790, -4.4737,  4.0844],
        [-7.1437, -2.1201, -6.2566,  1.6986],
        [-8.3836,  1.6807, -7.4389, -2.1299],
        [-4.8787, -4.2805,  2.4831, -4.1088],
        [-6.2482, -3.9155,  2.6507, -2.6467],
        [-6.3863, -3.8630, -4.9980,  3.3168],
        [-6.8663,  2.7814, -7.0617, -3.3072]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:43<01:55,  1.33it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026040051132440567, logits - tensor([[-6.4560,  2.1853, -6.8603, -3.1081],
        [-5.9000, -4.0351, -5.7930,  4.3425],
        [-6.1181, -3.6112,  2.8767, -1.8483],
        [-6.1824, -3.9465, -5.2373,  3.7746],
        [-5.8841, -4.5727, -5.3471,  5.3595],
        [-6.5729,  2.5252, -6.5868, -1.7837],
        [-7.0817, -3.8314, -6.2063,  3.7719],
        [-7.1363,  3.7815, -6.9693, -3.3019]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 137/289 [01:43<01:54,  1.32it/s]

Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05724431201815605, logits - tensor([[-6.6391,  1.9583, -7.3681, -1.9154],
        [-6.0359,  0.6860, -6.2249, -1.4280],
        [-7.6583,  3.4705, -7.1167, -2.1887],
        [-8.8549,  1.8383, -8.2047, -1.9027],
        [-8.1195,  2.8771, -7.9246, -2.5665],
        [-6.0171,  3.5504, -5.9755, -2.1620],
        [-6.6728, -4.3567,  2.6469, -2.9952],
        [-6.1721,  3.3144, -6.4951, -2.6869]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 138/289 [01:44<01:53,  1.33it/s]

Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1433859020471573, logits - tensor([[-6.7229,  1.6871, -6.4958, -1.5394],
        [-6.3403,  1.7676, -5.1680, -1.9294],
        [-7.4042,  3.4953, -7.1768, -2.3931],
        [-7.0080, -2.6616, -5.9715,  1.8416],
        [-5.4662,  2.7739, -5.8281, -3.4038],
        [-5.5614, -4.0832,  2.9104, -3.5526],
        [-6.7888, -4.9160, -6.6208,  3.8171],
        [-7.9521,  2.8957, -7.3572, -2.6394]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 139/289 [01:45<01:53,  1.33it/s]

Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02907397784292698, logits - tensor([[-7.5665,  2.8126, -7.3761, -3.2028],
        [-6.4718, -4.2139, -5.3169,  4.4986],
        [-4.9046,  3.2670, -6.1757, -3.4155],
        [-6.0292, -4.6586, -5.7431,  3.9261],
        [-7.0260,  2.9840, -6.2610, -3.7909],
        [-7.9721,  1.8042, -7.1966, -1.0427],
        [-6.7516,  3.1723, -6.4911, -3.0603],
        [-6.4889,  2.9565, -7.2429, -4.3576]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 140/289 [01:46<01:52,  1.33it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.040912069380283356, logits - tensor([[-5.6526, -4.0668,  3.3242, -3.4553],
        [-5.1821,  2.9301, -5.0917, -3.5858],
        [-6.2105,  2.4679, -7.0995, -2.3668],
        [-6.9453,  3.9186, -7.1151, -2.9079],
        [-7.1919, -3.4518, -6.5323,  3.6832],
        [-7.0153, -4.2044, -5.0156,  3.1318],
        [-6.4383, -3.1837,  1.5883, -2.0397],
        [-7.8638,  1.1644, -6.5262, -1.9293]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:46<01:51,  1.33it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04241086170077324, logits - tensor([[-6.8116,  3.8940, -7.3017, -3.6997],
        [-7.8538,  1.1783, -6.7363, -0.7505],
        [-5.7436,  3.7933, -6.6467, -3.4558],
        [-6.7035, -2.3968, -5.2262,  2.6127],
        [-6.7142,  2.8882, -6.3626, -2.9643],
        [-5.7135, -4.7524,  2.6922, -3.2014],
        [-6.6488,  2.9100, -6.3515, -3.1341],
        [-7.5371,  2.9910, -6.5285, -2.8293]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:47<01:51,  1.32it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05196623131632805, logits - tensor([[-5.7903, -4.0272, -6.2272,  4.0051],
        [-7.1026, -2.2650, -6.8570,  2.9982],
        [-6.5959, -4.0111,  2.3515, -2.7891],
        [-7.2872,  2.9782, -6.9100, -3.1844],
        [-7.2035,  2.4773, -6.4240, -2.6623],
        [-7.7418,  1.4627, -8.0726, -0.7033],
        [-6.9397,  3.5849, -6.5322, -3.0553],
        [-5.6956, -3.1835,  1.5566, -1.9085]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 143/289 [01:48<01:51,  1.31it/s]

Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1526169627904892, logits - tensor([[-6.5258,  3.3410, -6.6959, -2.8502],
        [-6.3262, -4.9532, -5.7256,  4.6044],
        [-6.0658, -3.2266,  1.7826, -2.2915],
        [-6.2386, -3.2627, -3.7863,  3.5581],
        [-6.6808,  2.9085, -6.4600, -3.3489],
        [-5.8900, -4.5376,  3.9104, -3.3508],
        [-7.5343,  2.9112, -6.9892, -2.9893],
        [-7.0420, -3.8701, -6.3462,  3.6435]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|████▉     | 144/289 [01:49<01:50,  1.31it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1832801103591919, logits - tensor([[-5.7781, -3.5311,  1.6370, -1.7784],
        [-6.2847, -0.7551, -5.3310,  1.8029],
        [-7.0268, -0.3629, -6.9099,  0.6405],
        [-6.2122,  2.1964, -6.0755, -2.9995],
        [-6.4141,  3.2736, -6.6390, -2.9834],
        [-7.9604, -1.9867, -6.2809,  2.7943],
        [-7.3946,  0.6903, -6.2303, -0.5897],
        [-7.3774,  2.3320, -7.1450, -2.7619]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|█████     | 145/289 [01:49<01:50,  1.30it/s]

Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.042309995740652084, logits - tensor([[-7.1665,  2.1666, -6.9541, -2.1181],
        [-7.1234,  3.4421, -7.1509, -3.5731],
        [-6.3922,  2.9696, -7.2421, -2.9552],
        [-7.3219,  3.6759, -7.4291, -3.4036],
        [-7.4159,  3.0483, -7.1351, -2.9675],
        [-5.5688, -3.8115, -4.4639,  4.0529],
        [-6.9977,  3.0098, -6.1770, -3.4758],
        [-6.1084,  0.8382, -6.4083, -1.0117]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 146/289 [01:50<01:49,  1.31it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.053056370466947556, logits - tensor([[-7.2298,  3.2972, -6.1617, -3.4885],
        [-6.6019,  3.1744, -6.5045, -2.5531],
        [-6.3659,  2.0653, -6.5969, -2.6805],
        [-6.4795, -3.8849, -5.5342,  3.9266],
        [-7.2680,  2.4465, -6.5562, -3.0416],
        [-6.5128, -3.8830, -6.2352,  3.8181],
        [-5.3876, -3.0805,  2.2956, -1.9622],
        [-6.9765, -2.4216, -1.4881,  0.3538]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 147/289 [01:51<01:48,  1.31it/s]

Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04408542066812515, logits - tensor([[-5.4414, -2.3244,  1.1333, -1.2961],
        [-6.6936,  3.3137, -6.2990, -3.3061],
        [-5.5627, -3.3490,  2.4735, -3.0868],
        [-6.2206,  2.9265, -6.7871, -3.0729],
        [-7.5508,  2.4143, -6.5373, -2.0624],
        [-7.2550, -3.0841, -5.6359,  3.7559],
        [-5.6425, -3.7667,  3.4951, -3.1002],
        [-7.8540,  2.9276, -8.3152, -3.8897]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 148/289 [01:52<01:47,  1.31it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.30342644453048706, logits - tensor([[-6.3363,  1.0085, -5.8539, -0.7754],
        [-6.6099,  2.7083, -6.3952, -2.8571],
        [-6.8809,  2.3530, -7.3320, -2.9346],
        [-8.1665,  1.2974, -7.2103, -1.4311],
        [-6.5224, -3.8579,  2.6525, -2.3856],
        [-6.0185, -3.3887,  1.3939, -1.9170],
        [-6.1269,  2.5640, -6.1493, -3.3693],
        [-7.1057,  2.6500, -7.4721, -2.0343]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:52<01:46,  1.31it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12307989597320557, logits - tensor([[-4.8485, -3.4218,  3.3969, -2.9402],
        [-6.4278,  3.6969, -6.2055, -3.6992],
        [-6.1788,  3.6098, -6.0560, -4.4874],
        [-7.0001,  1.7511, -6.5331, -1.1437],
        [-7.9267,  3.6897, -6.8805, -3.8866],
        [-6.1927,  3.6312, -6.9530, -2.9301],
        [-7.5978, -1.0555, -7.2512,  1.5726],
        [-6.6940, -3.7543, -5.5345,  4.8256]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:53<01:45,  1.32it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.038591790944337845, logits - tensor([[-5.9071, -4.8574,  2.9331, -2.3139],
        [-5.3757, -3.3661,  2.5632, -1.7037],
        [-5.7050,  2.0135, -5.8956, -2.1965],
        [-5.7270,  3.5005, -6.2675, -3.8190],
        [-7.2554,  4.0304, -7.0450, -4.1879],
        [-8.1872,  1.9064, -6.2633, -1.8540],
        [-5.6455, -4.1269, -4.9693,  3.9964],
        [-8.3302,  2.4265, -7.4835, -2.8861]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:54<01:44,  1.32it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0519578643143177, logits - tensor([[-6.4601, -3.4119, -5.7262,  3.3077],
        [-6.3818,  3.3637, -7.0970, -2.7641],
        [-6.3922,  3.2501, -6.6347, -3.5484],
        [-7.2177,  3.3443, -6.3438, -3.3505],
        [-7.8021,  2.6088, -8.1089, -2.7630],
        [-8.3205,  0.5205, -7.1565, -0.9345],
        [-7.3612,  2.4653, -6.7673, -2.5292],
        [-5.3811, -3.5943,  2.0947, -2.1397]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:55<01:43,  1.32it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05706329643726349, logits - tensor([[-7.4838, -0.8551, -6.0503,  0.7864],
        [-7.8870,  2.3432, -6.8360, -2.2125],
        [-5.9930,  3.2313, -5.8276, -3.4868],
        [-7.6586,  4.3786, -7.8500, -3.9420],
        [-6.6968,  3.2954, -6.8616, -2.8476],
        [-5.7447, -2.4093,  1.3358, -1.8166],
        [-7.7181,  3.0974, -7.5334, -2.1529],
        [-5.8383,  3.5989, -5.9292, -3.3755]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:55<01:42,  1.32it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08260554075241089, logits - tensor([[-7.5746,  3.3850, -6.5598, -3.6513],
        [-7.5623,  3.0017, -8.0835, -3.0234],
        [-5.7562, -4.6732, -4.5445,  3.3427],
        [-7.4384,  2.5340, -7.3337, -3.3943],
        [-7.6047,  0.5636, -6.3963, -0.7599],
        [-6.8198, -4.6653, -6.0786,  3.9622],
        [-7.2035,  2.9374, -7.4436, -3.4994],
        [-6.9710, -3.5512, -6.4467,  4.3958]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 154/289 [01:56<01:41,  1.32it/s]

Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04176175966858864, logits - tensor([[-5.0300, -3.3303,  3.3375, -2.8943],
        [-6.4832, -4.0259,  2.5653, -2.8979],
        [-6.7648,  2.9853, -6.6328, -3.0569],
        [-6.3676, -3.4726,  2.6794, -1.8141],
        [-5.7249, -3.2065,  1.7591, -2.4085],
        [-7.7922,  3.1261, -7.8365, -2.6817],
        [-7.3202,  1.5764, -6.4326, -2.6827],
        [-6.0430, -3.9053, -4.8916,  3.5770]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▎    | 155/289 [01:57<01:41,  1.32it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09984429180622101, logits - tensor([[-6.4672, -4.1271,  3.4745, -3.1069],
        [-7.0949,  3.2563, -7.1651, -2.7113],
        [-8.0836,  3.1133, -8.6200, -2.8489],
        [-5.0719,  3.2115, -5.4517, -2.4394],
        [-5.8534, -3.7611,  2.9302, -3.3543],
        [-7.6822,  0.0549, -6.4842, -0.1795],
        [-5.4558,  2.0973, -6.3774, -2.6297],
        [-6.2899, -2.6178, -0.3780, -1.4316]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:58<01:40,  1.32it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03264988958835602, logits - tensor([[-6.2331, -3.8772,  2.5666, -2.1435],
        [-7.3824, -3.7021, -5.9016,  3.5857],
        [-6.9001, -3.0636, -6.2274,  3.5271],
        [-7.1250,  3.6858, -7.3598, -4.0746],
        [-6.1230, -1.7594, -5.8625,  1.8490],
        [-6.6565, -4.4411,  2.3004, -3.4930],
        [-6.4725,  3.6775, -6.3412, -3.4115],
        [-7.2169,  2.4818, -6.9807, -2.6158]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:58<01:40,  1.32it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23023682832717896, logits - tensor([[-6.7202,  1.9484, -6.3457, -1.4942],
        [-6.0994, -3.6876,  3.3878, -2.9459],
        [-7.7301,  2.7127, -7.5189, -3.5631],
        [-7.0786, -2.9396, -5.9086,  4.0704],
        [-7.3507,  3.2381, -6.8802, -2.9448],
        [-6.2135, -4.1372,  1.7793, -1.9356],
        [-6.7380,  3.0356, -6.4258, -2.5156],
        [-7.0874,  3.9558, -8.0374, -3.8166]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:59<01:39,  1.32it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15963663160800934, logits - tensor([[-6.1484,  3.4920, -6.4447, -3.7511],
        [-6.2244, -3.3208, -4.2550,  3.4384],
        [-7.4809,  2.5335, -6.6959, -1.4855],
        [-6.8683,  2.7231, -6.6916, -2.3230],
        [-6.1660, -2.2818, -4.8800,  2.8599],
        [-6.7416,  3.1982, -5.4679, -3.1314],
        [-6.8519, -3.1371,  2.2664, -2.3125],
        [-7.2753,  4.7354, -7.7712, -4.5612]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [02:00<01:38,  1.32it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02635931223630905, logits - tensor([[-6.3555, -3.1897, -5.8745,  3.9150],
        [-6.7996,  3.8273, -6.8885, -3.6447],
        [-7.0074, -2.5947, -5.0396,  2.4522],
        [-5.1730,  3.1792, -6.0104, -3.9352],
        [-6.4203, -4.2975, -6.1772,  4.4563],
        [-6.6937, -3.0484, -5.7935,  1.8630],
        [-6.4557,  2.7113, -6.2148, -3.3372],
        [-8.0339,  2.1208, -7.0583, -2.8135]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [02:01<01:37,  1.32it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16573314368724823, logits - tensor([[-7.1414,  2.8322, -6.9855, -2.4561],
        [-5.5417, -2.6724,  1.2156, -1.3761],
        [-6.7425,  2.1312, -5.8733, -1.6071],
        [-5.8659,  2.7623, -6.7705, -2.8914],
        [-4.3076, -3.7425, -3.6403,  3.5040],
        [-7.6193, -2.5359, -7.1622,  3.2034],
        [-6.4211, -4.3701, -5.2664,  3.8008],
        [-6.6432,  4.3680, -7.5059, -3.3344]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:01<01:36,  1.32it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12848107516765594, logits - tensor([[-5.8086, -2.7684,  1.1431, -1.9250],
        [-6.5672,  3.7030, -6.5838, -2.7464],
        [-6.6655,  3.2092, -7.9866, -4.0319],
        [-6.7439,  3.4658, -6.7102, -3.5677],
        [-7.1234, -3.4483, -5.6078,  3.7249],
        [-7.1876,  3.0638, -5.6631, -3.4337],
        [-6.6193,  2.3820, -7.1904, -2.4640],
        [-7.1290,  3.7689, -6.6812, -3.9426]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:02<01:36,  1.32it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1763221174478531, logits - tensor([[-6.4491,  3.5069, -7.1614, -3.2228],
        [-7.4905,  3.3365, -7.9038, -3.4582],
        [-6.6587,  3.7803, -6.0319, -3.3956],
        [-7.9573,  2.3461, -7.6265, -2.5062],
        [-7.1060,  3.8401, -7.4216, -3.7896],
        [-7.1031,  2.7353, -6.4230, -2.6244],
        [-8.0516, -3.2599, -7.5379,  4.2875],
        [-7.5373,  2.3564, -7.5281, -2.2717]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▋    | 163/289 [02:03<01:35,  1.32it/s]

Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024633659049868584, logits - tensor([[-6.9589, -4.3343, -6.5037,  3.6663],
        [-6.0389,  1.8927, -6.2393, -2.3090],
        [-6.8136,  3.6110, -6.7742, -3.0804],
        [-6.9636, -3.2555, -5.7365,  4.2035],
        [-5.1929,  2.5238, -5.5603, -3.8241],
        [-4.6616,  2.7873, -5.8335, -3.5533],
        [-5.4485,  3.2895, -6.3403, -4.0221],
        [-5.7680, -4.3847,  3.1262, -2.9661]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 164/289 [02:04<01:34,  1.32it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 57%|█████▋    | 165/289 [02:05<01:33,  1.32it/s]

loss - 0.22032786905765533, logits - tensor([[-7.0412,  1.7048, -6.0238, -1.9340],
        [-6.8837,  4.3882, -8.0695, -4.3655],
        [-6.1523, -4.0005, -5.5864,  2.9235],
        [-6.0550, -3.7941,  2.2475, -1.9519],
        [-7.0751,  4.1148, -6.5939, -3.0067],
        [-7.2115,  1.8042, -7.1520, -1.8662],
        [-8.0527,  2.5373, -8.2723, -3.2554],
        [-6.8828, -4.2763,  3.2179, -2.8026]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024884186685085297, logits - tensor([[-6.7237, -3.2467, -6.0093,  3.5020],
        [-7.0068,  3.6289, -6.4523, -3.3597],
        [-6.1772, -4.4285,  3.2343, -4.4413],
        [-5.3731,  3.0695, -6.4469, -3.1907],
        [-6.0170,  2.9667, -6.2082, -3.2641],
        [-6.1215, -3.5971, -5.3863,  3.0721],
        [-7.0191,  2.0482, -6.9857, -3.0133],
        [-6.5906,  2.3134, -6.1248, -

 57%|█████▋    | 166/289 [02:05<01:33,  1.32it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18936848640441895, logits - tensor([[-6.3441, -3.4608,  2.0752, -2.8361],
        [-6.1999,  2.1946, -6.0463, -3.0894],
        [-7.1791,  3.0501, -7.9601, -3.6669],
        [-7.5165, -3.6272, -5.7533,  2.8502],
        [-7.1878,  2.9120, -6.6629, -3.4850],
        [-6.7359, -1.9619, -4.4931,  2.1749],
        [-6.4972,  1.6817, -5.6334, -2.6858],
        [-5.4464, -3.4271, -5.4011,  4.7968]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:06<01:32,  1.31it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.056388795375823975, logits - tensor([[-7.1633,  3.0391, -7.5783, -3.2526],
        [-6.7110,  2.4793, -6.4360, -2.8983],
        [-6.1642,  2.6948, -6.4304, -2.1794],
        [-6.3546,  2.7367, -6.4784, -3.3250],
        [-5.8782,  0.5969, -6.2211, -0.4214],
        [-7.2646, -1.9677, -6.3617,  2.5877],
        [-6.2567,  3.1400, -6.2602, -2.8400],
        [-5.7015, -4.0374, -5.1994,  3.8548]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:07<01:31,  1.32it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01702064648270607, logits - tensor([[-6.6660, -3.7836, -5.1531,  3.3453],
        [-6.2167,  3.4212, -6.8270, -3.3894],
        [-6.4303,  3.6483, -6.9054, -3.1014],
        [-6.4976,  3.0664, -6.5505, -2.8344],
        [-5.3350, -3.8021,  3.3377, -3.8336],
        [-6.4939, -4.0927, -5.7046,  4.3988],
        [-7.3164,  3.0286, -6.4674, -3.1349],
        [-6.1452, -4.8808, -5.0082,  4.7063]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:08<01:31,  1.32it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23244744539260864, logits - tensor([[-5.5090, -2.8824,  2.2945, -2.0259],
        [-5.9965, -2.8291,  1.0156, -2.5007],
        [-7.8572,  2.7500, -8.4008, -2.9599],
        [-6.8937,  2.2705, -6.9600, -2.4473],
        [-7.0024,  0.1287, -5.7610,  0.0552],
        [-6.0472, -2.9648,  2.2157, -2.1639],
        [-7.7104,  3.5660, -8.2319, -3.9672],
        [-7.0353,  1.3095, -5.9999, -1.5956]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:08<01:30,  1.32it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.019323620945215225, logits - tensor([[-5.7999, -4.0338, -4.9718,  3.6441],
        [-6.7551, -3.4462, -6.5573,  3.5399],
        [-8.1256,  3.4214, -6.4685, -3.9094],
        [-7.1558,  3.7051, -7.2940, -3.4745],
        [-6.1889, -3.5238, -5.1080,  2.7401],
        [-6.7971, -4.6167, -6.1813,  3.3287],
        [-7.2788, -3.5006, -4.6300,  3.0190],
        [-6.7007,  2.7519, -6.3430, -2.3420]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:09<01:29,  1.32it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18971849977970123, logits - tensor([[-6.6149, -1.9342, -4.6203,  2.4492],
        [-6.6109,  2.7347, -6.9186, -2.7302],
        [-5.8252,  3.3549, -6.5325, -2.8912],
        [-7.2447,  2.5728, -6.7854, -3.1457],
        [-5.9576,  3.7462, -6.5082, -3.6178],
        [-7.2757, -3.0849, -5.9569,  3.0474],
        [-6.7718, -2.1978, -6.2573,  2.8147],
        [-6.7419, -1.8951, -5.1784,  3.5211]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 172/289 [02:10<01:28,  1.32it/s]

Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11174790561199188, logits - tensor([[-6.1769,  1.2201, -6.0237, -1.0718],
        [-6.9894, -1.4467, -6.0196,  2.0187],
        [-6.4240, -3.8910, -6.1026,  4.4326],
        [-7.1310,  3.7282, -6.8621, -3.8694],
        [-6.5010,  3.6725, -5.9613, -3.1623],
        [-6.0694, -4.9870,  3.3817, -3.3869],
        [-6.1914,  3.3288, -6.6169, -3.8130],
        [-6.5738,  2.8781, -6.8097, -3.1806]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|█████▉    | 173/289 [02:11<01:27,  1.32it/s]

Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05204656720161438, logits - tensor([[-6.3924,  3.1399, -6.9167, -3.1272],
        [-6.9249,  1.5435, -6.9108, -1.0394],
        [-7.1208,  3.1590, -7.1236, -3.5807],
        [-7.1386,  2.0731, -7.3156, -1.8010],
        [-7.3602,  1.7156, -7.0755, -1.3202],
        [-7.0815,  2.5654, -7.1233, -3.5202],
        [-8.1554,  2.4463, -7.6663, -3.0133],
        [-6.3778,  2.7219, -6.3591, -3.5513]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|██████    | 174/289 [02:11<01:26,  1.32it/s]

Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03555096685886383, logits - tensor([[-5.3555, -4.3324,  3.6988, -2.9939],
        [-6.0416,  2.2278, -7.3848, -2.1516],
        [-7.6477,  3.4430, -7.6177, -3.1258],
        [-6.4051,  3.5039, -7.8946, -4.5609],
        [-8.2702,  3.2649, -6.5775, -3.1708],
        [-7.0661,  3.1805, -7.9409, -3.6471],
        [-6.2521, -3.1787,  1.0345, -1.6253],
        [-7.2528, -4.0903, -6.0348,  3.9224]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 175/289 [02:12<01:26,  1.32it/s]

Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023572511970996857, logits - tensor([[-7.3406,  3.8040, -7.7235, -2.9118],
        [-4.8021, -4.2384,  3.2567, -3.7809],
        [-7.0539,  2.0557, -8.3525, -2.6704],
        [-7.0641, -3.0395, -6.4831,  3.7854],
        [-6.5352,  3.4114, -6.4802, -3.3260],
        [-6.9972,  3.2051, -7.2912, -4.1328],
        [-7.2589, -4.2090, -6.4347,  3.3689],
        [-7.6005, -2.3452, -6.1566,  2.7445]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 176/289 [02:13<01:25,  1.33it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03325452655553818, logits - tensor([[-6.8853,  2.2904, -6.7217, -3.8353],
        [-7.4426,  2.8805, -7.3027, -2.8523],
        [-7.0822,  2.6519, -6.8261, -2.0205],
        [-7.2146,  3.0150, -7.3320, -3.6830],
        [-7.3205,  3.2414, -6.6682, -2.2676],
        [-7.9045,  2.3771, -6.6552, -2.7744],
        [-5.5621, -3.4382,  2.5251, -2.2277],
        [-6.8278,  3.2414, -6.5429, -4.0070]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:14<01:24,  1.32it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07311370968818665, logits - tensor([[-6.3469e+00, -4.1124e+00, -6.1380e+00,  5.0532e+00],
        [-8.0290e+00,  2.2561e-01, -5.9848e+00,  7.2488e-04],
        [-5.7959e+00,  3.1559e+00, -5.2924e+00, -2.5466e+00],
        [-7.4810e+00,  4.4999e+00, -7.0471e+00, -3.5159e+00],
        [-7.3253e+00,  2.0154e+00, -6.5879e+00, -1.7157e+00],
        [-6.9524e+00,  4.6286e+00, -6.9610e+00, -4.3297e+00],
        [-7.1487e+00,  1.7439e+00, -7.3051e+00, -2.9320e+00],
        [-5.9124e+00, -4.3925e+00,  2.8591e+00, -3.3130e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:14<01:23,  1.33it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07238727062940598, logits - tensor([[-4.9223, -4.2804,  2.8371, -3.2092],
        [-8.4361,  3.1082, -8.1838, -3.6014],
        [-7.7847, -1.5756, -6.0277,  1.3110],
        [-5.1206, -4.2757,  3.3511, -3.4277],
        [-6.0862, -3.5951, -5.7031,  5.0108],
        [-6.4025,  3.1651, -7.1278, -1.9879],
        [-6.8095,  0.1452, -6.4902,  0.2920],
        [-6.3955, -3.3241, -4.6342,  3.6752]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 179/289 [02:15<01:22,  1.33it/s]

Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0348990298807621, logits - tensor([[-6.4941,  3.1978, -6.8027, -3.8960],
        [-5.9760, -4.3521, -4.9784,  3.5389],
        [-6.5578,  2.2138, -6.3770, -2.1538],
        [-6.9894,  1.7653, -6.9236, -1.7465],
        [-5.9107,  2.5667, -6.3302, -1.6825],
        [-7.0272, -4.7410,  2.7015, -3.7348],
        [-6.3393, -3.5875, -5.5682,  4.1188],
        [-7.1414,  3.1808, -7.1573, -3.5780]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 180/289 [02:16<01:22,  1.33it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06996951997280121, logits - tensor([[-5.2884,  3.6623, -6.1073, -3.5461],
        [-6.1960,  3.1884, -6.8348, -4.4100],
        [-4.3924, -0.6268, -4.5801,  1.5417],
        [-6.8330, -0.5646, -6.6367,  0.3936],
        [-6.9021,  3.4212, -6.8127, -3.2926],
        [-6.6851,  3.9749, -6.8920, -2.9366],
        [-6.4714,  3.4702, -7.1169, -3.7865],
        [-7.4739, -1.6758, -5.5877,  1.9327]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:17<01:21,  1.33it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.39697447419166565, logits - tensor([[-7.0060,  4.0969, -7.1574, -4.0295],
        [-5.9150, -2.4010,  1.0423, -1.6922],
        [-6.6186,  3.1862, -7.3549, -2.6313],
        [-7.0545,  3.0577, -7.8961, -3.2978],
        [-6.2438, -4.7884,  2.3391, -2.6238],
        [-7.3452, -3.1788, -5.7543,  3.4719],
        [-7.8968,  3.7666, -7.1726, -3.7422],
        [-6.2787,  3.6892, -5.9808, -4.0267]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:17<01:20,  1.32it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.037226613610982895, logits - tensor([[-6.9879, -3.4869, -5.2528,  3.5645],
        [-6.0924, -4.4315,  3.0574, -2.4851],
        [-5.1590,  3.2069, -6.4049, -3.4000],
        [-6.2935,  2.9772, -6.3361, -2.9473],
        [-6.9840,  1.9234, -6.7622, -1.8905],
        [-6.5208, -4.6124, -5.8439,  4.6205],
        [-6.5406, -2.8538, -5.5720,  3.8654],
        [-6.8460,  2.0223, -7.1051, -1.0840]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:18<01:20,  1.32it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2527138292789459, logits - tensor([[-6.0016, -4.3874,  2.7341, -3.0924],
        [-6.0271, -3.5165,  2.8368, -3.6353],
        [-7.3783, -1.5682, -4.9109,  2.4202],
        [-7.1233,  3.5519, -6.3647, -3.4645],
        [-7.3765, -0.3641, -5.8986,  1.1916],
        [-7.3908, -4.7998, -5.9521,  4.8668],
        [-7.8084,  3.0582, -8.0148, -2.3740],
        [-6.8049,  3.1586, -6.6445, -3.2245]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:19<01:19,  1.32it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12128859758377075, logits - tensor([[-6.5774,  2.8628, -6.2070, -2.8458],
        [-5.8895,  3.3006, -5.9939, -3.0973],
        [-6.6571,  2.1963, -6.0772, -3.0119],
        [-6.3705,  2.8233, -6.9327, -3.0725],
        [-6.8637,  1.9896, -6.7004, -2.6246],
        [-6.8934,  2.7765, -6.7801, -2.0332],
        [-6.1900, -4.5525,  2.7932, -2.6688],
        [-8.4125,  0.9664, -6.6520, -1.3772]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:20<01:18,  1.32it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024433186277747154, logits - tensor([[-8.1878, -3.8926, -6.1245,  3.6700],
        [-7.0111,  3.4278, -7.0199, -3.9265],
        [-7.1242,  4.0477, -7.3813, -2.7437],
        [-5.5702,  3.1844, -6.0505, -2.9241],
        [-6.9573,  3.1074, -6.4394, -2.9663],
        [-5.9120,  3.5671, -7.2306, -3.2153],
        [-7.6668,  2.4500, -7.5867, -1.6399],
        [-6.7219, -3.3081, -5.5343,  3.4453]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:20<01:17,  1.32it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24463197588920593, logits - tensor([[-7.6531, -3.9492, -6.1196,  4.0378],
        [-6.7181,  2.9292, -6.3847, -3.0728],
        [-7.1634,  3.4654, -7.6930, -3.8856],
        [-6.5570,  3.2415, -6.8870, -3.8067],
        [-6.5255, -4.3258, -6.4181,  4.1332],
        [-6.4670,  2.2929, -5.8469, -2.2973],
        [-6.6848,  2.5467, -6.4680, -2.3749],
        [-6.4018,  2.6407, -7.2330, -2.8109]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:21<01:17,  1.32it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18988315761089325, logits - tensor([[-6.7186,  3.3319, -6.7114, -3.9964],
        [-6.2242,  2.7716, -5.8825, -3.0128],
        [-6.1272, -3.8823,  2.2279, -2.5502],
        [-5.0080, -3.1913, -4.7465,  3.9259],
        [-6.8820,  3.1740, -7.3381, -2.4644],
        [-7.3506, -4.1109,  2.4872, -2.2301],
        [-7.2915,  2.7732, -5.9467, -2.9976],
        [-6.9557, -1.4378, -4.6164,  1.5401]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 188/289 [02:22<01:16,  1.32it/s]

Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19398388266563416, logits - tensor([[-6.6034,  3.8163, -6.4816, -3.7630],
        [-7.3403,  2.9812, -6.7779, -2.4830],
        [-6.3871,  3.6896, -6.4943, -3.3291],
        [-6.8871, -3.5076, -6.7785,  2.6191],
        [-7.9964,  2.5755, -7.6651, -2.6110],
        [-6.4312,  2.5821, -6.6653, -2.7992],
        [-6.3004, -4.0882, -5.6916,  4.4262],
        [-5.6065, -2.7924, -5.1914,  4.0780]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 189/289 [02:23<01:15,  1.32it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03018510714173317, logits - tensor([[-6.4731,  4.0246, -7.1632, -3.5759],
        [-7.5404, -3.9083, -5.9320,  3.9889],
        [-6.1077, -3.5643,  3.3419, -3.1521],
        [-6.8835, -4.1716, -4.2990,  3.4996],
        [-7.0474,  4.5094, -7.8682, -5.0434],
        [-5.5297, -3.0488,  1.0464, -1.8292],
        [-6.9313,  3.5274, -7.6059, -3.2196],
        [-7.4633,  2.8674, -7.8023, -2.6879]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:23<01:14,  1.32it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07945984601974487, logits - tensor([[-5.5416, -3.9715,  2.7413, -3.2930],
        [-6.8415,  0.4789, -6.7268, -0.0162],
        [-6.8241,  3.3156, -6.5075, -2.5553],
        [-6.5648,  2.9053, -6.5634, -3.0534],
        [-6.5969, -4.5961, -5.7450,  4.4259],
        [-7.5702, -1.4503, -6.6683,  1.5087],
        [-5.7009,  3.7788, -6.8072, -4.6990],
        [-6.3809,  3.4344, -7.0589, -3.4190]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 191/289 [02:24<01:14,  1.32it/s]

Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06664665043354034, logits - tensor([[-6.8687,  3.3795, -7.0039, -2.6486],
        [-7.5306,  0.6049, -7.6673,  0.3191],
        [-5.5094, -3.9096,  2.5977, -3.1708],
        [-6.1806,  2.1770, -5.8015, -3.0601],
        [-6.8532,  2.0996, -7.0790, -2.3320],
        [-6.8154, -4.7474,  2.8023, -3.9195],
        [-7.2520,  4.0263, -7.5037, -3.9942],
        [-7.0847,  4.0930, -6.9569, -2.5155]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▋   | 192/289 [02:25<01:13,  1.32it/s]

Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03897450864315033, logits - tensor([[-6.8301,  3.5883, -7.4890, -2.9342],
        [-6.5254, -3.9810,  3.3864, -2.5929],
        [-8.1438,  2.5464, -7.1703, -2.6859],
        [-6.0090, -4.0118,  2.7080, -3.2390],
        [-6.6955, -2.2246, -5.7431,  1.8682],
        [-6.1457, -4.1984, -5.8142,  4.5523],
        [-5.5268,  2.7549, -5.7503, -3.4790],
        [-6.1808, -3.4070,  1.0525, -2.7699]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 193/289 [02:26<01:12,  1.32it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.047679949551820755, logits - tensor([[-7.2918,  2.6794, -6.1266, -2.9450],
        [-6.8578,  1.8428, -6.5296, -2.2847],
        [-6.8324, -2.1970, -6.4580,  2.8362],
        [-6.7486,  1.6070, -6.3589, -1.0750],
        [-6.8843,  2.8896, -7.4752, -2.7961],
        [-6.7481, -4.2007, -6.2868,  3.8941],
        [-5.9051,  4.0335, -6.3660, -3.0048],
        [-5.9849,  1.7828, -6.9279, -1.9566]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 194/289 [02:26<01:11,  1.32it/s]

Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026641294360160828, logits - tensor([[-6.3161,  2.0484, -6.8918, -2.4299],
        [-6.4815,  2.8033, -6.2069, -3.0262],
        [-7.6340,  3.7476, -6.2898, -3.2314],
        [-7.8367, -4.2326, -6.0265,  4.3305],
        [-5.4959,  4.0189, -6.5089, -3.4639],
        [-6.9862,  2.8249, -8.3940, -3.2563],
        [-5.9515, -2.6976, -5.6057,  2.1290],
        [-5.9519,  3.5371, -5.6531, -2.6213]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 195/289 [02:27<01:11,  1.32it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02396836318075657, logits - tensor([[-6.1523,  3.7722, -6.5208, -3.1309],
        [-5.9756,  3.6494, -7.3084, -3.3207],
        [-5.9089,  3.2304, -5.7151, -3.4416],
        [-5.9903,  3.4240, -6.1363, -4.2238],
        [-7.1538,  3.8290, -7.0554, -3.3403],
        [-5.6842, -4.4952,  2.1788, -2.9331],
        [-7.6777,  3.3076, -6.9771, -2.5838],
        [-7.8390,  2.6646, -7.1259, -2.2971]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:28<01:10,  1.32it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02912086620926857, logits - tensor([[-6.7593,  3.0897, -7.1080, -2.8762],
        [-6.5119,  2.7671, -6.5873, -3.5029],
        [-5.4510,  3.1222, -7.0487, -3.0365],
        [-5.9165,  2.9888, -6.4446, -2.9325],
        [-6.1513, -4.0994,  2.4949, -2.5004],
        [-6.8781,  3.7167, -6.9562, -3.1220],
        [-6.9769,  1.9583, -6.1446, -2.1153],
        [-6.5763,  3.7074, -6.0841, -4.2654]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 197/289 [02:29<01:09,  1.32it/s]

Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.025147806853055954, logits - tensor([[-7.9239, -2.9474, -5.8257,  5.1872],
        [-6.9446,  2.3082, -6.4750, -2.3944],
        [-8.0710,  1.9962, -8.1074, -1.7697],
        [-6.4442,  3.5521, -7.1169, -3.2275],
        [-6.9186,  3.2095, -6.4601, -3.5041],
        [-6.8596, -4.7040, -6.3616,  5.4016],
        [-7.6703,  2.7810, -6.8737, -3.6682],
        [-5.9653, -4.2403, -5.1722,  4.8495]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▊   | 198/289 [02:29<01:08,  1.32it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04904818907380104, logits - tensor([[-6.2809,  3.5883, -5.9235, -3.9673],
        [-6.3878,  4.3609, -6.8464, -3.7578],
        [-5.2386, -3.2447,  1.6138, -2.1322],
        [-8.3081, -4.1465, -6.8411,  4.1524],
        [-7.0001,  3.8935, -6.2102, -3.8052],
        [-7.5650,  2.8146, -6.2519, -2.9208],
        [-7.1223, -3.5893,  0.4930, -0.6928],
        [-5.9181, -3.7654, -5.5687,  4.3564]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:30<01:08,  1.32it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03035416454076767, logits - tensor([[-5.9307, -4.3620,  2.7515, -2.6956],
        [-5.6475,  2.4882, -6.5549, -2.2296],
        [-6.6618, -3.6109, -6.9247,  3.7181],
        [-6.3932,  3.3662, -6.8953, -3.6515],
        [-5.7780,  3.1473, -6.5683, -3.2118],
        [-4.9446, -4.1523, -3.6499,  2.5106],
        [-6.6725, -2.3033, -5.6867,  2.5201],
        [-6.7104, -2.6066, -5.6152,  2.7677]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 200/289 [02:31<01:07,  1.33it/s]

Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.175762340426445, logits - tensor([[-7.6996, -2.4768, -7.4574,  2.4309],
        [-7.1676,  3.3243, -6.3666, -3.4782],
        [-7.6210,  1.9662, -7.0211, -2.9275],
        [-7.7420,  3.6446, -6.8229, -3.8716],
        [-7.8982,  3.3303, -7.5931, -3.0258],
        [-6.0027, -4.7366, -5.8187,  4.2563],
        [-6.9104, -4.8333,  3.4760, -4.2264],
        [-6.5470,  3.0773, -6.6041, -3.2908]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 201/289 [02:32<01:06,  1.33it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030440261587500572, logits - tensor([[-7.3269,  3.9349, -6.7683, -3.5693],
        [-5.7848,  2.3898, -6.4683, -2.7397],
        [-6.2822,  3.4072, -6.2989, -2.2123],
        [-7.8908,  3.4469, -6.1390, -3.7849],
        [-7.8126,  2.2347, -7.5879, -1.5007],
        [-6.7233,  3.6169, -5.5534, -3.9942],
        [-5.0083, -3.6062,  2.7220, -2.4474],
        [-7.2033, -4.2217, -5.7313,  3.8353]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:33<01:05,  1.33it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1188754141330719, logits - tensor([[-6.6413,  2.5363, -7.4736, -1.9381],
        [-7.0787, -4.1900, -5.3560,  4.1331],
        [-7.0739,  2.8023, -7.7766, -2.8184],
        [-5.9594,  3.8146, -5.9298, -3.8195],
        [-5.9134,  2.8392, -6.5119, -2.7794],
        [-5.6701,  3.6630, -6.5379, -3.9139],
        [-5.9241, -2.8351,  1.1135, -1.3143],
        [-5.8165, -4.0829,  1.8534, -3.1457]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:33<01:04,  1.33it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022922858595848083, logits - tensor([[-4.8813, -4.1618, -5.2761,  4.6497],
        [-5.3212, -4.0821,  3.2413, -2.5946],
        [-6.4419, -4.6372,  3.7855, -3.5836],
        [-4.9872, -4.0194,  2.6026, -2.2887],
        [-6.2234,  3.3405, -6.8645, -3.3627],
        [-5.7862, -3.7350, -5.3758,  3.2670],
        [-6.0904,  3.4707, -6.3269, -3.0840],
        [-6.9132,  3.2812, -6.9246, -2.9443]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:34<01:04,  1.33it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04787961393594742, logits - tensor([[-6.4784,  0.5081, -6.5320, -1.5296],
        [-7.4363,  2.3456, -6.8357, -2.3531],
        [-6.4903, -3.8070,  3.1442, -2.3246],
        [-7.4267, -3.3352, -5.9124,  4.5138],
        [-7.1861,  2.2583, -7.8398, -1.7384],
        [-6.3615, -3.5870, -5.4931,  3.6068],
        [-5.4527,  3.1632, -6.3644, -3.6511],
        [-6.6568, -4.7936,  3.2872, -3.5540]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:35<01:03,  1.33it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04953642562031746, logits - tensor([[-6.0601, -5.0583,  3.3656, -2.4761],
        [-6.4921,  2.3405, -6.2983, -2.4369],
        [-6.9619,  3.2406, -6.6941, -3.1278],
        [-6.7061, -4.2671, -6.2772,  4.0854],
        [-7.5578,  4.5480, -6.6270, -3.5807],
        [-7.4563,  3.0188, -7.3756, -3.2577],
        [-5.4509, -3.0568,  0.4735, -0.6533],
        [-6.9521,  2.8150, -6.7023, -3.6617]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:36<01:02,  1.32it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.45003581047058105, logits - tensor([[-7.9827,  2.0897, -6.7994, -2.2987],
        [-6.9859,  2.9286, -6.5311, -2.5707],
        [-7.6699,  0.4704, -7.0079, -0.4280],
        [-5.9341,  2.1875, -6.1668, -2.3015],
        [-6.6462, -3.9892,  2.6441, -2.6880],
        [-7.0969, -3.3265, -5.9074,  2.7898],
        [-6.8326,  3.3272, -7.1000, -3.5729],
        [-7.1372,  1.9886, -7.4069, -2.6214]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:36<01:01,  1.33it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.045321766287088394, logits - tensor([[-7.0267,  3.2907, -7.4602, -2.9305],
        [-5.9079, -5.1363,  3.7569, -3.3581],
        [-7.1034,  2.5747, -6.7584, -2.6662],
        [-5.5992,  3.3506, -5.5196, -3.2948],
        [-7.1136,  3.0759, -7.3039, -3.6997],
        [-6.8481, -3.2994,  0.5705, -0.6376],
        [-5.8092, -4.4754, -5.2125,  5.1772],
        [-5.9469,  3.3733, -6.2714, -3.5274]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:37<01:00,  1.33it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.055286623537540436, logits - tensor([[-6.2397, -4.6366, -5.6842,  5.2993],
        [-6.5844,  3.4002, -6.9833, -3.6884],
        [-6.6566,  3.1632, -6.5677, -3.3740],
        [-6.3234, -3.1699, -6.3987,  3.7476],
        [-6.1553, -3.3262,  1.2100, -1.8413],
        [-8.2089,  2.8887, -7.3766, -3.5491],
        [-5.7361, -4.5929,  3.2352, -3.0214],
        [-7.5223, -0.3578, -7.3770,  0.7594]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:38<01:00,  1.32it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.025850027799606323, logits - tensor([[-7.3460,  3.8493, -7.9785, -2.3231],
        [-6.8617,  3.8126, -5.9181, -3.9185],
        [-6.2141,  3.3439, -6.1735, -3.9446],
        [-5.4062,  2.7068, -5.5908, -3.3621],
        [-7.2570,  3.0888, -7.9493, -2.0738],
        [-6.5503, -2.6051, -6.2107,  3.5781],
        [-7.2152, -3.4055, -5.8019,  3.9373],
        [-5.6408,  2.0715, -5.7686, -2.8870]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 210/289 [02:39<00:59,  1.32it/s]

Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01730356737971306, logits - tensor([[-6.2934, -4.1975, -4.7322,  4.2288],
        [-6.8324, -4.7260, -5.5178,  4.0698],
        [-7.8250, -3.8946, -6.1404,  3.9307],
        [-6.2099,  4.1894, -6.6885, -3.1275],
        [-6.7391, -3.8384,  2.2467, -2.3207],
        [-6.3957, -4.3797,  3.8816, -3.5698],
        [-7.1874,  4.1310, -6.3888, -3.2038],
        [-6.6218,  4.0027, -7.4785, -3.7881]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 211/289 [02:39<00:58,  1.32it/s]

Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03334759175777435, logits - tensor([[-6.5574, -4.2071,  3.2715, -4.2317],
        [-6.8819, -2.3305, -6.7194,  1.8646],
        [-6.5885,  3.2455, -6.1820, -3.5033],
        [-5.5186, -4.1765,  2.2408, -3.3802],
        [-6.9464,  1.8339, -7.9896, -1.8672],
        [-6.2893,  3.6940, -6.6792, -3.5098],
        [-6.2184,  3.5055, -7.2495, -3.3763],
        [-6.0549, -2.7617, -5.0809,  2.9924]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 212/289 [02:40<00:58,  1.32it/s]

Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.029688898473978043, logits - tensor([[-6.3861,  3.1625, -6.4806, -3.1056],
        [-6.0544, -4.4818,  3.8973, -3.1074],
        [-7.4261, -3.5185, -6.7669,  3.5915],
        [-6.6280,  2.7782, -6.1296, -2.7090],
        [-7.9270, -2.9835, -7.2383,  2.8133],
        [-6.9524,  2.6134, -7.3875, -2.0787],
        [-6.1098, -3.5467,  2.2846, -2.1479],
        [-5.8947,  3.2931, -6.8762, -3.8735]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▎  | 213/289 [02:41<00:57,  1.32it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05470762029290199, logits - tensor([[-6.7926,  1.9561, -7.2486, -1.7715],
        [-7.6029, -2.6905, -6.3534,  3.6401],
        [-6.5859, -4.6645, -5.4163,  4.9107],
        [-4.9390,  2.7134, -5.3181, -3.5812],
        [-6.8394,  0.9429, -6.3491, -0.6546],
        [-6.8313,  2.8402, -6.9975, -2.4148],
        [-6.3658,  3.2314, -6.9109, -3.7268],
        [-9.1277,  2.3397, -8.0512, -1.5806]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 214/289 [02:42<00:56,  1.32it/s]

Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0309157632291317, logits - tensor([[-7.7592,  3.3101, -7.1904, -3.6917],
        [-6.1955,  3.9755, -6.7790, -3.4932],
        [-7.6282, -2.7493, -6.1934,  3.7286],
        [-5.9476,  2.4911, -6.3488, -2.3663],
        [-6.2757, -3.4869,  1.7785, -2.6348],
        [-7.5550,  2.0292, -8.0117, -2.3002],
        [-6.8335, -3.4450, -6.5784,  2.9727],
        [-6.8549,  4.0164, -6.9577, -3.3895]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 215/289 [02:42<00:56,  1.31it/s]

Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030095811933279037, logits - tensor([[-7.5980,  2.8469, -7.0768, -3.5677],
        [-6.1722, -3.5301,  1.8562, -2.2540],
        [-6.4599,  3.4143, -6.8733, -4.2560],
        [-6.6872,  2.0545, -6.2268, -2.0053],
        [-7.2100,  3.0748, -7.1838, -3.1484],
        [-5.8646, -3.3678, -6.3618,  3.2989],
        [-6.6889,  3.2044, -7.2381, -3.6876],
        [-5.5156,  2.9678, -5.7502, -4.1808]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▍  | 216/289 [02:43<00:55,  1.31it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.015161378309130669, logits - tensor([[-6.4182, -3.8866, -5.7317,  3.7280],
        [-7.6407, -4.3707, -4.0010,  2.6871],
        [-6.1242,  3.2801, -7.3606, -3.2601],
        [-6.6037,  4.1079, -7.0830, -4.5285],
        [-6.3745, -3.7656, -5.9580,  5.0108],
        [-7.0814,  3.4156, -7.0150, -3.0735],
        [-6.6509,  3.4807, -7.1236, -2.9115],
        [-6.0394, -4.8470,  4.6414, -4.2442]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 217/289 [02:44<00:54,  1.31it/s]

Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24811217188835144, logits - tensor([[-6.0311,  3.9533, -6.2793, -3.3106],
        [-4.4157,  3.5281, -4.3642, -3.0727],
        [-6.4588,  3.3809, -5.7257, -3.4473],
        [-6.6737,  3.6350, -6.6042, -3.3696],
        [-7.3495,  2.1915, -7.1806, -2.4247],
        [-7.0638,  3.4356, -6.0939, -3.5742],
        [-5.3692, -2.8045,  1.5683, -1.4001],
        [-7.1224,  2.9637, -6.8204, -3.1644]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 75%|███████▌  | 218/289 [02:45<00:54,  1.31it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.020608332008123398, logits - tensor([[-7.0777,  2.3766, -7.0992, -2.8847],
        [-7.4134,  2.9410, -7.1502, -2.9669],
        [-6.2025, -4.5063, -4.7844,  4.1418],
        [-6.2384,  4.5723, -6.0727, -4.0463],
        [-7.5315,  3.4805, -7.1001, -3.1668],
        [-6.5185,  3.7683, -6.8003, -3.8398],
        [-5.6819,  2.8088, -5.9058, -2.6456],
        [-6.0849, -3.5567, -5.9352,  2.8411]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:45<00:53,  1.31it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03253645449876785, logits - tensor([[-6.0095, -2.8970,  1.7292, -1.7277],
        [-7.1196,  3.5769, -7.4989, -2.9474],
        [-4.9789,  3.5268, -5.2985, -2.7776],
        [-7.4536, -2.9658, -5.1278,  2.7104],
        [-7.0459,  3.1338, -7.2704, -3.1627],
        [-7.4508, -2.9388, -6.5174,  3.7418],
        [-6.1309,  3.1224, -7.5479, -3.5022],
        [-7.0544, -3.4283, -5.4940,  2.4769]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:46<00:52,  1.31it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022944219410419464, logits - tensor([[-5.7015,  3.2387, -6.6891, -3.1204],
        [-6.5353, -3.6881, -5.9772,  3.4522],
        [-5.3078, -3.6756,  2.3599, -2.2499],
        [-7.1098, -4.4074, -6.2693,  3.8889],
        [-5.9101,  3.1167, -5.3087, -2.6862],
        [-8.2776,  3.6702, -7.7164, -3.3788],
        [-8.2991,  3.5503, -7.7401, -3.5576],
        [-5.9399,  3.0539, -6.5977, -2.9886]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:47<00:52,  1.30it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06566363573074341, logits - tensor([[-5.8714e+00, -3.6823e+00, -4.9216e+00,  4.1032e+00],
        [-6.3753e+00,  3.6147e+00, -7.4908e+00, -3.6530e+00],
        [-6.6425e+00, -3.9008e+00, -6.6505e+00,  3.8628e+00],
        [-8.6525e+00,  2.2619e-01, -7.4249e+00, -5.9122e-03],
        [-6.7163e+00,  3.9797e+00, -6.9992e+00, -4.0204e+00],
        [-6.4336e+00,  3.9383e+00, -6.1712e+00, -2.8858e+00],
        [-7.1432e+00,  2.2728e+00, -6.9419e+00, -2.2369e+00],
        [-6.3948e+00,  2.5625e+00, -6.5726e+00, -3.0072e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:48<00:51,  1.31it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17422932386398315, logits - tensor([[-7.1004,  2.4089, -6.3491, -2.6341],
        [-7.2817,  4.2480, -7.0308, -3.3142],
        [-5.9779,  3.0679, -5.9665, -2.8757],
        [-7.0736, -3.2127, -5.5688,  4.2508],
        [-6.2765,  3.9679, -6.2644, -3.1713],
        [-5.5335, -4.0046,  3.0068, -3.3970],
        [-6.6189,  3.0284, -7.2381, -1.6579],
        [-5.4213, -4.1945,  2.9646, -2.8603]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:48<00:50,  1.31it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.019260574132204056, logits - tensor([[-6.0049,  3.0631, -6.7752, -3.7694],
        [-6.0141, -3.9387,  3.6053, -3.3933],
        [-6.9020, -3.5610, -4.3281,  2.5572],
        [-6.8621,  3.7741, -7.0768, -3.9341],
        [-7.9759,  4.2120, -7.4288, -3.3810],
        [-6.1280, -4.5091,  2.7903, -3.4819],
        [-6.0915,  3.0344, -6.3834, -2.9706],
        [-5.3611,  3.8322, -6.6156, -3.8328]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:49<00:49,  1.32it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11903621256351471, logits - tensor([[-6.3922, -3.5681,  1.4786, -1.8467],
        [-6.4657,  3.4978, -5.8792, -3.7065],
        [-7.2174, -3.7113, -5.3543,  2.7630],
        [-6.4329,  3.7465, -6.5131, -3.6319],
        [-6.7810,  0.4223, -6.5927, -1.4211],
        [-7.2571,  1.3852, -6.7003, -1.1554],
        [-7.4186,  4.1357, -6.8919, -4.7887],
        [-7.1551,  2.7511, -5.7032, -2.6406]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:50<00:48,  1.32it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01848467066884041, logits - tensor([[-6.5898,  3.5624, -7.3692, -3.5711],
        [-7.8188,  3.9169, -7.6699, -4.1755],
        [-6.4110,  3.2525, -6.8069, -2.9200],
        [-6.5500, -3.3506, -5.8447,  2.7605],
        [-7.5262,  2.0118, -6.7217, -3.0836],
        [-7.5016, -4.1383, -5.9284,  4.1915],
        [-6.4411, -3.5570, -5.8236,  4.5473],
        [-7.6640,  3.8611, -7.3919, -3.3825]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:51<00:47,  1.32it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05985507369041443, logits - tensor([[-6.9048,  3.2693, -7.5287, -3.3314],
        [-6.6660,  3.7546, -5.5555, -3.0987],
        [-5.8541,  3.2079, -5.2122, -4.2064],
        [-5.6901,  3.8682, -6.5779, -3.6607],
        [-6.3700, -1.1923, -5.3091,  1.9107],
        [-5.1781, -4.5771, -4.0556,  4.8181],
        [-7.9093, -0.0856, -6.5816,  0.7092],
        [-6.9138,  2.8452, -6.2595, -2.3390]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:51<00:46,  1.33it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1624450981616974, logits - tensor([[-5.3025, -4.4306, -5.5152,  5.1425],
        [-6.3432,  2.8035, -6.0011, -3.8280],
        [-6.7922, -1.6204, -6.1568,  1.4267],
        [-7.3070,  2.8812, -6.4774, -2.4770],
        [-5.2619,  3.9444, -6.7388, -3.6238],
        [-7.4296,  2.0503, -7.3887, -2.1121],
        [-4.7511, -5.1437, -4.3024,  4.4591],
        [-4.8593, -3.6711, -5.3144,  3.7703]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:52<00:45,  1.33it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14096976816654205, logits - tensor([[-5.8276, -3.2349, -5.2015,  4.1360],
        [-6.9492, -2.4478, -5.6850,  2.7407],
        [-7.7268, -1.4880, -6.8180,  1.1657],
        [-6.4711,  2.0054, -6.5074, -3.0729],
        [-7.0906, -3.6676,  1.7096, -1.8764],
        [-6.4889,  4.2077, -6.3514, -4.2192],
        [-6.5119,  3.0506, -6.6815, -3.0476],
        [-5.8185, -3.5793,  1.0848, -1.4586]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:53<00:45,  1.33it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19818998873233795, logits - tensor([[-6.4889,  3.7159, -7.1002, -4.4280],
        [-5.9323,  3.3583, -6.2178, -3.0714],
        [-6.1620,  3.0132, -6.4267, -3.5740],
        [-5.6306,  3.2950, -5.9128, -2.4997],
        [-5.6978, -4.1486,  3.1495, -2.7484],
        [-5.1300,  3.6691, -6.3321, -3.4364],
        [-7.6698,  2.6525, -6.2547, -2.9967],
        [-5.7678, -3.4425, -5.8931,  3.6623]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:54<00:44,  1.32it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.031704552471637726, logits - tensor([[-6.5681,  4.0649, -6.8732, -3.9588],
        [-8.0342,  2.5103, -8.6853, -2.9938],
        [-6.0264, -4.5849, -5.7447,  3.7582],
        [-6.1747, -3.6594,  2.3935, -2.6084],
        [-6.3119,  3.9319, -6.4799, -3.7036],
        [-7.6014,  3.9257, -8.4229, -3.7077],
        [-8.0597, -1.0132, -7.4108,  1.6919],
        [-6.0115, -4.4707, -5.0826,  3.4057]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:54<00:43,  1.32it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.035458095371723175, logits - tensor([[-5.7930,  1.2517, -6.2369, -1.2284],
        [-6.4059, -4.4738, -6.0996,  4.3507],
        [-6.6046, -2.7646, -4.9685,  2.8665],
        [-5.9437,  3.1833, -6.2893, -2.8617],
        [-7.0944,  4.2590, -6.2605, -4.8704],
        [-6.5226, -4.0129,  2.0181, -2.7099],
        [-6.4570, -4.8680, -5.3287,  4.6669],
        [-5.3910,  3.0587, -6.7415, -2.7860]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:55<00:43,  1.32it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01887081377208233, logits - tensor([[-6.7616,  3.3525, -6.7052, -3.8718],
        [-7.1371, -2.3820, -5.4252,  2.5843],
        [-6.0884,  3.3220, -6.9403, -4.3425],
        [-7.7183, -4.1602, -6.4919,  3.2360],
        [-6.7706, -3.2457, -4.8766,  3.1907],
        [-6.6794, -3.7517, -6.2359,  3.8099],
        [-7.2137,  3.3779, -6.7493, -2.6646],
        [-5.9289, -4.2764, -5.3814,  4.3212]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:56<00:42,  1.32it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18684861063957214, logits - tensor([[-6.4400, -3.4409, -4.3537,  3.8290],
        [-6.0884, -4.7292,  3.2279, -3.6129],
        [-6.1366,  2.6235, -6.7835, -2.3489],
        [-6.5081,  3.2647, -5.3972, -4.2947],
        [-6.2060, -3.9801,  2.6983, -2.5445],
        [-6.6856,  3.8631, -6.5565, -4.4229],
        [-7.7587,  3.5673, -7.2063, -2.3303],
        [-7.0496,  3.7355, -6.5573, -3.6168]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:57<00:41,  1.32it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04542429745197296, logits - tensor([[-5.6382, -3.7011, -5.7802,  4.0078],
        [-7.7460, -5.1681, -6.2698,  3.0973],
        [-6.4029, -3.7159, -5.5546,  3.4909],
        [-7.1333,  0.4598, -5.3097, -0.3542],
        [-6.2672,  3.5724, -6.4948, -4.3597],
        [-4.8471,  2.6571, -5.9818, -2.7312],
        [-5.1126, -4.0606, -5.3191,  4.4488],
        [-6.2865, -4.2952, -5.9487,  3.7479]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:58<00:40,  1.32it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16648061573505402, logits - tensor([[-6.7043,  2.2925, -7.1931, -2.5025],
        [-7.3682,  3.5997, -6.6758, -3.7397],
        [-6.7979, -4.3881, -6.1146,  4.2699],
        [-8.4502, -1.8636, -8.1374,  2.0079],
        [-6.9496, -3.3462,  1.2106, -2.0153],
        [-6.4301,  2.0475, -6.3006, -2.3698],
        [-5.5017, -4.5447, -5.2163,  3.9130],
        [-6.4440,  1.8156, -5.8653, -2.2998]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 236/289 [02:58<00:40,  1.32it/s]

Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15020370483398438, logits - tensor([[-6.3528,  3.5945, -6.5312, -2.9741],
        [-6.0983, -1.8427, -5.2284,  1.9263],
        [-6.2700,  2.0540, -5.4143, -2.6560],
        [-7.0631,  2.6794, -7.3246, -2.4483],
        [-6.6684,  2.9878, -7.2907, -3.5740],
        [-5.9123,  3.2451, -6.3947, -4.0018],
        [-7.1861,  2.9171, -7.2041, -2.8649],
        [-5.5434,  3.3747, -5.6838, -3.4558]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 237/289 [02:59<00:39,  1.31it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19532805681228638, logits - tensor([[-6.9546,  3.2902, -6.0642, -3.2788],
        [-7.7387,  2.0678, -7.6888, -2.2212],
        [-6.2409,  3.4718, -7.7850, -3.0864],
        [-6.3514,  3.7505, -6.5896, -3.0695],
        [-7.0311, -3.0885, -6.0285,  2.2208],
        [-6.5751, -5.0829,  3.5440, -3.7979],
        [-7.2474,  3.1854, -6.3442, -3.7850],
        [-7.2995,  2.9995, -6.6527, -2.3889]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 82%|████████▏ | 238/289 [03:00<00:39,  1.31it/s]

Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01994010992348194, logits - tensor([[-6.9113,  3.0225, -6.6049, -3.5299],
        [-6.7510, -2.4219, -6.2658,  2.9154],
        [-7.2144,  3.3195, -6.8910, -3.5941],
        [-6.4338,  4.7181, -6.7561, -3.8717],
        [-7.6034,  3.0244, -6.6851, -3.3743],
        [-6.2851,  2.9643, -6.9571, -3.7525],
        [-7.1568, -4.6695, -5.6840,  3.6883],
        [-7.2022,  3.1378, -6.8854, -2.4833]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 239/289 [03:01<00:38,  1.30it/s]

Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24161043763160706, logits - tensor([[-6.1393, -4.7318,  2.7557, -3.2589],
        [-6.8036,  4.1184, -6.4829, -4.0844],
        [-7.6581,  2.5640, -7.3317, -2.7321],
        [-5.7507, -2.5812,  0.5556, -1.4985],
        [-6.2630,  3.5592, -6.8673, -3.4368],
        [-6.0047, -4.1148,  2.5558, -2.3429],
        [-7.2279,  3.3722, -7.2366, -3.5531],
        [-5.9804, -3.3711,  2.8903, -2.9080]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 240/289 [03:01<00:37,  1.31it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.019657865166664124, logits - tensor([[-7.0544,  3.9477, -7.0450, -3.0640],
        [-6.9044,  2.6091, -6.5928, -2.7600],
        [-7.7475, -2.5153, -6.5727,  3.0954],
        [-6.5165,  4.1583, -6.8749, -4.3602],
        [-6.6875, -4.5570,  3.9711, -3.3334],
        [-6.1439,  3.1184, -5.8739, -3.8846],
        [-6.7570,  3.8138, -6.9666, -3.2452],
        [-8.1781, -2.9037, -6.4792,  3.8091]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:02<00:36,  1.31it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03691532835364342, logits - tensor([[-6.0315, -4.1009, -5.5032,  3.7368],
        [-6.0285, -3.3181, -4.3097,  3.4157],
        [-7.5490,  4.1659, -7.8550, -3.2860],
        [-6.0984,  3.4868, -6.2820, -3.4961],
        [-6.6425,  3.5934, -6.9076, -3.2907],
        [-6.6480, -2.2824, -1.6896,  1.2440],
        [-6.0364, -3.9943, -5.0110,  4.1739],
        [-5.1916, -2.8002,  2.2611, -1.8903]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:03<00:35,  1.31it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04467959329485893, logits - tensor([[-6.7830,  4.0938, -6.9082, -4.2875],
        [-7.1466,  4.5665, -6.2873, -3.4916],
        [-5.4888, -3.6270,  2.2755, -2.5364],
        [-6.3518, -0.6896, -5.8700,  0.8314],
        [-8.0390,  3.0593, -7.5976, -3.1828],
        [-5.9341,  2.6083, -6.1117, -3.5927],
        [-7.8831,  2.9524, -6.9336, -3.1079],
        [-6.8493,  3.3048, -7.0820, -2.9089]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:04<00:35,  1.31it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03683706372976303, logits - tensor([[-7.9609,  3.4915, -7.2398, -2.2453],
        [-6.0007, -4.2812, -4.7152,  3.1948],
        [-7.0878,  3.4365, -7.7515, -3.2547],
        [-7.0197,  3.0510, -7.2906, -3.5530],
        [-7.1265,  3.2168, -7.5337, -2.9324],
        [-6.4849, -4.0464,  1.8341, -2.7543],
        [-5.9961, -3.6267,  1.0656, -2.3553],
        [-6.3489,  2.8699, -6.8155, -3.1937]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:04<00:34,  1.31it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02475198358297348, logits - tensor([[-5.9629,  3.6987, -5.3380, -3.8373],
        [-6.2892, -3.0072, -6.5206,  3.0623],
        [-6.8425,  3.9495, -5.8275, -2.7997],
        [-5.7478, -4.4215, -5.0701,  4.1890],
        [-7.9646,  2.0002, -6.8365, -2.1184],
        [-7.6584, -3.5024, -7.2850,  4.4830],
        [-5.7765,  3.8278, -6.1849, -3.6963],
        [-7.0410, -3.4923,  2.4358, -2.5781]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:05<00:33,  1.32it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03648579865694046, logits - tensor([[-5.7158, -3.4848,  2.5213, -2.4390],
        [-6.9610,  2.9566, -6.6849, -2.7339],
        [-5.8838, -4.0656,  3.4698, -3.4319],
        [-7.2893,  3.3433, -7.3280, -3.8304],
        [-5.8881, -3.3523,  2.2111, -1.3440],
        [-7.4527,  3.9720, -7.6297, -3.8592],
        [-7.1504,  2.0276, -7.4198, -2.1700],
        [-5.3348,  3.5726, -6.0886, -3.3038]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:06<00:32,  1.32it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17273342609405518, logits - tensor([[-5.6470, -4.1896, -5.9113,  4.6809],
        [-7.6984,  3.3414, -7.4785, -2.8179],
        [-6.1557, -5.1082, -6.1481,  5.1994],
        [-6.6216,  3.5182, -5.6959, -3.6281],
        [-5.9807,  2.9166, -6.6753, -2.4476],
        [-7.6165,  2.6917, -7.2111, -2.1777],
        [-5.8106,  2.8949, -6.8323, -3.5556],
        [-5.9444,  3.4324, -5.5606, -3.9073]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:07<00:31,  1.32it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10429459810256958, logits - tensor([[-7.1969,  3.6758, -6.8320, -3.2511],
        [-6.6222,  2.8853, -6.4862, -2.4448],
        [-5.9279,  3.2375, -5.7919, -3.0133],
        [-4.8855, -3.0783,  1.7507, -1.5702],
        [-6.8731, -0.7071, -6.0578,  0.9975],
        [-6.9954, -4.6886, -5.3848,  3.7546],
        [-6.2841, -4.1470, -5.5264,  4.2497],
        [-6.6172,  2.2439, -5.8180, -3.2751]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:07<00:31,  1.32it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2547794580459595, logits - tensor([[-6.6372,  2.7046, -6.2729, -3.1216],
        [-6.8745,  3.1622, -5.8310, -3.3084],
        [-8.2035,  3.4127, -8.1014, -3.9729],
        [-6.1714,  4.1484, -6.9081, -3.8110],
        [-6.6253,  2.9514, -7.1813, -3.2361],
        [-6.0897,  3.3356, -6.5620, -2.5615],
        [-8.0415,  1.1950, -6.4730, -1.1005],
        [-6.9015,  3.5035, -6.9872, -3.5467]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:08<00:30,  1.32it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3908085227012634, logits - tensor([[-6.2034,  3.1115, -6.7444, -3.7747],
        [-7.3270, -2.9948, -7.0174,  2.8276],
        [-7.1616,  2.9475, -6.5833, -3.3600],
        [-6.3196,  3.3477, -6.2099, -3.2240],
        [-7.7712,  3.0814, -7.3998, -3.1574],
        [-6.6146,  2.8733, -6.7574, -2.6479],
        [-6.4083, -3.8399,  2.5838, -2.8353],
        [-6.0889, -4.1982, -5.0449,  4.2633]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:09<00:29,  1.32it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.018066231161355972, logits - tensor([[-6.9667,  4.3029, -6.7974, -3.7582],
        [-6.5959,  3.4937, -6.5200, -3.2489],
        [-7.4806,  3.3134, -6.3594, -3.4578],
        [-7.3270, -4.0291, -5.8802,  3.1231],
        [-5.0231,  3.2445, -6.2727, -3.2769],
        [-6.9116, -4.3620,  3.2633, -3.6623],
        [-6.7986,  3.2384, -6.9080, -2.8684],
        [-6.9803,  3.0473, -6.9055, -3.4570]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:10<00:28,  1.32it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1871868073940277, logits - tensor([[-7.8729,  4.4447, -8.8886, -4.3367],
        [-7.1400,  4.3514, -7.3263, -3.5924],
        [-6.5105, -3.1396,  0.7193, -1.4414],
        [-6.1943,  3.4088, -7.0827, -3.4992],
        [-7.4283,  3.7081, -7.2759, -3.1707],
        [-6.1312, -4.1970, -5.0551,  3.6553],
        [-6.4157,  2.0176, -6.1587, -2.7864],
        [-7.2599,  2.4770, -7.4358, -2.2411]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:10<00:28,  1.32it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24346508085727692, logits - tensor([[-6.8329,  3.7284, -5.7965, -4.0809],
        [-6.7142, -4.5565,  2.6214, -3.2584],
        [-5.0621, -4.1326,  3.0726, -3.2066],
        [-6.3589,  3.5748, -6.4439, -3.2882],
        [-4.9247, -3.3807, -5.4449,  3.2194],
        [-6.3559,  3.5085, -6.7657, -3.2384],
        [-6.5705, -4.0601, -5.1891,  3.6916],
        [-8.4213,  3.5759, -7.7755, -3.1492]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 253/289 [03:11<00:27,  1.32it/s]

Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14128261804580688, logits - tensor([[-6.1926,  3.6970, -6.3513, -3.7069],
        [-7.7892,  1.9097, -5.4312, -1.5410],
        [-6.0417, -2.1361, -5.9257,  3.2394],
        [-6.9178,  3.5294, -6.5132, -3.0462],
        [-7.0810,  3.8474, -6.9841, -3.6675],
        [-5.7744,  3.7984, -7.0973, -3.9905],
        [-6.6865, -3.6756,  2.0727, -2.6496],
        [-6.4158,  2.6514, -6.2983, -2.6155]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 254/289 [03:12<00:26,  1.32it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.016135066747665405, logits - tensor([[-6.7587, -2.9598, -7.0066,  3.5451],
        [-7.4368, -4.7887, -5.1104,  5.0116],
        [-6.5439, -3.8233,  3.6460, -3.4537],
        [-7.0556,  2.6461, -7.1228, -3.3475],
        [-7.4391, -5.1935, -5.8165,  4.9801],
        [-5.8458,  3.8387, -6.1587, -4.1359],
        [-6.1221,  2.6280, -6.4177, -3.6402],
        [-4.9258, -4.3136,  3.8481, -3.4976]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:13<00:25,  1.32it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18981046974658966, logits - tensor([[-6.4398, -3.9036,  3.2563, -4.2889],
        [-6.1116, -3.0971,  1.2265, -1.0110],
        [-6.4155, -3.7567, -4.9182,  3.5670],
        [-5.7772,  3.6713, -6.8434, -3.1079],
        [-6.3106, -4.4661,  2.1815, -2.6505],
        [-7.2086, -1.0540, -5.4539,  1.2739],
        [-6.4372, -3.7503, -5.4944,  2.7386],
        [-6.4052, -4.2582,  3.7012, -2.7871]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:13<00:24,  1.33it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20317421853542328, logits - tensor([[-5.4202, -3.3756,  2.9169, -2.5241],
        [-7.5215,  2.2851, -6.9977, -2.7022],
        [-5.1161, -3.2209,  1.9573, -2.2232],
        [-6.5443, -4.2710,  2.2961, -2.1543],
        [-6.2928,  1.8294, -6.8788, -3.0829],
        [-6.6721,  2.8119, -6.9814, -3.2862],
        [-6.0409, -3.8902,  2.4411, -2.9282],
        [-6.3622,  4.1048, -6.3743, -3.9234]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:14<00:24,  1.32it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24963988363742828, logits - tensor([[-6.2505,  2.4133, -6.3344, -3.2354],
        [-6.8176,  3.0644, -5.9047, -2.7989],
        [-6.9422,  2.3005, -7.0225, -2.7891],
        [-6.5115, -4.3107,  3.4733, -3.5981],
        [-7.9269,  3.6717, -8.1284, -3.5519],
        [-6.4472,  2.7777, -5.5785, -2.1240],
        [-6.9247, -4.7070,  3.1019, -3.1568],
        [-6.2167,  4.0635, -7.0635, -3.0653]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 258/289 [03:15<00:23,  1.32it/s]

Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026956873014569283, logits - tensor([[-6.8271, -3.8782, -2.6379,  3.2164],
        [-5.3966,  2.6420, -6.2071, -3.0594],
        [-5.8735, -4.6059, -4.8861,  3.2621],
        [-6.4246,  3.0180, -6.9979, -2.6077],
        [-5.6581,  3.4049, -6.2583, -3.2767],
        [-7.5262, -2.7389, -6.1020,  2.6284],
        [-6.0057,  2.4729, -6.3203, -2.4321],
        [-7.4246,  3.4434, -8.0259, -3.8927]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 259/289 [03:16<00:22,  1.31it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 90%|████████▉ | 260/289 [03:17<00:21,  1.32it/s]

loss - 0.054143648594617844, logits - tensor([[-5.5018,  2.8257, -6.3353, -3.0926],
        [-7.7648, -4.7280, -6.9974,  4.4652],
        [-5.8924,  4.0133, -6.7681, -4.2783],
        [-5.7326,  2.9811, -7.1316, -2.9097],
        [-6.4770, -2.1674, -6.2011,  2.2839],
        [-6.9880, -0.3430, -4.7951,  0.3595],
        [-4.0155, -4.3477, -4.3966,  3.5005],
        [-5.7821,  3.1794, -6.6425, -2.7953]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3247233033180237, logits - tensor([[-6.3570, -0.7411, -6.2092,  0.8007],
        [-6.8150, -3.7071,  1.9280, -1.9427],
        [-5.9943,  2.8657, -6.8087, -3.4477],
        [-7.2341,  4.3488, -7.2132, -3.9762],
        [-7.8390,  2.3809, -6.3317, -3.4680],
        [-7.3438,  3.8304, -6.9030, -4.4081],
        [-7.9128,  1.1138, -6.7888, -1.1096],
        [-7.2132, -0.8938, -3.9933,  0

 90%|█████████ | 261/289 [03:17<00:21,  1.32it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2042582631111145, logits - tensor([[-5.8030, -3.4869,  2.9088, -2.7019],
        [-7.3293,  3.0046, -7.1852, -2.4785],
        [-6.2377,  3.4501, -6.0663, -2.8697],
        [-8.1664, -2.4250, -6.8237,  1.9258],
        [-6.8476, -3.5544, -5.2136,  3.7751],
        [-6.6271,  2.6031, -7.0531, -3.3519],
        [-7.1084,  3.8536, -7.0120, -3.8090],
        [-7.1143,  2.8763, -7.4256, -2.7022]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:18<00:20,  1.32it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11646084487438202, logits - tensor([[-6.1585, -3.5649, -5.6806,  3.8328],
        [-6.9076, -4.7017, -6.3674,  3.8319],
        [-5.8773,  2.8680, -6.4187, -3.3541],
        [-6.0440, -3.0232, -5.5614,  3.4365],
        [-7.1842,  3.1172, -7.9290, -3.3185],
        [-7.1798,  3.5294, -7.1502, -3.1939],
        [-7.5179, -0.2502, -6.3536,  0.8698],
        [-7.4247, -2.4492,  0.3638, -1.1243]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:19<00:19,  1.31it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.29456081986427307, logits - tensor([[-7.7537, -4.6143, -6.4589,  3.5433],
        [-6.8549,  2.9133, -6.4118, -3.5364],
        [-8.4815,  0.6583, -6.1921, -1.0451],
        [-7.7690,  4.0033, -7.9136, -3.3836],
        [-7.1408, -3.9311, -6.1545,  2.8954],
        [-7.5040,  2.7168, -7.9664, -2.8107],
        [-7.4416,  2.8333, -6.3691, -3.6409],
        [-5.8802,  3.3348, -5.3040, -3.1219]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████▏| 264/289 [03:20<00:19,  1.31it/s]

Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026553666219115257, logits - tensor([[-7.8381,  2.4447, -6.8832, -3.3352],
        [-5.4625, -4.4989,  2.4437, -2.6040],
        [-6.0400, -3.5507,  2.9652, -2.7417],
        [-5.4720, -3.6912, -5.2025,  4.1337],
        [-7.1826,  3.0249, -7.4125, -2.4334],
        [-6.6946, -3.9999, -5.7294,  4.5909],
        [-6.3009,  3.9508, -6.6059, -3.6821],
        [-7.0984,  2.7006, -7.2810, -2.3942]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 265/289 [03:20<00:18,  1.31it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.043077874928712845, logits - tensor([[-7.0584, -4.0768,  1.6454, -2.7404],
        [-6.1710, -3.4712,  1.7728, -2.8410],
        [-6.5408, -3.8920,  3.5972, -2.7792],
        [-6.5391,  2.0659, -6.8631, -2.5542],
        [-6.8712,  3.1431, -7.3419, -3.6383],
        [-5.9016,  3.6955, -6.1947, -3.5629],
        [-6.1569,  4.1898, -7.5524, -3.3007],
        [-6.3238,  1.6061, -6.0651, -1.4970]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 266/289 [03:21<00:17,  1.32it/s]

Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13291704654693604, logits - tensor([[-7.3373, -4.5799, -6.4718,  4.0403],
        [-6.3365, -3.6529,  0.2221, -0.9017],
        [-7.1712,  1.9832, -7.3423, -2.0937],
        [-7.7321,  0.5419, -6.3638, -1.1662],
        [-7.4597,  3.0697, -7.8620, -3.5071],
        [-5.7782,  3.6891, -6.2154, -3.0646],
        [-6.9074, -2.9534, -6.2130,  3.5658],
        [-6.2647,  1.7662, -5.9868, -1.5908]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 267/289 [03:22<00:16,  1.32it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2936873137950897, logits - tensor([[-7.0729, -4.2071, -6.0308,  3.2963],
        [-7.3075,  1.0399, -5.6962, -0.2528],
        [-7.9385,  3.3559, -7.6570, -2.5367],
        [-6.5912, -3.5526, -6.1844,  4.2298],
        [-6.9611,  3.0470, -7.2697, -3.6301],
        [-5.0551, -4.0150,  3.1997, -2.9241],
        [-6.7718, -4.4581,  2.6777, -2.8771],
        [-5.9282, -3.8103,  2.8176, -2.1031]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:23<00:15,  1.32it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.018984664231538773, logits - tensor([[-6.2371,  3.6513, -7.2691, -4.3018],
        [-6.4390, -5.3309,  3.9553, -4.3115],
        [-8.0624,  2.1264, -7.3819, -2.3520],
        [-8.0703, -3.1759, -6.1668,  2.7202],
        [-5.6641, -3.9162, -4.4831,  3.9215],
        [-7.4600,  3.3122, -8.2525, -3.7186],
        [-5.3565,  3.3064, -5.8471, -4.3399],
        [-6.7293, -4.0644, -5.9296,  3.7442]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:23<00:15,  1.33it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030595824122428894, logits - tensor([[-6.1484,  2.1552, -6.5484, -1.7878],
        [-6.0232,  2.4657, -6.3783, -2.6090],
        [-5.5851,  3.5702, -5.9523, -3.8612],
        [-5.4801, -3.7542, -4.5804,  4.3998],
        [-6.4933,  3.6759, -6.6360, -3.9048],
        [-7.0107,  2.3184, -7.0936, -2.2219],
        [-7.4221,  2.7079, -6.1882, -3.1468],
        [-6.6102,  3.0069, -6.7053, -3.1158]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 270/289 [03:24<00:14,  1.33it/s]

Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023925982415676117, logits - tensor([[-6.7476,  4.0586, -6.4591, -3.3543],
        [-6.5560,  2.0529, -5.6337, -2.3841],
        [-7.5359,  3.0702, -6.9836, -3.1792],
        [-6.0987, -4.2547, -6.3016,  3.3089],
        [-5.2711, -3.9713,  2.5595, -3.3364],
        [-6.8260,  3.8279, -5.7238, -3.9052],
        [-5.2930,  3.5614, -6.7173, -3.6772],
        [-7.5490,  2.8580, -6.3339, -2.7923]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 271/289 [03:25<00:13,  1.33it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06346464902162552, logits - tensor([[-5.0964, -4.1412,  3.1629, -3.7466],
        [-7.1275, -2.3233, -5.5569,  2.6703],
        [-7.5647,  1.5313, -6.9142, -1.3977],
        [-6.6485,  3.6732, -6.3229, -3.1406],
        [-6.3807,  3.2656, -7.2586, -3.8274],
        [-7.0803,  3.8962, -6.6514, -4.5761],
        [-6.7585,  0.4371, -5.7934, -0.4799],
        [-7.1308,  1.9354, -7.0640, -2.5401]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 272/289 [03:26<00:12,  1.33it/s]

Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024562278762459755, logits - tensor([[-7.0420, -3.7962, -6.6198,  3.8567],
        [-7.5200,  2.4631, -8.0302, -2.2275],
        [-6.6251, -4.0661, -5.3497,  4.5256],
        [-6.1421,  2.8319, -5.9845, -3.3155],
        [-7.8106,  2.2545, -7.4997, -2.7224],
        [-6.8464, -3.4821,  2.0814, -2.7471],
        [-6.6842, -5.2993, -5.8225,  4.8145],
        [-7.3319, -3.9875, -6.6244,  4.4839]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 273/289 [03:26<00:12,  1.33it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02692105993628502, logits - tensor([[-6.8792,  2.6896, -7.2480, -1.5976],
        [-6.6378, -4.6695, -5.8968,  3.7995],
        [-5.8224, -3.8537, -5.7015,  4.5407],
        [-7.0839,  3.0809, -6.7263, -2.4005],
        [-7.2142, -5.0775,  2.6550, -2.3423],
        [-5.8926,  2.7139, -6.2550, -2.6765],
        [-6.0659, -3.7253, -5.5528,  3.5859],
        [-6.4396,  3.7167, -6.2219, -4.1521]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▍| 274/289 [03:27<00:11,  1.33it/s]

Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2482547163963318, logits - tensor([[-6.1828,  2.5251, -6.6768, -3.0052],
        [-6.0849, -4.1811, -5.5966,  4.3674],
        [-6.6134,  2.3998, -6.6734, -3.0112],
        [-6.4204,  2.8656, -7.0581, -3.0482],
        [-6.6823,  3.0780, -7.2093, -4.1315],
        [-6.6043,  3.8224, -6.9242, -3.3174],
        [-6.7171, -1.0493, -3.1639, -0.5774],
        [-7.7018,  3.3396, -6.3280, -2.2430]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▌| 275/289 [03:28<00:10,  1.33it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.12037685513496399, logits - tensor([[-5.8875, -2.6169,  1.1558, -0.8607],
        [-7.1108,  2.2725, -6.0466, -2.3890],
        [-8.3461,  1.2184, -6.9561, -0.8574],
        [-6.7784,  3.4128, -7.1844, -4.3872],
        [-7.5089,  3.8489, -6.6809, -3.2557],
        [-7.4846,  2.9513, -7.4585, -3.0063],
        [-5.2678, -3.3866, -4.1589,  3.1768],
        [-5.7896, -4.5124, -5.0533,  4.5862]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:29<00:09,  1.33it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02046230621635914, logits - tensor([[-6.1584, -3.5984, -6.1678,  3.5643],
        [-6.5869,  4.0707, -7.1312, -4.4134],
        [-6.3014,  3.0136, -6.9345, -2.7736],
        [-6.0516, -4.8902, -5.3487,  4.5912],
        [-7.5346,  3.3985, -7.2896, -3.0244],
        [-7.8041,  2.2368, -7.3485, -2.3776],
        [-6.6085,  3.6698, -6.4410, -3.5870],
        [-7.9280,  2.9611, -6.1393, -3.0261]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:29<00:09,  1.33it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.019953833892941475, logits - tensor([[-7.1522, -4.2627,  3.2239, -4.0909],
        [-6.6014,  3.3062, -6.0652, -2.9261],
        [-6.7286,  4.4711, -6.6502, -3.9638],
        [-6.3905,  2.6719, -6.2857, -2.7312],
        [-6.9837, -3.7071, -6.7790,  3.0415],
        [-7.6784, -3.7455, -6.0913,  3.9408],
        [-6.6200,  2.7690, -8.2793, -3.3513],
        [-6.2797,  3.1009, -6.5609, -3.0205]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:30<00:08,  1.32it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06568469107151031, logits - tensor([[-7.2128,  1.5406, -5.7527, -0.9580],
        [-5.6500, -4.3851,  3.4117, -3.4144],
        [-6.0756,  3.4904, -6.6446, -2.0028],
        [-7.5442,  1.4568, -7.2360, -0.5360],
        [-6.2866,  2.2023, -6.0607, -2.9234],
        [-6.0616, -1.4540, -6.3311,  2.1511],
        [-5.9867, -3.1433, -5.9237,  3.2997],
        [-6.9618,  2.7300, -7.1213, -3.4016]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 279/289 [03:31<00:07,  1.32it/s]

Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16520357131958008, logits - tensor([[-6.6096, -3.4937,  1.5579, -1.3609],
        [-7.9768,  0.8517, -6.5846, -0.8701],
        [-5.7535,  3.0364, -6.4774, -2.5391],
        [-6.8851, -4.1222,  2.8829, -3.2404],
        [-6.6246,  2.9224, -6.4352, -4.1368],
        [-6.9457, -3.4652, -5.0172,  3.9852],
        [-5.7102, -2.9999,  1.5731, -1.5940],
        [-6.3862,  2.2906, -5.9634, -2.6186]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 280/289 [03:32<00:06,  1.32it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026601385325193405, logits - tensor([[-5.6699, -3.6846, -4.3886,  3.9051],
        [-6.2162,  2.0509, -5.7987, -2.2765],
        [-6.4794,  2.4267, -5.6716, -2.5761],
        [-6.5782,  2.1993, -7.0261, -2.8529],
        [-6.6797,  3.5806, -6.7305, -3.9150],
        [-6.1167,  3.1681, -6.1162, -4.6071],
        [-6.3170, -4.8989, -4.9232,  4.0628],
        [-6.1888, -3.3925, -4.9739,  2.7870]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 281/289 [03:32<00:06,  1.31it/s]

Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26910826563835144, logits - tensor([[-6.8679,  3.6902, -5.7730, -3.4169],
        [-6.9055,  3.3046, -6.5941, -3.9458],
        [-6.5168,  3.6978, -5.5485, -3.2885],
        [-5.2050, -4.4991, -5.1543,  5.3634],
        [-7.9411,  2.9821, -8.2539, -2.8822],
        [-7.1907, -0.0351, -2.9303, -1.0271],
        [-8.1994,  0.2689, -7.5019, -0.0239],
        [-6.7757,  2.8569, -6.6045, -2.9167]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 282/289 [03:33<00:05,  1.31it/s]

Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.050221800804138184, logits - tensor([[-7.2831, -0.8117, -5.2506,  0.6245],
        [-6.5229,  2.9829, -6.0864, -2.1628],
        [-5.9406, -3.3637,  3.4082, -3.0239],
        [-6.1423,  2.7753, -7.3886, -2.5861],
        [-7.5834,  3.6650, -6.0865, -3.8263],
        [-6.4972,  3.2678, -6.4886, -3.2010],
        [-7.7796,  2.9096, -7.7687, -3.1325],
        [-5.8855, -3.8190,  2.3294, -3.0475]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 283/289 [03:34<00:04,  1.31it/s]

Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10374405980110168, logits - tensor([[-6.3126, -3.8194,  2.5902, -1.6679],
        [-8.4350, -0.7094, -6.7768,  0.0148],
        [-5.6204,  4.2651, -6.7567, -3.2761],
        [-7.0131,  3.1464, -6.6834, -3.0307],
        [-6.3573,  1.6550, -6.1838, -3.0285],
        [-7.0650, -4.2195, -6.5086,  3.7855],
        [-6.7440, -0.1161, -5.8745, -0.2521],
        [-6.3076, -3.3063,  2.0015, -2.8732]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 284/289 [03:35<00:03,  1.30it/s]

Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.018548011779785156, logits - tensor([[-6.9855, -3.7802, -6.4419,  3.6918],
        [-6.9485, -4.1733, -4.6595,  3.3747],
        [-6.5887, -3.3675, -5.7000,  3.5539],
        [-6.4207,  3.4728, -5.6658, -4.4520],
        [-5.9154,  2.3958, -6.0161, -2.3421],
        [-6.7094, -4.2438,  3.5288, -2.5765],
        [-6.6194, -3.4434, -5.7332,  3.9581],
        [-6.8648, -5.4021, -6.2841,  5.0574]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▊| 285/289 [03:35<00:03,  1.31it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022844862192869186, logits - tensor([[-7.1405,  3.4487, -7.6875, -3.0555],
        [-6.2459, -3.6981, -5.3185,  2.8290],
        [-6.0453, -3.2083, -4.8792,  3.1018],
        [-6.3083,  2.1594, -5.5945, -2.8362],
        [-7.0513,  2.3908, -6.9828, -2.6765],
        [-7.1103,  3.5944, -7.2064, -3.6072],
        [-7.1466,  3.8024, -7.4686, -4.1289],
        [-7.4573, -4.3172, -6.4220,  3.4170]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:36<00:02,  1.31it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01599108800292015, logits - tensor([[-6.2519,  4.2549, -7.7071, -4.6253],
        [-6.2214, -4.8899,  3.1612, -4.1388],
        [-6.2191, -4.3491,  3.6388, -3.7213],
        [-6.5917,  3.1909, -4.9997, -4.3662],
        [-5.8346,  4.3214, -6.5931, -3.1780],
        [-7.9822,  3.0313, -7.9275, -3.0267],
        [-5.1772, -3.9206, -4.7055,  5.1881],
        [-6.2457,  3.0681, -6.4774, -2.9553]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:37<00:01,  1.32it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1846218705177307, logits - tensor([[-7.7234,  4.1484, -6.5126, -3.3742],
        [-7.2274,  3.4361, -6.6760, -4.1164],
        [-6.9390,  2.8595, -6.4804, -2.8484],
        [-7.7054,  4.4401, -7.5462, -3.7435],
        [-6.2430, -3.7600,  1.1642, -1.5296],
        [-5.9591,  3.1047, -5.9483, -3.4503],
        [-7.3317,  3.0674, -7.4475, -3.5807],
        [-6.9595, -5.0017,  3.1412, -3.3399]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:38<00:00,  1.32it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02507665380835533, logits - tensor([[-7.0257,  3.9340, -6.3894, -3.7506],
        [-6.7165,  3.6491, -7.0942, -3.6344],
        [-7.4304,  3.0986, -6.3551, -2.4326],
        [-7.1355,  2.3695, -7.2568, -2.4758]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:38<00:00,  1.32it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:51,  3.77it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:49,  3.88it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:48,  3.90it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:49,  3.81it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:48,  3.86it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:47,  3.92it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:47,  3.97it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.91it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:46,  3.97it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  3.99it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:45,  3.99it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:45,  3.97it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  3.97it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:45,  3.98it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:44,  3.99it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:44,  3.98it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:44,  3.94it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:44,  3.96it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:43,  3.98it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:05<00:43,  3.97it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:43,  3.94it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:43,  3.96it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:43,  3.95it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:42,  3.96it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:42,  3.96it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  3.94it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:42,  3.94it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:41,  3.96it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.99it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.95it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:41,  3.94it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:41,  3.91it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:41,  3.92it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:41,  3.89it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:40,  3.93it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:39,  3.96it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:40,  3.92it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.94it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.95it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:39,  3.94it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:39,  3.92it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.90it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:38,  3.92it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:38,  3.92it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:37,  3.92it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:37,  3.91it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.96it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:36,  3.96it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.94it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.94it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:36,  3.92it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:36,  3.93it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:36,  3.89it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:36,  3.88it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:35,  3.86it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:35,  3.84it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:35,  3.85it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:35,  3.83it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:15<00:34,  3.88it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:34,  3.88it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:34,  3.90it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.94it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:16<00:33,  3.95it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  3.97it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  3.96it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  3.98it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:17<00:32,  3.97it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  3.96it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  3.96it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:31,  3.97it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:18<00:31,  3.95it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  3.98it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.95it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  3.92it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:19<00:30,  3.91it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:30,  3.91it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.90it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.92it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:20<00:29,  3.96it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  3.93it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  3.93it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:28,  3.96it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:21<00:28,  3.96it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  3.93it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  3.93it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:27,  3.92it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:22<00:27,  3.93it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  3.94it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  3.95it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:26,  3.93it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:23<00:26,  3.96it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  3.96it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  3.95it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  3.97it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:24<00:25,  3.96it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.93it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.96it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.94it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:25<00:23,  3.97it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  3.98it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  3.96it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.98it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:26<00:22,  3.97it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:22,  3.97it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.97it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.99it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:27<00:21,  3.98it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  4.00it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  4.01it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:20,  4.00it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:28<00:20,  4.00it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  4.00it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.97it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:20,  4.00it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:29<00:19,  4.00it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  3.93it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.91it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.87it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:30<00:19,  3.85it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:19,  3.88it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  3.86it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:18,  3.90it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:31<00:18,  3.92it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  3.92it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  3.94it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:17,  3.95it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:32<00:16,  3.97it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  3.98it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  3.95it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:16,  3.99it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:33<00:15,  3.98it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  3.96it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.93it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:34<00:15,  3.92it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:34<00:14,  3.95it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.90it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.88it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:35<00:14,  3.89it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:35<00:14,  3.87it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:14,  3.82it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.79it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:36<00:13,  3.83it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:36<00:13,  3.84it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.85it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.85it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:37<00:12,  3.90it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:37<00:11,  3.93it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  3.89it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  3.92it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:38<00:11,  3.95it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:38<00:10,  3.95it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  3.93it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  3.95it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:39<00:10,  3.97it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:39<00:09,  3.92it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  3.93it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.91it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:40<00:09,  3.93it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:40<00:08,  3.92it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  3.92it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  3.94it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:41<00:08,  3.94it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:41<00:07,  3.94it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  3.93it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.96it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:42<00:07,  3.92it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:42<00:06,  3.89it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.91it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.91it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:43<00:06,  3.90it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:43<00:05,  3.89it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.90it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.90it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:44<00:05,  3.92it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:44<00:04,  3.92it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.92it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:45<00:04,  3.90it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:45<00:04,  3.90it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:45<00:03,  3.92it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.92it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:46<00:03,  3.95it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:46<00:03,  3.93it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:46<00:02,  3.94it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.94it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:47<00:02,  3.91it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:47<00:02,  3.93it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:47<00:01,  3.94it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  3.91it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:48<00:01,  3.94it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:48<00:01,  3.94it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:48<00:00,  3.95it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  3.96it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:49<00:00,  3.95it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:49<00:00,  3.93it/s]

[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 818, 'tn': 372, 'fp': 143, 'fn': 219}, {'tp': 155, 'tn': 1367, 'fp': 5, 'fn': 25}, {'tp': 184, 'tn': 1015, 'fp': 247, 'fn': 106}]
Detailed accuracy after 6 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.7667525773195877
yes_no accuarcy: 0.9806701030927835
abstractive accuarcy: 0.7725515463917526
Overall accuarcy: 0.879993556701031
Best accuarcy: 0.899645618556701



  0%|          | 0/289 [00:00<?, ?it/s]

Training loop 0
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030545897781848907, logits - tensor([[-7.4565, -4.1286, -7.6198,  4.7095],
        [-5.9276, -4.6236, -5.0471,  4.1236],
        [-6.4786,  3.3182, -7.1343, -3.9824],
        [-7.3675, -2.0200, -5.8640,  2.7713],
        [-6.1468, -3.5772,  2.0279, -2.2745],
        [-7.0270,  2.4534, -6.7469, -3.1981],
        [-5.0981, -3.1132,  2.7801, -2.5222],
        [-6.9032,  3.1330, -7.3589, -2.7117]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  0%|          | 1/289 [00:00<03:52,  1.24it/s]

Training loop 1
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1604774445295334, logits - tensor([[-6.2910, -3.9175,  1.4765, -2.5489],
        [-6.2833,  3.0833, -6.7396, -2.2659],
        [-7.8026,  3.0887, -7.9233, -3.1275],
        [-6.4761,  2.9838, -6.5088, -3.4293],
        [-6.3928,  2.1626, -6.7842, -2.6800],
        [-7.0856,  3.3282, -6.5222, -3.7821],
        [-5.7329,  2.6808, -5.8458, -3.6873],
        [-5.3751, -4.4437,  2.2580, -3.2675]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 2/289 [00:01<03:46,  1.27it/s]

Training loop 2
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04529576748609543, logits - tensor([[-5.6201,  4.3053, -6.3221, -4.1489],
        [-6.1016,  3.5227, -5.8940, -3.6746],
        [-7.8256,  1.1569, -7.1379, -1.2363],
        [-7.9605,  2.0745, -6.3142, -2.3641],
        [-6.0350,  3.6218, -6.9587, -2.9060],
        [-5.7338, -3.1327,  1.7962, -1.6908],
        [-6.7420, -4.3887,  3.3734, -3.8316],
        [-5.6266,  2.9096, -7.0439, -3.2493]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|          | 3/289 [00:02<03:42,  1.28it/s]

Training loop 3
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022728879004716873, logits - tensor([[-6.9780,  2.8375, -6.3497, -3.6388],
        [-7.8999,  1.9311, -7.0424, -1.9960],
        [-5.9377,  3.3619, -6.6332, -4.0218],
        [-5.5924,  3.4589, -6.5065, -3.4544],
        [-6.3748, -3.1581, -6.3731,  3.5118],
        [-6.1765, -3.8370, -6.0921,  4.4613],
        [-7.3103, -4.2377, -5.0196,  2.7257],
        [-6.5795,  3.9073, -6.2718, -3.3395]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  1%|▏         | 4/289 [00:03<03:38,  1.30it/s]

Training loop 4
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026002252474427223, logits - tensor([[-7.5358,  3.7465, -6.9466, -2.8755],
        [-7.7241,  3.3382, -7.5204, -3.1881],
        [-6.0731, -3.5273, -5.4470,  3.9305],
        [-6.2114, -3.7275,  2.0590, -1.7564],
        [-6.6363,  2.9888, -6.3176, -3.6243],
        [-6.4320, -3.0422, -5.0097,  3.0845],
        [-6.0587,  3.2276, -6.9448, -2.8914],
        [-6.4807,  3.6142, -6.9367, -4.3743]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 5/289 [00:03<03:37,  1.31it/s]

Training loop 5
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06955411285161972, logits - tensor([[-6.7695,  2.8146, -7.5479, -3.6950],
        [-7.6097,  2.8148, -6.8462, -3.0949],
        [-5.8762, -3.9319,  2.7339, -2.6852],
        [-6.7856,  0.1933, -6.3246,  0.0714],
        [-5.7630,  3.5354, -5.4654, -3.5595],
        [-6.8112, -3.9067,  2.7435, -2.7532],
        [-7.2764,  2.9381, -7.4384, -2.7446],
        [-6.2495,  2.9396, -6.6401, -2.9425]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 6/289 [00:04<03:35,  1.31it/s]

Training loop 6
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26037850975990295, logits - tensor([[-6.3780,  3.2896, -6.8550, -3.6420],
        [-5.3418, -3.1969,  2.0464, -2.6709],
        [-6.0913, -4.2067,  3.1948, -3.3211],
        [-7.0168,  3.1633, -6.8834, -3.1230],
        [-5.7455,  3.1873, -6.0915, -3.5563],
        [-7.4957,  3.1718, -6.9626, -2.9634],
        [-6.8049, -0.2153, -5.5610,  0.4078],
        [-7.5662,  1.7578, -7.5834, -1.1047]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  2%|▏         | 7/289 [00:05<03:34,  1.32it/s]

Training loop 7
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05464101582765579, logits - tensor([[-6.2116,  3.5214, -6.4387, -3.5683],
        [-6.7952, -4.0865, -5.9909,  3.0112],
        [-5.9081, -3.9101,  2.6019, -1.9307],
        [-7.4132, -2.8154, -6.7870,  2.7670],
        [-5.7898,  3.1200, -6.6481, -3.3117],
        [-6.4146, -3.2570,  1.0216, -1.6000],
        [-6.0683,  3.2982, -6.3341, -3.7966],
        [-6.9544,  0.8656, -6.7661, -1.3085]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 8/289 [00:06<03:33,  1.31it/s]

Training loop 8
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2113601118326187, logits - tensor([[-5.5764,  3.1843, -6.6731, -2.7222],
        [-7.2494,  3.2756, -7.3386, -4.3731],
        [-7.0958,  2.9716, -6.9439, -3.5437],
        [-6.0995,  2.8203, -6.6025, -2.6562],
        [-5.6734,  2.6299, -6.8163, -2.1390],
        [-6.0008,  3.8766, -5.5530, -3.4448],
        [-7.4445,  4.6518, -7.2128, -4.7469],
        [-7.8102,  2.7582, -7.6385, -3.2727]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 9/289 [00:06<03:32,  1.32it/s]

Training loop 9
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2603335976600647, logits - tensor([[-7.5769,  4.2511, -6.8448, -3.7574],
        [-6.6405, -0.3246, -7.5951,  1.2546],
        [-6.1520, -3.6427,  2.5664, -2.7697],
        [-6.6966, -3.4140,  0.9875, -2.2836],
        [-6.9150,  3.7480, -7.4496, -3.3564],
        [-7.0193,  3.6764, -7.2047, -3.8853],
        [-6.7809,  3.5406, -6.7142, -3.9808],
        [-6.0522, -0.4215, -2.6478, -0.9873]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  3%|▎         | 10/289 [00:07<03:31,  1.32it/s]

Training loop 10
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0300893671810627, logits - tensor([[-9.1956, -1.9214, -7.6774,  2.2418],
        [-6.1848,  2.8913, -6.4454, -2.6841],
        [-6.1154,  1.5378, -6.6810, -2.6060],
        [-6.0159, -3.1047, -5.5371,  2.4839],
        [-6.8812,  3.9781, -6.5775, -3.4442],
        [-6.4299,  3.9414, -7.1273, -4.3346],
        [-6.9065,  3.9609, -6.6445, -3.5409],
        [-6.8529,  3.1705, -6.4654, -3.8645]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 11/289 [00:08<03:31,  1.32it/s]

Training loop 11
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08762332797050476, logits - tensor([[-7.2177, -2.7138, -6.0885,  2.8223],
        [-8.6452, -1.4992, -8.0973, -0.2851],
        [-6.6513, -4.5058,  2.8591, -3.5383],
        [-6.5114, -4.0193, -5.0736,  3.6777],
        [-6.4936, -3.6104, -6.4061,  3.8051],
        [-5.5479,  3.1770, -5.2628, -3.1918],
        [-5.5334, -3.9797, -5.2064,  3.4765],
        [-5.3244, -3.7362, -5.6371,  3.2660]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 12/289 [00:09<03:30,  1.32it/s]

Training loop 12
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3435596525669098, logits - tensor([[-7.1179,  0.4809, -7.2213, -0.5796],
        [-5.9860, -4.3506,  1.9600, -2.4877],
        [-7.5306, -4.6714, -5.9721,  4.1697],
        [-7.5317,  3.1981, -6.8359, -4.1392],
        [-7.6202,  2.8843, -7.3655, -3.3561],
        [-5.9436, -1.7366, -6.2856,  1.8730],
        [-7.1863, -4.1297,  1.5956, -2.9107],
        [-5.9396, -3.2249,  1.9556, -2.4183]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  4%|▍         | 13/289 [00:09<03:29,  1.32it/s]

Training loop 13
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02135259471833706, logits - tensor([[-6.8006,  3.1546, -6.6749, -3.0219],
        [-7.8145,  3.6292, -7.3714, -2.9733],
        [-6.6822,  2.0671, -7.1063, -3.8101],
        [-5.8939,  2.6511, -5.9986, -3.6894],
        [-5.9156,  3.4881, -5.6166, -3.7433],
        [-5.8958,  3.4215, -6.6793, -2.5917],
        [-6.2548,  3.7103, -6.5386, -3.6864],
        [-6.4468,  3.2639, -6.1700, -4.4545]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▍         | 14/289 [00:10<03:28,  1.32it/s]

Training loop 14
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022419804707169533, logits - tensor([[-5.3390,  2.9505, -5.5837, -2.7994],
        [-6.3510, -4.1840, -5.1744,  2.5468],
        [-6.4407,  3.7049, -7.6991, -3.6006],
        [-6.9036,  3.3233, -6.9808, -3.6449],
        [-6.0438, -4.1271,  3.1393, -2.1749],
        [-7.7228,  3.8943, -7.2917, -3.0681],
        [-6.9424,  3.8554, -6.9672, -3.8811],
        [-6.6837,  2.7656, -6.9457, -3.1285]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  5%|▌         | 15/289 [00:11<03:27,  1.32it/s]

Training loop 15
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06449198722839355, logits - tensor([[-5.5591, -4.0742, -5.8377,  4.2090],
        [-6.6592,  1.3966, -7.1895, -1.3513],
        [-7.2563,  3.6933, -7.0708, -3.7616],
        [-6.2364, -2.9018,  1.2702, -1.6138],
        [-7.7493,  0.2548, -7.9777, -1.4369],
        [-5.6396,  2.8990, -6.9025, -3.2071],
        [-7.7634,  3.8208, -6.8595, -3.1119],
        [-7.5817,  3.4865, -6.1383, -2.9073]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 16/289 [00:12<03:27,  1.31it/s]

Training loop 16
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24732422828674316, logits - tensor([[-6.5459,  2.4689, -5.3958, -3.0467],
        [-7.5024,  3.6642, -7.5840, -3.6371],
        [-7.9629,  3.7503, -7.1600, -3.3521],
        [-7.4186,  2.2419, -6.8554, -2.6879],
        [-8.5951,  3.5995, -7.3703, -3.5484],
        [-6.8991, -3.6057,  2.7285, -2.1396],
        [-7.4339,  2.9288, -6.8408, -4.1622],
        [-7.6045,  2.9965, -6.1612, -3.9178]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 17/289 [00:12<03:26,  1.32it/s]

Training loop 17
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14792567491531372, logits - tensor([[-5.2965, -3.9275, -4.9300,  4.2671],
        [-6.6232,  1.9729, -6.8783, -2.0849],
        [-7.5976,  4.1868, -7.1091, -3.7997],
        [-7.5302,  3.3470, -6.8402, -3.2809],
        [-6.7514,  2.5635, -7.9123, -2.1605],
        [-6.5466, -2.9830, -5.3416,  2.9241],
        [-6.2876, -3.4924,  1.6016, -2.0208],
        [-6.5494, -3.6747, -5.5756,  3.1662]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  6%|▌         | 18/289 [00:13<03:25,  1.32it/s]

Training loop 18
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22612158954143524, logits - tensor([[-6.6724,  2.4547, -7.3998, -3.0513],
        [-6.7014,  3.4673, -7.0246, -2.8405],
        [-5.5838, -3.8241,  1.5948, -2.6946],
        [-7.1744,  2.9553, -6.9269, -2.7348],
        [-6.2280, -3.8364, -5.8890,  3.8500],
        [-6.2207,  3.3006, -7.2896, -2.8320],
        [-6.8150,  3.2975, -7.2707, -3.4493],
        [-5.8803, -4.4912,  3.0772, -3.0678]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 19/289 [00:14<03:24,  1.32it/s]

Training loop 19
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14482861757278442, logits - tensor([[-5.9199,  4.3289, -5.6364, -3.9344],
        [-8.2320,  3.2924, -6.0656, -3.3719],
        [-7.2760, -3.7632, -6.2369,  4.2755],
        [-6.9156,  1.2912, -7.0565, -2.1075],
        [-5.4794,  2.5945, -5.6977, -3.1653],
        [-7.1423, -4.5925, -6.4932,  4.1070],
        [-7.3507, -1.5325, -5.8442,  1.0198],
        [-5.9808,  3.1025, -6.4354, -3.8668]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 20/289 [00:15<03:23,  1.32it/s]

Training loop 20
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19910088181495667, logits - tensor([[-4.7501, -2.8774,  1.9342, -2.3433],
        [-6.3904,  3.9389, -6.7602, -2.9663],
        [-6.1732,  3.7990, -6.9449, -3.7495],
        [-6.8570, -3.7326, -6.0577,  2.4088],
        [-6.1522,  3.3977, -6.5275, -3.2043],
        [-5.7024,  3.6412, -6.6609, -3.6877],
        [-7.5199,  2.2223, -7.2602, -3.1583],
        [-6.6824, -4.0526,  2.0649, -2.9389]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  7%|▋         | 21/289 [00:15<03:22,  1.32it/s]

Training loop 21
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03475275635719299, logits - tensor([[-5.8240, -3.0796,  1.3876, -1.2982],
        [-5.1070,  3.0125, -6.2301, -3.7263],
        [-6.2585,  3.9109, -6.6511, -3.5473],
        [-7.0004, -3.9409, -5.4050,  3.0896],
        [-6.1434, -2.9569, -5.3187,  3.9170],
        [-7.3465,  2.0763, -7.4685, -3.0585],
        [-6.2100,  3.1747, -6.4939, -3.3143],
        [-6.1100, -3.2915, -6.2712,  3.3039]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 22/289 [00:16<03:23,  1.31it/s]

Training loop 22
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19931425154209137, logits - tensor([[-6.9841,  3.3655, -7.8001, -3.5760],
        [-5.7133,  2.7436, -6.1373, -2.0388],
        [-5.8714,  4.1547, -6.1420, -3.6394],
        [-7.2937, -1.4688, -6.6684,  1.8534],
        [-6.7122,  3.8258, -7.1592, -3.8229],
        [-7.6050, -3.3527, -7.1788,  4.3247],
        [-6.8934,  2.4772, -7.3060, -2.8817],
        [-6.3740, -2.6841, -6.5125,  2.7034]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 23/289 [00:17<03:22,  1.31it/s]

Training loop 23
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15965791046619415, logits - tensor([[-5.8173,  3.3555, -5.8221, -3.2944],
        [-6.4774,  3.4343, -7.0304, -4.2946],
        [-5.8846,  3.7752, -6.7063, -3.5795],
        [-7.1083, -1.6663, -6.0825,  2.1409],
        [-6.4823,  3.3186, -6.9906, -3.3829],
        [-6.1483, -2.4397,  1.5633, -1.5464],
        [-6.9558, -2.0095, -5.2975,  2.4811],
        [-5.7832, -4.4399,  2.9566, -4.0029]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  8%|▊         | 24/289 [00:18<03:22,  1.31it/s]

Training loop 24
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.028424128890037537, logits - tensor([[-7.1071,  3.9241, -6.1080, -2.6405],
        [-6.5986, -4.7390, -6.3503,  4.9235],
        [-6.2545, -3.4425,  1.8979, -1.2315],
        [-5.9471,  4.4748, -6.3748, -4.4235],
        [-6.3109, -2.9186, -5.3333,  2.7221],
        [-4.7969, -3.9097, -5.1856,  4.1835],
        [-6.6390,  3.9583, -7.1489, -2.9849],
        [-6.4472,  2.5797, -6.6101, -3.7051]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▊         | 25/289 [00:19<03:20,  1.32it/s]

Training loop 25
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.054179683327674866, logits - tensor([[-7.4527, -2.2728, -6.3425,  1.6630],
        [-7.1478,  2.7261, -7.3361, -1.5521],
        [-5.4058,  1.9239, -6.1561, -2.3595],
        [-7.4214, -2.0485, -5.7316,  2.2226],
        [-6.3892, -3.0853,  0.7983, -2.3189],
        [-7.4131, -3.3836, -6.6420,  2.6791],
        [-7.9087,  3.8501, -7.2042, -3.9836],
        [-6.4691,  2.9244, -7.3519, -3.3929]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 26/289 [00:19<03:20,  1.31it/s]

Training loop 26
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.018266204744577408, logits - tensor([[-6.8346,  4.3016, -7.7226, -3.3627],
        [-6.5364,  2.2364, -6.1462, -2.8844],
        [-6.8120, -3.9150, -5.3209,  4.6689],
        [-6.2363,  2.6565, -6.1634, -3.2831],
        [-7.2471,  3.3042, -7.2490, -3.5460],
        [-5.7766,  3.5534, -5.8445, -3.3453],
        [-6.8895,  4.0777, -6.1835, -3.4697],
        [-6.6959,  4.5239, -7.3878, -3.3562]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


  9%|▉         | 27/289 [00:20<03:20,  1.31it/s]

Training loop 27
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3699689507484436, logits - tensor([[-7.2235, -2.5666, -5.9487,  2.9966],
        [-7.4593,  1.5591, -6.4669, -2.3480],
        [-6.3027, -3.2717, -5.6067,  3.3720],
        [-6.3570, -3.7553, -5.7285,  4.3357],
        [-7.0659,  3.1990, -6.3560, -3.3290],
        [-6.0256,  3.3490, -6.3735, -4.6814],
        [-7.2047,  2.9664, -6.8714, -3.9519],
        [-6.1751, -3.7382,  2.0139, -1.9092]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|▉         | 28/289 [00:21<03:18,  1.31it/s]

Training loop 28
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03922518715262413, logits - tensor([[-6.4015,  3.4071, -6.7171, -3.3006],
        [-5.6990,  2.6784, -6.8384, -3.4503],
        [-5.3035, -4.0018, -4.5384,  3.5203],
        [-6.4727,  3.6380, -6.7477, -3.7879],
        [-6.6201,  1.5963, -7.1119, -1.2085],
        [-7.4627,  3.1026, -7.0468, -3.0379],
        [-5.7854, -4.3874,  1.8727, -2.9123],
        [-6.5153,  2.6195, -6.6944, -1.8729]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 29/289 [00:22<03:17,  1.32it/s]

Training loop 29
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0402834489941597, logits - tensor([[-6.4894,  3.0812, -7.2526, -3.7917],
        [-6.0019,  2.3992, -6.1637, -3.8032],
        [-7.1263,  1.9740, -7.0044, -2.7017],
        [-5.5868,  3.8990, -6.2002, -2.7547],
        [-5.9235, -2.4353,  0.7615, -2.0108],
        [-5.9469,  3.1771, -6.2066, -4.1905],
        [-7.0341,  2.9747, -7.4616, -3.5234],
        [-7.5197, -2.8607, -6.6955,  3.5563]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 10%|█         | 30/289 [00:22<03:16,  1.32it/s]

Training loop 30
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02065279707312584, logits - tensor([[-6.0713,  2.9881, -6.7457, -2.9334],
        [-7.0446, -2.9752, -5.8786,  4.1304],
        [-6.9857,  2.3511, -6.3688, -3.2253],
        [-5.5271,  3.3995, -5.8053, -3.4374],
        [-6.3855,  4.1383, -7.0361, -4.6090],
        [-6.2839, -2.9943, -7.1130,  2.9922],
        [-7.2260,  3.8927, -7.8685, -3.7973],
        [-6.1524, -4.5457,  3.1583, -2.8475]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 31/289 [00:23<03:15,  1.32it/s]

Training loop 31
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04716397821903229, logits - tensor([[-7.1699,  3.2174, -6.1270, -2.1392],
        [-7.2653,  2.9977, -7.2744, -2.4940],
        [-8.0022, -1.8161, -6.7591,  1.8450],
        [-6.8133, -5.0427,  3.2080, -3.3759],
        [-5.8314, -2.7672, -5.7283,  2.8566],
        [-5.5806, -3.7932,  3.7757, -3.3189],
        [-6.4260,  2.8708, -6.9288, -3.2715],
        [-7.6368, -1.1018, -5.9144,  1.2394]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█         | 32/289 [00:24<03:14,  1.32it/s]

Training loop 32
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.021915433928370476, logits - tensor([[-5.7033, -4.0220,  2.7989, -2.4535],
        [-6.6711, -4.2783, -5.9935,  4.0668],
        [-5.8550,  3.0706, -6.5567, -2.9671],
        [-6.3323, -4.7377,  3.3142, -3.1745],
        [-7.6765,  2.8099, -6.0703, -3.8778],
        [-7.2297,  3.9907, -7.3138, -3.4166],
        [-6.1890, -4.5981,  2.7449, -2.9187],
        [-8.0876,  3.4231, -6.7145, -3.7681]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 11%|█▏        | 33/289 [00:25<03:13,  1.32it/s]

Training loop 33
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15160217881202698, logits - tensor([[-7.1999,  1.8738, -6.4783, -2.2193],
        [-7.3614, -2.2644, -6.0898,  3.3936],
        [-6.0943, -3.8321,  2.0307, -2.5930],
        [-6.8019,  2.4690, -7.0949, -2.5150],
        [-6.2999,  3.3500, -5.6347, -3.1359],
        [-6.5460, -3.4519, -5.8883,  2.7816],
        [-6.3229, -2.7310,  1.7473, -1.7350],
        [-7.8446, -3.5146, -6.8641,  4.6887]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 34/289 [00:25<03:13,  1.32it/s]

Training loop 34
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04002923145890236, logits - tensor([[-6.2149,  2.4379, -6.9002, -2.8084],
        [-5.8707,  3.1268, -6.4032, -3.4408],
        [-6.7944, -4.6060, -5.8275,  3.5429],
        [-5.4939, -3.0336,  2.3048, -1.7398],
        [-5.8704, -3.6070,  1.3761, -1.4060],
        [-6.0004, -4.3157, -6.1962,  3.9467],
        [-5.4633, -3.8185,  3.1505, -2.9094],
        [-6.4085,  3.9036, -5.9936, -2.9659]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 35/289 [00:26<03:12,  1.32it/s]

Training loop 35
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2634107172489166, logits - tensor([[-7.4749,  2.9157, -7.6969, -3.1096],
        [-6.5299,  3.4687, -6.3283, -3.2866],
        [-6.1095,  4.1599, -6.0629, -3.0842],
        [-6.4283,  4.6745, -6.7894, -4.3427],
        [-6.5494, -3.1579, -5.2792,  4.3065],
        [-5.6886, -3.0300,  1.8214, -1.1437],
        [-5.9544,  3.1536, -6.9127, -2.9427],
        [-7.0091, -3.2705, -4.6550,  3.7211]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 12%|█▏        | 36/289 [00:27<03:11,  1.32it/s]

Training loop 36
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030948853120207787, logits - tensor([[-7.1994,  3.4122, -7.2635, -4.1212],
        [-7.2037, -2.2353, -6.0770,  1.9259],
        [-6.2042,  2.2829, -6.0892, -3.0995],
        [-7.2801, -3.8444, -5.6686,  4.4466],
        [-5.8881, -3.0658, -4.4777,  3.0092],
        [-5.5154, -3.6344,  2.3317, -1.8028],
        [-5.7437, -3.5711, -5.6040,  3.3556],
        [-5.7553,  3.3147, -6.4708, -3.6848]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 37/289 [00:28<03:11,  1.32it/s]

Training loop 37
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.24163293838500977, logits - tensor([[-5.9789, -3.6714, -6.2458,  3.8377],
        [-6.2228, -3.6574,  2.8614, -3.7001],
        [-6.0541, -4.2519,  2.5223, -3.0703],
        [-7.4049, -3.9641, -6.3860,  3.1152],
        [-6.0921,  1.7678, -6.5419, -2.7929],
        [-6.6345,  3.4250, -6.2177, -3.0422],
        [-5.7169,  2.2319, -5.9793, -2.7878],
        [-6.8227,  2.5137, -6.3347, -3.0758]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 38/289 [00:28<03:10,  1.32it/s]

Training loop 38
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2524832487106323, logits - tensor([[-7.5270, -0.5109, -6.8840,  1.8591],
        [-6.6180, -3.0267, -5.8499,  3.6838],
        [-7.7060,  3.5430, -7.4422, -3.4511],
        [-5.9672, -4.2286, -5.5943,  4.4464],
        [-6.4953,  3.0526, -6.2912, -3.6118],
        [-6.2484,  2.5485, -6.4388, -3.6103],
        [-7.5462,  4.0755, -7.3765, -3.4448],
        [-7.8076,  3.6726, -6.8299, -3.3281]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 13%|█▎        | 39/289 [00:29<03:10,  1.32it/s]

Training loop 39
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04864449426531792, logits - tensor([[-6.2350,  2.3210, -5.4523, -2.7349],
        [-6.3727,  2.3968, -6.2593, -3.8045],
        [-8.0832, -1.6664, -7.5449,  0.9327],
        [-7.9678,  1.3970, -7.8953, -1.3013],
        [-6.4969, -3.9238, -5.0586,  3.5877],
        [-6.9244, -2.7929, -6.6320,  3.6190],
        [-6.0546, -2.9862, -5.8105,  3.7272],
        [-6.3099,  3.3282, -6.7410, -2.9020]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 40/289 [00:30<03:08,  1.32it/s]

Training loop 40
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0281141996383667, logits - tensor([[-5.5261, -2.9672,  2.0995, -2.0264],
        [-5.2699, -3.8564, -6.0230,  3.1174],
        [-7.2373, -2.5439, -6.7800,  2.4817],
        [-5.7930, -3.9978,  3.1986, -2.8199],
        [-7.1368, -3.0605, -5.8041,  3.1240],
        [-7.6367,  4.2873, -6.1768, -3.9083],
        [-4.9816,  2.7608, -5.9082, -3.8994],
        [-6.3615,  4.1660, -6.3341, -3.9603]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 14%|█▍        | 41/289 [00:31<03:07,  1.32it/s]

Training loop 41
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04660848528146744, logits - tensor([[-6.6582, -4.1211, -6.2163,  3.5164],
        [-5.0471, -3.2870,  2.2786, -2.7577],
        [-6.3752,  3.1738, -7.3381, -2.9585],
        [-5.3804, -3.3342,  2.3826, -1.9629],
        [-6.4273, -2.7789, -5.9346,  2.9185],
        [-6.6518,  2.6884, -7.3974, -3.0346],
        [-7.1069,  4.0463, -7.5397, -3.2467],
        [-6.1195, -0.9763, -6.8525,  1.1468]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 42/289 [00:31<03:07,  1.32it/s]

Training loop 42
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16814365983009338, logits - tensor([[-6.3116,  1.0055, -6.1648, -1.1660],
        [-6.8876,  3.0672, -7.2991, -3.2370],
        [-7.5649,  2.2855, -7.3398, -1.7390],
        [-6.2879, -4.3695,  3.2377, -3.0674],
        [-5.2394,  3.2426, -5.6533, -3.5684],
        [-6.7778,  3.8943, -7.6618, -3.2749],
        [-5.7743, -4.5321,  2.5478, -3.0149],
        [-7.1804,  3.2797, -8.0948, -4.1126]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▍        | 43/289 [00:32<03:06,  1.32it/s]

Training loop 43
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05670040845870972, logits - tensor([[-6.0218,  4.0613, -6.2642, -3.6666],
        [-5.9151, -0.2077, -6.3328,  0.3300],
        [-5.9394,  3.0292, -6.4789, -2.4464],
        [-7.0747,  3.5352, -7.3003, -4.1523],
        [-6.5215, -4.4716, -6.4814,  4.5951],
        [-6.6441,  2.9808, -6.6626, -2.6548],
        [-6.6523,  4.2914, -6.5391, -3.9837],
        [-5.9066,  2.1580, -7.0881, -1.7895]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 15%|█▌        | 44/289 [00:33<03:05,  1.32it/s]

Training loop 44
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0164233036339283, logits - tensor([[-7.4070,  2.4566, -7.0590, -3.0198],
        [-6.0146,  3.9656, -6.8877, -4.0444],
        [-6.7994,  3.5081, -7.0170, -4.0214],
        [-6.0819,  3.9250, -6.4861, -4.4246],
        [-6.5139,  3.7381, -7.6778, -3.4894],
        [-6.9028,  4.3974, -6.9390, -4.1914],
        [-6.9565,  3.3059, -7.2542, -3.1113],
        [-6.4745, -2.7935, -4.7925,  3.3403]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 16%|█▌        | 45/289 [00:34<03:04,  1.32it/s]

Training loop 45
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 16%|█▌        | 46/289 [00:34<03:03,  1.32it/s]

loss - 0.06792223453521729, logits - tensor([[-7.8386,  2.2931, -7.3259, -3.0516],
        [-6.7598,  3.0088, -6.1212, -3.0478],
        [-5.7169, -4.7380, -5.4025,  3.9359],
        [-5.7335, -2.9115,  1.2820, -1.7944],
        [-6.0625, -4.3894,  2.2569, -2.8015],
        [-6.7282, -4.6282, -5.1867,  4.2804],
        [-6.0110,  2.8232, -6.6410, -3.0561],
        [-7.5601,  0.4801, -7.0805, -0.0932]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 46
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.246718630194664, logits - tensor([[-7.0076,  3.4279, -6.3349, -2.8521],
        [-7.0276, -4.3262, -6.3073,  4.6160],
        [-6.9183, -1.3549, -5.3516,  1.3414],
        [-6.6898, -3.3358, -6.4145,  3.2195],
        [-5.7713, -4.2396,  3.3602, -4.0545],
        [-7.7564, -2.7850, -5.3586,  3.8357],
        [-6.9766,  3.4673, -7.0715, -3.4456],
        [-6.6881,  3.3151, -7.1065, -2.71

 16%|█▋        | 47/289 [00:35<03:03,  1.32it/s]

Training loop 47
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1932942122220993, logits - tensor([[-6.5452,  2.8919, -6.6669, -3.3008],
        [-7.2891,  3.8425, -6.4931, -3.9325],
        [-7.1577,  4.4700, -6.7258, -4.0864],
        [-5.8423, -3.3391,  2.4539, -3.0203],
        [-6.0950,  3.1527, -6.0647, -3.6919],
        [-7.3946,  3.4287, -7.0584, -3.4160],
        [-6.6298,  1.8469, -6.9475, -1.7494],
        [-6.0266, -4.1300,  2.4370, -2.8083]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 48/289 [00:36<03:03,  1.32it/s]

Training loop 48
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.031008761376142502, logits - tensor([[-8.2092,  2.0305, -7.7916, -1.5784],
        [-7.2582, -2.4347, -5.9834,  2.8352],
        [-6.3624, -4.2254,  2.7538, -2.6080],
        [-6.5651, -3.9123, -6.4621,  4.1872],
        [-6.7845,  2.6520, -6.0656, -3.7153],
        [-7.8342,  2.5942, -7.5189, -2.6957],
        [-5.5745, -3.2569, -5.6332,  4.0363],
        [-6.4544, -4.3879,  4.0757, -3.7982]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 49/289 [00:37<03:02,  1.32it/s]

Training loop 49
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026109132915735245, logits - tensor([[-7.0847, -4.2335, -6.4781,  3.6941],
        [-7.1314, -3.0013, -5.7497,  3.5036],
        [-7.4533,  3.1343, -7.2065, -2.6566],
        [-6.1855, -4.3657,  1.6958, -2.0415],
        [-6.1309, -3.9573,  2.9708, -3.0385],
        [-6.4220, -4.7975,  3.0695, -4.1070],
        [-6.0120,  3.5828, -6.2235, -3.5045],
        [-6.6678,  4.3969, -6.3025, -3.5333]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 17%|█▋        | 50/289 [00:38<03:01,  1.32it/s]

Training loop 50
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.057756029069423676, logits - tensor([[-7.1981,  3.5229, -5.6770, -3.2765],
        [-5.2156,  2.7014, -4.9583, -3.7020],
        [-5.9982, -2.4496,  0.1161, -0.9717],
        [-5.7322,  2.8507, -6.3827, -2.7815],
        [-5.8105, -3.6887, -5.0105,  2.9549],
        [-8.7939, -2.0985, -5.9924,  2.5509],
        [-6.0013,  2.7698, -6.0926, -3.1507],
        [-7.7052,  2.7060, -7.4826, -2.7802]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 51/289 [00:38<02:59,  1.32it/s]

Training loop 51
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18378645181655884, logits - tensor([[-6.6071,  1.7622, -6.0460, -2.6160],
        [-7.0437, -4.0243,  2.8562, -3.0680],
        [-7.1283, -4.4098, -5.1811,  4.0880],
        [-6.2073,  3.7211, -6.3389, -3.0076],
        [-6.4855, -4.9685, -6.9264,  4.8314],
        [-6.5066, -2.1892, -0.6968,  0.2828],
        [-6.7788, -2.8041, -5.1406,  3.2646],
        [-6.1956, -3.7831,  1.7060, -2.2230]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 52/289 [00:39<02:58,  1.33it/s]

Training loop 52
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03385008126497269, logits - tensor([[-6.2134, -4.5552,  2.6671, -2.8031],
        [-6.6493, -2.6620, -5.2329,  2.6247],
        [-6.7430, -2.1332, -6.0566,  1.9673],
        [-7.1233, -2.1475, -5.6634,  2.3945],
        [-7.1737, -4.8910,  3.2918, -3.4042],
        [-5.7420,  3.2742, -5.9264, -2.9156],
        [-6.7472,  3.3276, -6.2138, -3.0066],
        [-5.7277,  3.3123, -5.2964, -3.0046]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 18%|█▊        | 53/289 [00:40<02:57,  1.33it/s]

Training loop 53
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04555073752999306, logits - tensor([[-6.8256, -3.2843, -6.5903,  4.4018],
        [-6.9772,  3.8261, -6.7055, -4.6878],
        [-8.4838,  3.8631, -7.5330, -4.2253],
        [-5.9979,  3.0919, -6.5085, -3.7305],
        [-6.8462, -2.8712,  0.0272, -1.3186],
        [-7.0140,  3.3816, -6.2338, -3.3536],
        [-7.1831,  2.6325, -6.7496, -2.5339],
        [-5.7288,  3.5524, -5.7587, -3.2162]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▊        | 54/289 [00:41<02:57,  1.33it/s]

Training loop 54
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06244611740112305, logits - tensor([[-5.4253, -3.6963,  2.6281, -2.7236],
        [-6.5514, -3.7783, -5.5390,  3.7735],
        [-6.6099,  2.7127, -6.7706, -2.8437],
        [-8.0935,  1.3924, -7.1497, -0.1581],
        [-7.1144,  2.2308, -7.3924, -2.2729],
        [-6.1610, -4.3541,  2.7759, -3.1326],
        [-6.0563, -4.9515,  3.7889, -3.0106],
        [-6.4106, -1.7422, -5.6229,  1.2326]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 55/289 [00:41<02:57,  1.32it/s]

Training loop 55
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11068429052829742, logits - tensor([[-6.3909, -3.2893, -5.7146,  2.6206],
        [-7.5675,  3.2125, -8.1163, -3.8395],
        [-6.4420, -4.4863,  2.8711, -3.2903],
        [-6.6034,  0.6999, -5.9897, -1.4586],
        [-5.8255,  3.2430, -6.9222, -2.8401],
        [-5.7805,  3.1496, -7.0263, -2.8681],
        [-6.2403,  3.1034, -6.9315, -3.6163],
        [-5.9983, -3.4062,  2.3644, -2.3523]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 19%|█▉        | 56/289 [00:42<02:56,  1.32it/s]

Training loop 56
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.031698908656835556, logits - tensor([[-5.9898,  2.2015, -6.6201, -3.0018],
        [-6.3706,  1.7933, -6.2020, -1.8257],
        [-7.2759, -3.2827, -7.3645,  3.8813],
        [-6.5259,  3.3227, -6.9354, -3.4682],
        [-5.4023, -4.2804, -5.1934,  4.2966],
        [-7.7251,  1.9584, -6.8777, -1.9917],
        [-6.6530, -4.0939, -5.6723,  3.3816],
        [-7.7139, -3.6040, -6.2820,  3.1888]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|█▉        | 57/289 [00:43<02:55,  1.32it/s]

Training loop 57
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.075190469622612, logits - tensor([[-6.3545,  0.2971, -6.1432,  0.1205],
        [-5.2540, -2.5391, -3.8604,  1.6169],
        [-7.1893,  3.3180, -7.7710, -3.0084],
        [-8.5561,  2.3545, -7.6313, -1.3757],
        [-6.0547,  2.5750, -7.0593, -3.6004],
        [-8.1027,  3.6277, -7.3357, -3.2248],
        [-5.9317, -3.4187,  2.7896, -2.6517],
        [-5.9421,  3.2017, -5.7181, -3.5426]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 58/289 [00:44<02:55,  1.32it/s]

Training loop 58
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04299285635352135, logits - tensor([[-7.5371,  4.1159, -7.0183, -3.7341],
        [-7.0649, -4.3738,  2.1787, -2.7302],
        [-5.4734,  3.1076, -6.4445, -3.5747],
        [-6.0097, -3.9480, -5.0222,  4.3112],
        [-6.4616, -3.6040,  2.2522, -2.9876],
        [-7.9578, -3.1466, -7.3274,  2.4957],
        [-7.7719,  1.1823, -7.6789, -0.8089],
        [-7.5104,  3.3103, -7.0925, -2.8117]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 20%|██        | 59/289 [00:44<02:54,  1.32it/s]

Training loop 59
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14115004241466522, logits - tensor([[-7.6828,  3.2640, -7.1812, -3.1825],
        [-7.3848,  0.2977, -6.1997, -0.3410],
        [-7.2206, -0.7946, -6.0334,  1.0927],
        [-6.2284, -2.9799, -4.4443,  2.6905],
        [-6.8821, -3.7950,  1.8515, -2.1864],
        [-7.4362,  3.7519, -6.8417, -3.3785],
        [-6.7194,  2.3526, -6.7036, -3.0431],
        [-6.6264,  2.5852, -6.5532, -2.1336]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 60/289 [00:45<02:53,  1.32it/s]

Training loop 60
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024845058098435402, logits - tensor([[-7.5694,  2.4480, -7.8255, -2.6099],
        [-5.7628,  3.7193, -6.3719, -3.2229],
        [-5.4559,  2.6346, -6.0664, -2.3248],
        [-6.1960, -4.4023, -5.2039,  4.7257],
        [-7.2857,  2.5557, -6.7473, -3.3824],
        [-6.0998,  2.7551, -5.8472, -3.4392],
        [-6.8300,  4.1617, -6.4657, -3.3696],
        [-7.1879,  3.5215, -7.6058, -2.4351]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 21%|██        | 61/289 [00:46<02:52,  1.32it/s]

Training loop 61
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 21%|██▏       | 62/289 [00:47<02:51,  1.32it/s]

loss - 0.2368675172328949, logits - tensor([[-7.4448,  2.5736, -6.8648, -2.9281],
        [-6.1118, -3.6649,  2.5517, -1.9402],
        [-6.5591, -4.4515, -5.5611,  3.7662],
        [-6.0614, -4.3396,  2.8402, -3.3352],
        [-6.6570, -3.3625, -4.8436,  2.6046],
        [-6.5519, -2.0153, -5.6519,  2.3315],
        [-5.8056, -2.3181,  1.2970, -2.1183],
        [-8.1646,  3.5195, -7.9126, -3.3938]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 62
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.25586897134780884, logits - tensor([[-6.6403, -4.0249,  2.8710, -3.2885],
        [-8.0498,  3.8772, -6.8937, -3.4195],
        [-7.0524,  3.0880, -6.6718, -3.4609],
        [-7.0759,  1.9535, -5.4282, -1.8490],
        [-7.4205,  3.9740, -7.3872, -3.7816],
        [-6.6619,  2.2324, -6.6410, -3.1685],
        [-7.4078,  3.6057, -6.7595, -3.7304],
        [-6.9898, -4.0976, -5.8252,  3.9

 22%|██▏       | 63/289 [00:47<02:51,  1.32it/s]

Training loop 63
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.35194236040115356, logits - tensor([[-5.9304,  2.2451, -6.2560, -2.8472],
        [-6.3908,  3.5282, -6.8381, -3.5959],
        [-6.4274, -3.4955,  0.8511, -2.1004],
        [-6.1189, -3.3855,  2.1996, -2.3535],
        [-7.4484,  3.6780, -7.3902, -4.2224],
        [-4.7976, -2.6553, -4.5529,  2.9236],
        [-7.3513, -4.6490, -5.5763,  3.5597],
        [-6.9524,  2.4854, -7.7251, -2.5799]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 64/289 [00:48<02:51,  1.32it/s]

Training loop 64
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1563459038734436, logits - tensor([[-6.1084,  0.9726, -5.9564, -1.5652],
        [-7.5559,  1.4019, -8.1089, -1.5193],
        [-6.9454,  3.1614, -6.2290, -3.0772],
        [-6.0442,  2.2071, -6.2058, -2.1913],
        [-6.8045, -4.0217, -6.5349,  3.4011],
        [-7.8045,  1.5804, -7.2518, -1.8794],
        [-6.4340, -3.9900,  2.2169, -2.9584],
        [-7.1183, -1.0046, -6.4344,  0.8529]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 22%|██▏       | 65/289 [00:49<02:50,  1.32it/s]

Training loop 65
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026563558727502823, logits - tensor([[-5.6985, -4.0367,  2.7925, -2.2836],
        [-7.3333,  3.1256, -6.7013, -3.3568],
        [-5.2363,  2.4895, -5.8281, -3.8424],
        [-4.9386,  3.4833, -6.0995, -3.6274],
        [-7.1341, -2.0227, -7.0099,  2.1282],
        [-7.6642,  4.2965, -7.4976, -4.5198],
        [-6.6991,  3.4960, -6.0791, -2.9095],
        [-6.3677,  3.3089, -6.0787, -3.5478]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 23%|██▎       | 66/289 [00:50<02:50,  1.31it/s]

Training loop 66
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 23%|██▎       | 67/289 [00:50<02:49,  1.31it/s]

loss - 0.031448524445295334, logits - tensor([[-7.1129, -4.3876, -4.9819,  3.9441],
        [-8.9751,  3.3421, -7.9706, -3.7477],
        [-7.5223, -1.3405, -6.6847,  1.4003],
        [-6.4674,  2.4304, -6.2473, -2.8931],
        [-6.9728,  3.3467, -7.4823, -3.1019],
        [-5.6092,  3.8461, -6.2251, -3.6683],
        [-7.7898,  2.7886, -7.5846, -2.4730],
        [-5.6258, -4.3183, -4.7988,  4.2902]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 67
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16414044797420502, logits - tensor([[-5.9819, -2.9487,  1.9648, -1.7644],
        [-6.8519,  2.7522, -6.4235, -3.0049],
        [-5.1173, -3.4082,  1.3434, -1.3421],
        [-7.1360,  2.8406, -7.9485, -2.5526],
        [-5.8559,  2.5395, -5.5976, -2.8624],
        [-6.0307,  3.3048, -6.5957, -2.9455],
        [-6.4649, -3.5320,  2.7657, -3.0217],
        [-6.7674, -2.8820, -5.8946,  5

 24%|██▎       | 68/289 [00:51<02:48,  1.31it/s]

Training loop 68
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.032124683260917664, logits - tensor([[-5.1355, -4.3430, -6.1031,  4.0997],
        [-5.7633, -4.2220,  3.7091, -3.6565],
        [-6.2615,  2.7106, -6.1672, -3.2181],
        [-6.8158,  2.4542, -7.2141, -2.6437],
        [-7.1442, -1.3845, -6.1113,  1.6204],
        [-7.5054, -4.4373,  2.4711, -2.8256],
        [-6.3789, -4.5315, -5.9350,  4.1191],
        [-7.0927,  3.1354, -6.6998, -3.5345]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 69/289 [00:52<02:47,  1.31it/s]

Training loop 69
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02894262783229351, logits - tensor([[-6.5173,  2.2020, -6.6835, -2.6342],
        [-7.8311,  3.3430, -7.0447, -3.7594],
        [-5.7458, -3.3354, -4.4967,  3.2588],
        [-7.6774,  3.6131, -7.4479, -4.1303],
        [-8.0966,  3.3179, -7.2643, -2.3958],
        [-6.8364, -3.0807, -5.4056,  2.0877],
        [-6.0149,  2.1234, -6.2844, -2.1228],
        [-8.3041,  3.7435, -7.3620, -4.2695]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 24%|██▍       | 70/289 [00:53<02:46,  1.31it/s]

Training loop 70
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.06049656122922897, logits - tensor([[-7.2166,  3.3919, -7.5319, -3.0085],
        [-5.6397,  2.7363, -5.4490, -3.0295],
        [-6.1481,  2.5220, -5.7182, -3.3275],
        [-5.0684, -3.6916,  2.0365, -2.7161],
        [-7.2902, -0.4573, -4.0087, -0.0796],
        [-6.8952,  3.7904, -7.0210, -4.0749],
        [-7.2666, -4.3725, -5.8050,  4.4944],
        [-6.6499,  3.1170, -6.5740, -3.1318]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 71/289 [00:53<02:46,  1.31it/s]

Training loop 71
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1536327302455902, logits - tensor([[-7.9979,  3.1520, -7.7499, -2.7530],
        [-5.9192, -3.1750,  2.2842, -2.3370],
        [-7.3873, -4.1452, -6.2301,  4.1006],
        [-7.4326, -3.3362, -6.2503,  3.2133],
        [-6.5470, -3.6498,  1.9933, -1.9090],
        [-6.4999,  3.3541, -6.9090, -3.6815],
        [-6.6889,  3.6737, -7.0408, -4.0079],
        [-7.0883,  3.0396, -6.5086, -2.0957]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▍       | 72/289 [00:54<02:45,  1.31it/s]

Training loop 72
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02948187105357647, logits - tensor([[-7.0331,  3.3132, -7.1839, -3.1105],
        [-7.1098, -3.8971,  2.2840, -2.4988],
        [-7.7739,  3.6420, -6.7170, -3.4443],
        [-5.3256, -4.0381,  2.7821, -3.6084],
        [-6.5225,  2.4422, -6.5515, -3.2846],
        [-8.5965, -2.1206, -5.8429,  2.5797],
        [-5.1301, -3.9184,  2.8015, -3.3405],
        [-6.8641, -3.5052, -6.0085,  3.4093]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 25%|██▌       | 73/289 [00:55<02:43,  1.32it/s]

Training loop 73
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.018430549651384354, logits - tensor([[-7.2074,  3.8125, -7.7213, -4.0142],
        [-6.8004, -3.8404, -6.3772,  3.9994],
        [-7.5418,  3.2441, -7.1698, -2.9775],
        [-6.2842,  4.0596, -6.9589, -4.0087],
        [-7.2988, -3.4149, -5.3479,  2.6220],
        [-6.9663,  3.6970, -6.8880, -3.7053],
        [-7.4162,  2.6543, -7.8394, -2.3226],
        [-5.3066, -4.2324, -5.6055,  3.3265]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 74/289 [00:56<02:43,  1.32it/s]

Training loop 74
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04497632384300232, logits - tensor([[-7.6919,  3.2215, -7.8886, -3.6080],
        [-7.2096, -1.9511,  0.1403, -1.8762],
        [-6.0238,  3.5657, -7.0984, -3.7298],
        [-5.4699, -2.9789, -3.8000,  4.6285],
        [-7.8274,  3.2405, -7.3133, -3.4969],
        [-6.4342,  3.9860, -6.9859, -3.5232],
        [-6.3904, -4.6954,  2.8029, -2.6220],
        [-6.4809,  3.2537, -6.4917, -3.4775]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▌       | 75/289 [00:56<02:42,  1.32it/s]

Training loop 75
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.040208619087934494, logits - tensor([[-7.4840,  3.0097, -8.5309, -2.7058],
        [-6.5946,  2.9660, -6.4425, -3.3267],
        [-6.5486, -2.7310, -4.7438,  2.7945],
        [-7.4244,  3.5339, -6.9577, -3.5032],
        [-7.3639,  3.5739, -7.1873, -3.2601],
        [-5.8060, -3.3922,  1.6914, -1.2126],
        [-7.4348, -1.6209, -7.0480,  2.1875],
        [-6.4595,  3.4147, -6.6741, -3.2952]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 26%|██▋       | 76/289 [00:57<02:40,  1.32it/s]

Training loop 76
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3105858266353607, logits - tensor([[-7.2174,  2.8872, -7.2587, -3.3461],
        [-7.5366,  0.3703, -7.1813, -1.0749],
        [-6.4224, -3.4655,  1.5727, -2.2777],
        [-5.7979, -3.4261, -6.1916,  4.0713],
        [-6.3774,  3.7995, -6.6848, -3.8031],
        [-7.1037,  2.0766, -7.5268, -2.7124],
        [-7.2819,  4.3661, -7.5081, -3.9698],
        [-7.5773,  3.9126, -7.5395, -3.3120]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 77/289 [00:58<02:39,  1.33it/s]

Training loop 77
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.025552693754434586, logits - tensor([[-6.6365, -3.6560, -5.8533,  2.8649],
        [-6.4368, -2.7920, -5.0080,  3.4951],
        [-6.6833, -2.7096, -6.7033,  2.5582],
        [-5.5814,  2.4585, -5.5892, -3.0469],
        [-6.1035,  3.2377, -6.5444, -3.1521],
        [-6.1961,  3.3650, -6.1152, -2.7783],
        [-7.0288,  2.5030, -7.6940, -3.0559],
        [-6.0766, -4.2803,  4.5318, -3.8008]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 78/289 [00:59<02:38,  1.33it/s]

Training loop 78
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05242982879281044, logits - tensor([[-6.2655, -2.8782, -4.6829,  2.3624],
        [-7.0558, -1.1922, -6.1492,  2.1909],
        [-5.4752, -2.8731,  0.8914, -1.3543],
        [-6.3106,  3.1834, -6.4977, -3.0490],
        [-7.1131,  2.7730, -7.0966, -2.9928],
        [-6.4118,  2.7863, -6.9522, -2.6186],
        [-6.4692,  3.2233, -7.6187, -3.0405],
        [-6.2957, -3.3606, -5.8983,  2.8785]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 27%|██▋       | 79/289 [00:59<02:37,  1.33it/s]

Training loop 79
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022517988458275795, logits - tensor([[-6.4346,  3.2449, -6.7538, -4.0443],
        [-5.9853,  2.2271, -5.9405, -3.0391],
        [-6.5475, -3.3479, -5.4493,  3.6356],
        [-6.9820,  2.9937, -6.8316, -3.2272],
        [-7.5785,  3.1982, -5.7230, -3.8350],
        [-6.3951,  3.3904, -6.4189, -2.9618],
        [-6.6998, -4.2516, -5.0755,  5.0846],
        [-6.4321, -3.5826,  2.5949, -2.6102]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 80/289 [01:00<02:37,  1.33it/s]

Training loop 80
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02502306178212166, logits - tensor([[-5.8770,  3.0876, -5.8093, -2.7698],
        [-6.0855,  2.4361, -6.3186, -2.8027],
        [-6.7607,  3.1817, -6.9792, -2.5828],
        [-7.3532,  3.3224, -7.0460, -3.8300],
        [-7.8298,  4.1932, -7.0233, -5.0650],
        [-6.9848, -2.2450, -6.4904,  2.3704],
        [-6.1260, -4.5492,  3.5305, -4.1504],
        [-6.9072,  3.0141, -7.0364, -3.0416]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 81/289 [01:01<02:36,  1.33it/s]

Training loop 81
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02430487424135208, logits - tensor([[-6.6996, -4.3807,  2.9370, -3.6260],
        [-6.4686, -3.1431, -5.0331,  4.0958],
        [-5.7982, -3.1127, -6.0663,  3.1592],
        [-6.7298, -4.3426,  2.4033, -2.4769],
        [-6.4340,  2.6648, -6.7181, -2.3636],
        [-5.7227, -4.5617, -5.5941,  4.3968],
        [-7.3499,  3.8315, -6.5884, -2.9380],
        [-6.4622,  3.3509, -6.8809, -3.0601]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 28%|██▊       | 82/289 [01:02<02:36,  1.33it/s]

Training loop 82
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07375850528478622, logits - tensor([[-8.2263, -1.3049, -6.0970,  1.1224],
        [-7.0214,  3.4326, -7.4254, -3.9885],
        [-5.4867, -4.0172,  3.2232, -2.0084],
        [-7.6864, -0.0578, -6.7684, -0.8734],
        [-6.7897, -3.4995,  1.5452, -2.1359],
        [-5.9282,  3.1894, -6.4154, -3.8778],
        [-6.5548,  4.1537, -7.0432, -4.5290],
        [-6.9876, -4.4752,  2.9667, -3.4149]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 29%|██▊       | 83/289 [01:03<02:36,  1.32it/s]

Training loop 83
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 29%|██▉       | 84/289 [01:03<02:35,  1.31it/s]

loss - 0.02791898511350155, logits - tensor([[-6.3162,  2.6054, -6.5120, -3.2114],
        [-5.4012,  3.4879, -6.6798, -2.9630],
        [-7.2103,  2.7671, -7.9698, -2.5747],
        [-6.4859, -3.3459,  2.1281, -2.0184],
        [-5.8881,  3.1111, -6.4003, -3.2493],
        [-6.8225,  3.4178, -7.4262, -3.0600],
        [-7.0895,  4.1970, -7.1654, -3.2168],
        [-6.7956,  3.9278, -6.9406, -3.1738]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 84
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 29%|██▉       | 85/289 [01:04<02:34,  1.32it/s]

loss - 0.2770366072654724, logits - tensor([[-7.3588,  0.9411, -6.9275, -0.7065],
        [-6.6978, -4.1416,  3.4512, -2.8056],
        [-6.8942,  2.8614, -6.1657, -3.9645],
        [-6.6210,  3.1779, -6.6999, -2.4249],
        [-6.8609,  3.0789, -7.0802, -2.9309],
        [-6.4282,  1.6435, -6.4111, -1.3185],
        [-7.6866,  1.7308, -7.8464, -1.1838],
        [-7.0575,  3.6560, -6.5327, -3.9083]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 85
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0200236476957798, logits - tensor([[-6.7842,  2.3479, -7.6439, -2.6242],
        [-5.6001, -4.5512, -4.9302,  4.5667],
        [-6.2349,  2.9026, -6.8567, -3.6872],
        [-7.0832,  3.7306, -6.6078, -3.5939],
        [-6.8246,  2.9363, -6.2815, -2.8134],
        [-6.2182,  3.9284, -6.0956, -3.9449],
        [-6.4651,  3.3091, -6.2637, -3.8567],
        [-5.7976,  3.4084, -5.7961, -2.83

 30%|██▉       | 86/289 [01:05<02:34,  1.32it/s]

Training loop 86
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.019036348909139633, logits - tensor([[-5.9664, -4.9217, -6.2216,  5.0126],
        [-6.4482,  2.6580, -7.8925, -3.2248],
        [-6.4038,  3.1458, -6.6186, -3.2763],
        [-6.2384,  3.2584, -7.3614, -4.3943],
        [-7.3669,  2.7450, -7.1592, -3.2745],
        [-7.8422,  4.3562, -7.4245, -3.6275],
        [-6.7492,  2.6997, -7.5537, -2.6869],
        [-6.5913,  3.0293, -6.7370, -3.7142]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 87/289 [01:06<02:33,  1.32it/s]

Training loop 87
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02233486995100975, logits - tensor([[-6.7489,  3.6196, -6.7422, -3.6814],
        [-6.6447,  2.1505, -6.6899, -2.6955],
        [-5.7749,  2.6780, -5.4405, -3.7869],
        [-5.8950, -3.0847, -4.8112,  4.0817],
        [-7.5754, -3.9069, -5.9944,  3.6457],
        [-7.5436,  3.7985, -8.1151, -3.4810],
        [-6.1693, -4.1011,  2.2947, -3.5197],
        [-7.5025,  3.2937, -6.6673, -3.4753]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 30%|███       | 88/289 [01:06<02:32,  1.32it/s]

Training loop 88
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023710206151008606, logits - tensor([[-6.0857, -3.6827,  2.8205, -3.5453],
        [-5.3697, -3.4647,  3.1520, -3.2151],
        [-5.6801, -3.9827,  2.4285, -2.5337],
        [-7.4029,  2.9458, -6.9836, -3.2145],
        [-7.7618, -4.0297, -5.9277,  3.8367],
        [-6.4256, -3.1546, -5.9093,  3.5784],
        [-6.7989,  3.4288, -6.5086, -3.4561],
        [-6.9129, -3.2682, -6.7393,  3.3980]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 89/289 [01:07<02:31,  1.32it/s]

Training loop 89
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13980047404766083, logits - tensor([[-7.4790,  3.9937, -7.2640, -3.8532],
        [-5.5374, -3.0875,  2.1397, -2.7574],
        [-5.8726, -3.3179, -5.8572,  2.6078],
        [-6.3288,  2.8885, -7.2339, -3.3592],
        [-7.2357,  3.6386, -6.5783, -3.3894],
        [-6.8738,  3.0927, -7.6603, -2.7463],
        [-5.9619,  3.6250, -6.0560, -3.5852],
        [-5.8349, -1.9469,  1.6495, -1.6421]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███       | 90/289 [01:08<02:31,  1.31it/s]

Training loop 90
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01621074788272381, logits - tensor([[-7.7101, -4.0972, -5.3216,  4.1743],
        [-6.2539,  3.2083, -5.8368, -4.1655],
        [-7.6592, -3.2654, -5.4639,  3.7501],
        [-8.1767, -4.7200, -6.1354,  4.0279],
        [-5.8253,  2.8297, -6.7507, -3.2726],
        [-7.2326, -3.4669, -5.6595,  3.5529],
        [-6.8284, -5.0436,  3.6127, -3.6700],
        [-6.0631,  2.9200, -6.4813, -2.9730]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 31%|███▏      | 91/289 [01:09<02:30,  1.31it/s]

Training loop 91
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02076437883079052, logits - tensor([[-6.0375, -4.0911,  3.3538, -3.1357],
        [-6.9962,  3.5169, -6.8670, -2.3861],
        [-5.0374, -3.3923,  2.6796, -3.0206],
        [-6.5013,  3.2879, -6.0138, -3.2639],
        [-5.6063,  3.6537, -6.8851, -3.0616],
        [-6.8481, -3.8703, -6.2942,  5.0613],
        [-6.2783,  3.3284, -6.5252, -3.1580],
        [-6.8763,  4.3927, -6.5773, -4.0677]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 92/289 [01:09<02:29,  1.32it/s]

Training loop 92
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04007633775472641, logits - tensor([[-5.9324,  3.4364, -6.5012, -3.1616],
        [-5.9839,  2.6225, -5.9040, -3.4801],
        [-7.4350, -0.6483, -6.2882,  0.7625],
        [-6.3861,  4.1168, -6.8433, -3.3039],
        [-6.1276,  3.7786, -5.7265, -4.2138],
        [-5.9833, -4.8936,  3.2501, -3.6146],
        [-8.4349,  3.4645, -7.8436, -3.4436],
        [-7.2019,  3.5509, -6.5196, -3.5681]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 32%|███▏      | 93/289 [01:10<02:28,  1.32it/s]

Training loop 93
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 33%|███▎      | 94/289 [01:11<02:27,  1.32it/s]

loss - 0.06602819263935089, logits - tensor([[-6.9444,  3.2797, -7.9288, -3.3281],
        [-6.1161, -4.9937, -5.3933,  4.2370],
        [-6.5278, -2.3102, -6.2263,  2.4508],
        [-6.9942, -1.6999, -7.2795,  1.7022],
        [-6.4890,  4.2678, -5.6895, -4.1566],
        [-7.0493, -0.8715, -2.8330, -0.3980],
        [-6.7768,  3.6650, -6.6917, -3.7994],
        [-6.7439,  3.2284, -7.4734, -2.9730]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 94
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.040874183177948, logits - tensor([[-6.4171,  2.8006, -7.0367, -3.5589],
        [-7.6549,  2.5870, -6.6112, -2.3371],
        [-7.4865,  3.4991, -7.2153, -3.6762],
        [-7.3413, -4.1705, -6.2098,  3.2734],
        [-6.2308,  4.0960, -6.8430, -3.2965],
        [-5.5718,  2.9152, -5.3522, -2.5820],
        [-6.0513,  2.5456, -6.7165, -1.9011],
        [-5.3387, -2.9144,  1.1466, -1.51

 33%|███▎      | 95/289 [01:12<02:27,  1.32it/s]

Training loop 95
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.016231857240200043, logits - tensor([[-5.6277, -3.8513,  3.5498, -3.7555],
        [-7.0842,  3.5581, -6.9558, -2.7595],
        [-7.1671,  4.0030, -6.9607, -3.5751],
        [-5.4649,  2.8109, -6.1519, -3.3707],
        [-7.1725, -3.4506, -5.9059,  3.4356],
        [-5.4588, -4.7577, -4.4206,  5.0125],
        [-6.8163,  3.8064, -7.1964, -4.3402],
        [-7.0225,  3.6310, -6.1778, -3.1396]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 33%|███▎      | 96/289 [01:12<02:27,  1.31it/s]

Training loop 96
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.027539528906345367, logits - tensor([[-7.0853,  2.8869, -6.0440, -2.6765],
        [-6.7534,  2.1844, -6.1897, -2.4808],
        [-6.9891,  3.6898, -6.3289, -4.0580],
        [-7.7658,  4.0496, -7.6903, -3.5270],
        [-7.6553,  3.5404, -6.4413, -2.7341],
        [-7.9614, -3.2845, -6.3587,  3.0198],
        [-7.4469,  2.5160, -7.2552, -1.8484],
        [-7.4546, -3.8124, -4.9166,  3.1901]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 34%|███▎      | 97/289 [01:13<02:25,  1.32it/s]

Training loop 97
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 34%|███▍      | 98/289 [01:14<02:24,  1.32it/s]

loss - 0.04723864793777466, logits - tensor([[-6.3369,  3.8227, -6.6287, -3.3238],
        [-6.8016, -1.4954, -5.6644,  2.1990],
        [-7.0906,  4.3773, -6.9151, -4.8016],
        [-6.0045, -2.8346,  0.9912, -1.2159],
        [-5.3249, -3.7661, -4.3967,  3.8287],
        [-6.5746,  2.6046, -6.5889, -3.3056],
        [-7.1682, -1.9291, -6.1528,  2.2016],
        [-7.3263, -3.1386, -7.2283,  3.7459]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 98
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22369301319122314, logits - tensor([[-6.5653, -4.1723, -6.7650,  2.8499],
        [-6.0613,  3.7499, -7.5918, -3.2774],
        [-6.6263,  3.6285, -7.1609, -3.7727],
        [-6.9957,  4.5729, -6.6935, -4.1065],
        [-5.1045,  2.7240, -5.7329, -3.5275],
        [-6.1760, -4.0156,  3.2254, -3.3993],
        [-5.8325, -4.1394, -6.2617,  3.8654],
        [-6.9500, -3.3108, -6.0564,  3.

 34%|███▍      | 99/289 [01:15<02:23,  1.33it/s]

Training loop 99
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07412423193454742, logits - tensor([[-6.2759, -2.9475,  0.7732, -1.4366],
        [-6.9889,  3.9934, -6.7023, -3.6626],
        [-6.0084,  4.4280, -6.0002, -3.6002],
        [-7.3393,  3.0425, -7.3453, -3.5149],
        [-6.4494, -0.3441, -6.2568,  0.8212],
        [-6.1224,  3.3914, -6.3955, -3.3071],
        [-7.9868,  1.2782, -7.2487, -1.2132],
        [-5.9405, -2.8455, -5.5100,  4.3791]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 100/289 [01:15<02:22,  1.32it/s]

Training loop 100
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17680171132087708, logits - tensor([[-7.9127,  3.1413, -6.8391, -3.9111],
        [-5.4301,  3.2565, -5.3526, -3.6566],
        [-5.7676,  2.8230, -5.2456, -2.5478],
        [-5.4789,  3.3594, -5.8115, -3.5868],
        [-6.7526,  3.6327, -5.8038, -3.5895],
        [-6.7147, -1.6573, -6.3701,  2.1791],
        [-5.8667, -3.2900,  1.2183, -1.9764],
        [-6.1944,  4.0729, -6.3504, -4.3983]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▍      | 101/289 [01:16<02:22,  1.32it/s]

Training loop 101
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01934703439474106, logits - tensor([[-6.7962,  2.8508, -7.2851, -3.2234],
        [-5.6966,  3.8964, -6.6069, -3.7545],
        [-6.1545, -3.8317,  2.6126, -2.5834],
        [-7.7335,  3.7514, -7.2853, -2.9626],
        [-7.2721,  3.9682, -6.0492, -3.3867],
        [-6.2429,  3.3075, -6.2245, -3.0584],
        [-5.5555,  3.9075, -5.7010, -3.8817],
        [-6.9510,  4.1945, -7.0782, -3.5921]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 35%|███▌      | 102/289 [01:17<02:21,  1.32it/s]

Training loop 102
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.029718538746237755, logits - tensor([[-6.3205,  2.2715, -5.7678, -3.4746],
        [-7.7792, -1.6348, -6.2869,  1.6493],
        [-6.5304,  3.3781, -7.0942, -3.3864],
        [-6.7210,  3.2000, -6.4962, -4.7894],
        [-7.5794,  2.6611, -7.5736, -2.7981],
        [-6.9205,  3.3862, -7.5676, -4.0891],
        [-5.5553,  2.6699, -6.5532, -3.4446],
        [-5.8683,  3.7965, -6.3421, -3.4391]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 103/289 [01:18<02:20,  1.32it/s]

Training loop 103
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.028616569936275482, logits - tensor([[-7.4834,  2.8019, -6.4974, -2.8732],
        [-6.9864, -3.9831, -5.8614,  3.9311],
        [-6.7356,  2.2725, -7.0187, -3.3717],
        [-6.1791,  2.8305, -6.9798, -2.9362],
        [-6.1979,  3.7234, -5.4264, -3.9455],
        [-7.4733,  2.7755, -7.8651, -2.3515],
        [-6.7594,  3.2578, -7.4978, -2.1361],
        [-6.9972,  2.2992, -7.4607, -2.7471]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▌      | 104/289 [01:18<02:19,  1.32it/s]

Training loop 104
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0184567142277956, logits - tensor([[-8.0431, -4.1998, -7.4483,  2.6336],
        [-7.2851, -4.7090, -5.9949,  4.4833],
        [-6.7323,  3.8966, -6.8467, -4.0437],
        [-6.4574, -4.1699, -5.9155,  4.4226],
        [-6.5297,  4.3452, -7.0833, -4.5370],
        [-7.3554,  2.7295, -7.3976, -2.1218],
        [-6.7809,  3.2196, -7.6849, -3.4400],
        [-5.2035, -3.5257,  3.1110, -2.8373]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 36%|███▋      | 105/289 [01:19<02:18,  1.33it/s]

Training loop 105
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1243504136800766, logits - tensor([[-7.6851,  3.4726, -7.9584, -4.4761],
        [-6.0477, -2.4822,  0.9294, -1.5884],
        [-5.7339, -4.4499,  3.0195, -3.6404],
        [-6.6431,  3.4376, -5.9940, -3.5764],
        [-7.1239,  3.2389, -7.3311, -3.3913],
        [-6.9214, -3.6548,  2.1750, -2.4180],
        [-7.2521, -2.3032, -6.1906,  2.8257],
        [-5.1541, -2.6783,  2.6961, -2.4791]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 106/289 [01:20<02:17,  1.33it/s]

Training loop 106
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0753181129693985, logits - tensor([[-6.4852,  0.7781, -5.8314, -0.1632],
        [-6.9164,  3.6674, -6.0470, -4.2253],
        [-7.1364,  3.9643, -8.2918, -3.5812],
        [-6.8069, -4.3356,  3.1323, -3.5925],
        [-7.7085,  3.4110, -7.3048, -2.9789],
        [-5.8933, -3.8913,  3.5559, -3.6130],
        [-5.8669, -3.8569,  3.3732, -3.2802],
        [-6.5354,  3.6261, -6.4628, -4.3624]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 107/289 [01:21<02:17,  1.32it/s]

Training loop 107
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01995973475277424, logits - tensor([[-5.8349,  3.0705, -5.7984, -3.0875],
        [-7.0819,  2.9935, -6.0159, -4.4843],
        [-6.0572,  2.9589, -6.2240, -4.0539],
        [-7.3478,  5.0607, -6.1964, -3.0193],
        [-7.1452, -2.9500, -4.9507,  2.6786],
        [-7.6840, -2.8833, -6.2028,  3.5203],
        [-6.2180, -4.1805,  3.9055, -2.4437],
        [-5.9899, -5.5343, -4.8636,  5.3417]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 37%|███▋      | 108/289 [01:21<02:17,  1.32it/s]

Training loop 108
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.016413923352956772, logits - tensor([[-6.2102,  4.2712, -6.0956, -4.1558],
        [-7.1913, -3.1393, -5.1524,  3.4821],
        [-6.8188,  3.5229, -6.8904, -3.1884],
        [-5.0620,  3.0322, -6.1830, -3.5124],
        [-5.2821,  4.0321, -6.1594, -3.2104],
        [-7.0144, -2.6793, -6.5906,  2.6767],
        [-6.3888, -5.2287, -6.4035,  4.5086],
        [-5.5277,  4.4090, -6.0917, -3.9812]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 109/289 [01:22<02:17,  1.31it/s]

Training loop 109
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.020734963938593864, logits - tensor([[-6.8600,  3.3348, -6.0415, -3.8562],
        [-6.2340,  3.6854, -5.7907, -3.6745],
        [-6.9826, -3.3510, -5.6205,  2.6102],
        [-5.4534, -5.1242, -6.3631,  4.6850],
        [-5.6131,  3.0007, -6.3243, -3.4726],
        [-5.8506, -4.2350,  2.1512, -3.1367],
        [-5.5524, -3.6868,  2.9456, -3.5881],
        [-6.3207,  3.7330, -6.2175, -3.5133]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 110/289 [01:23<02:16,  1.31it/s]

Training loop 110
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0634276270866394, logits - tensor([[-7.5366,  2.4947, -6.8310, -2.3517],
        [-7.4138, -3.4192,  1.7574, -2.1635],
        [-6.5015,  3.1738, -7.1739, -4.1869],
        [-6.7310,  2.8435, -6.6747, -3.4671],
        [-6.3913, -0.0765, -7.0203,  0.2601],
        [-6.7689, -5.0707, -4.5056,  3.9807],
        [-7.5496,  3.4456, -7.9347, -3.1244],
        [-7.1661, -3.6572, -6.5424,  3.2135]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 38%|███▊      | 111/289 [01:24<02:15,  1.32it/s]

Training loop 111
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1246977373957634, logits - tensor([[-7.5810,  4.8910, -7.8778, -3.7895],
        [-7.4017, -3.2754, -6.6571,  4.2661],
        [-8.0027,  3.8912, -7.1372, -3.8299],
        [-6.4956, -2.2133, -5.8637,  1.9403],
        [-6.6089,  3.7160, -6.2263, -4.2245],
        [-7.0059,  3.7763, -6.6594, -3.4025],
        [-6.8196, -4.9842, -5.3977,  3.8875],
        [-6.4053,  1.5090, -5.9863, -1.5909]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 112/289 [01:25<02:14,  1.31it/s]

Training loop 112
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22856707870960236, logits - tensor([[-7.6160,  4.7627, -6.4175, -3.9473],
        [-6.7619, -2.7545, -6.3413,  3.3041],
        [-5.0650, -3.2379,  2.1064, -2.2950],
        [-8.2807,  2.9664, -6.9557, -2.1015],
        [-6.3820,  2.6403, -5.9947, -2.2553],
        [-6.8721, -4.9454, -4.7786,  5.2138],
        [-7.0093,  2.8331, -5.7847, -3.3790],
        [-6.9157,  1.8967, -7.0223, -2.0765]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 113/289 [01:25<02:14,  1.31it/s]

Training loop 113
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14258842170238495, logits - tensor([[-6.0451,  4.3709, -5.9523, -4.6055],
        [-7.2169,  4.1635, -7.3804, -3.9646],
        [-5.1200, -2.4931,  1.2463, -1.8033],
        [-7.0329, -3.8715, -5.7506,  3.3694],
        [-6.9416, -4.3472, -5.9737,  4.1025],
        [-6.8447, -4.9316, -5.4020,  4.5518],
        [-5.9763,  3.2645, -5.0847, -3.0181],
        [-7.2881,  3.2871, -7.2431, -3.7494]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 39%|███▉      | 114/289 [01:26<02:14,  1.30it/s]

Training loop 114
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02169591560959816, logits - tensor([[-6.4540,  3.2705, -6.1862, -3.3926],
        [-6.7081,  4.3293, -5.9951, -3.8798],
        [-7.4081,  3.5428, -6.8107, -3.4330],
        [-7.2667,  3.9982, -7.8379, -3.6232],
        [-5.7306, -4.2121,  3.0537, -3.8670],
        [-6.1357, -2.2143, -6.2390,  2.4375],
        [-6.8071, -2.7614, -5.6976,  3.7967],
        [-5.9249,  2.9327, -6.4050, -2.8732]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|███▉      | 115/289 [01:27<02:13,  1.30it/s]

Training loop 115
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08728523552417755, logits - tensor([[-6.9263, -2.9407, -6.4977,  3.0930],
        [-6.7866, -3.2841, -5.4268,  3.8637],
        [-6.8287, -2.4136, -5.9523,  3.3014],
        [-5.7629,  3.2158, -6.7235, -3.1013],
        [-7.5446,  0.1922, -6.1143, -0.8198],
        [-7.6199,  3.6859, -7.2156, -3.2301],
        [-7.1306,  2.5170, -6.7418, -2.8257],
        [-7.3185, -3.4231,  2.3258, -2.1602]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 116/289 [01:28<02:11,  1.31it/s]

Training loop 116
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08861356973648071, logits - tensor([[-7.0493, -2.2231, -5.6479,  1.5242],
        [-6.9004,  4.3965, -7.5911, -4.9830],
        [-6.1845,  3.4632, -7.0982, -3.8953],
        [-6.5346,  3.3505, -6.5860, -3.9127],
        [-6.0921, -2.8917,  1.4937, -1.9921],
        [-7.7102,  0.0087, -6.2959, -0.5534],
        [-5.4098, -4.0636,  1.8552, -3.2551],
        [-6.5992, -3.0142, -7.2804,  2.9491]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 40%|████      | 117/289 [01:28<02:11,  1.31it/s]

Training loop 117
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04096503555774689, logits - tensor([[-4.8450, -4.5235,  3.3074, -3.6911],
        [-7.7981,  2.1896, -7.0916, -2.5008],
        [-5.5559, -4.7028, -4.6926,  3.9497],
        [-5.7605, -3.8862,  3.4098, -2.5638],
        [-5.5689,  3.3541, -6.5128, -3.2346],
        [-5.3609,  3.6295, -5.9365, -4.2559],
        [-8.1002,  3.4754, -7.5892, -3.1801],
        [-6.8538, -3.4331,  0.8764, -1.0678]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 118/289 [01:29<02:09,  1.32it/s]

Training loop 118
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22014668583869934, logits - tensor([[-6.3272, -3.4066,  1.9930, -2.4589],
        [-6.7290,  2.7975, -7.0113, -2.9758],
        [-7.4211,  3.6661, -6.9401, -2.9445],
        [-6.4872,  2.8511, -6.6139, -2.9872],
        [-7.6369,  3.3167, -6.2525, -3.6426],
        [-7.0517,  3.2682, -7.1494, -3.0011],
        [-6.4305,  2.4709, -5.5130, -2.2951],
        [-7.2248,  0.5950, -7.1736,  0.0239]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 41%|████      | 119/289 [01:30<02:08,  1.32it/s]

Training loop 119
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.027188021689653397, logits - tensor([[-5.8207,  3.4798, -5.7904, -3.1034],
        [-7.6016,  3.7827, -7.9600, -3.5348],
        [-6.7979,  3.3290, -6.5138, -2.6854],
        [-7.0816,  3.6774, -6.4119, -3.6881],
        [-6.1675, -3.4019,  1.6923, -2.0082],
        [-6.0399,  2.2461, -5.9745, -2.4923],
        [-6.9727,  4.6997, -6.8891, -4.6368],
        [-6.3398, -3.8318, -4.6260,  4.3110]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 120/289 [01:31<02:08,  1.32it/s]

Training loop 120
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15619730949401855, logits - tensor([[-7.2739, -3.7624,  2.2847, -2.1368],
        [-5.1507, -4.8011, -5.0356,  3.3565],
        [-5.8527,  3.0374, -6.5640, -2.7764],
        [-6.6454, -3.2113, -6.0718,  3.7156],
        [-7.9311,  2.0983, -8.0868, -1.8837],
        [-6.4131,  3.4573, -6.4934, -3.7279],
        [-7.3858,  2.9030, -7.3087, -2.4967],
        [-6.1116, -4.1389,  3.0093, -3.3387]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 121/289 [01:31<02:07,  1.32it/s]

Training loop 121
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.017841260880231857, logits - tensor([[-6.6421,  2.8624, -6.7118, -3.4598],
        [-5.8938, -4.2464, -4.7630,  4.1004],
        [-5.5953, -3.9473,  3.0448, -3.1478],
        [-6.2714,  3.0457, -5.2456, -3.3345],
        [-7.6039,  3.1154, -5.7658, -2.9673],
        [-7.5604,  3.4097, -7.3844, -4.0549],
        [-6.9000,  4.0302, -6.2336, -4.3309],
        [-5.7464,  3.4085, -5.4219, -4.0013]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 42%|████▏     | 122/289 [01:32<02:07,  1.31it/s]

Training loop 122
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.057326264679431915, logits - tensor([[-7.3750,  3.2115, -7.3826, -2.5886],
        [-5.1820,  3.1359, -5.2261, -2.9666],
        [-7.6818,  3.1150, -6.4538, -2.7788],
        [-6.8821,  2.8912, -5.8661, -3.3250],
        [-6.5240,  2.9319, -6.4024, -2.9614],
        [-5.7331,  2.8552, -5.5579, -2.3621],
        [-6.8510, -2.8636, -5.8492,  2.7285],
        [-7.5764, -0.6384, -5.2061,  0.1797]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 123/289 [01:33<02:06,  1.31it/s]

Training loop 123
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2711896300315857, logits - tensor([[-7.1827, -3.0887, -6.2103,  3.1106],
        [-6.0996,  0.2691, -6.8149, -0.0590],
        [-5.4528, -3.5042,  3.0957, -3.3384],
        [-5.9803,  4.1982, -5.9059, -3.2771],
        [-5.3658, -4.1724, -5.6219,  4.8130],
        [-6.1764,  2.9190, -6.5552, -2.6368],
        [-7.1254,  2.2473, -7.0228, -2.3636],
        [-7.0659, -3.0837, -5.3483,  3.8883]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 124/289 [01:34<02:05,  1.31it/s]

Training loop 124
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03328898921608925, logits - tensor([[-6.3154,  2.5515, -7.0323, -3.6299],
        [-6.5838,  2.6056, -5.9617, -2.6880],
        [-5.3310, -4.0571, -5.6353,  3.9970],
        [-5.5642, -3.2615,  1.7976, -1.7859],
        [-6.7029, -4.3622,  2.9116, -2.6843],
        [-6.8832,  3.2569, -7.0273, -3.4030],
        [-7.4127,  3.1896, -6.8125, -3.1184],
        [-7.3937,  2.9205, -6.2579, -2.4715]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 43%|████▎     | 125/289 [01:34<02:04,  1.31it/s]

Training loop 125
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.051707785576581955, logits - tensor([[-5.9057, -3.1000, -5.5051,  2.7647],
        [-6.9077,  2.6998, -7.3423, -2.7561],
        [-6.8191,  2.9201, -7.7431, -2.6989],
        [-6.7747, -3.7566, -5.6221,  4.8283],
        [-6.3175,  3.8981, -5.5912, -3.8953],
        [-7.5840, -3.2267, -1.1660,  0.3108],
        [-6.6187, -3.5258, -4.4898,  2.9627],
        [-6.9404, -2.1135, -7.3395,  1.8525]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▎     | 126/289 [01:35<02:03,  1.32it/s]

Training loop 126
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14431613683700562, logits - tensor([[-6.8472, -2.5570, -5.3633,  3.2461],
        [-6.3615,  1.8365, -6.8466, -1.8397],
        [-7.4741,  2.6310, -7.7023, -2.7971],
        [-6.5926, -3.2995, -4.9079,  2.9397],
        [-6.5197,  3.1968, -6.9719, -3.3475],
        [-7.0622,  2.8546, -6.8475, -2.9757],
        [-6.0938,  3.5695, -6.4436, -3.3444],
        [-6.6446,  4.8831, -5.6272, -3.3205]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 127/289 [01:36<02:02,  1.32it/s]

Training loop 127
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01689518243074417, logits - tensor([[-5.7677,  3.7008, -6.7882, -3.3234],
        [-6.2677,  3.8240, -6.9373, -3.5909],
        [-7.1520,  3.7006, -7.8535, -4.0922],
        [-5.5414, -3.7381,  2.9349, -2.6597],
        [-5.2327,  3.9113, -5.8691, -3.4964],
        [-7.6058, -3.2163, -5.5366,  3.9098],
        [-6.4237,  3.9286, -5.9320, -3.5019],
        [-6.7719,  3.8294, -7.8626, -3.2495]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 44%|████▍     | 128/289 [01:37<02:01,  1.33it/s]

Training loop 128
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22267141938209534, logits - tensor([[-6.9936,  3.1465, -6.7322, -3.2262],
        [-7.4783,  2.8464, -7.3930, -2.6040],
        [-5.3159,  3.5328, -6.2476, -3.4689],
        [-6.6479,  4.0867, -6.9953, -3.4525],
        [-7.2398, -3.8124,  2.5641, -2.6314],
        [-5.1674, -3.6824,  2.9117, -2.6471],
        [-7.9807,  3.5566, -7.2288, -3.7049],
        [-7.1158, -3.5115, -6.1165,  3.8941]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 45%|████▍     | 129/289 [01:37<02:00,  1.33it/s]

Training loop 129
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 45%|████▍     | 130/289 [01:38<02:00,  1.32it/s]

loss - 0.0453639030456543, logits - tensor([[-7.2223,  2.5541, -6.8391, -2.1449],
        [-6.7041,  2.2335, -5.9177, -3.3364],
        [-5.7135,  3.1728, -6.6811, -2.4556],
        [-7.1302,  2.1947, -6.2041, -2.3946],
        [-6.0223, -4.3138,  3.4516, -3.4519],
        [-7.2257, -1.1249, -7.1349,  1.3355],
        [-6.5028,  4.3350, -7.7775, -3.7700],
        [-5.3519, -3.6716,  2.2706, -3.2035]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 130
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03823312371969223, logits - tensor([[-5.7199, -4.3862,  2.3669, -2.6374],
        [-6.5947,  3.4535, -6.4445, -3.2744],
        [-6.6427,  3.4521, -5.3908, -2.3818],
        [-5.8073, -3.0663,  1.6097, -1.4651],
        [-7.8513,  3.4121, -7.5944, -3.9511],
        [-5.2292,  3.0221, -6.1561, -3.0878],
        [-6.4575,  2.5560, -6.3756, -4.3397],
        [-6.2171,  2.3194, -6.9293, -2.

 45%|████▌     | 131/289 [01:39<01:59,  1.33it/s]

Training loop 131
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.016485966742038727, logits - tensor([[-6.1858, -4.7080, -5.1273,  4.5571],
        [-6.1778,  2.6495, -7.0641, -2.4782],
        [-6.6287,  3.4307, -7.2840, -3.5863],
        [-6.2223,  3.2630, -6.6739, -2.7463],
        [-6.9001,  3.6200, -6.6996, -4.0976],
        [-6.5158,  3.6441, -6.7157, -4.2522],
        [-6.3651,  3.5793, -7.4153, -3.6416],
        [-7.3386,  4.0147, -6.6447, -3.8588]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 132/289 [01:40<01:58,  1.33it/s]

Training loop 132
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0330587700009346, logits - tensor([[-6.1765, -3.6710,  2.5357, -1.8733],
        [-6.9456, -2.9794, -6.5808,  2.7512],
        [-6.0910, -3.8590,  1.9725, -1.4587],
        [-6.0574,  3.0814, -6.3640, -3.8520],
        [-6.7526,  3.2589, -7.3062, -3.4278],
        [-5.3749,  3.1046, -6.1490, -3.9508],
        [-5.3444,  2.8918, -5.6992, -3.5304],
        [-5.1113, -4.0297, -5.3701,  5.3148]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▌     | 133/289 [01:40<01:57,  1.33it/s]

Training loop 133
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10939673334360123, logits - tensor([[-7.1190,  3.9339, -7.2475, -3.1488],
        [-8.4700, -3.7178, -7.0355,  3.4394],
        [-6.9380,  2.0519, -6.1181, -2.3011],
        [-8.4130, -1.2048, -7.4111,  1.2995],
        [-7.2531,  3.1144, -7.3816, -3.2852],
        [-5.1696, -3.4317,  2.9023, -3.1303],
        [-5.0206, -4.6899,  4.2824, -3.9412],
        [-7.9616,  0.4612, -6.7627, -1.1722]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 46%|████▋     | 134/289 [01:41<01:57,  1.32it/s]

Training loop 134
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.017963681370019913, logits - tensor([[-6.8874,  4.2738, -5.8098, -4.1042],
        [-6.4482,  4.0137, -6.5616, -3.8462],
        [-5.7529, -4.2221,  2.9301, -2.5977],
        [-6.9731,  3.6077, -6.9045, -3.4996],
        [-6.3601, -4.3309,  3.3287, -3.2271],
        [-5.9192,  3.7348, -5.9803, -3.6704],
        [-5.2930,  3.3318, -5.3950, -2.8916],
        [-5.9664,  3.1160, -6.9346, -4.5848]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 135/289 [01:42<01:57,  1.32it/s]

Training loop 135
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21464379131793976, logits - tensor([[-4.9014, -4.0821,  3.2262, -3.8088],
        [-5.8742, -4.4397,  3.0384, -2.4636],
        [-6.1149, -3.7565,  1.8961, -1.8381],
        [-6.7597,  0.3553, -4.1996, -0.6283],
        [-6.8349,  3.9482, -6.1962, -4.4913],
        [-7.0927,  4.3511, -4.8801, -3.8614],
        [-6.8741, -1.7315, -6.2288,  1.4268],
        [-5.8874, -4.4913,  2.4758, -2.4365]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 136/289 [01:43<01:56,  1.32it/s]

Training loop 136
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.027087323367595673, logits - tensor([[-7.4616,  3.2956, -6.5727, -3.7011],
        [-6.4080, -2.9742, -6.1710,  3.4859],
        [-7.8724,  3.4591, -7.5774, -3.0912],
        [-7.5295,  3.6051, -6.5597, -4.0475],
        [-7.0152,  4.0625, -7.5524, -2.4445],
        [-5.5090, -3.3080,  2.2528, -2.3295],
        [-6.0556,  3.2270, -7.0027, -3.7844],
        [-7.2694, -2.0259, -5.9518,  2.6005]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 47%|████▋     | 137/289 [01:43<01:55,  1.32it/s]

Training loop 137
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0686708316206932, logits - tensor([[-6.4218, -4.0001,  2.1547, -2.2276],
        [-8.0160,  3.9816, -7.5819, -4.4550],
        [-5.8050,  3.7996, -6.0893, -3.7664],
        [-7.2635,  2.1051, -6.6049, -2.2166],
        [-6.9758, -4.2495,  1.3730, -0.5095],
        [-6.7996, -2.5516,  0.7003, -1.9151],
        [-6.9411,  1.3988, -6.2277, -2.4434],
        [-7.4983, -4.1844, -5.5969,  4.6375]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 138/289 [01:44<01:54,  1.32it/s]

Training loop 138
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.216621533036232, logits - tensor([[-5.7913,  1.3913, -4.8392, -2.6175],
        [-7.1265, -0.4387, -7.2087, -0.0989],
        [-7.5960, -1.6446, -6.9473,  1.2267],
        [-7.5878, -2.1660, -2.8986, -0.8952],
        [-7.0904,  2.7105, -6.9496, -2.4626],
        [-7.0659, -3.7280, -5.8358,  3.7043],
        [-6.4465,  3.3695, -6.6740, -3.5269],
        [-7.3526,  3.0489, -7.6014, -3.6472]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 139/289 [01:45<01:53,  1.32it/s]

Training loop 139
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1545717418193817, logits - tensor([[-6.3391, -4.8115, -4.9602,  4.3052],
        [-5.9997, -3.9533, -4.6684,  3.8754],
        [-7.8774,  0.3288, -6.5838, -0.4024],
        [-5.9807, -2.7353,  1.8155, -2.0591],
        [-7.9170, -0.7128, -7.7499,  0.9724],
        [-5.6265,  2.4026, -5.4194, -2.7034],
        [-6.1520,  2.4511, -6.5570, -3.7480],
        [-7.1490,  3.4024, -6.2856, -3.4543]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 48%|████▊     | 140/289 [01:46<01:52,  1.32it/s]

Training loop 140
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08258958905935287, logits - tensor([[-6.3673, -4.8659,  2.8194, -3.6169],
        [-6.7218,  3.7860, -7.0979, -2.5738],
        [-6.6254,  3.1378, -6.0106, -3.0180],
        [-8.6310,  1.3638, -6.8751, -0.8483],
        [-6.6558,  0.2249, -7.2573,  0.0491],
        [-7.1668,  3.2565, -8.0032, -2.5247],
        [-6.4122,  1.6618, -7.2348, -2.2661],
        [-7.2779,  3.0813, -6.7617, -3.5853]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 141/289 [01:47<01:51,  1.32it/s]

Training loop 141
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02817067876458168, logits - tensor([[-6.2464,  3.4299, -6.3679, -3.0115],
        [-6.7682,  2.8927, -6.4536, -2.7799],
        [-6.8819,  3.4137, -7.0867, -3.6922],
        [-6.8330,  3.4904, -6.4725, -1.9865],
        [-7.1489, -4.5827,  1.9830, -3.4909],
        [-7.2720,  3.0540, -7.5271, -1.9048],
        [-6.8347, -4.5892, -6.7541,  4.1718],
        [-6.5045,  2.8108, -7.1197, -3.2908]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 142/289 [01:47<01:51,  1.32it/s]

Training loop 142
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02405073493719101, logits - tensor([[-7.3915,  3.1526, -6.6884, -3.1323],
        [-6.4385,  3.9676, -6.2772, -4.2927],
        [-5.3751, -4.0865, -4.9892,  4.5740],
        [-6.0292, -4.0531,  2.2149, -2.5560],
        [-6.8603,  3.0486, -6.9168, -4.4141],
        [-6.7092,  3.3506, -6.6373, -2.3182],
        [-5.1323, -5.0942, -5.4321,  4.8741],
        [-7.1195,  2.7634, -6.5011, -1.9553]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 49%|████▉     | 143/289 [01:48<01:50,  1.33it/s]

Training loop 143
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0197207722812891, logits - tensor([[-7.4662,  3.8245, -7.7104, -4.1758],
        [-5.9602,  3.7023, -5.9029, -3.0413],
        [-5.6350,  2.9179, -5.8037, -3.3680],
        [-6.0283, -3.2740, -5.7110,  3.1414],
        [-6.2263,  3.0196, -6.2635, -3.7127],
        [-7.3256, -2.8448, -5.7689,  3.4003],
        [-5.9555, -4.2886, -4.8408,  4.3945],
        [-6.1998, -3.6945,  2.9780, -2.8317]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|████▉     | 144/289 [01:49<01:49,  1.33it/s]

Training loop 144
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.025633100420236588, logits - tensor([[-6.6533, -2.5855, -5.2739,  1.9503],
        [-6.2583, -3.5903, -5.3009,  3.3120],
        [-6.7889, -2.1617, -6.1655,  2.4865],
        [-6.8583, -2.9632, -5.1323,  3.3673],
        [-7.8734, -3.9485, -6.4922,  4.1059],
        [-5.7901,  3.2875, -5.3920, -3.5143],
        [-7.4590,  2.9718, -6.5778, -3.5188],
        [-7.3448,  3.6204, -7.2314, -3.3299]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 50%|█████     | 145/289 [01:50<01:48,  1.33it/s]

Training loop 145
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.015043571591377258, logits - tensor([[-7.5566,  4.4560, -7.5687, -3.7222],
        [-6.2360,  3.7074, -6.9737, -4.1628],
        [-6.7853,  3.7089, -6.2846, -3.6829],
        [-6.5178,  4.0325, -6.8940, -4.0396],
        [-6.7378, -2.7699, -4.6867,  2.5644],
        [-6.3471,  4.4809, -6.8449, -4.1616],
        [-7.3232,  3.9013, -6.5938, -3.6844],
        [-7.1567,  2.9038, -6.8736, -3.3080]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 146/289 [01:50<01:48,  1.32it/s]

Training loop 146
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04602521285414696, logits - tensor([[-7.4265,  2.8945, -5.9436, -2.2356],
        [-5.7095, -2.8608,  1.1385, -2.0111],
        [-7.7540,  1.9198, -7.0032, -2.1619],
        [-4.7681, -3.2338,  3.1031, -3.0184],
        [-6.6429,  2.3449, -5.6686, -2.9637],
        [-6.2126,  3.3677, -6.2566, -3.3143],
        [-4.9274, -4.8806, -5.8403,  4.4453],
        [-6.5219, -3.2967,  2.4095, -2.3341]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 147/289 [01:51<01:47,  1.33it/s]

Training loop 147
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2530662417411804, logits - tensor([[-7.1619,  0.4135, -6.8675, -1.5709],
        [-6.2138,  3.5488, -5.7875, -3.3194],
        [-6.5730,  2.6657, -5.8132, -2.3221],
        [-7.2891,  3.5437, -7.4929, -2.5450],
        [-6.5184,  3.2827, -6.7992, -4.2442],
        [-6.1750,  2.2913, -6.8417, -3.4888],
        [-4.9636, -4.1092,  2.9675, -2.8977],
        [-7.3175,  3.7394, -7.3786, -2.9341]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 51%|█████     | 148/289 [01:52<01:46,  1.33it/s]

Training loop 148
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03049815259873867, logits - tensor([[-6.7449,  2.6744, -7.2723, -2.9447],
        [-6.6698, -4.4235,  3.1398, -2.8081],
        [-6.0007,  2.5508, -7.0071, -2.5345],
        [-5.1484,  2.8372, -6.4843, -3.2143],
        [-7.1645,  4.8279, -6.2545, -3.9519],
        [-5.9795,  3.0975, -6.6095, -3.5598],
        [-7.2139,  2.1879, -7.3458, -1.6173],
        [-7.4485,  3.3224, -6.8055, -2.9325]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 149/289 [01:53<01:45,  1.33it/s]

Training loop 149
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02775287814438343, logits - tensor([[-6.0928,  2.6053, -6.7641, -3.7289],
        [-6.1974,  3.4146, -6.8479, -2.7966],
        [-6.0907, -3.6276,  2.0579, -2.1289],
        [-6.5448,  3.3345, -6.4769, -3.7712],
        [-6.2416, -4.0367,  2.7839, -2.5854],
        [-8.2194, -5.3575, -5.2143,  4.1796],
        [-5.3582, -4.6980, -5.9810,  3.6984],
        [-6.9119,  2.9206, -7.1136, -2.2477]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 150/289 [01:53<01:44,  1.33it/s]

Training loop 150
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022653181105852127, logits - tensor([[-7.2210, -5.0907, -5.1446,  4.9563],
        [-6.1120,  3.8704, -6.2887, -3.7872],
        [-7.9363,  2.1616, -7.3787, -2.4404],
        [-6.8469,  3.4080, -5.9313, -3.4037],
        [-7.0543,  2.1864, -6.0677, -2.3735],
        [-7.1226,  3.9089, -6.9050, -4.3288],
        [-5.9472,  3.1382, -6.5268, -3.3367],
        [-6.8946,  3.1311, -7.4020, -3.2973]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 52%|█████▏    | 151/289 [01:54<01:44,  1.33it/s]

Training loop 151
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.018153827637434006, logits - tensor([[-6.4625,  3.2722, -6.9233, -3.4857],
        [-5.1168, -3.7214,  2.2005, -3.5262],
        [-5.9133,  3.6002, -5.8687, -3.9592],
        [-6.9058, -4.4921, -6.3596,  4.8491],
        [-5.9715,  3.2419, -6.2773, -3.4425],
        [-6.9084, -5.2495, -5.1127,  4.5147],
        [-7.1587,  4.0470, -7.1490, -4.0951],
        [-7.9032,  2.5289, -8.8141, -2.7167]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 152/289 [01:55<01:43,  1.32it/s]

Training loop 152
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04672526568174362, logits - tensor([[-8.3560,  2.2426, -8.3261, -2.4029],
        [-7.5935, -2.7285, -4.6245,  2.9971],
        [-6.8682,  4.7945, -6.3409, -4.1943],
        [-7.3113,  3.1897, -7.5485, -2.7217],
        [-6.3602, -3.5322,  1.4838, -1.2767],
        [-6.3942, -4.0922,  1.9816, -1.7946],
        [-6.0468, -3.8158,  2.1594, -2.3938],
        [-6.9696,  3.3794, -7.3253, -4.1401]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 153/289 [01:56<01:43,  1.31it/s]

Training loop 153
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09363105893135071, logits - tensor([[-6.7879, -3.1667, -6.0514,  2.7872],
        [-6.7917,  3.4201, -7.2125, -2.3025],
        [-6.1484, -3.5766,  2.1534, -3.1935],
        [-6.4119, -2.9809, -4.6501,  2.6660],
        [-7.2049,  3.5360, -6.7019, -3.4195],
        [-6.7278, -4.5249, -6.2980,  3.3044],
        [-5.6196, -3.7431, -5.3606,  4.6771],
        [-7.6813,  0.9882, -7.2065, -0.5335]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 53%|█████▎    | 154/289 [01:56<01:42,  1.32it/s]

Training loop 154
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.013137664645910263, logits - tensor([[-5.5166, -4.8663, -4.8204,  3.5806],
        [-7.2424,  3.9182, -6.3988, -3.4277],
        [-6.7330,  4.0072, -7.1420, -3.9231],
        [-7.1722, -4.7931, -6.4230,  4.4894],
        [-6.0267,  3.3676, -6.4249, -3.0949],
        [-7.7396,  3.9467, -8.2690, -3.5925],
        [-6.3936,  3.1941, -6.7602, -3.3522],
        [-6.8263,  3.3950, -7.4206, -4.0949]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▎    | 155/289 [01:57<01:42,  1.31it/s]

Training loop 155
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03124685399234295, logits - tensor([[-7.7255,  2.4979, -7.7146, -2.2638],
        [-6.4561, -4.1437, -5.7594,  4.5693],
        [-5.4866,  3.5684, -5.9082, -3.9649],
        [-6.6566,  3.8673, -6.0453, -3.5764],
        [-6.5417,  2.3596, -7.2528, -2.3351],
        [-6.9799,  3.1787, -7.3561, -3.1743],
        [-6.5767,  3.0149, -6.4607, -3.4677],
        [-7.4271,  2.1274, -5.9809, -1.4045]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 156/289 [01:58<01:41,  1.31it/s]

Training loop 156
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03673993796110153, logits - tensor([[-7.1205,  3.9364, -5.6823, -4.0523],
        [-6.2804,  3.2812, -6.3051, -4.0248],
        [-7.5419,  1.4665, -7.1011, -1.5196],
        [-6.4720,  3.1198, -7.1068, -3.6691],
        [-6.2342, -3.3918,  0.9961, -2.0023],
        [-6.4674, -3.6746, -5.7120,  4.6349],
        [-7.7262,  4.0780, -8.0304, -3.9977],
        [-6.1994,  4.1343, -5.5631, -3.6393]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 54%|█████▍    | 157/289 [01:59<01:41,  1.31it/s]

Training loop 157
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23157645761966705, logits - tensor([[-5.2865, -4.8319, -4.8280,  4.2155],
        [-6.9521,  3.3529, -7.0275, -3.6858],
        [-5.7362, -2.8697, -6.0975,  2.4205],
        [-7.4884, -2.8069, -5.7437,  4.0223],
        [-6.3830,  3.5519, -6.6193, -3.4546],
        [-6.9699,  3.3160, -7.2585, -3.1993],
        [-5.5120, -3.3224, -5.7619,  4.6809],
        [-6.4450,  3.4049, -6.0858, -3.4696]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▍    | 158/289 [01:59<01:39,  1.31it/s]

Training loop 158
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03205126151442528, logits - tensor([[-7.1995,  3.1067, -7.6627, -3.6863],
        [-8.2014,  3.5270, -8.7482, -3.4789],
        [-5.9904, -4.3179,  1.9253, -3.1495],
        [-7.1669,  2.9052, -7.8044, -2.8057],
        [-8.5087,  1.7527, -8.5332, -2.0516],
        [-6.1740,  3.6022, -6.2664, -3.8505],
        [-6.3846, -4.4572,  3.3409, -3.7505],
        [-7.4198,  2.5853, -7.3595, -2.1432]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 159/289 [02:00<01:38,  1.32it/s]

Training loop 159
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03214767575263977, logits - tensor([[-6.6232,  3.0304, -7.2964, -2.5406],
        [-7.8208,  3.7896, -7.1222, -3.6372],
        [-6.9209,  2.7982, -7.1586, -3.1751],
        [-5.2781, -4.2034, -4.6436,  3.8347],
        [-6.0854, -3.4697,  2.0877, -3.3373],
        [-5.6646, -3.1813,  2.6617, -2.3183],
        [-6.9819, -5.5551, -5.3759,  5.6375],
        [-6.9090,  1.8478, -6.4013, -1.8268]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 55%|█████▌    | 160/289 [02:01<01:37,  1.33it/s]

Training loop 160
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03764166682958603, logits - tensor([[-7.0241,  1.4554, -6.3426, -1.1027],
        [-5.3756, -3.4432,  2.8961, -2.3247],
        [-7.6328,  3.0352, -6.8213, -2.0380],
        [-7.6276, -4.2388, -5.8975,  5.1008],
        [-6.9877,  3.7287, -6.8480, -4.1319],
        [-6.7104,  2.3769, -7.0977, -2.6052],
        [-6.1898,  3.2187, -5.8236, -3.9439],
        [-6.7208,  3.4465, -6.4790, -3.6184]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 161/289 [02:02<01:36,  1.33it/s]

Training loop 161
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.021714333444833755, logits - tensor([[-6.0261, -3.3947,  2.5643, -2.2647],
        [-7.3496,  3.5712, -6.1669, -3.4289],
        [-6.3044, -3.8649, -5.8019,  4.1115],
        [-6.6137,  3.7465, -7.3253, -3.6905],
        [-5.1599,  2.8866, -6.8268, -2.8710],
        [-6.7033,  3.6962, -5.8079, -4.1751],
        [-6.8283,  2.6203, -7.1986, -2.6326],
        [-5.1094, -4.2571, -5.2107,  5.2821]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▌    | 162/289 [02:02<01:36,  1.32it/s]

Training loop 162
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1295536309480667, logits - tensor([[-5.9985,  3.0878, -6.3839, -2.7947],
        [-7.5227, -3.8619, -6.0547,  3.3929],
        [-6.2254,  3.4935, -5.9795, -3.0779],
        [-7.0122,  1.3159, -6.0031, -1.5945],
        [-6.0192, -2.2020, -5.9219,  3.1183],
        [-7.0713,  3.8036, -5.6498, -3.2572],
        [-7.1147,  3.2182, -6.4753, -3.3325],
        [-5.6028,  2.2843, -6.3837, -1.6942]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 56%|█████▋    | 163/289 [02:03<01:35,  1.32it/s]

Training loop 163
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.029483690857887268, logits - tensor([[-7.3212,  2.2839, -7.1551, -2.4399],
        [-7.2155, -2.0868, -6.5066,  2.1795],
        [-7.0387,  3.7180, -7.2924, -4.7623],
        [-6.8753,  3.0532, -7.0988, -2.9898],
        [-6.0967, -4.1282, -5.4305,  3.8955],
        [-6.5934,  3.9041, -6.2704, -4.1393],
        [-5.3586,  2.7181, -5.6582, -3.1436],
        [-6.8626,  2.2826, -6.4586, -2.1697]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 164/289 [02:04<01:34,  1.32it/s]

Training loop 164
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26416364312171936, logits - tensor([[-7.3915, -3.7978, -5.0091,  4.0617],
        [-6.8172,  3.5021, -6.2293, -3.0795],
        [-6.2777, -3.4996, -5.2893,  3.3562],
        [-7.1313, -1.6866, -7.4367,  1.0531],
        [-7.4650, -3.4757, -5.9765,  3.5391],
        [-7.1876,  1.3853, -7.4556, -1.2589],
        [-7.4563,  3.2355, -7.4268, -3.1002],
        [-7.9568,  2.0006, -6.7872, -1.6070]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 165/289 [02:05<01:33,  1.32it/s]

Training loop 165
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.036181338131427765, logits - tensor([[-6.7917, -3.7854, -6.8226,  4.1404],
        [-6.1287, -3.4149,  2.3766, -2.5851],
        [-6.8628,  3.0564, -7.2727, -3.7480],
        [-6.6992,  1.8900, -6.2408, -1.7999],
        [-6.0061, -4.5958, -5.8624,  3.6647],
        [-6.4337, -3.9437,  1.9322, -2.2577],
        [-7.3224,  2.3488, -6.6902, -2.3066],
        [-6.1971,  3.6022, -5.7556, -3.2735]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 57%|█████▋    | 166/289 [02:05<01:32,  1.33it/s]

Training loop 166
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10723263770341873, logits - tensor([[-6.3043,  3.2837, -6.0027, -3.4081],
        [-7.1342, -4.2111, -6.8654,  3.8961],
        [-7.5412,  2.6300, -7.1570, -2.9973],
        [-7.0310, -3.8244,  2.0018, -2.0719],
        [-7.7351,  4.4499, -7.4483, -4.3727],
        [-4.9743, -2.0753,  0.7498, -1.2579],
        [-6.3112,  2.9588, -6.9432, -3.6416],
        [-6.5393, -4.5165, -5.1567,  3.1679]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 167/289 [02:06<01:31,  1.33it/s]

Training loop 167
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.044520534574985504, logits - tensor([[-6.9443,  3.5336, -6.3745, -3.4016],
        [-7.8763, -2.9051, -5.9139,  2.9736],
        [-7.4263,  2.1409, -6.6545, -1.1088],
        [-5.8193,  3.0871, -6.2353, -3.2460],
        [-5.3686, -4.2491, -4.2113,  4.6729],
        [-6.8335,  2.9700, -7.5258, -3.3600],
        [-5.0990,  3.4711, -6.8867, -3.1996],
        [-5.5273, -2.4671,  1.2939, -1.3320]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 168/289 [02:07<01:31,  1.32it/s]

Training loop 168
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.22630761563777924, logits - tensor([[-7.0340, -3.7690, -5.8253,  4.2570],
        [-6.7240,  2.3685, -7.5659, -2.5757],
        [-7.1098,  3.5648, -7.5671, -3.7304],
        [-6.0137,  3.8232, -5.9692, -4.1178],
        [-7.7318, -0.3118, -6.7435,  0.6037],
        [-6.3231, -2.8583,  2.0140, -2.2250],
        [-5.8857, -3.2523,  1.4089, -1.5201],
        [-6.5407, -4.1213, -4.6043,  3.5225]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 58%|█████▊    | 169/289 [02:08<01:30,  1.32it/s]

Training loop 169
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03354471176862717, logits - tensor([[-7.3074,  1.5929, -6.6616, -1.3528],
        [-7.4329,  2.6119, -6.6324, -2.5144],
        [-6.5810, -4.1017,  3.1379, -1.6258],
        [-5.7570, -4.5419, -5.5736,  3.8970],
        [-6.9060,  3.2540, -6.4815, -3.6623],
        [-5.7797,  3.7174, -6.1680, -4.0432],
        [-6.9568,  3.2628, -6.0249, -3.1390],
        [-5.4994, -4.4850, -5.0146,  4.5228]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 170/289 [02:08<01:29,  1.32it/s]

Training loop 170
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04893043637275696, logits - tensor([[-7.2424,  2.7246, -7.0393, -2.8495],
        [-7.4660,  1.5268, -6.4307, -2.0596],
        [-7.6978,  3.0480, -7.9585, -3.2853],
        [-6.6700,  2.9774, -6.9710, -2.7984],
        [-6.0822,  3.7198, -6.7978, -4.1687],
        [-6.2512, -4.8706, -4.8788,  5.0018],
        [-5.6866,  2.9780, -5.7701, -3.3964],
        [-8.5613,  0.5144, -7.3486, -1.0288]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 59%|█████▉    | 171/289 [02:09<01:28,  1.33it/s]

Training loop 171
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 60%|█████▉    | 172/289 [02:10<01:28,  1.33it/s]

loss - 0.0434134341776371, logits - tensor([[-6.5890,  3.4236, -6.3720, -3.0241],
        [-6.7591,  2.3330, -6.6935, -2.9413],
        [-7.6066,  0.7676, -6.9013, -0.4658],
        [-8.2345, -3.5342, -5.1005,  2.7740],
        [-6.7425, -3.7405, -6.2623,  5.2539],
        [-5.7051, -3.9152, -5.5779,  3.8783],
        [-5.8276, -4.1200, -5.2696,  3.2715],
        [-6.2936, -4.1657, -6.5175,  3.3220]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 172
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030820664018392563, logits - tensor([[-6.3617, -4.3411, -5.3247,  4.7816],
        [-6.0623, -4.6176,  3.2638, -3.3916],
        [-6.8842, -2.5683, -5.6512,  1.8983],
        [-6.8569,  2.0435, -5.7894, -1.8859],
        [-7.7157,  3.3965, -7.1571, -3.7196],
        [-6.4522, -3.5490,  3.1945, -2.7353],
        [-6.1412,  3.6849, -6.2560, -3.2317],
        [-7.5573,  2.4646, -8.1744, -2

 60%|█████▉    | 173/289 [02:11<01:27,  1.33it/s]

Training loop 173
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.16548767685890198, logits - tensor([[-5.4399,  2.2778, -5.4796, -2.4133],
        [-6.0897, -3.4514,  1.8420, -1.7488],
        [-7.4924,  2.3731, -6.8093, -3.0559],
        [-5.4392, -4.3435, -5.1575,  3.4024],
        [-6.9101, -3.4899,  1.8303, -2.2248],
        [-5.5834, -3.7150, -5.5031,  3.7995],
        [-7.4562,  3.4244, -6.9987, -4.1063],
        [-5.3579, -3.8688,  2.7023, -3.2370]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 60%|██████    | 174/289 [02:11<01:26,  1.33it/s]

Training loop 174
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04031314700841904, logits - tensor([[-6.5773, -4.7297,  2.7015, -3.4539],
        [-6.3609,  2.7504, -7.0961, -2.2780],
        [-5.9252, -3.6329,  2.7941, -2.5379],
        [-7.4345, -1.6679, -6.5896,  1.3952],
        [-8.3158,  2.3093, -8.3844, -2.7292],
        [-6.1163,  3.8507, -5.5128, -3.6683],
        [-5.8993, -4.4405,  3.3259, -3.5619],
        [-6.0427,  2.8160, -6.5271, -2.1446]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 175/289 [02:12<01:25,  1.33it/s]

Training loop 175
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.17381106317043304, logits - tensor([[-6.4552, -3.7886,  1.8677, -3.1375],
        [-6.7038, -3.5610, -5.3978,  3.2035],
        [-5.6122,  4.1724, -6.3282, -3.8707],
        [-7.3016,  3.1174, -7.2276, -2.6769],
        [-7.1139,  3.9629, -7.7029, -4.1624],
        [-6.0470,  2.2853, -6.0211, -2.5030],
        [-6.7983,  4.4609, -6.7454, -4.0116],
        [-6.1509,  3.0993, -6.0087, -3.1707]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 176/289 [02:13<01:25,  1.33it/s]

Training loop 176
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2030847668647766, logits - tensor([[-7.7693, -1.9090, -6.0488,  2.0919],
        [-6.8979,  2.9103, -6.3925, -2.0433],
        [-6.2679,  4.1353, -6.7475, -3.3924],
        [-6.9539, -1.2991, -5.9100,  1.5048],
        [-5.9316, -4.4186, -5.2715,  4.4949],
        [-6.1626,  3.7094, -7.8429, -3.6042],
        [-7.0061,  1.2291, -7.1355, -1.3251],
        [-6.7420,  4.2664, -6.6686, -4.0153]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 61%|██████    | 177/289 [02:14<01:24,  1.32it/s]

Training loop 177
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02206200361251831, logits - tensor([[-5.4308, -4.1406, -4.2928,  3.7423],
        [-6.0671,  3.8931, -6.7441, -4.3943],
        [-6.8748, -4.3146, -5.0920,  4.7828],
        [-6.3299,  4.2565, -5.6835, -4.1913],
        [-5.9665, -3.9500,  2.5448, -2.3285],
        [-5.0391, -4.6306, -4.9993,  3.7415],
        [-5.6047, -3.3061,  1.8992, -2.3873],
        [-6.2838, -4.1670, -5.3527,  3.7046]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 62%|██████▏   | 178/289 [02:14<01:24,  1.32it/s]

Training loop 178
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 62%|██████▏   | 179/289 [02:15<01:23,  1.32it/s]

loss - 0.028556901961565018, logits - tensor([[-6.2422, -4.3278,  2.9241, -1.8967],
        [-6.0546,  3.3672, -7.7235, -3.1269],
        [-6.7785, -4.2994, -5.1135,  4.0462],
        [-7.8898,  3.7015, -6.7634, -3.8489],
        [-6.4077, -4.0346,  2.5808, -2.6150],
        [-6.7048,  2.9621, -6.2267, -2.2702],
        [-7.4898,  2.3128, -6.8521, -2.2045],
        [-6.3917, -4.1183, -5.7701,  5.1449]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 179
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024797817692160606, logits - tensor([[-5.8483,  3.3671, -6.8320, -3.8193],
        [-7.7725,  3.1945, -7.4533, -3.5835],
        [-7.5703,  3.4937, -7.4013, -3.4666],
        [-6.3756,  3.5477, -6.5402, -2.7874],
        [-5.9636,  3.2921, -5.7307, -3.8469],
        [-5.0790, -3.6223,  2.7115, -2.4253],
        [-6.6741,  2.8313, -6.3792, -3.0046],
        [-7.4487,  2.9758, -7.0000, 

 62%|██████▏   | 180/289 [02:16<01:22,  1.32it/s]

Training loop 180
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026233283802866936, logits - tensor([[-8.4258,  3.1052, -6.1120, -2.6285],
        [-6.9302,  2.8614, -5.8821, -4.3190],
        [-6.5103,  3.1575, -6.8974, -3.7213],
        [-6.3329,  3.9870, -6.7826, -3.6966],
        [-6.6076,  3.6792, -7.0970, -3.6295],
        [-5.9386, -4.3425,  2.8461, -3.2813],
        [-5.9389, -2.7290, -5.0603,  2.7561],
        [-5.5475, -2.9097,  2.3796, -2.2999]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 181/289 [02:17<01:21,  1.32it/s]

Training loop 181
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.016322888433933258, logits - tensor([[-5.9777,  3.1400, -6.1746, -4.9848],
        [-6.2709,  3.9277, -6.4686, -3.9611],
        [-6.6737,  3.2768, -6.7048, -3.3214],
        [-5.0328,  3.4650, -6.1829, -3.9569],
        [-6.9790,  2.9829, -6.7087, -3.2355],
        [-7.3954,  4.3956, -6.1374, -4.5417],
        [-7.2656,  2.7047, -7.7009, -3.7616],
        [-7.3498,  2.7149, -6.9745, -3.7567]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 182/289 [02:18<01:21,  1.32it/s]

Training loop 182
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024722995236516, logits - tensor([[-6.6067,  3.3101, -6.7756, -3.4269],
        [-6.9726, -3.9251,  2.5101, -1.9663],
        [-6.7962,  2.7369, -6.1416, -2.5377],
        [-6.6029, -3.4312, -3.7446,  3.5946],
        [-6.6475,  2.9808, -6.6238, -3.7638],
        [-6.0118,  3.5899, -6.1608, -3.6183],
        [-7.6860, -3.0235, -5.4411,  3.2102],
        [-7.3167,  3.7831, -6.9004, -4.1191]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 63%|██████▎   | 183/289 [02:18<01:20,  1.32it/s]

Training loop 183
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13709434866905212, logits - tensor([[-5.7155,  3.6007, -6.1844, -2.6434],
        [-6.2847, -3.6503,  1.7086, -2.6411],
        [-6.4223,  1.5092, -6.9167, -1.7951],
        [-7.6484,  3.3407, -6.5330, -3.6566],
        [-7.1799,  4.2831, -7.1791, -4.7352],
        [-7.2456,  2.5936, -6.3010, -2.6253],
        [-6.2779, -3.6014, -4.8534,  3.5480],
        [-6.5937,  3.2785, -7.5459, -3.5484]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▎   | 184/289 [02:19<01:19,  1.32it/s]

Training loop 184
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2303292155265808, logits - tensor([[-6.1941,  4.5424, -6.1880, -3.8286],
        [-6.1669, -3.4366,  2.2566, -2.0120],
        [-6.1484, -4.7167,  3.1204, -2.8685],
        [-7.8385,  2.8817, -7.2942, -2.9541],
        [-6.3115,  3.4846, -5.9721, -3.4176],
        [-7.1521,  1.9884, -7.0309, -3.1728],
        [-5.4517, -4.0815,  2.5247, -3.3524],
        [-8.5555, -3.0730, -6.5806,  3.3345]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 185/289 [02:20<01:18,  1.32it/s]

Training loop 185
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01955329068005085, logits - tensor([[-6.4453,  3.1879, -6.2491, -2.9909],
        [-5.9928,  3.3354, -6.1125, -4.0690],
        [-7.3951,  4.1364, -7.0560, -3.8913],
        [-6.4576, -4.3470,  4.0210, -3.0518],
        [-6.2443,  2.4860, -6.3536, -2.0014],
        [-6.6795, -4.0911, -4.9944,  3.9427],
        [-6.7519,  4.1032, -6.6109, -4.2042],
        [-6.8499,  3.3180, -5.9884, -3.3922]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 64%|██████▍   | 186/289 [02:21<01:17,  1.32it/s]

Training loop 186
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23707729578018188, logits - tensor([[-7.0327,  1.9044, -6.4132, -2.1692],
        [-6.0687,  3.0293, -6.0220, -3.7714],
        [-7.2060,  3.6752, -7.6134, -3.7048],
        [-6.7446,  3.3302, -7.2305, -3.4032],
        [-6.6695,  2.9040, -6.3756, -2.4536],
        [-6.9348,  3.8497, -7.0934, -3.7800],
        [-6.5966,  3.7801, -6.6150, -3.4943],
        [-6.9765,  3.4915, -7.1634, -2.6246]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▍   | 187/289 [02:21<01:17,  1.32it/s]

Training loop 187
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.11671089380979538, logits - tensor([[-6.8436,  1.7985, -6.4589, -2.5404],
        [-7.0570, -2.4355,  0.7482, -1.4848],
        [-5.9902,  4.1138, -6.1330, -4.4725],
        [-5.9209, -3.5316,  2.1769, -2.5802],
        [-5.4119, -4.1062,  2.9869, -3.6856],
        [-6.6665,  2.8330, -7.3581, -3.1127],
        [-7.0899,  3.6012, -5.6160, -4.3164],
        [-6.4077, -4.4095,  3.1496, -2.9368]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 188/289 [02:22<01:16,  1.32it/s]

Training loop 188
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3473069369792938, logits - tensor([[-5.8660,  3.1443, -6.2647, -2.5670],
        [-7.2715, -4.3043, -5.3779,  4.9510],
        [-6.0415, -3.7538, -5.1481,  4.5249],
        [-6.6019,  3.1391, -7.6315, -3.4885],
        [-6.4250,  2.4127, -5.3656, -2.3929],
        [-6.9981, -3.0322, -5.1844,  3.7345],
        [-8.1093,  4.1935, -7.5033, -3.8156],
        [-6.0176,  3.9095, -6.6130, -4.1346]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 65%|██████▌   | 189/289 [02:23<01:15,  1.32it/s]

Training loop 189
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02206231653690338, logits - tensor([[-6.6883,  2.8373, -6.9399, -3.5709],
        [-6.3271,  2.9030, -6.6190, -4.3641],
        [-7.2944,  3.1819, -7.2123, -4.6743],
        [-7.4158, -4.3128, -5.4099,  4.3677],
        [-6.6328, -4.7438, -5.6935,  4.9241],
        [-6.5017,  3.9573, -6.5421, -2.8570],
        [-8.4232,  1.5742, -7.8830, -1.8467],
        [-6.5983,  4.4723, -6.0515, -3.9620]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 190/289 [02:24<01:14,  1.33it/s]

Training loop 190
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07642082124948502, logits - tensor([[-7.0074, -3.1073, -6.9784,  2.9995],
        [-7.1996,  3.5700, -6.9344, -3.4766],
        [-5.6119, -3.2541,  2.0077, -1.5677],
        [-6.7108,  2.3715, -6.5697, -3.2474],
        [-6.4409, -1.6303, -1.3225, -0.8333],
        [-8.2814,  3.5341, -8.1951, -3.9220],
        [-6.7461,  3.9295, -7.2712, -2.9622],
        [-6.6757,  3.1667, -6.5548, -3.5297]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▌   | 191/289 [02:24<01:14,  1.32it/s]

Training loop 191
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.049240175634622574, logits - tensor([[-7.0577e+00, -3.5934e+00, -6.2807e+00,  3.6195e+00],
        [-6.7064e+00,  2.4177e+00, -7.0639e+00, -3.7319e+00],
        [-6.0933e+00, -2.1522e+00, -4.9361e+00,  2.8562e+00],
        [-6.9413e+00,  4.4566e+00, -5.8378e+00, -4.2708e+00],
        [-5.9539e+00, -4.2635e+00,  2.9606e+00, -2.7441e+00],
        [-7.6853e+00,  3.7130e+00, -6.1022e+00, -3.8120e+00],
        [-6.5831e+00, -4.1513e+00,  2.9364e+00, -3.3946e+00],
        [-6.2267e+00, -2.5012e+00,  3.5212e-03, -1.8084e+00]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 66%|██████▋   | 192/289 [02:25<01:13,  1.32it/s]

Training loop 192
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2538009583950043, logits - tensor([[-7.6318, -2.1041, -6.5997,  1.1424],
        [-6.3031,  3.4286, -6.1329, -3.0070],
        [-6.9056, -4.1699, -6.3229,  3.8695],
        [-7.9842, -3.0244, -6.7800,  3.5210],
        [-5.3412, -2.9250, -4.9658,  3.4399],
        [-5.3903, -4.1466, -5.7608,  3.8025],
        [-6.4355, -2.8312, -6.1572,  2.6845],
        [-6.1166, -4.4554,  3.7487, -3.4415]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 193/289 [02:26<01:12,  1.32it/s]

Training loop 193
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1782764345407486, logits - tensor([[-7.0014,  0.0970, -3.8422, -0.8967],
        [-5.1072, -2.6429,  0.4899, -1.4472],
        [-7.4282, -1.4318, -6.6535,  1.3189],
        [-6.5456,  3.2092, -7.5172, -3.5008],
        [-6.5063, -3.6627, -5.6383,  3.4766],
        [-6.9659,  4.1217, -7.0814, -4.8521],
        [-6.2826,  3.4102, -6.2548, -4.4357],
        [-6.4339, -4.4216,  2.6050, -2.8863]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 194/289 [02:27<01:11,  1.32it/s]

Training loop 194
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03438904881477356, logits - tensor([[-6.9183,  2.9422, -6.5602, -3.5632],
        [-7.1249,  2.7077, -7.2057, -4.1172],
        [-7.1089,  2.4751, -6.3105, -2.5785],
        [-8.1328, -4.4890, -6.0030,  4.4441],
        [-6.6805,  3.2783, -5.9684, -3.5216],
        [-6.8302,  2.8316, -6.8680, -2.2963],
        [-6.9834,  1.6744, -6.8548, -1.1556],
        [-5.5459, -3.1285, -6.3352,  3.3935]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 67%|██████▋   | 195/289 [02:27<01:11,  1.32it/s]

Training loop 195
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03267868608236313, logits - tensor([[-6.2045, -4.2012,  2.4408, -3.2194],
        [-4.2593, -4.0771,  2.4530, -2.7128],
        [-7.0087, -2.0935, -6.3229,  2.5928],
        [-6.0727,  4.0614, -7.0529, -3.7156],
        [-7.2079,  3.2155, -7.0838, -2.1758],
        [-7.2595,  3.9130, -6.2666, -3.0745],
        [-6.9800,  2.2135, -6.8853, -1.8710],
        [-7.7626, -4.2416, -5.7092,  4.5789]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 68%|██████▊   | 196/289 [02:28<01:10,  1.31it/s]

Training loop 196
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 68%|██████▊   | 197/289 [02:29<01:10,  1.31it/s]

loss - 0.018410351127386093, logits - tensor([[-6.2541,  3.9068, -5.9565, -3.7630],
        [-5.4426, -3.0242, -6.0838,  3.8011],
        [-8.3855,  2.7197, -7.6756, -3.2400],
        [-6.4396,  4.0130, -6.4495, -3.5794],
        [-6.5382,  3.3842, -6.5278, -3.1774],
        [-6.1378,  3.1200, -6.6592, -3.6726],
        [-6.2232,  3.1658, -7.3992, -3.3148],
        [-6.4131,  2.9287, -7.0630, -3.4557]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 197
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023636359721422195, logits - tensor([[-7.1432,  3.6749, -7.8371, -4.2715],
        [-6.5016, -1.3804, -4.9736,  1.9486],
        [-5.7293, -4.3839, -5.2899,  4.4585],
        [-7.7330,  3.9098, -7.6598, -3.8979],
        [-6.4074,  2.6795, -6.1705, -3.6281],
        [-5.7932,  3.9109, -6.3088, -4.2715],
        [-6.9897,  3.2250, -7.0057, -4.1215],
        [-6.7935,  2.7529, -6.7844, 

 69%|██████▊   | 198/289 [02:30<01:09,  1.31it/s]

Training loop 198
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.26504144072532654, logits - tensor([[-5.4049, -3.6842,  2.6384, -2.3823],
        [-6.1491, -3.7481, -4.7982,  4.0845],
        [-6.7478,  2.7442, -6.1869, -2.9041],
        [-5.4625, -3.6688,  2.4148, -2.1028],
        [-6.0229, -4.6507,  2.9657, -2.4850],
        [-4.9904, -4.4630, -5.0510,  4.5930],
        [-5.6996,  4.2855, -6.3442, -3.4889],
        [-6.6949,  3.7255, -6.2587, -3.6324]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 199/289 [02:30<01:08,  1.31it/s]

Training loop 199
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03351970016956329, logits - tensor([[-5.7201,  3.1313, -5.4106, -3.1814],
        [-5.9124,  3.3894, -5.8568, -3.6546],
        [-6.7795,  3.1072, -6.3711, -3.2568],
        [-6.1128,  3.1740, -6.5108, -3.0941],
        [-6.1077, -4.0655,  3.2759, -2.4099],
        [-8.0674,  1.6902, -7.5160, -1.4457],
        [-7.6985, -2.5974, -4.6372,  3.3042],
        [-5.9979,  3.5855, -5.9839, -2.6324]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 69%|██████▉   | 200/289 [02:31<01:07,  1.32it/s]

Training loop 200
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1335161030292511, logits - tensor([[-7.2093,  3.1221, -8.2515, -4.0466],
        [-5.5393, -3.6281, -5.4569,  3.3688],
        [-7.1932,  3.3525, -7.1888, -3.2835],
        [-6.1228,  1.5626, -5.9348, -1.8706],
        [-6.6660,  3.1145, -7.0307, -3.1369],
        [-6.3577,  3.0219, -6.9002, -3.5820],
        [-6.5061, -3.7592, -5.8812,  3.5344],
        [-7.1625,  4.2732, -7.6642, -2.7565]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 201/289 [02:32<01:06,  1.32it/s]

Training loop 201
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.020345190539956093, logits - tensor([[-7.9446,  2.6835, -7.1297, -3.4098],
        [-5.7704,  3.8594, -5.7553, -3.4056],
        [-7.4679,  3.5587, -6.8592, -2.1192],
        [-6.1548,  3.3317, -5.9802, -3.5018],
        [-6.1345,  3.3517, -6.1574, -4.0098],
        [-6.3130,  3.1587, -6.4670, -2.9279],
        [-7.4920,  3.8961, -7.4754, -2.5512],
        [-6.6387, -4.0595, -6.1441,  4.7955]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|██████▉   | 202/289 [02:33<01:06,  1.32it/s]

Training loop 202
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02002790942788124, logits - tensor([[-4.4391, -4.7206, -4.5360,  3.9676],
        [-7.0168,  2.4485, -6.9523, -3.3166],
        [-7.2076,  3.0136, -6.1789, -3.0806],
        [-6.2916,  4.2748, -6.2012, -3.3567],
        [-5.1830, -3.8824, -5.4442,  3.7463],
        [-4.8733,  3.5694, -5.5718, -3.6021],
        [-6.2036,  3.1891, -5.2897, -3.4413],
        [-6.8106, -2.4718, -5.9045,  3.2162]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 70%|███████   | 203/289 [02:33<01:05,  1.31it/s]

Training loop 203
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03218046575784683, logits - tensor([[-6.7443,  2.3631, -6.5371, -2.9263],
        [-7.4720,  2.1912, -7.8487, -2.5959],
        [-7.5042,  4.0116, -7.2565, -4.2523],
        [-7.2070,  2.7937, -7.0698, -3.1661],
        [-7.4265,  2.7525, -7.3814, -3.2728],
        [-7.1078,  1.5797, -7.3383, -1.4430],
        [-6.2676,  4.3409, -6.4053, -3.8254],
        [-7.2737,  3.8488, -8.0296, -4.6772]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 204/289 [02:34<01:04,  1.32it/s]

Training loop 204
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10809376835823059, logits - tensor([[-5.8837, -4.1724,  1.9544, -2.3601],
        [-6.6506, -3.7941, -4.9662,  3.4619],
        [-6.5537,  3.6958, -6.9320, -3.1631],
        [-6.3533,  3.2624, -6.2599, -4.0186],
        [-7.5695,  0.1031, -7.5353,  0.5796],
        [-6.0516, -3.3444,  0.2674, -1.0062],
        [-6.5191,  3.8150, -6.1367, -3.5419],
        [-8.4338,  1.8734, -7.2580, -1.2753]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████   | 205/289 [02:35<01:03,  1.32it/s]

Training loop 205
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04425617679953575, logits - tensor([[-5.9832,  2.8045, -7.0069, -3.1251],
        [-7.3436,  3.0384, -7.3776, -3.2770],
        [-6.0695,  2.8646, -6.4243, -3.4123],
        [-7.0411, -3.6781,  1.7686, -2.0976],
        [-7.1149, -3.1192, -6.1962,  3.8288],
        [-6.0612, -4.4633,  2.8642, -2.7690],
        [-6.2608,  2.6892, -6.4675, -2.7027],
        [-6.8879,  1.4810, -5.9526, -1.0707]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 71%|███████▏  | 206/289 [02:36<01:02,  1.32it/s]

Training loop 206
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03312758356332779, logits - tensor([[-5.3119,  3.1770, -6.7142, -3.8372],
        [-6.0827,  3.0228, -5.3825, -2.5806],
        [-6.9576,  3.6193, -7.1397, -3.1786],
        [-5.7450, -3.8672,  2.9884, -2.5815],
        [-5.3957,  3.7505, -7.0466, -3.6050],
        [-5.9140, -2.6702,  0.9350, -2.2834],
        [-5.6615,  3.5162, -5.7497, -3.6994],
        [-5.4353, -4.1409, -5.6623,  4.4077]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 207/289 [02:36<01:01,  1.32it/s]

Training loop 207
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026713190600275993, logits - tensor([[-6.3693, -2.9338, -6.0570,  3.4015],
        [-6.4856,  3.4963, -7.3887, -3.3530],
        [-6.6118,  2.4130, -6.1987, -2.7128],
        [-7.4705,  3.9802, -7.6316, -3.2302],
        [-7.0483, -3.4685, -5.9141,  3.8431],
        [-7.1833,  3.3292, -6.5146, -3.1914],
        [-7.0575,  3.7954, -6.9180, -4.5782],
        [-7.5285,  1.5884, -7.3204, -1.9460]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 208/289 [02:37<01:01,  1.32it/s]

Training loop 208
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03179414942860603, logits - tensor([[-6.6767,  4.1744, -6.5471, -3.6518],
        [-6.3556,  3.8940, -6.8505, -4.1010],
        [-7.6849, -2.3050, -6.1195,  2.8460],
        [-5.4221,  2.8999, -6.1581, -3.2359],
        [-5.8617, -3.3068,  2.1826, -1.0491],
        [-6.2703, -4.6822,  3.2683, -3.3617],
        [-6.4712,  3.5167, -6.5316, -3.4385],
        [-6.5260,  2.7922, -7.1546, -3.5798]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 72%|███████▏  | 209/289 [02:38<01:00,  1.32it/s]

Training loop 209
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19152352213859558, logits - tensor([[-7.7359,  2.6833, -7.0603, -2.4564],
        [-6.8703,  2.6640, -5.5242, -2.2941],
        [-7.3206, -2.1414, -6.7521,  2.8702],
        [-6.4572,  4.7574, -7.3358, -3.7047],
        [-7.3993, -1.7193, -5.5547,  1.5352],
        [-5.4424, -2.7740, -5.7390,  3.4825],
        [-6.7973,  3.4673, -7.0973, -4.1716],
        [-7.1569,  3.1156, -7.4124, -3.1887]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 210/289 [02:39<00:59,  1.32it/s]

Training loop 210
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.029450558125972748, logits - tensor([[-6.8369,  3.0988, -6.0519, -3.7947],
        [-6.1813, -3.0150, -5.6018,  3.0982],
        [-6.1549, -3.9054,  1.6655, -1.6158],
        [-5.7510, -4.1774,  2.8910, -3.6332],
        [-6.6153,  2.6183, -7.1968, -2.9814],
        [-7.3277,  3.8155, -7.4553, -3.7786],
        [-6.2329,  3.5158, -6.9069, -4.5561],
        [-5.3311,  3.8754, -5.7198, -2.7677]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 211/289 [02:39<00:58,  1.32it/s]

Training loop 211
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02486208826303482, logits - tensor([[-6.5179, -4.4561,  3.6228, -3.4940],
        [-6.8948,  3.1089, -6.7084, -1.9024],
        [-4.9531, -3.3924,  2.5790, -2.6417],
        [-6.6549, -5.2009,  3.3175, -3.1832],
        [-7.3819,  3.5903, -7.1917, -3.4458],
        [-7.0671, -2.6431, -6.0386,  2.7754],
        [-5.7466, -4.0235, -5.3663,  4.0569],
        [-6.3444,  3.9125, -6.2428, -4.0289]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 73%|███████▎  | 212/289 [02:40<00:58,  1.33it/s]

Training loop 212
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.14997252821922302, logits - tensor([[-6.6223, -3.3151, -6.8061,  3.2899],
        [-7.2111,  0.6323, -6.8806, -0.4029],
        [-6.8412,  3.7055, -6.5793, -3.1613],
        [-6.2434, -2.3278,  0.4674, -1.0927],
        [-6.0265,  3.2802, -6.4954, -3.9038],
        [-7.1504, -2.4265,  0.2374, -1.5325],
        [-7.0039,  3.3913, -6.4549, -2.3805],
        [-7.0245,  4.1344, -7.0621, -4.5227]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▎  | 213/289 [02:41<00:57,  1.32it/s]

Training loop 213
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03225379437208176, logits - tensor([[-6.9840,  4.4303, -6.8267, -3.5880],
        [-5.8436,  3.8396, -6.5639, -3.4777],
        [-7.5327,  1.4892, -6.6039, -1.0069],
        [-5.4955, -4.7055,  3.1660, -2.6474],
        [-7.2667,  2.7236, -7.1242, -2.7409],
        [-7.5790, -4.1888, -7.0779,  4.1431],
        [-6.2652, -5.1307,  3.4350, -3.2831],
        [-6.7416,  3.5064, -6.5297, -3.5913]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 74%|███████▍  | 214/289 [02:42<00:56,  1.33it/s]

Training loop 214
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 74%|███████▍  | 215/289 [02:43<00:55,  1.32it/s]

loss - 0.018869437277317047, logits - tensor([[-7.1177,  2.5219, -6.6288, -3.0302],
        [-7.3261, -3.0203, -5.8915,  3.5414],
        [-5.2956,  3.7365, -5.3667, -3.8738],
        [-5.8085, -4.4832, -5.4346,  4.1364],
        [-5.7271,  3.0947, -6.1608, -3.3040],
        [-7.0913,  3.6078, -7.4969, -4.0620],
        [-6.5700,  2.9100, -5.9415, -2.8973],
        [-7.2753, -4.9391,  3.3644, -3.6406]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 215
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.043225035071372986, logits - tensor([[-5.3772, -4.2633,  2.6543, -2.7497],
        [-7.2982,  3.2838, -7.8107, -3.4758],
        [-6.8338,  1.8499, -6.7209, -1.9093],
        [-6.2439, -3.6966, -5.7117,  3.2617],
        [-7.7440, -0.9726, -7.4064,  1.4398],
        [-6.0877, -4.2841,  2.5339, -2.9931],
        [-6.0832, -4.0772,  3.3721, -3.2418],
        [-7.3802,  3.7034, -7.4028, 

 75%|███████▍  | 216/289 [02:43<00:55,  1.32it/s]

Training loop 216
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 75%|███████▌  | 217/289 [02:44<00:54,  1.32it/s]

loss - 0.017984431236982346, logits - tensor([[-5.5578, -4.9625, -5.9925,  4.8397],
        [-6.6270,  2.6428, -6.7515, -3.6022],
        [-6.7427,  3.0564, -5.9500, -3.0099],
        [-6.6031,  3.0317, -7.1794, -3.3741],
        [-4.9632, -3.7048, -5.5231,  3.9847],
        [-7.2319, -3.2891, -6.8667,  4.7291],
        [-5.4737, -4.2446,  3.0753, -3.2280],
        [-8.3756,  3.3432, -7.8434, -3.2850]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 217
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21032148599624634, logits - tensor([[-6.6994,  2.3767, -7.1380, -2.8108],
        [-6.4957,  1.4482, -6.0512, -1.2896],
        [-5.3164, -2.9519,  1.9493, -3.0423],
        [-7.0072, -1.1856, -6.8581,  2.0611],
        [-5.4825,  2.6594, -5.7171, -2.3201],
        [-6.6607, -4.7341,  3.6697, -3.3940],
        [-6.1626, -4.6235,  3.8925, -3.5820],
        [-6.8530,  2.5440, -7.5328, -

 75%|███████▌  | 218/289 [02:45<00:53,  1.32it/s]

Training loop 218
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024398360401391983, logits - tensor([[-6.1743, -4.6850,  3.4427, -2.3817],
        [-6.5382, -3.0144, -5.1674,  2.7095],
        [-7.3737,  4.1108, -7.4115, -3.8050],
        [-5.4694, -3.6718,  2.0651, -2.9788],
        [-5.5109,  3.0079, -4.6987, -3.2449],
        [-6.2135,  3.0812, -5.1738, -3.0239],
        [-6.2853,  3.0893, -6.3845, -4.1557],
        [-5.6370, -4.4041, -5.3332,  4.7483]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 219/289 [02:46<00:53,  1.31it/s]

Training loop 219
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023761795833706856, logits - tensor([[-6.6994, -2.3940, -6.0005,  3.0672],
        [-6.7057,  3.5057, -7.0563, -3.3311],
        [-6.8518, -4.8263,  3.9120, -3.2257],
        [-6.1484, -4.9628, -5.7459,  5.2412],
        [-5.1704,  2.8447, -4.8972, -3.4993],
        [-7.4417,  3.0371, -7.5327, -3.4706],
        [-6.0508,  2.2143, -6.5481, -2.1774],
        [-5.8994, -3.9530,  3.4554, -3.6363]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▌  | 220/289 [02:46<00:52,  1.31it/s]

Training loop 220
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04363299533724785, logits - tensor([[-6.9352,  3.5975, -7.4053, -4.6782],
        [-6.8478, -4.4812,  2.7413, -3.9464],
        [-7.8552, -3.3923, -4.2296,  2.6757],
        [-7.0000,  0.3376, -5.8740, -1.1929],
        [-6.2841,  2.5841, -6.6395, -2.5018],
        [-5.9950,  2.9434, -6.6109, -3.9081],
        [-5.7656,  3.6248, -6.4063, -3.9744],
        [-5.9085, -3.5444, -5.6009,  3.5448]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 76%|███████▋  | 221/289 [02:47<00:51,  1.32it/s]

Training loop 221
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.045617200434207916, logits - tensor([[-6.0717,  3.6759, -7.6201, -3.3376],
        [-6.7300, -2.8441,  0.3977, -1.4687],
        [-5.9157,  3.8539, -6.5950, -3.0392],
        [-7.5577,  2.8135, -7.8065, -2.8940],
        [-7.0489, -2.1267, -5.8636,  1.5634],
        [-6.0868,  3.9526, -6.5817, -4.0531],
        [-6.7666, -4.0358, -5.3781,  4.2757],
        [-7.1376,  3.7299, -7.2769, -3.6916]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 222/289 [02:48<00:50,  1.32it/s]

Training loop 222
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0108915064483881, logits - tensor([[-7.1742,  3.5084, -6.7120, -3.4916],
        [-6.9213,  3.6129, -6.1636, -4.1623],
        [-6.2285,  3.6273, -6.1796, -4.0436],
        [-6.5534,  4.0206, -6.5996, -4.9153],
        [-5.5499, -4.5179, -5.8739,  5.2738],
        [-6.2591,  3.7382, -5.9330, -3.7760],
        [-7.0618, -4.5151, -4.9855,  3.3055],
        [-6.6107, -4.4414, -5.2719,  3.8914]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 77%|███████▋  | 223/289 [02:49<00:50,  1.32it/s]

Training loop 223
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3535395562648773, logits - tensor([[-6.5450, -4.6391,  2.9616, -2.8666],
        [-6.9817,  2.6137, -6.5959, -2.9830],
        [-6.4039, -2.1407, -6.0301,  1.7699],
        [-6.5680,  3.1334, -6.2861, -2.6604],
        [-6.1731,  3.5757, -6.7038, -2.7691],
        [-6.5144, -2.5451, -6.0357,  1.5068],
        [-6.8223,  4.0582, -6.2656, -4.4480],
        [-7.1225, -4.3329, -5.5231,  4.4436]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 224/289 [02:49<00:49,  1.32it/s]

Training loop 224
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.016342956572771072, logits - tensor([[-7.1924,  3.0742, -7.9102, -3.4063],
        [-7.0489,  3.3547, -6.8794, -4.3121],
        [-6.0390,  2.6898, -5.9185, -3.4273],
        [-6.7170,  3.8415, -6.4172, -3.0859],
        [-6.2250,  3.9282, -6.5665, -4.1172],
        [-6.4829,  3.7061, -5.8930, -3.1411],
        [-5.8494, -4.1247, -4.6302,  3.3223],
        [-6.2599,  3.5966, -6.0379, -4.0242]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 225/289 [02:50<00:48,  1.32it/s]

Training loop 225
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.08940500766038895, logits - tensor([[-6.3319, -3.6415, -6.5786,  3.9865],
        [-6.3482,  2.9963, -6.8943, -3.6320],
        [-6.4927,  3.7199, -6.5340, -3.0820],
        [-8.1855,  3.7263, -7.3993, -3.4902],
        [-8.0957, -0.1340, -6.8549,  1.4157],
        [-6.2847, -3.5732, -6.4554,  3.4019],
        [-5.4829,  3.0807, -7.1100, -3.0805],
        [-7.6939,  3.8923, -7.0331, -3.4112]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 78%|███████▊  | 226/289 [02:51<00:47,  1.32it/s]

Training loop 226
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.014731417410075665, logits - tensor([[-6.0527,  2.8547, -6.3971, -3.9709],
        [-7.0975,  3.4926, -7.4801, -3.4180],
        [-6.2871,  4.1831, -6.3235, -4.0819],
        [-7.1816,  3.5943, -6.6281, -4.1463],
        [-6.5589,  3.7029, -6.2551, -3.6708],
        [-7.2339, -3.2711, -6.4710,  3.1540],
        [-6.2665,  4.0630, -5.4046, -4.6055],
        [-6.7114,  2.8823, -7.2533, -3.6861]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▊  | 227/289 [02:52<00:47,  1.32it/s]

Training loop 227
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.015625659376382828, logits - tensor([[-6.8838,  4.3540, -6.6635, -4.1621],
        [-6.0495,  3.8134, -6.5243, -3.2843],
        [-6.8952,  3.1486, -7.2182, -3.7939],
        [-5.3936, -3.8460, -5.3934,  3.6691],
        [-6.5171,  3.6580, -6.0086, -2.9057],
        [-6.8797, -3.4840, -7.5141,  2.8558],
        [-6.9664,  3.7238, -7.1522, -3.6205],
        [-6.4851, -3.4451, -6.1774,  3.5517]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 228/289 [02:52<00:46,  1.32it/s]

Training loop 228
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.15609966218471527, logits - tensor([[-5.9334, -2.9676,  0.4758, -1.5906],
        [-5.3490,  4.1109, -5.8335, -3.9468],
        [-5.9174,  3.5118, -5.7210, -3.1989],
        [-7.0982, -2.4928, -5.8708,  2.9365],
        [-7.7454, -3.0832, -6.6463,  3.1450],
        [-6.9290,  3.0850, -6.7024, -3.0090],
        [-5.8028, -4.8731, -5.3607,  4.3125],
        [-8.0440,  1.2253, -7.4017, -2.1885]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 79%|███████▉  | 229/289 [02:53<00:45,  1.32it/s]

Training loop 229
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02396707981824875, logits - tensor([[-6.4418,  3.9915, -6.1453, -3.1213],
        [-6.8558,  3.0425, -6.5869, -3.3484],
        [-6.7027, -4.0710, -5.7200,  4.1976],
        [-5.7627, -4.4745,  2.3028, -2.9574],
        [-6.7903,  2.6302, -7.1526, -1.9331],
        [-7.1728, -4.8207,  3.2900, -3.6426],
        [-7.0102,  3.0590, -7.3579, -2.8617],
        [-6.2085,  4.5860, -6.7886, -3.5875]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 230/289 [02:54<00:44,  1.32it/s]

Training loop 230
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023342004045844078, logits - tensor([[-7.2550,  2.8743, -7.7216, -3.2778],
        [-7.9540,  3.0771, -6.8507, -2.3115],
        [-6.2618, -3.9243, -4.1616,  3.3049],
        [-6.9849, -4.5458, -5.4137,  3.9239],
        [-6.4384, -4.1562,  3.2730, -3.8173],
        [-6.5717,  2.5347, -7.2964, -3.5615],
        [-6.5187, -3.3540, -6.3322,  3.3813],
        [-6.3946,  2.2453, -6.1095, -3.0478]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|███████▉  | 231/289 [02:55<00:43,  1.32it/s]

Training loop 231
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03769802302122116, logits - tensor([[-5.9495,  2.6289, -6.6840, -2.7870],
        [-5.8419,  3.3730, -6.3675, -3.2690],
        [-7.8103,  2.3086, -7.4153, -3.1122],
        [-6.8651,  4.0801, -6.6564, -5.0651],
        [-6.5512, -4.3794, -4.7304,  3.4601],
        [-6.5173, -3.1892,  1.1217, -1.8768],
        [-6.8555, -2.0650, -6.4627,  2.6756],
        [-6.3011,  2.9389, -6.5437, -2.6339]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 80%|████████  | 232/289 [02:55<00:43,  1.32it/s]

Training loop 232
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.053443461656570435, logits - tensor([[-6.3782, -4.0701, -4.5398,  4.1882],
        [-6.3432, -2.1362, -5.8615,  2.3265],
        [-7.3717,  3.8835, -7.3787, -4.2877],
        [-6.7102, -3.9923, -5.0906,  3.6855],
        [-6.0932, -1.2807, -4.5875,  0.1693],
        [-8.6930,  2.1126, -7.5183, -1.2856],
        [-6.9851,  3.2137, -6.0863, -3.9341],
        [-5.8271, -3.7243, -5.6728,  2.9708]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 233/289 [02:56<00:42,  1.32it/s]

Training loop 233
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.3623730540275574, logits - tensor([[-6.0347, -3.0722, -5.6030,  3.3131],
        [-7.1880,  3.4173, -6.1479, -3.9235],
        [-6.8674,  2.8088, -6.4742, -2.9377],
        [-7.2535,  3.7804, -7.6356, -3.6475],
        [-6.7087,  3.9106, -6.7126, -4.5267],
        [-6.0773,  3.8436, -6.6600, -4.3801],
        [-5.6875, -3.5568,  2.4794, -2.7382],
        [-5.4886, -3.2172, -6.0717,  3.2751]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████  | 234/289 [02:57<00:41,  1.33it/s]

Training loop 234
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.058600254356861115, logits - tensor([[-6.3298,  2.5647, -6.5815, -3.1159],
        [-6.1647, -4.7169,  3.3297, -3.8720],
        [-6.2708,  3.9922, -5.9455, -4.0754],
        [-6.8004, -4.0680, -5.6143,  2.9227],
        [-6.2437, -3.8759,  1.7442, -2.2959],
        [-6.9929, -4.4574, -5.0369,  4.7677],
        [-6.6900,  3.0265, -6.6801, -3.0378],
        [-6.1321,  0.1493, -6.0490,  0.7238]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 81%|████████▏ | 235/289 [02:58<00:40,  1.33it/s]

Training loop 235
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 82%|████████▏ | 236/289 [02:58<00:39,  1.33it/s]

loss - 0.02440222166478634, logits - tensor([[-6.6953,  4.1013, -6.8019, -4.1906],
        [-7.9076, -2.5603, -6.6291,  2.3029],
        [-7.4324, -3.6343, -6.0624,  2.7257],
        [-6.9250,  3.1720, -7.3244, -2.8486],
        [-6.3276,  3.1640, -5.8594, -4.4368],
        [-6.5420, -3.6185,  2.7235, -2.0968],
        [-5.8180, -4.3494,  2.9078, -4.0431],
        [-6.5177, -4.0456, -5.5864,  4.5825]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 236
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0313725583255291, logits - tensor([[-5.7719, -4.0200,  2.2662, -2.6971],
        [-5.4704, -3.6557, -4.3441,  3.8612],
        [-6.9586, -1.9632, -6.1127,  1.8956],
        [-5.6961, -2.3684, -6.0667,  2.4565],
        [-5.6696, -3.4618,  3.0619, -2.9535],
        [-5.4659, -4.2308, -4.6995,  3.8017],
        [-6.6216,  2.8386, -6.8542, -3.2985],
        [-6.9614,  4.4384, -7.0406, -4.

 82%|████████▏ | 237/289 [02:59<00:39,  1.32it/s]

Training loop 237
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 82%|████████▏ | 238/289 [03:00<00:38,  1.33it/s]

loss - 0.2638860046863556, logits - tensor([[-7.4516, -3.3755, -6.3991,  2.2676],
        [-6.0887,  3.3642, -6.0197, -4.3648],
        [-6.8156,  4.6987, -6.5715, -3.8226],
        [-6.3562,  4.4455, -6.3281, -3.1219],
        [-6.0288,  3.4318, -5.5411, -2.7794],
        [-6.0265,  1.6157, -6.1314, -1.9744],
        [-5.0868, -4.3957,  3.4709, -3.4816],
        [-5.4419, -4.1185,  2.9994, -3.2776]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 238
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 83%|████████▎ | 239/289 [03:01<00:37,  1.33it/s]

loss - 0.0382363423705101, logits - tensor([[-7.5961,  3.2985, -7.2556, -2.9335],
        [-6.6794, -1.5392, -5.6055,  1.6457],
        [-5.6201, -3.6233,  1.2080, -1.6680],
        [-8.4651,  3.7914, -7.0481, -4.7386],
        [-7.1340,  4.0435, -6.5197, -3.7063],
        [-5.8938, -3.6309, -5.4518,  2.8039],
        [-6.4669,  3.2528, -6.7965, -3.9129],
        [-6.4947,  3.1566, -6.4575, -3.8432]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 239
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03056555613875389, logits - tensor([[-6.8708,  3.3318, -5.5389, -2.9813],
        [-7.3081,  4.5488, -7.3072, -3.7788],
        [-5.0613, -3.3176,  1.5598, -2.3455],
        [-6.1396,  3.5277, -5.6939, -2.7345],
        [-6.8208,  3.4078, -6.8251, -3.2229],
        [-6.5976,  2.1444, -6.7492, -2.2396],
        [-6.1004,  3.2723, -5.8939, -3.0826],
        [-5.7435, -5.0919, -3.6931,  3.

 83%|████████▎ | 240/289 [03:01<00:37,  1.32it/s]

Training loop 240
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.23068392276763916, logits - tensor([[-8.5760,  0.2190, -5.1377, -0.6643],
        [-5.8721, -3.7111, -5.2901,  4.5609],
        [-6.6024,  4.1741, -7.2904, -3.5335],
        [-6.9159,  2.3274, -7.5965, -2.2159],
        [-6.6503,  2.7864, -6.9473, -2.2478],
        [-6.6316,  4.0958, -6.9475, -3.7204],
        [-7.0218,  1.5031, -6.7185, -1.8691],
        [-6.9418, -4.6201,  2.5864, -3.7643]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 83%|████████▎ | 241/289 [03:02<00:36,  1.32it/s]

Training loop 241
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.286896288394928, logits - tensor([[-6.3768, -4.5240, -6.1147,  4.7917],
        [-6.2615,  1.6878, -6.3568, -2.6872],
        [-7.1923,  3.1908, -6.8221, -3.7089],
        [-7.3908,  3.4242, -6.7683, -3.4093],
        [-7.0491, -3.2317, -6.9215,  3.5533],
        [-6.4342, -4.8933, -5.4667,  4.2605],
        [-6.6859,  4.3764, -6.6660, -4.2335],
        [-6.5590,  3.4167, -7.3087, -4.1244]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▎ | 242/289 [03:03<00:35,  1.32it/s]

Training loop 242
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.044423747807741165, logits - tensor([[-6.7601,  2.5213, -6.1804, -3.0443],
        [-6.5824,  4.0747, -6.2862, -3.6748],
        [-7.0985,  2.5207, -6.6519, -3.2639],
        [-6.3504,  4.0852, -6.6155, -4.2586],
        [-8.3820,  1.1260, -6.8660, -0.0909],
        [-7.6438,  4.0560, -7.5854, -3.9174],
        [-7.7643, -3.7491, -5.3956,  5.1671],
        [-6.1867,  2.8921, -6.5157, -3.2403]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 243/289 [03:04<00:34,  1.32it/s]

Training loop 243
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05299076810479164, logits - tensor([[-7.7235,  1.7611, -7.1840, -2.2599],
        [-6.1952,  3.6111, -7.0727, -3.7952],
        [-6.5602,  2.3922, -6.8567, -2.3685],
        [-6.4391, -1.4821, -7.1308,  0.3098],
        [-5.4010,  4.2599, -6.6027, -4.2159],
        [-8.0792,  2.6659, -7.1465, -2.9006],
        [-6.6435, -2.2821, -6.5423,  3.0799],
        [-7.3678, -3.6862, -6.7154,  2.0571]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 84%|████████▍ | 244/289 [03:04<00:34,  1.32it/s]

Training loop 244
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02504490688443184, logits - tensor([[-6.9017, -2.6036, -5.5000,  3.1246],
        [-6.1354,  3.2486, -5.9592, -2.9129],
        [-6.9817, -2.7510, -6.3816,  3.0179],
        [-5.8711,  2.9877, -6.5395, -3.9702],
        [-5.7095,  3.6574, -5.6002, -3.0071],
        [-6.7037,  3.3901, -7.0096, -4.1163],
        [-7.1025,  3.1967, -7.9247, -1.9852],
        [-7.5618, -2.8717, -4.5469,  3.3784]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▍ | 245/289 [03:05<00:33,  1.32it/s]

Training loop 245
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022116929292678833, logits - tensor([[-5.9059, -3.7605, -4.3272,  4.3907],
        [-7.3127,  3.7022, -6.8947, -2.8231],
        [-7.1572, -2.8119, -5.6603,  3.9066],
        [-5.4558,  1.9904, -6.3018, -2.6181],
        [-6.1891, -4.2701,  2.9132, -3.0470],
        [-7.1153,  3.7122, -6.3024, -4.1057],
        [-6.2193, -3.7771,  3.4208, -3.5829],
        [-6.3721,  3.6339, -6.8995, -4.2331]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 246/289 [03:06<00:32,  1.33it/s]

Training loop 246
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.025439061224460602, logits - tensor([[-7.1630, -3.9367, -6.0678,  4.8879],
        [-5.7352, -4.0947, -5.5421,  3.7329],
        [-6.6681,  3.7926, -6.8768, -3.3168],
        [-6.9799,  4.0071, -6.6423, -4.1040],
        [-5.3746, -3.7315,  2.8674, -2.3758],
        [-5.2511, -3.5341,  2.8133, -3.2834],
        [-5.8150,  3.2191, -6.1278, -2.9855],
        [-6.6718,  1.6517, -6.9475, -2.6666]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 85%|████████▌ | 247/289 [03:07<00:31,  1.33it/s]

Training loop 247
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04166204109787941, logits - tensor([[-6.5756,  1.9347, -7.0446, -2.0545],
        [-7.1900,  3.2988, -6.1160, -3.0596],
        [-7.6404, -1.4248, -6.4782,  0.7351],
        [-5.9788, -3.6843, -5.7956,  4.8179],
        [-6.2244, -3.2266, -6.3817,  3.2569],
        [-6.0337, -4.4084,  2.8123, -3.0252],
        [-5.9830,  3.2422, -7.0990, -3.4465],
        [-5.4923,  3.8321, -5.5205, -3.2125]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 248/289 [03:07<00:30,  1.32it/s]

Training loop 248
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.012376021593809128, logits - tensor([[-5.7612,  3.8670, -6.5998, -3.4457],
        [-6.4910, -4.9019, -5.4352,  4.1812],
        [-6.4649,  3.1755, -7.0679, -3.9264],
        [-6.5660,  3.6094, -6.3595, -4.4105],
        [-5.8377,  3.6744, -6.3036, -4.0168],
        [-6.3921,  3.4591, -6.4059, -3.6977],
        [-6.2589,  3.4720, -7.0941, -3.4067],
        [-6.9986, -4.5061, -6.3252,  3.8920]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 86%|████████▌ | 249/289 [03:08<00:30,  1.32it/s]

Training loop 249
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.21310117840766907, logits - tensor([[-6.0067, -2.9078,  1.1668, -1.2951],
        [-6.4516, -3.1094,  1.1335, -1.8753],
        [-6.5261,  3.2236, -6.8118, -3.1503],
        [-7.7361,  4.7369, -7.2750, -3.6719],
        [-7.6075,  0.1567, -6.9457,  0.1630],
        [-6.1450, -4.1280,  3.2405, -3.5770],
        [-6.5438,  3.8535, -7.2390, -3.2118],
        [-6.5949,  3.3315, -6.9183, -4.1796]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 250/289 [03:09<00:29,  1.33it/s]

Training loop 250
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04512465000152588, logits - tensor([[-7.5245,  2.7346, -6.8459, -2.4020],
        [-8.3835, -3.7895, -5.6798,  4.4858],
        [-5.6600,  2.5395, -6.8489, -2.9901],
        [-6.7033, -4.0777, -5.2989,  4.1726],
        [-7.2691,  2.9857, -6.6180, -2.9712],
        [-6.4322, -2.4655,  0.4327, -0.9792],
        [-5.9754,  3.9375, -6.1503, -3.7243],
        [-7.2338, -3.9196, -6.7949,  4.1851]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 251/289 [03:10<00:28,  1.33it/s]

Training loop 251
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19431044161319733, logits - tensor([[-7.3938,  2.4433, -7.2281, -2.7862],
        [-7.3349, -1.7025, -6.1712,  2.0477],
        [-7.9365,  4.0035, -8.0714, -3.8397],
        [-6.3446, -2.5435, -0.9009, -0.9473],
        [-7.1489,  3.8540, -7.9894, -3.7343],
        [-6.9800, -2.2666, -5.5505,  2.5211],
        [-7.2573, -4.1699, -6.3156,  4.0384],
        [-7.3731, -4.2030, -5.7312,  4.0120]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 87%|████████▋ | 252/289 [03:10<00:27,  1.33it/s]

Training loop 252
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0646919459104538, logits - tensor([[-6.9621, -4.4093, -6.6572,  4.9034],
        [-7.9950, -2.7296, -5.8034,  3.6363],
        [-5.7916,  3.3514, -6.2035, -3.9007],
        [-6.8285,  0.2821, -5.5351,  0.7060],
        [-7.3648,  3.1874, -7.4174, -2.7464],
        [-6.0976, -3.7866,  1.7576, -1.9755],
        [-6.6793, -3.0398, -6.2576,  2.2244],
        [-5.8147,  3.8086, -6.0716, -2.9982]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 253/289 [03:11<00:27,  1.33it/s]

Training loop 253
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19181036949157715, logits - tensor([[-5.5358, -3.8741, -4.7351,  3.6789],
        [-6.0393, -3.6462,  2.5994, -2.0829],
        [-5.9990,  2.5892, -5.7713, -2.5185],
        [-6.8242, -2.4107, -6.8136,  2.8717],
        [-7.0804,  3.0125, -6.8214, -3.0467],
        [-5.4077, -3.5748,  3.3733, -3.1533],
        [-6.3879,  2.6233, -7.5618, -2.0004],
        [-7.3197,  3.7266, -7.5172, -3.5616]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 254/289 [03:12<00:26,  1.33it/s]

Training loop 254
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.13359598815441132, logits - tensor([[-6.6252,  3.3693, -6.3303, -4.0883],
        [-7.3901,  2.9583, -6.5105, -2.5767],
        [-6.1688,  3.2206, -6.9808, -2.7678],
        [-7.1020,  4.1415, -5.7892, -4.5114],
        [-5.7310,  2.9727, -5.9481, -2.7907],
        [-6.2613, -1.4969, -6.1521,  1.8417],
        [-6.2422,  2.6702, -6.2073, -3.3747],
        [-7.2673,  3.7452, -6.7139, -3.6801]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 88%|████████▊ | 255/289 [03:13<00:25,  1.33it/s]

Training loop 255
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10145451128482819, logits - tensor([[-7.7019,  3.3001, -7.3275, -3.0490],
        [-7.2444, -3.9751, -6.4765,  4.7026],
        [-7.0331,  0.5625, -6.2862, -0.2467],
        [-7.3581,  2.8191, -7.4044, -3.2855],
        [-6.8844,  3.0610, -6.5167, -4.0293],
        [-7.2058, -3.2916,  0.4918, -0.9228],
        [-4.4008, -3.1577,  2.6843, -2.4375],
        [-6.1237,  3.4251, -6.5680, -3.1349]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▊ | 256/289 [03:14<00:24,  1.33it/s]

Training loop 256
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05517127364873886, logits - tensor([[-7.1107,  3.3768, -7.4573, -3.6189],
        [-7.2139,  3.5203, -8.0054, -3.7184],
        [-4.9076, -4.5056, -5.3710,  4.6674],
        [-6.2504,  3.0230, -6.8753, -2.5799],
        [-7.7381,  3.0731, -6.7498, -3.0258],
        [-6.3365,  2.1548, -6.5500, -2.6197],
        [-7.9819, -0.0145, -6.6360,  0.5135],
        [-5.6926, -3.2313, -6.1488,  4.1426]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 257/289 [03:14<00:24,  1.33it/s]

Training loop 257
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.026723477989435196, logits - tensor([[-5.6096, -4.2336,  3.2806, -3.2580],
        [-6.6537,  3.0639, -6.4543, -3.2217],
        [-8.7439,  3.3390, -7.6903, -2.6456],
        [-7.3768,  3.3828, -6.5583, -2.6853],
        [-7.4798, -3.2396, -6.5777,  3.5986],
        [-5.6717, -2.9595, -4.6112,  3.2891],
        [-6.6791,  2.8302, -6.2205, -1.9977],
        [-7.8102, -2.6018, -6.3896,  3.1950]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 89%|████████▉ | 258/289 [03:15<00:23,  1.32it/s]

Training loop 258
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03367050737142563, logits - tensor([[-7.8225,  2.2036, -7.1508, -2.5461],
        [-7.2793,  2.9345, -6.9306, -3.1263],
        [-8.3548,  4.0268, -7.9470, -3.9774],
        [-5.3847,  2.4598, -5.2622, -4.2319],
        [-6.8726, -4.1612,  1.1038, -2.8728],
        [-6.7775, -4.6848,  3.2307, -3.8430],
        [-6.2907, -3.7844,  3.3120, -3.0776],
        [-6.3485,  3.0896, -7.4274, -2.5322]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 259/289 [03:16<00:22,  1.33it/s]

Training loop 259
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0645899623632431, logits - tensor([[-7.3966,  2.1808, -6.4299, -1.3858],
        [-8.2986, -1.2806, -6.3877,  1.7957],
        [-7.5776,  2.0720, -7.6084, -1.9792],
        [-8.5355, -2.8923, -6.3949,  3.3109],
        [-6.8421, -2.2581, -6.9581,  2.6763],
        [-5.6922, -4.5754,  2.7750, -2.5575],
        [-6.6322, -4.4674,  1.4257, -0.9951],
        [-6.3188,  2.7857, -5.8851, -2.6374]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|████████▉ | 260/289 [03:17<00:21,  1.33it/s]

Training loop 260
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.10725370794534683, logits - tensor([[-5.9852,  2.9875, -6.4142, -3.5643],
        [-7.2208, -3.4497, -4.3539,  0.3665],
        [-6.0198,  3.2276, -5.2794, -3.0460],
        [-5.5475, -3.0591, -6.3417,  3.6810],
        [-6.4778,  0.8625, -6.3870, -0.8753],
        [-6.4691, -3.4720, -5.4417,  3.9843],
        [-7.2394, -3.8943, -6.1765,  5.0790],
        [-6.4176,  3.1051, -6.5256, -3.2594]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 90%|█████████ | 261/289 [03:17<00:21,  1.33it/s]

Training loop 261
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030433854088187218, logits - tensor([[-6.3253, -3.8079,  1.9529, -2.8194],
        [-5.1490, -3.1621,  2.2645, -2.0314],
        [-6.4603,  2.5599, -7.2628, -2.2888],
        [-5.9423, -4.2628, -5.5984,  2.9659],
        [-6.2687, -3.3547, -5.9402,  3.5893],
        [-6.3890,  4.1730, -7.0941, -4.4827],
        [-6.9115,  3.3791, -7.4340, -3.2563],
        [-6.0410,  3.9220, -6.2429, -2.8835]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 262/289 [03:18<00:20,  1.33it/s]

Training loop 262
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023442072793841362, logits - tensor([[-6.7558, -2.6208, -6.5946,  3.1385],
        [-5.2517, -3.0540, -3.9003,  4.4253],
        [-5.1979, -3.7109, -4.6961,  3.6252],
        [-6.5527,  2.8228, -6.5601, -3.3275],
        [-6.8486, -3.0904, -6.8154,  2.3852],
        [-6.4382,  2.9578, -6.5278, -3.6677],
        [-7.5058,  3.1253, -7.2558, -2.9471],
        [-6.7317, -3.4869, -5.7568,  3.0271]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████ | 263/289 [03:19<00:19,  1.32it/s]

Training loop 263
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.07644364982843399, logits - tensor([[-6.3859,  2.4837, -6.9167, -2.0077],
        [-6.1876, -4.7913,  3.6614, -3.6393],
        [-5.4333,  2.7393, -5.9930, -3.9162],
        [-5.7453, -2.4691, -4.0408,  2.4460],
        [-5.8674, -0.1111, -6.3176, -0.0213],
        [-7.6233,  2.6868, -7.0742, -2.5410],
        [-4.5770, -3.9761,  2.7388, -3.1901],
        [-5.6966, -3.6883,  2.2881, -2.7075]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 91%|█████████▏| 264/289 [03:20<00:18,  1.32it/s]

Training loop 264
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02068464830517769, logits - tensor([[-7.6246,  2.9690, -7.9217, -2.9581],
        [-6.6399,  3.5488, -6.1567, -3.8649],
        [-7.3096,  3.4261, -6.9129, -2.6477],
        [-5.4761, -4.0860,  2.9606, -3.8101],
        [-5.8943,  3.4693, -5.7414, -3.5688],
        [-7.0775, -4.4956, -6.6585,  3.3786],
        [-5.1511,  3.1014, -5.8857, -3.0306],
        [-5.7166,  2.7144, -5.6845, -3.4640]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 265/289 [03:20<00:18,  1.32it/s]

Training loop 265
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.1788322627544403, logits - tensor([[-6.5345, -2.3252, -6.8492,  2.5145],
        [-7.0911, -2.7660, -6.2437,  3.0416],
        [-8.3523,  4.5708, -7.5208, -3.6609],
        [-6.1708,  2.7306, -6.0759, -2.1018],
        [-6.3162,  3.1196, -6.2691, -3.9438],
        [-6.2177, -3.3767, -5.2302,  3.2745],
        [-6.5514, -4.1834,  2.6633, -2.0019],
        [-6.0110, -4.5888, -4.6995,  4.5549]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 266/289 [03:21<00:17,  1.33it/s]

Training loop 266
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.20764967799186707, logits - tensor([[-7.0561,  3.2184, -6.4731, -3.2087],
        [-5.7457,  1.9911, -6.9079, -2.5664],
        [-7.0679, -0.9481, -6.6912,  0.9378],
        [-5.9214,  3.4381, -6.8736, -3.2939],
        [-5.2350,  2.7510, -6.1744, -3.2601],
        [-6.6865, -3.6749,  2.8841, -2.3724],
        [-6.8782, -3.7504, -6.3417,  4.0038],
        [-5.1461,  3.6007, -6.2259, -4.3017]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 92%|█████████▏| 267/289 [03:22<00:16,  1.33it/s]

Training loop 267
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.021550511941313744, logits - tensor([[-6.3974,  3.5146, -7.8969, -4.0707],
        [-7.4286, -2.2971, -5.6695,  2.2859],
        [-7.0267, -4.3008, -3.0969,  3.0394],
        [-4.7980, -3.5809, -4.3204,  3.8017],
        [-7.3188,  3.3755, -6.7852, -3.7914],
        [-7.0928,  3.1864, -7.5274, -4.5529],
        [-5.7190, -3.8232, -5.6733,  2.7045],
        [-5.9390, -3.4594, -6.2462,  3.5026]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 268/289 [03:23<00:15,  1.32it/s]

Training loop 268
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.0500309094786644, logits - tensor([[-6.8039, -4.1196,  2.5425, -2.5026],
        [-6.8351,  4.1358, -6.6796, -4.1103],
        [-7.4418, -1.6437, -6.2017,  0.7166],
        [-7.3475,  3.3051, -7.3556, -4.2205],
        [-6.7820,  2.5564, -6.9021, -2.6233],
        [-6.5422, -4.0167,  2.1771, -1.4999],
        [-5.6232, -4.1713,  3.2671, -3.3330],
        [-5.7463,  2.6840, -6.4425, -1.9851]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 269/289 [03:23<00:15,  1.33it/s]

Training loop 269
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.03453107178211212, logits - tensor([[-6.5430, -3.6029,  2.1459, -1.8985],
        [-6.0766, -4.8528, -6.2489,  4.8594],
        [-7.0143,  1.8449, -7.5542, -1.3961],
        [-5.6478, -3.8698,  3.1015, -2.1508],
        [-5.6100,  3.0702, -5.7944, -3.1522],
        [-6.5617,  2.9180, -6.1321, -3.7266],
        [-7.5432,  3.5470, -7.3566, -4.3673],
        [-6.6168, -4.2127, -4.8824,  3.9309]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 93%|█████████▎| 270/289 [03:24<00:14,  1.33it/s]

Training loop 270
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04442743957042694, logits - tensor([[-6.4728,  3.3221, -7.3899, -3.0266],
        [-6.5331,  0.6030, -6.8919, -0.9487],
        [-6.3824,  3.2007, -6.4126, -3.5749],
        [-6.9801,  2.4376, -7.4922, -1.7874],
        [-5.3802,  3.9862, -5.5411, -4.1449],
        [-6.9603,  4.0975, -7.0419, -3.6379],
        [-5.0940,  2.6845, -5.4059, -3.1849],
        [-7.1791,  4.1054, -7.2636, -3.2589]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 271/289 [03:25<00:13,  1.33it/s]

Training loop 271
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.02767571434378624, logits - tensor([[-6.1148, -4.2040, -4.6315,  5.0033],
        [-5.9480, -4.6128,  2.8799, -3.2019],
        [-7.0068,  3.3192, -7.1356, -3.3471],
        [-6.5609,  3.2417, -6.9965, -3.4312],
        [-4.9765, -4.0831,  2.4619, -2.7979],
        [-7.5857,  3.9640, -7.2368, -2.9129],
        [-5.4566,  3.6077, -6.3091, -3.7002],
        [-6.6251, -3.7809,  1.5589, -2.3472]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 272/289 [03:26<00:12,  1.33it/s]

Training loop 272
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.19615037739276886, logits - tensor([[-6.6070,  2.9642, -6.7222, -2.5328],
        [-6.4324, -1.8331, -5.0651,  1.8143],
        [-6.9343,  3.5952, -7.2729, -3.3085],
        [-7.1044,  3.2936, -7.3897, -3.7300],
        [-6.1828,  4.4975, -5.8806, -3.3452],
        [-5.9967, -4.5916, -5.3167,  4.8010],
        [-6.8884,  3.3169, -7.6899, -3.3913],
        [-5.3391,  3.3528, -6.1082, -3.5193]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 94%|█████████▍| 273/289 [03:26<00:12,  1.32it/s]

Training loop 273
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.030989995226264, logits - tensor([[-5.4618,  2.5081, -5.2775, -3.3263],
        [-6.7287,  2.8869, -7.0420, -3.7192],
        [-7.2679,  1.9356, -6.5228, -1.9397],
        [-8.0210,  2.2672, -6.5463, -2.4801],
        [-7.1513,  3.3800, -6.2851, -3.7308],
        [-6.2786,  3.1993, -6.5123, -2.6998],
        [-5.6988,  3.0639, -5.4971, -3.6327],
        [-5.7897,  3.0413, -6.3398, -3.2632]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▍| 274/289 [03:27<00:11,  1.32it/s]

Training loop 274
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.024981725960969925, logits - tensor([[-6.2851,  2.4722, -5.6429, -3.0506],
        [-6.8522, -3.6363, -6.2377,  2.9806],
        [-5.6694, -4.3990, -5.6646,  4.6194],
        [-8.6665, -2.6349, -6.8990,  3.3468],
        [-6.7370, -1.9945, -6.4343,  1.8017],
        [-6.8491,  3.9077, -7.2153, -2.8845],
        [-7.2486,  3.7691, -7.2110, -3.6607],
        [-6.8543,  3.4954, -6.8635, -4.3034]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 95%|█████████▌| 275/289 [03:28<00:10,  1.33it/s]

Training loop 275
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.01806527003645897, logits - tensor([[-5.9388,  3.2564, -6.5739, -3.8779],
        [-7.8517,  3.0055, -6.0974, -3.2407],
        [-6.9112,  3.4430, -6.8293, -3.9325],
        [-7.1789,  3.5453, -7.2415, -3.5219],
        [-6.3946,  4.3350, -7.0934, -3.3337],
        [-7.8479, -4.3562, -4.9288,  3.9089],
        [-8.0582, -5.0980,  3.4866, -3.5925],
        [-7.0452, -4.6423,  2.7111, -2.4206]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 276/289 [03:29<00:09,  1.33it/s]

Training loop 276
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.18874748051166534, logits - tensor([[-6.3504,  3.6201, -6.3936, -2.9682],
        [-5.9119,  2.9259, -6.9072, -2.7728],
        [-5.9100, -3.7620,  1.6444, -1.8110],
        [-6.9308, -4.5825,  4.0973, -3.8467],
        [-6.5280,  2.4547, -7.1203, -2.6039],
        [-5.9895, -4.8975,  2.8574, -3.2919],
        [-8.1032,  3.6276, -7.5313, -4.2582],
        [-6.0654,  3.5731, -6.0985, -2.9568]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 277/289 [03:29<00:09,  1.33it/s]

Training loop 277
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.037711478769779205, logits - tensor([[-4.3018,  3.7448, -5.6692, -3.0018],
        [-7.0057,  2.6253, -6.3469, -2.5951],
        [-5.5764, -4.0046,  3.1915, -2.7700],
        [-6.3571, -4.2662, -6.5576,  4.8957],
        [-5.9196,  2.8163, -5.1371, -2.2052],
        [-6.4227,  2.6863, -6.8164, -2.2687],
        [-6.4963, -4.3739,  2.7315, -3.0466],
        [-6.0611, -2.9400,  1.4158, -2.3281]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 96%|█████████▌| 278/289 [03:30<00:08,  1.33it/s]

Training loop 278
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.023674452677369118, logits - tensor([[-5.9500,  3.6038, -5.3148, -3.6392],
        [-6.8982,  3.0077, -6.5624, -4.0717],
        [-7.9984,  3.9996, -7.4905, -3.9992],
        [-6.0391, -3.4532,  2.0070, -2.0159],
        [-4.8861,  3.3375, -4.9594, -3.1408],
        [-7.5541,  2.5549, -7.0151, -2.7242],
        [-7.2171, -3.9843, -6.2588,  3.5703],
        [-6.4292, -4.4414, -6.0199,  4.4416]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 279/289 [03:31<00:07,  1.33it/s]

Training loop 279
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.2709704637527466, logits - tensor([[-6.5971, -4.2605, -5.8500,  4.3593],
        [-6.8554, -2.3939, -5.6858,  3.5318],
        [-5.3315,  2.6957, -5.6057, -3.3508],
        [-6.7565, -3.1323,  1.2428, -2.0496],
        [-6.4373,  3.4580, -6.0657, -3.2813],
        [-7.6727,  3.5743, -6.6863, -3.2650],
        [-6.3856,  3.3373, -6.6010, -4.1914],
        [-6.2677, -3.9254,  1.5815, -2.7193]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 97%|█████████▋| 280/289 [03:32<00:06,  1.32it/s]

Training loop 280
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 97%|█████████▋| 281/289 [03:32<00:06,  1.32it/s]

loss - 0.029144510626792908, logits - tensor([[-7.2423,  3.2667, -6.4108, -3.4150],
        [-6.7916,  3.1422, -6.4989, -3.4991],
        [-7.3838,  3.0868, -6.7403, -2.9370],
        [-6.1237, -3.2642,  2.0999, -2.4930],
        [-5.9564,  3.1214, -5.9081, -3.2131],
        [-6.2215, -3.0254, -5.1164,  3.3174],
        [-6.1473, -3.3253, -5.8717,  3.4672],
        [-7.1753, -2.6852, -6.0339,  1.9418]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 281
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>


 98%|█████████▊| 282/289 [03:33<00:05,  1.31it/s]

loss - 0.13241755962371826, logits - tensor([[-6.3718,  3.5700, -5.6474, -3.3182],
        [-5.4684, -4.4955,  2.8473, -3.1757],
        [-6.1414,  1.2688, -5.3879, -1.3401],
        [-6.2936, -3.7511,  2.9950, -2.7290],
        [-6.4041,  2.8657, -6.3383, -3.7584],
        [-7.7972, -0.8180, -6.1982,  1.5165],
        [-6.0146,  4.4958, -6.3737, -3.2727],
        [-6.9108,  2.5487, -6.1533, -3.1018]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True
Training loop 282
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.022306546568870544, logits - tensor([[-5.9712, -4.1277, -6.2551,  3.5735],
        [-6.1276,  3.6623, -6.4672, -4.3505],
        [-6.7034,  3.1446, -5.7609, -3.5073],
        [-5.7439, -3.4228, -6.1563,  3.0205],
        [-6.8327,  3.4872, -7.6560, -3.1979],
        [-6.3009, -4.0226,  2.3301, -2.2954],
        [-6.8430,  2.6193, -6.9569, -2.5936],
        [-6.1972, -4.2965, -5.6089,  

 98%|█████████▊| 283/289 [03:34<00:04,  1.32it/s]

Training loop 283
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04868511110544205, logits - tensor([[-6.7482,  1.7397, -7.5307, -1.7450],
        [-5.8597,  3.5186, -5.7955, -3.6190],
        [-6.1653, -3.0392,  0.8202, -2.1401],
        [-6.2654, -3.9077, -5.2333,  5.2341],
        [-6.3458,  3.8061, -5.5617, -3.7558],
        [-6.0138,  2.7044, -5.7773, -2.9149],
        [-5.7364,  2.8814, -6.2019, -3.3393],
        [-6.7288,  1.4566, -7.2407, -1.9362]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 98%|█████████▊| 284/289 [03:35<00:03,  1.32it/s]

Training loop 284
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04272478073835373, logits - tensor([[-6.0329,  2.3854, -6.1386, -2.7069],
        [-5.6281, -4.1431,  2.6745, -2.9788],
        [-6.2126,  2.1380, -7.2028, -3.3202],
        [-5.2870,  3.5275, -6.0372, -3.7072],
        [-6.6161,  3.8445, -6.3885, -3.8130],
        [-7.1755, -3.7860,  1.6606, -0.6593],
        [-7.2581,  2.7837, -7.4655, -3.0462],
        [-6.4094,  3.4851, -6.8797, -2.7263]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▊| 285/289 [03:35<00:03,  1.32it/s]

Training loop 285
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.09139300882816315, logits - tensor([[-6.3257,  4.1001, -6.7218, -3.6957],
        [-7.5562,  2.9015, -7.0297, -3.3550],
        [-7.2236,  3.9859, -7.3426, -3.3866],
        [-6.8990, -4.5546, -6.8073,  4.3528],
        [-6.7042,  3.7870, -6.5478, -4.0071],
        [-7.4832, -3.9755, -5.8262,  3.4484],
        [-7.4809,  1.2986, -7.5061, -0.5719],
        [-7.4606,  3.5354, -6.2129, -3.6885]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 286/289 [03:36<00:02,  1.32it/s]

Training loop 286
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.04159874841570854, logits - tensor([[-5.3035,  3.3086, -5.9286, -3.1810],
        [-7.1337,  3.8209, -6.8925, -3.9356],
        [-5.6852,  3.7445, -6.4667, -3.5504],
        [-5.9524, -4.2492, -5.0091,  4.1614],
        [-5.9099,  2.2808, -6.1003, -2.7503],
        [-6.6173, -3.9320, -5.4398,  3.5962],
        [-7.9174, -1.0230, -6.8975,  0.3301],
        [-6.5290,  4.4615, -6.4303, -3.5788]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


 99%|█████████▉| 287/289 [03:37<00:01,  1.32it/s]

Training loop 287
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.027346109971404076, logits - tensor([[-6.6189,  3.4353, -6.4854, -4.3550],
        [-6.9150, -4.6738, -6.0758,  3.8870],
        [-5.7217,  3.3816, -5.8436, -3.1296],
        [-6.5982, -3.4407, -4.0340,  2.8025],
        [-7.4690, -4.7906, -5.9577,  4.5797],
        [-6.5558, -4.6036, -5.0447,  4.5303],
        [-5.6468, -3.5631,  1.8485, -2.2380],
        [-7.2480,  2.0887, -6.5270, -1.7861]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|█████████▉| 288/289 [03:38<00:00,  1.33it/s]

Training loop 288
input - False, attention_mask - False
type - <class 'torch.Tensor'> type - <class 'torch.Tensor'>
loss - 0.05534245818853378, logits - tensor([[-5.8538,  3.8769, -5.9042, -3.7796],
        [-6.5961,  3.1136, -6.2020, -2.8239],
        [-6.5466,  2.8876, -6.4755, -2.4155],
        [-7.9270,  1.0873, -7.9092, -1.0569]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
loss__ - True, logits__ - True


100%|██████████| 289/289 [03:38<00:00,  1.32it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Validation Loop 0
input - False, attention_mask - False


  1%|          | 1/194 [00:00<00:53,  3.62it/s]

Validation Loop 1
input - False, attention_mask - False


  1%|          | 2/194 [00:00<00:49,  3.85it/s]

Validation Loop 2
input - False, attention_mask - False


  2%|▏         | 3/194 [00:00<00:48,  3.97it/s]

Validation Loop 3
input - False, attention_mask - False


  2%|▏         | 4/194 [00:01<00:49,  3.82it/s]

Validation Loop 4
input - False, attention_mask - False


  3%|▎         | 5/194 [00:01<00:48,  3.87it/s]

Validation Loop 5
input - False, attention_mask - False


  3%|▎         | 6/194 [00:01<00:47,  3.94it/s]

Validation Loop 6
input - False, attention_mask - False


  4%|▎         | 7/194 [00:01<00:46,  3.98it/s]

Validation Loop 7
input - False, attention_mask - False


  4%|▍         | 8/194 [00:02<00:47,  3.92it/s]

Validation Loop 8
input - False, attention_mask - False


  5%|▍         | 9/194 [00:02<00:47,  3.92it/s]

Validation Loop 9
input - False, attention_mask - False


  5%|▌         | 10/194 [00:02<00:46,  3.96it/s]

Validation Loop 10
input - False, attention_mask - False


  6%|▌         | 11/194 [00:02<00:46,  3.98it/s]

Validation Loop 11
input - False, attention_mask - False


  6%|▌         | 12/194 [00:03<00:46,  3.94it/s]

Validation Loop 12
input - False, attention_mask - False


  7%|▋         | 13/194 [00:03<00:45,  3.97it/s]

Validation Loop 13
input - False, attention_mask - False


  7%|▋         | 14/194 [00:03<00:45,  3.96it/s]

Validation Loop 14
input - False, attention_mask - False


  8%|▊         | 15/194 [00:03<00:45,  3.97it/s]

Validation Loop 15
input - False, attention_mask - False


  8%|▊         | 16/194 [00:04<00:45,  3.92it/s]

Validation Loop 16
input - False, attention_mask - False


  9%|▉         | 17/194 [00:04<00:45,  3.90it/s]

Validation Loop 17
input - False, attention_mask - False


  9%|▉         | 18/194 [00:04<00:45,  3.91it/s]

Validation Loop 18
input - False, attention_mask - False


 10%|▉         | 19/194 [00:04<00:44,  3.93it/s]

Validation Loop 19
input - False, attention_mask - False


 10%|█         | 20/194 [00:05<00:44,  3.92it/s]

Validation Loop 20
input - False, attention_mask - False


 11%|█         | 21/194 [00:05<00:44,  3.93it/s]

Validation Loop 21
input - False, attention_mask - False


 11%|█▏        | 22/194 [00:05<00:43,  3.98it/s]

Validation Loop 22
input - False, attention_mask - False


 12%|█▏        | 23/194 [00:05<00:42,  3.99it/s]

Validation Loop 23
input - False, attention_mask - False


 12%|█▏        | 24/194 [00:06<00:42,  3.96it/s]

Validation Loop 24
input - False, attention_mask - False


 13%|█▎        | 25/194 [00:06<00:42,  3.97it/s]

Validation Loop 25
input - False, attention_mask - False


 13%|█▎        | 26/194 [00:06<00:42,  3.96it/s]

Validation Loop 26
input - False, attention_mask - False


 14%|█▍        | 27/194 [00:06<00:41,  3.98it/s]

Validation Loop 27
input - False, attention_mask - False


 14%|█▍        | 28/194 [00:07<00:41,  4.00it/s]

Validation Loop 28
input - False, attention_mask - False


 15%|█▍        | 29/194 [00:07<00:41,  3.98it/s]

Validation Loop 29
input - False, attention_mask - False


 15%|█▌        | 30/194 [00:07<00:41,  3.95it/s]

Validation Loop 30
input - False, attention_mask - False


 16%|█▌        | 31/194 [00:07<00:41,  3.94it/s]

Validation Loop 31
input - False, attention_mask - False


 16%|█▋        | 32/194 [00:08<00:41,  3.94it/s]

Validation Loop 32
input - False, attention_mask - False


 17%|█▋        | 33/194 [00:08<00:41,  3.92it/s]

Validation Loop 33
input - False, attention_mask - False


 18%|█▊        | 34/194 [00:08<00:40,  3.94it/s]

Validation Loop 34
input - False, attention_mask - False


 18%|█▊        | 35/194 [00:08<00:40,  3.96it/s]

Validation Loop 35
input - False, attention_mask - False


 19%|█▊        | 36/194 [00:09<00:40,  3.93it/s]

Validation Loop 36
input - False, attention_mask - False


 19%|█▉        | 37/194 [00:09<00:39,  3.94it/s]

Validation Loop 37
input - False, attention_mask - False


 20%|█▉        | 38/194 [00:09<00:39,  3.91it/s]

Validation Loop 38
input - False, attention_mask - False


 20%|██        | 39/194 [00:09<00:39,  3.92it/s]

Validation Loop 39
input - False, attention_mask - False


 21%|██        | 40/194 [00:10<00:39,  3.95it/s]

Validation Loop 40
input - False, attention_mask - False


 21%|██        | 41/194 [00:10<00:38,  3.96it/s]

Validation Loop 41
input - False, attention_mask - False


 22%|██▏       | 42/194 [00:10<00:38,  3.99it/s]

Validation Loop 42
input - False, attention_mask - False


 22%|██▏       | 43/194 [00:10<00:37,  3.97it/s]

Validation Loop 43
input - False, attention_mask - False


 23%|██▎       | 44/194 [00:11<00:37,  3.97it/s]

Validation Loop 44
input - False, attention_mask - False


 23%|██▎       | 45/194 [00:11<00:37,  3.94it/s]

Validation Loop 45
input - False, attention_mask - False


 24%|██▎       | 46/194 [00:11<00:37,  3.90it/s]

Validation Loop 46
input - False, attention_mask - False


 24%|██▍       | 47/194 [00:11<00:37,  3.91it/s]

Validation Loop 47
input - False, attention_mask - False


 25%|██▍       | 48/194 [00:12<00:37,  3.92it/s]

Validation Loop 48
input - False, attention_mask - False


 25%|██▌       | 49/194 [00:12<00:36,  3.96it/s]

Validation Loop 49
input - False, attention_mask - False


 26%|██▌       | 50/194 [00:12<00:36,  3.99it/s]

Validation Loop 50
input - False, attention_mask - False


 26%|██▋       | 51/194 [00:12<00:35,  3.98it/s]

Validation Loop 51
input - False, attention_mask - False


 27%|██▋       | 52/194 [00:13<00:35,  3.99it/s]

Validation Loop 52
input - False, attention_mask - False


 27%|██▋       | 53/194 [00:13<00:35,  3.99it/s]

Validation Loop 53
input - False, attention_mask - False


 28%|██▊       | 54/194 [00:13<00:35,  3.99it/s]

Validation Loop 54
input - False, attention_mask - False


 28%|██▊       | 55/194 [00:13<00:35,  3.95it/s]

Validation Loop 55
input - False, attention_mask - False


 29%|██▉       | 56/194 [00:14<00:35,  3.91it/s]

Validation Loop 56
input - False, attention_mask - False


 29%|██▉       | 57/194 [00:14<00:34,  3.92it/s]

Validation Loop 57
input - False, attention_mask - False


 30%|██▉       | 58/194 [00:14<00:34,  3.89it/s]

Validation Loop 58
input - False, attention_mask - False


 30%|███       | 59/194 [00:14<00:34,  3.94it/s]

Validation Loop 59
input - False, attention_mask - False


 31%|███       | 60/194 [00:15<00:33,  3.94it/s]

Validation Loop 60
input - False, attention_mask - False


 31%|███▏      | 61/194 [00:15<00:34,  3.90it/s]

Validation Loop 61
input - False, attention_mask - False


 32%|███▏      | 62/194 [00:15<00:33,  3.93it/s]

Validation Loop 62
input - False, attention_mask - False


 32%|███▏      | 63/194 [00:15<00:33,  3.94it/s]

Validation Loop 63
input - False, attention_mask - False


 33%|███▎      | 64/194 [00:16<00:32,  3.96it/s]

Validation Loop 64
input - False, attention_mask - False


 34%|███▎      | 65/194 [00:16<00:32,  3.99it/s]

Validation Loop 65
input - False, attention_mask - False


 34%|███▍      | 66/194 [00:16<00:32,  3.99it/s]

Validation Loop 66
input - False, attention_mask - False


 35%|███▍      | 67/194 [00:16<00:32,  3.97it/s]

Validation Loop 67
input - False, attention_mask - False


 35%|███▌      | 68/194 [00:17<00:31,  3.98it/s]

Validation Loop 68
input - False, attention_mask - False


 36%|███▌      | 69/194 [00:17<00:31,  3.99it/s]

Validation Loop 69
input - False, attention_mask - False


 36%|███▌      | 70/194 [00:17<00:31,  3.97it/s]

Validation Loop 70
input - False, attention_mask - False


 37%|███▋      | 71/194 [00:17<00:31,  3.96it/s]

Validation Loop 71
input - False, attention_mask - False


 37%|███▋      | 72/194 [00:18<00:30,  3.94it/s]

Validation Loop 72
input - False, attention_mask - False


 38%|███▊      | 73/194 [00:18<00:30,  3.93it/s]

Validation Loop 73
input - False, attention_mask - False


 38%|███▊      | 74/194 [00:18<00:30,  3.91it/s]

Validation Loop 74
input - False, attention_mask - False


 39%|███▊      | 75/194 [00:19<00:30,  3.91it/s]

Validation Loop 75
input - False, attention_mask - False


 39%|███▉      | 76/194 [00:19<00:30,  3.92it/s]

Validation Loop 76
input - False, attention_mask - False


 40%|███▉      | 77/194 [00:19<00:29,  3.95it/s]

Validation Loop 77
input - False, attention_mask - False


 40%|████      | 78/194 [00:19<00:29,  3.96it/s]

Validation Loop 78
input - False, attention_mask - False


 41%|████      | 79/194 [00:20<00:29,  3.95it/s]

Validation Loop 79
input - False, attention_mask - False


 41%|████      | 80/194 [00:20<00:28,  3.97it/s]

Validation Loop 80
input - False, attention_mask - False


 42%|████▏     | 81/194 [00:20<00:28,  3.95it/s]

Validation Loop 81
input - False, attention_mask - False


 42%|████▏     | 82/194 [00:20<00:28,  3.97it/s]

Validation Loop 82
input - False, attention_mask - False


 43%|████▎     | 83/194 [00:21<00:28,  3.96it/s]

Validation Loop 83
input - False, attention_mask - False


 43%|████▎     | 84/194 [00:21<00:27,  3.96it/s]

Validation Loop 84
input - False, attention_mask - False


 44%|████▍     | 85/194 [00:21<00:27,  3.99it/s]

Validation Loop 85
input - False, attention_mask - False


 44%|████▍     | 86/194 [00:21<00:27,  3.97it/s]

Validation Loop 86
input - False, attention_mask - False


 45%|████▍     | 87/194 [00:22<00:26,  3.98it/s]

Validation Loop 87
input - False, attention_mask - False


 45%|████▌     | 88/194 [00:22<00:26,  3.96it/s]

Validation Loop 88
input - False, attention_mask - False


 46%|████▌     | 89/194 [00:22<00:26,  3.99it/s]

Validation Loop 89
input - False, attention_mask - False


 46%|████▋     | 90/194 [00:22<00:26,  3.98it/s]

Validation Loop 90
input - False, attention_mask - False


 47%|████▋     | 91/194 [00:23<00:25,  3.96it/s]

Validation Loop 91
input - False, attention_mask - False


 47%|████▋     | 92/194 [00:23<00:25,  3.97it/s]

Validation Loop 92
input - False, attention_mask - False


 48%|████▊     | 93/194 [00:23<00:25,  3.93it/s]

Validation Loop 93
input - False, attention_mask - False


 48%|████▊     | 94/194 [00:23<00:25,  3.95it/s]

Validation Loop 94
input - False, attention_mask - False


 49%|████▉     | 95/194 [00:24<00:24,  3.98it/s]

Validation Loop 95
input - False, attention_mask - False


 49%|████▉     | 96/194 [00:24<00:24,  3.99it/s]

Validation Loop 96
input - False, attention_mask - False


 50%|█████     | 97/194 [00:24<00:24,  3.98it/s]

Validation Loop 97
input - False, attention_mask - False


 51%|█████     | 98/194 [00:24<00:24,  3.97it/s]

Validation Loop 98
input - False, attention_mask - False


 51%|█████     | 99/194 [00:25<00:23,  3.98it/s]

Validation Loop 99
input - False, attention_mask - False


 52%|█████▏    | 100/194 [00:25<00:23,  4.00it/s]

Validation Loop 100
input - False, attention_mask - False


 52%|█████▏    | 101/194 [00:25<00:23,  3.99it/s]

Validation Loop 101
input - False, attention_mask - False


 53%|█████▎    | 102/194 [00:25<00:23,  3.99it/s]

Validation Loop 102
input - False, attention_mask - False


 53%|█████▎    | 103/194 [00:26<00:22,  4.00it/s]

Validation Loop 103
input - False, attention_mask - False


 54%|█████▎    | 104/194 [00:26<00:22,  3.97it/s]

Validation Loop 104
input - False, attention_mask - False


 54%|█████▍    | 105/194 [00:26<00:22,  3.95it/s]

Validation Loop 105
input - False, attention_mask - False


 55%|█████▍    | 106/194 [00:26<00:22,  3.98it/s]

Validation Loop 106
input - False, attention_mask - False


 55%|█████▌    | 107/194 [00:27<00:21,  4.00it/s]

Validation Loop 107
input - False, attention_mask - False


 56%|█████▌    | 108/194 [00:27<00:21,  3.99it/s]

Validation Loop 108
input - False, attention_mask - False


 56%|█████▌    | 109/194 [00:27<00:21,  4.00it/s]

Validation Loop 109
input - False, attention_mask - False


 57%|█████▋    | 110/194 [00:27<00:21,  4.00it/s]

Validation Loop 110
input - False, attention_mask - False


 57%|█████▋    | 111/194 [00:28<00:20,  3.99it/s]

Validation Loop 111
input - False, attention_mask - False


 58%|█████▊    | 112/194 [00:28<00:20,  3.99it/s]

Validation Loop 112
input - False, attention_mask - False


 58%|█████▊    | 113/194 [00:28<00:20,  3.99it/s]

Validation Loop 113
input - False, attention_mask - False


 59%|█████▉    | 114/194 [00:28<00:19,  4.00it/s]

Validation Loop 114
input - False, attention_mask - False


 59%|█████▉    | 115/194 [00:29<00:19,  3.99it/s]

Validation Loop 115
input - False, attention_mask - False


 60%|█████▉    | 116/194 [00:29<00:19,  4.00it/s]

Validation Loop 116
input - False, attention_mask - False


 60%|██████    | 117/194 [00:29<00:19,  3.99it/s]

Validation Loop 117
input - False, attention_mask - False


 61%|██████    | 118/194 [00:29<00:19,  3.99it/s]

Validation Loop 118
input - False, attention_mask - False


 61%|██████▏   | 119/194 [00:30<00:18,  4.01it/s]

Validation Loop 119
input - False, attention_mask - False


 62%|██████▏   | 120/194 [00:30<00:18,  3.99it/s]

Validation Loop 120
input - False, attention_mask - False


 62%|██████▏   | 121/194 [00:30<00:18,  3.97it/s]

Validation Loop 121
input - False, attention_mask - False


 63%|██████▎   | 122/194 [00:30<00:18,  3.95it/s]

Validation Loop 122
input - False, attention_mask - False


 63%|██████▎   | 123/194 [00:31<00:17,  3.95it/s]

Validation Loop 123
input - False, attention_mask - False


 64%|██████▍   | 124/194 [00:31<00:17,  3.94it/s]

Validation Loop 124
input - False, attention_mask - False


 64%|██████▍   | 125/194 [00:31<00:17,  3.94it/s]

Validation Loop 125
input - False, attention_mask - False


 65%|██████▍   | 126/194 [00:31<00:17,  3.92it/s]

Validation Loop 126
input - False, attention_mask - False


 65%|██████▌   | 127/194 [00:32<00:17,  3.92it/s]

Validation Loop 127
input - False, attention_mask - False


 66%|██████▌   | 128/194 [00:32<00:16,  3.90it/s]

Validation Loop 128
input - False, attention_mask - False


 66%|██████▋   | 129/194 [00:32<00:16,  3.87it/s]

Validation Loop 129
input - False, attention_mask - False


 67%|██████▋   | 130/194 [00:32<00:16,  3.88it/s]

Validation Loop 130
input - False, attention_mask - False


 68%|██████▊   | 131/194 [00:33<00:16,  3.89it/s]

Validation Loop 131
input - False, attention_mask - False


 68%|██████▊   | 132/194 [00:33<00:15,  3.90it/s]

Validation Loop 132
input - False, attention_mask - False


 69%|██████▊   | 133/194 [00:33<00:15,  3.91it/s]

Validation Loop 133
input - False, attention_mask - False


 69%|██████▉   | 134/194 [00:33<00:15,  3.90it/s]

Validation Loop 134
input - False, attention_mask - False


 70%|██████▉   | 135/194 [00:34<00:14,  3.93it/s]

Validation Loop 135
input - False, attention_mask - False


 70%|███████   | 136/194 [00:34<00:14,  3.93it/s]

Validation Loop 136
input - False, attention_mask - False


 71%|███████   | 137/194 [00:34<00:14,  3.95it/s]

Validation Loop 137
input - False, attention_mask - False


 71%|███████   | 138/194 [00:34<00:14,  3.95it/s]

Validation Loop 138
input - False, attention_mask - False


 72%|███████▏  | 139/194 [00:35<00:14,  3.91it/s]

Validation Loop 139
input - False, attention_mask - False


 72%|███████▏  | 140/194 [00:35<00:13,  3.95it/s]

Validation Loop 140
input - False, attention_mask - False


 73%|███████▎  | 141/194 [00:35<00:13,  3.98it/s]

Validation Loop 141
input - False, attention_mask - False


 73%|███████▎  | 142/194 [00:35<00:13,  3.96it/s]

Validation Loop 142
input - False, attention_mask - False


 74%|███████▎  | 143/194 [00:36<00:12,  3.98it/s]

Validation Loop 143
input - False, attention_mask - False


 74%|███████▍  | 144/194 [00:36<00:12,  3.99it/s]

Validation Loop 144
input - False, attention_mask - False


 75%|███████▍  | 145/194 [00:36<00:12,  3.96it/s]

Validation Loop 145
input - False, attention_mask - False


 75%|███████▌  | 146/194 [00:36<00:12,  3.97it/s]

Validation Loop 146
input - False, attention_mask - False


 76%|███████▌  | 147/194 [00:37<00:11,  3.96it/s]

Validation Loop 147
input - False, attention_mask - False


 76%|███████▋  | 148/194 [00:37<00:11,  3.94it/s]

Validation Loop 148
input - False, attention_mask - False


 77%|███████▋  | 149/194 [00:37<00:11,  3.90it/s]

Validation Loop 149
input - False, attention_mask - False


 77%|███████▋  | 150/194 [00:37<00:11,  3.94it/s]

Validation Loop 150
input - False, attention_mask - False


 78%|███████▊  | 151/194 [00:38<00:10,  3.95it/s]

Validation Loop 151
input - False, attention_mask - False


 78%|███████▊  | 152/194 [00:38<00:10,  3.92it/s]

Validation Loop 152
input - False, attention_mask - False


 79%|███████▉  | 153/194 [00:38<00:10,  3.93it/s]

Validation Loop 153
input - False, attention_mask - False


 79%|███████▉  | 154/194 [00:38<00:10,  3.92it/s]

Validation Loop 154
input - False, attention_mask - False


 80%|███████▉  | 155/194 [00:39<00:09,  3.93it/s]

Validation Loop 155
input - False, attention_mask - False


 80%|████████  | 156/194 [00:39<00:09,  3.92it/s]

Validation Loop 156
input - False, attention_mask - False


 81%|████████  | 157/194 [00:39<00:09,  3.92it/s]

Validation Loop 157
input - False, attention_mask - False


 81%|████████▏ | 158/194 [00:39<00:09,  3.95it/s]

Validation Loop 158
input - False, attention_mask - False


 82%|████████▏ | 159/194 [00:40<00:08,  3.96it/s]

Validation Loop 159
input - False, attention_mask - False


 82%|████████▏ | 160/194 [00:40<00:08,  3.97it/s]

Validation Loop 160
input - False, attention_mask - False


 83%|████████▎ | 161/194 [00:40<00:08,  3.96it/s]

Validation Loop 161
input - False, attention_mask - False


 84%|████████▎ | 162/194 [00:40<00:08,  3.98it/s]

Validation Loop 162
input - False, attention_mask - False


 84%|████████▍ | 163/194 [00:41<00:07,  3.93it/s]

Validation Loop 163
input - False, attention_mask - False


 85%|████████▍ | 164/194 [00:41<00:07,  3.93it/s]

Validation Loop 164
input - False, attention_mask - False


 85%|████████▌ | 165/194 [00:41<00:07,  3.87it/s]

Validation Loop 165
input - False, attention_mask - False


 86%|████████▌ | 166/194 [00:42<00:07,  3.92it/s]

Validation Loop 166
input - False, attention_mask - False


 86%|████████▌ | 167/194 [00:42<00:06,  3.95it/s]

Validation Loop 167
input - False, attention_mask - False


 87%|████████▋ | 168/194 [00:42<00:06,  3.94it/s]

Validation Loop 168
input - False, attention_mask - False


 87%|████████▋ | 169/194 [00:42<00:06,  3.95it/s]

Validation Loop 169
input - False, attention_mask - False


 88%|████████▊ | 170/194 [00:43<00:06,  3.99it/s]

Validation Loop 170
input - False, attention_mask - False


 88%|████████▊ | 171/194 [00:43<00:05,  4.02it/s]

Validation Loop 171
input - False, attention_mask - False


 89%|████████▊ | 172/194 [00:43<00:05,  3.99it/s]

Validation Loop 172
input - False, attention_mask - False


 89%|████████▉ | 173/194 [00:43<00:05,  3.97it/s]

Validation Loop 173
input - False, attention_mask - False


 90%|████████▉ | 174/194 [00:44<00:05,  3.98it/s]

Validation Loop 174
input - False, attention_mask - False


 90%|█████████ | 175/194 [00:44<00:04,  3.99it/s]

Validation Loop 175
input - False, attention_mask - False


 91%|█████████ | 176/194 [00:44<00:04,  3.97it/s]

Validation Loop 176
input - False, attention_mask - False


 91%|█████████ | 177/194 [00:44<00:04,  3.99it/s]

Validation Loop 177
input - False, attention_mask - False


 92%|█████████▏| 178/194 [00:45<00:04,  3.97it/s]

Validation Loop 178
input - False, attention_mask - False


 92%|█████████▏| 179/194 [00:45<00:03,  3.99it/s]

Validation Loop 179
input - False, attention_mask - False


 93%|█████████▎| 180/194 [00:45<00:03,  3.99it/s]

Validation Loop 180
input - False, attention_mask - False


 93%|█████████▎| 181/194 [00:45<00:03,  3.99it/s]

Validation Loop 181
input - False, attention_mask - False


 94%|█████████▍| 182/194 [00:46<00:02,  4.01it/s]

Validation Loop 182
input - False, attention_mask - False


 94%|█████████▍| 183/194 [00:46<00:02,  4.00it/s]

Validation Loop 183
input - False, attention_mask - False


 95%|█████████▍| 184/194 [00:46<00:02,  3.99it/s]

Validation Loop 184
input - False, attention_mask - False


 95%|█████████▌| 185/194 [00:46<00:02,  4.00it/s]

Validation Loop 185
input - False, attention_mask - False


 96%|█████████▌| 186/194 [00:47<00:01,  4.00it/s]

Validation Loop 186
input - False, attention_mask - False


 96%|█████████▋| 187/194 [00:47<00:01,  3.99it/s]

Validation Loop 187
input - False, attention_mask - False


 97%|█████████▋| 188/194 [00:47<00:01,  4.00it/s]

Validation Loop 188
input - False, attention_mask - False


 97%|█████████▋| 189/194 [00:47<00:01,  3.98it/s]

Validation Loop 189
input - False, attention_mask - False


 98%|█████████▊| 190/194 [00:48<00:01,  4.00it/s]

Validation Loop 190
input - False, attention_mask - False


 98%|█████████▊| 191/194 [00:48<00:00,  3.99it/s]

Validation Loop 191
input - False, attention_mask - False


 99%|█████████▉| 192/194 [00:48<00:00,  4.00it/s]

Validation Loop 192
input - False, attention_mask - False


 99%|█████████▉| 193/194 [00:48<00:00,  3.97it/s]

Validation Loop 193
input - False, attention_mask - False


100%|██████████| 194/194 [00:49<00:00,  3.95it/s]

[{'tp': 0, 'tn': 1552, 'fp': 0, 'fn': 0}, {'tp': 818, 'tn': 374, 'fp': 143, 'fn': 217}, {'tp': 155, 'tn': 1367, 'fp': 5, 'fn': 25}, {'tp': 183, 'tn': 1014, 'fp': 248, 'fn': 107}]
Detailed accuracy after 7 epoch:
unanswerable accuarcy: 1.0
extractive accuarcy: 0.7680412371134021
yes_no accuarcy: 0.9806701030927835
abstractive accuarcy: 0.7712628865979382
Overall accuarcy: 0.879993556701031
Best accuarcy: 0.899645618556701





In [29]:
# Sample question/context pair consumed by the tokenisation cell below.
question, context = "What is my name?", "Hello my name name is Preetam."

print("Loading Model...")

# NOTE(review): torch.load unpickles the file, so MODEL_PATH must point to a
# trusted checkpoint; presumably it holds the object saved during training —
# confirm against the cell that writes MODEL_PATH.
classification_model = torch.load(MODEL_PATH)

Loading Model...


In [32]:
# Encode the (question, context) pair as one sequence pair, truncated and
# padded to exactly `max_token_length` tokens and returned as PyTorch tensors.
# The tokenizer is called directly instead of through `encode_plus`, which the
# transformers documentation marks as deprecated in favour of `__call__`; the
# arguments and the returned encoding are the same.
tokens = tokenizer(
    question,
    context,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=max_token_length,
    return_attention_mask=True,
    return_tensors='pt',
)

# Keep the token ids and the attention mask, each collapsed to a 1-D tensor by
# `.flatten()`.
data = {
    'input_ids': tokens.input_ids.flatten(),
    'attention_mask': tokens.attention_mask.flatten(),
}

In [38]:
# Pull both encoded tensors out of `data` and move them onto `device` as
# int64 (torch.long) in a single step each.
input_ids = data['input_ids'].to(device, dtype=torch.long)
attention_mask = data['attention_mask'].to(device, dtype=torch.long)

# NOTE(review): the forward pass below is still disabled. `data` was built with
# `.flatten()`, so these tensors look 1-D; the model presumably needs a leading
# batch dimension (e.g. `.unsqueeze(0)`) before this call can be enabled — confirm.
# logits = bert_model(
#               input_ids = input_ids,
#               attention_mask = attention_mask,
#           )