In [1]:
import torch
import random
import numpy as np
import os
from tqdm import tqdm, trange
# torch.cuda.empty_cache()
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.optimization import BertAdam

In [2]:
from run_classifier import ColaProcessor, MrpcProcessor, logger, convert_examples_to_features,\
    set_optimizer_params_grad, copy_optimizer_params_to_model, accuracy, p_r_f1, tp_pcount_gcount

In [3]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    
    print('There are %d GPU(s) available.' % n_gpu)

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1050 Ti


In [4]:
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from pytorch_pretrained_bert.modeling import BertForSequenceClassification


06/23/2020 19:11:49 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.
06/23/2020 19:11:51 - INFO - transformers.file_utils -   TensorFlow version 2.1.0 available.


In [5]:
# import logging
# logging.basicConfig(level=logging.INFO)

In [6]:
# def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
#                    output_dir=None, max_seq_length=128, do_train=False, do_eval=False, do_lower_case=False,
#                    train_batch_size=32, eval_batch_size=8, learning_rate=5e-5, num_train_epochs=3,
#                    warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
#                    optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

05/22/2020 15:44:36 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\arsen\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [8]:
# Prepare model 
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 2)
model.to(device)

# model = BertModel.from_pretrained('bert-base-uncased')

05/21/2020 12:42:52 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\arsen\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
05/21/2020 12:42:52 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\arsen\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir C:\Users\arsen\AppData\Local\Temp\tmpd0kymw7h
05/21/2020 12:42:57 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_lay

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [9]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [14]:
data_dir = "D:/Projects/Stance/Dataset/BertForOppositeClassification/"
# data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
data_dir_output = "D:/Projects/Stance/Models/dataExpantion/"
output_dir=data_dir_output
max_seq_length=32
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
# warmup_proportion = 0.1
train_batch_size=32
eval_batch_size=8
learning_rate=5e-5
num_train_epochs=3
local_rank=-1
seed=42
gradient_accumulation_steps=1
loss_scale=128
train_batch_size = int(train_batch_size / gradient_accumulation_steps)

processors = {
        "mrpc": MrpcProcessor,
    }

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
    
os.makedirs(output_dir, exist_ok=True)
processor = processors['mrpc']()
label_list = processor.get_labels()

train_examples = processor.get_train_examples(data_dir)
num_train_steps = int(
    len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

##preprare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
t_total = num_train_steps
optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)
# optimizer = AdamW(optimizer_grouped_parameters,
#                   lr = learning_rate, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
#                   correct_bias=False
#                 )

# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)  # PyTorch scheduler

05/21/2020 13:28:59 - INFO - run_classifier -   LOOKING AT D:/Projects/Stance/Dataset/BertForOppositeClassification/train.tsv


In [15]:
global_step = 0
train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

05/21/2020 13:29:01 - INFO - run_classifier -   *** Example ***
05/21/2020 13:29:01 - INFO - run_classifier -   guid: train-1
05/21/2020 13:29:01 - INFO - run_classifier -   tokens: [CLS] male infant ci ##rc ##um ##cision is tan ##tam ##ount to child abuse [SEP] parents have the right to use their best judgment , in the light of medical advice [SEP]
05/21/2020 13:29:01 - INFO - run_classifier -   input_ids: 101 3287 10527 25022 11890 2819 28472 2003 9092 15464 21723 2000 2775 6905 102 3008 2031 1996 2157 2000 2224 2037 2190 8689 1010 1999 1996 2422 1997 2966 6040 102
05/21/2020 13:29:01 - INFO - run_classifier -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
05/21/2020 13:29:01 - INFO - run_classifier -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
05/21/2020 13:29:01 - INFO - run_classifier -   label: 0 (id = 0)
05/21/2020 13:29:01 - INFO - run_classifier -   *** Example ***
05/21/2020 13:29:01 - INFO - run_classifier -  

In [16]:
model.train()
for _ in trange(int(num_train_epochs), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss = model(input_ids, segment_ids, input_mask, label_ids)
#         cos_loss = model()
#         cos_loss = torch.nn.CosineEmbeddingLoss()
        
        print(loss)
        print(loss.item())
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
#         if fp16 and loss_scale != 1.0:
#             # rescale loss for fp16 training
#             # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
#             loss = loss * loss_scale
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        loss.backward()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % gradient_accumulation_steps == 0:
#             if fp16 or optimize_on_cpu:
#                 if fp16 and loss_scale != 1.0:
#                     # scale down gradients for fp16 training
#                     for param in model.parameters():
#                         if param.grad is not None:
#                             param.grad.data = param.grad.data / loss_scale           
#                 is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
#                 if is_nan:
#                     logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
#                     loss_scale = loss_scale / 2
#                     model.zero_grad()
#                     continue 
#                 optimizer.step()
# #                 scheduler.step()
#                 copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
#             else:
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
#                 scheduler.step()
            model.zero_grad()
            global_step += 1

torch.save(model.state_dict(), output_dir + "dataExpantion.pth")

Epoch:   0%|                                                                                     | 0/3 [00:00<?, ?it/s]
Iteration:   0%|                                                                               | 0/415 [00:00<?, ?it/s][A

tensor(0.6337, device='cuda:0', grad_fn=<NllLossBackward>)
0.6337074041366577



Iteration:   0%|▏                                                                      | 1/415 [00:00<05:00,  1.38it/s][A

tensor(0.5101, device='cuda:0', grad_fn=<NllLossBackward>)
0.510123074054718



Iteration:   0%|▎                                                                      | 2/415 [00:01<04:55,  1.40it/s][A

tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
0.5014696717262268



Iteration:   1%|▌                                                                      | 3/415 [00:02<04:51,  1.41it/s][A

tensor(0.4727, device='cuda:0', grad_fn=<NllLossBackward>)
0.47266536951065063



Iteration:   1%|▋                                                                      | 4/415 [00:02<04:46,  1.43it/s][A

tensor(0.6011, device='cuda:0', grad_fn=<NllLossBackward>)
0.6011034846305847



Iteration:   1%|▊                                                                      | 5/415 [00:03<04:43,  1.45it/s][A

tensor(0.7056, device='cuda:0', grad_fn=<NllLossBackward>)
0.7056450247764587



Iteration:   1%|█                                                                      | 6/415 [00:04<04:40,  1.46it/s][A

tensor(0.4018, device='cuda:0', grad_fn=<NllLossBackward>)
0.40181851387023926



Iteration:   2%|█▏                                                                     | 7/415 [00:04<04:39,  1.46it/s][A

tensor(0.6254, device='cuda:0', grad_fn=<NllLossBackward>)
0.6253997087478638



Iteration:   2%|█▎                                                                     | 8/415 [00:05<04:38,  1.46it/s][A

tensor(0.8554, device='cuda:0', grad_fn=<NllLossBackward>)
0.8554226160049438



Iteration:   2%|█▌                                                                     | 9/415 [00:06<04:37,  1.46it/s][A

tensor(0.7775, device='cuda:0', grad_fn=<NllLossBackward>)
0.777462899684906



Iteration:   2%|█▋                                                                    | 10/415 [00:06<04:36,  1.46it/s][A

tensor(0.6358, device='cuda:0', grad_fn=<NllLossBackward>)
0.6358155608177185



Iteration:   3%|█▊                                                                    | 11/415 [00:07<04:35,  1.47it/s][A

tensor(0.7916, device='cuda:0', grad_fn=<NllLossBackward>)
0.7915506362915039



Iteration:   3%|██                                                                    | 12/415 [00:08<04:33,  1.47it/s][A

tensor(0.4837, device='cuda:0', grad_fn=<NllLossBackward>)
0.4836887717247009



Iteration:   3%|██▏                                                                   | 13/415 [00:08<04:32,  1.47it/s][A

tensor(0.5440, device='cuda:0', grad_fn=<NllLossBackward>)
0.5439602732658386



Iteration:   3%|██▎                                                                   | 14/415 [00:09<04:32,  1.47it/s][A

tensor(0.6595, device='cuda:0', grad_fn=<NllLossBackward>)
0.659456193447113



Iteration:   4%|██▌                                                                   | 15/415 [00:10<04:32,  1.47it/s][A

tensor(0.5432, device='cuda:0', grad_fn=<NllLossBackward>)
0.5431581139564514



Iteration:   4%|██▋                                                                   | 16/415 [00:10<04:30,  1.47it/s][A

tensor(0.6099, device='cuda:0', grad_fn=<NllLossBackward>)
0.6098856329917908



Iteration:   4%|██▊                                                                   | 17/415 [00:11<04:29,  1.48it/s][A

tensor(0.6813, device='cuda:0', grad_fn=<NllLossBackward>)
0.6813104748725891



Iteration:   4%|███                                                                   | 18/415 [00:12<04:29,  1.47it/s][A

tensor(0.6437, device='cuda:0', grad_fn=<NllLossBackward>)
0.6436854600906372



Iteration:   5%|███▏                                                                  | 19/415 [00:12<04:29,  1.47it/s][A

tensor(0.6956, device='cuda:0', grad_fn=<NllLossBackward>)
0.6956433653831482



Iteration:   5%|███▎                                                                  | 20/415 [00:13<04:27,  1.47it/s][A

tensor(0.5016, device='cuda:0', grad_fn=<NllLossBackward>)
0.5015546083450317



Iteration:   5%|███▌                                                                  | 21/415 [00:14<04:27,  1.48it/s][A

tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
0.7050790190696716



Iteration:   5%|███▋                                                                  | 22/415 [00:15<04:26,  1.47it/s][A

tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward>)
0.6705963015556335



Iteration:   6%|███▉                                                                  | 23/415 [00:15<04:26,  1.47it/s][A

tensor(0.8736, device='cuda:0', grad_fn=<NllLossBackward>)
0.8735988140106201



Iteration:   6%|████                                                                  | 24/415 [00:16<04:25,  1.47it/s][A

tensor(0.6557, device='cuda:0', grad_fn=<NllLossBackward>)
0.6557289361953735



Iteration:   6%|████▏                                                                 | 25/415 [00:17<04:24,  1.47it/s][A

tensor(0.7505, device='cuda:0', grad_fn=<NllLossBackward>)
0.7505316734313965



Iteration:   6%|████▍                                                                 | 26/415 [00:17<04:25,  1.47it/s][A

tensor(0.5298, device='cuda:0', grad_fn=<NllLossBackward>)
0.529839813709259



Iteration:   7%|████▌                                                                 | 27/415 [00:18<04:24,  1.47it/s][A

tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
0.5749586820602417



Iteration:   7%|████▋                                                                 | 28/415 [00:19<04:23,  1.47it/s][A

tensor(0.6300, device='cuda:0', grad_fn=<NllLossBackward>)
0.6299853920936584



Iteration:   7%|████▉                                                                 | 29/415 [00:19<04:25,  1.45it/s][A

tensor(0.7330, device='cuda:0', grad_fn=<NllLossBackward>)
0.7329940795898438



Iteration:   7%|█████                                                                 | 30/415 [00:20<04:24,  1.46it/s][A

tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward>)
0.5764061808586121



Iteration:   7%|█████▏                                                                | 31/415 [00:21<04:23,  1.46it/s][A

tensor(0.6241, device='cuda:0', grad_fn=<NllLossBackward>)
0.6240524649620056



Iteration:   8%|█████▍                                                                | 32/415 [00:21<04:22,  1.46it/s][A

tensor(0.6070, device='cuda:0', grad_fn=<NllLossBackward>)
0.6070200204849243



Iteration:   8%|█████▌                                                                | 33/415 [00:22<04:20,  1.46it/s][A

tensor(0.5687, device='cuda:0', grad_fn=<NllLossBackward>)
0.5686730742454529



Iteration:   8%|█████▋                                                                | 34/415 [00:23<04:20,  1.46it/s][A

tensor(0.5919, device='cuda:0', grad_fn=<NllLossBackward>)
0.5919309258460999



Iteration:   8%|█████▉                                                                | 35/415 [00:23<04:20,  1.46it/s][A

tensor(0.5815, device='cuda:0', grad_fn=<NllLossBackward>)
0.5814608931541443



Iteration:   9%|██████                                                                | 36/415 [00:24<04:20,  1.46it/s][A

tensor(0.6256, device='cuda:0', grad_fn=<NllLossBackward>)
0.6256245970726013



Iteration:   9%|██████▏                                                               | 37/415 [00:25<04:19,  1.45it/s][A

tensor(0.5165, device='cuda:0', grad_fn=<NllLossBackward>)
0.5165102481842041



Iteration:   9%|██████▍                                                               | 38/415 [00:25<04:19,  1.45it/s][A

tensor(0.6375, device='cuda:0', grad_fn=<NllLossBackward>)
0.6375222206115723



Iteration:   9%|██████▌                                                               | 39/415 [00:26<04:18,  1.45it/s][A

tensor(0.6860, device='cuda:0', grad_fn=<NllLossBackward>)
0.6860039234161377



Iteration:  10%|██████▋                                                               | 40/415 [00:27<04:18,  1.45it/s][A

tensor(0.7428, device='cuda:0', grad_fn=<NllLossBackward>)
0.7428112030029297



Iteration:  10%|██████▉                                                               | 41/415 [00:28<04:18,  1.45it/s][A

tensor(0.6075, device='cuda:0', grad_fn=<NllLossBackward>)
0.6074999570846558



Iteration:  10%|███████                                                               | 42/415 [00:28<04:17,  1.45it/s][A

tensor(0.5765, device='cuda:0', grad_fn=<NllLossBackward>)
0.5765144228935242



Iteration:  10%|███████▎                                                              | 43/415 [00:29<04:19,  1.43it/s][A

tensor(0.6382, device='cuda:0', grad_fn=<NllLossBackward>)
0.6381612420082092



Iteration:  11%|███████▍                                                              | 44/415 [00:30<04:17,  1.44it/s][A

tensor(0.6434, device='cuda:0', grad_fn=<NllLossBackward>)
0.6434146165847778



Iteration:  11%|███████▌                                                              | 45/415 [00:30<04:18,  1.43it/s][A

tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
0.5743687152862549



Iteration:  11%|███████▊                                                              | 46/415 [00:31<04:18,  1.43it/s][A

tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)
0.6411269307136536



Iteration:  11%|███████▉                                                              | 47/415 [00:32<04:18,  1.42it/s][A

tensor(0.4849, device='cuda:0', grad_fn=<NllLossBackward>)
0.48491865396499634



Iteration:  12%|████████                                                              | 48/415 [00:32<04:18,  1.42it/s][A

tensor(0.4369, device='cuda:0', grad_fn=<NllLossBackward>)
0.4368695616722107



Iteration:  12%|████████▎                                                             | 49/415 [00:33<04:16,  1.42it/s][A

tensor(0.6930, device='cuda:0', grad_fn=<NllLossBackward>)
0.693020761013031



Iteration:  12%|████████▍                                                             | 50/415 [00:34<04:16,  1.42it/s][A

tensor(0.4375, device='cuda:0', grad_fn=<NllLossBackward>)
0.43748053908348083



Iteration:  12%|████████▌                                                             | 51/415 [00:35<04:16,  1.42it/s][A

tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
0.504252016544342



Iteration:  13%|████████▊                                                             | 52/415 [00:35<04:16,  1.41it/s][A

tensor(0.5870, device='cuda:0', grad_fn=<NllLossBackward>)
0.5869505405426025



Iteration:  13%|████████▉                                                             | 53/415 [00:36<04:16,  1.41it/s][A

tensor(0.5062, device='cuda:0', grad_fn=<NllLossBackward>)
0.5062167048454285



Iteration:  13%|█████████                                                             | 54/415 [00:37<04:15,  1.41it/s][A

tensor(0.6283, device='cuda:0', grad_fn=<NllLossBackward>)
0.6282557845115662



Iteration:  13%|█████████▎                                                            | 55/415 [00:37<04:14,  1.42it/s][A

tensor(0.5380, device='cuda:0', grad_fn=<NllLossBackward>)
0.5380182266235352



Iteration:  13%|█████████▍                                                            | 56/415 [00:38<04:15,  1.41it/s][A

tensor(0.6197, device='cuda:0', grad_fn=<NllLossBackward>)
0.6197277307510376



Iteration:  14%|█████████▌                                                            | 57/415 [00:39<04:14,  1.41it/s][A

tensor(0.6075, device='cuda:0', grad_fn=<NllLossBackward>)
0.6075024008750916



Iteration:  14%|█████████▊                                                            | 58/415 [00:40<04:13,  1.41it/s][A

tensor(0.5606, device='cuda:0', grad_fn=<NllLossBackward>)
0.5606284737586975



Iteration:  14%|█████████▉                                                            | 59/415 [00:40<04:12,  1.41it/s][A

tensor(0.5652, device='cuda:0', grad_fn=<NllLossBackward>)
0.5651935338973999



Iteration:  14%|██████████                                                            | 60/415 [00:41<04:11,  1.41it/s][A

tensor(0.6215, device='cuda:0', grad_fn=<NllLossBackward>)
0.6214739680290222



Iteration:  15%|██████████▎                                                           | 61/415 [00:42<04:11,  1.41it/s][A

tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
0.5207816362380981



Iteration:  15%|██████████▍                                                           | 62/415 [00:42<04:11,  1.41it/s][A

tensor(0.7322, device='cuda:0', grad_fn=<NllLossBackward>)
0.7322471737861633



Iteration:  15%|██████████▋                                                           | 63/415 [00:43<04:10,  1.40it/s][A

tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
0.439040869474411



Iteration:  15%|██████████▊                                                           | 64/415 [00:44<04:10,  1.40it/s][A

tensor(0.6748, device='cuda:0', grad_fn=<NllLossBackward>)
0.6748287081718445



Iteration:  16%|██████████▉                                                           | 65/415 [00:45<04:09,  1.40it/s][A

tensor(0.6639, device='cuda:0', grad_fn=<NllLossBackward>)
0.6639488935470581



Iteration:  16%|███████████▏                                                          | 66/415 [00:45<04:10,  1.39it/s][A

tensor(0.5669, device='cuda:0', grad_fn=<NllLossBackward>)
0.5669047832489014



Iteration:  16%|███████████▎                                                          | 67/415 [00:46<04:08,  1.40it/s][A

tensor(0.6380, device='cuda:0', grad_fn=<NllLossBackward>)
0.6380437016487122



Iteration:  16%|███████████▍                                                          | 68/415 [00:47<04:09,  1.39it/s][A

tensor(0.5948, device='cuda:0', grad_fn=<NllLossBackward>)
0.5947794318199158



Iteration:  17%|███████████▋                                                          | 69/415 [00:47<04:08,  1.39it/s][A

tensor(0.6610, device='cuda:0', grad_fn=<NllLossBackward>)
0.6610258221626282



Iteration:  17%|███████████▊                                                          | 70/415 [00:48<04:08,  1.39it/s][A

tensor(0.5221, device='cuda:0', grad_fn=<NllLossBackward>)
0.5220550298690796



Iteration:  17%|███████████▉                                                          | 71/415 [00:49<04:07,  1.39it/s][A

tensor(0.6407, device='cuda:0', grad_fn=<NllLossBackward>)
0.6407228112220764



Iteration:  17%|████████████▏                                                         | 72/415 [00:50<04:06,  1.39it/s][A

tensor(0.6464, device='cuda:0', grad_fn=<NllLossBackward>)
0.6464292407035828



Iteration:  18%|████████████▎                                                         | 73/415 [00:50<04:06,  1.39it/s][A

tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
0.5665736198425293



Iteration:  18%|████████████▍                                                         | 74/415 [00:51<04:05,  1.39it/s][A

tensor(0.7608, device='cuda:0', grad_fn=<NllLossBackward>)
0.7608274221420288



Iteration:  18%|████████████▋                                                         | 75/415 [00:52<04:05,  1.39it/s][A

tensor(0.5441, device='cuda:0', grad_fn=<NllLossBackward>)
0.5440833568572998



Iteration:  18%|████████████▊                                                         | 76/415 [00:52<04:05,  1.38it/s][A

tensor(0.6841, device='cuda:0', grad_fn=<NllLossBackward>)
0.6841050982475281



Iteration:  19%|████████████▉                                                         | 77/415 [00:53<04:03,  1.39it/s][A

tensor(0.6657, device='cuda:0', grad_fn=<NllLossBackward>)
0.6656519770622253



Iteration:  19%|█████████████▏                                                        | 78/415 [00:54<04:04,  1.38it/s][A

tensor(0.6071, device='cuda:0', grad_fn=<NllLossBackward>)
0.6071061491966248



Iteration:  19%|█████████████▎                                                        | 79/415 [00:55<04:02,  1.38it/s][A

tensor(0.6666, device='cuda:0', grad_fn=<NllLossBackward>)
0.6665903925895691



Iteration:  19%|█████████████▍                                                        | 80/415 [00:55<04:03,  1.38it/s][A

tensor(0.5652, device='cuda:0', grad_fn=<NllLossBackward>)
0.565199077129364



Iteration:  20%|█████████████▋                                                        | 81/415 [00:56<04:03,  1.37it/s][A

tensor(0.7825, device='cuda:0', grad_fn=<NllLossBackward>)
0.7825462818145752



Iteration:  20%|█████████████▊                                                        | 82/415 [00:57<04:02,  1.37it/s][A

tensor(0.6643, device='cuda:0', grad_fn=<NllLossBackward>)
0.664280891418457



Iteration:  20%|██████████████                                                        | 83/415 [00:58<04:02,  1.37it/s][A

tensor(0.6993, device='cuda:0', grad_fn=<NllLossBackward>)
0.6992642283439636



Iteration:  20%|██████████████▏                                                       | 84/415 [00:58<04:01,  1.37it/s][A

tensor(0.5700, device='cuda:0', grad_fn=<NllLossBackward>)
0.5699799060821533



Iteration:  20%|██████████████▎                                                       | 85/415 [00:59<04:01,  1.37it/s][A

tensor(0.6327, device='cuda:0', grad_fn=<NllLossBackward>)
0.6326504349708557



Iteration:  21%|██████████████▌                                                       | 86/415 [01:00<04:00,  1.37it/s][A

tensor(0.5216, device='cuda:0', grad_fn=<NllLossBackward>)
0.5216144323348999



Iteration:  21%|██████████████▋                                                       | 87/415 [01:01<04:01,  1.36it/s][A

tensor(0.6472, device='cuda:0', grad_fn=<NllLossBackward>)
0.6471977829933167



Iteration:  21%|██████████████▊                                                       | 88/415 [01:01<03:59,  1.36it/s][A

tensor(0.5602, device='cuda:0', grad_fn=<NllLossBackward>)
0.560196578502655



Iteration:  21%|███████████████                                                       | 89/415 [01:02<03:59,  1.36it/s][A

tensor(0.9002, device='cuda:0', grad_fn=<NllLossBackward>)
0.900231659412384



Iteration:  22%|███████████████▏                                                      | 90/415 [01:03<03:59,  1.35it/s][A

tensor(0.5337, device='cuda:0', grad_fn=<NllLossBackward>)
0.5337142944335938



Iteration:  22%|███████████████▎                                                      | 91/415 [01:03<03:57,  1.36it/s][A

tensor(0.5498, device='cuda:0', grad_fn=<NllLossBackward>)
0.5497826337814331



Iteration:  22%|███████████████▌                                                      | 92/415 [01:04<03:57,  1.36it/s][A

tensor(0.5220, device='cuda:0', grad_fn=<NllLossBackward>)
0.5219932794570923



Iteration:  22%|███████████████▋                                                      | 93/415 [01:05<03:57,  1.36it/s][A

tensor(0.5714, device='cuda:0', grad_fn=<NllLossBackward>)
0.5714299082756042



Iteration:  23%|███████████████▊                                                      | 94/415 [01:06<03:57,  1.35it/s][A

tensor(0.5719, device='cuda:0', grad_fn=<NllLossBackward>)
0.5719417333602905



Iteration:  23%|████████████████                                                      | 95/415 [01:06<03:55,  1.36it/s][A

tensor(0.6607, device='cuda:0', grad_fn=<NllLossBackward>)
0.6607035994529724



Iteration:  23%|████████████████▏                                                     | 96/415 [01:07<03:55,  1.35it/s][A

tensor(0.6589, device='cuda:0', grad_fn=<NllLossBackward>)
0.6588533520698547



Iteration:  23%|████████████████▎                                                     | 97/415 [01:08<03:55,  1.35it/s][A

tensor(0.4412, device='cuda:0', grad_fn=<NllLossBackward>)
0.44115495681762695



Iteration:  24%|████████████████▌                                                     | 98/415 [01:09<03:54,  1.35it/s][A

tensor(0.4699, device='cuda:0', grad_fn=<NllLossBackward>)
0.46992167830467224



Iteration:  24%|████████████████▋                                                     | 99/415 [01:09<03:53,  1.35it/s][A

tensor(0.6287, device='cuda:0', grad_fn=<NllLossBackward>)
0.6287389993667603



Iteration:  24%|████████████████▋                                                    | 100/415 [01:10<03:55,  1.34it/s][A

tensor(0.5475, device='cuda:0', grad_fn=<NllLossBackward>)
0.5474762320518494



Iteration:  24%|████████████████▊                                                    | 101/415 [01:11<03:54,  1.34it/s][A

tensor(0.5914, device='cuda:0', grad_fn=<NllLossBackward>)
0.5913994312286377



Iteration:  25%|████████████████▉                                                    | 102/415 [01:12<03:52,  1.34it/s][A

tensor(0.5943, device='cuda:0', grad_fn=<NllLossBackward>)
0.5942943096160889



Iteration:  25%|█████████████████▏                                                   | 103/415 [01:12<03:51,  1.35it/s][A

tensor(0.6301, device='cuda:0', grad_fn=<NllLossBackward>)
0.6301075220108032



Iteration:  25%|█████████████████▎                                                   | 104/415 [01:13<03:50,  1.35it/s][A

tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
0.5315815210342407



Iteration:  25%|█████████████████▍                                                   | 105/415 [01:14<03:49,  1.35it/s][A

tensor(0.6775, device='cuda:0', grad_fn=<NllLossBackward>)
0.6775198578834534



Iteration:  26%|█████████████████▌                                                   | 106/415 [01:15<03:48,  1.35it/s][A

tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
0.5315505266189575



Iteration:  26%|█████████████████▊                                                   | 107/415 [01:15<03:48,  1.35it/s][A

tensor(0.7266, device='cuda:0', grad_fn=<NllLossBackward>)
0.7266078591346741



Iteration:  26%|█████████████████▉                                                   | 108/415 [01:16<03:50,  1.33it/s][A

tensor(0.6928, device='cuda:0', grad_fn=<NllLossBackward>)
0.6927828788757324



Iteration:  26%|██████████████████                                                   | 109/415 [01:17<03:47,  1.34it/s][A

tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
0.629368245601654



Iteration:  27%|██████████████████▎                                                  | 110/415 [01:18<03:46,  1.35it/s][A

tensor(0.7138, device='cuda:0', grad_fn=<NllLossBackward>)
0.7138065099716187



Iteration:  27%|██████████████████▍                                                  | 111/415 [01:18<03:45,  1.35it/s][A

tensor(0.6510, device='cuda:0', grad_fn=<NllLossBackward>)
0.6510012149810791



Iteration:  27%|██████████████████▌                                                  | 112/415 [01:19<03:45,  1.34it/s][A

tensor(0.6174, device='cuda:0', grad_fn=<NllLossBackward>)
0.6174163818359375



Iteration:  27%|██████████████████▊                                                  | 113/415 [01:20<03:44,  1.34it/s][A

tensor(0.5456, device='cuda:0', grad_fn=<NllLossBackward>)
0.5456075072288513



Iteration:  27%|██████████████████▉                                                  | 114/415 [01:21<03:45,  1.33it/s][A

tensor(0.6970, device='cuda:0', grad_fn=<NllLossBackward>)
0.6970421075820923



Iteration:  28%|███████████████████                                                  | 115/415 [01:21<03:45,  1.33it/s][A

tensor(0.6454, device='cuda:0', grad_fn=<NllLossBackward>)
0.6453638672828674



Iteration:  28%|███████████████████▎                                                 | 116/415 [01:22<03:43,  1.34it/s][A

tensor(0.6991, device='cuda:0', grad_fn=<NllLossBackward>)
0.6991096138954163



Iteration:  28%|███████████████████▍                                                 | 117/415 [01:23<03:44,  1.33it/s][A

tensor(0.6552, device='cuda:0', grad_fn=<NllLossBackward>)
0.6551586985588074



Iteration:  28%|███████████████████▌                                                 | 118/415 [01:24<03:44,  1.32it/s][A

tensor(0.6954, device='cuda:0', grad_fn=<NllLossBackward>)
0.6954017877578735



Iteration:  29%|███████████████████▊                                                 | 119/415 [01:24<03:42,  1.33it/s][A

tensor(0.6090, device='cuda:0', grad_fn=<NllLossBackward>)
0.6090148091316223



Iteration:  29%|███████████████████▉                                                 | 120/415 [01:25<03:43,  1.32it/s][A

tensor(0.6243, device='cuda:0', grad_fn=<NllLossBackward>)
0.6243338584899902



Iteration:  29%|████████████████████                                                 | 121/415 [01:26<03:40,  1.33it/s][A

tensor(0.6393, device='cuda:0', grad_fn=<NllLossBackward>)
0.6392866373062134



Iteration:  29%|████████████████████▎                                                | 122/415 [01:27<03:40,  1.33it/s][A

tensor(0.6109, device='cuda:0', grad_fn=<NllLossBackward>)
0.6109173893928528



Iteration:  30%|████████████████████▍                                                | 123/415 [01:27<03:39,  1.33it/s][A

tensor(0.6487, device='cuda:0', grad_fn=<NllLossBackward>)
0.6486791968345642



Iteration:  30%|████████████████████▌                                                | 124/415 [01:28<03:39,  1.33it/s][A

tensor(0.6138, device='cuda:0', grad_fn=<NllLossBackward>)
0.6138100028038025



Iteration:  30%|████████████████████▊                                                | 125/415 [01:29<03:39,  1.32it/s][A

tensor(0.6325, device='cuda:0', grad_fn=<NllLossBackward>)
0.6324594616889954



Iteration:  30%|████████████████████▉                                                | 126/415 [01:30<03:40,  1.31it/s][A

tensor(0.6332, device='cuda:0', grad_fn=<NllLossBackward>)
0.633206844329834



Iteration:  31%|█████████████████████                                                | 127/415 [01:30<03:40,  1.31it/s][A

tensor(0.6137, device='cuda:0', grad_fn=<NllLossBackward>)
0.6137104034423828



Iteration:  31%|█████████████████████▎                                               | 128/415 [01:31<03:39,  1.31it/s][A

tensor(0.5743, device='cuda:0', grad_fn=<NllLossBackward>)
0.574270486831665



Iteration:  31%|█████████████████████▍                                               | 129/415 [01:32<03:39,  1.30it/s][A

tensor(0.6171, device='cuda:0', grad_fn=<NllLossBackward>)
0.6170555949211121



Iteration:  31%|█████████████████████▌                                               | 130/415 [01:33<03:37,  1.31it/s][A

tensor(0.5229, device='cuda:0', grad_fn=<NllLossBackward>)
0.522920548915863



Iteration:  32%|█████████████████████▊                                               | 131/415 [01:33<03:36,  1.31it/s][A

tensor(0.6454, device='cuda:0', grad_fn=<NllLossBackward>)
0.6453837156295776



Iteration:  32%|█████████████████████▉                                               | 132/415 [01:34<03:34,  1.32it/s][A

tensor(0.5498, device='cuda:0', grad_fn=<NllLossBackward>)
0.5498349070549011



Iteration:  32%|██████████████████████                                               | 133/415 [01:35<03:32,  1.33it/s][A

tensor(0.4850, device='cuda:0', grad_fn=<NllLossBackward>)
0.48502811789512634



Iteration:  32%|██████████████████████▎                                              | 134/415 [01:36<03:32,  1.32it/s][A

tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward>)
0.6705590486526489



Iteration:  33%|██████████████████████▍                                              | 135/415 [01:36<03:32,  1.32it/s][A

tensor(0.6172, device='cuda:0', grad_fn=<NllLossBackward>)
0.6172212958335876



Iteration:  33%|██████████████████████▌                                              | 136/415 [01:37<03:31,  1.32it/s][A

tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
0.5619010925292969



Iteration:  33%|██████████████████████▊                                              | 137/415 [01:38<03:31,  1.32it/s][A

tensor(0.6900, device='cuda:0', grad_fn=<NllLossBackward>)
0.690019965171814



Iteration:  33%|██████████████████████▉                                              | 138/415 [01:39<03:30,  1.32it/s][A

tensor(0.6252, device='cuda:0', grad_fn=<NllLossBackward>)
0.6251594424247742



Iteration:  33%|███████████████████████                                              | 139/415 [01:40<03:29,  1.32it/s][A

tensor(0.5617, device='cuda:0', grad_fn=<NllLossBackward>)
0.5616574287414551



Iteration:  34%|███████████████████████▎                                             | 140/415 [01:40<03:29,  1.31it/s][A

tensor(0.7280, device='cuda:0', grad_fn=<NllLossBackward>)
0.727978527545929



Iteration:  34%|███████████████████████▍                                             | 141/415 [01:41<03:31,  1.29it/s][A

tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
0.5494831204414368



Iteration:  34%|███████████████████████▌                                             | 142/415 [01:42<03:31,  1.29it/s][A

tensor(0.7759, device='cuda:0', grad_fn=<NllLossBackward>)
0.7758695483207703



Iteration:  34%|███████████████████████▊                                             | 143/415 [01:43<03:28,  1.31it/s][A

tensor(0.7006, device='cuda:0', grad_fn=<NllLossBackward>)
0.7005705833435059



Iteration:  35%|███████████████████████▉                                             | 144/415 [01:43<03:27,  1.31it/s][A

tensor(0.6266, device='cuda:0', grad_fn=<NllLossBackward>)
0.6266205906867981



Iteration:  35%|████████████████████████                                             | 145/415 [01:44<03:27,  1.30it/s][A

tensor(0.5268, device='cuda:0', grad_fn=<NllLossBackward>)
0.5267689228057861



Iteration:  35%|████████████████████████▎                                            | 146/415 [01:45<03:28,  1.29it/s][A

tensor(0.7252, device='cuda:0', grad_fn=<NllLossBackward>)
0.7251625657081604



Iteration:  35%|████████████████████████▍                                            | 147/415 [01:46<03:25,  1.31it/s][A

tensor(0.6080, device='cuda:0', grad_fn=<NllLossBackward>)
0.6079938411712646



Iteration:  36%|████████████████████████▌                                            | 148/415 [01:46<03:25,  1.30it/s][A

tensor(0.6428, device='cuda:0', grad_fn=<NllLossBackward>)
0.6427654027938843



Iteration:  36%|████████████████████████▊                                            | 149/415 [01:47<03:28,  1.28it/s][A

tensor(0.6658, device='cuda:0', grad_fn=<NllLossBackward>)
0.6658035516738892



Iteration:  36%|████████████████████████▉                                            | 150/415 [01:48<03:26,  1.28it/s][A

tensor(0.5794, device='cuda:0', grad_fn=<NllLossBackward>)
0.5794121623039246



Iteration:  36%|█████████████████████████                                            | 151/415 [01:49<03:24,  1.29it/s][A

tensor(0.7941, device='cuda:0', grad_fn=<NllLossBackward>)
0.7941128611564636



Iteration:  37%|█████████████████████████▎                                           | 152/415 [01:50<03:21,  1.30it/s][A

tensor(0.5907, device='cuda:0', grad_fn=<NllLossBackward>)
0.5906844139099121



Iteration:  37%|█████████████████████████▍                                           | 153/415 [01:50<03:22,  1.30it/s][A

tensor(0.6558, device='cuda:0', grad_fn=<NllLossBackward>)
0.6558496952056885



Iteration:  37%|█████████████████████████▌                                           | 154/415 [01:51<03:22,  1.29it/s][A

tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)
0.5974425673484802



Iteration:  37%|█████████████████████████▊                                           | 155/415 [01:52<03:21,  1.29it/s][A

tensor(0.6247, device='cuda:0', grad_fn=<NllLossBackward>)
0.6246829032897949



Iteration:  38%|█████████████████████████▉                                           | 156/415 [01:53<03:21,  1.29it/s][A

tensor(0.6186, device='cuda:0', grad_fn=<NllLossBackward>)
0.618586540222168



Iteration:  38%|██████████████████████████                                           | 157/415 [01:53<03:20,  1.29it/s][A

tensor(0.6504, device='cuda:0', grad_fn=<NllLossBackward>)
0.6503832936286926



Iteration:  38%|██████████████████████████▎                                          | 158/415 [01:54<03:18,  1.29it/s][A

tensor(0.5717, device='cuda:0', grad_fn=<NllLossBackward>)
0.5716966986656189



Iteration:  38%|██████████████████████████▍                                          | 159/415 [01:55<03:18,  1.29it/s][A

tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward>)
0.6850849986076355



Iteration:  39%|██████████████████████████▌                                          | 160/415 [01:56<03:18,  1.29it/s][A

tensor(0.6856, device='cuda:0', grad_fn=<NllLossBackward>)
0.685563325881958



Iteration:  39%|██████████████████████████▊                                          | 161/415 [01:57<03:17,  1.28it/s][A

tensor(0.5879, device='cuda:0', grad_fn=<NllLossBackward>)
0.5878714323043823



Iteration:  39%|██████████████████████████▉                                          | 162/415 [01:57<03:14,  1.30it/s][A

tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)
0.6334707736968994



Iteration:  39%|███████████████████████████                                          | 163/415 [01:58<03:15,  1.29it/s][A

tensor(0.6126, device='cuda:0', grad_fn=<NllLossBackward>)
0.6126302480697632



Iteration:  40%|███████████████████████████▎                                         | 164/415 [01:59<03:14,  1.29it/s][A

tensor(0.6604, device='cuda:0', grad_fn=<NllLossBackward>)
0.6604409217834473



Iteration:  40%|███████████████████████████▍                                         | 165/415 [02:00<03:13,  1.29it/s][A

tensor(0.6474, device='cuda:0', grad_fn=<NllLossBackward>)
0.6474292278289795



Iteration:  40%|███████████████████████████▌                                         | 166/415 [02:00<03:12,  1.29it/s][A

tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
0.6589656472206116



Iteration:  40%|███████████████████████████▊                                         | 167/415 [02:01<03:10,  1.30it/s][A

tensor(0.7350, device='cuda:0', grad_fn=<NllLossBackward>)
0.7350467443466187



Iteration:  40%|███████████████████████████▉                                         | 168/415 [02:02<03:14,  1.27it/s][A

tensor(0.6571, device='cuda:0', grad_fn=<NllLossBackward>)
0.657094419002533



Iteration:  41%|████████████████████████████                                         | 169/415 [02:03<03:12,  1.28it/s][A

tensor(0.5896, device='cuda:0', grad_fn=<NllLossBackward>)
0.5896182656288147



Iteration:  41%|████████████████████████████▎                                        | 170/415 [02:04<03:10,  1.28it/s][A

tensor(0.5404, device='cuda:0', grad_fn=<NllLossBackward>)
0.5403685569763184



Iteration:  41%|████████████████████████████▍                                        | 171/415 [02:04<03:07,  1.30it/s][A

tensor(0.6689, device='cuda:0', grad_fn=<NllLossBackward>)
0.6688883900642395



Iteration:  41%|████████████████████████████▌                                        | 172/415 [02:05<03:05,  1.31it/s][A

tensor(0.7441, device='cuda:0', grad_fn=<NllLossBackward>)
0.7441281676292419



Iteration:  42%|████████████████████████████▊                                        | 173/415 [02:06<03:13,  1.25it/s][A

tensor(0.5091, device='cuda:0', grad_fn=<NllLossBackward>)
0.5091468691825867



Iteration:  42%|████████████████████████████▉                                        | 174/415 [02:07<03:09,  1.27it/s][A

tensor(0.5685, device='cuda:0', grad_fn=<NllLossBackward>)
0.5684645771980286



Iteration:  42%|█████████████████████████████                                        | 175/415 [02:08<03:17,  1.22it/s][A

tensor(0.6432, device='cuda:0', grad_fn=<NllLossBackward>)
0.6432260870933533



Iteration:  42%|█████████████████████████████▎                                       | 176/415 [02:08<03:10,  1.25it/s][A

tensor(0.6941, device='cuda:0', grad_fn=<NllLossBackward>)
0.6940600275993347



Iteration:  43%|█████████████████████████████▍                                       | 177/415 [02:09<03:06,  1.27it/s][A

tensor(0.6654, device='cuda:0', grad_fn=<NllLossBackward>)
0.6654196977615356



Iteration:  43%|█████████████████████████████▌                                       | 178/415 [02:10<03:04,  1.29it/s][A

tensor(0.6273, device='cuda:0', grad_fn=<NllLossBackward>)
0.6272600889205933



Iteration:  43%|█████████████████████████████▊                                       | 179/415 [02:11<03:01,  1.30it/s][A

tensor(0.6036, device='cuda:0', grad_fn=<NllLossBackward>)
0.6035821437835693



Iteration:  43%|█████████████████████████████▉                                       | 180/415 [02:11<03:03,  1.28it/s][A

tensor(0.5675, device='cuda:0', grad_fn=<NllLossBackward>)
0.5675076246261597



Iteration:  44%|██████████████████████████████                                       | 181/415 [02:12<03:02,  1.29it/s][A

tensor(0.6010, device='cuda:0', grad_fn=<NllLossBackward>)
0.6010092496871948



Iteration:  44%|██████████████████████████████▎                                      | 182/415 [02:13<02:59,  1.30it/s][A

tensor(0.5509, device='cuda:0', grad_fn=<NllLossBackward>)
0.5509160161018372



Iteration:  44%|██████████████████████████████▍                                      | 183/415 [02:14<02:58,  1.30it/s][A

tensor(0.6708, device='cuda:0', grad_fn=<NllLossBackward>)
0.6707642674446106



Iteration:  44%|██████████████████████████████▌                                      | 184/415 [02:14<02:57,  1.30it/s][A

tensor(0.6554, device='cuda:0', grad_fn=<NllLossBackward>)
0.6553729772567749



Iteration:  45%|██████████████████████████████▊                                      | 185/415 [02:15<02:56,  1.30it/s][A

tensor(0.6395, device='cuda:0', grad_fn=<NllLossBackward>)
0.6395299434661865



Iteration:  45%|██████████████████████████████▉                                      | 186/415 [02:16<02:55,  1.30it/s][A

tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
0.529492974281311



Iteration:  45%|███████████████████████████████                                      | 187/415 [02:17<02:53,  1.31it/s][A

tensor(0.6805, device='cuda:0', grad_fn=<NllLossBackward>)
0.6805354356765747



Iteration:  45%|███████████████████████████████▎                                     | 188/415 [02:18<02:56,  1.29it/s][A

tensor(0.5416, device='cuda:0', grad_fn=<NllLossBackward>)
0.5416362881660461



Iteration:  46%|███████████████████████████████▍                                     | 189/415 [02:18<02:56,  1.28it/s][A

tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)
0.6862668395042419



Iteration:  46%|███████████████████████████████▌                                     | 190/415 [02:19<02:53,  1.30it/s][A

tensor(0.7878, device='cuda:0', grad_fn=<NllLossBackward>)
0.7878302335739136



Iteration:  46%|███████████████████████████████▊                                     | 191/415 [02:20<02:51,  1.30it/s][A

tensor(0.6008, device='cuda:0', grad_fn=<NllLossBackward>)
0.6007820963859558



Iteration:  46%|███████████████████████████████▉                                     | 192/415 [02:21<02:52,  1.29it/s][A

tensor(0.7416, device='cuda:0', grad_fn=<NllLossBackward>)
0.7416141033172607



Iteration:  47%|████████████████████████████████                                     | 193/415 [02:21<02:53,  1.28it/s][A

tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward>)
0.6941852569580078



Iteration:  47%|████████████████████████████████▎                                    | 194/415 [02:22<02:52,  1.28it/s][A

tensor(0.8919, device='cuda:0', grad_fn=<NllLossBackward>)
0.8918898701667786



Iteration:  47%|████████████████████████████████▍                                    | 195/415 [02:23<02:49,  1.29it/s][A

tensor(0.7821, device='cuda:0', grad_fn=<NllLossBackward>)
0.7820528149604797



Iteration:  47%|████████████████████████████████▌                                    | 196/415 [02:24<02:47,  1.31it/s][A

tensor(0.7071, device='cuda:0', grad_fn=<NllLossBackward>)
0.7070528864860535



Iteration:  47%|████████████████████████████████▊                                    | 197/415 [02:25<02:47,  1.30it/s][A

tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward>)
0.7020620107650757



Iteration:  48%|████████████████████████████████▉                                    | 198/415 [02:25<02:48,  1.29it/s][A

tensor(0.6631, device='cuda:0', grad_fn=<NllLossBackward>)
0.6631147265434265



Iteration:  48%|█████████████████████████████████                                    | 199/415 [02:26<02:47,  1.29it/s][A

tensor(0.6993, device='cuda:0', grad_fn=<NllLossBackward>)
0.6993341445922852



Iteration:  48%|█████████████████████████████████▎                                   | 200/415 [02:27<02:45,  1.30it/s][A

tensor(0.6789, device='cuda:0', grad_fn=<NllLossBackward>)
0.6788933873176575



Iteration:  48%|█████████████████████████████████▍                                   | 201/415 [02:28<02:44,  1.30it/s][A

tensor(0.7067, device='cuda:0', grad_fn=<NllLossBackward>)
0.7067221999168396



Iteration:  49%|█████████████████████████████████▌                                   | 202/415 [02:28<02:44,  1.29it/s][A

tensor(0.6790, device='cuda:0', grad_fn=<NllLossBackward>)
0.6789524555206299



Iteration:  49%|█████████████████████████████████▊                                   | 203/415 [02:29<02:43,  1.30it/s][A

tensor(0.6674, device='cuda:0', grad_fn=<NllLossBackward>)
0.6673727035522461



Iteration:  49%|█████████████████████████████████▉                                   | 204/415 [02:30<02:41,  1.31it/s][A

tensor(0.6912, device='cuda:0', grad_fn=<NllLossBackward>)
0.69117671251297



Iteration:  49%|██████████████████████████████████                                   | 205/415 [02:31<02:42,  1.29it/s][A

tensor(0.7013, device='cuda:0', grad_fn=<NllLossBackward>)
0.7012903094291687



Iteration:  50%|██████████████████████████████████▎                                  | 206/415 [02:31<02:41,  1.30it/s][A

tensor(0.7213, device='cuda:0', grad_fn=<NllLossBackward>)
0.7212976813316345



Iteration:  50%|██████████████████████████████████▍                                  | 207/415 [02:32<02:41,  1.29it/s][A

tensor(0.6475, device='cuda:0', grad_fn=<NllLossBackward>)
0.6475226283073425



Iteration:  50%|██████████████████████████████████▌                                  | 208/415 [02:33<02:39,  1.30it/s][A

tensor(0.6914, device='cuda:0', grad_fn=<NllLossBackward>)
0.6914443373680115



Iteration:  50%|██████████████████████████████████▋                                  | 209/415 [02:34<02:39,  1.29it/s][A

tensor(0.7873, device='cuda:0', grad_fn=<NllLossBackward>)
0.7872668504714966



Iteration:  51%|██████████████████████████████████▉                                  | 210/415 [02:35<02:38,  1.29it/s][A

tensor(0.6898, device='cuda:0', grad_fn=<NllLossBackward>)
0.6898365020751953



Iteration:  51%|███████████████████████████████████                                  | 211/415 [02:35<02:38,  1.28it/s][A

tensor(0.6802, device='cuda:0', grad_fn=<NllLossBackward>)
0.6802243590354919



Iteration:  51%|███████████████████████████████████▏                                 | 212/415 [02:36<02:36,  1.29it/s][A

tensor(0.6849, device='cuda:0', grad_fn=<NllLossBackward>)
0.684893012046814



Iteration:  51%|███████████████████████████████████▍                                 | 213/415 [02:37<02:36,  1.29it/s][A

tensor(0.8236, device='cuda:0', grad_fn=<NllLossBackward>)
0.823560893535614



Iteration:  52%|███████████████████████████████████▌                                 | 214/415 [02:38<02:35,  1.30it/s][A

tensor(0.7708, device='cuda:0', grad_fn=<NllLossBackward>)
0.7708136439323425



Iteration:  52%|███████████████████████████████████▋                                 | 215/415 [02:38<02:34,  1.29it/s][A

tensor(0.6365, device='cuda:0', grad_fn=<NllLossBackward>)
0.6364603638648987



Iteration:  52%|███████████████████████████████████▉                                 | 216/415 [02:39<02:33,  1.30it/s][A

tensor(0.6999, device='cuda:0', grad_fn=<NllLossBackward>)
0.6998567581176758



Iteration:  52%|████████████████████████████████████                                 | 217/415 [02:40<02:32,  1.30it/s][A

tensor(0.6871, device='cuda:0', grad_fn=<NllLossBackward>)
0.6870827078819275



Iteration:  53%|████████████████████████████████████▏                                | 218/415 [02:41<02:31,  1.30it/s][A

tensor(0.6570, device='cuda:0', grad_fn=<NllLossBackward>)
0.6569822430610657



Iteration:  53%|████████████████████████████████████▍                                | 219/415 [02:42<02:31,  1.29it/s][A

tensor(0.7452, device='cuda:0', grad_fn=<NllLossBackward>)
0.7451972961425781



Iteration:  53%|████████████████████████████████████▌                                | 220/415 [02:42<02:31,  1.29it/s][A

tensor(0.6227, device='cuda:0', grad_fn=<NllLossBackward>)
0.622714102268219



Iteration:  53%|████████████████████████████████████▋                                | 221/415 [02:43<02:30,  1.29it/s][A

tensor(0.6465, device='cuda:0', grad_fn=<NllLossBackward>)
0.6465357542037964



Iteration:  53%|████████████████████████████████████▉                                | 222/415 [02:44<02:28,  1.30it/s][A

tensor(0.7625, device='cuda:0', grad_fn=<NllLossBackward>)
0.7625032067298889



Iteration:  54%|█████████████████████████████████████                                | 223/415 [02:45<02:29,  1.29it/s][A

tensor(0.6729, device='cuda:0', grad_fn=<NllLossBackward>)
0.6728993654251099



Iteration:  54%|█████████████████████████████████████▏                               | 224/415 [02:45<02:29,  1.28it/s][A

tensor(0.7414, device='cuda:0', grad_fn=<NllLossBackward>)
0.741418182849884



Iteration:  54%|█████████████████████████████████████▍                               | 225/415 [02:46<02:31,  1.26it/s][A

tensor(0.7755, device='cuda:0', grad_fn=<NllLossBackward>)
0.7754886746406555



Iteration:  54%|█████████████████████████████████████▌                               | 226/415 [02:47<02:28,  1.27it/s][A

tensor(0.6979, device='cuda:0', grad_fn=<NllLossBackward>)
0.6978744268417358



Iteration:  55%|█████████████████████████████████████▋                               | 227/415 [02:48<02:25,  1.29it/s][A

tensor(0.6658, device='cuda:0', grad_fn=<NllLossBackward>)
0.6657692790031433



Iteration:  55%|█████████████████████████████████████▉                               | 228/415 [02:49<02:25,  1.29it/s][A

tensor(0.6812, device='cuda:0', grad_fn=<NllLossBackward>)
0.6812082529067993



Iteration:  55%|██████████████████████████████████████                               | 229/415 [02:49<02:25,  1.28it/s][A

tensor(0.6225, device='cuda:0', grad_fn=<NllLossBackward>)
0.6224579811096191



Iteration:  55%|██████████████████████████████████████▏                              | 230/415 [02:50<02:23,  1.29it/s][A

tensor(0.7761, device='cuda:0', grad_fn=<NllLossBackward>)
0.7760635614395142



Iteration:  56%|██████████████████████████████████████▍                              | 231/415 [02:51<02:22,  1.29it/s][A

tensor(0.7041, device='cuda:0', grad_fn=<NllLossBackward>)
0.7040558457374573



Iteration:  56%|██████████████████████████████████████▌                              | 232/415 [02:52<02:21,  1.29it/s][A

tensor(0.6960, device='cuda:0', grad_fn=<NllLossBackward>)
0.6959534287452698



Iteration:  56%|██████████████████████████████████████▋                              | 233/415 [02:52<02:20,  1.29it/s][A

tensor(0.6532, device='cuda:0', grad_fn=<NllLossBackward>)
0.6531627178192139



Iteration:  56%|██████████████████████████████████████▉                              | 234/415 [02:53<02:21,  1.28it/s][A

tensor(0.7089, device='cuda:0', grad_fn=<NllLossBackward>)
0.7088636159896851



Iteration:  57%|███████████████████████████████████████                              | 235/415 [02:54<02:21,  1.27it/s][A

tensor(0.6719, device='cuda:0', grad_fn=<NllLossBackward>)
0.6719463467597961



Iteration:  57%|███████████████████████████████████████▏                             | 236/415 [02:55<02:21,  1.26it/s][A

tensor(0.6216, device='cuda:0', grad_fn=<NllLossBackward>)
0.6216381788253784



Iteration:  57%|███████████████████████████████████████▍                             | 237/415 [02:56<02:22,  1.25it/s][A

tensor(0.6693, device='cuda:0', grad_fn=<NllLossBackward>)
0.6692743897438049



Iteration:  57%|███████████████████████████████████████▌                             | 238/415 [02:56<02:19,  1.27it/s][A

tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)
0.6592056155204773



Iteration:  58%|███████████████████████████████████████▋                             | 239/415 [02:57<02:18,  1.27it/s][A

tensor(0.6498, device='cuda:0', grad_fn=<NllLossBackward>)
0.6498300433158875



Iteration:  58%|███████████████████████████████████████▉                             | 240/415 [02:58<02:16,  1.28it/s][A

tensor(0.5916, device='cuda:0', grad_fn=<NllLossBackward>)
0.5916428565979004



Iteration:  58%|████████████████████████████████████████                             | 241/415 [02:59<02:16,  1.27it/s][A

tensor(0.6035, device='cuda:0', grad_fn=<NllLossBackward>)
0.6035275459289551



Iteration:  58%|████████████████████████████████████████▏                            | 242/415 [03:00<02:15,  1.27it/s][A

tensor(0.6865, device='cuda:0', grad_fn=<NllLossBackward>)
0.6864571571350098



Iteration:  59%|████████████████████████████████████████▍                            | 243/415 [03:00<02:16,  1.26it/s][A

tensor(0.6399, device='cuda:0', grad_fn=<NllLossBackward>)
0.6399282813072205



Iteration:  59%|████████████████████████████████████████▌                            | 244/415 [03:01<02:15,  1.26it/s][A

tensor(0.7880, device='cuda:0', grad_fn=<NllLossBackward>)
0.7880235910415649



Iteration:  59%|████████████████████████████████████████▋                            | 245/415 [03:02<02:15,  1.25it/s][A

tensor(0.7247, device='cuda:0', grad_fn=<NllLossBackward>)
0.7246989011764526



Iteration:  59%|████████████████████████████████████████▉                            | 246/415 [03:03<02:14,  1.26it/s][A

tensor(0.5375, device='cuda:0', grad_fn=<NllLossBackward>)
0.5375244617462158



Iteration:  60%|█████████████████████████████████████████                            | 247/415 [03:04<02:17,  1.22it/s][A

tensor(0.7133, device='cuda:0', grad_fn=<NllLossBackward>)
0.7133315205574036



Iteration:  60%|█████████████████████████████████████████▏                           | 248/415 [03:04<02:14,  1.24it/s][A

tensor(0.6023, device='cuda:0', grad_fn=<NllLossBackward>)
0.6023271679878235



Iteration:  60%|█████████████████████████████████████████▍                           | 249/415 [03:05<02:12,  1.25it/s][A

tensor(0.5037, device='cuda:0', grad_fn=<NllLossBackward>)
0.503705620765686



Iteration:  60%|█████████████████████████████████████████▌                           | 250/415 [03:06<02:11,  1.26it/s][A

tensor(0.6187, device='cuda:0', grad_fn=<NllLossBackward>)
0.6186972260475159



Iteration:  60%|█████████████████████████████████████████▋                           | 251/415 [03:07<02:11,  1.25it/s][A

tensor(0.7117, device='cuda:0', grad_fn=<NllLossBackward>)
0.7117212414741516



Iteration:  61%|█████████████████████████████████████████▉                           | 252/415 [03:08<02:09,  1.25it/s][A

tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)
0.5708722472190857



Iteration:  61%|██████████████████████████████████████████                           | 253/415 [03:08<02:09,  1.25it/s][A

tensor(0.6484, device='cuda:0', grad_fn=<NllLossBackward>)
0.6483763456344604



Iteration:  61%|██████████████████████████████████████████▏                          | 254/415 [03:09<02:09,  1.24it/s][A

tensor(0.5282, device='cuda:0', grad_fn=<NllLossBackward>)
0.528240442276001



Iteration:  61%|██████████████████████████████████████████▍                          | 255/415 [03:10<02:08,  1.25it/s][A

tensor(0.5848, device='cuda:0', grad_fn=<NllLossBackward>)
0.5847558975219727



Iteration:  62%|██████████████████████████████████████████▌                          | 256/415 [03:11<02:06,  1.25it/s][A

tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
0.6711634397506714



Iteration:  62%|██████████████████████████████████████████▋                          | 257/415 [03:12<02:05,  1.26it/s][A

tensor(0.5965, device='cuda:0', grad_fn=<NllLossBackward>)
0.596505343914032



Iteration:  62%|██████████████████████████████████████████▉                          | 258/415 [03:12<02:03,  1.27it/s][A

tensor(0.5047, device='cuda:0', grad_fn=<NllLossBackward>)
0.5047129392623901



Iteration:  62%|███████████████████████████████████████████                          | 259/415 [03:13<02:02,  1.27it/s][A

tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward>)
0.740177571773529



Iteration:  63%|███████████████████████████████████████████▏                         | 260/415 [03:14<02:01,  1.27it/s][A

tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)
0.6153445839881897



Iteration:  63%|███████████████████████████████████████████▍                         | 261/415 [03:15<02:01,  1.26it/s][A

tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)
0.7122832536697388



Iteration:  63%|███████████████████████████████████████████▌                         | 262/415 [03:15<02:01,  1.26it/s][A

tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
0.5166721343994141



Iteration:  63%|███████████████████████████████████████████▋                         | 263/415 [03:16<02:00,  1.26it/s][A

tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward>)
0.516354501247406



Iteration:  64%|███████████████████████████████████████████▉                         | 264/415 [03:17<01:59,  1.26it/s][A

tensor(0.6191, device='cuda:0', grad_fn=<NllLossBackward>)
0.6191191673278809



Iteration:  64%|████████████████████████████████████████████                         | 265/415 [03:18<01:57,  1.28it/s][A

tensor(0.7136, device='cuda:0', grad_fn=<NllLossBackward>)
0.713591456413269



Iteration:  64%|████████████████████████████████████████████▏                        | 266/415 [03:19<01:56,  1.28it/s][A

tensor(0.5686, device='cuda:0', grad_fn=<NllLossBackward>)
0.5685718059539795



Iteration:  64%|████████████████████████████████████████████▍                        | 267/415 [03:19<01:58,  1.25it/s][A

tensor(0.4999, device='cuda:0', grad_fn=<NllLossBackward>)
0.49992457032203674



Iteration:  65%|████████████████████████████████████████████▌                        | 268/415 [03:20<01:58,  1.24it/s][A

tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
0.5941767692565918



Iteration:  65%|████████████████████████████████████████████▋                        | 269/415 [03:21<01:58,  1.24it/s][A

tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
0.6198249459266663



Iteration:  65%|████████████████████████████████████████████▉                        | 270/415 [03:22<01:59,  1.21it/s][A

tensor(0.5048, device='cuda:0', grad_fn=<NllLossBackward>)
0.5047534108161926



Iteration:  65%|█████████████████████████████████████████████                        | 271/415 [03:23<01:57,  1.23it/s][A

tensor(0.7201, device='cuda:0', grad_fn=<NllLossBackward>)
0.7201403379440308



Iteration:  66%|█████████████████████████████████████████████▏                       | 272/415 [03:24<01:54,  1.25it/s][A

tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward>)
0.5180529356002808



Iteration:  66%|█████████████████████████████████████████████▍                       | 273/415 [03:24<01:53,  1.25it/s][A

tensor(0.5388, device='cuda:0', grad_fn=<NllLossBackward>)
0.5388159155845642



Iteration:  66%|█████████████████████████████████████████████▌                       | 274/415 [03:25<01:52,  1.25it/s][A

tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
0.5907692909240723



Iteration:  66%|█████████████████████████████████████████████▋                       | 275/415 [03:26<01:52,  1.25it/s][A

tensor(0.6014, device='cuda:0', grad_fn=<NllLossBackward>)
0.6013689637184143



Iteration:  67%|█████████████████████████████████████████████▉                       | 276/415 [03:27<01:51,  1.25it/s][A

tensor(0.5013, device='cuda:0', grad_fn=<NllLossBackward>)
0.5013450980186462



Iteration:  67%|██████████████████████████████████████████████                       | 277/415 [03:27<01:49,  1.26it/s][A

tensor(0.5812, device='cuda:0', grad_fn=<NllLossBackward>)
0.5811595916748047



Iteration:  67%|██████████████████████████████████████████████▏                      | 278/415 [03:28<01:48,  1.27it/s][A

tensor(0.5275, device='cuda:0', grad_fn=<NllLossBackward>)
0.5274989604949951



Iteration:  67%|██████████████████████████████████████████████▍                      | 279/415 [03:29<01:48,  1.26it/s][A

tensor(0.4636, device='cuda:0', grad_fn=<NllLossBackward>)
0.46361246705055237



Iteration:  67%|██████████████████████████████████████████████▌                      | 280/415 [03:30<01:46,  1.26it/s][A

tensor(0.6345, device='cuda:0', grad_fn=<NllLossBackward>)
0.6344841122627258



Iteration:  68%|██████████████████████████████████████████████▋                      | 281/415 [03:31<01:46,  1.26it/s][A

tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)
0.6507320404052734



Iteration:  68%|██████████████████████████████████████████████▉                      | 282/415 [03:31<01:46,  1.24it/s][A

tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
0.5278266668319702



Iteration:  68%|███████████████████████████████████████████████                      | 283/415 [03:32<01:44,  1.26it/s][A

tensor(0.5457, device='cuda:0', grad_fn=<NllLossBackward>)
0.5457068085670471



Iteration:  68%|███████████████████████████████████████████████▏                     | 284/415 [03:33<01:43,  1.26it/s][A

tensor(0.5394, device='cuda:0', grad_fn=<NllLossBackward>)
0.539376437664032



Iteration:  69%|███████████████████████████████████████████████▍                     | 285/415 [03:34<01:47,  1.21it/s][A

tensor(0.5515, device='cuda:0', grad_fn=<NllLossBackward>)
0.5515282154083252



Iteration:  69%|███████████████████████████████████████████████▌                     | 286/415 [03:35<01:45,  1.23it/s][A

tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)
0.6871758103370667



Iteration:  69%|███████████████████████████████████████████████▋                     | 287/415 [03:36<01:43,  1.23it/s][A

tensor(0.7532, device='cuda:0', grad_fn=<NllLossBackward>)
0.7532311081886292



Iteration:  69%|███████████████████████████████████████████████▉                     | 288/415 [03:36<01:42,  1.24it/s][A

tensor(0.4605, device='cuda:0', grad_fn=<NllLossBackward>)
0.46046632528305054



Iteration:  70%|████████████████████████████████████████████████                     | 289/415 [03:37<01:40,  1.25it/s][A

tensor(0.4500, device='cuda:0', grad_fn=<NllLossBackward>)
0.45002761483192444



Iteration:  70%|████████████████████████████████████████████████▏                    | 290/415 [03:38<01:39,  1.26it/s][A

tensor(0.6119, device='cuda:0', grad_fn=<NllLossBackward>)
0.6118527054786682



Iteration:  70%|████████████████████████████████████████████████▍                    | 291/415 [03:39<01:38,  1.25it/s][A

tensor(0.5543, device='cuda:0', grad_fn=<NllLossBackward>)
0.5543127655982971



Iteration:  70%|████████████████████████████████████████████████▌                    | 292/415 [03:40<01:39,  1.24it/s][A

tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)
0.6153198480606079



Iteration:  71%|████████████████████████████████████████████████▋                    | 293/415 [03:40<01:37,  1.25it/s][A

tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
0.5853691697120667



Iteration:  71%|████████████████████████████████████████████████▉                    | 294/415 [03:41<01:36,  1.25it/s][A

tensor(0.5546, device='cuda:0', grad_fn=<NllLossBackward>)
0.5545986294746399



Iteration:  71%|█████████████████████████████████████████████████                    | 295/415 [03:42<01:35,  1.26it/s][A

tensor(0.6569, device='cuda:0', grad_fn=<NllLossBackward>)
0.6569214463233948



Iteration:  71%|█████████████████████████████████████████████████▏                   | 296/415 [03:43<01:34,  1.25it/s][A

tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
0.6294304132461548



Iteration:  72%|█████████████████████████████████████████████████▍                   | 297/415 [03:43<01:33,  1.26it/s][A

tensor(0.6214, device='cuda:0', grad_fn=<NllLossBackward>)
0.6214361786842346



Iteration:  72%|█████████████████████████████████████████████████▌                   | 298/415 [03:44<01:33,  1.26it/s][A

tensor(0.6421, device='cuda:0', grad_fn=<NllLossBackward>)
0.6421013474464417



Iteration:  72%|█████████████████████████████████████████████████▋                   | 299/415 [03:45<01:31,  1.26it/s][A

tensor(0.6428, device='cuda:0', grad_fn=<NllLossBackward>)
0.6427926421165466



Iteration:  72%|█████████████████████████████████████████████████▉                   | 300/415 [03:46<01:31,  1.25it/s][A

tensor(0.7118, device='cuda:0', grad_fn=<NllLossBackward>)
0.711772084236145



Iteration:  73%|██████████████████████████████████████████████████                   | 301/415 [03:47<01:30,  1.25it/s][A

tensor(0.5810, device='cuda:0', grad_fn=<NllLossBackward>)
0.5809599757194519



Iteration:  73%|██████████████████████████████████████████████████▏                  | 302/415 [03:47<01:29,  1.26it/s][A

tensor(0.6426, device='cuda:0', grad_fn=<NllLossBackward>)
0.642605185508728



Iteration:  73%|██████████████████████████████████████████████████▍                  | 303/415 [03:48<01:28,  1.26it/s][A

tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward>)
0.670625627040863



Iteration:  73%|██████████████████████████████████████████████████▌                  | 304/415 [03:49<01:28,  1.26it/s][A

tensor(0.6522, device='cuda:0', grad_fn=<NllLossBackward>)
0.6521903872489929



Iteration:  73%|██████████████████████████████████████████████████▋                  | 305/415 [03:50<01:26,  1.27it/s][A

tensor(0.5467, device='cuda:0', grad_fn=<NllLossBackward>)
0.5467011332511902



Iteration:  74%|██████████████████████████████████████████████████▉                  | 306/415 [03:51<01:25,  1.28it/s][A

tensor(0.6319, device='cuda:0', grad_fn=<NllLossBackward>)
0.6319196224212646



Iteration:  74%|███████████████████████████████████████████████████                  | 307/415 [03:51<01:24,  1.28it/s][A

tensor(0.6719, device='cuda:0', grad_fn=<NllLossBackward>)
0.6718549728393555



Iteration:  74%|███████████████████████████████████████████████████▏                 | 308/415 [03:52<01:23,  1.28it/s][A

tensor(0.6640, device='cuda:0', grad_fn=<NllLossBackward>)
0.6640291213989258



Iteration:  74%|███████████████████████████████████████████████████▍                 | 309/415 [03:53<01:23,  1.27it/s][A

tensor(0.6351, device='cuda:0', grad_fn=<NllLossBackward>)
0.6351032257080078



Iteration:  75%|███████████████████████████████████████████████████▌                 | 310/415 [03:54<01:23,  1.26it/s][A

tensor(0.5899, device='cuda:0', grad_fn=<NllLossBackward>)
0.5898791551589966



Iteration:  75%|███████████████████████████████████████████████████▋                 | 311/415 [03:55<01:22,  1.25it/s][A

tensor(0.7441, device='cuda:0', grad_fn=<NllLossBackward>)
0.7441427707672119



Iteration:  75%|███████████████████████████████████████████████████▊                 | 312/415 [03:55<01:21,  1.27it/s][A

tensor(0.6008, device='cuda:0', grad_fn=<NllLossBackward>)
0.6008371710777283



Iteration:  75%|████████████████████████████████████████████████████                 | 313/415 [03:56<01:21,  1.26it/s][A

tensor(0.6240, device='cuda:0', grad_fn=<NllLossBackward>)
0.6239625811576843



Iteration:  76%|████████████████████████████████████████████████████▏                | 314/415 [03:57<01:19,  1.26it/s][A

tensor(0.5823, device='cuda:0', grad_fn=<NllLossBackward>)
0.582331120967865



Iteration:  76%|████████████████████████████████████████████████████▎                | 315/415 [03:58<01:18,  1.27it/s][A

tensor(0.5116, device='cuda:0', grad_fn=<NllLossBackward>)
0.5116032958030701



Iteration:  76%|████████████████████████████████████████████████████▌                | 316/415 [03:58<01:17,  1.27it/s][A

tensor(0.6425, device='cuda:0', grad_fn=<NllLossBackward>)
0.6425086259841919



Iteration:  76%|████████████████████████████████████████████████████▋                | 317/415 [03:59<01:17,  1.26it/s][A

tensor(0.5359, device='cuda:0', grad_fn=<NllLossBackward>)
0.5358569622039795



Iteration:  77%|████████████████████████████████████████████████████▊                | 318/415 [04:00<01:16,  1.26it/s][A

tensor(0.4930, device='cuda:0', grad_fn=<NllLossBackward>)
0.4930199980735779



Iteration:  77%|█████████████████████████████████████████████████████                | 319/415 [04:01<01:16,  1.26it/s][A

tensor(0.6510, device='cuda:0', grad_fn=<NllLossBackward>)
0.6510103344917297



Iteration:  77%|█████████████████████████████████████████████████████▏               | 320/415 [04:02<01:18,  1.21it/s][A

tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
0.5002279281616211



Iteration:  77%|█████████████████████████████████████████████████████▎               | 321/415 [04:03<01:16,  1.22it/s][A

tensor(0.6163, device='cuda:0', grad_fn=<NllLossBackward>)
0.6163409948348999



Iteration:  78%|█████████████████████████████████████████████████████▌               | 322/415 [04:03<01:15,  1.23it/s][A

tensor(0.5639, device='cuda:0', grad_fn=<NllLossBackward>)
0.563928484916687



Iteration:  78%|█████████████████████████████████████████████████████▋               | 323/415 [04:04<01:14,  1.24it/s][A

tensor(0.4915, device='cuda:0', grad_fn=<NllLossBackward>)
0.491455078125



Iteration:  78%|█████████████████████████████████████████████████████▊               | 324/415 [04:05<01:12,  1.25it/s][A

tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
0.5528362989425659



Iteration:  78%|██████████████████████████████████████████████████████               | 325/415 [04:06<01:11,  1.25it/s][A

tensor(0.4783, device='cuda:0', grad_fn=<NllLossBackward>)
0.4783051013946533



Iteration:  79%|██████████████████████████████████████████████████████▏              | 326/415 [04:07<01:10,  1.26it/s][A

tensor(0.5653, device='cuda:0', grad_fn=<NllLossBackward>)
0.5652501583099365



Iteration:  79%|██████████████████████████████████████████████████████▎              | 327/415 [04:07<01:10,  1.24it/s][A

tensor(0.8145, device='cuda:0', grad_fn=<NllLossBackward>)
0.814483106136322



Iteration:  79%|██████████████████████████████████████████████████████▌              | 328/415 [04:08<01:09,  1.24it/s][A

tensor(0.5105, device='cuda:0', grad_fn=<NllLossBackward>)
0.5104966163635254



Iteration:  79%|██████████████████████████████████████████████████████▋              | 329/415 [04:09<01:09,  1.25it/s][A

tensor(0.5962, device='cuda:0', grad_fn=<NllLossBackward>)
0.5962395071983337



Iteration:  80%|██████████████████████████████████████████████████████▊              | 330/415 [04:10<01:08,  1.24it/s][A

tensor(0.5475, device='cuda:0', grad_fn=<NllLossBackward>)
0.5475099682807922



Iteration:  80%|███████████████████████████████████████████████████████              | 331/415 [04:11<01:07,  1.24it/s][A

tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
0.5166879296302795



Iteration:  80%|███████████████████████████████████████████████████████▏             | 332/415 [04:11<01:06,  1.24it/s][A

tensor(0.6879, device='cuda:0', grad_fn=<NllLossBackward>)
0.687930166721344



Iteration:  80%|███████████████████████████████████████████████████████▎             | 333/415 [04:12<01:05,  1.26it/s][A

tensor(0.6362, device='cuda:0', grad_fn=<NllLossBackward>)
0.6362346410751343



Iteration:  80%|███████████████████████████████████████████████████████▌             | 334/415 [04:13<01:04,  1.26it/s][A

tensor(0.4465, device='cuda:0', grad_fn=<NllLossBackward>)
0.44649285078048706



Iteration:  81%|███████████████████████████████████████████████████████▋             | 335/415 [04:14<01:04,  1.24it/s][A

tensor(0.5523, device='cuda:0', grad_fn=<NllLossBackward>)
0.5522741079330444



Iteration:  81%|███████████████████████████████████████████████████████▊             | 336/415 [04:15<01:03,  1.24it/s][A

tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)
0.5692302584648132



Iteration:  81%|████████████████████████████████████████████████████████             | 337/415 [04:15<01:03,  1.23it/s][A

tensor(0.5297, device='cuda:0', grad_fn=<NllLossBackward>)
0.5296773910522461



Iteration:  81%|████████████████████████████████████████████████████████▏            | 338/415 [04:16<01:01,  1.24it/s][A

tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
0.45803284645080566



Iteration:  82%|████████████████████████████████████████████████████████▎            | 339/415 [04:17<01:00,  1.25it/s][A

tensor(0.8089, device='cuda:0', grad_fn=<NllLossBackward>)
0.8088969588279724



Iteration:  82%|████████████████████████████████████████████████████████▌            | 340/415 [04:18<01:00,  1.25it/s][A

tensor(0.5443, device='cuda:0', grad_fn=<NllLossBackward>)
0.5442874431610107



Iteration:  82%|████████████████████████████████████████████████████████▋            | 341/415 [04:19<00:59,  1.25it/s][A

tensor(0.4969, device='cuda:0', grad_fn=<NllLossBackward>)
0.4969322681427002



Iteration:  82%|████████████████████████████████████████████████████████▊            | 342/415 [04:19<00:58,  1.24it/s][A

tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
0.60893714427948



Iteration:  83%|█████████████████████████████████████████████████████████            | 343/415 [04:20<00:58,  1.24it/s][A

tensor(0.3862, device='cuda:0', grad_fn=<NllLossBackward>)
0.3861986994743347



Iteration:  83%|█████████████████████████████████████████████████████████▏           | 344/415 [04:21<00:58,  1.21it/s][A

tensor(0.4754, device='cuda:0', grad_fn=<NllLossBackward>)
0.4753637909889221



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 345/415 [04:22<00:57,  1.23it/s][A

tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
0.5058498978614807



Iteration:  83%|█████████████████████████████████████████████████████████▌           | 346/415 [04:23<00:56,  1.23it/s][A

tensor(0.6442, device='cuda:0', grad_fn=<NllLossBackward>)
0.6442268490791321



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 347/415 [04:24<00:55,  1.23it/s][A

tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
0.41418227553367615



Iteration:  84%|█████████████████████████████████████████████████████████▊           | 348/415 [04:24<00:54,  1.22it/s][A

tensor(0.3495, device='cuda:0', grad_fn=<NllLossBackward>)
0.3495293855667114



Iteration:  84%|██████████████████████████████████████████████████████████           | 349/415 [04:25<00:53,  1.23it/s][A

tensor(0.4094, device='cuda:0', grad_fn=<NllLossBackward>)
0.409372478723526



Iteration:  84%|██████████████████████████████████████████████████████████▏          | 350/415 [04:26<00:52,  1.23it/s][A

tensor(0.6187, device='cuda:0', grad_fn=<NllLossBackward>)
0.6187002062797546



Iteration:  85%|██████████████████████████████████████████████████████████▎          | 351/415 [04:27<00:51,  1.23it/s][A

tensor(0.7620, device='cuda:0', grad_fn=<NllLossBackward>)
0.762047529220581



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 352/415 [04:28<00:51,  1.21it/s][A

tensor(0.5910, device='cuda:0', grad_fn=<NllLossBackward>)
0.5909921526908875



Iteration:  85%|██████████████████████████████████████████████████████████▋          | 353/415 [04:28<00:50,  1.22it/s][A

tensor(0.4628, device='cuda:0', grad_fn=<NllLossBackward>)
0.46281686425209045



Iteration:  85%|██████████████████████████████████████████████████████████▊          | 354/415 [04:29<00:50,  1.22it/s][A

tensor(0.5774, device='cuda:0', grad_fn=<NllLossBackward>)
0.5773854851722717



Iteration:  86%|███████████████████████████████████████████████████████████          | 355/415 [04:30<00:48,  1.23it/s][A

tensor(0.4986, device='cuda:0', grad_fn=<NllLossBackward>)
0.49864163994789124



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 356/415 [04:31<00:48,  1.23it/s][A

tensor(0.6295, device='cuda:0', grad_fn=<NllLossBackward>)
0.6295377016067505



Iteration:  86%|███████████████████████████████████████████████████████████▎         | 357/415 [04:32<00:47,  1.21it/s][A

tensor(0.6179, device='cuda:0', grad_fn=<NllLossBackward>)
0.6178581714630127



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 358/415 [04:33<00:47,  1.21it/s][A

tensor(0.6989, device='cuda:0', grad_fn=<NllLossBackward>)
0.6989302635192871



Iteration:  87%|███████████████████████████████████████████████████████████▋         | 359/415 [04:33<00:45,  1.22it/s][A

tensor(0.4199, device='cuda:0', grad_fn=<NllLossBackward>)
0.4199308454990387



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 360/415 [04:34<00:44,  1.22it/s][A

tensor(0.6763, device='cuda:0', grad_fn=<NllLossBackward>)
0.6762544512748718



Iteration:  87%|████████████████████████████████████████████████████████████         | 361/415 [04:35<00:43,  1.23it/s][A

tensor(0.5345, device='cuda:0', grad_fn=<NllLossBackward>)
0.534493625164032



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 362/415 [04:36<00:42,  1.23it/s][A

tensor(0.5852, device='cuda:0', grad_fn=<NllLossBackward>)
0.5852227807044983



Iteration:  87%|████████████████████████████████████████████████████████████▎        | 363/415 [04:37<00:42,  1.23it/s][A

tensor(0.5559, device='cuda:0', grad_fn=<NllLossBackward>)
0.5558909773826599



Iteration:  88%|████████████████████████████████████████████████████████████▌        | 364/415 [04:37<00:41,  1.23it/s][A

tensor(0.6635, device='cuda:0', grad_fn=<NllLossBackward>)
0.6634579300880432



Iteration:  88%|████████████████████████████████████████████████████████████▋        | 365/415 [04:38<00:40,  1.24it/s][A

tensor(0.5902, device='cuda:0', grad_fn=<NllLossBackward>)
0.5901721119880676



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 366/415 [04:39<00:39,  1.26it/s][A

tensor(0.6123, device='cuda:0', grad_fn=<NllLossBackward>)
0.6122596859931946



Iteration:  88%|█████████████████████████████████████████████████████████████        | 367/415 [04:40<00:38,  1.26it/s][A

tensor(0.5130, device='cuda:0', grad_fn=<NllLossBackward>)
0.5129886865615845



Iteration:  89%|█████████████████████████████████████████████████████████████▏       | 368/415 [04:41<00:37,  1.26it/s][A

tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
0.5033997297286987



Iteration:  89%|█████████████████████████████████████████████████████████████▎       | 369/415 [04:41<00:36,  1.25it/s][A

tensor(0.5421, device='cuda:0', grad_fn=<NllLossBackward>)
0.5421448945999146



Iteration:  89%|█████████████████████████████████████████████████████████████▌       | 370/415 [04:42<00:36,  1.24it/s][A

tensor(0.6525, device='cuda:0', grad_fn=<NllLossBackward>)
0.652463436126709



Iteration:  89%|█████████████████████████████████████████████████████████████▋       | 371/415 [04:43<00:35,  1.25it/s][A

tensor(0.5010, device='cuda:0', grad_fn=<NllLossBackward>)
0.5009690523147583



Iteration:  90%|█████████████████████████████████████████████████████████████▊       | 372/415 [04:44<00:35,  1.20it/s][A

tensor(0.6001, device='cuda:0', grad_fn=<NllLossBackward>)
0.6001258492469788



Iteration:  90%|██████████████████████████████████████████████████████████████       | 373/415 [04:45<00:34,  1.22it/s][A

tensor(0.5089, device='cuda:0', grad_fn=<NllLossBackward>)
0.5088844895362854



Iteration:  90%|██████████████████████████████████████████████████████████████▏      | 374/415 [04:45<00:33,  1.24it/s][A

tensor(0.5289, device='cuda:0', grad_fn=<NllLossBackward>)
0.5289093852043152



Iteration:  90%|██████████████████████████████████████████████████████████████▎      | 375/415 [04:46<00:32,  1.22it/s][A

tensor(0.6352, device='cuda:0', grad_fn=<NllLossBackward>)
0.6351513862609863



Iteration:  91%|██████████████████████████████████████████████████████████████▌      | 376/415 [04:47<00:31,  1.23it/s][A

tensor(0.5961, device='cuda:0', grad_fn=<NllLossBackward>)
0.5961228609085083



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 377/415 [04:48<00:30,  1.24it/s][A

tensor(0.7141, device='cuda:0', grad_fn=<NllLossBackward>)
0.714129626750946



Iteration:  91%|██████████████████████████████████████████████████████████████▊      | 378/415 [04:49<00:31,  1.18it/s][A

tensor(0.4922, device='cuda:0', grad_fn=<NllLossBackward>)
0.492213636636734



Iteration:  91%|███████████████████████████████████████████████████████████████      | 379/415 [04:50<00:30,  1.20it/s][A

tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
0.4952968657016754



Iteration:  92%|███████████████████████████████████████████████████████████████▏     | 380/415 [04:50<00:28,  1.21it/s][A

tensor(0.3289, device='cuda:0', grad_fn=<NllLossBackward>)
0.3288861811161041



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 381/415 [04:51<00:27,  1.22it/s][A

tensor(0.6215, device='cuda:0', grad_fn=<NllLossBackward>)
0.6215073466300964



Iteration:  92%|███████████████████████████████████████████████████████████████▌     | 382/415 [04:52<00:26,  1.24it/s][A

tensor(0.6617, device='cuda:0', grad_fn=<NllLossBackward>)
0.6616847515106201



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 383/415 [04:53<00:26,  1.23it/s][A

tensor(0.5430, device='cuda:0', grad_fn=<NllLossBackward>)
0.5430331826210022



Iteration:  93%|███████████████████████████████████████████████████████████████▊     | 384/415 [04:54<00:25,  1.23it/s][A

tensor(0.6059, device='cuda:0', grad_fn=<NllLossBackward>)
0.605924129486084



Iteration:  93%|████████████████████████████████████████████████████████████████     | 385/415 [04:54<00:24,  1.23it/s][A

tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
0.5486602783203125



Iteration:  93%|████████████████████████████████████████████████████████████████▏    | 386/415 [04:55<00:23,  1.23it/s][A

tensor(0.6014, device='cuda:0', grad_fn=<NllLossBackward>)
0.6013914346694946



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 387/415 [04:56<00:22,  1.22it/s][A

tensor(0.6341, device='cuda:0', grad_fn=<NllLossBackward>)
0.6340692639350891



Iteration:  93%|████████████████████████████████████████████████████████████████▌    | 388/415 [04:57<00:22,  1.22it/s][A

tensor(0.5035, device='cuda:0', grad_fn=<NllLossBackward>)
0.5035005807876587



Iteration:  94%|████████████████████████████████████████████████████████████████▋    | 389/415 [04:58<00:21,  1.23it/s][A

tensor(0.5681, device='cuda:0', grad_fn=<NllLossBackward>)
0.5681218504905701



Iteration:  94%|████████████████████████████████████████████████████████████████▊    | 390/415 [04:59<00:20,  1.22it/s][A

tensor(0.3820, device='cuda:0', grad_fn=<NllLossBackward>)
0.38203340768814087



Iteration:  94%|█████████████████████████████████████████████████████████████████    | 391/415 [04:59<00:19,  1.23it/s][A

tensor(0.5088, device='cuda:0', grad_fn=<NllLossBackward>)
0.5088497996330261



Iteration:  94%|█████████████████████████████████████████████████████████████████▏   | 392/415 [05:00<00:18,  1.22it/s][A

tensor(0.4620, device='cuda:0', grad_fn=<NllLossBackward>)
0.4619673788547516



Iteration:  95%|█████████████████████████████████████████████████████████████████▎   | 393/415 [05:01<00:17,  1.24it/s][A

tensor(0.5498, device='cuda:0', grad_fn=<NllLossBackward>)
0.5497567653656006



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 394/415 [05:02<00:17,  1.23it/s][A

tensor(0.5742, device='cuda:0', grad_fn=<NllLossBackward>)
0.5742073059082031



Iteration:  95%|█████████████████████████████████████████████████████████████████▋   | 395/415 [05:03<00:17,  1.17it/s][A

tensor(0.4552, device='cuda:0', grad_fn=<NllLossBackward>)
0.45520728826522827



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 396/415 [05:04<00:15,  1.20it/s][A

tensor(0.4375, device='cuda:0', grad_fn=<NllLossBackward>)
0.4375426173210144



Iteration:  96%|██████████████████████████████████████████████████████████████████   | 397/415 [05:04<00:15,  1.16it/s][A

tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)
0.7123208045959473



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 398/415 [05:05<00:14,  1.14it/s][A

tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward>)
0.4110037088394165



Iteration:  96%|██████████████████████████████████████████████████████████████████▎  | 399/415 [05:06<00:13,  1.18it/s][A

tensor(0.5414, device='cuda:0', grad_fn=<NllLossBackward>)
0.5413681268692017



Iteration:  96%|██████████████████████████████████████████████████████████████████▌  | 400/415 [05:07<00:12,  1.20it/s][A

tensor(0.5176, device='cuda:0', grad_fn=<NllLossBackward>)
0.5176204442977905



Iteration:  97%|██████████████████████████████████████████████████████████████████▋  | 401/415 [05:08<00:11,  1.20it/s][A

tensor(0.4374, device='cuda:0', grad_fn=<NllLossBackward>)
0.43736374378204346



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 402/415 [05:09<00:10,  1.22it/s][A

tensor(0.7069, device='cuda:0', grad_fn=<NllLossBackward>)
0.7068583369255066



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 403/415 [05:09<00:09,  1.22it/s][A

tensor(0.4700, device='cuda:0', grad_fn=<NllLossBackward>)
0.47004154324531555



Iteration:  97%|███████████████████████████████████████████████████████████████████▏ | 404/415 [05:10<00:09,  1.21it/s][A

tensor(0.5917, device='cuda:0', grad_fn=<NllLossBackward>)
0.5917205810546875



Iteration:  98%|███████████████████████████████████████████████████████████████████▎ | 405/415 [05:11<00:08,  1.22it/s][A

tensor(0.4802, device='cuda:0', grad_fn=<NllLossBackward>)
0.48023486137390137



Iteration:  98%|███████████████████████████████████████████████████████████████████▌ | 406/415 [05:12<00:07,  1.22it/s][A

tensor(0.6242, device='cuda:0', grad_fn=<NllLossBackward>)
0.6242204904556274



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 407/415 [05:13<00:06,  1.22it/s][A

tensor(0.4860, device='cuda:0', grad_fn=<NllLossBackward>)
0.48602116107940674



Iteration:  98%|███████████████████████████████████████████████████████████████████▊ | 408/415 [05:13<00:05,  1.23it/s][A

tensor(0.6393, device='cuda:0', grad_fn=<NllLossBackward>)
0.6393458843231201



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 409/415 [05:14<00:04,  1.24it/s][A

tensor(0.4484, device='cuda:0', grad_fn=<NllLossBackward>)
0.4484022557735443



Iteration:  99%|████████████████████████████████████████████████████████████████████▏| 410/415 [05:15<00:04,  1.23it/s][A

tensor(0.4924, device='cuda:0', grad_fn=<NllLossBackward>)
0.4923780858516693



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 411/415 [05:16<00:03,  1.23it/s][A

tensor(0.6606, device='cuda:0', grad_fn=<NllLossBackward>)
0.6606042385101318



Iteration:  99%|████████████████████████████████████████████████████████████████████▌| 412/415 [05:17<00:02,  1.24it/s][A

tensor(0.3648, device='cuda:0', grad_fn=<NllLossBackward>)
0.3648446798324585



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 413/415 [05:18<00:01,  1.23it/s][A

tensor(0.6278, device='cuda:0', grad_fn=<NllLossBackward>)
0.6277946829795837



Iteration: 100%|████████████████████████████████████████████████████████████████████▊| 414/415 [05:18<00:00,  1.22it/s][A

tensor(0.3916, device='cuda:0', grad_fn=<NllLossBackward>)
0.39162248373031616



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 415/415 [05:19<00:00,  1.30it/s][A
Epoch:  33%|█████████████████████████▎                                                  | 1/3 [05:19<10:39, 319.66s/it]
Iteration:   0%|                                                                               | 0/415 [00:00<?, ?it/s][A

tensor(0.4280, device='cuda:0', grad_fn=<NllLossBackward>)
0.4279995858669281



Iteration:   0%|▏                                                                      | 1/415 [00:00<05:38,  1.22it/s][A

tensor(0.3683, device='cuda:0', grad_fn=<NllLossBackward>)
0.3683375120162964



Iteration:   0%|▎                                                                      | 2/415 [00:01<05:33,  1.24it/s][A

tensor(0.4769, device='cuda:0', grad_fn=<NllLossBackward>)
0.4768674969673157



Iteration:   1%|▌                                                                      | 3/415 [00:02<05:31,  1.24it/s][A

tensor(0.4287, device='cuda:0', grad_fn=<NllLossBackward>)
0.42867714166641235



Iteration:   1%|▋                                                                      | 4/415 [00:03<05:30,  1.24it/s][A

tensor(0.3470, device='cuda:0', grad_fn=<NllLossBackward>)
0.34703969955444336



Iteration:   1%|▊                                                                      | 5/415 [00:04<05:29,  1.25it/s][A

tensor(0.5401, device='cuda:0', grad_fn=<NllLossBackward>)
0.5400656461715698



Iteration:   1%|█                                                                      | 6/415 [00:04<05:31,  1.24it/s][A

tensor(0.3856, device='cuda:0', grad_fn=<NllLossBackward>)
0.3856147825717926



Iteration:   2%|█▏                                                                     | 7/415 [00:05<05:28,  1.24it/s][A

tensor(0.7707, device='cuda:0', grad_fn=<NllLossBackward>)
0.7706527709960938



Iteration:   2%|█▎                                                                     | 8/415 [00:06<05:30,  1.23it/s][A

tensor(0.4683, device='cuda:0', grad_fn=<NllLossBackward>)
0.4683014154434204



Iteration:   2%|█▌                                                                     | 9/415 [00:07<05:28,  1.23it/s][A

tensor(0.7791, device='cuda:0', grad_fn=<NllLossBackward>)
0.7790766358375549



Iteration:   2%|█▋                                                                    | 10/415 [00:08<05:34,  1.21it/s][A

tensor(0.3254, device='cuda:0', grad_fn=<NllLossBackward>)
0.3253612816333771



Iteration:   3%|█▊                                                                    | 11/415 [00:08<05:29,  1.23it/s][A

tensor(0.5365, device='cuda:0', grad_fn=<NllLossBackward>)
0.5364638566970825



Iteration:   3%|██                                                                    | 12/415 [00:09<05:33,  1.21it/s][A

tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
0.3880704343318939



Iteration:   3%|██▏                                                                   | 13/415 [00:10<05:32,  1.21it/s][A

tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward>)
0.7055040597915649



Iteration:   3%|██▎                                                                   | 14/415 [00:11<05:28,  1.22it/s][A

tensor(0.4659, device='cuda:0', grad_fn=<NllLossBackward>)
0.46592044830322266



Iteration:   4%|██▌                                                                   | 15/415 [00:12<05:24,  1.23it/s][A

tensor(0.3937, device='cuda:0', grad_fn=<NllLossBackward>)
0.39373552799224854



Iteration:   4%|██▋                                                                   | 16/415 [00:13<05:24,  1.23it/s][A

tensor(0.4059, device='cuda:0', grad_fn=<NllLossBackward>)
0.40593400597572327



Iteration:   4%|██▊                                                                   | 17/415 [00:13<05:26,  1.22it/s][A

tensor(0.3795, device='cuda:0', grad_fn=<NllLossBackward>)
0.3794578015804291



Iteration:   4%|███                                                                   | 18/415 [00:14<05:20,  1.24it/s][A

tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
0.5251287221908569



Iteration:   5%|███▏                                                                  | 19/415 [00:15<05:18,  1.24it/s][A

tensor(0.4051, device='cuda:0', grad_fn=<NllLossBackward>)
0.4051288366317749



Iteration:   5%|███▎                                                                  | 20/415 [00:16<05:20,  1.23it/s][A

tensor(0.4413, device='cuda:0', grad_fn=<NllLossBackward>)
0.4412630796432495



Iteration:   5%|███▌                                                                  | 21/415 [00:17<05:17,  1.24it/s][A

tensor(0.4838, device='cuda:0', grad_fn=<NllLossBackward>)
0.4838288128376007



Iteration:   5%|███▋                                                                  | 22/415 [00:17<05:20,  1.23it/s][A

tensor(0.3871, device='cuda:0', grad_fn=<NllLossBackward>)
0.38714492321014404



Iteration:   6%|███▉                                                                  | 23/415 [00:18<05:20,  1.22it/s][A

tensor(0.4384, device='cuda:0', grad_fn=<NllLossBackward>)
0.4383641481399536



Iteration:   6%|████                                                                  | 24/415 [00:19<05:18,  1.23it/s][A

tensor(0.4957, device='cuda:0', grad_fn=<NllLossBackward>)
0.4957202672958374



Iteration:   6%|████▏                                                                 | 25/415 [00:20<05:19,  1.22it/s][A

tensor(0.2991, device='cuda:0', grad_fn=<NllLossBackward>)
0.29908114671707153



Iteration:   6%|████▍                                                                 | 26/415 [00:21<05:18,  1.22it/s][A

tensor(0.3269, device='cuda:0', grad_fn=<NllLossBackward>)
0.3269127607345581



Iteration:   7%|████▌                                                                 | 27/415 [00:21<05:18,  1.22it/s][A

tensor(0.3889, device='cuda:0', grad_fn=<NllLossBackward>)
0.3889116644859314



Iteration:   7%|████▋                                                                 | 28/415 [00:22<05:17,  1.22it/s][A

tensor(0.6544, device='cuda:0', grad_fn=<NllLossBackward>)
0.6544485688209534



Iteration:   7%|████▉                                                                 | 29/415 [00:23<05:14,  1.23it/s][A

tensor(0.7575, device='cuda:0', grad_fn=<NllLossBackward>)
0.7575081586837769



Iteration:   7%|█████                                                                 | 30/415 [00:24<05:14,  1.22it/s][A

tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
0.3951222002506256



Iteration:   7%|█████▏                                                                | 31/415 [00:25<05:16,  1.21it/s][A

tensor(0.2890, device='cuda:0', grad_fn=<NllLossBackward>)
0.2889884412288666



Iteration:   8%|█████▍                                                                | 32/415 [00:26<05:12,  1.23it/s][A

tensor(0.3237, device='cuda:0', grad_fn=<NllLossBackward>)
0.32367268204689026



Iteration:   8%|█████▌                                                                | 33/415 [00:26<05:14,  1.22it/s][A

tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)
0.6322319507598877



Iteration:   8%|█████▋                                                                | 34/415 [00:27<05:14,  1.21it/s][A

tensor(0.2612, device='cuda:0', grad_fn=<NllLossBackward>)
0.26118791103363037



Iteration:   8%|█████▉                                                                | 35/415 [00:28<05:22,  1.18it/s][A

tensor(0.4639, device='cuda:0', grad_fn=<NllLossBackward>)
0.46389591693878174



Iteration:   9%|██████                                                                | 36/415 [00:29<05:15,  1.20it/s][A

tensor(0.3059, device='cuda:0', grad_fn=<NllLossBackward>)
0.3059427738189697



Iteration:   9%|██████▏                                                               | 37/415 [00:30<05:10,  1.22it/s][A

tensor(0.6199, device='cuda:0', grad_fn=<NllLossBackward>)
0.6199324131011963



Iteration:   9%|██████▍                                                               | 38/415 [00:31<05:09,  1.22it/s][A

tensor(0.4930, device='cuda:0', grad_fn=<NllLossBackward>)
0.4930231273174286



Iteration:   9%|██████▌                                                               | 39/415 [00:31<05:05,  1.23it/s][A

tensor(0.5540, device='cuda:0', grad_fn=<NllLossBackward>)
0.5539705753326416



Iteration:  10%|██████▋                                                               | 40/415 [00:32<05:16,  1.18it/s][A

tensor(0.3906, device='cuda:0', grad_fn=<NllLossBackward>)
0.3905743658542633



Iteration:  10%|██████▉                                                               | 41/415 [00:33<05:08,  1.21it/s][A

tensor(0.5185, device='cuda:0', grad_fn=<NllLossBackward>)
0.5184903144836426



Iteration:  10%|███████                                                               | 42/415 [00:34<05:04,  1.23it/s][A

tensor(0.5289, device='cuda:0', grad_fn=<NllLossBackward>)
0.528943657875061



Iteration:  10%|███████▎                                                              | 43/415 [00:35<05:02,  1.23it/s][A

tensor(0.5223, device='cuda:0', grad_fn=<NllLossBackward>)
0.5222954750061035



Iteration:  11%|███████▍                                                              | 44/415 [00:35<05:01,  1.23it/s][A

tensor(0.4699, device='cuda:0', grad_fn=<NllLossBackward>)
0.46992653608322144



Iteration:  11%|███████▌                                                              | 45/415 [00:36<05:02,  1.22it/s][A

tensor(0.4246, device='cuda:0', grad_fn=<NllLossBackward>)
0.42460960149765015



Iteration:  11%|███████▊                                                              | 46/415 [00:37<04:58,  1.24it/s][A

tensor(0.6711, device='cuda:0', grad_fn=<NllLossBackward>)
0.6710929870605469



Iteration:  11%|███████▉                                                              | 47/415 [00:38<05:03,  1.21it/s][A

tensor(0.4504, device='cuda:0', grad_fn=<NllLossBackward>)
0.4504266083240509



Iteration:  12%|████████                                                              | 48/415 [00:39<05:00,  1.22it/s][A

tensor(0.3963, device='cuda:0', grad_fn=<NllLossBackward>)
0.39630961418151855



Iteration:  12%|████████▎                                                             | 49/415 [00:40<05:00,  1.22it/s][A

tensor(0.3952, device='cuda:0', grad_fn=<NllLossBackward>)
0.3952435553073883



Iteration:  12%|████████▍                                                             | 50/415 [00:40<04:58,  1.22it/s][A

tensor(0.6176, device='cuda:0', grad_fn=<NllLossBackward>)
0.6176076531410217



Iteration:  12%|████████▌                                                             | 51/415 [00:41<04:57,  1.22it/s][A

tensor(0.5351, device='cuda:0', grad_fn=<NllLossBackward>)
0.5350721478462219



Iteration:  13%|████████▊                                                             | 52/415 [00:42<04:57,  1.22it/s][A

tensor(0.3092, device='cuda:0', grad_fn=<NllLossBackward>)
0.30920684337615967



Iteration:  13%|████████▉                                                             | 53/415 [00:43<04:53,  1.23it/s][A

tensor(0.2942, device='cuda:0', grad_fn=<NllLossBackward>)
0.29417723417282104



Iteration:  13%|█████████                                                             | 54/415 [00:44<05:06,  1.18it/s][A

tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
0.5193634033203125



Iteration:  13%|█████████▎                                                            | 55/415 [00:45<05:00,  1.20it/s][A

tensor(0.4816, device='cuda:0', grad_fn=<NllLossBackward>)
0.48163697123527527



Iteration:  13%|█████████▍                                                            | 56/415 [00:45<04:55,  1.21it/s][A

tensor(0.3800, device='cuda:0', grad_fn=<NllLossBackward>)
0.38004815578460693



Iteration:  14%|█████████▌                                                            | 57/415 [00:46<04:58,  1.20it/s][A

tensor(0.3990, device='cuda:0', grad_fn=<NllLossBackward>)
0.39899855852127075



Iteration:  14%|█████████▊                                                            | 58/415 [00:47<04:52,  1.22it/s][A

tensor(0.2513, device='cuda:0', grad_fn=<NllLossBackward>)
0.2513251006603241



Iteration:  14%|█████████▉                                                            | 59/415 [00:48<04:49,  1.23it/s][A

tensor(0.3005, device='cuda:0', grad_fn=<NllLossBackward>)
0.30046573281288147



Iteration:  14%|██████████                                                            | 60/415 [00:49<04:55,  1.20it/s][A

tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
0.45825648307800293



Iteration:  15%|██████████▎                                                           | 61/415 [00:49<04:54,  1.20it/s][A

tensor(0.4106, device='cuda:0', grad_fn=<NllLossBackward>)
0.4105500876903534



Iteration:  15%|██████████▍                                                           | 62/415 [00:50<04:52,  1.21it/s][A

tensor(0.3625, device='cuda:0', grad_fn=<NllLossBackward>)
0.36247918009757996



Iteration:  15%|██████████▋                                                           | 63/415 [00:51<04:55,  1.19it/s][A

tensor(0.2887, device='cuda:0', grad_fn=<NllLossBackward>)
0.28874456882476807



Iteration:  15%|██████████▊                                                           | 64/415 [00:52<04:53,  1.20it/s][A

tensor(0.5332, device='cuda:0', grad_fn=<NllLossBackward>)
0.533198356628418



Iteration:  16%|██████████▉                                                           | 65/415 [00:53<05:06,  1.14it/s][A

tensor(0.2912, device='cuda:0', grad_fn=<NllLossBackward>)
0.29121139645576477



Iteration:  16%|███████████▏                                                          | 66/415 [00:54<04:56,  1.18it/s][A

tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward>)
0.4264897406101227



Iteration:  16%|███████████▎                                                          | 67/415 [00:55<04:52,  1.19it/s][A

tensor(0.2027, device='cuda:0', grad_fn=<NllLossBackward>)
0.20267412066459656



Iteration:  16%|███████████▍                                                          | 68/415 [00:55<04:45,  1.21it/s][A

tensor(0.3933, device='cuda:0', grad_fn=<NllLossBackward>)
0.39325737953186035



Iteration:  17%|███████████▋                                                          | 69/415 [00:56<04:43,  1.22it/s][A

tensor(0.6043, device='cuda:0', grad_fn=<NllLossBackward>)
0.6043056845664978



Iteration:  17%|███████████▊                                                          | 70/415 [00:57<04:40,  1.23it/s][A

tensor(0.4313, device='cuda:0', grad_fn=<NllLossBackward>)
0.43132680654525757



Iteration:  17%|███████████▉                                                          | 71/415 [00:58<04:39,  1.23it/s][A

tensor(0.4479, device='cuda:0', grad_fn=<NllLossBackward>)
0.44786393642425537



Iteration:  17%|████████████▏                                                         | 72/415 [00:59<04:39,  1.23it/s][A

tensor(0.3411, device='cuda:0', grad_fn=<NllLossBackward>)
0.3411007821559906



Iteration:  18%|████████████▎                                                         | 73/415 [00:59<04:38,  1.23it/s][A

tensor(0.4641, device='cuda:0', grad_fn=<NllLossBackward>)
0.464112788438797



Iteration:  18%|████████████▍                                                         | 74/415 [01:00<04:38,  1.22it/s][A

tensor(0.4143, device='cuda:0', grad_fn=<NllLossBackward>)
0.41431060433387756



Iteration:  18%|████████████▋                                                         | 75/415 [01:01<04:38,  1.22it/s][A

tensor(0.3931, device='cuda:0', grad_fn=<NllLossBackward>)
0.39314430952072144



Iteration:  18%|████████████▊                                                         | 76/415 [01:02<04:34,  1.23it/s][A

tensor(0.3931, device='cuda:0', grad_fn=<NllLossBackward>)
0.3931308388710022



Iteration:  19%|████████████▉                                                         | 77/415 [01:03<04:40,  1.20it/s][A

tensor(0.4321, device='cuda:0', grad_fn=<NllLossBackward>)
0.4321438670158386



Iteration:  19%|█████████████▏                                                        | 78/415 [01:04<04:39,  1.21it/s][A

tensor(0.5459, device='cuda:0', grad_fn=<NllLossBackward>)
0.5459222197532654



Iteration:  19%|█████████████▎                                                        | 79/415 [01:04<04:41,  1.19it/s][A

tensor(0.7519, device='cuda:0', grad_fn=<NllLossBackward>)
0.7519210577011108



Iteration:  19%|█████████████▍                                                        | 80/415 [01:05<04:40,  1.20it/s][A

tensor(0.4258, device='cuda:0', grad_fn=<NllLossBackward>)
0.42583993077278137



Iteration:  20%|█████████████▋                                                        | 81/415 [01:06<04:35,  1.21it/s][A

tensor(0.2912, device='cuda:0', grad_fn=<NllLossBackward>)
0.2912302017211914



Iteration:  20%|█████████████▊                                                        | 82/415 [01:07<04:32,  1.22it/s][A

tensor(0.4895, device='cuda:0', grad_fn=<NllLossBackward>)
0.4894558787345886



Iteration:  20%|██████████████                                                        | 83/415 [01:08<04:40,  1.18it/s][A

tensor(0.4289, device='cuda:0', grad_fn=<NllLossBackward>)
0.42894884943962097



Iteration:  20%|██████████████▏                                                       | 84/415 [01:09<04:52,  1.13it/s][A

tensor(0.3250, device='cuda:0', grad_fn=<NllLossBackward>)
0.3250005543231964



Iteration:  20%|██████████████▎                                                       | 85/415 [01:10<04:48,  1.14it/s][A

tensor(0.4892, device='cuda:0', grad_fn=<NllLossBackward>)
0.4891839027404785



Iteration:  21%|██████████████▌                                                       | 86/415 [01:10<04:44,  1.16it/s][A

tensor(0.4237, device='cuda:0', grad_fn=<NllLossBackward>)
0.42373186349868774



Iteration:  21%|██████████████▋                                                       | 87/415 [01:11<04:41,  1.16it/s][A

tensor(0.4492, device='cuda:0', grad_fn=<NllLossBackward>)
0.4491993188858032



Iteration:  21%|██████████████▊                                                       | 88/415 [01:12<04:36,  1.18it/s][A

tensor(0.4146, device='cuda:0', grad_fn=<NllLossBackward>)
0.41459739208221436



Iteration:  21%|███████████████                                                       | 89/415 [01:13<04:37,  1.18it/s][A

tensor(0.3993, device='cuda:0', grad_fn=<NllLossBackward>)
0.39934295415878296



Iteration:  22%|███████████████▏                                                      | 90/415 [01:14<04:31,  1.20it/s][A

tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
0.43033015727996826



Iteration:  22%|███████████████▎                                                      | 91/415 [01:15<04:27,  1.21it/s][A

tensor(0.3747, device='cuda:0', grad_fn=<NllLossBackward>)
0.37473973631858826



Iteration:  22%|███████████████▌                                                      | 92/415 [01:15<04:30,  1.20it/s][A

tensor(0.4225, device='cuda:0', grad_fn=<NllLossBackward>)
0.4225230813026428



Iteration:  22%|███████████████▋                                                      | 93/415 [01:16<04:24,  1.22it/s][A

tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
0.5128545761108398



Iteration:  23%|███████████████▊                                                      | 94/415 [01:17<04:23,  1.22it/s][A

tensor(0.2972, device='cuda:0', grad_fn=<NllLossBackward>)
0.29721781611442566



Iteration:  23%|████████████████                                                      | 95/415 [01:18<04:25,  1.20it/s][A

tensor(0.4950, device='cuda:0', grad_fn=<NllLossBackward>)
0.4950079023838043



Iteration:  23%|████████████████▏                                                     | 96/415 [01:19<04:24,  1.20it/s][A

tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward>)
0.5708329677581787



Iteration:  23%|████████████████▎                                                     | 97/415 [01:19<04:20,  1.22it/s][A

tensor(0.4921, device='cuda:0', grad_fn=<NllLossBackward>)
0.49207940697669983



Iteration:  24%|████████████████▌                                                     | 98/415 [01:20<04:19,  1.22it/s][A

tensor(0.3905, device='cuda:0', grad_fn=<NllLossBackward>)
0.39049065113067627



Iteration:  24%|████████████████▋                                                     | 99/415 [01:21<04:16,  1.23it/s][A

tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
0.4575105309486389



Iteration:  24%|████████████████▋                                                    | 100/415 [01:22<04:16,  1.23it/s][A

tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward>)
0.5271397829055786



Iteration:  24%|████████████████▊                                                    | 101/415 [01:23<04:21,  1.20it/s][A

tensor(0.3847, device='cuda:0', grad_fn=<NllLossBackward>)
0.3847191631793976



Iteration:  25%|████████████████▉                                                    | 102/415 [01:24<04:18,  1.21it/s][A

tensor(0.3448, device='cuda:0', grad_fn=<NllLossBackward>)
0.34476837515830994



Iteration:  25%|█████████████████▏                                                   | 103/415 [01:24<04:14,  1.22it/s][A

tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward>)
0.3181404173374176



Iteration:  25%|█████████████████▎                                                   | 104/415 [01:25<04:11,  1.24it/s][A

tensor(0.2579, device='cuda:0', grad_fn=<NllLossBackward>)
0.2578526735305786



Iteration:  25%|█████████████████▍                                                   | 105/415 [01:26<04:19,  1.19it/s][A

tensor(0.4328, device='cuda:0', grad_fn=<NllLossBackward>)
0.43278107047080994



Iteration:  26%|█████████████████▌                                                   | 106/415 [01:27<04:17,  1.20it/s][A

tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)
0.4900915324687958



Iteration:  26%|█████████████████▊                                                   | 107/415 [01:28<04:15,  1.21it/s][A

tensor(0.2500, device='cuda:0', grad_fn=<NllLossBackward>)
0.24999822676181793



Iteration:  26%|█████████████████▉                                                   | 108/415 [01:29<04:22,  1.17it/s][A

tensor(0.5092, device='cuda:0', grad_fn=<NllLossBackward>)
0.5091849565505981



Iteration:  26%|██████████████████                                                   | 109/415 [01:29<04:20,  1.18it/s][A

tensor(0.3271, device='cuda:0', grad_fn=<NllLossBackward>)
0.32710543274879456



Iteration:  27%|██████████████████▎                                                  | 110/415 [01:30<04:16,  1.19it/s][A

tensor(0.3712, device='cuda:0', grad_fn=<NllLossBackward>)
0.371201753616333



Iteration:  27%|██████████████████▍                                                  | 111/415 [01:31<04:12,  1.20it/s][A

tensor(0.3663, device='cuda:0', grad_fn=<NllLossBackward>)
0.3663361966609955



Iteration:  27%|██████████████████▌                                                  | 112/415 [01:32<04:09,  1.21it/s][A

tensor(0.4930, device='cuda:0', grad_fn=<NllLossBackward>)
0.4929662346839905



Iteration:  27%|██████████████████▊                                                  | 113/415 [01:33<04:11,  1.20it/s][A

tensor(0.3297, device='cuda:0', grad_fn=<NllLossBackward>)
0.3296547532081604



Iteration:  27%|██████████████████▉                                                  | 114/415 [01:34<04:09,  1.21it/s][A

tensor(0.3984, device='cuda:0', grad_fn=<NllLossBackward>)
0.3983684778213501



Iteration:  28%|███████████████████                                                  | 115/415 [01:34<04:11,  1.19it/s][A

tensor(0.7294, device='cuda:0', grad_fn=<NllLossBackward>)
0.7293691635131836



Iteration:  28%|███████████████████▎                                                 | 116/415 [01:35<04:07,  1.21it/s][A

tensor(0.5813, device='cuda:0', grad_fn=<NllLossBackward>)
0.5812572836875916



Iteration:  28%|███████████████████▍                                                 | 117/415 [01:36<04:03,  1.22it/s][A

tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)
0.28249531984329224



Iteration:  28%|███████████████████▌                                                 | 118/415 [01:37<04:07,  1.20it/s][A

tensor(0.3916, device='cuda:0', grad_fn=<NllLossBackward>)
0.3915647268295288



Iteration:  29%|███████████████████▊                                                 | 119/415 [01:38<04:03,  1.22it/s][A

tensor(0.4974, device='cuda:0', grad_fn=<NllLossBackward>)
0.4973818361759186



Iteration:  29%|███████████████████▉                                                 | 120/415 [01:39<04:02,  1.22it/s][A

tensor(0.2827, device='cuda:0', grad_fn=<NllLossBackward>)
0.28265783190727234



Iteration:  29%|████████████████████                                                 | 121/415 [01:39<04:00,  1.22it/s][A

tensor(0.5294, device='cuda:0', grad_fn=<NllLossBackward>)
0.5294232964515686



Iteration:  29%|████████████████████▎                                                | 122/415 [01:40<03:59,  1.22it/s][A

tensor(0.5922, device='cuda:0', grad_fn=<NllLossBackward>)
0.5921816825866699



Iteration:  30%|████████████████████▍                                                | 123/415 [01:41<03:58,  1.22it/s][A

tensor(0.5485, device='cuda:0', grad_fn=<NllLossBackward>)
0.5485140681266785



Iteration:  30%|████████████████████▌                                                | 124/415 [01:42<04:02,  1.20it/s][A

tensor(0.3061, device='cuda:0', grad_fn=<NllLossBackward>)
0.3061051070690155



Iteration:  30%|████████████████████▊                                                | 125/415 [01:43<03:57,  1.22it/s][A

tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
0.5565799474716187



Iteration:  30%|████████████████████▉                                                | 126/415 [01:44<04:00,  1.20it/s][A

tensor(0.4213, device='cuda:0', grad_fn=<NllLossBackward>)
0.42130401730537415



Iteration:  31%|█████████████████████                                                | 127/415 [01:44<04:01,  1.19it/s][A

tensor(0.4779, device='cuda:0', grad_fn=<NllLossBackward>)
0.4779103696346283



Iteration:  31%|█████████████████████▎                                               | 128/415 [01:45<03:58,  1.20it/s][A

tensor(0.3120, device='cuda:0', grad_fn=<NllLossBackward>)
0.31197771430015564



Iteration:  31%|█████████████████████▍                                               | 129/415 [01:46<04:00,  1.19it/s][A

tensor(0.3799, device='cuda:0', grad_fn=<NllLossBackward>)
0.37988603115081787



Iteration:  31%|█████████████████████▌                                               | 130/415 [01:47<04:01,  1.18it/s][A

tensor(0.4305, device='cuda:0', grad_fn=<NllLossBackward>)
0.4304647147655487



Iteration:  32%|█████████████████████▊                                               | 131/415 [01:48<03:56,  1.20it/s][A

tensor(0.3694, device='cuda:0', grad_fn=<NllLossBackward>)
0.3694137930870056



Iteration:  32%|█████████████████████▉                                               | 132/415 [01:49<03:56,  1.20it/s][A

tensor(0.4157, device='cuda:0', grad_fn=<NllLossBackward>)
0.4157028794288635



Iteration:  32%|██████████████████████                                               | 133/415 [01:49<03:52,  1.21it/s][A

tensor(0.2909, device='cuda:0', grad_fn=<NllLossBackward>)
0.29087698459625244



Iteration:  32%|██████████████████████▎                                              | 134/415 [01:50<03:54,  1.20it/s][A

tensor(0.4601, device='cuda:0', grad_fn=<NllLossBackward>)
0.4601457118988037



Iteration:  33%|██████████████████████▍                                              | 135/415 [01:51<03:56,  1.18it/s][A

tensor(0.2911, device='cuda:0', grad_fn=<NllLossBackward>)
0.29106831550598145



Iteration:  33%|██████████████████████▌                                              | 136/415 [01:52<03:52,  1.20it/s][A

tensor(0.3615, device='cuda:0', grad_fn=<NllLossBackward>)
0.3614838123321533



Iteration:  33%|██████████████████████▊                                              | 137/415 [01:53<03:52,  1.19it/s][A

tensor(0.4232, device='cuda:0', grad_fn=<NllLossBackward>)
0.4232063889503479



Iteration:  33%|██████████████████████▉                                              | 138/415 [01:54<03:50,  1.20it/s][A

tensor(0.4324, device='cuda:0', grad_fn=<NllLossBackward>)
0.4323793053627014



Iteration:  33%|███████████████████████                                              | 139/415 [01:54<03:48,  1.21it/s][A

tensor(0.7494, device='cuda:0', grad_fn=<NllLossBackward>)
0.7493845820426941



Iteration:  34%|███████████████████████▎                                             | 140/415 [01:55<03:49,  1.20it/s][A

tensor(0.2657, device='cuda:0', grad_fn=<NllLossBackward>)
0.2657186985015869



Iteration:  34%|███████████████████████▍                                             | 141/415 [01:56<03:47,  1.21it/s][A

tensor(0.2616, device='cuda:0', grad_fn=<NllLossBackward>)
0.26160338521003723



Iteration:  34%|███████████████████████▌                                             | 142/415 [01:57<03:49,  1.19it/s][A

tensor(0.3981, device='cuda:0', grad_fn=<NllLossBackward>)
0.39810389280319214



Iteration:  34%|███████████████████████▊                                             | 143/415 [01:58<03:49,  1.18it/s][A

tensor(0.2622, device='cuda:0', grad_fn=<NllLossBackward>)
0.2622391879558563



Iteration:  35%|███████████████████████▉                                             | 144/415 [01:59<03:45,  1.20it/s][A

tensor(0.2598, device='cuda:0', grad_fn=<NllLossBackward>)
0.2598089575767517



Iteration:  35%|████████████████████████                                             | 145/415 [01:59<03:45,  1.20it/s][A

tensor(0.3649, device='cuda:0', grad_fn=<NllLossBackward>)
0.3649267554283142



Iteration:  35%|████████████████████████▎                                            | 146/415 [02:00<03:43,  1.21it/s][A

tensor(0.4987, device='cuda:0', grad_fn=<NllLossBackward>)
0.49866998195648193



Iteration:  35%|████████████████████████▍                                            | 147/415 [02:01<03:49,  1.17it/s][A

tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)
0.4008883833885193



Iteration:  36%|████████████████████████▌                                            | 148/415 [02:02<03:44,  1.19it/s][A

tensor(0.5368, device='cuda:0', grad_fn=<NllLossBackward>)
0.5368329882621765



Iteration:  36%|████████████████████████▊                                            | 149/415 [02:03<03:41,  1.20it/s][A

tensor(0.2787, device='cuda:0', grad_fn=<NllLossBackward>)
0.2786639928817749



Iteration:  36%|████████████████████████▉                                            | 150/415 [02:04<04:00,  1.10it/s][A

tensor(0.2218, device='cuda:0', grad_fn=<NllLossBackward>)
0.22183434665203094



Iteration:  36%|█████████████████████████                                            | 151/415 [02:05<03:54,  1.13it/s][A

tensor(0.4026, device='cuda:0', grad_fn=<NllLossBackward>)
0.4026256501674652



Iteration:  37%|█████████████████████████▎                                           | 152/415 [02:06<03:50,  1.14it/s][A

tensor(0.4348, device='cuda:0', grad_fn=<NllLossBackward>)
0.4347521662712097



Iteration:  37%|█████████████████████████▍                                           | 153/415 [02:06<03:44,  1.17it/s][A

tensor(0.3180, device='cuda:0', grad_fn=<NllLossBackward>)
0.31795191764831543



Iteration:  37%|█████████████████████████▌                                           | 154/415 [02:07<03:44,  1.16it/s][A

tensor(0.3330, device='cuda:0', grad_fn=<NllLossBackward>)
0.3329772651195526



Iteration:  37%|█████████████████████████▊                                           | 155/415 [02:08<03:39,  1.18it/s][A

tensor(0.6024, device='cuda:0', grad_fn=<NllLossBackward>)
0.6024261116981506



Iteration:  38%|█████████████████████████▉                                           | 156/415 [02:09<03:36,  1.20it/s][A

tensor(0.2835, device='cuda:0', grad_fn=<NllLossBackward>)
0.28349098563194275



Iteration:  38%|██████████████████████████                                           | 157/415 [02:10<03:40,  1.17it/s][A

tensor(0.2845, device='cuda:0', grad_fn=<NllLossBackward>)
0.2845417261123657



Iteration:  38%|██████████████████████████▎                                          | 158/415 [02:11<03:37,  1.18it/s][A

tensor(0.4605, device='cuda:0', grad_fn=<NllLossBackward>)
0.4604960083961487



Iteration:  38%|██████████████████████████▍                                          | 159/415 [02:11<03:32,  1.20it/s][A

tensor(0.7218, device='cuda:0', grad_fn=<NllLossBackward>)
0.7217502593994141



Iteration:  39%|██████████████████████████▌                                          | 160/415 [02:12<03:31,  1.20it/s][A

tensor(0.3318, device='cuda:0', grad_fn=<NllLossBackward>)
0.3317897915840149



Iteration:  39%|██████████████████████████▊                                          | 161/415 [02:13<03:32,  1.19it/s][A

tensor(0.3452, device='cuda:0', grad_fn=<NllLossBackward>)
0.3451647162437439



Iteration:  39%|██████████████████████████▉                                          | 162/415 [02:14<03:32,  1.19it/s][A

tensor(0.4394, device='cuda:0', grad_fn=<NllLossBackward>)
0.43939515948295593



Iteration:  39%|███████████████████████████                                          | 163/415 [02:15<03:31,  1.19it/s][A

tensor(0.3259, device='cuda:0', grad_fn=<NllLossBackward>)
0.3258809745311737



Iteration:  40%|███████████████████████████▎                                         | 164/415 [02:16<03:30,  1.19it/s][A

tensor(0.6759, device='cuda:0', grad_fn=<NllLossBackward>)
0.6759074330329895



Iteration:  40%|███████████████████████████▍                                         | 165/415 [02:16<03:27,  1.21it/s][A

tensor(0.3985, device='cuda:0', grad_fn=<NllLossBackward>)
0.3985428512096405



Iteration:  40%|███████████████████████████▌                                         | 166/415 [02:17<03:31,  1.18it/s][A

tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
0.5783615112304688



Iteration:  40%|███████████████████████████▊                                         | 167/415 [02:18<03:27,  1.20it/s][A

tensor(0.5176, device='cuda:0', grad_fn=<NllLossBackward>)
0.5175603032112122



Iteration:  40%|███████████████████████████▉                                         | 168/415 [02:19<03:25,  1.20it/s][A

tensor(0.4158, device='cuda:0', grad_fn=<NllLossBackward>)
0.41583380103111267



Iteration:  41%|████████████████████████████                                         | 169/415 [02:20<03:23,  1.21it/s][A

tensor(0.5678, device='cuda:0', grad_fn=<NllLossBackward>)
0.5677915811538696



Iteration:  41%|████████████████████████████▎                                        | 170/415 [02:21<03:24,  1.20it/s][A

tensor(0.5014, device='cuda:0', grad_fn=<NllLossBackward>)
0.50144362449646



Iteration:  41%|████████████████████████████▍                                        | 171/415 [02:21<03:24,  1.20it/s][A

tensor(0.4878, device='cuda:0', grad_fn=<NllLossBackward>)
0.4878324568271637



Iteration:  41%|████████████████████████████▌                                        | 172/415 [02:22<03:21,  1.21it/s][A

tensor(0.6132, device='cuda:0', grad_fn=<NllLossBackward>)
0.613226056098938



Iteration:  42%|████████████████████████████▊                                        | 173/415 [02:23<03:19,  1.21it/s][A

tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward>)
0.6958091855049133



Iteration:  42%|████████████████████████████▉                                        | 174/415 [02:24<03:29,  1.15it/s][A

tensor(0.4588, device='cuda:0', grad_fn=<NllLossBackward>)
0.458818256855011



Iteration:  42%|█████████████████████████████                                        | 175/415 [02:25<03:24,  1.17it/s][A

tensor(0.3638, device='cuda:0', grad_fn=<NllLossBackward>)
0.36377519369125366



Iteration:  42%|█████████████████████████████▎                                       | 176/415 [02:26<03:20,  1.19it/s][A

tensor(0.3980, device='cuda:0', grad_fn=<NllLossBackward>)
0.39803534746170044



Iteration:  43%|█████████████████████████████▍                                       | 177/415 [02:26<03:20,  1.19it/s][A

tensor(0.3709, device='cuda:0', grad_fn=<NllLossBackward>)
0.37087196111679077



Iteration:  43%|█████████████████████████████▌                                       | 178/415 [02:27<03:19,  1.19it/s][A

tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
0.45448797941207886



Iteration:  43%|█████████████████████████████▊                                       | 179/415 [02:28<03:16,  1.20it/s][A

tensor(0.3713, device='cuda:0', grad_fn=<NllLossBackward>)
0.37131667137145996



Iteration:  43%|█████████████████████████████▉                                       | 180/415 [02:29<03:14,  1.21it/s][A

tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
0.2776048183441162



Iteration:  44%|██████████████████████████████                                       | 181/415 [02:30<03:11,  1.22it/s][A

tensor(0.4073, device='cuda:0', grad_fn=<NllLossBackward>)
0.4072926342487335



Iteration:  44%|██████████████████████████████▎                                      | 182/415 [02:31<03:09,  1.23it/s][A

tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
0.4020940959453583



Iteration:  44%|██████████████████████████████▍                                      | 183/415 [02:31<03:16,  1.18it/s][A

tensor(0.3625, device='cuda:0', grad_fn=<NllLossBackward>)
0.362455278635025



Iteration:  44%|██████████████████████████████▌                                      | 184/415 [02:32<03:15,  1.18it/s][A

tensor(0.3574, device='cuda:0', grad_fn=<NllLossBackward>)
0.3574138879776001



Iteration:  45%|██████████████████████████████▊                                      | 185/415 [02:33<03:15,  1.17it/s][A

tensor(0.3043, device='cuda:0', grad_fn=<NllLossBackward>)
0.3043443560600281



Iteration:  45%|██████████████████████████████▉                                      | 186/415 [02:34<03:13,  1.18it/s][A

tensor(0.3090, device='cuda:0', grad_fn=<NllLossBackward>)
0.30903810262680054



Iteration:  45%|███████████████████████████████                                      | 187/415 [02:35<03:10,  1.20it/s][A

tensor(0.2515, device='cuda:0', grad_fn=<NllLossBackward>)
0.25150924921035767



Iteration:  45%|███████████████████████████████▎                                     | 188/415 [02:36<03:08,  1.20it/s][A

tensor(0.4952, device='cuda:0', grad_fn=<NllLossBackward>)
0.49523505568504333



Iteration:  46%|███████████████████████████████▍                                     | 189/415 [02:36<03:07,  1.20it/s][A

tensor(0.2357, device='cuda:0', grad_fn=<NllLossBackward>)
0.23573428392410278



Iteration:  46%|███████████████████████████████▌                                     | 190/415 [02:37<03:12,  1.17it/s][A

tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
0.6111029386520386



Iteration:  46%|███████████████████████████████▊                                     | 191/415 [02:38<03:07,  1.19it/s][A

tensor(0.7829, device='cuda:0', grad_fn=<NllLossBackward>)
0.7829331159591675



Iteration:  46%|███████████████████████████████▉                                     | 192/415 [02:39<03:15,  1.14it/s][A

tensor(0.3466, device='cuda:0', grad_fn=<NllLossBackward>)
0.3465796709060669



Iteration:  47%|████████████████████████████████                                     | 193/415 [02:40<03:10,  1.17it/s][A

tensor(0.3652, device='cuda:0', grad_fn=<NllLossBackward>)
0.3651818633079529



Iteration:  47%|████████████████████████████████▎                                    | 194/415 [02:41<03:10,  1.16it/s][A

tensor(0.4077, device='cuda:0', grad_fn=<NllLossBackward>)
0.4077034592628479



Iteration:  47%|████████████████████████████████▍                                    | 195/415 [02:42<03:06,  1.18it/s][A

tensor(0.3376, device='cuda:0', grad_fn=<NllLossBackward>)
0.3376079201698303



Iteration:  47%|████████████████████████████████▌                                    | 196/415 [02:43<03:09,  1.16it/s][A

tensor(0.5774, device='cuda:0', grad_fn=<NllLossBackward>)
0.5774309635162354



Iteration:  47%|████████████████████████████████▊                                    | 197/415 [02:43<03:08,  1.16it/s][A

tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
0.5203242897987366



Iteration:  48%|████████████████████████████████▉                                    | 198/415 [02:44<03:05,  1.17it/s][A

tensor(0.3784, device='cuda:0', grad_fn=<NllLossBackward>)
0.37840670347213745



Iteration:  48%|█████████████████████████████████                                    | 199/415 [02:45<03:06,  1.16it/s][A

tensor(0.3053, device='cuda:0', grad_fn=<NllLossBackward>)
0.30533844232559204



Iteration:  48%|█████████████████████████████████▎                                   | 200/415 [02:46<03:01,  1.18it/s][A

tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward>)
0.4264920949935913



Iteration:  48%|█████████████████████████████████▍                                   | 201/415 [02:47<02:58,  1.20it/s][A

tensor(0.2819, device='cuda:0', grad_fn=<NllLossBackward>)
0.2819279134273529



Iteration:  49%|█████████████████████████████████▌                                   | 202/415 [02:48<02:57,  1.20it/s][A

tensor(0.4238, device='cuda:0', grad_fn=<NllLossBackward>)
0.42378461360931396



Iteration:  49%|█████████████████████████████████▊                                   | 203/415 [02:48<02:56,  1.20it/s][A

tensor(0.3977, device='cuda:0', grad_fn=<NllLossBackward>)
0.3976956009864807



Iteration:  49%|█████████████████████████████████▉                                   | 204/415 [02:49<02:55,  1.20it/s][A

tensor(0.4808, device='cuda:0', grad_fn=<NllLossBackward>)
0.48079848289489746



Iteration:  49%|██████████████████████████████████                                   | 205/415 [02:50<02:54,  1.21it/s][A

tensor(0.3619, device='cuda:0', grad_fn=<NllLossBackward>)
0.3618978261947632



Iteration:  50%|██████████████████████████████████▎                                  | 206/415 [02:51<02:52,  1.21it/s][A

tensor(0.4823, device='cuda:0', grad_fn=<NllLossBackward>)
0.4822882115840912



Iteration:  50%|██████████████████████████████████▍                                  | 207/415 [02:52<02:58,  1.16it/s][A

tensor(0.3625, device='cuda:0', grad_fn=<NllLossBackward>)
0.36251723766326904



Iteration:  50%|██████████████████████████████████▌                                  | 208/415 [02:53<02:56,  1.18it/s][A

tensor(0.4394, device='cuda:0', grad_fn=<NllLossBackward>)
0.4393690824508667



Iteration:  50%|██████████████████████████████████▋                                  | 209/415 [02:54<02:58,  1.15it/s][A

tensor(0.4292, device='cuda:0', grad_fn=<NllLossBackward>)
0.4292106628417969



Iteration:  51%|██████████████████████████████████▉                                  | 210/415 [02:54<02:55,  1.17it/s][A

tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)
0.38343346118927



Iteration:  51%|███████████████████████████████████                                  | 211/415 [02:55<02:52,  1.19it/s][A

tensor(0.2951, device='cuda:0', grad_fn=<NllLossBackward>)
0.29513663053512573



Iteration:  51%|███████████████████████████████████▏                                 | 212/415 [02:56<02:50,  1.19it/s][A

tensor(0.4534, device='cuda:0', grad_fn=<NllLossBackward>)
0.4533637464046478



Iteration:  51%|███████████████████████████████████▍                                 | 213/415 [02:57<02:47,  1.20it/s][A

tensor(0.4105, device='cuda:0', grad_fn=<NllLossBackward>)
0.41047176718711853



Iteration:  52%|███████████████████████████████████▌                                 | 214/415 [02:58<03:01,  1.11it/s][A

tensor(0.3184, device='cuda:0', grad_fn=<NllLossBackward>)
0.3183777332305908



Iteration:  52%|███████████████████████████████████▋                                 | 215/415 [02:59<02:56,  1.13it/s][A

tensor(0.3018, device='cuda:0', grad_fn=<NllLossBackward>)
0.30184295773506165



Iteration:  52%|███████████████████████████████████▉                                 | 216/415 [03:00<02:52,  1.15it/s][A

tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
0.5854041576385498



Iteration:  52%|████████████████████████████████████                                 | 217/415 [03:00<02:49,  1.17it/s][A

tensor(0.3134, device='cuda:0', grad_fn=<NllLossBackward>)
0.3133509159088135



Iteration:  53%|████████████████████████████████████▏                                | 218/415 [03:01<02:49,  1.16it/s][A

tensor(0.1890, device='cuda:0', grad_fn=<NllLossBackward>)
0.18898436427116394



Iteration:  53%|████████████████████████████████████▍                                | 219/415 [03:02<02:44,  1.19it/s][A

tensor(0.4823, device='cuda:0', grad_fn=<NllLossBackward>)
0.482316792011261



Iteration:  53%|████████████████████████████████████▌                                | 220/415 [03:03<02:43,  1.19it/s][A

tensor(0.3921, device='cuda:0', grad_fn=<NllLossBackward>)
0.39208731055259705



Iteration:  53%|████████████████████████████████████▋                                | 221/415 [03:04<02:42,  1.19it/s][A

tensor(0.4740, device='cuda:0', grad_fn=<NllLossBackward>)
0.47395357489585876



Iteration:  53%|████████████████████████████████████▉                                | 222/415 [03:05<02:41,  1.20it/s][A

tensor(0.4920, device='cuda:0', grad_fn=<NllLossBackward>)
0.49199411273002625



Iteration:  54%|█████████████████████████████████████                                | 223/415 [03:05<02:38,  1.21it/s][A

tensor(0.5075, device='cuda:0', grad_fn=<NllLossBackward>)
0.5074628591537476



Iteration:  54%|█████████████████████████████████████▏                               | 224/415 [03:06<02:37,  1.21it/s][A

tensor(0.2858, device='cuda:0', grad_fn=<NllLossBackward>)
0.2857853174209595



Iteration:  54%|█████████████████████████████████████▍                               | 225/415 [03:07<02:36,  1.21it/s][A

tensor(0.3065, device='cuda:0', grad_fn=<NllLossBackward>)
0.30654847621917725



Iteration:  54%|█████████████████████████████████████▌                               | 226/415 [03:08<02:35,  1.22it/s][A

tensor(0.2796, device='cuda:0', grad_fn=<NllLossBackward>)
0.2795901596546173



Iteration:  55%|█████████████████████████████████████▋                               | 227/415 [03:09<02:35,  1.21it/s][A

tensor(0.4244, device='cuda:0', grad_fn=<NllLossBackward>)
0.4244157075881958



Iteration:  55%|█████████████████████████████████████▉                               | 228/415 [03:09<02:33,  1.22it/s][A

tensor(0.5606, device='cuda:0', grad_fn=<NllLossBackward>)
0.5606290698051453



Iteration:  55%|██████████████████████████████████████                               | 229/415 [03:10<02:32,  1.22it/s][A

tensor(0.4387, device='cuda:0', grad_fn=<NllLossBackward>)
0.4387202858924866



Iteration:  55%|██████████████████████████████████████▏                              | 230/415 [03:11<02:30,  1.23it/s][A

tensor(0.3859, device='cuda:0', grad_fn=<NllLossBackward>)
0.3859395980834961



Iteration:  56%|██████████████████████████████████████▍                              | 231/415 [03:12<02:32,  1.21it/s][A

tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)
0.26761212944984436



Iteration:  56%|██████████████████████████████████████▌                              | 232/415 [03:13<02:33,  1.20it/s][A

tensor(0.3783, device='cuda:0', grad_fn=<NllLossBackward>)
0.3782750368118286



Iteration:  56%|██████████████████████████████████████▋                              | 233/415 [03:14<02:30,  1.21it/s][A

tensor(0.4071, device='cuda:0', grad_fn=<NllLossBackward>)
0.407074898481369



Iteration:  56%|██████████████████████████████████████▉                              | 234/415 [03:15<02:36,  1.16it/s][A

tensor(0.2822, device='cuda:0', grad_fn=<NllLossBackward>)
0.2821846008300781



Iteration:  57%|███████████████████████████████████████                              | 235/415 [03:15<02:36,  1.15it/s][A

tensor(0.4202, device='cuda:0', grad_fn=<NllLossBackward>)
0.4201955795288086



Iteration:  57%|███████████████████████████████████████▏                             | 236/415 [03:16<02:36,  1.15it/s][A

tensor(0.6181, device='cuda:0', grad_fn=<NllLossBackward>)
0.6180548071861267



Iteration:  57%|███████████████████████████████████████▍                             | 237/415 [03:17<02:35,  1.15it/s][A

tensor(0.5234, device='cuda:0', grad_fn=<NllLossBackward>)
0.5234488844871521



Iteration:  57%|███████████████████████████████████████▌                             | 238/415 [03:18<02:32,  1.16it/s][A

tensor(0.3139, device='cuda:0', grad_fn=<NllLossBackward>)
0.31391558051109314



Iteration:  58%|███████████████████████████████████████▋                             | 239/415 [03:19<02:30,  1.17it/s][A

tensor(0.3069, device='cuda:0', grad_fn=<NllLossBackward>)
0.3069169819355011



Iteration:  58%|███████████████████████████████████████▉                             | 240/415 [03:20<02:32,  1.15it/s][A

tensor(0.3034, device='cuda:0', grad_fn=<NllLossBackward>)
0.30335545539855957



Iteration:  58%|████████████████████████████████████████                             | 241/415 [03:21<02:29,  1.16it/s][A

tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
0.47564035654067993



Iteration:  58%|████████████████████████████████████████▏                            | 242/415 [03:21<02:28,  1.17it/s][A

tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)
0.4009309411048889



Iteration:  59%|████████████████████████████████████████▍                            | 243/415 [03:23<02:40,  1.07it/s][A

tensor(0.3141, device='cuda:0', grad_fn=<NllLossBackward>)
0.3140812814235687



Iteration:  59%|████████████████████████████████████████▌                            | 244/415 [03:23<02:32,  1.12it/s][A

tensor(0.4427, device='cuda:0', grad_fn=<NllLossBackward>)
0.44265949726104736



Iteration:  59%|████████████████████████████████████████▋                            | 245/415 [03:24<02:27,  1.15it/s][A

tensor(0.3245, device='cuda:0', grad_fn=<NllLossBackward>)
0.32450270652770996



Iteration:  59%|████████████████████████████████████████▉                            | 246/415 [03:25<02:31,  1.11it/s][A

tensor(0.5537, device='cuda:0', grad_fn=<NllLossBackward>)
0.5536962747573853



Iteration:  60%|█████████████████████████████████████████                            | 247/415 [03:26<02:38,  1.06it/s][A

tensor(0.5477, device='cuda:0', grad_fn=<NllLossBackward>)
0.5476783514022827



Iteration:  60%|█████████████████████████████████████████▏                           | 248/415 [03:27<02:32,  1.09it/s][A

tensor(0.3720, device='cuda:0', grad_fn=<NllLossBackward>)
0.3720109462738037



Iteration:  60%|█████████████████████████████████████████▍                           | 249/415 [03:28<02:30,  1.11it/s][A

tensor(0.3246, device='cuda:0', grad_fn=<NllLossBackward>)
0.3246017396450043



Iteration:  60%|█████████████████████████████████████████▌                           | 250/415 [03:29<02:24,  1.14it/s][A

tensor(0.3331, device='cuda:0', grad_fn=<NllLossBackward>)
0.33314263820648193



Iteration:  60%|█████████████████████████████████████████▋                           | 251/415 [03:30<02:20,  1.17it/s][A

tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)
0.2891989052295685



Iteration:  61%|█████████████████████████████████████████▉                           | 252/415 [03:30<02:18,  1.18it/s][A

tensor(0.3843, device='cuda:0', grad_fn=<NllLossBackward>)
0.3843234181404114



Iteration:  61%|██████████████████████████████████████████                           | 253/415 [03:31<02:15,  1.19it/s][A

tensor(0.6093, device='cuda:0', grad_fn=<NllLossBackward>)
0.6093063354492188



Iteration:  61%|██████████████████████████████████████████▏                          | 254/415 [03:32<02:15,  1.18it/s][A

tensor(0.2544, device='cuda:0', grad_fn=<NllLossBackward>)
0.2544494569301605



Iteration:  61%|██████████████████████████████████████████▍                          | 255/415 [03:33<02:14,  1.19it/s][A

tensor(0.2346, device='cuda:0', grad_fn=<NllLossBackward>)
0.23455749452114105



Iteration:  62%|██████████████████████████████████████████▌                          | 256/415 [03:34<02:13,  1.19it/s][A

tensor(0.2842, device='cuda:0', grad_fn=<NllLossBackward>)
0.28418490290641785



Iteration:  62%|██████████████████████████████████████████▋                          | 257/415 [03:35<02:15,  1.16it/s][A

tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward>)
0.4264891445636749



Iteration:  62%|██████████████████████████████████████████▉                          | 258/415 [03:35<02:13,  1.18it/s][A

tensor(0.5257, device='cuda:0', grad_fn=<NllLossBackward>)
0.5256670713424683



Iteration:  62%|███████████████████████████████████████████                          | 259/415 [03:36<02:14,  1.16it/s][A

tensor(0.2971, device='cuda:0', grad_fn=<NllLossBackward>)
0.2971484959125519



Iteration:  63%|███████████████████████████████████████████▏                         | 260/415 [03:37<02:16,  1.13it/s][A

tensor(0.4628, device='cuda:0', grad_fn=<NllLossBackward>)
0.46276167035102844



Iteration:  63%|███████████████████████████████████████████▍                         | 261/415 [03:38<02:14,  1.15it/s][A

tensor(0.3263, device='cuda:0', grad_fn=<NllLossBackward>)
0.32630467414855957



Iteration:  63%|███████████████████████████████████████████▌                         | 262/415 [03:39<02:11,  1.17it/s][A

tensor(0.4714, device='cuda:0', grad_fn=<NllLossBackward>)
0.4713519513607025



Iteration:  63%|███████████████████████████████████████████▋                         | 263/415 [03:40<02:09,  1.18it/s][A

tensor(0.2837, device='cuda:0', grad_fn=<NllLossBackward>)
0.2837403416633606



Iteration:  64%|███████████████████████████████████████████▉                         | 264/415 [03:41<02:06,  1.19it/s][A

tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
0.24628767371177673



Iteration:  64%|████████████████████████████████████████████                         | 265/415 [03:41<02:06,  1.18it/s][A

tensor(0.3165, device='cuda:0', grad_fn=<NllLossBackward>)
0.3164905309677124



Iteration:  64%|████████████████████████████████████████████▏                        | 266/415 [03:42<02:06,  1.18it/s][A

tensor(0.3443, device='cuda:0', grad_fn=<NllLossBackward>)
0.34433290362358093



Iteration:  64%|████████████████████████████████████████████▍                        | 267/415 [03:43<02:04,  1.19it/s][A

tensor(0.4237, device='cuda:0', grad_fn=<NllLossBackward>)
0.4237082302570343



Iteration:  65%|████████████████████████████████████████████▌                        | 268/415 [03:44<02:04,  1.18it/s][A

tensor(0.4702, device='cuda:0', grad_fn=<NllLossBackward>)
0.47021159529685974



Iteration:  65%|████████████████████████████████████████████▋                        | 269/415 [03:45<02:10,  1.12it/s][A

tensor(0.2445, device='cuda:0', grad_fn=<NllLossBackward>)
0.24446949362754822



Iteration:  65%|████████████████████████████████████████████▉                        | 270/415 [03:46<02:05,  1.15it/s][A

tensor(0.3692, device='cuda:0', grad_fn=<NllLossBackward>)
0.36924582719802856



Iteration:  65%|█████████████████████████████████████████████                        | 271/415 [03:47<02:06,  1.14it/s][A

tensor(0.2237, device='cuda:0', grad_fn=<NllLossBackward>)
0.22373954951763153



Iteration:  66%|█████████████████████████████████████████████▏                       | 272/415 [03:48<02:03,  1.16it/s][A

tensor(0.3598, device='cuda:0', grad_fn=<NllLossBackward>)
0.3598160743713379



Iteration:  66%|█████████████████████████████████████████████▍                       | 273/415 [03:48<02:00,  1.18it/s][A

tensor(0.2454, device='cuda:0', grad_fn=<NllLossBackward>)
0.24535995721817017



Iteration:  66%|█████████████████████████████████████████████▌                       | 274/415 [03:49<02:00,  1.17it/s][A

tensor(0.2824, device='cuda:0', grad_fn=<NllLossBackward>)
0.2823847234249115



Iteration:  66%|█████████████████████████████████████████████▋                       | 275/415 [03:50<01:56,  1.20it/s][A

tensor(0.2677, device='cuda:0', grad_fn=<NllLossBackward>)
0.26774194836616516



Iteration:  67%|█████████████████████████████████████████████▉                       | 276/415 [03:51<01:55,  1.21it/s][A

tensor(0.2859, device='cuda:0', grad_fn=<NllLossBackward>)
0.2858509421348572



Iteration:  67%|██████████████████████████████████████████████                       | 277/415 [03:52<01:56,  1.19it/s][A

tensor(0.3524, device='cuda:0', grad_fn=<NllLossBackward>)
0.35240206122398376



Iteration:  67%|██████████████████████████████████████████████▏                      | 278/415 [03:53<01:55,  1.19it/s][A

tensor(0.4035, device='cuda:0', grad_fn=<NllLossBackward>)
0.4034988284111023



Iteration:  67%|██████████████████████████████████████████████▍                      | 279/415 [03:53<01:53,  1.20it/s][A

tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
0.5711309909820557



Iteration:  67%|██████████████████████████████████████████████▌                      | 280/415 [03:54<01:57,  1.15it/s][A

tensor(0.5067, device='cuda:0', grad_fn=<NllLossBackward>)
0.506722092628479



Iteration:  68%|██████████████████████████████████████████████▋                      | 281/415 [03:55<01:54,  1.17it/s][A

tensor(0.3454, device='cuda:0', grad_fn=<NllLossBackward>)
0.3454150855541229



Iteration:  68%|██████████████████████████████████████████████▉                      | 282/415 [03:56<01:52,  1.18it/s][A

tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)
0.5552186965942383



Iteration:  68%|███████████████████████████████████████████████                      | 283/415 [03:57<01:51,  1.19it/s][A

tensor(0.4002, device='cuda:0', grad_fn=<NllLossBackward>)
0.40023767948150635



Iteration:  68%|███████████████████████████████████████████████▏                     | 284/415 [03:58<01:49,  1.19it/s][A

tensor(0.3402, device='cuda:0', grad_fn=<NllLossBackward>)
0.34019550681114197



Iteration:  69%|███████████████████████████████████████████████▍                     | 285/415 [03:58<01:48,  1.20it/s][A

tensor(0.4371, device='cuda:0', grad_fn=<NllLossBackward>)
0.4371284544467926



Iteration:  69%|███████████████████████████████████████████████▌                     | 286/415 [03:59<01:48,  1.19it/s][A

tensor(0.2732, device='cuda:0', grad_fn=<NllLossBackward>)
0.27320533990859985



Iteration:  69%|███████████████████████████████████████████████▋                     | 287/415 [04:00<01:45,  1.21it/s][A

tensor(0.3268, device='cuda:0', grad_fn=<NllLossBackward>)
0.32683879137039185



Iteration:  69%|███████████████████████████████████████████████▉                     | 288/415 [04:01<01:46,  1.19it/s][A

tensor(0.3033, device='cuda:0', grad_fn=<NllLossBackward>)
0.30331382155418396



Iteration:  70%|████████████████████████████████████████████████                     | 289/415 [04:02<01:46,  1.19it/s][A

tensor(0.5050, device='cuda:0', grad_fn=<NllLossBackward>)
0.504986584186554



Iteration:  70%|████████████████████████████████████████████████▏                    | 290/415 [04:03<01:46,  1.17it/s][A

tensor(0.2755, device='cuda:0', grad_fn=<NllLossBackward>)
0.2755358815193176



Iteration:  70%|████████████████████████████████████████████████▍                    | 291/415 [04:04<01:44,  1.18it/s][A

tensor(0.4222, device='cuda:0', grad_fn=<NllLossBackward>)
0.4222257435321808



Iteration:  70%|████████████████████████████████████████████████▌                    | 292/415 [04:04<01:46,  1.16it/s][A

tensor(0.3065, device='cuda:0', grad_fn=<NllLossBackward>)
0.3065052330493927



Iteration:  71%|████████████████████████████████████████████████▋                    | 293/415 [04:05<01:44,  1.17it/s][A

tensor(0.3779, device='cuda:0', grad_fn=<NllLossBackward>)
0.37786734104156494



Iteration:  71%|████████████████████████████████████████████████▉                    | 294/415 [04:06<01:42,  1.18it/s][A

tensor(0.2702, device='cuda:0', grad_fn=<NllLossBackward>)
0.27016523480415344



Iteration:  71%|█████████████████████████████████████████████████                    | 295/415 [04:07<01:42,  1.17it/s][A

tensor(0.2368, device='cuda:0', grad_fn=<NllLossBackward>)
0.23676520586013794



Iteration:  71%|█████████████████████████████████████████████████▏                   | 296/415 [04:08<01:42,  1.16it/s][A

tensor(0.5036, device='cuda:0', grad_fn=<NllLossBackward>)
0.5036329030990601



Iteration:  72%|█████████████████████████████████████████████████▍                   | 297/415 [04:09<01:39,  1.18it/s][A

tensor(0.2920, device='cuda:0', grad_fn=<NllLossBackward>)
0.2920457422733307



Iteration:  72%|█████████████████████████████████████████████████▌                   | 298/415 [04:09<01:38,  1.19it/s][A

tensor(0.2951, device='cuda:0', grad_fn=<NllLossBackward>)
0.2951348125934601



Iteration:  72%|█████████████████████████████████████████████████▋                   | 299/415 [04:10<01:38,  1.18it/s][A

tensor(0.4710, device='cuda:0', grad_fn=<NllLossBackward>)
0.4710391163825989



Iteration:  72%|█████████████████████████████████████████████████▉                   | 300/415 [04:11<01:37,  1.18it/s][A

tensor(0.2835, device='cuda:0', grad_fn=<NllLossBackward>)
0.283499538898468



Iteration:  73%|██████████████████████████████████████████████████                   | 301/415 [04:12<01:40,  1.13it/s][A

tensor(0.3150, device='cuda:0', grad_fn=<NllLossBackward>)
0.3150009512901306



Iteration:  73%|██████████████████████████████████████████████████▏                  | 302/415 [04:13<01:37,  1.16it/s][A

tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
0.24085991084575653



Iteration:  73%|██████████████████████████████████████████████████▍                  | 303/415 [04:14<01:35,  1.17it/s][A

tensor(0.3304, device='cuda:0', grad_fn=<NllLossBackward>)
0.3304339349269867



Iteration:  73%|██████████████████████████████████████████████████▌                  | 304/415 [04:15<01:33,  1.18it/s][A

tensor(0.2446, device='cuda:0', grad_fn=<NllLossBackward>)
0.2446010410785675



Iteration:  73%|██████████████████████████████████████████████████▋                  | 305/415 [04:15<01:32,  1.19it/s][A

tensor(0.4189, device='cuda:0', grad_fn=<NllLossBackward>)
0.4189082086086273



Iteration:  74%|██████████████████████████████████████████████████▉                  | 306/415 [04:16<01:30,  1.20it/s][A

tensor(0.1983, device='cuda:0', grad_fn=<NllLossBackward>)
0.19832368195056915



Iteration:  74%|███████████████████████████████████████████████████                  | 307/415 [04:17<01:29,  1.20it/s][A

tensor(0.3558, device='cuda:0', grad_fn=<NllLossBackward>)
0.355759859085083



Iteration:  74%|███████████████████████████████████████████████████▏                 | 308/415 [04:18<01:28,  1.21it/s][A

tensor(0.4589, device='cuda:0', grad_fn=<NllLossBackward>)
0.4588809609413147



Iteration:  74%|███████████████████████████████████████████████████▍                 | 309/415 [04:19<01:27,  1.21it/s][A

tensor(0.3597, device='cuda:0', grad_fn=<NllLossBackward>)
0.35965627431869507



Iteration:  75%|███████████████████████████████████████████████████▌                 | 310/415 [04:20<01:26,  1.21it/s][A

tensor(0.2057, device='cuda:0', grad_fn=<NllLossBackward>)
0.205664724111557



Iteration:  75%|███████████████████████████████████████████████████▋                 | 311/415 [04:20<01:26,  1.20it/s][A

tensor(0.3994, device='cuda:0', grad_fn=<NllLossBackward>)
0.3993892967700958



Iteration:  75%|███████████████████████████████████████████████████▊                 | 312/415 [04:21<01:30,  1.14it/s][A

tensor(0.1822, device='cuda:0', grad_fn=<NllLossBackward>)
0.18217161297798157



Iteration:  75%|████████████████████████████████████████████████████                 | 313/415 [04:22<01:26,  1.17it/s][A

tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
0.24251960217952728



Iteration:  76%|████████████████████████████████████████████████████▏                | 314/415 [04:23<01:25,  1.18it/s][A

tensor(0.3639, device='cuda:0', grad_fn=<NllLossBackward>)
0.3638947606086731



Iteration:  76%|████████████████████████████████████████████████████▎                | 315/415 [04:24<01:24,  1.18it/s][A

tensor(0.6096, device='cuda:0', grad_fn=<NllLossBackward>)
0.6096396446228027



Iteration:  76%|████████████████████████████████████████████████████▌                | 316/415 [04:25<01:23,  1.19it/s][A

tensor(0.1911, device='cuda:0', grad_fn=<NllLossBackward>)
0.19108109176158905



Iteration:  76%|████████████████████████████████████████████████████▋                | 317/415 [04:26<01:24,  1.16it/s][A

tensor(0.3018, device='cuda:0', grad_fn=<NllLossBackward>)
0.30180609226226807



Iteration:  77%|████████████████████████████████████████████████████▊                | 318/415 [04:26<01:22,  1.18it/s][A

tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
0.27758628129959106



Iteration:  77%|█████████████████████████████████████████████████████                | 319/415 [04:27<01:22,  1.17it/s][A

tensor(0.4256, device='cuda:0', grad_fn=<NllLossBackward>)
0.4255582094192505



Iteration:  77%|█████████████████████████████████████████████████████▏               | 320/415 [04:28<01:22,  1.16it/s][A

tensor(0.3064, device='cuda:0', grad_fn=<NllLossBackward>)
0.3063701391220093



Iteration:  77%|█████████████████████████████████████████████████████▎               | 321/415 [04:29<01:21,  1.15it/s][A

tensor(0.2994, device='cuda:0', grad_fn=<NllLossBackward>)
0.2994471490383148



Iteration:  78%|█████████████████████████████████████████████████████▌               | 322/415 [04:30<01:19,  1.16it/s][A

tensor(0.2693, device='cuda:0', grad_fn=<NllLossBackward>)
0.2692650258541107



Iteration:  78%|█████████████████████████████████████████████████████▋               | 323/415 [04:31<01:20,  1.15it/s][A

tensor(0.2094, device='cuda:0', grad_fn=<NllLossBackward>)
0.2093539535999298



Iteration:  78%|█████████████████████████████████████████████████████▊               | 324/415 [04:32<01:18,  1.16it/s][A

tensor(0.3562, device='cuda:0', grad_fn=<NllLossBackward>)
0.35622090101242065



Iteration:  78%|██████████████████████████████████████████████████████               | 325/415 [04:32<01:17,  1.16it/s][A

tensor(0.5156, device='cuda:0', grad_fn=<NllLossBackward>)
0.5155884027481079



Iteration:  79%|██████████████████████████████████████████████████████▏              | 326/415 [04:33<01:15,  1.18it/s][A

tensor(0.2770, device='cuda:0', grad_fn=<NllLossBackward>)
0.27699118852615356



Iteration:  79%|██████████████████████████████████████████████████████▎              | 327/415 [04:34<01:14,  1.18it/s][A

tensor(0.2643, device='cuda:0', grad_fn=<NllLossBackward>)
0.26427820324897766



Iteration:  79%|██████████████████████████████████████████████████████▌              | 328/415 [04:35<01:12,  1.19it/s][A

tensor(0.3064, device='cuda:0', grad_fn=<NllLossBackward>)
0.3064310550689697



Iteration:  79%|██████████████████████████████████████████████████████▋              | 329/415 [04:36<01:12,  1.18it/s][A

tensor(0.2948, device='cuda:0', grad_fn=<NllLossBackward>)
0.29479971528053284



Iteration:  80%|██████████████████████████████████████████████████████▊              | 330/415 [04:37<01:12,  1.17it/s][A

tensor(0.5677, device='cuda:0', grad_fn=<NllLossBackward>)
0.5676760077476501



Iteration:  80%|███████████████████████████████████████████████████████              | 331/415 [04:38<01:10,  1.18it/s][A

tensor(0.3702, device='cuda:0', grad_fn=<NllLossBackward>)
0.37020769715309143



Iteration:  80%|███████████████████████████████████████████████████████▏             | 332/415 [04:38<01:09,  1.19it/s][A

tensor(0.4047, device='cuda:0', grad_fn=<NllLossBackward>)
0.404740571975708



Iteration:  80%|███████████████████████████████████████████████████████▎             | 333/415 [04:39<01:10,  1.16it/s][A

tensor(0.1588, device='cuda:0', grad_fn=<NllLossBackward>)
0.1587827503681183



Iteration:  80%|███████████████████████████████████████████████████████▌             | 334/415 [04:40<01:08,  1.18it/s][A

tensor(0.3183, device='cuda:0', grad_fn=<NllLossBackward>)
0.3182683289051056



Iteration:  81%|███████████████████████████████████████████████████████▋             | 335/415 [04:41<01:08,  1.16it/s][A

tensor(0.3202, device='cuda:0', grad_fn=<NllLossBackward>)
0.3202073276042938



Iteration:  81%|███████████████████████████████████████████████████████▊             | 336/415 [04:42<01:08,  1.16it/s][A

tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)
0.21205970644950867



Iteration:  81%|████████████████████████████████████████████████████████             | 337/415 [04:43<01:06,  1.18it/s][A

tensor(0.5119, device='cuda:0', grad_fn=<NllLossBackward>)
0.5119150280952454



Iteration:  81%|████████████████████████████████████████████████████████▏            | 338/415 [04:43<01:05,  1.18it/s][A

tensor(0.3681, device='cuda:0', grad_fn=<NllLossBackward>)
0.36814016103744507



Iteration:  82%|████████████████████████████████████████████████████████▎            | 339/415 [04:44<01:04,  1.18it/s][A

tensor(0.3108, device='cuda:0', grad_fn=<NllLossBackward>)
0.3108116388320923



Iteration:  82%|████████████████████████████████████████████████████████▌            | 340/415 [04:45<01:04,  1.16it/s][A

tensor(0.1781, device='cuda:0', grad_fn=<NllLossBackward>)
0.17807133495807648



Iteration:  82%|████████████████████████████████████████████████████████▋            | 341/415 [04:46<01:05,  1.14it/s][A

tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
0.25048673152923584



Iteration:  82%|████████████████████████████████████████████████████████▊            | 342/415 [04:47<01:02,  1.16it/s][A

tensor(0.2455, device='cuda:0', grad_fn=<NllLossBackward>)
0.24546033143997192



Iteration:  83%|█████████████████████████████████████████████████████████            | 343/415 [04:48<01:02,  1.16it/s][A

tensor(0.4883, device='cuda:0', grad_fn=<NllLossBackward>)
0.48829764127731323



Iteration:  83%|█████████████████████████████████████████████████████████▏           | 344/415 [04:49<01:00,  1.18it/s][A

tensor(0.4569, device='cuda:0', grad_fn=<NllLossBackward>)
0.45693713426589966



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 345/415 [04:50<00:59,  1.17it/s][A

tensor(0.3145, device='cuda:0', grad_fn=<NllLossBackward>)
0.3144921064376831



Iteration:  83%|█████████████████████████████████████████████████████████▌           | 346/415 [04:50<01:00,  1.15it/s][A

tensor(0.3439, device='cuda:0', grad_fn=<NllLossBackward>)
0.3438762128353119



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 347/415 [04:51<00:58,  1.17it/s][A

tensor(0.3999, device='cuda:0', grad_fn=<NllLossBackward>)
0.39989370107650757



Iteration:  84%|█████████████████████████████████████████████████████████▊           | 348/415 [04:52<00:57,  1.17it/s][A

tensor(0.2785, device='cuda:0', grad_fn=<NllLossBackward>)
0.27851471304893494



Iteration:  84%|██████████████████████████████████████████████████████████           | 349/415 [04:53<00:56,  1.18it/s][A

tensor(0.2224, device='cuda:0', grad_fn=<NllLossBackward>)
0.22243961691856384



Iteration:  84%|██████████████████████████████████████████████████████████▏          | 350/415 [04:54<00:54,  1.19it/s][A

tensor(0.4173, device='cuda:0', grad_fn=<NllLossBackward>)
0.41730183362960815



Iteration:  85%|██████████████████████████████████████████████████████████▎          | 351/415 [04:55<00:53,  1.20it/s][A

tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
0.17183905839920044



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 352/415 [04:56<00:57,  1.09it/s][A

tensor(0.3601, device='cuda:0', grad_fn=<NllLossBackward>)
0.36009424924850464



Iteration:  85%|██████████████████████████████████████████████████████████▋          | 353/415 [04:57<00:55,  1.12it/s][A

tensor(0.3972, device='cuda:0', grad_fn=<NllLossBackward>)
0.39715635776519775



Iteration:  85%|██████████████████████████████████████████████████████████▊          | 354/415 [04:57<00:53,  1.13it/s][A

tensor(0.2139, device='cuda:0', grad_fn=<NllLossBackward>)
0.21392372250556946



Iteration:  86%|███████████████████████████████████████████████████████████          | 355/415 [04:58<00:52,  1.15it/s][A

tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
0.3275488615036011



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 356/415 [04:59<00:50,  1.17it/s][A

tensor(0.2461, device='cuda:0', grad_fn=<NllLossBackward>)
0.2460627406835556



Iteration:  86%|███████████████████████████████████████████████████████████▎         | 357/415 [05:00<00:48,  1.19it/s][A

tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)
0.22587206959724426



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 358/415 [05:01<00:48,  1.19it/s][A

tensor(0.5746, device='cuda:0', grad_fn=<NllLossBackward>)
0.574635922908783



Iteration:  87%|███████████████████████████████████████████████████████████▋         | 359/415 [05:02<00:47,  1.19it/s][A

tensor(0.3483, device='cuda:0', grad_fn=<NllLossBackward>)
0.34833264350891113



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 360/415 [05:02<00:46,  1.19it/s][A

tensor(0.4111, device='cuda:0', grad_fn=<NllLossBackward>)
0.41107413172721863



Iteration:  87%|████████████████████████████████████████████████████████████         | 361/415 [05:03<00:45,  1.20it/s][A

tensor(0.4459, device='cuda:0', grad_fn=<NllLossBackward>)
0.44588032364845276



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 362/415 [05:04<00:48,  1.08it/s][A

tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
0.5443649888038635



Iteration:  87%|████████████████████████████████████████████████████████████▎        | 363/415 [05:05<00:47,  1.08it/s][A

tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward>)
0.31811296939849854



Iteration:  88%|████████████████████████████████████████████████████████████▌        | 364/415 [05:06<00:45,  1.11it/s][A

tensor(0.2792, device='cuda:0', grad_fn=<NllLossBackward>)
0.2792462110519409



Iteration:  88%|████████████████████████████████████████████████████████████▋        | 365/415 [05:07<00:44,  1.11it/s][A

tensor(0.3499, device='cuda:0', grad_fn=<NllLossBackward>)
0.3498801290988922



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 366/415 [05:08<00:43,  1.14it/s][A

tensor(0.4098, device='cuda:0', grad_fn=<NllLossBackward>)
0.4098123610019684



Iteration:  88%|█████████████████████████████████████████████████████████████        | 367/415 [05:09<00:42,  1.13it/s][A

tensor(0.5989, device='cuda:0', grad_fn=<NllLossBackward>)
0.5989323854446411



Iteration:  89%|█████████████████████████████████████████████████████████████▏       | 368/415 [05:10<00:41,  1.15it/s][A

tensor(0.2708, device='cuda:0', grad_fn=<NllLossBackward>)
0.2708394229412079



Iteration:  89%|█████████████████████████████████████████████████████████████▎       | 369/415 [05:11<00:41,  1.12it/s][A

tensor(0.3636, device='cuda:0', grad_fn=<NllLossBackward>)
0.3636029064655304



Iteration:  89%|█████████████████████████████████████████████████████████████▌       | 370/415 [05:11<00:39,  1.15it/s][A

tensor(0.4387, device='cuda:0', grad_fn=<NllLossBackward>)
0.4387419521808624



Iteration:  89%|█████████████████████████████████████████████████████████████▋       | 371/415 [05:12<00:37,  1.16it/s][A

tensor(0.3871, device='cuda:0', grad_fn=<NllLossBackward>)
0.38713136315345764



Iteration:  90%|█████████████████████████████████████████████████████████████▊       | 372/415 [05:13<00:37,  1.16it/s][A

tensor(0.2953, device='cuda:0', grad_fn=<NllLossBackward>)
0.29529428482055664



Iteration:  90%|██████████████████████████████████████████████████████████████       | 373/415 [05:14<00:36,  1.14it/s][A

tensor(0.2631, device='cuda:0', grad_fn=<NllLossBackward>)
0.26311951875686646



Iteration:  90%|██████████████████████████████████████████████████████████████▏      | 374/415 [05:15<00:38,  1.07it/s][A

tensor(0.2241, device='cuda:0', grad_fn=<NllLossBackward>)
0.22406379878520966



Iteration:  90%|██████████████████████████████████████████████████████████████▎      | 375/415 [05:16<00:36,  1.10it/s][A

tensor(0.2155, device='cuda:0', grad_fn=<NllLossBackward>)
0.215512216091156



Iteration:  91%|██████████████████████████████████████████████████████████████▌      | 376/415 [05:17<00:36,  1.08it/s][A

tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
0.49110284447669983



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 377/415 [05:18<00:34,  1.10it/s][A

tensor(0.5842, device='cuda:0', grad_fn=<NllLossBackward>)
0.5841903686523438



Iteration:  91%|██████████████████████████████████████████████████████████████▊      | 378/415 [05:19<00:32,  1.14it/s][A

tensor(0.5471, device='cuda:0', grad_fn=<NllLossBackward>)
0.547135591506958



Iteration:  91%|███████████████████████████████████████████████████████████████      | 379/415 [05:19<00:31,  1.16it/s][A

tensor(0.3171, device='cuda:0', grad_fn=<NllLossBackward>)
0.31710612773895264



Iteration:  92%|███████████████████████████████████████████████████████████████▏     | 380/415 [05:20<00:29,  1.17it/s][A

tensor(0.2726, device='cuda:0', grad_fn=<NllLossBackward>)
0.2725604176521301



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 381/415 [05:21<00:28,  1.19it/s][A

tensor(0.2926, device='cuda:0', grad_fn=<NllLossBackward>)
0.29255855083465576



Iteration:  92%|███████████████████████████████████████████████████████████████▌     | 382/415 [05:22<00:28,  1.15it/s][A

tensor(0.2405, device='cuda:0', grad_fn=<NllLossBackward>)
0.24049581587314606



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 383/415 [05:23<00:27,  1.16it/s][A

tensor(0.4171, device='cuda:0', grad_fn=<NllLossBackward>)
0.4170544743537903



Iteration:  93%|███████████████████████████████████████████████████████████████▊     | 384/415 [05:24<00:27,  1.13it/s][A

tensor(0.2691, device='cuda:0', grad_fn=<NllLossBackward>)
0.2691006064414978



Iteration:  93%|████████████████████████████████████████████████████████████████     | 385/415 [05:25<00:26,  1.14it/s][A

tensor(0.2622, device='cuda:0', grad_fn=<NllLossBackward>)
0.26217448711395264



Iteration:  93%|████████████████████████████████████████████████████████████████▏    | 386/415 [05:25<00:25,  1.14it/s][A

tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
0.27760517597198486



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 387/415 [05:26<00:24,  1.16it/s][A

tensor(0.3163, device='cuda:0', grad_fn=<NllLossBackward>)
0.31631556153297424



Iteration:  93%|████████████████████████████████████████████████████████████████▌    | 388/415 [05:27<00:23,  1.17it/s][A

tensor(0.2643, device='cuda:0', grad_fn=<NllLossBackward>)
0.2642804980278015



Iteration:  94%|████████████████████████████████████████████████████████████████▋    | 389/415 [05:28<00:21,  1.18it/s][A

tensor(0.3221, device='cuda:0', grad_fn=<NllLossBackward>)
0.32209116220474243



Iteration:  94%|████████████████████████████████████████████████████████████████▊    | 390/415 [05:29<00:21,  1.19it/s][A

tensor(0.4963, device='cuda:0', grad_fn=<NllLossBackward>)
0.49628961086273193



Iteration:  94%|█████████████████████████████████████████████████████████████████    | 391/415 [05:30<00:20,  1.19it/s][A

tensor(0.2898, device='cuda:0', grad_fn=<NllLossBackward>)
0.28975778818130493



Iteration:  94%|█████████████████████████████████████████████████████████████████▏   | 392/415 [05:31<00:19,  1.16it/s][A

tensor(0.2946, device='cuda:0', grad_fn=<NllLossBackward>)
0.2945680320262909



Iteration:  95%|█████████████████████████████████████████████████████████████████▎   | 393/415 [05:31<00:19,  1.15it/s][A

tensor(0.3132, device='cuda:0', grad_fn=<NllLossBackward>)
0.3131621479988098



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 394/415 [05:32<00:18,  1.16it/s][A

tensor(0.3343, device='cuda:0', grad_fn=<NllLossBackward>)
0.3342563807964325



Iteration:  95%|█████████████████████████████████████████████████████████████████▋   | 395/415 [05:33<00:16,  1.18it/s][A

tensor(0.5101, device='cuda:0', grad_fn=<NllLossBackward>)
0.510106086730957



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 396/415 [05:34<00:16,  1.18it/s][A

tensor(0.2772, device='cuda:0', grad_fn=<NllLossBackward>)
0.2771954834461212



Iteration:  96%|██████████████████████████████████████████████████████████████████   | 397/415 [05:35<00:15,  1.17it/s][A

tensor(0.4014, device='cuda:0', grad_fn=<NllLossBackward>)
0.40139099955558777



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 398/415 [05:36<00:14,  1.16it/s][A

tensor(0.2683, device='cuda:0', grad_fn=<NllLossBackward>)
0.2682567834854126



Iteration:  96%|██████████████████████████████████████████████████████████████████▎  | 399/415 [05:36<00:13,  1.17it/s][A

tensor(0.2863, device='cuda:0', grad_fn=<NllLossBackward>)
0.2863255739212036



Iteration:  96%|██████████████████████████████████████████████████████████████████▌  | 400/415 [05:37<00:12,  1.18it/s][A

tensor(0.3779, device='cuda:0', grad_fn=<NllLossBackward>)
0.37791135907173157



Iteration:  97%|██████████████████████████████████████████████████████████████████▋  | 401/415 [05:38<00:11,  1.19it/s][A

tensor(0.2953, device='cuda:0', grad_fn=<NllLossBackward>)
0.29525521397590637



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 402/415 [05:39<00:11,  1.15it/s][A

tensor(0.3070, device='cuda:0', grad_fn=<NllLossBackward>)
0.3069670796394348



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 403/415 [05:40<00:10,  1.14it/s][A

tensor(0.2551, device='cuda:0', grad_fn=<NllLossBackward>)
0.2550501227378845



Iteration:  97%|███████████████████████████████████████████████████████████████████▏ | 404/415 [05:41<00:09,  1.16it/s][A

tensor(0.2751, device='cuda:0', grad_fn=<NllLossBackward>)
0.27509695291519165



Iteration:  98%|███████████████████████████████████████████████████████████████████▎ | 405/415 [05:42<00:08,  1.16it/s][A

tensor(0.3897, device='cuda:0', grad_fn=<NllLossBackward>)
0.3896794319152832



Iteration:  98%|███████████████████████████████████████████████████████████████████▌ | 406/415 [05:42<00:07,  1.18it/s][A

tensor(0.2522, device='cuda:0', grad_fn=<NllLossBackward>)
0.25217100977897644



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 407/415 [05:43<00:06,  1.18it/s][A

tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
0.16782914102077484



Iteration:  98%|███████████████████████████████████████████████████████████████████▊ | 408/415 [05:44<00:06,  1.16it/s][A

tensor(0.2154, device='cuda:0', grad_fn=<NllLossBackward>)
0.21539835631847382



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 409/415 [05:45<00:05,  1.16it/s][A

tensor(0.3150, device='cuda:0', grad_fn=<NllLossBackward>)
0.314981073141098



Iteration:  99%|████████████████████████████████████████████████████████████████████▏| 410/415 [05:46<00:04,  1.18it/s][A

tensor(0.2663, device='cuda:0', grad_fn=<NllLossBackward>)
0.2663154602050781



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 411/415 [05:47<00:03,  1.18it/s][A

tensor(0.2435, device='cuda:0', grad_fn=<NllLossBackward>)
0.2435479462146759



Iteration:  99%|████████████████████████████████████████████████████████████████████▌| 412/415 [05:48<00:02,  1.19it/s][A

tensor(0.2929, device='cuda:0', grad_fn=<NllLossBackward>)
0.2929089069366455



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 413/415 [05:48<00:01,  1.19it/s][A

tensor(0.3405, device='cuda:0', grad_fn=<NllLossBackward>)
0.3404770791530609



Iteration: 100%|████████████████████████████████████████████████████████████████████▊| 414/415 [05:49<00:00,  1.18it/s][A

tensor(0.1689, device='cuda:0', grad_fn=<NllLossBackward>)
0.16893987357616425



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 415/415 [05:50<00:00,  1.18it/s][A
Epoch:  67%|██████████████████████████████████████████████████▋                         | 2/3 [11:10<05:28, 328.94s/it]
Iteration:   0%|                                                                               | 0/415 [00:00<?, ?it/s][A

tensor(0.3329, device='cuda:0', grad_fn=<NllLossBackward>)
0.33294591307640076



Iteration:   0%|▏                                                                      | 1/415 [00:00<05:44,  1.20it/s][A

tensor(0.1566, device='cuda:0', grad_fn=<NllLossBackward>)
0.156564861536026



Iteration:   0%|▎                                                                      | 2/415 [00:01<05:50,  1.18it/s][A

tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward>)
0.21966983377933502



Iteration:   1%|▌                                                                      | 3/415 [00:02<05:51,  1.17it/s][A

tensor(0.1554, device='cuda:0', grad_fn=<NllLossBackward>)
0.15535491704940796



Iteration:   1%|▋                                                                      | 4/415 [00:03<06:00,  1.14it/s][A

tensor(0.2171, device='cuda:0', grad_fn=<NllLossBackward>)
0.21709215641021729



Iteration:   1%|▊                                                                      | 5/415 [00:04<05:59,  1.14it/s][A

tensor(0.2700, device='cuda:0', grad_fn=<NllLossBackward>)
0.27003955841064453



Iteration:   1%|█                                                                      | 6/415 [00:05<06:14,  1.09it/s][A

tensor(0.2547, device='cuda:0', grad_fn=<NllLossBackward>)
0.25470465421676636



Iteration:   2%|█▏                                                                     | 7/415 [00:06<06:32,  1.04it/s][A

tensor(0.2210, device='cuda:0', grad_fn=<NllLossBackward>)
0.2210022211074829



Iteration:   2%|█▎                                                                     | 8/415 [00:07<06:26,  1.05it/s][A

tensor(0.3927, device='cuda:0', grad_fn=<NllLossBackward>)
0.3926973342895508



Iteration:   2%|█▌                                                                     | 9/415 [00:08<06:16,  1.08it/s][A

tensor(0.2394, device='cuda:0', grad_fn=<NllLossBackward>)
0.23935559391975403



Iteration:   2%|█▋                                                                    | 10/415 [00:09<06:09,  1.10it/s][A

tensor(0.2257, device='cuda:0', grad_fn=<NllLossBackward>)
0.22574374079704285



Iteration:   3%|█▊                                                                    | 11/415 [00:09<05:55,  1.14it/s][A

tensor(0.1871, device='cuda:0', grad_fn=<NllLossBackward>)
0.18712599575519562



Iteration:   3%|██                                                                    | 12/415 [00:10<05:48,  1.16it/s][A

tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
0.24248886108398438



Iteration:   3%|██▏                                                                   | 13/415 [00:11<05:55,  1.13it/s][A

tensor(0.1956, device='cuda:0', grad_fn=<NllLossBackward>)
0.1955745965242386



Iteration:   3%|██▎                                                                   | 14/415 [00:12<05:52,  1.14it/s][A

tensor(0.3457, device='cuda:0', grad_fn=<NllLossBackward>)
0.3457479178905487



Iteration:   4%|██▌                                                                   | 15/415 [00:13<05:49,  1.14it/s][A

tensor(0.2199, device='cuda:0', grad_fn=<NllLossBackward>)
0.21989262104034424



Iteration:   4%|██▋                                                                   | 16/415 [00:14<05:55,  1.12it/s][A

tensor(0.2056, device='cuda:0', grad_fn=<NllLossBackward>)
0.2055608034133911



Iteration:   4%|██▊                                                                   | 17/415 [00:15<05:46,  1.15it/s][A

tensor(0.1481, device='cuda:0', grad_fn=<NllLossBackward>)
0.14809297025203705



Iteration:   4%|███                                                                   | 18/415 [00:16<05:44,  1.15it/s][A

tensor(0.1930, device='cuda:0', grad_fn=<NllLossBackward>)
0.19297797977924347



Iteration:   5%|███▏                                                                  | 19/415 [00:16<05:40,  1.16it/s][A

tensor(0.2998, device='cuda:0', grad_fn=<NllLossBackward>)
0.29976537823677063



Iteration:   5%|███▎                                                                  | 20/415 [00:17<05:37,  1.17it/s][A

tensor(0.2367, device='cuda:0', grad_fn=<NllLossBackward>)
0.23671913146972656



Iteration:   5%|███▌                                                                  | 21/415 [00:18<05:31,  1.19it/s][A

tensor(0.1474, device='cuda:0', grad_fn=<NllLossBackward>)
0.14743474125862122



Iteration:   5%|███▋                                                                  | 22/415 [00:19<05:30,  1.19it/s][A

tensor(0.3198, device='cuda:0', grad_fn=<NllLossBackward>)
0.3197553753852844



Iteration:   6%|███▉                                                                  | 23/415 [00:20<05:38,  1.16it/s][A

tensor(0.3141, device='cuda:0', grad_fn=<NllLossBackward>)
0.31409841775894165



Iteration:   6%|████                                                                  | 24/415 [00:21<05:34,  1.17it/s][A

tensor(0.1434, device='cuda:0', grad_fn=<NllLossBackward>)
0.14341703057289124



Iteration:   6%|████▏                                                                 | 25/415 [00:22<05:37,  1.15it/s][A

tensor(0.1240, device='cuda:0', grad_fn=<NllLossBackward>)
0.12402815371751785



Iteration:   6%|████▍                                                                 | 26/415 [00:22<05:32,  1.17it/s][A

tensor(0.2636, device='cuda:0', grad_fn=<NllLossBackward>)
0.2636430263519287



Iteration:   7%|████▌                                                                 | 27/415 [00:23<05:37,  1.15it/s][A

tensor(0.2506, device='cuda:0', grad_fn=<NllLossBackward>)
0.250591516494751



Iteration:   7%|████▋                                                                 | 28/415 [00:24<05:33,  1.16it/s][A

tensor(0.1046, device='cuda:0', grad_fn=<NllLossBackward>)
0.10464783012866974



Iteration:   7%|████▉                                                                 | 29/415 [00:25<05:28,  1.17it/s][A

tensor(0.4281, device='cuda:0', grad_fn=<NllLossBackward>)
0.42813119292259216



Iteration:   7%|█████                                                                 | 30/415 [00:26<05:30,  1.17it/s][A

tensor(0.1927, device='cuda:0', grad_fn=<NllLossBackward>)
0.19265799224376678



Iteration:   7%|█████▏                                                                | 31/415 [00:27<05:26,  1.17it/s][A

tensor(0.0917, device='cuda:0', grad_fn=<NllLossBackward>)
0.09173250198364258



Iteration:   8%|█████▍                                                                | 32/415 [00:27<05:24,  1.18it/s][A

tensor(0.2201, device='cuda:0', grad_fn=<NllLossBackward>)
0.22005632519721985



Iteration:   8%|█████▌                                                                | 33/415 [00:28<05:22,  1.19it/s][A

tensor(0.1685, device='cuda:0', grad_fn=<NllLossBackward>)
0.16854660212993622



Iteration:   8%|█████▋                                                                | 34/415 [00:29<05:20,  1.19it/s][A

tensor(0.0838, device='cuda:0', grad_fn=<NllLossBackward>)
0.08382651209831238



Iteration:   8%|█████▉                                                                | 35/415 [00:30<05:20,  1.19it/s][A

tensor(0.2585, device='cuda:0', grad_fn=<NllLossBackward>)
0.2585252821445465



Iteration:   9%|██████                                                                | 36/415 [00:31<05:17,  1.19it/s][A

tensor(0.2213, device='cuda:0', grad_fn=<NllLossBackward>)
0.2213335782289505



Iteration:   9%|██████▏                                                               | 37/415 [00:32<05:26,  1.16it/s][A

tensor(0.3020, device='cuda:0', grad_fn=<NllLossBackward>)
0.3019627034664154



Iteration:   9%|██████▍                                                               | 38/415 [00:33<05:30,  1.14it/s][A

tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)
0.1807018369436264



Iteration:   9%|██████▌                                                               | 39/415 [00:33<05:20,  1.17it/s][A

tensor(0.1667, device='cuda:0', grad_fn=<NllLossBackward>)
0.16673293709754944



Iteration:  10%|██████▋                                                               | 40/415 [00:34<05:19,  1.17it/s][A

tensor(0.2152, device='cuda:0', grad_fn=<NllLossBackward>)
0.21522004902362823



Iteration:  10%|██████▉                                                               | 41/415 [00:35<05:27,  1.14it/s][A

tensor(0.4862, device='cuda:0', grad_fn=<NllLossBackward>)
0.48616087436676025



Iteration:  10%|███████                                                               | 42/415 [00:36<05:39,  1.10it/s][A

tensor(0.1990, device='cuda:0', grad_fn=<NllLossBackward>)
0.1989685744047165



Iteration:  10%|███████▎                                                              | 43/415 [00:37<05:51,  1.06it/s][A

tensor(0.2464, device='cuda:0', grad_fn=<NllLossBackward>)
0.24642440676689148



Iteration:  11%|███████▍                                                              | 44/415 [00:38<05:41,  1.09it/s][A

tensor(0.3453, device='cuda:0', grad_fn=<NllLossBackward>)
0.3453190326690674



Iteration:  11%|███████▌                                                              | 45/415 [00:39<05:28,  1.12it/s][A

tensor(0.2372, device='cuda:0', grad_fn=<NllLossBackward>)
0.23720183968544006



Iteration:  11%|███████▊                                                              | 46/415 [00:40<05:44,  1.07it/s][A

tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
0.24085965752601624



Iteration:  11%|███████▉                                                              | 47/415 [00:41<05:41,  1.08it/s][A

tensor(0.2466, device='cuda:0', grad_fn=<NllLossBackward>)
0.24662016332149506



Iteration:  12%|████████                                                              | 48/415 [00:42<05:29,  1.11it/s][A

tensor(0.1391, device='cuda:0', grad_fn=<NllLossBackward>)
0.1390610933303833



Iteration:  12%|████████▎                                                             | 49/415 [00:43<05:26,  1.12it/s][A

tensor(0.1178, device='cuda:0', grad_fn=<NllLossBackward>)
0.11784200370311737



Iteration:  12%|████████▍                                                             | 50/415 [00:43<05:23,  1.13it/s][A

tensor(0.1641, device='cuda:0', grad_fn=<NllLossBackward>)
0.16405647993087769



Iteration:  12%|████████▌                                                             | 51/415 [00:44<05:14,  1.16it/s][A

tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
0.18747664988040924



Iteration:  13%|████████▊                                                             | 52/415 [00:45<05:21,  1.13it/s][A

tensor(0.2062, device='cuda:0', grad_fn=<NllLossBackward>)
0.20616617798805237



Iteration:  13%|████████▉                                                             | 53/415 [00:46<05:12,  1.16it/s][A

tensor(0.0737, device='cuda:0', grad_fn=<NllLossBackward>)
0.07368841767311096



Iteration:  13%|█████████                                                             | 54/415 [00:47<05:05,  1.18it/s][A

tensor(0.1528, device='cuda:0', grad_fn=<NllLossBackward>)
0.15276768803596497



Iteration:  13%|█████████▎                                                            | 55/415 [00:48<05:10,  1.16it/s][A

tensor(0.3444, device='cuda:0', grad_fn=<NllLossBackward>)
0.3443973660469055



Iteration:  13%|█████████▍                                                            | 56/415 [00:49<05:07,  1.17it/s][A

tensor(0.3605, device='cuda:0', grad_fn=<NllLossBackward>)
0.36047783493995667



Iteration:  14%|█████████▌                                                            | 57/415 [00:49<05:03,  1.18it/s][A

tensor(0.0966, device='cuda:0', grad_fn=<NllLossBackward>)
0.09664653241634369



Iteration:  14%|█████████▊                                                            | 58/415 [00:50<05:06,  1.16it/s][A

tensor(0.2355, device='cuda:0', grad_fn=<NllLossBackward>)
0.23547416925430298



Iteration:  14%|█████████▉                                                            | 59/415 [00:51<05:03,  1.17it/s][A

tensor(0.2828, device='cuda:0', grad_fn=<NllLossBackward>)
0.28283587098121643



Iteration:  14%|██████████                                                            | 60/415 [00:52<05:11,  1.14it/s][A

tensor(0.2852, device='cuda:0', grad_fn=<NllLossBackward>)
0.2851908802986145



Iteration:  15%|██████████▎                                                           | 61/415 [00:53<05:11,  1.14it/s][A

tensor(0.0643, device='cuda:0', grad_fn=<NllLossBackward>)
0.06426860392093658



Iteration:  15%|██████████▍                                                           | 62/415 [00:54<05:07,  1.15it/s][A

tensor(0.1635, device='cuda:0', grad_fn=<NllLossBackward>)
0.1634899377822876



Iteration:  15%|██████████▋                                                           | 63/415 [00:55<05:03,  1.16it/s][A

tensor(0.3225, device='cuda:0', grad_fn=<NllLossBackward>)
0.3224501311779022



Iteration:  15%|██████████▊                                                           | 64/415 [00:55<04:59,  1.17it/s][A

tensor(0.1049, device='cuda:0', grad_fn=<NllLossBackward>)
0.10488291084766388



Iteration:  16%|██████████▉                                                           | 65/415 [00:56<04:56,  1.18it/s][A

tensor(0.2297, device='cuda:0', grad_fn=<NllLossBackward>)
0.22967804968357086



Iteration:  16%|███████████▏                                                          | 66/415 [00:57<04:57,  1.17it/s][A

tensor(0.2356, device='cuda:0', grad_fn=<NllLossBackward>)
0.2355562299489975



Iteration:  16%|███████████▎                                                          | 67/415 [00:58<05:04,  1.14it/s][A

tensor(0.3270, device='cuda:0', grad_fn=<NllLossBackward>)
0.3270188868045807



Iteration:  16%|███████████▍                                                          | 68/415 [00:59<05:03,  1.14it/s][A

tensor(0.3613, device='cuda:0', grad_fn=<NllLossBackward>)
0.3613283038139343



Iteration:  17%|███████████▋                                                          | 69/415 [01:00<05:01,  1.15it/s][A

tensor(0.3053, device='cuda:0', grad_fn=<NllLossBackward>)
0.30526411533355713



Iteration:  17%|███████████▊                                                          | 70/415 [01:01<04:57,  1.16it/s][A

tensor(0.4140, device='cuda:0', grad_fn=<NllLossBackward>)
0.4139597415924072



Iteration:  17%|███████████▉                                                          | 71/415 [01:01<04:52,  1.18it/s][A

tensor(0.1930, device='cuda:0', grad_fn=<NllLossBackward>)
0.19297918677330017



Iteration:  17%|████████████▏                                                         | 72/415 [01:02<04:57,  1.15it/s][A

tensor(0.2324, device='cuda:0', grad_fn=<NllLossBackward>)
0.2324027270078659



Iteration:  18%|████████████▎                                                         | 73/415 [01:03<04:52,  1.17it/s][A

tensor(0.2524, device='cuda:0', grad_fn=<NllLossBackward>)
0.252422571182251



Iteration:  18%|████████████▍                                                         | 74/415 [01:04<04:58,  1.14it/s][A

tensor(0.2184, device='cuda:0', grad_fn=<NllLossBackward>)
0.21840175986289978



Iteration:  18%|████████████▋                                                         | 75/415 [01:05<04:52,  1.16it/s][A

tensor(0.1840, device='cuda:0', grad_fn=<NllLossBackward>)
0.18399061262607574



Iteration:  18%|████████████▊                                                         | 76/415 [01:06<04:53,  1.16it/s][A

tensor(0.2910, device='cuda:0', grad_fn=<NllLossBackward>)
0.29104170203208923



Iteration:  19%|████████████▉                                                         | 77/415 [01:07<04:52,  1.16it/s][A

tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
0.3553791344165802



Iteration:  19%|█████████████▏                                                        | 78/415 [01:08<04:55,  1.14it/s][A

tensor(0.2271, device='cuda:0', grad_fn=<NllLossBackward>)
0.22713004052639008



Iteration:  19%|█████████████▎                                                        | 79/415 [01:08<04:49,  1.16it/s][A

tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
0.18991927802562714



Iteration:  19%|█████████████▍                                                        | 80/415 [01:09<04:52,  1.14it/s][A

tensor(0.2036, device='cuda:0', grad_fn=<NllLossBackward>)
0.2036193609237671



Iteration:  20%|█████████████▋                                                        | 81/415 [01:10<04:45,  1.17it/s][A

tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
0.23908549547195435



Iteration:  20%|█████████████▊                                                        | 82/415 [01:11<04:40,  1.19it/s][A

tensor(0.1370, device='cuda:0', grad_fn=<NllLossBackward>)
0.13697661459445953



Iteration:  20%|██████████████                                                        | 83/415 [01:12<04:53,  1.13it/s][A

tensor(0.3205, device='cuda:0', grad_fn=<NllLossBackward>)
0.3205186724662781



Iteration:  20%|██████████████▏                                                       | 84/415 [01:13<04:47,  1.15it/s][A

tensor(0.1519, device='cuda:0', grad_fn=<NllLossBackward>)
0.15194033086299896



Iteration:  20%|██████████████▎                                                       | 85/415 [01:14<04:41,  1.17it/s][A

tensor(0.2149, device='cuda:0', grad_fn=<NllLossBackward>)
0.21493732929229736



Iteration:  21%|██████████████▌                                                       | 86/415 [01:14<04:39,  1.18it/s][A

tensor(0.2531, device='cuda:0', grad_fn=<NllLossBackward>)
0.2531411349773407



Iteration:  21%|██████████████▋                                                       | 87/415 [01:15<04:36,  1.19it/s][A

tensor(0.1057, device='cuda:0', grad_fn=<NllLossBackward>)
0.10566097497940063



Iteration:  21%|██████████████▊                                                       | 88/415 [01:16<04:34,  1.19it/s][A

tensor(0.1232, device='cuda:0', grad_fn=<NllLossBackward>)
0.12321107089519501



Iteration:  21%|███████████████                                                       | 89/415 [01:17<04:46,  1.14it/s][A

tensor(0.1842, device='cuda:0', grad_fn=<NllLossBackward>)
0.1842420995235443



Iteration:  22%|███████████████▏                                                      | 90/415 [01:18<04:43,  1.15it/s][A

tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)
0.20527014136314392



Iteration:  22%|███████████████▎                                                      | 91/415 [01:19<04:38,  1.16it/s][A

tensor(0.3357, device='cuda:0', grad_fn=<NllLossBackward>)
0.335728257894516



Iteration:  22%|███████████████▌                                                      | 92/415 [01:20<04:48,  1.12it/s][A

tensor(0.1186, device='cuda:0', grad_fn=<NllLossBackward>)
0.11862564831972122



Iteration:  22%|███████████████▋                                                      | 93/415 [01:21<04:42,  1.14it/s][A

tensor(0.2802, device='cuda:0', grad_fn=<NllLossBackward>)
0.28020766377449036



Iteration:  23%|███████████████▊                                                      | 94/415 [01:21<04:36,  1.16it/s][A

tensor(0.0976, device='cuda:0', grad_fn=<NllLossBackward>)
0.097634457051754



Iteration:  23%|████████████████                                                      | 95/415 [01:22<04:38,  1.15it/s][A

tensor(0.2329, device='cuda:0', grad_fn=<NllLossBackward>)
0.23285122215747833



Iteration:  23%|████████████████▏                                                     | 96/415 [01:23<04:40,  1.14it/s][A

tensor(0.1724, device='cuda:0', grad_fn=<NllLossBackward>)
0.17240272462368011



Iteration:  23%|████████████████▎                                                     | 97/415 [01:24<04:46,  1.11it/s][A

tensor(0.1015, device='cuda:0', grad_fn=<NllLossBackward>)
0.10154534131288528



Iteration:  24%|████████████████▌                                                     | 98/415 [01:25<04:42,  1.12it/s][A

tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
0.2528682351112366



Iteration:  24%|████████████████▋                                                     | 99/415 [01:26<04:39,  1.13it/s][A

tensor(0.2133, device='cuda:0', grad_fn=<NllLossBackward>)
0.21334585547447205



Iteration:  24%|████████████████▋                                                    | 100/415 [01:27<04:37,  1.14it/s][A

tensor(0.3484, device='cuda:0', grad_fn=<NllLossBackward>)
0.3483788073062897



Iteration:  24%|████████████████▊                                                    | 101/415 [01:28<04:33,  1.15it/s][A

tensor(0.1511, device='cuda:0', grad_fn=<NllLossBackward>)
0.15112601220607758



Iteration:  25%|████████████████▉                                                    | 102/415 [01:28<04:35,  1.13it/s][A

tensor(0.2639, device='cuda:0', grad_fn=<NllLossBackward>)
0.263948917388916



Iteration:  25%|█████████████████▏                                                   | 103/415 [01:29<04:29,  1.16it/s][A

tensor(0.2775, device='cuda:0', grad_fn=<NllLossBackward>)
0.277502179145813



Iteration:  25%|█████████████████▎                                                   | 104/415 [01:30<04:26,  1.17it/s][A

tensor(0.1936, device='cuda:0', grad_fn=<NllLossBackward>)
0.19363613426685333



Iteration:  25%|█████████████████▍                                                   | 105/415 [01:31<04:28,  1.16it/s][A

tensor(0.3652, device='cuda:0', grad_fn=<NllLossBackward>)
0.3652426600456238



Iteration:  26%|█████████████████▌                                                   | 106/415 [01:32<04:26,  1.16it/s][A

tensor(0.3099, device='cuda:0', grad_fn=<NllLossBackward>)
0.30986425280570984



Iteration:  26%|█████████████████▊                                                   | 107/415 [01:33<04:32,  1.13it/s][A

tensor(0.1726, device='cuda:0', grad_fn=<NllLossBackward>)
0.1725752055644989



Iteration:  26%|█████████████████▉                                                   | 108/415 [01:34<04:28,  1.15it/s][A

tensor(0.1327, device='cuda:0', grad_fn=<NllLossBackward>)
0.1326749324798584



Iteration:  26%|██████████████████                                                   | 109/415 [01:34<04:21,  1.17it/s][A

tensor(0.1443, device='cuda:0', grad_fn=<NllLossBackward>)
0.144348606467247



Iteration:  27%|██████████████████▎                                                  | 110/415 [01:35<04:19,  1.18it/s][A

tensor(0.2844, device='cuda:0', grad_fn=<NllLossBackward>)
0.28440889716148376



Iteration:  27%|██████████████████▍                                                  | 111/415 [01:36<04:20,  1.17it/s][A

tensor(0.1520, device='cuda:0', grad_fn=<NllLossBackward>)
0.15197373926639557



Iteration:  27%|██████████████████▌                                                  | 112/415 [01:37<04:23,  1.15it/s][A

tensor(0.1616, device='cuda:0', grad_fn=<NllLossBackward>)
0.16158926486968994



Iteration:  27%|██████████████████▊                                                  | 113/415 [01:38<04:29,  1.12it/s][A

tensor(0.1750, device='cuda:0', grad_fn=<NllLossBackward>)
0.1749788224697113



Iteration:  27%|██████████████████▉                                                  | 114/415 [01:39<04:28,  1.12it/s][A

tensor(0.3138, device='cuda:0', grad_fn=<NllLossBackward>)
0.3138372004032135



Iteration:  28%|███████████████████                                                  | 115/415 [01:40<04:22,  1.14it/s][A

tensor(0.0923, device='cuda:0', grad_fn=<NllLossBackward>)
0.09232637286186218



Iteration:  28%|███████████████████▎                                                 | 116/415 [01:41<04:25,  1.13it/s][A

tensor(0.1731, device='cuda:0', grad_fn=<NllLossBackward>)
0.17309674620628357



Iteration:  28%|███████████████████▍                                                 | 117/415 [01:42<04:20,  1.14it/s][A

tensor(0.0973, device='cuda:0', grad_fn=<NllLossBackward>)
0.09732094407081604



Iteration:  28%|███████████████████▌                                                 | 118/415 [01:42<04:23,  1.13it/s][A

tensor(0.1844, device='cuda:0', grad_fn=<NllLossBackward>)
0.1844046264886856



Iteration:  29%|███████████████████▊                                                 | 119/415 [01:43<04:20,  1.14it/s][A

tensor(0.3273, device='cuda:0', grad_fn=<NllLossBackward>)
0.3272660970687866



Iteration:  29%|███████████████████▉                                                 | 120/415 [01:44<04:14,  1.16it/s][A

tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
0.2505198121070862



Iteration:  29%|████████████████████                                                 | 121/415 [01:45<04:23,  1.11it/s][A

tensor(0.2077, device='cuda:0', grad_fn=<NllLossBackward>)
0.2077467441558838



Iteration:  29%|████████████████████▎                                                | 122/415 [01:46<04:18,  1.13it/s][A

tensor(0.2869, device='cuda:0', grad_fn=<NllLossBackward>)
0.28689664602279663



Iteration:  30%|████████████████████▍                                                | 123/415 [01:47<04:23,  1.11it/s][A

tensor(0.3014, device='cuda:0', grad_fn=<NllLossBackward>)
0.30144327878952026



Iteration:  30%|████████████████████▌                                                | 124/415 [01:48<04:18,  1.12it/s][A

tensor(0.1864, device='cuda:0', grad_fn=<NllLossBackward>)
0.1863711178302765



Iteration:  30%|████████████████████▊                                                | 125/415 [01:49<04:14,  1.14it/s][A

tensor(0.1902, device='cuda:0', grad_fn=<NllLossBackward>)
0.1902208924293518



Iteration:  30%|████████████████████▉                                                | 126/415 [01:49<04:08,  1.16it/s][A

tensor(0.0909, device='cuda:0', grad_fn=<NllLossBackward>)
0.0909096971154213



Iteration:  31%|█████████████████████                                                | 127/415 [01:50<04:05,  1.17it/s][A

tensor(0.1070, device='cuda:0', grad_fn=<NllLossBackward>)
0.10699702054262161



Iteration:  31%|█████████████████████▎                                               | 128/415 [01:51<04:09,  1.15it/s][A

tensor(0.4042, device='cuda:0', grad_fn=<NllLossBackward>)
0.404202938079834



Iteration:  31%|█████████████████████▍                                               | 129/415 [01:52<04:08,  1.15it/s][A

tensor(0.1938, device='cuda:0', grad_fn=<NllLossBackward>)
0.19382306933403015



Iteration:  31%|█████████████████████▌                                               | 130/415 [01:53<04:05,  1.16it/s][A

tensor(0.2932, device='cuda:0', grad_fn=<NllLossBackward>)
0.2932122051715851



Iteration:  32%|█████████████████████▊                                               | 131/415 [01:54<04:04,  1.16it/s][A

tensor(0.2595, device='cuda:0', grad_fn=<NllLossBackward>)
0.25953540205955505



Iteration:  32%|█████████████████████▉                                               | 132/415 [01:55<04:01,  1.17it/s][A

tensor(0.1903, device='cuda:0', grad_fn=<NllLossBackward>)
0.19027960300445557



Iteration:  32%|██████████████████████                                               | 133/415 [01:55<04:01,  1.17it/s][A

tensor(0.0473, device='cuda:0', grad_fn=<NllLossBackward>)
0.0472615510225296



Iteration:  32%|██████████████████████▎                                              | 134/415 [01:56<03:57,  1.18it/s][A

tensor(0.0993, device='cuda:0', grad_fn=<NllLossBackward>)
0.09925176948308945



Iteration:  33%|██████████████████████▍                                              | 135/415 [01:57<03:57,  1.18it/s][A

tensor(0.1764, device='cuda:0', grad_fn=<NllLossBackward>)
0.17636215686798096



Iteration:  33%|██████████████████████▌                                              | 136/415 [01:58<03:57,  1.18it/s][A

tensor(0.1916, device='cuda:0', grad_fn=<NllLossBackward>)
0.19164541363716125



Iteration:  33%|██████████████████████▊                                              | 137/415 [01:59<03:56,  1.17it/s][A

tensor(0.4443, device='cuda:0', grad_fn=<NllLossBackward>)
0.44425150752067566



Iteration:  33%|██████████████████████▉                                              | 138/415 [02:00<03:56,  1.17it/s][A

tensor(0.3316, device='cuda:0', grad_fn=<NllLossBackward>)
0.3316328823566437



Iteration:  33%|███████████████████████                                              | 139/415 [02:01<04:01,  1.14it/s][A

tensor(0.1527, device='cuda:0', grad_fn=<NllLossBackward>)
0.15272143483161926



Iteration:  34%|███████████████████████▎                                             | 140/415 [02:02<04:01,  1.14it/s][A

tensor(0.1171, device='cuda:0', grad_fn=<NllLossBackward>)
0.11709269881248474



Iteration:  34%|███████████████████████▍                                             | 141/415 [02:02<04:02,  1.13it/s][A

tensor(0.1399, device='cuda:0', grad_fn=<NllLossBackward>)
0.13985544443130493



Iteration:  34%|███████████████████████▌                                             | 142/415 [02:03<04:03,  1.12it/s][A

tensor(0.1699, device='cuda:0', grad_fn=<NllLossBackward>)
0.16993185877799988



Iteration:  34%|███████████████████████▊                                             | 143/415 [02:04<03:59,  1.14it/s][A

tensor(0.1423, device='cuda:0', grad_fn=<NllLossBackward>)
0.14232981204986572



Iteration:  35%|███████████████████████▉                                             | 144/415 [02:05<04:07,  1.10it/s][A

tensor(0.2796, device='cuda:0', grad_fn=<NllLossBackward>)
0.2796492278575897



Iteration:  35%|████████████████████████                                             | 145/415 [02:06<04:10,  1.08it/s][A

tensor(0.3197, device='cuda:0', grad_fn=<NllLossBackward>)
0.3196732997894287



Iteration:  35%|████████████████████████▎                                            | 146/415 [02:07<04:13,  1.06it/s][A

tensor(0.3332, device='cuda:0', grad_fn=<NllLossBackward>)
0.3332313001155853



Iteration:  35%|████████████████████████▍                                            | 147/415 [02:08<04:10,  1.07it/s][A

tensor(0.3039, device='cuda:0', grad_fn=<NllLossBackward>)
0.30390429496765137



Iteration:  36%|████████████████████████▌                                            | 148/415 [02:09<04:11,  1.06it/s][A

tensor(0.4930, device='cuda:0', grad_fn=<NllLossBackward>)
0.4930039048194885



Iteration:  36%|████████████████████████▊                                            | 149/415 [02:10<04:10,  1.06it/s][A

tensor(0.1340, device='cuda:0', grad_fn=<NllLossBackward>)
0.13402505218982697



Iteration:  36%|████████████████████████▉                                            | 150/415 [02:11<04:04,  1.08it/s][A

tensor(0.0784, device='cuda:0', grad_fn=<NllLossBackward>)
0.07844580709934235



Iteration:  36%|█████████████████████████                                            | 151/415 [02:12<04:07,  1.07it/s][A

tensor(0.4520, device='cuda:0', grad_fn=<NllLossBackward>)
0.45199620723724365



Iteration:  37%|█████████████████████████▎                                           | 152/415 [02:13<03:58,  1.10it/s][A

tensor(0.1969, device='cuda:0', grad_fn=<NllLossBackward>)
0.19694016873836517



Iteration:  37%|█████████████████████████▍                                           | 153/415 [02:14<04:02,  1.08it/s][A

tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)
0.181515634059906



Iteration:  37%|█████████████████████████▌                                           | 154/415 [02:15<04:04,  1.07it/s][A

tensor(0.1852, device='cuda:0', grad_fn=<NllLossBackward>)
0.1852191537618637



Iteration:  37%|█████████████████████████▊                                           | 155/415 [02:15<04:05,  1.06it/s][A

tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward>)
0.18470272421836853



Iteration:  38%|█████████████████████████▉                                           | 156/415 [02:16<04:03,  1.06it/s][A

tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
0.17610716819763184



Iteration:  38%|██████████████████████████                                           | 157/415 [02:17<03:54,  1.10it/s][A

tensor(0.1441, device='cuda:0', grad_fn=<NllLossBackward>)
0.14411455392837524



Iteration:  38%|██████████████████████████▎                                          | 158/415 [02:18<03:58,  1.08it/s][A

tensor(0.2402, device='cuda:0', grad_fn=<NllLossBackward>)
0.24022525548934937



Iteration:  38%|██████████████████████████▍                                          | 159/415 [02:19<03:52,  1.10it/s][A

tensor(0.1792, device='cuda:0', grad_fn=<NllLossBackward>)
0.17923784255981445



Iteration:  39%|██████████████████████████▌                                          | 160/415 [02:20<03:47,  1.12it/s][A

tensor(0.1549, device='cuda:0', grad_fn=<NllLossBackward>)
0.15491244196891785



Iteration:  39%|██████████████████████████▊                                          | 161/415 [02:21<03:42,  1.14it/s][A

tensor(0.2439, device='cuda:0', grad_fn=<NllLossBackward>)
0.24392780661582947



Iteration:  39%|██████████████████████████▉                                          | 162/415 [02:22<03:36,  1.17it/s][A

tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
0.16518747806549072



Iteration:  39%|███████████████████████████                                          | 163/415 [02:22<03:36,  1.16it/s][A

tensor(0.1513, device='cuda:0', grad_fn=<NllLossBackward>)
0.1513153612613678



Iteration:  40%|███████████████████████████▎                                         | 164/415 [02:23<03:36,  1.16it/s][A

tensor(0.1317, device='cuda:0', grad_fn=<NllLossBackward>)
0.13169851899147034



Iteration:  40%|███████████████████████████▍                                         | 165/415 [02:24<03:55,  1.06it/s][A

tensor(0.2700, device='cuda:0', grad_fn=<NllLossBackward>)
0.2700314521789551



Iteration:  40%|███████████████████████████▌                                         | 166/415 [02:25<03:51,  1.08it/s][A

tensor(0.3089, device='cuda:0', grad_fn=<NllLossBackward>)
0.3089236319065094



Iteration:  40%|███████████████████████████▊                                         | 167/415 [02:26<03:51,  1.07it/s][A

tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
0.11101770401000977



Iteration:  40%|███████████████████████████▉                                         | 168/415 [02:27<03:42,  1.11it/s][A

tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
0.26054975390434265



Iteration:  41%|████████████████████████████                                         | 169/415 [02:28<03:35,  1.14it/s][A

tensor(0.1578, device='cuda:0', grad_fn=<NllLossBackward>)
0.15781043469905853



Iteration:  41%|████████████████████████████▎                                        | 170/415 [02:29<03:33,  1.15it/s][A

tensor(0.2721, device='cuda:0', grad_fn=<NllLossBackward>)
0.27211880683898926



Iteration:  41%|████████████████████████████▍                                        | 171/415 [02:30<03:31,  1.15it/s][A

tensor(0.3967, device='cuda:0', grad_fn=<NllLossBackward>)
0.3966801166534424



Iteration:  41%|████████████████████████████▌                                        | 172/415 [02:31<03:32,  1.15it/s][A

tensor(0.3548, device='cuda:0', grad_fn=<NllLossBackward>)
0.3548080325126648



Iteration:  42%|████████████████████████████▊                                        | 173/415 [02:31<03:30,  1.15it/s][A

tensor(0.3076, device='cuda:0', grad_fn=<NllLossBackward>)
0.3075542747974396



Iteration:  42%|████████████████████████████▉                                        | 174/415 [02:32<03:27,  1.16it/s][A

tensor(0.3618, device='cuda:0', grad_fn=<NllLossBackward>)
0.36178892850875854



Iteration:  42%|█████████████████████████████                                        | 175/415 [02:33<03:27,  1.15it/s][A

tensor(0.2980, device='cuda:0', grad_fn=<NllLossBackward>)
0.2979971170425415



Iteration:  42%|█████████████████████████████▎                                       | 176/415 [02:34<03:30,  1.14it/s][A

tensor(0.5134, device='cuda:0', grad_fn=<NllLossBackward>)
0.5133697986602783



Iteration:  43%|█████████████████████████████▍                                       | 177/415 [02:35<03:32,  1.12it/s][A

tensor(0.2526, device='cuda:0', grad_fn=<NllLossBackward>)
0.2525734007358551



Iteration:  43%|█████████████████████████████▌                                       | 178/415 [02:36<03:25,  1.15it/s][A

tensor(0.1329, device='cuda:0', grad_fn=<NllLossBackward>)
0.13294613361358643



Iteration:  43%|█████████████████████████████▊                                       | 179/415 [02:37<03:46,  1.04it/s][A

tensor(0.3247, device='cuda:0', grad_fn=<NllLossBackward>)
0.32471805810928345



Iteration:  43%|█████████████████████████████▉                                       | 180/415 [02:38<03:37,  1.08it/s][A

tensor(0.2014, device='cuda:0', grad_fn=<NllLossBackward>)
0.20143860578536987



Iteration:  44%|██████████████████████████████                                       | 181/415 [02:39<03:32,  1.10it/s][A

tensor(0.1141, device='cuda:0', grad_fn=<NllLossBackward>)
0.11412407457828522



Iteration:  44%|██████████████████████████████▎                                      | 182/415 [02:40<03:27,  1.13it/s][A

tensor(0.0978, device='cuda:0', grad_fn=<NllLossBackward>)
0.09781274199485779



Iteration:  44%|██████████████████████████████▍                                      | 183/415 [02:40<03:21,  1.15it/s][A

tensor(0.1142, device='cuda:0', grad_fn=<NllLossBackward>)
0.11416685581207275



Iteration:  44%|██████████████████████████████▌                                      | 184/415 [02:41<03:27,  1.11it/s][A

tensor(0.1786, device='cuda:0', grad_fn=<NllLossBackward>)
0.17863622307777405



Iteration:  45%|██████████████████████████████▊                                      | 185/415 [02:42<03:25,  1.12it/s][A

tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
0.13921965658664703



Iteration:  45%|██████████████████████████████▉                                      | 186/415 [02:43<03:23,  1.12it/s][A

tensor(0.0691, device='cuda:0', grad_fn=<NllLossBackward>)
0.06909161806106567



Iteration:  45%|███████████████████████████████                                      | 187/415 [02:44<03:19,  1.14it/s][A

tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
0.12212707847356796



Iteration:  45%|███████████████████████████████▎                                     | 188/415 [02:45<03:16,  1.15it/s][A

tensor(0.0801, device='cuda:0', grad_fn=<NllLossBackward>)
0.08006071299314499



Iteration:  46%|███████████████████████████████▍                                     | 189/415 [02:46<03:13,  1.17it/s][A

tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)
0.22878912091255188



Iteration:  46%|███████████████████████████████▌                                     | 190/415 [02:46<03:11,  1.17it/s][A

tensor(0.4660, device='cuda:0', grad_fn=<NllLossBackward>)
0.4659605324268341



Iteration:  46%|███████████████████████████████▊                                     | 191/415 [02:47<03:14,  1.15it/s][A

tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
0.24628984928131104



Iteration:  46%|███████████████████████████████▉                                     | 192/415 [02:48<03:14,  1.14it/s][A

tensor(0.2671, device='cuda:0', grad_fn=<NllLossBackward>)
0.26709988713264465



Iteration:  47%|████████████████████████████████                                     | 193/415 [02:49<03:12,  1.16it/s][A

tensor(0.2342, device='cuda:0', grad_fn=<NllLossBackward>)
0.23424795269966125



Iteration:  47%|████████████████████████████████▎                                    | 194/415 [02:50<03:13,  1.14it/s][A

tensor(0.1302, device='cuda:0', grad_fn=<NllLossBackward>)
0.13024282455444336



Iteration:  47%|████████████████████████████████▍                                    | 195/415 [02:51<03:16,  1.12it/s][A

tensor(0.2920, device='cuda:0', grad_fn=<NllLossBackward>)
0.29204708337783813



Iteration:  47%|████████████████████████████████▌                                    | 196/415 [02:52<03:12,  1.14it/s][A

tensor(0.1924, device='cuda:0', grad_fn=<NllLossBackward>)
0.19242238998413086



Iteration:  47%|████████████████████████████████▊                                    | 197/415 [02:53<03:07,  1.16it/s][A

tensor(0.2164, device='cuda:0', grad_fn=<NllLossBackward>)
0.21640925109386444



Iteration:  48%|████████████████████████████████▉                                    | 198/415 [02:53<03:05,  1.17it/s][A

tensor(0.0567, device='cuda:0', grad_fn=<NllLossBackward>)
0.056666918098926544



Iteration:  48%|█████████████████████████████████                                    | 199/415 [02:54<03:02,  1.19it/s][A

tensor(0.2027, device='cuda:0', grad_fn=<NllLossBackward>)
0.2027183473110199



Iteration:  48%|█████████████████████████████████▎                                   | 200/415 [02:55<03:05,  1.16it/s][A

tensor(0.1226, device='cuda:0', grad_fn=<NllLossBackward>)
0.122621551156044



Iteration:  48%|█████████████████████████████████▍                                   | 201/415 [02:56<03:06,  1.15it/s][A

tensor(0.1655, device='cuda:0', grad_fn=<NllLossBackward>)
0.1655445098876953



Iteration:  49%|█████████████████████████████████▌                                   | 202/415 [02:57<03:04,  1.16it/s][A

tensor(0.1524, device='cuda:0', grad_fn=<NllLossBackward>)
0.15242499113082886



Iteration:  49%|█████████████████████████████████▊                                   | 203/415 [02:58<03:05,  1.14it/s][A

tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward>)
0.2372877597808838



Iteration:  49%|█████████████████████████████████▉                                   | 204/415 [02:59<03:03,  1.15it/s][A

tensor(0.2382, device='cuda:0', grad_fn=<NllLossBackward>)
0.23818394541740417



Iteration:  49%|██████████████████████████████████                                   | 205/415 [02:59<03:03,  1.15it/s][A

tensor(0.2191, device='cuda:0', grad_fn=<NllLossBackward>)
0.21914471685886383



Iteration:  50%|██████████████████████████████████▎                                  | 206/415 [03:00<03:06,  1.12it/s][A

tensor(0.1985, device='cuda:0', grad_fn=<NllLossBackward>)
0.1985412836074829



Iteration:  50%|██████████████████████████████████▍                                  | 207/415 [03:01<03:04,  1.13it/s][A

tensor(0.1796, device='cuda:0', grad_fn=<NllLossBackward>)
0.1796162873506546



Iteration:  50%|██████████████████████████████████▌                                  | 208/415 [03:02<02:58,  1.16it/s][A

tensor(0.2330, device='cuda:0', grad_fn=<NllLossBackward>)
0.23304450511932373



Iteration:  50%|██████████████████████████████████▋                                  | 209/415 [03:03<02:55,  1.17it/s][A

tensor(0.2346, device='cuda:0', grad_fn=<NllLossBackward>)
0.23460449278354645



Iteration:  51%|██████████████████████████████████▉                                  | 210/415 [03:04<02:54,  1.17it/s][A

tensor(0.1248, device='cuda:0', grad_fn=<NllLossBackward>)
0.12484678626060486



Iteration:  51%|███████████████████████████████████                                  | 211/415 [03:05<02:59,  1.14it/s][A

tensor(0.2311, device='cuda:0', grad_fn=<NllLossBackward>)
0.23110097646713257



Iteration:  51%|███████████████████████████████████▏                                 | 212/415 [03:06<02:57,  1.14it/s][A

tensor(0.2959, device='cuda:0', grad_fn=<NllLossBackward>)
0.29593825340270996



Iteration:  51%|███████████████████████████████████▍                                 | 213/415 [03:06<02:56,  1.14it/s][A

tensor(0.0841, device='cuda:0', grad_fn=<NllLossBackward>)
0.0840744897723198



Iteration:  52%|███████████████████████████████████▌                                 | 214/415 [03:07<02:52,  1.16it/s][A

tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
0.1627843677997589



Iteration:  52%|███████████████████████████████████▋                                 | 215/415 [03:08<02:59,  1.11it/s][A

tensor(0.1264, device='cuda:0', grad_fn=<NllLossBackward>)
0.1264103353023529



Iteration:  52%|███████████████████████████████████▉                                 | 216/415 [03:09<02:59,  1.11it/s][A

tensor(0.4676, device='cuda:0', grad_fn=<NllLossBackward>)
0.46761149168014526



Iteration:  52%|████████████████████████████████████                                 | 217/415 [03:10<03:01,  1.09it/s][A

tensor(0.1598, device='cuda:0', grad_fn=<NllLossBackward>)
0.15982533991336823



Iteration:  53%|████████████████████████████████████▏                                | 218/415 [03:11<03:00,  1.09it/s][A

tensor(0.2523, device='cuda:0', grad_fn=<NllLossBackward>)
0.2523183822631836



Iteration:  53%|████████████████████████████████████▍                                | 219/415 [03:12<02:58,  1.10it/s][A

tensor(0.2577, device='cuda:0', grad_fn=<NllLossBackward>)
0.2576633095741272



Iteration:  53%|████████████████████████████████████▌                                | 220/415 [03:13<02:52,  1.13it/s][A

tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
0.1674702763557434



Iteration:  53%|████████████████████████████████████▋                                | 221/415 [03:14<02:48,  1.15it/s][A

tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
0.1626027226448059



Iteration:  53%|████████████████████████████████████▉                                | 222/415 [03:15<02:49,  1.14it/s][A

tensor(0.1743, device='cuda:0', grad_fn=<NllLossBackward>)
0.17433057725429535



Iteration:  54%|█████████████████████████████████████                                | 223/415 [03:15<02:50,  1.13it/s][A

tensor(0.1894, device='cuda:0', grad_fn=<NllLossBackward>)
0.18939673900604248



Iteration:  54%|█████████████████████████████████████▏                               | 224/415 [03:16<02:46,  1.14it/s][A

tensor(0.1212, device='cuda:0', grad_fn=<NllLossBackward>)
0.1212373599410057



Iteration:  54%|█████████████████████████████████████▍                               | 225/415 [03:17<02:42,  1.17it/s][A

tensor(0.0632, device='cuda:0', grad_fn=<NllLossBackward>)
0.06321173906326294



Iteration:  54%|█████████████████████████████████████▌                               | 226/415 [03:18<02:41,  1.17it/s][A

tensor(0.2396, device='cuda:0', grad_fn=<NllLossBackward>)
0.23956725001335144



Iteration:  55%|█████████████████████████████████████▋                               | 227/415 [03:19<02:47,  1.12it/s][A

tensor(0.3113, device='cuda:0', grad_fn=<NllLossBackward>)
0.31128692626953125



Iteration:  55%|█████████████████████████████████████▉                               | 228/415 [03:20<02:42,  1.15it/s][A

tensor(0.1638, device='cuda:0', grad_fn=<NllLossBackward>)
0.163782998919487



Iteration:  55%|██████████████████████████████████████                               | 229/415 [03:21<02:44,  1.13it/s][A

tensor(0.1962, device='cuda:0', grad_fn=<NllLossBackward>)
0.19621708989143372



Iteration:  55%|██████████████████████████████████████▏                              | 230/415 [03:22<02:47,  1.11it/s][A

tensor(0.2209, device='cuda:0', grad_fn=<NllLossBackward>)
0.22088772058486938



Iteration:  56%|██████████████████████████████████████▍                              | 231/415 [03:22<02:41,  1.14it/s][A

tensor(0.1194, device='cuda:0', grad_fn=<NllLossBackward>)
0.1193954199552536



Iteration:  56%|██████████████████████████████████████▌                              | 232/415 [03:23<02:46,  1.10it/s][A

tensor(0.3317, device='cuda:0', grad_fn=<NllLossBackward>)
0.331685870885849



Iteration:  56%|██████████████████████████████████████▋                              | 233/415 [03:24<02:43,  1.11it/s][A

tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)
0.21213193237781525



Iteration:  56%|██████████████████████████████████████▉                              | 234/415 [03:25<02:40,  1.12it/s][A

tensor(0.1662, device='cuda:0', grad_fn=<NllLossBackward>)
0.16618043184280396



Iteration:  57%|███████████████████████████████████████                              | 235/415 [03:26<02:41,  1.11it/s][A

tensor(0.1254, device='cuda:0', grad_fn=<NllLossBackward>)
0.12535032629966736



Iteration:  57%|███████████████████████████████████████▏                             | 236/415 [03:27<02:38,  1.13it/s][A

tensor(0.1084, device='cuda:0', grad_fn=<NllLossBackward>)
0.10837215185165405



Iteration:  57%|███████████████████████████████████████▍                             | 237/415 [03:28<02:35,  1.15it/s][A

tensor(0.1991, device='cuda:0', grad_fn=<NllLossBackward>)
0.19906550645828247



Iteration:  57%|███████████████████████████████████████▌                             | 238/415 [03:29<02:34,  1.14it/s][A

tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
0.20601636171340942



Iteration:  58%|███████████████████████████████████████▋                             | 239/415 [03:30<02:33,  1.15it/s][A

tensor(0.2136, device='cuda:0', grad_fn=<NllLossBackward>)
0.21356159448623657



Iteration:  58%|███████████████████████████████████████▉                             | 240/415 [03:30<02:34,  1.13it/s][A

tensor(0.1458, device='cuda:0', grad_fn=<NllLossBackward>)
0.14584848284721375



Iteration:  58%|████████████████████████████████████████                             | 241/415 [03:31<02:32,  1.14it/s][A

tensor(0.1261, device='cuda:0', grad_fn=<NllLossBackward>)
0.1261468380689621



Iteration:  58%|████████████████████████████████████████▏                            | 242/415 [03:32<02:29,  1.15it/s][A

tensor(0.2272, device='cuda:0', grad_fn=<NllLossBackward>)
0.22720836102962494



Iteration:  59%|████████████████████████████████████████▍                            | 243/415 [03:33<02:31,  1.14it/s][A

tensor(0.1969, device='cuda:0', grad_fn=<NllLossBackward>)
0.19685959815979004



Iteration:  59%|████████████████████████████████████████▌                            | 244/415 [03:34<02:33,  1.12it/s][A

tensor(0.4097, device='cuda:0', grad_fn=<NllLossBackward>)
0.40974292159080505



Iteration:  59%|████████████████████████████████████████▋                            | 245/415 [03:35<02:30,  1.13it/s][A

tensor(0.2982, device='cuda:0', grad_fn=<NllLossBackward>)
0.29822003841400146



Iteration:  59%|████████████████████████████████████████▉                            | 246/415 [03:36<02:30,  1.12it/s][A

tensor(0.2144, device='cuda:0', grad_fn=<NllLossBackward>)
0.21437865495681763



Iteration:  60%|█████████████████████████████████████████                            | 247/415 [03:37<02:27,  1.14it/s][A

tensor(0.2987, device='cuda:0', grad_fn=<NllLossBackward>)
0.29867023229599



Iteration:  60%|█████████████████████████████████████████▏                           | 248/415 [03:37<02:28,  1.13it/s][A

tensor(0.1995, device='cuda:0', grad_fn=<NllLossBackward>)
0.199479877948761



Iteration:  60%|█████████████████████████████████████████▍                           | 249/415 [03:38<02:27,  1.13it/s][A

tensor(0.2510, device='cuda:0', grad_fn=<NllLossBackward>)
0.25096943974494934



Iteration:  60%|█████████████████████████████████████████▌                           | 250/415 [03:39<02:24,  1.15it/s][A

tensor(0.3772, device='cuda:0', grad_fn=<NllLossBackward>)
0.3771584928035736



Iteration:  60%|█████████████████████████████████████████▋                           | 251/415 [03:40<02:22,  1.15it/s][A

tensor(0.3054, device='cuda:0', grad_fn=<NllLossBackward>)
0.30536729097366333



Iteration:  61%|█████████████████████████████████████████▉                           | 252/415 [03:41<02:34,  1.06it/s][A

tensor(0.1076, device='cuda:0', grad_fn=<NllLossBackward>)
0.10759972035884857



Iteration:  61%|██████████████████████████████████████████                           | 253/415 [03:42<02:30,  1.08it/s][A

tensor(0.3681, device='cuda:0', grad_fn=<NllLossBackward>)
0.368097186088562



Iteration:  61%|██████████████████████████████████████████▏                          | 254/415 [03:43<02:24,  1.11it/s][A

tensor(0.2331, device='cuda:0', grad_fn=<NllLossBackward>)
0.2331339418888092



Iteration:  61%|██████████████████████████████████████████▍                          | 255/415 [03:44<02:20,  1.14it/s][A

tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)
0.27897804975509644



Iteration:  62%|██████████████████████████████████████████▌                          | 256/415 [03:45<02:21,  1.13it/s][A

tensor(0.0888, device='cuda:0', grad_fn=<NllLossBackward>)
0.08877212554216385



Iteration:  62%|██████████████████████████████████████████▋                          | 257/415 [03:46<02:21,  1.12it/s][A

tensor(0.1619, device='cuda:0', grad_fn=<NllLossBackward>)
0.16186006367206573



Iteration:  62%|██████████████████████████████████████████▉                          | 258/415 [03:46<02:16,  1.15it/s][A

tensor(0.1860, device='cuda:0', grad_fn=<NllLossBackward>)
0.18596750497817993



Iteration:  62%|███████████████████████████████████████████                          | 259/415 [03:47<02:18,  1.13it/s][A

tensor(0.1010, device='cuda:0', grad_fn=<NllLossBackward>)
0.100958451628685



Iteration:  63%|███████████████████████████████████████████▏                         | 260/415 [03:48<02:17,  1.13it/s][A

tensor(0.3200, device='cuda:0', grad_fn=<NllLossBackward>)
0.3199998438358307



Iteration:  63%|███████████████████████████████████████████▍                         | 261/415 [03:49<02:13,  1.15it/s][A

tensor(0.1754, device='cuda:0', grad_fn=<NllLossBackward>)
0.17543216049671173



Iteration:  63%|███████████████████████████████████████████▌                         | 262/415 [03:50<02:12,  1.15it/s][A

tensor(0.2073, device='cuda:0', grad_fn=<NllLossBackward>)
0.20733889937400818



Iteration:  63%|███████████████████████████████████████████▋                         | 263/415 [03:51<02:11,  1.16it/s][A

tensor(0.2573, device='cuda:0', grad_fn=<NllLossBackward>)
0.2573460638523102



Iteration:  64%|███████████████████████████████████████████▉                         | 264/415 [03:52<02:09,  1.17it/s][A

tensor(0.1400, device='cuda:0', grad_fn=<NllLossBackward>)
0.13995453715324402



Iteration:  64%|████████████████████████████████████████████                         | 265/415 [03:52<02:10,  1.15it/s][A

tensor(0.2473, device='cuda:0', grad_fn=<NllLossBackward>)
0.24731376767158508



Iteration:  64%|████████████████████████████████████████████▏                        | 266/415 [03:53<02:07,  1.17it/s][A

tensor(0.2468, device='cuda:0', grad_fn=<NllLossBackward>)
0.24677802622318268



Iteration:  64%|████████████████████████████████████████████▍                        | 267/415 [03:54<02:10,  1.13it/s][A

tensor(0.2213, device='cuda:0', grad_fn=<NllLossBackward>)
0.22129571437835693



Iteration:  65%|████████████████████████████████████████████▌                        | 268/415 [03:55<02:08,  1.14it/s][A

tensor(0.1442, device='cuda:0', grad_fn=<NllLossBackward>)
0.1442006528377533



Iteration:  65%|████████████████████████████████████████████▋                        | 269/415 [03:56<02:05,  1.16it/s][A

tensor(0.1918, device='cuda:0', grad_fn=<NllLossBackward>)
0.19177423417568207



Iteration:  65%|████████████████████████████████████████████▉                        | 270/415 [03:57<02:06,  1.15it/s][A

tensor(0.1794, device='cuda:0', grad_fn=<NllLossBackward>)
0.17943687736988068



Iteration:  65%|█████████████████████████████████████████████                        | 271/415 [03:58<02:02,  1.18it/s][A

tensor(0.0653, device='cuda:0', grad_fn=<NllLossBackward>)
0.06526098400354385



Iteration:  66%|█████████████████████████████████████████████▏                       | 272/415 [03:59<02:03,  1.15it/s][A

tensor(0.2005, device='cuda:0', grad_fn=<NllLossBackward>)
0.20053519308567047



Iteration:  66%|█████████████████████████████████████████████▍                       | 273/415 [03:59<02:06,  1.12it/s][A

tensor(0.2304, device='cuda:0', grad_fn=<NllLossBackward>)
0.23044367134571075



Iteration:  66%|█████████████████████████████████████████████▌                       | 274/415 [04:00<02:02,  1.15it/s][A

tensor(0.1629, device='cuda:0', grad_fn=<NllLossBackward>)
0.1628798246383667



Iteration:  66%|█████████████████████████████████████████████▋                       | 275/415 [04:01<02:01,  1.16it/s][A

tensor(0.3547, device='cuda:0', grad_fn=<NllLossBackward>)
0.3547397553920746



Iteration:  67%|█████████████████████████████████████████████▉                       | 276/415 [04:02<02:01,  1.14it/s][A

tensor(0.3412, device='cuda:0', grad_fn=<NllLossBackward>)
0.34121185541152954



Iteration:  67%|██████████████████████████████████████████████                       | 277/415 [04:03<02:04,  1.11it/s][A

tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
0.2820679843425751



Iteration:  67%|██████████████████████████████████████████████▏                      | 278/415 [04:04<02:03,  1.11it/s][A

tensor(0.2219, device='cuda:0', grad_fn=<NllLossBackward>)
0.2219410240650177



Iteration:  67%|██████████████████████████████████████████████▍                      | 279/415 [04:05<02:02,  1.11it/s][A

tensor(0.1837, device='cuda:0', grad_fn=<NllLossBackward>)
0.18370211124420166



Iteration:  67%|██████████████████████████████████████████████▌                      | 280/415 [04:06<01:59,  1.13it/s][A

tensor(0.1080, device='cuda:0', grad_fn=<NllLossBackward>)
0.10797715187072754



Iteration:  68%|██████████████████████████████████████████████▋                      | 281/415 [04:07<01:56,  1.15it/s][A

tensor(0.1016, device='cuda:0', grad_fn=<NllLossBackward>)
0.10156846046447754



Iteration:  68%|██████████████████████████████████████████████▉                      | 282/415 [04:07<01:56,  1.15it/s][A

tensor(0.3146, device='cuda:0', grad_fn=<NllLossBackward>)
0.31457483768463135



Iteration:  68%|███████████████████████████████████████████████                      | 283/415 [04:08<01:56,  1.13it/s][A

tensor(0.1284, device='cuda:0', grad_fn=<NllLossBackward>)
0.12839338183403015



Iteration:  68%|███████████████████████████████████████████████▏                     | 284/415 [04:09<01:54,  1.14it/s][A

tensor(0.1324, device='cuda:0', grad_fn=<NllLossBackward>)
0.1324012726545334



Iteration:  69%|███████████████████████████████████████████████▍                     | 285/415 [04:10<01:51,  1.16it/s][A

tensor(0.0741, device='cuda:0', grad_fn=<NllLossBackward>)
0.0740608274936676



Iteration:  69%|███████████████████████████████████████████████▌                     | 286/415 [04:11<01:50,  1.17it/s][A

tensor(0.2308, device='cuda:0', grad_fn=<NllLossBackward>)
0.2308194935321808



Iteration:  69%|███████████████████████████████████████████████▋                     | 287/415 [04:12<01:52,  1.14it/s][A

tensor(0.0883, device='cuda:0', grad_fn=<NllLossBackward>)
0.08825737982988358



Iteration:  69%|███████████████████████████████████████████████▉                     | 288/415 [04:13<01:49,  1.16it/s][A

tensor(0.2967, device='cuda:0', grad_fn=<NllLossBackward>)
0.29666653275489807



Iteration:  70%|████████████████████████████████████████████████                     | 289/415 [04:13<01:48,  1.16it/s][A

tensor(0.2480, device='cuda:0', grad_fn=<NllLossBackward>)
0.24796812236309052



Iteration:  70%|████████████████████████████████████████████████▏                    | 290/415 [04:14<01:47,  1.16it/s][A

tensor(0.1514, device='cuda:0', grad_fn=<NllLossBackward>)
0.1513751894235611



Iteration:  70%|████████████████████████████████████████████████▍                    | 291/415 [04:15<01:49,  1.13it/s][A

tensor(0.1714, device='cuda:0', grad_fn=<NllLossBackward>)
0.17141251266002655



Iteration:  70%|████████████████████████████████████████████████▌                    | 292/415 [04:16<01:47,  1.14it/s][A

tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
0.18987435102462769



Iteration:  71%|████████████████████████████████████████████████▋                    | 293/415 [04:17<01:50,  1.11it/s][A

tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
0.3553624153137207



Iteration:  71%|████████████████████████████████████████████████▉                    | 294/415 [04:18<01:47,  1.12it/s][A

tensor(0.3600, device='cuda:0', grad_fn=<NllLossBackward>)
0.3599783778190613



Iteration:  71%|█████████████████████████████████████████████████                    | 295/415 [04:19<01:46,  1.13it/s][A

tensor(0.1962, device='cuda:0', grad_fn=<NllLossBackward>)
0.19615067541599274



Iteration:  71%|█████████████████████████████████████████████████▏                   | 296/415 [04:20<01:46,  1.12it/s][A

tensor(0.3178, device='cuda:0', grad_fn=<NllLossBackward>)
0.3178376257419586



Iteration:  72%|█████████████████████████████████████████████████▍                   | 297/415 [04:21<01:43,  1.14it/s][A

tensor(0.3789, device='cuda:0', grad_fn=<NllLossBackward>)
0.37891507148742676



Iteration:  72%|█████████████████████████████████████████████████▌                   | 298/415 [04:21<01:40,  1.16it/s][A

tensor(0.5905, device='cuda:0', grad_fn=<NllLossBackward>)
0.590549111366272



Iteration:  72%|█████████████████████████████████████████████████▋                   | 299/415 [04:22<01:40,  1.15it/s][A

tensor(0.1917, device='cuda:0', grad_fn=<NllLossBackward>)
0.19174014031887054



Iteration:  72%|█████████████████████████████████████████████████▉                   | 300/415 [04:23<01:41,  1.14it/s][A

tensor(0.2361, device='cuda:0', grad_fn=<NllLossBackward>)
0.23610961437225342



Iteration:  73%|██████████████████████████████████████████████████                   | 301/415 [04:24<01:44,  1.09it/s][A

tensor(0.2017, device='cuda:0', grad_fn=<NllLossBackward>)
0.20171263813972473



Iteration:  73%|██████████████████████████████████████████████████▏                  | 302/415 [04:25<01:41,  1.12it/s][A

tensor(0.2938, device='cuda:0', grad_fn=<NllLossBackward>)
0.2938435673713684



Iteration:  73%|██████████████████████████████████████████████████▍                  | 303/415 [04:26<01:39,  1.12it/s][A

tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
0.19137226045131683



Iteration:  73%|██████████████████████████████████████████████████▌                  | 304/415 [04:27<01:37,  1.14it/s][A

tensor(0.1047, device='cuda:0', grad_fn=<NllLossBackward>)
0.10466036945581436



Iteration:  73%|██████████████████████████████████████████████████▋                  | 305/415 [04:28<01:35,  1.16it/s][A

tensor(0.1776, device='cuda:0', grad_fn=<NllLossBackward>)
0.1775818169116974



Iteration:  74%|██████████████████████████████████████████████████▉                  | 306/415 [04:29<01:38,  1.11it/s][A

tensor(0.2643, device='cuda:0', grad_fn=<NllLossBackward>)
0.2643001973628998



Iteration:  74%|███████████████████████████████████████████████████                  | 307/415 [04:29<01:34,  1.14it/s][A

tensor(0.1868, device='cuda:0', grad_fn=<NllLossBackward>)
0.18682506680488586



Iteration:  74%|███████████████████████████████████████████████████▏                 | 308/415 [04:30<01:32,  1.15it/s][A

tensor(0.1481, device='cuda:0', grad_fn=<NllLossBackward>)
0.14805608987808228



Iteration:  74%|███████████████████████████████████████████████████▍                 | 309/415 [04:31<01:33,  1.13it/s][A

tensor(0.2727, device='cuda:0', grad_fn=<NllLossBackward>)
0.272732675075531



Iteration:  75%|███████████████████████████████████████████████████▌                 | 310/415 [04:32<01:33,  1.12it/s][A

tensor(0.1581, device='cuda:0', grad_fn=<NllLossBackward>)
0.15813204646110535



Iteration:  75%|███████████████████████████████████████████████████▋                 | 311/415 [04:33<01:30,  1.14it/s][A

tensor(0.2623, device='cuda:0', grad_fn=<NllLossBackward>)
0.2622537314891815



Iteration:  75%|███████████████████████████████████████████████████▊                 | 312/415 [04:34<01:30,  1.13it/s][A

tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
0.19683586061000824



Iteration:  75%|████████████████████████████████████████████████████                 | 313/415 [04:35<01:31,  1.12it/s][A

tensor(0.3963, device='cuda:0', grad_fn=<NllLossBackward>)
0.39629703760147095



Iteration:  76%|████████████████████████████████████████████████████▏                | 314/415 [04:36<01:27,  1.15it/s][A

tensor(0.3126, device='cuda:0', grad_fn=<NllLossBackward>)
0.31260353326797485



Iteration:  76%|████████████████████████████████████████████████████▎                | 315/415 [04:36<01:26,  1.15it/s][A

tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
0.12701746821403503



Iteration:  76%|████████████████████████████████████████████████████▌                | 316/415 [04:37<01:28,  1.12it/s][A

tensor(0.1635, device='cuda:0', grad_fn=<NllLossBackward>)
0.16349884867668152



Iteration:  76%|████████████████████████████████████████████████████▋                | 317/415 [04:38<01:25,  1.15it/s][A

tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
0.20691069960594177



Iteration:  77%|████████████████████████████████████████████████████▊                | 318/415 [04:39<01:23,  1.16it/s][A

tensor(0.3617, device='cuda:0', grad_fn=<NllLossBackward>)
0.3617492914199829



Iteration:  77%|█████████████████████████████████████████████████████                | 319/415 [04:40<01:24,  1.13it/s][A

tensor(0.3035, device='cuda:0', grad_fn=<NllLossBackward>)
0.3034572899341583



Iteration:  77%|█████████████████████████████████████████████████████▏               | 320/415 [04:41<01:26,  1.10it/s][A

tensor(0.1984, device='cuda:0', grad_fn=<NllLossBackward>)
0.1983594298362732



Iteration:  77%|█████████████████████████████████████████████████████▎               | 321/415 [04:42<01:24,  1.11it/s][A

tensor(0.2971, device='cuda:0', grad_fn=<NllLossBackward>)
0.2971038520336151



Iteration:  78%|█████████████████████████████████████████████████████▌               | 322/415 [04:43<01:24,  1.11it/s][A

tensor(0.1920, device='cuda:0', grad_fn=<NllLossBackward>)
0.191981241106987



Iteration:  78%|█████████████████████████████████████████████████████▋               | 323/415 [04:44<01:21,  1.13it/s][A

tensor(0.2465, device='cuda:0', grad_fn=<NllLossBackward>)
0.24651777744293213



Iteration:  78%|█████████████████████████████████████████████████████▊               | 324/415 [04:44<01:19,  1.15it/s][A

tensor(0.1275, device='cuda:0', grad_fn=<NllLossBackward>)
0.12749747931957245



Iteration:  78%|██████████████████████████████████████████████████████               | 325/415 [04:45<01:18,  1.15it/s][A

tensor(0.2465, device='cuda:0', grad_fn=<NllLossBackward>)
0.24651995301246643



Iteration:  79%|██████████████████████████████████████████████████████▏              | 326/415 [04:46<01:16,  1.16it/s][A

tensor(0.1096, device='cuda:0', grad_fn=<NllLossBackward>)
0.10962529480457306



Iteration:  79%|██████████████████████████████████████████████████████▎              | 327/415 [04:47<01:18,  1.12it/s][A

tensor(0.1923, device='cuda:0', grad_fn=<NllLossBackward>)
0.19234150648117065



Iteration:  79%|██████████████████████████████████████████████████████▌              | 328/415 [04:48<01:27,  1.01s/it][A

tensor(0.1602, device='cuda:0', grad_fn=<NllLossBackward>)
0.16024917364120483



Iteration:  79%|██████████████████████████████████████████████████████▋              | 329/415 [04:49<01:22,  1.04it/s][A

tensor(0.1980, device='cuda:0', grad_fn=<NllLossBackward>)
0.1979762315750122



Iteration:  80%|██████████████████████████████████████████████████████▊              | 330/415 [04:50<01:18,  1.09it/s][A

tensor(0.2020, device='cuda:0', grad_fn=<NllLossBackward>)
0.20204168558120728



Iteration:  80%|███████████████████████████████████████████████████████              | 331/415 [04:51<01:16,  1.09it/s][A

tensor(0.3717, device='cuda:0', grad_fn=<NllLossBackward>)
0.3717168867588043



Iteration:  80%|███████████████████████████████████████████████████████▏             | 332/415 [04:52<01:13,  1.13it/s][A

tensor(0.2725, device='cuda:0', grad_fn=<NllLossBackward>)
0.27254050970077515



Iteration:  80%|███████████████████████████████████████████████████████▎             | 333/415 [04:53<01:11,  1.15it/s][A

tensor(0.2661, device='cuda:0', grad_fn=<NllLossBackward>)
0.26611295342445374



Iteration:  80%|███████████████████████████████████████████████████████▌             | 334/415 [04:53<01:09,  1.16it/s][A

tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)
0.160519540309906



Iteration:  81%|███████████████████████████████████████████████████████▋             | 335/415 [04:54<01:08,  1.17it/s][A

tensor(0.2401, device='cuda:0', grad_fn=<NllLossBackward>)
0.2401314526796341



Iteration:  81%|███████████████████████████████████████████████████████▊             | 336/415 [04:55<01:14,  1.06it/s][A

tensor(0.1624, device='cuda:0', grad_fn=<NllLossBackward>)
0.16237908601760864



Iteration:  81%|████████████████████████████████████████████████████████             | 337/415 [04:56<01:10,  1.10it/s][A

tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)
0.16054143011569977



Iteration:  81%|████████████████████████████████████████████████████████▏            | 338/415 [04:57<01:08,  1.13it/s][A

tensor(0.2123, device='cuda:0', grad_fn=<NllLossBackward>)
0.21231722831726074



Iteration:  82%|████████████████████████████████████████████████████████▎            | 339/415 [04:58<01:05,  1.15it/s][A

tensor(0.2321, device='cuda:0', grad_fn=<NllLossBackward>)
0.23208852112293243



Iteration:  82%|████████████████████████████████████████████████████████▌            | 340/415 [04:59<01:03,  1.18it/s][A

tensor(0.2710, device='cuda:0', grad_fn=<NllLossBackward>)
0.2710130512714386



Iteration:  82%|████████████████████████████████████████████████████████▋            | 341/415 [05:00<01:03,  1.16it/s][A

tensor(0.0792, device='cuda:0', grad_fn=<NllLossBackward>)
0.07916257530450821



Iteration:  82%|████████████████████████████████████████████████████████▊            | 342/415 [05:00<01:04,  1.13it/s][A

tensor(0.3423, device='cuda:0', grad_fn=<NllLossBackward>)
0.34233805537223816



Iteration:  83%|█████████████████████████████████████████████████████████            | 343/415 [05:01<01:04,  1.12it/s][A

tensor(0.2624, device='cuda:0', grad_fn=<NllLossBackward>)
0.2623636722564697



Iteration:  83%|█████████████████████████████████████████████████████████▏           | 344/415 [05:02<01:03,  1.13it/s][A

tensor(0.3516, device='cuda:0', grad_fn=<NllLossBackward>)
0.35156506299972534



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 345/415 [05:03<01:02,  1.13it/s][A

tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
0.1618008315563202



Iteration:  83%|█████████████████████████████████████████████████████████▌           | 346/415 [05:04<00:59,  1.16it/s][A

tensor(0.1417, device='cuda:0', grad_fn=<NllLossBackward>)
0.1417330801486969



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 347/415 [05:05<00:59,  1.14it/s][A

tensor(0.1700, device='cuda:0', grad_fn=<NllLossBackward>)
0.16995909810066223



Iteration:  84%|█████████████████████████████████████████████████████████▊           | 348/415 [05:06<00:58,  1.15it/s][A

tensor(0.1277, device='cuda:0', grad_fn=<NllLossBackward>)
0.1276804804801941



Iteration:  84%|██████████████████████████████████████████████████████████           | 349/415 [05:07<00:58,  1.13it/s][A

tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
0.18041867017745972



Iteration:  84%|██████████████████████████████████████████████████████████▏          | 350/415 [05:08<00:56,  1.15it/s][A

tensor(0.2447, device='cuda:0', grad_fn=<NllLossBackward>)
0.2446604073047638



Iteration:  85%|██████████████████████████████████████████████████████████▎          | 351/415 [05:08<00:56,  1.13it/s][A

tensor(0.3102, device='cuda:0', grad_fn=<NllLossBackward>)
0.31017717719078064



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 352/415 [05:09<00:57,  1.09it/s][A

tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
0.1819998025894165



Iteration:  85%|██████████████████████████████████████████████████████████▋          | 353/415 [05:10<00:56,  1.11it/s][A

tensor(0.3474, device='cuda:0', grad_fn=<NllLossBackward>)
0.3474048376083374



Iteration:  85%|██████████████████████████████████████████████████████████▊          | 354/415 [05:11<00:54,  1.12it/s][A

tensor(0.2337, device='cuda:0', grad_fn=<NllLossBackward>)
0.23374390602111816



Iteration:  86%|███████████████████████████████████████████████████████████          | 355/415 [05:12<00:52,  1.14it/s][A

tensor(0.1283, device='cuda:0', grad_fn=<NllLossBackward>)
0.1283121109008789



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 356/415 [05:13<00:51,  1.14it/s][A

tensor(0.2512, device='cuda:0', grad_fn=<NllLossBackward>)
0.2512197494506836



Iteration:  86%|███████████████████████████████████████████████████████████▎         | 357/415 [05:14<00:50,  1.15it/s][A

tensor(0.2547, device='cuda:0', grad_fn=<NllLossBackward>)
0.2546819746494293



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 358/415 [05:15<00:49,  1.16it/s][A

tensor(0.2049, device='cuda:0', grad_fn=<NllLossBackward>)
0.20493322610855103



Iteration:  87%|███████████████████████████████████████████████████████████▋         | 359/415 [05:15<00:48,  1.15it/s][A

tensor(0.2569, device='cuda:0', grad_fn=<NllLossBackward>)
0.25694704055786133



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 360/415 [05:16<00:47,  1.15it/s][A

tensor(0.0887, device='cuda:0', grad_fn=<NllLossBackward>)
0.0886533260345459



Iteration:  87%|████████████████████████████████████████████████████████████         | 361/415 [05:17<00:46,  1.17it/s][A

tensor(0.3518, device='cuda:0', grad_fn=<NllLossBackward>)
0.351777583360672



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 362/415 [05:18<00:50,  1.05it/s][A

tensor(0.4171, device='cuda:0', grad_fn=<NllLossBackward>)
0.41706886887550354



Iteration:  87%|████████████████████████████████████████████████████████████▎        | 363/415 [05:19<00:47,  1.09it/s][A

tensor(0.0785, device='cuda:0', grad_fn=<NllLossBackward>)
0.07848338782787323



Iteration:  88%|████████████████████████████████████████████████████████████▌        | 364/415 [05:20<00:45,  1.11it/s][A

tensor(0.2881, device='cuda:0', grad_fn=<NllLossBackward>)
0.2880634665489197



Iteration:  88%|████████████████████████████████████████████████████████████▋        | 365/415 [05:21<00:43,  1.14it/s][A

tensor(0.3628, device='cuda:0', grad_fn=<NllLossBackward>)
0.3627932071685791



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 366/415 [05:22<00:48,  1.01it/s][A

tensor(0.2248, device='cuda:0', grad_fn=<NllLossBackward>)
0.22482222318649292



Iteration:  88%|█████████████████████████████████████████████████████████████        | 367/415 [05:23<00:45,  1.06it/s][A

tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
0.2079346626996994



Iteration:  89%|█████████████████████████████████████████████████████████████▏       | 368/415 [05:24<00:43,  1.09it/s][A

tensor(0.2962, device='cuda:0', grad_fn=<NllLossBackward>)
0.29623159766197205



Iteration:  89%|█████████████████████████████████████████████████████████████▎       | 369/415 [05:25<00:41,  1.11it/s][A

tensor(0.4056, device='cuda:0', grad_fn=<NllLossBackward>)
0.4056089222431183



Iteration:  89%|█████████████████████████████████████████████████████████████▌       | 370/415 [05:26<00:40,  1.12it/s][A

tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
0.18513010442256927



Iteration:  89%|█████████████████████████████████████████████████████████████▋       | 371/415 [05:26<00:39,  1.11it/s][A

tensor(0.2733, device='cuda:0', grad_fn=<NllLossBackward>)
0.2732633352279663



Iteration:  90%|█████████████████████████████████████████████████████████████▊       | 372/415 [05:27<00:38,  1.11it/s][A

tensor(0.2401, device='cuda:0', grad_fn=<NllLossBackward>)
0.2400650978088379



Iteration:  90%|██████████████████████████████████████████████████████████████       | 373/415 [05:28<00:37,  1.13it/s][A

tensor(0.2019, device='cuda:0', grad_fn=<NllLossBackward>)
0.20187801122665405



Iteration:  90%|██████████████████████████████████████████████████████████████▏      | 374/415 [05:29<00:35,  1.14it/s][A

tensor(0.0936, device='cuda:0', grad_fn=<NllLossBackward>)
0.09357937425374985



Iteration:  90%|██████████████████████████████████████████████████████████████▎      | 375/415 [05:30<00:34,  1.15it/s][A

tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
0.21198870241641998



Iteration:  91%|██████████████████████████████████████████████████████████████▌      | 376/415 [05:31<00:35,  1.11it/s][A

tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)
0.16270869970321655



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 377/415 [05:32<00:36,  1.05it/s][A

tensor(0.1213, device='cuda:0', grad_fn=<NllLossBackward>)
0.12128091603517532



Iteration:  91%|██████████████████████████████████████████████████████████████▊      | 378/415 [05:33<00:33,  1.10it/s][A

tensor(0.1331, device='cuda:0', grad_fn=<NllLossBackward>)
0.13308203220367432



Iteration:  91%|███████████████████████████████████████████████████████████████      | 379/415 [05:34<00:32,  1.11it/s][A

tensor(0.2497, device='cuda:0', grad_fn=<NllLossBackward>)
0.24974222481250763



Iteration:  92%|███████████████████████████████████████████████████████████████▏     | 380/415 [05:34<00:30,  1.13it/s][A

tensor(0.1836, device='cuda:0', grad_fn=<NllLossBackward>)
0.1835961490869522



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 381/415 [05:35<00:29,  1.16it/s][A

tensor(0.1223, device='cuda:0', grad_fn=<NllLossBackward>)
0.12230357527732849



Iteration:  92%|███████████████████████████████████████████████████████████████▌     | 382/415 [05:36<00:28,  1.17it/s][A

tensor(0.2562, device='cuda:0', grad_fn=<NllLossBackward>)
0.2562367916107178



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 383/415 [05:37<00:27,  1.18it/s][A

tensor(0.2711, device='cuda:0', grad_fn=<NllLossBackward>)
0.27106618881225586



Iteration:  93%|███████████████████████████████████████████████████████████████▊     | 384/415 [05:38<00:26,  1.18it/s][A

tensor(0.2022, device='cuda:0', grad_fn=<NllLossBackward>)
0.2021646946668625



Iteration:  93%|████████████████████████████████████████████████████████████████     | 385/415 [05:39<00:25,  1.18it/s][A

tensor(0.0848, device='cuda:0', grad_fn=<NllLossBackward>)
0.0848376452922821



Iteration:  93%|████████████████████████████████████████████████████████████████▏    | 386/415 [05:40<00:25,  1.13it/s][A

tensor(0.3647, device='cuda:0', grad_fn=<NllLossBackward>)
0.36467933654785156



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 387/415 [05:40<00:24,  1.15it/s][A

tensor(0.1717, device='cuda:0', grad_fn=<NllLossBackward>)
0.17170487344264984



Iteration:  93%|████████████████████████████████████████████████████████████████▌    | 388/415 [05:41<00:23,  1.14it/s][A

tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
0.1546763777732849



Iteration:  94%|████████████████████████████████████████████████████████████████▋    | 389/415 [05:42<00:23,  1.13it/s][A

tensor(0.1962, device='cuda:0', grad_fn=<NllLossBackward>)
0.19615288078784943



Iteration:  94%|████████████████████████████████████████████████████████████████▊    | 390/415 [05:43<00:21,  1.14it/s][A

tensor(0.2264, device='cuda:0', grad_fn=<NllLossBackward>)
0.22642453014850616



Iteration:  94%|█████████████████████████████████████████████████████████████████    | 391/415 [05:44<00:20,  1.16it/s][A

tensor(0.1436, device='cuda:0', grad_fn=<NllLossBackward>)
0.1436375081539154



Iteration:  94%|█████████████████████████████████████████████████████████████████▏   | 392/415 [05:45<00:19,  1.17it/s][A

tensor(0.3806, device='cuda:0', grad_fn=<NllLossBackward>)
0.3805752694606781



Iteration:  95%|█████████████████████████████████████████████████████████████████▎   | 393/415 [05:46<00:18,  1.16it/s][A

tensor(0.2167, device='cuda:0', grad_fn=<NllLossBackward>)
0.2166663110256195



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 394/415 [05:47<00:18,  1.12it/s][A

tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
0.13643783330917358



Iteration:  95%|█████████████████████████████████████████████████████████████████▋   | 395/415 [05:47<00:17,  1.15it/s][A

tensor(0.2347, device='cuda:0', grad_fn=<NllLossBackward>)
0.2347002625465393



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 396/415 [05:48<00:17,  1.11it/s][A

tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
0.19144225120544434



Iteration:  96%|██████████████████████████████████████████████████████████████████   | 397/415 [05:49<00:15,  1.14it/s][A

tensor(0.2501, device='cuda:0', grad_fn=<NllLossBackward>)
0.25014305114746094



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 398/415 [05:50<00:14,  1.14it/s][A

tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward>)
0.219719797372818



Iteration:  96%|██████████████████████████████████████████████████████████████████▎  | 399/415 [05:51<00:14,  1.14it/s][A

tensor(0.1479, device='cuda:0', grad_fn=<NllLossBackward>)
0.14787043631076813



Iteration:  96%|██████████████████████████████████████████████████████████████████▌  | 400/415 [05:52<00:13,  1.13it/s][A

tensor(0.2088, device='cuda:0', grad_fn=<NllLossBackward>)
0.20881740748882294



Iteration:  97%|██████████████████████████████████████████████████████████████████▋  | 401/415 [05:53<00:12,  1.14it/s][A

tensor(0.2913, device='cuda:0', grad_fn=<NllLossBackward>)
0.2912907898426056



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 402/415 [05:54<00:11,  1.16it/s][A

tensor(0.3479, device='cuda:0', grad_fn=<NllLossBackward>)
0.3479193150997162



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 403/415 [05:55<00:10,  1.10it/s][A

tensor(0.1288, device='cuda:0', grad_fn=<NllLossBackward>)
0.12881827354431152



Iteration:  97%|███████████████████████████████████████████████████████████████████▏ | 404/415 [05:56<00:10,  1.08it/s][A

tensor(0.0984, device='cuda:0', grad_fn=<NllLossBackward>)
0.09836570173501968



Iteration:  98%|███████████████████████████████████████████████████████████████████▎ | 405/415 [05:56<00:08,  1.12it/s][A

tensor(0.2049, device='cuda:0', grad_fn=<NllLossBackward>)
0.20486387610435486



Iteration:  98%|███████████████████████████████████████████████████████████████████▌ | 406/415 [05:57<00:07,  1.13it/s][A

tensor(0.2941, device='cuda:0', grad_fn=<NllLossBackward>)
0.2941439747810364



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 407/415 [05:58<00:06,  1.15it/s][A

tensor(0.2020, device='cuda:0', grad_fn=<NllLossBackward>)
0.20199891924858093



Iteration:  98%|███████████████████████████████████████████████████████████████████▊ | 408/415 [05:59<00:06,  1.16it/s][A

tensor(0.4298, device='cuda:0', grad_fn=<NllLossBackward>)
0.4298105835914612



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 409/415 [06:00<00:05,  1.16it/s][A

tensor(0.1403, device='cuda:0', grad_fn=<NllLossBackward>)
0.14030857384204865



Iteration:  99%|████████████████████████████████████████████████████████████████████▏| 410/415 [06:01<00:04,  1.14it/s][A

tensor(0.1495, device='cuda:0', grad_fn=<NllLossBackward>)
0.14950695633888245



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 411/415 [06:02<00:03,  1.14it/s][A

tensor(0.0638, device='cuda:0', grad_fn=<NllLossBackward>)
0.06377135962247849



Iteration:  99%|████████████████████████████████████████████████████████████████████▌| 412/415 [06:02<00:02,  1.14it/s][A

tensor(0.1920, device='cuda:0', grad_fn=<NllLossBackward>)
0.19201606512069702



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 413/415 [06:03<00:01,  1.13it/s][A

tensor(0.5178, device='cuda:0', grad_fn=<NllLossBackward>)
0.5178437829017639



Iteration: 100%|████████████████████████████████████████████████████████████████████▊| 414/415 [06:04<00:00,  1.15it/s][A

tensor(0.1020, device='cuda:0', grad_fn=<NllLossBackward>)
0.10197137296199799



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 415/415 [06:05<00:00,  1.14it/s][A
Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 3/3 [17:15<00:00, 345.26s/it]


In [5]:
import csv
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
                   output_dir=None, max_seq_length=32, do_train=False, do_eval=False, do_lower_case=False,
                   train_batch_size=32, eval_batch_size=8, learning_rate=5e-5, num_train_epochs=3,
                   warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
                   optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):


    # ## Required parameters
    # parser.add_argument("--data_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # parser.add_argument("--task_name",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The name of the task to train.")
    # parser.add_argument("--output_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    # parser.add_argument("--max_seq_length",
    #                     default=128,
    #                     type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. \n"
    #                          "Sequences longer than this will be truncated, and sequences shorter \n"
    #                          "than this will be padded.")
    # parser.add_argument("--do_train",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run training.")
    # parser.add_argument("--do_eval",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_lower_case",
    #                     default=False,
    #                     action='store_true',
    #                     help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--train_batch_size",
    #                     default=32,
    #                     type=int,
    #                     help="Total batch size for training.")
    # parser.add_argument("--eval_batch_size",
    #                     default=8,
    #                     type=int,
    #                     help="Total batch size for eval.")
    # parser.add_argument("--learning_rate",
    #                     default=5e-5,
    #                     type=float,
    #                     help="The initial learning rate for Adam.")
    # parser.add_argument("--num_train_epochs",
    #                     default=3.0,
    #                     type=float,
    #                     help="Total number of training epochs to perform.")
    # parser.add_argument("--warmup_proportion",
    #                     default=0.1,
    #                     type=float,
    #                     help="Proportion of training to perform linear learning rate warmup for. "
    #                          "E.g., 0.1 = 10%% of training.")
    # parser.add_argument("--no_cuda",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether not to use CUDA when available")
    # parser.add_argument("--local_rank",
    #                     type=int,
    #                     default=-1,
    #                     help="local_rank for distributed training on gpus")
    # parser.add_argument('--seed',
    #                     type=int,
    #                     default=42,
    #                     help="random seed for initialization")
    # parser.add_argument('--gradient_accumulation_steps',
    #                     type=int,
    #                     default=1,
    #                     help="Number of updates steps to accumulate before performing a backward/update pass.")
    # parser.add_argument('--optimize_on_cpu',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to perform optimization and keep the optimizer averages on CPU")
    # parser.add_argument('--fp16',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to use 16-bit float precision instead of 32-bit")
    # parser.add_argument('--loss_scale',
    #                     type=float, default=128,
    #                     help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    # args = parser.parse_args()

    processors = {
#         "cola": ColaProcessor,
#         "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info("16-bits training currently not supported in distributed training")
            fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if do_train:
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory ({}) already exists and is not emp1ty.".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(bert_model,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2)
    if fp16:
        model.half()
    model.to(device)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                          output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
    t_total = num_train_steps
#     print(t_total)
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if do_train:
        optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()
        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

        torch.save(model.state_dict(), output_dir + "output.pth")


    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(data_dir)
#         eval_examples = processor.get_dev_examples(data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

        model.load_state_dict(torch.load(saved_model))

        model.eval()
        # eval_loss, eval_accuracy = 0, 0

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        eval_loss, eval_macro_p, eval_macro_r = 0, 0, 0

        raw_score = []

        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)
#             print(logits)
#             print(logits[0])
            logits = logits.detach().cpu().numpy()
            print(logits)
            label_ids = label_ids.to('cpu').numpy()

            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c
            
            pred_label = np.argmax(logits, axis=1)
            raw_score += zip(logits, pred_label, label_ids)
            
            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)

            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1


        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r)

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r)

        eval_loss = eval_loss / nb_eval_steps
        result = {'eval_loss': eval_loss,
                  'eval_micro_p': eval_micro_p,
                  'eval_micro_r': eval_micro_r,
                  'eval_micro_f1': eval_micro_f1,
                  'eval_macro_p': eval_macro_p,
                  'eval_macro_r': eval_macro_r,
                  'eval_macro_f1': eval_macro_f1,
#                   'global_step': global_step,
#                   'loss': tr_loss/nb_tr_steps
                  }

        output_eval_file = os.path.join(output_dir, "ibmcs_test_test_results.txt")
        output_raw_score = os.path.join(output_dir, "ibmcs_test_raw_score.csv")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(output_raw_score, 'w') as fout:
            fields = ["undermine_score", "support_score","predict_label", "gold"]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, pred, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "predict_label": str(pred),
                    "gold": str(gold)
                })

In [None]:
def experiments():
    data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
#     data_dir = "/home/syg340/Dataset/"

    # data_dir_output = data_dir + "output2/"
    data_dir_output = "D:/Projects/Stance/Models/"
    train_and_test(data_dir=data_dir, do_train=True, do_eval=True, output_dir=data_dir_output,task_name="Mrpc")


In [6]:
def evaluation_with_pretrained():
#     bert_model = "D:/Projects/Stance/Models/dataExpantion/dataExpantion.pth"
    bert_model = "D:/Projects/Stance/Models/output.pth"
#     data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
    data_dir = "D:/Jupyter/data/dataset/ibmcs/"
#     data_dir = "D:/Projects/Stance/Dataset/BertForOppositeClassification/"
#     data_dir = "D:/Projects/Stance/Dataset/OnlyNew/"
    
    # data_dir_output = data_dir + "output2/"
    data_dir_output = "D:/Projects/Stance/Evaluation/bert_dummy_output/bert_experiment_local/"
    train_and_test(data_dir=data_dir, do_train=False, do_eval=True, output_dir=data_dir_output,task_name="Mrpc",saved_model=bert_model)

In [8]:
if __name__ == "__main__":
#     experiments()
    evaluation_with_pretrained()

06/23/2020 19:17:41 - INFO - run_classifier -   device cuda n_gpu 1 distributed training False
06/23/2020 19:17:47 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\arsen\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
06/23/2020 19:17:50 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\arsen\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
06/23/2020 19:17:50 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\arsen\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e

[[ 0.22526492 -0.1359578 ]
 [-1.1638577   0.9447243 ]
 [-3.8214025   3.4761896 ]
 [-2.8672209   2.581253  ]
 [-3.5721085   3.3372922 ]
 [-1.0940547   1.0814598 ]
 [ 1.6747785  -1.7683944 ]
 [ 3.0573168  -3.1752877 ]]
[[ 1.9513241  -2.0810244 ]
 [ 3.2504196  -3.4706068 ]
 [ 3.1359282  -3.3171177 ]
 [ 2.9123049  -3.1310844 ]
 [-0.20514917  0.04915991]
 [-1.5929928   1.0126593 ]
 [-1.3923291   0.96343154]
 [-2.8917625   2.3453178 ]]
[[-2.7325084  2.3123937]
 [-1.6009648  0.9626405]
 [-2.3347592  1.7234968]
 [-2.1092277  1.6469979]
 [ 3.0571556 -3.2978592]
 [-2.899314   2.3698723]
 [-2.0240765  1.5631143]
 [-2.3300414  1.8470433]]
[[-2.7489173   2.2737896 ]
 [-1.3752568   1.0973673 ]
 [ 1.756354   -1.970819  ]
 [ 1.0893006  -1.5607188 ]
 [-2.8941655   2.4198298 ]
 [-0.08625522 -0.33838677]
 [-3.052361    2.5377746 ]
 [ 0.2295401  -0.44196385]]
[[-0.15706661 -0.05336304]
 [-0.25269383  0.24361613]
 [-1.6503954   1.2902805 ]
 [-1.444307    0.9549874 ]
 [ 0.10183146 -0.3054131 ]
 [-2.2485776 

[[-3.0038798e+00  2.5815125e+00]
 [-2.0265764e-01  1.6309285e-01]
 [ 2.0209904e+00 -2.0276432e+00]
 [ 6.7123741e-01 -6.2305397e-01]
 [-1.2212667e+00  9.6863860e-01]
 [ 2.8820693e+00 -2.8116155e+00]
 [ 2.9048662e+00 -2.9532094e+00]
 [-2.3760127e-04 -1.2839246e-01]]
[[-3.4418588   3.1010642 ]
 [ 0.6803525  -0.80875975]
 [ 1.138728   -1.1660708 ]
 [ 3.1549273  -3.4085386 ]
 [ 1.0106814  -1.0356042 ]
 [ 1.9812071  -2.08339   ]
 [-3.3889656   3.0782342 ]
 [ 2.143598   -2.2038493 ]]
[[ 2.109135   -2.3194427 ]
 [-3.3732486   3.1358616 ]
 [-0.5629128   0.36455184]
 [ 3.0978975  -3.1562462 ]
 [-3.7234054   3.4308927 ]
 [-2.9760115   2.6573582 ]
 [ 1.0058241  -1.102215  ]
 [-2.2257285   1.7769444 ]]
[[ 2.4668472 -2.7182996]
 [-2.0896938  1.8403684]
 [-1.2423711  1.0450085]
 [-2.5408318  2.1448665]
 [ 1.9144485 -1.9703102]
 [ 2.150492  -2.2977839]
 [-3.7621512  3.4745069]
 [ 1.3488781 -1.4786894]]
[[-2.9467473  2.5825734]
 [ 2.8694682 -3.0742064]
 [-0.845234   0.6130297]
 [-3.1966276  2.805417 ]


[[-2.6021452  2.1363497]
 [ 2.192955  -2.3057375]
 [ 3.0981522 -3.2699122]
 [-3.657322   3.3979292]
 [-3.1474662  2.6682305]
 [-1.7768719  1.3795236]
 [ 1.9468364 -2.138032 ]
 [-2.4472294  2.0725799]]
[[ 1.337096   -1.4561658 ]
 [-3.3347793   3.0699115 ]
 [-2.6448116   2.115899  ]
 [-3.147044    2.631565  ]
 [ 1.3283415  -1.4885819 ]
 [-0.19154401 -0.19537987]
 [ 1.7593483  -1.8309189 ]
 [-3.0888138   2.4566116 ]]
[[-2.7730916   2.3416634 ]
 [ 0.2048395  -0.49110562]
 [ 2.5162392  -2.6896875 ]
 [-0.7683035   0.5814263 ]
 [-2.7659156   2.3405385 ]
 [-2.38061     1.8348904 ]
 [-3.1622934   2.7454567 ]
 [-0.8236678   0.47818628]]
[[ 2.651247  -2.818678 ]
 [-1.9193285  1.5625787]
 [-2.3487315  1.9321551]
 [-1.3086078  0.971966 ]
 [-1.2350805  0.8484915]
 [-2.9248512  2.524723 ]
 [ 2.6446788 -3.0021472]
 [-2.6827517  2.0906477]]
[[-1.5147614   1.1444787 ]
 [ 0.7268008  -1.1055642 ]
 [-2.330668    1.8596869 ]
 [-2.9830546   2.4788291 ]
 [-2.4153824   2.0264618 ]
 [ 2.7509072  -2.8418922 ]
 [

[[-3.6727977   3.3751264 ]
 [-3.008013    2.5483913 ]
 [-2.7144098   2.193995  ]
 [-3.734805    3.3654404 ]
 [ 2.976077   -3.2465582 ]
 [ 0.03351098 -0.0841948 ]
 [-2.3491771   1.7670085 ]
 [ 1.437438   -1.6915553 ]]
[[-2.1136634   1.6305323 ]
 [ 0.02821856 -0.07946622]
 [ 1.1052607  -1.1417979 ]
 [-3.4965498   3.1772761 ]
 [ 1.1478446  -1.3244506 ]
 [-2.6183186   2.2792273 ]
 [-1.8362495   1.3341684 ]
 [-1.4695725   1.2870258 ]]
[[-1.978976   1.5652394]
 [ 1.5651784 -1.8758566]
 [-2.5215225  1.9762611]
 [-2.4950397  2.0675611]
 [-2.7273984  2.3972476]
 [-1.2708967  0.8646224]
 [-3.4823847  3.062057 ]
 [ 2.7439299 -2.939228 ]]
[[ 1.4784431  -1.6721692 ]
 [ 0.65115434 -0.82199246]
 [ 2.36449    -2.5287335 ]
 [-2.896564    2.3967965 ]
 [-1.4890599   1.1722393 ]
 [-3.5441084   3.1732054 ]
 [ 1.0557786  -1.1657113 ]
 [ 3.0309153  -3.396084  ]]
[[-3.2506227   2.8661473 ]
 [ 2.3468533  -2.7367363 ]
 [-0.25846967  0.02233155]
 [ 2.8042119  -3.0236835 ]
 [-3.678971    3.349814  ]
 [ 2.538509  

[[-3.6521459   3.1953769 ]
 [ 0.8859053  -0.9687415 ]
 [-0.37053862  0.3346861 ]
 [-0.60577506  0.4264709 ]
 [ 2.537708   -2.5986156 ]
 [ 1.4639186  -1.514252  ]
 [-1.7308805   1.5126576 ]
 [-1.2759929   0.99871933]]
[[-2.9131784  2.409051 ]
 [ 1.9404132 -2.1496475]
 [ 2.0316713 -2.1342635]
 [-2.8648639  2.4263353]
 [ 2.4325638 -2.694754 ]
 [-3.5204124  3.1303878]
 [ 1.0464636 -1.0464163]
 [-3.5542927  3.0850005]]
[[ 1.5097653 -1.776742 ]
 [-2.5999603  2.236787 ]
 [ 1.7034118 -1.9319416]
 [-1.6449492  1.357893 ]
 [ 2.9117632 -3.1633854]
 [ 1.8917416 -2.1086164]
 [ 2.4261227 -2.704894 ]
 [-1.6641982  1.2922581]]
[[ 2.805138  -3.050229 ]
 [-3.1523519  2.726867 ]
 [ 2.3476653 -2.5556202]
 [ 2.0244863 -2.2681103]
 [ 2.9131331 -3.2072058]
 [ 0.8948124 -0.937067 ]
 [ 3.1607265 -3.4048073]
 [ 2.8730564 -3.1838465]]
[[ 0.9687564  -1.0886737 ]
 [ 2.563425   -2.687982  ]
 [-1.9086101   1.7628039 ]
 [ 1.9808308  -2.0383048 ]
 [ 0.4709853  -0.61391383]
 [-0.30691692  0.14379261]
 [ 0.33409977 -0.4

[[-3.0144958   2.6014872 ]
 [-1.1534065   0.6404877 ]
 [ 0.59858257 -0.83939254]
 [ 2.3691907  -2.284113  ]
 [-1.144493    0.6597936 ]
 [-2.5277476   2.014059  ]
 [-2.897297    2.4355981 ]
 [-2.8941174   2.354506  ]]
[[-2.8912463  2.401785 ]
 [-2.3491588  1.8228292]
 [ 1.8918369 -1.9569497]
 [ 1.9809545 -2.053298 ]
 [ 2.7589495 -2.5925608]
 [-2.9736185  2.4789488]
 [-2.4344673  1.9696159]
 [ 1.9410937 -1.9932349]]
[[-1.6885663  1.2823677]
 [ 2.483538  -2.420455 ]
 [ 1.080601  -1.1869977]
 [ 2.4713852 -2.4703174]
 [ 1.6545883 -1.7138495]
 [-2.8666337  2.3744183]
 [-2.1788511  1.5670842]
 [-1.1445582  0.7855641]]
[[ 0.10964014 -0.20662984]
 [ 1.7338436  -1.782514  ]
 [-2.5699039   2.1134517 ]
 [-1.046286    0.76240855]
 [ 1.2235644  -1.4123101 ]
 [ 1.1291974  -1.3365971 ]
 [ 1.4669001  -1.68252   ]
 [-2.6241922   2.1198115 ]]
[[ 1.9106227  -2.0462348 ]
 [ 1.4288806  -1.5944278 ]
 [ 0.06825354 -0.26360962]
 [-0.79878384  0.5055548 ]
 [ 0.49105775 -0.72613424]
 [ 0.2948447  -0.38193467]
 [

[[-2.794053    2.254346  ]
 [ 2.228303   -2.2937975 ]
 [ 2.6370747  -2.5395076 ]
 [-1.5893207   1.2103354 ]
 [ 2.5043201  -2.5367813 ]
 [-0.38307393 -0.08202669]
 [-2.2488284   1.6373938 ]
 [ 2.1701517  -2.2240243 ]]
[[ 2.6362529 -2.6023736]
 [ 2.7155797 -2.609128 ]
 [ 2.8937836 -2.6463504]
 [-2.3331318  1.8414435]
 [ 2.8047938 -2.5696125]
 [ 2.7855237 -2.6283884]
 [ 1.2700343 -1.45032  ]
 [ 1.8605478 -1.9116422]]
[[ 2.7630358  -2.616259  ]
 [ 0.8820211  -1.1678579 ]
 [-2.6668925   2.1424463 ]
 [ 2.315366   -2.2702758 ]
 [ 0.65360904 -1.0247266 ]
 [-2.901956    2.3897843 ]
 [-2.4127939   1.9415917 ]
 [ 1.5119832  -1.7080619 ]]
[[-2.9630938   2.4558783 ]
 [-1.9668639   1.565142  ]
 [-2.6227098   2.1624765 ]
 [-1.1420932   0.64570636]
 [-2.4578223   1.8956611 ]
 [-2.316202    1.7873386 ]
 [ 1.0294336  -1.2293042 ]
 [-1.0441421   0.7715073 ]]
[[ 2.8931987  -2.7423773 ]
 [ 0.71139616 -0.93358284]
 [-2.5961227   2.0652316 ]
 [-0.86839974  0.4731833 ]
 [-1.9609817   1.4073756 ]
 [-0.28354526

[[-2.115293   1.7066073]
 [ 2.895578  -3.09723  ]
 [-1.5194936  1.1034063]
 [ 2.508599  -2.6050148]
 [-1.5523536  1.3552691]
 [ 1.1328039 -1.178375 ]
 [-1.7295115  1.5070066]
 [-2.2797978  1.8597124]]
[[ 2.7481163 -2.9320707]
 [ 1.0463929 -1.1084664]
 [ 3.107009  -3.2314882]
 [-2.8840034  2.5227363]
 [ 0.9212108 -1.0009491]
 [-2.5239363  2.186771 ]
 [-2.2839913  1.9474859]
 [ 3.0294926 -3.2270658]]
[[ 2.4636915  -2.7291803 ]
 [ 1.0609486  -1.1487142 ]
 [ 1.7249284  -1.8478599 ]
 [ 0.94607717 -0.9888816 ]
 [ 1.622206   -1.7625326 ]
 [-1.7109392   1.4088782 ]
 [ 1.6306235  -1.6671729 ]
 [ 1.8505027  -1.9349663 ]]
[[ 0.3195206  -0.6311251 ]
 [ 2.586194   -2.679429  ]
 [ 1.6273661  -1.8360634 ]
 [-0.5885386   0.52403694]
 [-1.6833096   1.593674  ]
 [ 1.7325991  -1.8568959 ]
 [ 0.76444924 -0.88536537]
 [-0.01055528 -0.18859668]]
[[-0.08280133  0.01133446]
 [-3.1858072   2.9204574 ]
 [-3.1401076   2.8457572 ]
 [-0.40811762  0.32997748]
 [ 2.6799014  -2.8597736 ]
 [-2.6520271   2.296145  ]
 [

06/23/2020 19:18:25 - INFO - run_classifier -   ***** Eval results *****
06/23/2020 19:18:25 - INFO - run_classifier -     eval_loss = 1.049898855909705
06/23/2020 19:18:25 - INFO - run_classifier -     eval_macro_f1 = 0.7015565327013262
06/23/2020 19:18:25 - INFO - run_classifier -     eval_macro_p = 0.6751785714285713
06/23/2020 19:18:25 - INFO - run_classifier -     eval_macro_r = 0.7300793650793651
06/23/2020 19:18:25 - INFO - run_classifier -     eval_micro_f1 = 0.7103578154425612
06/23/2020 19:18:25 - INFO - run_classifier -     eval_micro_p = 0.7090225563909774
06/23/2020 19:18:25 - INFO - run_classifier -     eval_micro_r = 0.7116981132075472


[[-1.9387293  1.4911381]
 [ 1.3352097 -1.4424179]]
