In [9]:
import torch
import random
import numpy as np
import os
from tqdm import tqdm, trange
# torch.cuda.empty_cache()
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.optimization import BertAdam

In [10]:
from run_classifier import ColaProcessor, MrpcProcessor, logger, convert_examples_to_features,\
    set_optimizer_params_grad, copy_optimizer_params_to_model, accuracy, p_r_f1, tp_pcount_gcount

In [3]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    
    print('There are %d GPU(s) available.' % n_gpu)

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1050 Ti


In [8]:
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from pytorch_pretrained_bert.modeling import BertForSequenceClassification

03/07/2020 21:18:47 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.
03/07/2020 21:18:48 - INFO - transformers.file_utils -   TensorFlow version 2.1.0 available.


In [5]:
# import logging
# logging.basicConfig(level=logging.INFO)

In [6]:
# def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
#                    output_dir=None, max_seq_length=128, do_train=False, do_eval=False, do_lower_case=False,
#                    train_batch_size=32, eval_batch_size=8, learning_rate=5e-5, num_train_epochs=3,
#                    warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
#                    optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

03/07/2020 20:59:42 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\arsen\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [8]:
# Prepare model 
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 2)
model.to(device)

# model = BertModel.from_pretrained('bert-base-uncased')

03/07/2020 20:59:42 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\arsen\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/07/2020 20:59:42 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\arsen\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir C:\Users\arsen\AppData\Local\Temp\tmp5zjz3zv7
03/07/2020 20:59:46 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_lay

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [9]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
 
print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [10]:
data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
data_dir_output = "D:/Projects/Stance/Models/"
output_dir=data_dir_output
max_seq_length=32
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
# warmup_proportion = 0.1
train_batch_size=32
eval_batch_size=8
learning_rate=5e-5
num_train_epochs=3
local_rank=-1
seed=42
gradient_accumulation_steps=1
loss_scale=128
train_batch_size = int(train_batch_size / gradient_accumulation_steps)

processors = {
        "mrpc": MrpcProcessor,
    }

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
    
os.makedirs(output_dir, exist_ok=True)
processor = processors['mrpc']()
label_list = processor.get_labels()

train_examples = processor.get_train_examples(data_dir)
num_train_steps = int(
    len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

##preprare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
t_total = num_train_steps
optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)
# optimizer = AdamW(optimizer_grouped_parameters,
#                   lr = learning_rate, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
#                   correct_bias=False
#                 )

# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)  # PyTorch scheduler

03/07/2020 21:00:05 - INFO - run_classifier -   LOOKING AT D:/Jupyter/data/dataset/perspective_stances/train.tsv


In [11]:
global_step = 0
train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

03/07/2020 21:00:05 - INFO - run_classifier -   *** Example ***
03/07/2020 21:00:05 - INFO - run_classifier -   guid: train-1
03/07/2020 21:00:05 - INFO - run_classifier -   tokens: [CLS] male infant ci ##rc ##um ##cision is tan ##tam ##ount to child abuse [SEP] parents know what best for th ##ier child [SEP]
03/07/2020 21:00:05 - INFO - run_classifier -   input_ids: 101 3287 10527 25022 11890 2819 28472 2003 9092 15464 21723 2000 2775 6905 102 3008 2113 2054 2190 2005 16215 3771 2775 102 0 0 0 0 0 0 0 0
03/07/2020 21:00:05 - INFO - run_classifier -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
03/07/2020 21:00:05 - INFO - run_classifier -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
03/07/2020 21:00:05 - INFO - run_classifier -   label: 0 (id = 0)
03/07/2020 21:00:05 - INFO - run_classifier -   *** Example ***
03/07/2020 21:00:05 - INFO - run_classifier -   guid: train-2
03/07/2020 21:00:05 - INFO - run_classifier -   t

In [12]:
model.train()
for _ in trange(int(num_train_epochs), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss = model(input_ids, segment_ids, input_mask, label_ids)
        print(loss)
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
#         if fp16 and loss_scale != 1.0:
#             # rescale loss for fp16 training
#             # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
#             loss = loss * loss_scale
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        loss.backward()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % gradient_accumulation_steps == 0:
#             if fp16 or optimize_on_cpu:
#                 if fp16 and loss_scale != 1.0:
#                     # scale down gradients for fp16 training
#                     for param in model.parameters():
#                         if param.grad is not None:
#                             param.grad.data = param.grad.data / loss_scale           
#                 is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
#                 if is_nan:
#                     logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
#                     loss_scale = loss_scale / 2
#                     model.zero_grad()
#                     continue 
#                 optimizer.step()
# #                 scheduler.step()
#                 copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
#             else:
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
#                 scheduler.step()
            model.zero_grad()
            global_step += 1

torch.save(model.state_dict(), output_dir + "output.pth")

Epoch:   0%|                                                                                     | 0/3 [00:00<?, ?it/s]
Iteration:   0%|                                                                               | 0/219 [00:00<?, ?it/s][A

tensor(0.7222, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|▎                                                                      | 1/219 [00:03<11:02,  3.04s/it][A

tensor(0.7170, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▋                                                                      | 2/219 [00:03<08:25,  2.33s/it][A

tensor(0.7230, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▉                                                                      | 3/219 [00:04<06:36,  1.83s/it][A

tensor(0.7422, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▎                                                                     | 4/219 [00:05<05:19,  1.49s/it][A

tensor(0.6856, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▌                                                                     | 5/219 [00:05<04:26,  1.24s/it][A

tensor(0.6746, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|█▉                                                                     | 6/219 [00:06<03:48,  1.07s/it][A

tensor(0.6812, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|██▎                                                                    | 7/219 [00:07<03:21,  1.05it/s][A

tensor(0.6928, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▌                                                                    | 8/219 [00:07<03:03,  1.15it/s][A

tensor(0.6562, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▉                                                                    | 9/219 [00:08<02:50,  1.23it/s][A

tensor(0.6932, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▏                                                                  | 10/219 [00:09<02:40,  1.30it/s][A

tensor(0.7072, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▌                                                                  | 11/219 [00:09<02:37,  1.32it/s][A

tensor(0.7063, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▊                                                                  | 12/219 [00:10<02:31,  1.37it/s][A

tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▏                                                                 | 13/219 [00:11<02:27,  1.40it/s][A

tensor(0.7404, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▍                                                                 | 14/219 [00:11<02:23,  1.42it/s][A

tensor(0.6962, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|████▊                                                                 | 15/219 [00:12<02:23,  1.42it/s][A

tensor(0.7003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|█████                                                                 | 16/219 [00:13<02:20,  1.44it/s][A

tensor(0.7028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▍                                                                | 17/219 [00:13<02:19,  1.45it/s][A

tensor(0.7175, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▊                                                                | 18/219 [00:14<02:17,  1.46it/s][A

tensor(0.7028, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████                                                                | 19/219 [00:15<02:16,  1.47it/s][A

tensor(0.6914, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████▍                                                               | 20/219 [00:15<02:15,  1.47it/s][A

tensor(0.7074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|██████▋                                                               | 21/219 [00:16<02:14,  1.47it/s][A

tensor(0.7058, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|███████                                                               | 22/219 [00:17<02:16,  1.44it/s][A

tensor(0.6617, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▎                                                              | 23/219 [00:18<02:14,  1.45it/s][A

tensor(0.7052, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▋                                                              | 24/219 [00:18<02:13,  1.46it/s][A

tensor(0.6729, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▉                                                              | 25/219 [00:19<02:12,  1.46it/s][A

tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▎                                                             | 26/219 [00:20<02:11,  1.47it/s][A

tensor(0.6402, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▋                                                             | 27/219 [00:20<02:10,  1.47it/s][A

tensor(0.8291, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|████████▉                                                             | 28/219 [00:21<02:09,  1.47it/s][A

tensor(0.7183, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█████████▎                                                            | 29/219 [00:22<02:08,  1.47it/s][A

tensor(0.6822, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▌                                                            | 30/219 [00:22<02:08,  1.48it/s][A

tensor(0.6652, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▉                                                            | 31/219 [00:23<02:07,  1.48it/s][A

tensor(0.6945, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▏                                                           | 32/219 [00:24<02:10,  1.43it/s][A

tensor(0.6928, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▌                                                           | 33/219 [00:24<02:08,  1.44it/s][A

tensor(0.6465, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|██████████▊                                                           | 34/219 [00:25<02:07,  1.45it/s][A

tensor(0.6618, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▏                                                          | 35/219 [00:26<02:06,  1.46it/s][A

tensor(0.6987, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▌                                                          | 36/219 [00:26<02:04,  1.46it/s][A

tensor(0.6695, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|███████████▊                                                          | 37/219 [00:27<02:03,  1.47it/s][A

tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|████████████▏                                                         | 38/219 [00:28<02:02,  1.47it/s][A

tensor(0.7301, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▍                                                         | 39/219 [00:28<02:02,  1.47it/s][A

tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▊                                                         | 40/219 [00:29<02:01,  1.47it/s][A

tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████                                                         | 41/219 [00:30<02:01,  1.47it/s][A

tensor(0.6852, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████▍                                                        | 42/219 [00:30<02:00,  1.47it/s][A

tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█████████████▋                                                        | 43/219 [00:31<01:59,  1.47it/s][A

tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██████████████                                                        | 44/219 [00:32<02:01,  1.44it/s][A

tensor(0.6279, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▍                                                       | 45/219 [00:33<02:01,  1.43it/s][A

tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▋                                                       | 46/219 [00:33<01:59,  1.44it/s][A

tensor(0.6525, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|███████████████                                                       | 47/219 [00:34<01:58,  1.45it/s][A

tensor(0.6093, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▎                                                      | 48/219 [00:35<01:57,  1.46it/s][A

tensor(0.6133, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▋                                                      | 49/219 [00:35<01:56,  1.47it/s][A

tensor(0.5809, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|███████████████▉                                                      | 50/219 [00:36<01:55,  1.47it/s][A

tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|████████████████▎                                                     | 51/219 [00:37<01:54,  1.47it/s][A

tensor(0.6615, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▌                                                     | 52/219 [00:37<01:54,  1.46it/s][A

tensor(0.6229, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▉                                                     | 53/219 [00:38<01:53,  1.46it/s][A

tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▎                                                    | 54/219 [00:39<01:52,  1.47it/s][A

tensor(0.7498, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▌                                                    | 55/219 [00:39<01:51,  1.47it/s][A

tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|█████████████████▉                                                    | 56/219 [00:40<01:50,  1.47it/s][A

tensor(0.6288, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▏                                                   | 57/219 [00:41<01:49,  1.47it/s][A

tensor(0.6775, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▌                                                   | 58/219 [00:41<01:51,  1.44it/s][A

tensor(0.5991, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██████████████████▊                                                   | 59/219 [00:42<01:50,  1.45it/s][A

tensor(0.6630, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|███████████████████▏                                                  | 60/219 [00:43<01:49,  1.45it/s][A

tensor(0.6118, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▍                                                  | 61/219 [00:43<01:48,  1.46it/s][A

tensor(0.5829, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▊                                                  | 62/219 [00:44<01:49,  1.43it/s][A

tensor(0.5332, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▏                                                 | 63/219 [00:45<01:48,  1.44it/s][A

tensor(0.7342, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▍                                                 | 64/219 [00:46<01:47,  1.45it/s][A

tensor(0.6244, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|████████████████████▊                                                 | 65/219 [00:46<01:46,  1.44it/s][A

tensor(0.6019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|█████████████████████                                                 | 66/219 [00:47<01:45,  1.45it/s][A

tensor(0.5900, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▍                                                | 67/219 [00:48<01:44,  1.45it/s][A

tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▋                                                | 68/219 [00:48<01:44,  1.44it/s][A

tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████                                                | 69/219 [00:49<01:43,  1.45it/s][A

tensor(0.6352, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▎                                               | 70/219 [00:50<01:42,  1.46it/s][A

tensor(0.4011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▋                                               | 71/219 [00:50<01:42,  1.45it/s][A

tensor(0.5338, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████                                               | 72/219 [00:51<01:41,  1.45it/s][A

tensor(0.4925, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████▎                                              | 73/219 [00:52<01:40,  1.45it/s][A

tensor(0.5450, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▋                                              | 74/219 [00:52<01:39,  1.46it/s][A

tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▉                                              | 75/219 [00:53<01:39,  1.45it/s][A

tensor(0.7472, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▎                                             | 76/219 [00:54<01:38,  1.45it/s][A

tensor(0.5801, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▌                                             | 77/219 [00:55<01:39,  1.43it/s][A

tensor(0.6091, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|████████████████████████▉                                             | 78/219 [00:55<01:38,  1.43it/s][A

tensor(0.6513, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|█████████████████████████▎                                            | 79/219 [00:56<01:37,  1.44it/s][A

tensor(0.6496, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▌                                            | 80/219 [00:57<01:36,  1.45it/s][A

tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▉                                            | 81/219 [00:57<01:35,  1.44it/s][A

tensor(0.7050, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|██████████████████████████▏                                           | 82/219 [00:58<01:34,  1.44it/s][A

tensor(0.6265, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▌                                           | 83/219 [00:59<01:33,  1.45it/s][A

tensor(0.5699, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▊                                           | 84/219 [00:59<01:33,  1.45it/s][A

tensor(0.6260, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▏                                          | 85/219 [01:00<01:32,  1.45it/s][A

tensor(0.6347, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▍                                          | 86/219 [01:01<01:31,  1.45it/s][A

tensor(0.6289, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███████████████████████████▊                                          | 87/219 [01:01<01:30,  1.46it/s][A

tensor(0.6249, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████████████████████████████▏                                         | 88/219 [01:02<01:29,  1.46it/s][A

tensor(0.6324, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▍                                         | 89/219 [01:03<01:29,  1.46it/s][A

tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▊                                         | 90/219 [01:04<01:29,  1.45it/s][A

tensor(0.5564, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████                                         | 91/219 [01:04<01:28,  1.45it/s][A

tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▍                                        | 92/219 [01:05<01:27,  1.45it/s][A

tensor(0.6480, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▋                                        | 93/219 [01:06<01:26,  1.45it/s][A

tensor(0.7094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████                                        | 94/219 [01:06<01:27,  1.43it/s][A

tensor(0.6682, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████▎                                       | 95/219 [01:07<01:26,  1.44it/s][A

tensor(0.5434, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|██████████████████████████████▋                                       | 96/219 [01:08<01:26,  1.42it/s][A

tensor(0.5937, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|███████████████████████████████                                       | 97/219 [01:08<01:25,  1.42it/s][A

tensor(0.5601, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▎                                      | 98/219 [01:09<01:24,  1.43it/s][A

tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▋                                      | 99/219 [01:10<01:23,  1.44it/s][A

tensor(0.4826, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▌                                     | 100/219 [01:11<01:22,  1.44it/s][A

tensor(0.7061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▊                                     | 101/219 [01:11<01:21,  1.44it/s][A

tensor(0.6997, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▏                                    | 102/219 [01:12<01:20,  1.45it/s][A

tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▍                                    | 103/219 [01:13<01:19,  1.45it/s][A

tensor(0.5859, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▊                                    | 104/219 [01:13<01:18,  1.46it/s][A

tensor(0.6192, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████                                    | 105/219 [01:14<01:18,  1.46it/s][A

tensor(0.6340, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████▍                                   | 106/219 [01:15<01:17,  1.46it/s][A

tensor(0.5674, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|█████████████████████████████████▋                                   | 107/219 [01:15<01:16,  1.46it/s][A

tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|██████████████████████████████████                                   | 108/219 [01:16<01:15,  1.47it/s][A

tensor(0.5616, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▎                                  | 109/219 [01:17<01:15,  1.45it/s][A

tensor(0.5728, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▋                                  | 110/219 [01:17<01:15,  1.45it/s][A

tensor(0.5388, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|██████████████████████████████████▉                                  | 111/219 [01:18<01:14,  1.45it/s][A

tensor(0.5119, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|███████████████████████████████████▎                                 | 112/219 [01:19<01:13,  1.45it/s][A

tensor(0.6534, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▌                                 | 113/219 [01:19<01:12,  1.46it/s][A

tensor(0.5773, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▉                                 | 114/219 [01:20<01:12,  1.45it/s][A

tensor(0.6200, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▏                                | 115/219 [01:21<01:12,  1.43it/s][A

tensor(0.5545, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▌                                | 116/219 [01:22<01:11,  1.44it/s][A

tensor(0.4732, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▊                                | 117/219 [01:22<01:10,  1.45it/s][A

tensor(0.5005, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▏                               | 118/219 [01:23<01:09,  1.45it/s][A

tensor(0.6595, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▍                               | 119/219 [01:24<01:09,  1.44it/s][A

tensor(0.4348, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████████████████████████████████████▊                               | 120/219 [01:24<01:08,  1.45it/s][A

tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|██████████████████████████████████████                               | 121/219 [01:25<01:07,  1.45it/s][A

tensor(0.6332, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▍                              | 122/219 [01:26<01:06,  1.45it/s][A

tensor(0.5578, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▊                              | 123/219 [01:26<01:05,  1.46it/s][A

tensor(0.4437, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████                              | 124/219 [01:27<01:05,  1.46it/s][A

tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████▍                             | 125/219 [01:28<01:04,  1.46it/s][A

tensor(0.7851, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|███████████████████████████████████████▋                             | 126/219 [01:28<01:03,  1.46it/s][A

tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████                             | 127/219 [01:29<01:03,  1.46it/s][A

tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████▎                            | 128/219 [01:30<01:02,  1.46it/s][A

tensor(0.6540, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▋                            | 129/219 [01:30<01:01,  1.46it/s][A

tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▉                            | 130/219 [01:31<01:00,  1.46it/s][A

tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▎                           | 131/219 [01:32<01:00,  1.46it/s][A

tensor(0.4727, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▌                           | 132/219 [01:32<00:59,  1.46it/s][A

tensor(0.5333, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|█████████████████████████████████████████▉                           | 133/219 [01:33<00:58,  1.46it/s][A

tensor(0.5993, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████████████████████████████████████████▏                          | 134/219 [01:34<00:58,  1.46it/s][A

tensor(0.4794, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▌                          | 135/219 [01:35<00:58,  1.45it/s][A

tensor(0.7356, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▊                          | 136/219 [01:35<00:57,  1.45it/s][A

tensor(0.6423, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▏                         | 137/219 [01:36<00:56,  1.45it/s][A

tensor(0.4511, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▍                         | 138/219 [01:37<00:56,  1.43it/s][A

tensor(0.5639, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▊                         | 139/219 [01:37<00:55,  1.44it/s][A

tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████                         | 140/219 [01:38<00:54,  1.44it/s][A

tensor(0.4744, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████▍                        | 141/219 [01:39<00:53,  1.45it/s][A

tensor(0.4270, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|████████████████████████████████████████████▋                        | 142/219 [01:39<00:54,  1.42it/s][A

tensor(0.6043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|█████████████████████████████████████████████                        | 143/219 [01:40<00:53,  1.43it/s][A

tensor(0.5454, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▎                       | 144/219 [01:41<00:52,  1.44it/s][A

tensor(0.3210, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▋                       | 145/219 [01:42<00:51,  1.44it/s][A

tensor(0.5223, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████                       | 146/219 [01:42<00:50,  1.44it/s][A

tensor(0.5724, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████▎                      | 147/219 [01:43<00:49,  1.44it/s][A

tensor(0.5198, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▋                      | 148/219 [01:44<00:50,  1.42it/s][A

tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▉                      | 149/219 [01:44<00:49,  1.40it/s][A

tensor(0.4552, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|███████████████████████████████████████████████▎                     | 150/219 [01:45<00:48,  1.41it/s][A

tensor(0.4338, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▌                     | 151/219 [01:46<00:47,  1.43it/s][A

tensor(0.5070, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▉                     | 152/219 [01:46<00:46,  1.43it/s][A

tensor(0.4236, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▏                    | 153/219 [01:47<00:45,  1.44it/s][A

tensor(0.4105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▌                    | 154/219 [01:48<00:44,  1.44it/s][A

tensor(0.5248, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|████████████████████████████████████████████████▊                    | 155/219 [01:49<00:44,  1.45it/s][A

tensor(0.6461, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|█████████████████████████████████████████████████▏                   | 156/219 [01:49<00:43,  1.45it/s][A

tensor(0.7435, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▍                   | 157/219 [01:50<00:42,  1.45it/s][A

tensor(0.5054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▊                   | 158/219 [01:51<00:42,  1.42it/s][A

tensor(0.4430, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████                   | 159/219 [01:51<00:42,  1.40it/s][A

tensor(0.4923, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████▍                  | 160/219 [01:52<00:41,  1.41it/s][A

tensor(0.6173, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|██████████████████████████████████████████████████▋                  | 161/219 [01:53<00:40,  1.43it/s][A

tensor(0.4773, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████                  | 162/219 [01:53<00:39,  1.43it/s][A

tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████▎                 | 163/219 [01:54<00:38,  1.44it/s][A

tensor(0.4869, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▋                 | 164/219 [01:55<00:38,  1.44it/s][A

tensor(0.5494, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▉                 | 165/219 [01:56<00:37,  1.44it/s][A

tensor(0.5853, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▎                | 166/219 [01:56<00:36,  1.45it/s][A

tensor(0.7073, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▌                | 167/219 [01:57<00:36,  1.44it/s][A

tensor(0.6103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|████████████████████████████████████████████████████▉                | 168/219 [01:58<00:35,  1.42it/s][A

tensor(0.4162, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|█████████████████████████████████████████████████████▏               | 169/219 [01:58<00:34,  1.43it/s][A

tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▌               | 170/219 [01:59<00:34,  1.44it/s][A

tensor(0.5792, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▉               | 171/219 [02:00<00:33,  1.44it/s][A

tensor(0.4399, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▏              | 172/219 [02:00<00:32,  1.44it/s][A

tensor(0.4962, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▌              | 173/219 [02:01<00:31,  1.45it/s][A

tensor(0.6800, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▊              | 174/219 [02:02<00:31,  1.42it/s][A

tensor(0.5381, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▏             | 175/219 [02:02<00:30,  1.43it/s][A

tensor(0.4968, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▍             | 176/219 [02:03<00:29,  1.43it/s][A

tensor(0.7484, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|███████████████████████████████████████████████████████▊             | 177/219 [02:04<00:29,  1.44it/s][A

tensor(0.3688, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████████████████████████████████████████████████████             | 178/219 [02:05<00:28,  1.42it/s][A

tensor(0.4802, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▍            | 179/219 [02:05<00:28,  1.42it/s][A

tensor(0.6105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▋            | 180/219 [02:06<00:27,  1.43it/s][A

tensor(0.6396, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████            | 181/219 [02:07<00:26,  1.44it/s][A

tensor(0.4121, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 182/219 [02:07<00:25,  1.43it/s][A

tensor(0.5347, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 183/219 [02:08<00:25,  1.41it/s][A

tensor(0.4651, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▉           | 184/219 [02:09<00:24,  1.42it/s][A

tensor(0.8574, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|██████████████████████████████████████████████████████████▎          | 185/219 [02:10<00:23,  1.42it/s][A

tensor(0.5677, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 186/219 [02:10<00:23,  1.43it/s][A

tensor(0.4214, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▉          | 187/219 [02:11<00:22,  1.42it/s][A

tensor(0.5829, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 188/219 [02:12<00:21,  1.43it/s][A

tensor(0.5864, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 189/219 [02:12<00:20,  1.43it/s][A

tensor(0.4980, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 190/219 [02:13<00:20,  1.43it/s][A

tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 191/219 [02:14<00:19,  1.42it/s][A

tensor(0.3528, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▍        | 192/219 [02:14<00:18,  1.42it/s][A

tensor(0.4967, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 193/219 [02:15<00:18,  1.41it/s][A

tensor(0.5631, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████        | 194/219 [02:16<00:17,  1.41it/s][A

tensor(0.5199, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▍       | 195/219 [02:17<00:17,  1.37it/s][A

tensor(0.4699, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▊       | 196/219 [02:17<00:16,  1.36it/s][A

tensor(0.4101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████       | 197/219 [02:18<00:16,  1.37it/s][A

tensor(0.3816, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████▍      | 198/219 [02:19<00:15,  1.34it/s][A

tensor(0.4695, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 199/219 [02:20<00:14,  1.36it/s][A

tensor(0.4349, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|███████████████████████████████████████████████████████████████      | 200/219 [02:20<00:14,  1.35it/s][A

tensor(0.5201, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 201/219 [02:21<00:13,  1.37it/s][A

tensor(0.6344, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 202/219 [02:22<00:12,  1.38it/s][A

tensor(0.4905, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|███████████████████████████████████████████████████████████████▉     | 203/219 [02:23<00:11,  1.36it/s][A

tensor(0.4325, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 204/219 [02:23<00:10,  1.38it/s][A

tensor(0.5512, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▌    | 205/219 [02:24<00:10,  1.39it/s][A

tensor(0.7054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▉    | 206/219 [02:25<00:09,  1.40it/s][A

tensor(0.6794, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▏   | 207/219 [02:25<00:08,  1.41it/s][A

tensor(0.4400, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 208/219 [02:26<00:07,  1.39it/s][A

tensor(0.5235, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 209/219 [02:27<00:07,  1.40it/s][A

tensor(0.5511, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 210/219 [02:27<00:06,  1.39it/s][A

tensor(0.4685, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▍  | 211/219 [02:28<00:05,  1.38it/s][A

tensor(0.5275, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 212/219 [02:29<00:04,  1.40it/s][A

tensor(0.5531, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 213/219 [02:30<00:04,  1.41it/s][A

tensor(0.5094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▍ | 214/219 [02:30<00:03,  1.42it/s][A

tensor(0.6070, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 215/219 [02:31<00:02,  1.43it/s][A

tensor(0.6529, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 216/219 [02:32<00:02,  1.43it/s][A

tensor(0.5366, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 217/219 [02:32<00:01,  1.43it/s][A

tensor(0.4339, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 218/219 [02:33<00:00,  1.42it/s][A

tensor(0.4700, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 219/219 [02:34<00:00,  1.42it/s][A
Epoch:  33%|█████████████████████████▎                                                  | 1/3 [02:34<05:08, 154.29s/it]
Iteration:   0%|                                                                               | 0/219 [00:00<?, ?it/s][A

tensor(0.4064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|▎                                                                      | 1/219 [00:00<02:30,  1.45it/s][A

tensor(0.4137, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▋                                                                      | 2/219 [00:01<02:29,  1.45it/s][A

tensor(0.4103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▉                                                                      | 3/219 [00:02<02:29,  1.45it/s][A

tensor(0.2903, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▎                                                                     | 4/219 [00:02<02:30,  1.43it/s][A

tensor(0.3656, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▌                                                                     | 5/219 [00:03<02:29,  1.43it/s][A

tensor(0.3123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|█▉                                                                     | 6/219 [00:04<02:27,  1.44it/s][A

tensor(0.3979, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|██▎                                                                    | 7/219 [00:04<02:30,  1.41it/s][A

tensor(0.3009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▌                                                                    | 8/219 [00:05<02:28,  1.42it/s][A

tensor(0.2771, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▉                                                                    | 9/219 [00:06<02:27,  1.43it/s][A

tensor(0.2576, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▏                                                                  | 10/219 [00:06<02:25,  1.43it/s][A

tensor(0.4913, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▌                                                                  | 11/219 [00:07<02:25,  1.43it/s][A

tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▊                                                                  | 12/219 [00:08<02:24,  1.43it/s][A

tensor(0.3477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▏                                                                 | 13/219 [00:09<02:24,  1.42it/s][A

tensor(0.4487, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▍                                                                 | 14/219 [00:09<02:25,  1.40it/s][A

tensor(0.3727, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|████▊                                                                 | 15/219 [00:10<02:25,  1.41it/s][A

tensor(0.5085, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|█████                                                                 | 16/219 [00:11<02:23,  1.41it/s][A

tensor(0.3492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▍                                                                | 17/219 [00:11<02:21,  1.42it/s][A

tensor(0.2468, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▊                                                                | 18/219 [00:12<02:20,  1.43it/s][A

tensor(0.2441, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████                                                                | 19/219 [00:13<02:19,  1.44it/s][A

tensor(0.4076, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████▍                                                               | 20/219 [00:14<02:18,  1.44it/s][A

tensor(0.2469, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|██████▋                                                               | 21/219 [00:14<02:17,  1.44it/s][A

tensor(0.3346, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|███████                                                               | 22/219 [00:15<02:18,  1.42it/s][A

tensor(0.4237, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▎                                                              | 23/219 [00:16<02:20,  1.40it/s][A

tensor(0.3323, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▋                                                              | 24/219 [00:16<02:18,  1.41it/s][A

tensor(0.3175, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▉                                                              | 25/219 [00:17<02:16,  1.42it/s][A

tensor(0.3400, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▎                                                             | 26/219 [00:18<02:15,  1.42it/s][A

tensor(0.1321, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▋                                                             | 27/219 [00:18<02:14,  1.43it/s][A

tensor(0.1752, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|████████▉                                                             | 28/219 [00:19<02:13,  1.43it/s][A

tensor(0.4008, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█████████▎                                                            | 29/219 [00:20<02:11,  1.44it/s][A

tensor(0.4492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▌                                                            | 30/219 [00:21<02:10,  1.44it/s][A

tensor(0.3095, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▉                                                            | 31/219 [00:21<02:12,  1.42it/s][A

tensor(0.2151, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▏                                                           | 32/219 [00:22<02:11,  1.43it/s][A

tensor(0.2831, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▌                                                           | 33/219 [00:23<02:13,  1.40it/s][A

tensor(0.2416, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|██████████▊                                                           | 34/219 [00:23<02:12,  1.40it/s][A

tensor(0.3145, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▏                                                          | 35/219 [00:24<02:10,  1.40it/s][A

tensor(0.2620, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▌                                                          | 36/219 [00:25<02:09,  1.42it/s][A

tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|███████████▊                                                          | 37/219 [00:25<02:07,  1.42it/s][A

tensor(0.4231, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|████████████▏                                                         | 38/219 [00:26<02:08,  1.41it/s][A

tensor(0.3103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▍                                                         | 39/219 [00:27<02:06,  1.42it/s][A

tensor(0.3248, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▊                                                         | 40/219 [00:28<02:05,  1.43it/s][A

tensor(0.2184, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████                                                         | 41/219 [00:28<02:05,  1.42it/s][A

tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████▍                                                        | 42/219 [00:29<02:07,  1.39it/s][A

tensor(0.2074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█████████████▋                                                        | 43/219 [00:30<02:05,  1.41it/s][A

tensor(0.5187, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██████████████                                                        | 44/219 [00:30<02:03,  1.41it/s][A

tensor(0.2522, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▍                                                       | 45/219 [00:31<02:02,  1.42it/s][A

tensor(0.5294, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▋                                                       | 46/219 [00:32<02:01,  1.42it/s][A

tensor(0.2723, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|███████████████                                                       | 47/219 [00:33<02:03,  1.40it/s][A

tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▎                                                      | 48/219 [00:33<02:01,  1.41it/s][A

tensor(0.1694, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▋                                                      | 49/219 [00:34<02:00,  1.41it/s][A

tensor(0.2973, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|███████████████▉                                                      | 50/219 [00:35<01:59,  1.42it/s][A

tensor(0.3708, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|████████████████▎                                                     | 51/219 [00:35<01:57,  1.43it/s][A

tensor(0.4855, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▌                                                     | 52/219 [00:36<01:56,  1.43it/s][A

tensor(0.2811, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▉                                                     | 53/219 [00:37<01:55,  1.43it/s][A

tensor(0.3580, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▎                                                    | 54/219 [00:37<01:54,  1.44it/s][A

tensor(0.3086, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▌                                                    | 55/219 [00:38<01:55,  1.42it/s][A

tensor(0.4524, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|█████████████████▉                                                    | 56/219 [00:39<01:54,  1.42it/s][A

tensor(0.3901, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▏                                                   | 57/219 [00:40<01:53,  1.43it/s][A

tensor(0.4949, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▌                                                   | 58/219 [00:40<01:52,  1.43it/s][A

tensor(0.3523, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██████████████████▊                                                   | 59/219 [00:41<01:51,  1.43it/s][A

tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|███████████████████▏                                                  | 60/219 [00:42<01:54,  1.39it/s][A

tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▍                                                  | 61/219 [00:42<01:54,  1.38it/s][A

tensor(0.5061, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▊                                                  | 62/219 [00:43<01:56,  1.34it/s][A

tensor(0.3110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▏                                                 | 63/219 [00:44<01:54,  1.37it/s][A

tensor(0.4046, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▍                                                 | 64/219 [00:45<01:52,  1.38it/s][A

tensor(0.2119, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|████████████████████▊                                                 | 65/219 [00:45<01:51,  1.38it/s][A

tensor(0.2917, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|█████████████████████                                                 | 66/219 [00:46<01:49,  1.40it/s][A

tensor(0.3718, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▍                                                | 67/219 [00:47<01:48,  1.41it/s][A

tensor(0.4981, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▋                                                | 68/219 [00:48<01:47,  1.41it/s][A

tensor(0.2043, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████                                                | 69/219 [00:48<01:46,  1.40it/s][A

tensor(0.3844, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▎                                               | 70/219 [00:49<01:45,  1.41it/s][A

tensor(0.5250, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▋                                               | 71/219 [00:50<01:44,  1.42it/s][A

tensor(0.3585, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████                                               | 72/219 [00:50<01:45,  1.39it/s][A

tensor(0.3807, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████▎                                              | 73/219 [00:51<01:43,  1.41it/s][A

tensor(0.4403, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▋                                              | 74/219 [00:52<01:42,  1.42it/s][A

tensor(0.4772, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▉                                              | 75/219 [00:53<01:42,  1.40it/s][A

tensor(0.2691, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▎                                             | 76/219 [00:53<01:41,  1.41it/s][A

tensor(0.2647, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▌                                             | 77/219 [00:54<01:41,  1.39it/s][A

tensor(0.3760, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|████████████████████████▉                                             | 78/219 [00:55<01:42,  1.38it/s][A

tensor(0.1816, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|█████████████████████████▎                                            | 79/219 [00:55<01:40,  1.40it/s][A

tensor(0.2673, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▌                                            | 80/219 [00:56<01:39,  1.40it/s][A

tensor(0.3288, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▉                                            | 81/219 [00:57<01:40,  1.38it/s][A

tensor(0.3677, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|██████████████████████████▏                                           | 82/219 [00:58<01:42,  1.33it/s][A

tensor(0.2936, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▌                                           | 83/219 [00:58<01:41,  1.34it/s][A

tensor(0.2334, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▊                                           | 84/219 [00:59<01:39,  1.36it/s][A

tensor(0.2455, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▏                                          | 85/219 [01:00<01:38,  1.36it/s][A

tensor(0.4244, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▍                                          | 86/219 [01:01<01:36,  1.38it/s][A

tensor(0.2829, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███████████████████████████▊                                          | 87/219 [01:01<01:35,  1.39it/s][A

tensor(0.4290, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████████████████████████████▏                                         | 88/219 [01:02<01:33,  1.40it/s][A

tensor(0.1795, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▍                                         | 89/219 [01:03<01:32,  1.40it/s][A

tensor(0.3274, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▊                                         | 90/219 [01:03<01:32,  1.39it/s][A

tensor(0.3753, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████                                         | 91/219 [01:04<01:32,  1.39it/s][A

tensor(0.2326, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▍                                        | 92/219 [01:05<01:31,  1.40it/s][A

tensor(0.3919, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▋                                        | 93/219 [01:06<01:29,  1.41it/s][A

tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████                                        | 94/219 [01:06<01:28,  1.42it/s][A

tensor(0.4358, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████▎                                       | 95/219 [01:07<01:28,  1.40it/s][A

tensor(0.1841, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|██████████████████████████████▋                                       | 96/219 [01:08<01:28,  1.38it/s][A

tensor(0.2337, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|███████████████████████████████                                       | 97/219 [01:08<01:29,  1.37it/s][A

tensor(0.2900, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▎                                      | 98/219 [01:09<01:27,  1.39it/s][A

tensor(0.4302, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▋                                      | 99/219 [01:10<01:25,  1.40it/s][A

tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▌                                     | 100/219 [01:11<01:27,  1.37it/s][A

tensor(0.3845, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▊                                     | 101/219 [01:11<01:27,  1.36it/s][A

tensor(0.3891, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▏                                    | 102/219 [01:12<01:26,  1.36it/s][A

tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▍                                    | 103/219 [01:13<01:24,  1.38it/s][A

tensor(0.2279, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▊                                    | 104/219 [01:14<01:25,  1.35it/s][A

tensor(0.1373, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████                                    | 105/219 [01:14<01:25,  1.33it/s][A

tensor(0.3914, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████▍                                   | 106/219 [01:15<01:23,  1.36it/s][A

tensor(0.2815, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|█████████████████████████████████▋                                   | 107/219 [01:16<01:21,  1.37it/s][A

tensor(0.2928, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|██████████████████████████████████                                   | 108/219 [01:16<01:20,  1.39it/s][A

tensor(0.2967, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▎                                  | 109/219 [01:17<01:19,  1.39it/s][A

tensor(0.2420, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▋                                  | 110/219 [01:18<01:17,  1.40it/s][A

tensor(0.5830, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|██████████████████████████████████▉                                  | 111/219 [01:19<01:16,  1.41it/s][A

tensor(0.3237, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|███████████████████████████████████▎                                 | 112/219 [01:19<01:15,  1.41it/s][A

tensor(0.6649, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▌                                 | 113/219 [01:20<01:15,  1.40it/s][A

tensor(0.3208, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▉                                 | 114/219 [01:21<01:15,  1.39it/s][A

tensor(0.1712, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▏                                | 115/219 [01:21<01:14,  1.40it/s][A

tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▌                                | 116/219 [01:22<01:13,  1.41it/s][A

tensor(0.2440, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▊                                | 117/219 [01:23<01:13,  1.38it/s][A

tensor(0.2469, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▏                               | 118/219 [01:24<01:12,  1.39it/s][A

tensor(0.3906, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▍                               | 119/219 [01:24<01:12,  1.39it/s][A

tensor(0.2570, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████████████████████████████████████▊                               | 120/219 [01:25<01:11,  1.39it/s][A

tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|██████████████████████████████████████                               | 121/219 [01:26<01:09,  1.41it/s][A

tensor(0.3087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▍                              | 122/219 [01:26<01:09,  1.40it/s][A

tensor(0.2769, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▊                              | 123/219 [01:27<01:09,  1.39it/s][A

tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████                              | 124/219 [01:28<01:08,  1.40it/s][A

tensor(0.2047, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████▍                             | 125/219 [01:29<01:07,  1.39it/s][A

tensor(0.3167, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|███████████████████████████████████████▋                             | 126/219 [01:29<01:06,  1.40it/s][A

tensor(0.1609, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████                             | 127/219 [01:30<01:05,  1.41it/s][A

tensor(0.3740, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████▎                            | 128/219 [01:31<01:04,  1.41it/s][A

tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▋                            | 129/219 [01:31<01:03,  1.41it/s][A

tensor(0.3371, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▉                            | 130/219 [01:32<01:03,  1.40it/s][A

tensor(0.2620, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▎                           | 131/219 [01:33<01:02,  1.40it/s][A

tensor(0.1381, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▌                           | 132/219 [01:34<01:02,  1.39it/s][A

tensor(0.3900, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|█████████████████████████████████████████▉                           | 133/219 [01:34<01:02,  1.38it/s][A

tensor(0.1317, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████████████████████████████████████████▏                          | 134/219 [01:35<01:02,  1.36it/s][A

tensor(0.3631, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▌                          | 135/219 [01:36<01:01,  1.36it/s][A

tensor(0.2710, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▊                          | 136/219 [01:37<01:01,  1.36it/s][A

tensor(0.4338, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▏                         | 137/219 [01:37<00:59,  1.37it/s][A

tensor(0.4636, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▍                         | 138/219 [01:38<00:58,  1.38it/s][A

tensor(0.3681, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▊                         | 139/219 [01:39<00:59,  1.35it/s][A

tensor(0.2853, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████                         | 140/219 [01:40<00:58,  1.36it/s][A

tensor(0.1680, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████▍                        | 141/219 [01:40<00:56,  1.37it/s][A

tensor(0.1213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|████████████████████████████████████████████▋                        | 142/219 [01:41<00:55,  1.38it/s][A

tensor(0.1609, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|█████████████████████████████████████████████                        | 143/219 [01:42<00:55,  1.37it/s][A

tensor(0.2375, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▎                       | 144/219 [01:42<00:55,  1.36it/s][A

tensor(0.1320, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▋                       | 145/219 [01:43<00:53,  1.37it/s][A

tensor(0.4822, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████                       | 146/219 [01:44<00:52,  1.39it/s][A

tensor(0.1125, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████▎                      | 147/219 [01:45<00:51,  1.40it/s][A

tensor(0.2379, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▋                      | 148/219 [01:45<00:50,  1.40it/s][A

tensor(0.3658, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▉                      | 149/219 [01:46<00:49,  1.41it/s][A

tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|███████████████████████████████████████████████▎                     | 150/219 [01:47<00:50,  1.36it/s][A

tensor(0.3658, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▌                     | 151/219 [01:47<00:49,  1.38it/s][A

tensor(0.1915, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▉                     | 152/219 [01:48<00:47,  1.40it/s][A

tensor(0.4996, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▏                    | 153/219 [01:49<00:47,  1.40it/s][A

tensor(0.1429, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▌                    | 154/219 [01:50<00:47,  1.38it/s][A

tensor(0.1960, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|████████████████████████████████████████████████▊                    | 155/219 [01:50<00:46,  1.38it/s][A

tensor(0.2129, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|█████████████████████████████████████████████████▏                   | 156/219 [01:51<00:45,  1.39it/s][A

tensor(0.4604, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▍                   | 157/219 [01:52<00:45,  1.37it/s][A

tensor(0.2940, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▊                   | 158/219 [01:52<00:43,  1.39it/s][A

tensor(0.1996, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████                   | 159/219 [01:53<00:42,  1.40it/s][A

tensor(0.2107, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████▍                  | 160/219 [01:54<00:41,  1.41it/s][A

tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|██████████████████████████████████████████████████▋                  | 161/219 [01:55<00:41,  1.41it/s][A

tensor(0.3562, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████                  | 162/219 [01:55<00:40,  1.40it/s][A

tensor(0.0969, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████▎                 | 163/219 [01:56<00:40,  1.40it/s][A

tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▋                 | 164/219 [01:57<00:39,  1.41it/s][A

tensor(0.2970, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▉                 | 165/219 [01:57<00:38,  1.39it/s][A

tensor(0.0859, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▎                | 166/219 [01:58<00:38,  1.36it/s][A

tensor(0.2837, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▌                | 167/219 [01:59<00:38,  1.37it/s][A

tensor(0.1567, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|████████████████████████████████████████████████████▉                | 168/219 [02:00<00:37,  1.37it/s][A

tensor(0.3056, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|█████████████████████████████████████████████████████▏               | 169/219 [02:00<00:36,  1.39it/s][A

tensor(0.5555, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▌               | 170/219 [02:01<00:35,  1.39it/s][A

tensor(0.3159, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▉               | 171/219 [02:02<00:34,  1.40it/s][A

tensor(0.2689, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▏              | 172/219 [02:03<00:33,  1.40it/s][A

tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▌              | 173/219 [02:03<00:32,  1.39it/s][A

tensor(0.1037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▊              | 174/219 [02:04<00:32,  1.40it/s][A

tensor(0.2474, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▏             | 175/219 [02:05<00:31,  1.40it/s][A

tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▍             | 176/219 [02:05<00:30,  1.41it/s][A

tensor(0.2723, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|███████████████████████████████████████████████████████▊             | 177/219 [02:06<00:29,  1.41it/s][A

tensor(0.2440, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████████████████████████████████████████████████████             | 178/219 [02:07<00:29,  1.41it/s][A

tensor(0.4406, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▍            | 179/219 [02:08<00:28,  1.41it/s][A

tensor(0.2212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▋            | 180/219 [02:08<00:27,  1.41it/s][A

tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████            | 181/219 [02:09<00:26,  1.41it/s][A

tensor(0.2846, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 182/219 [02:10<00:26,  1.40it/s][A

tensor(0.1726, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 183/219 [02:10<00:26,  1.38it/s][A

tensor(0.1468, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▉           | 184/219 [02:11<00:25,  1.39it/s][A

tensor(0.1057, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|██████████████████████████████████████████████████████████▎          | 185/219 [02:12<00:24,  1.40it/s][A

tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 186/219 [02:13<00:23,  1.40it/s][A

tensor(0.2128, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▉          | 187/219 [02:13<00:23,  1.37it/s][A

tensor(0.1365, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 188/219 [02:14<00:22,  1.37it/s][A

tensor(0.4037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 189/219 [02:15<00:21,  1.38it/s][A

tensor(0.2569, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 190/219 [02:15<00:20,  1.40it/s][A

tensor(0.3377, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 191/219 [02:16<00:20,  1.40it/s][A

tensor(0.2500, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▍        | 192/219 [02:17<00:19,  1.41it/s][A

tensor(0.2308, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 193/219 [02:18<00:18,  1.41it/s][A

tensor(0.3217, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████        | 194/219 [02:18<00:17,  1.40it/s][A

tensor(0.2684, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▍       | 195/219 [02:19<00:17,  1.40it/s][A

tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▊       | 196/219 [02:20<00:16,  1.41it/s][A

tensor(0.3634, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████       | 197/219 [02:20<00:15,  1.40it/s][A

tensor(0.5445, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████▍      | 198/219 [02:21<00:15,  1.38it/s][A

tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 199/219 [02:22<00:14,  1.39it/s][A

tensor(0.2845, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|███████████████████████████████████████████████████████████████      | 200/219 [02:23<00:13,  1.40it/s][A

tensor(0.2565, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 201/219 [02:23<00:12,  1.40it/s][A

tensor(0.3670, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 202/219 [02:24<00:12,  1.38it/s][A

tensor(0.2091, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|███████████████████████████████████████████████████████████████▉     | 203/219 [02:25<00:11,  1.37it/s][A

tensor(0.2018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 204/219 [02:25<00:10,  1.39it/s][A

tensor(0.2126, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▌    | 205/219 [02:26<00:10,  1.39it/s][A

tensor(0.2495, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▉    | 206/219 [02:27<00:09,  1.40it/s][A

tensor(0.1311, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▏   | 207/219 [02:28<00:08,  1.36it/s][A

tensor(0.3546, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 208/219 [02:28<00:08,  1.37it/s][A

tensor(0.2448, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 209/219 [02:29<00:07,  1.38it/s][A

tensor(0.3321, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 210/219 [02:30<00:06,  1.39it/s][A

tensor(0.2450, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▍  | 211/219 [02:31<00:05,  1.40it/s][A

tensor(0.3125, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 212/219 [02:31<00:05,  1.39it/s][A

tensor(0.2411, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 213/219 [02:32<00:04,  1.38it/s][A

tensor(0.4154, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▍ | 214/219 [02:33<00:03,  1.38it/s][A

tensor(0.1083, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 215/219 [02:33<00:02,  1.39it/s][A

tensor(0.2763, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 216/219 [02:34<00:02,  1.36it/s][A

tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 217/219 [02:35<00:01,  1.37it/s][A

tensor(0.2526, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 218/219 [02:36<00:00,  1.35it/s][A

tensor(0.1460, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 219/219 [02:36<00:00,  1.40it/s][A
Epoch:  67%|██████████████████████████████████████████████████▋                         | 2/3 [05:11<02:35, 155.07s/it]
Iteration:   0%|                                                                               | 0/219 [00:00<?, ?it/s][A

tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|▎                                                                      | 1/219 [00:00<02:31,  1.44it/s][A

tensor(0.1556, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▋                                                                      | 2/219 [00:01<02:31,  1.43it/s][A

tensor(0.0987, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▉                                                                      | 3/219 [00:02<02:31,  1.43it/s][A

tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▎                                                                     | 4/219 [00:02<02:30,  1.43it/s][A

tensor(0.0784, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▌                                                                     | 5/219 [00:03<02:31,  1.41it/s][A

tensor(0.2477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|█▉                                                                     | 6/219 [00:04<02:32,  1.40it/s][A

tensor(0.1651, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|██▎                                                                    | 7/219 [00:04<02:30,  1.41it/s][A

tensor(0.0318, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▌                                                                    | 8/219 [00:05<02:29,  1.41it/s][A

tensor(0.0699, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▉                                                                    | 9/219 [00:06<02:28,  1.42it/s][A

tensor(0.0679, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▏                                                                  | 10/219 [00:07<02:26,  1.42it/s][A

tensor(0.1308, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▌                                                                  | 11/219 [00:07<02:25,  1.43it/s][A

tensor(0.1187, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▊                                                                  | 12/219 [00:08<02:25,  1.43it/s][A

tensor(0.1328, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▏                                                                 | 13/219 [00:09<02:24,  1.43it/s][A

tensor(0.1094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▍                                                                 | 14/219 [00:09<02:23,  1.42it/s][A

tensor(0.2123, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|████▊                                                                 | 15/219 [00:10<02:23,  1.42it/s][A

tensor(0.0502, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|█████                                                                 | 16/219 [00:11<02:22,  1.42it/s][A

tensor(0.1053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▍                                                                | 17/219 [00:11<02:22,  1.42it/s][A

tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▊                                                                | 18/219 [00:12<02:21,  1.42it/s][A

tensor(0.0569, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████                                                                | 19/219 [00:13<02:21,  1.42it/s][A

tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████▍                                                               | 20/219 [00:14<02:20,  1.42it/s][A

tensor(0.0615, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|██████▋                                                               | 21/219 [00:14<02:20,  1.41it/s][A

tensor(0.1592, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|███████                                                               | 22/219 [00:15<02:21,  1.40it/s][A

tensor(0.0980, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▎                                                              | 23/219 [00:16<02:20,  1.39it/s][A

tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▋                                                              | 24/219 [00:16<02:19,  1.39it/s][A

tensor(0.2395, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▉                                                              | 25/219 [00:17<02:18,  1.40it/s][A

tensor(0.0489, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▎                                                             | 26/219 [00:18<02:17,  1.40it/s][A

tensor(0.0992, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▋                                                             | 27/219 [00:19<02:16,  1.41it/s][A

tensor(0.0509, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|████████▉                                                             | 28/219 [00:19<02:15,  1.41it/s][A

tensor(0.0499, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█████████▎                                                            | 29/219 [00:20<02:16,  1.39it/s][A

tensor(0.0728, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▌                                                            | 30/219 [00:21<02:16,  1.38it/s][A

tensor(0.0375, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▉                                                            | 31/219 [00:22<02:15,  1.39it/s][A

tensor(0.0374, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▏                                                           | 32/219 [00:22<02:13,  1.40it/s][A

tensor(0.0486, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▌                                                           | 33/219 [00:23<02:15,  1.37it/s][A

tensor(0.0420, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|██████████▊                                                           | 34/219 [00:24<02:15,  1.36it/s][A

tensor(0.0170, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▏                                                          | 35/219 [00:24<02:13,  1.38it/s][A

tensor(0.0249, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▌                                                          | 36/219 [00:25<02:12,  1.38it/s][A

tensor(0.3268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|███████████▊                                                          | 37/219 [00:26<02:11,  1.38it/s][A

tensor(0.0635, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|████████████▏                                                         | 38/219 [00:27<02:12,  1.37it/s][A

tensor(0.1781, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▍                                                         | 39/219 [00:27<02:13,  1.35it/s][A

tensor(0.2389, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▊                                                         | 40/219 [00:28<02:11,  1.36it/s][A

tensor(0.2498, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████                                                         | 41/219 [00:29<02:09,  1.38it/s][A

tensor(0.0865, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████▍                                                        | 42/219 [00:30<02:08,  1.38it/s][A

tensor(0.0158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█████████████▋                                                        | 43/219 [00:30<02:06,  1.39it/s][A

tensor(0.0312, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██████████████                                                        | 44/219 [00:31<02:04,  1.40it/s][A

tensor(0.1174, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▍                                                       | 45/219 [00:32<02:05,  1.39it/s][A

tensor(0.0868, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▋                                                       | 46/219 [00:32<02:04,  1.39it/s][A

tensor(0.0634, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|███████████████                                                       | 47/219 [00:33<02:03,  1.39it/s][A

tensor(0.0875, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▎                                                      | 48/219 [00:34<02:02,  1.40it/s][A

tensor(0.1220, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▋                                                      | 49/219 [00:35<02:01,  1.40it/s][A

tensor(0.0311, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|███████████████▉                                                      | 50/219 [00:35<02:00,  1.41it/s][A

tensor(0.0795, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|████████████████▎                                                     | 51/219 [00:36<01:59,  1.41it/s][A

tensor(0.0950, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▌                                                     | 52/219 [00:37<01:58,  1.41it/s][A

tensor(0.0510, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▉                                                     | 53/219 [00:37<01:57,  1.41it/s][A

tensor(0.0477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▎                                                    | 54/219 [00:38<01:56,  1.41it/s][A

tensor(0.0102, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▌                                                    | 55/219 [00:39<01:56,  1.40it/s][A

tensor(0.1234, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|█████████████████▉                                                    | 56/219 [00:39<01:56,  1.40it/s][A

tensor(0.1625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▏                                                   | 57/219 [00:40<01:54,  1.41it/s][A

tensor(0.0150, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▌                                                   | 58/219 [00:41<01:54,  1.41it/s][A

tensor(0.1190, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██████████████████▊                                                   | 59/219 [00:42<01:54,  1.40it/s][A

tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|███████████████████▏                                                  | 60/219 [00:42<01:53,  1.41it/s][A

tensor(0.0365, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▍                                                  | 61/219 [00:43<01:52,  1.41it/s][A

tensor(0.2012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▊                                                  | 62/219 [00:44<01:51,  1.40it/s][A

tensor(0.1454, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▏                                                 | 63/219 [00:44<01:50,  1.41it/s][A

tensor(0.0252, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▍                                                 | 64/219 [00:45<01:52,  1.37it/s][A

tensor(0.0074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|████████████████████▊                                                 | 65/219 [00:46<01:51,  1.38it/s][A

tensor(0.0361, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|█████████████████████                                                 | 66/219 [00:47<01:49,  1.40it/s][A

tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▍                                                | 67/219 [00:47<01:48,  1.40it/s][A

tensor(0.0710, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▋                                                | 68/219 [00:48<01:47,  1.40it/s][A

tensor(0.0625, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████                                                | 69/219 [00:49<01:46,  1.40it/s][A

tensor(0.0209, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▎                                               | 70/219 [00:49<01:46,  1.40it/s][A

tensor(0.1793, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▋                                               | 71/219 [00:50<01:47,  1.38it/s][A

tensor(0.1977, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████                                               | 72/219 [00:51<01:45,  1.39it/s][A

tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████▎                                              | 73/219 [00:52<01:44,  1.39it/s][A

tensor(0.0801, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▋                                              | 74/219 [00:52<01:43,  1.40it/s][A

tensor(0.0102, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▉                                              | 75/219 [00:53<01:42,  1.41it/s][A

tensor(0.0530, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▎                                             | 76/219 [00:54<01:41,  1.41it/s][A

tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▌                                             | 77/219 [00:55<01:42,  1.38it/s][A

tensor(0.0472, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|████████████████████████▉                                             | 78/219 [00:55<01:41,  1.39it/s][A

tensor(0.1380, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|█████████████████████████▎                                            | 79/219 [00:56<01:40,  1.39it/s][A

tensor(0.0601, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▌                                            | 80/219 [00:57<01:39,  1.40it/s][A

tensor(0.0716, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▉                                            | 81/219 [00:57<01:39,  1.38it/s][A

tensor(0.0837, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|██████████████████████████▏                                           | 82/219 [00:58<01:40,  1.37it/s][A

tensor(0.1146, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▌                                           | 83/219 [00:59<01:40,  1.36it/s][A

tensor(0.0328, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▊                                           | 84/219 [01:00<01:38,  1.38it/s][A

tensor(0.0203, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▏                                          | 85/219 [01:00<01:36,  1.38it/s][A

tensor(0.0547, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▍                                          | 86/219 [01:01<01:35,  1.39it/s][A

tensor(0.1645, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███████████████████████████▊                                          | 87/219 [01:02<01:35,  1.39it/s][A

tensor(0.0494, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████████████████████████████▏                                         | 88/219 [01:02<01:34,  1.38it/s][A

tensor(0.0363, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▍                                         | 89/219 [01:03<01:34,  1.37it/s][A

tensor(0.0342, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▊                                         | 90/219 [01:04<01:34,  1.36it/s][A

tensor(0.0455, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████                                         | 91/219 [01:05<01:33,  1.37it/s][A

tensor(0.0529, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▍                                        | 92/219 [01:05<01:32,  1.38it/s][A

tensor(0.0699, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▋                                        | 93/219 [01:06<01:32,  1.36it/s][A

tensor(0.0194, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████                                        | 94/219 [01:07<01:30,  1.38it/s][A

tensor(0.0369, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████▎                                       | 95/219 [01:08<01:29,  1.39it/s][A

tensor(0.0673, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|██████████████████████████████▋                                       | 96/219 [01:08<01:28,  1.40it/s][A

tensor(0.0184, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|███████████████████████████████                                       | 97/219 [01:09<01:26,  1.41it/s][A

tensor(0.0719, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▎                                      | 98/219 [01:10<01:25,  1.41it/s][A

tensor(0.0559, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▋                                      | 99/219 [01:10<01:26,  1.38it/s][A

tensor(0.1202, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▌                                     | 100/219 [01:11<01:25,  1.40it/s][A

tensor(0.0152, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▊                                     | 101/219 [01:12<01:24,  1.40it/s][A

tensor(0.0475, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▏                                    | 102/219 [01:13<01:23,  1.40it/s][A

tensor(0.1537, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▍                                    | 103/219 [01:13<01:24,  1.37it/s][A

tensor(0.2091, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▊                                    | 104/219 [01:14<01:22,  1.39it/s][A

tensor(0.0146, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████                                    | 105/219 [01:15<01:21,  1.40it/s][A

tensor(0.1885, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████▍                                   | 106/219 [01:15<01:20,  1.40it/s][A

tensor(0.1639, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|█████████████████████████████████▋                                   | 107/219 [01:16<01:19,  1.40it/s][A

tensor(0.0381, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|██████████████████████████████████                                   | 108/219 [01:17<01:18,  1.41it/s][A

tensor(0.0151, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▎                                  | 109/219 [01:18<01:19,  1.38it/s][A

tensor(0.0490, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▋                                  | 110/219 [01:18<01:18,  1.39it/s][A

tensor(0.1931, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|██████████████████████████████████▉                                  | 111/219 [01:19<01:17,  1.40it/s][A

tensor(0.0386, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|███████████████████████████████████▎                                 | 112/219 [01:20<01:16,  1.40it/s][A

tensor(0.1684, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▌                                 | 113/219 [01:20<01:15,  1.40it/s][A

tensor(0.0203, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▉                                 | 114/219 [01:21<01:14,  1.41it/s][A

tensor(0.0167, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▏                                | 115/219 [01:22<01:13,  1.41it/s][A

tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▌                                | 116/219 [01:23<01:12,  1.41it/s][A

tensor(0.0102, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▊                                | 117/219 [01:23<01:13,  1.39it/s][A

tensor(0.0615, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▏                               | 118/219 [01:24<01:12,  1.40it/s][A

tensor(0.0162, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▍                               | 119/219 [01:25<01:11,  1.40it/s][A

tensor(0.0774, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████████████████████████████████████▊                               | 120/219 [01:25<01:10,  1.41it/s][A

tensor(0.0064, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|██████████████████████████████████████                               | 121/219 [01:26<01:09,  1.40it/s][A

tensor(0.0115, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▍                              | 122/219 [01:27<01:08,  1.41it/s][A

tensor(0.1053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▊                              | 123/219 [01:28<01:09,  1.38it/s][A

tensor(0.0320, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████                              | 124/219 [01:28<01:09,  1.37it/s][A

tensor(0.0185, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████▍                             | 125/219 [01:29<01:09,  1.36it/s][A

tensor(0.0139, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|███████████████████████████████████████▋                             | 126/219 [01:30<01:08,  1.35it/s][A

tensor(0.1363, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████                             | 127/219 [01:31<01:07,  1.36it/s][A

tensor(0.1100, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████▎                            | 128/219 [01:31<01:07,  1.35it/s][A

tensor(0.0612, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▋                            | 129/219 [01:32<01:05,  1.37it/s][A

tensor(0.0122, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▉                            | 130/219 [01:33<01:03,  1.39it/s][A

tensor(0.0756, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▎                           | 131/219 [01:33<01:03,  1.39it/s][A

tensor(0.0177, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▌                           | 132/219 [01:34<01:02,  1.39it/s][A

tensor(0.3081, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|█████████████████████████████████████████▉                           | 133/219 [01:35<01:01,  1.40it/s][A

tensor(0.0135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████████████████████████████████████████▏                          | 134/219 [01:36<01:00,  1.40it/s][A

tensor(0.0101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▌                          | 135/219 [01:36<00:59,  1.40it/s][A

tensor(0.0895, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▊                          | 136/219 [01:37<00:59,  1.40it/s][A

tensor(0.1012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▏                         | 137/219 [01:38<00:58,  1.40it/s][A

tensor(0.1803, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▍                         | 138/219 [01:38<00:58,  1.38it/s][A

tensor(0.0876, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▊                         | 139/219 [01:39<00:57,  1.39it/s][A

tensor(0.1122, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████                         | 140/219 [01:40<00:56,  1.39it/s][A

tensor(0.1212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████▍                        | 141/219 [01:41<00:55,  1.40it/s][A

tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|████████████████████████████████████████████▋                        | 142/219 [01:41<00:54,  1.40it/s][A

tensor(0.2575, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|█████████████████████████████████████████████                        | 143/219 [01:42<00:54,  1.41it/s][A

tensor(0.2291, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▎                       | 144/219 [01:43<00:53,  1.40it/s][A

tensor(0.0451, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▋                       | 145/219 [01:43<00:52,  1.40it/s][A

tensor(0.0469, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████                       | 146/219 [01:44<00:52,  1.40it/s][A

tensor(0.1944, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████▎                      | 147/219 [01:45<00:51,  1.40it/s][A

tensor(0.0312, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▋                      | 148/219 [01:46<00:50,  1.40it/s][A

tensor(0.0844, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▉                      | 149/219 [01:46<00:51,  1.37it/s][A

tensor(0.1135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|███████████████████████████████████████████████▎                     | 150/219 [01:47<00:50,  1.37it/s][A

tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▌                     | 151/219 [01:48<00:49,  1.38it/s][A

tensor(0.0316, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▉                     | 152/219 [01:49<00:48,  1.39it/s][A

tensor(0.0511, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▏                    | 153/219 [01:49<00:47,  1.40it/s][A

tensor(0.0330, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▌                    | 154/219 [01:50<00:46,  1.40it/s][A

tensor(0.0046, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|████████████████████████████████████████████████▊                    | 155/219 [01:51<00:45,  1.39it/s][A

tensor(0.0443, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|█████████████████████████████████████████████████▏                   | 156/219 [01:51<00:45,  1.38it/s][A

tensor(0.0827, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▍                   | 157/219 [01:52<00:44,  1.39it/s][A

tensor(0.1038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▊                   | 158/219 [01:53<00:43,  1.40it/s][A

tensor(0.1788, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████                   | 159/219 [01:54<00:43,  1.40it/s][A

tensor(0.1388, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████▍                  | 160/219 [01:54<00:43,  1.36it/s][A

tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|██████████████████████████████████████████████████▋                  | 161/219 [01:55<00:42,  1.38it/s][A

tensor(0.0477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████                  | 162/219 [01:56<00:41,  1.39it/s][A

tensor(0.1763, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████▎                 | 163/219 [01:56<00:40,  1.39it/s][A

tensor(0.0124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▋                 | 164/219 [01:57<00:39,  1.39it/s][A

tensor(0.1022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▉                 | 165/219 [01:58<00:39,  1.38it/s][A

tensor(0.2656, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▎                | 166/219 [01:59<00:38,  1.38it/s][A

tensor(0.0138, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▌                | 167/219 [01:59<00:37,  1.38it/s][A

tensor(0.0256, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|████████████████████████████████████████████████████▉                | 168/219 [02:00<00:36,  1.38it/s][A

tensor(0.1078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|█████████████████████████████████████████████████████▏               | 169/219 [02:01<00:36,  1.38it/s][A

tensor(0.0700, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▌               | 170/219 [02:02<00:36,  1.35it/s][A

tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▉               | 171/219 [02:02<00:35,  1.34it/s][A

tensor(0.1363, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▏              | 172/219 [02:03<00:34,  1.36it/s][A

tensor(0.1375, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▌              | 173/219 [02:04<00:33,  1.37it/s][A

tensor(0.0181, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▊              | 174/219 [02:05<00:33,  1.36it/s][A

tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▏             | 175/219 [02:05<00:32,  1.36it/s][A

tensor(0.1234, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▍             | 176/219 [02:06<00:31,  1.37it/s][A

tensor(0.0686, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|███████████████████████████████████████████████████████▊             | 177/219 [02:07<00:31,  1.34it/s][A

tensor(0.0320, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████████████████████████████████████████████████████             | 178/219 [02:07<00:30,  1.36it/s][A

tensor(0.0923, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▍            | 179/219 [02:08<00:29,  1.37it/s][A

tensor(0.1613, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▋            | 180/219 [02:09<00:28,  1.37it/s][A

tensor(0.0217, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████            | 181/219 [02:10<00:27,  1.38it/s][A

tensor(0.0501, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 182/219 [02:10<00:26,  1.37it/s][A

tensor(0.1078, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 183/219 [02:11<00:26,  1.37it/s][A

tensor(0.1421, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▉           | 184/219 [02:12<00:25,  1.38it/s][A

tensor(0.2076, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|██████████████████████████████████████████████████████████▎          | 185/219 [02:13<00:24,  1.39it/s][A

tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 186/219 [02:13<00:23,  1.39it/s][A

tensor(0.1584, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▉          | 187/219 [02:14<00:23,  1.38it/s][A

tensor(0.1302, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 188/219 [02:15<00:22,  1.38it/s][A

tensor(0.0570, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 189/219 [02:15<00:21,  1.39it/s][A

tensor(0.0681, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 190/219 [02:16<00:20,  1.39it/s][A

tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 191/219 [02:17<00:20,  1.40it/s][A

tensor(0.2254, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▍        | 192/219 [02:18<00:19,  1.39it/s][A

tensor(0.0486, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 193/219 [02:18<00:18,  1.40it/s][A

tensor(0.0683, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████        | 194/219 [02:19<00:17,  1.40it/s][A

tensor(0.1026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▍       | 195/219 [02:20<00:17,  1.37it/s][A

tensor(0.1908, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▊       | 196/219 [02:20<00:16,  1.38it/s][A

tensor(0.1161, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████       | 197/219 [02:21<00:15,  1.38it/s][A

tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████▍      | 198/219 [02:22<00:15,  1.38it/s][A

tensor(0.2747, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 199/219 [02:23<00:14,  1.39it/s][A

tensor(0.0269, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|███████████████████████████████████████████████████████████████      | 200/219 [02:23<00:14,  1.35it/s][A

tensor(0.1858, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 201/219 [02:24<00:13,  1.34it/s][A

tensor(0.0430, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 202/219 [02:25<00:12,  1.36it/s][A

tensor(0.0117, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|███████████████████████████████████████████████████████████████▉     | 203/219 [02:26<00:11,  1.37it/s][A

tensor(0.0608, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 204/219 [02:26<00:10,  1.38it/s][A

tensor(0.1679, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▌    | 205/219 [02:27<00:10,  1.38it/s][A

tensor(0.2648, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▉    | 206/219 [02:28<00:09,  1.39it/s][A

tensor(0.0561, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▏   | 207/219 [02:28<00:08,  1.39it/s][A

tensor(0.0134, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 208/219 [02:29<00:07,  1.40it/s][A

tensor(0.0289, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 209/219 [02:30<00:07,  1.37it/s][A

tensor(0.2991, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 210/219 [02:31<00:06,  1.38it/s][A

tensor(0.0340, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▍  | 211/219 [02:31<00:05,  1.38it/s][A

tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 212/219 [02:32<00:05,  1.38it/s][A

tensor(0.0184, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 213/219 [02:33<00:04,  1.38it/s][A

tensor(0.0156, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▍ | 214/219 [02:34<00:03,  1.37it/s][A

tensor(0.2173, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 215/219 [02:34<00:02,  1.38it/s][A

tensor(0.1265, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 216/219 [02:35<00:02,  1.39it/s][A

tensor(0.2419, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 217/219 [02:36<00:01,  1.40it/s][A

tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 218/219 [02:36<00:00,  1.40it/s][A

tensor(0.0350, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 219/219 [02:37<00:00,  1.39it/s][A
Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 3/3 [07:48<00:00, 156.24s/it]


In [20]:
import csv
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
                   output_dir=None, max_seq_length=32, do_train=False, do_eval=False, do_lower_case=False,
                   train_batch_size=32, eval_batch_size=8, learning_rate=5e-5, num_train_epochs=3,
                   warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
                   optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):


    # ## Required parameters
    # parser.add_argument("--data_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # parser.add_argument("--task_name",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The name of the task to train.")
    # parser.add_argument("--output_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    # parser.add_argument("--max_seq_length",
    #                     default=128,
    #                     type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. \n"
    #                          "Sequences longer than this will be truncated, and sequences shorter \n"
    #                          "than this will be padded.")
    # parser.add_argument("--do_train",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run training.")
    # parser.add_argument("--do_eval",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_lower_case",
    #                     default=False,
    #                     action='store_true',
    #                     help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--train_batch_size",
    #                     default=32,
    #                     type=int,
    #                     help="Total batch size for training.")
    # parser.add_argument("--eval_batch_size",
    #                     default=8,
    #                     type=int,
    #                     help="Total batch size for eval.")
    # parser.add_argument("--learning_rate",
    #                     default=5e-5,
    #                     type=float,
    #                     help="The initial learning rate for Adam.")
    # parser.add_argument("--num_train_epochs",
    #                     default=3.0,
    #                     type=float,
    #                     help="Total number of training epochs to perform.")
    # parser.add_argument("--warmup_proportion",
    #                     default=0.1,
    #                     type=float,
    #                     help="Proportion of training to perform linear learning rate warmup for. "
    #                          "E.g., 0.1 = 10%% of training.")
    # parser.add_argument("--no_cuda",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether not to use CUDA when available")
    # parser.add_argument("--local_rank",
    #                     type=int,
    #                     default=-1,
    #                     help="local_rank for distributed training on gpus")
    # parser.add_argument('--seed',
    #                     type=int,
    #                     default=42,
    #                     help="random seed for initialization")
    # parser.add_argument('--gradient_accumulation_steps',
    #                     type=int,
    #                     default=1,
    #                     help="Number of updates steps to accumulate before performing a backward/update pass.")
    # parser.add_argument('--optimize_on_cpu',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to perform optimization and keep the optimizer averages on CPU")
    # parser.add_argument('--fp16',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to use 16-bit float precision instead of 32-bit")
    # parser.add_argument('--loss_scale',
    #                     type=float, default=128,
    #                     help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    # args = parser.parse_args()

    processors = {
#         "cola": ColaProcessor,
#         "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info("16-bits training currently not supported in distributed training")
            fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if do_train:
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory ({}) already exists and is not emp1ty.".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(bert_model,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2)
    if fp16:
        model.half()
    model.to(device)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                          output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
    t_total = num_train_steps
#     print(t_total)
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if do_train:
        optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()
        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

        torch.save(model.state_dict(), output_dir + "output.pth")


    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(data_dir)
#         eval_examples = processor.get_dev_examples(data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

        model.load_state_dict(torch.load(saved_model))

        model.eval()
        # eval_loss, eval_accuracy = 0, 0

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        eval_loss, eval_macro_p, eval_macro_r = 0, 0, 0

        raw_score = []

        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)
#             print(logits)
#             print(logits[0])
            logits = logits.detach().cpu().numpy()
#             print(logits)
            label_ids = label_ids.to('cpu').numpy()

            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c
            
            pred_label = np.argmax(logits, axis=1)
            raw_score += zip(logits, pred_label, label_ids)
            
            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)

            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1


        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r)

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r)

        eval_loss = eval_loss / nb_eval_steps
        result = {'eval_loss': eval_loss,
                  'eval_micro_p': eval_micro_p,
                  'eval_micro_r': eval_micro_r,
                  'eval_micro_f1': eval_micro_f1,
                  'eval_macro_p': eval_macro_p,
                  'eval_macro_r': eval_macro_r,
                  'eval_macro_f1': eval_macro_f1,
#                   'global_step': global_step,
#                   'loss': tr_loss/nb_tr_steps
                  }

        output_eval_file = os.path.join(output_dir, "test_eval_results.txt")
        output_raw_score = os.path.join(output_dir, "test_raw_score.csv")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(output_raw_score, 'w') as fout:
            fields = ["undermine_score", "support_score","predict_label", "gold"]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, pred, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "predict_label": str(pred),
                    "gold": str(gold)
                })

In [None]:
def experiments():
    data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
#     data_dir = "/home/syg340/Dataset/"

    # data_dir_output = data_dir + "output2/"
    data_dir_output = "D:/Projects/Stance/Models/"
    train_and_test(data_dir=data_dir, do_train=True, do_eval=True, output_dir=data_dir_output,task_name="Mrpc")


In [2]:
def evaluation_with_pretrained():
    bert_model = "D:/Projects/Stance/Models/output.pth"
    data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
    # data_dir_output = data_dir + "output2/"
    data_dir_output = "D:/Projects/Stance/Evaluation/bert_dummy_output/"
    train_and_test(data_dir=data_dir, do_train=False, do_eval=True, output_dir=data_dir_output,task_name="Mrpc",saved_model=bert_model)

In [21]:
if __name__ == "__main__":
#     experiments()
    evaluation_with_pretrained()

03/07/2020 21:45:20 - INFO - run_classifier -   device cuda n_gpu 1 distributed training False
03/07/2020 21:45:21 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\arsen\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
03/07/2020 21:45:21 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\arsen\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/07/2020 21:45:21 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\arsen\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e