In [6]:
import torch
import random
import numpy as np
import os
from tqdm import tqdm, trange
# torch.cuda.empty_cache()
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.optimization import BertAdam

In [7]:
from run_classifier import ColaProcessor, MrpcProcessor, logger, convert_examples_to_features,\
    set_optimizer_params_grad, copy_optimizer_params_to_model, accuracy, p_r_f1, tp_pcount_gcount

In [8]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    
    print('There are %d GPU(s) available.' % n_gpu)

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1050 Ti


In [9]:
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from pytorch_pretrained_bert.modeling import BertForSequenceClassification

03/03/2020 10:07:12 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.
03/03/2020 10:08:02 - INFO - transformers.file_utils -   TensorFlow version 2.1.0 available.


In [10]:
# import logging
# logging.basicConfig(level=logging.INFO)

In [11]:
# def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
#                    output_dir=None, max_seq_length=128, do_train=False, do_eval=False, do_lower_case=False,
#                    train_batch_size=32, eval_batch_size=8, learning_rate=5e-5, num_train_epochs=3,
#                    warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
#                    optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

03/03/2020 10:08:10 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\arsen\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [13]:
# Prepare model 
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 2)
model.to(device)

# model = BertModel.from_pretrained('bert-base-uncased')

03/03/2020 10:08:11 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\arsen\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/03/2020 10:08:11 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\arsen\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir C:\Users\arsen\AppData\Local\Temp\tmpzb5bazk1
03/03/2020 10:08:15 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_lay

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [14]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
 
print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [10]:
data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
data_dir_output = "D:/Projects/Stance/Models/"
output_dir=data_dir_output
max_seq_length=32
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
# warmup_proportion = 0.1
train_batch_size=32
eval_batch_size=8
learning_rate=5e-5
num_train_epochs=3
local_rank=-1
seed=42
gradient_accumulation_steps=1
loss_scale=128
train_batch_size = int(train_batch_size / gradient_accumulation_steps)

processors = {
        "mrpc": MrpcProcessor,
    }

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
    
os.makedirs(output_dir, exist_ok=True)
processor = processors['mrpc']()
label_list = processor.get_labels()

train_examples = processor.get_train_examples(data_dir)
num_train_steps = int(
    len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

##preprare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
t_total = num_train_steps
optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)
# optimizer = AdamW(optimizer_grouped_parameters,
#                   lr = learning_rate, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
#                   correct_bias=False
#                 )

# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)  # PyTorch scheduler

03/01/2020 20:03:11 - INFO - run_classifier -   LOOKING AT D:/Jupyter/data/dataset/perspective_stances/train.tsv


In [11]:
global_step = 0
train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

03/01/2020 20:03:12 - INFO - run_classifier -   *** Example ***
03/01/2020 20:03:12 - INFO - run_classifier -   guid: train-1
03/01/2020 20:03:12 - INFO - run_classifier -   tokens: [CLS] male infant ci ##rc ##um ##cision is tan ##tam ##ount to child abuse [SEP] parents know what best for th ##ier child [SEP]
03/01/2020 20:03:12 - INFO - run_classifier -   input_ids: 101 3287 10527 25022 11890 2819 28472 2003 9092 15464 21723 2000 2775 6905 102 3008 2113 2054 2190 2005 16215 3771 2775 102 0 0 0 0 0 0 0 0
03/01/2020 20:03:12 - INFO - run_classifier -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
03/01/2020 20:03:12 - INFO - run_classifier -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
03/01/2020 20:03:12 - INFO - run_classifier -   label: 0 (id = 0)
03/01/2020 20:03:12 - INFO - run_classifier -   *** Example ***
03/01/2020 20:03:12 - INFO - run_classifier -   guid: train-2
03/01/2020 20:03:12 - INFO - run_classifier -   t

In [12]:
model.train()
for _ in trange(int(num_train_epochs), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss = model(input_ids, segment_ids, input_mask, label_ids)
        print(loss)
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
#         if fp16 and loss_scale != 1.0:
#             # rescale loss for fp16 training
#             # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
#             loss = loss * loss_scale
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        loss.backward()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % gradient_accumulation_steps == 0:
#             if fp16 or optimize_on_cpu:
#                 if fp16 and loss_scale != 1.0:
#                     # scale down gradients for fp16 training
#                     for param in model.parameters():
#                         if param.grad is not None:
#                             param.grad.data = param.grad.data / loss_scale           
#                 is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
#                 if is_nan:
#                     logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
#                     loss_scale = loss_scale / 2
#                     model.zero_grad()
#                     continue 
#                 optimizer.step()
# #                 scheduler.step()
#                 copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
#             else:
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
#                 scheduler.step()
            model.zero_grad()
            global_step += 1

torch.save(model.state_dict(), output_dir + "output.pth")

Epoch:   0%|                                                                                     | 0/3 [00:00<?, ?it/s]
Iteration:   0%|                                                                               | 0/219 [00:00<?, ?it/s][A

tensor(0.7563, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|▎                                                                      | 1/219 [00:00<03:28,  1.04it/s][A

tensor(0.7777, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▋                                                                      | 2/219 [00:01<03:13,  1.12it/s][A

tensor(0.7995, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▉                                                                      | 3/219 [00:02<03:02,  1.18it/s][A

tensor(0.8389, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▎                                                                     | 4/219 [00:03<02:51,  1.25it/s][A

tensor(0.7867, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▌                                                                     | 5/219 [00:03<02:45,  1.30it/s][A

tensor(0.6458, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|█▉                                                                     | 6/219 [00:04<02:41,  1.32it/s][A

tensor(0.7886, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|██▎                                                                    | 7/219 [00:05<02:35,  1.36it/s][A

tensor(0.7220, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▌                                                                    | 8/219 [00:05<02:35,  1.36it/s][A

tensor(0.7180, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▉                                                                    | 9/219 [00:06<02:30,  1.39it/s][A

tensor(0.7001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▏                                                                  | 10/219 [00:07<02:27,  1.42it/s][A

tensor(0.7246, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▌                                                                  | 11/219 [00:08<02:25,  1.43it/s][A

tensor(0.6763, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▊                                                                  | 12/219 [00:08<02:25,  1.42it/s][A

tensor(0.6426, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▏                                                                 | 13/219 [00:09<02:23,  1.43it/s][A

tensor(0.7523, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▍                                                                 | 14/219 [00:10<02:21,  1.45it/s][A

tensor(0.7240, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|████▊                                                                 | 15/219 [00:10<02:23,  1.42it/s][A

tensor(0.7103, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|█████                                                                 | 16/219 [00:11<02:21,  1.43it/s][A

tensor(0.7143, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▍                                                                | 17/219 [00:12<02:19,  1.45it/s][A

tensor(0.7215, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▊                                                                | 18/219 [00:12<02:19,  1.45it/s][A

tensor(0.6992, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████                                                                | 19/219 [00:13<02:18,  1.44it/s][A

tensor(0.7027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████▍                                                               | 20/219 [00:14<02:19,  1.43it/s][A

tensor(0.6864, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|██████▋                                                               | 21/219 [00:14<02:18,  1.43it/s][A

tensor(0.7062, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|███████                                                               | 22/219 [00:15<02:16,  1.45it/s][A

tensor(0.6824, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▎                                                              | 23/219 [00:16<02:14,  1.46it/s][A

tensor(0.7066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▋                                                              | 24/219 [00:17<02:13,  1.46it/s][A

tensor(0.6957, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▉                                                              | 25/219 [00:17<02:15,  1.43it/s][A

tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▎                                                             | 26/219 [00:18<02:17,  1.41it/s][A

tensor(0.6643, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▋                                                             | 27/219 [00:19<02:14,  1.42it/s][A

tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|████████▉                                                             | 28/219 [00:19<02:15,  1.41it/s][A

tensor(0.7166, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█████████▎                                                            | 29/219 [00:20<02:15,  1.41it/s][A

tensor(0.6968, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▌                                                            | 30/219 [00:21<02:13,  1.42it/s][A

tensor(0.6631, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▉                                                            | 31/219 [00:21<02:11,  1.43it/s][A

tensor(0.6987, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▏                                                           | 32/219 [00:22<02:16,  1.37it/s][A

tensor(0.6855, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▌                                                           | 33/219 [00:23<02:16,  1.37it/s][A

tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|██████████▊                                                           | 34/219 [00:24<02:13,  1.39it/s][A

tensor(0.6809, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▏                                                          | 35/219 [00:24<02:12,  1.39it/s][A

tensor(0.6860, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▌                                                          | 36/219 [00:25<02:11,  1.39it/s][A

tensor(0.6663, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|███████████▊                                                          | 37/219 [00:26<02:08,  1.41it/s][A

tensor(0.6966, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|████████████▏                                                         | 38/219 [00:27<02:07,  1.42it/s][A

tensor(0.6777, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▍                                                         | 39/219 [00:27<02:05,  1.44it/s][A

tensor(0.6716, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▊                                                         | 40/219 [00:28<02:06,  1.41it/s][A

tensor(0.6052, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████                                                         | 41/219 [00:29<02:07,  1.39it/s][A

tensor(0.6792, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████▍                                                        | 42/219 [00:29<02:07,  1.39it/s][A

tensor(0.6928, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█████████████▋                                                        | 43/219 [00:30<02:08,  1.37it/s][A

tensor(0.6197, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██████████████                                                        | 44/219 [00:31<02:05,  1.40it/s][A

tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▍                                                       | 45/219 [00:32<02:05,  1.39it/s][A

tensor(0.5911, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▋                                                       | 46/219 [00:32<02:05,  1.37it/s][A

tensor(0.6408, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|███████████████                                                       | 47/219 [00:33<02:04,  1.38it/s][A

tensor(0.6060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▎                                                      | 48/219 [00:34<02:04,  1.38it/s][A

tensor(0.6555, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▋                                                      | 49/219 [00:34<02:01,  1.39it/s][A

tensor(0.6477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|███████████████▉                                                      | 50/219 [00:35<01:59,  1.41it/s][A

tensor(0.6387, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|████████████████▎                                                     | 51/219 [00:36<01:57,  1.43it/s][A

tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▌                                                     | 52/219 [00:37<01:56,  1.43it/s][A

tensor(0.6543, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▉                                                     | 53/219 [00:37<01:56,  1.43it/s][A

tensor(0.7016, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▎                                                    | 54/219 [00:38<01:56,  1.41it/s][A

tensor(0.7029, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▌                                                    | 55/219 [00:39<01:54,  1.43it/s][A

tensor(0.6169, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|█████████████████▉                                                    | 56/219 [00:39<01:58,  1.37it/s][A

tensor(0.6802, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▏                                                   | 57/219 [00:40<02:04,  1.30it/s][A

tensor(0.7520, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▌                                                   | 58/219 [00:41<02:10,  1.23it/s][A

tensor(0.6157, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██████████████████▊                                                   | 59/219 [00:42<02:05,  1.28it/s][A

tensor(0.6477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|███████████████████▏                                                  | 60/219 [00:43<02:01,  1.31it/s][A

tensor(0.6459, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▍                                                  | 61/219 [00:43<01:56,  1.35it/s][A

tensor(0.5931, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▊                                                  | 62/219 [00:44<01:53,  1.38it/s][A

tensor(0.6530, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▏                                                 | 63/219 [00:45<01:51,  1.40it/s][A

tensor(0.7097, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▍                                                 | 64/219 [00:45<01:51,  1.40it/s][A

tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|████████████████████▊                                                 | 65/219 [00:46<01:51,  1.39it/s][A

tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|█████████████████████                                                 | 66/219 [00:47<01:49,  1.40it/s][A

tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▍                                                | 67/219 [00:48<01:48,  1.40it/s][A

tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▋                                                | 68/219 [00:48<01:46,  1.42it/s][A

tensor(0.5876, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████                                                | 69/219 [00:49<01:46,  1.40it/s][A

tensor(0.7011, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▎                                               | 70/219 [00:50<01:45,  1.42it/s][A

tensor(0.5824, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▋                                               | 71/219 [00:50<01:43,  1.43it/s][A

tensor(0.5831, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████                                               | 72/219 [00:51<01:42,  1.43it/s][A

tensor(0.5460, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████▎                                              | 73/219 [00:52<01:41,  1.43it/s][A

tensor(0.5871, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▋                                              | 74/219 [00:52<01:41,  1.43it/s][A

tensor(0.6662, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▉                                              | 75/219 [00:53<01:40,  1.44it/s][A

tensor(0.7854, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▎                                             | 76/219 [00:54<01:39,  1.43it/s][A

tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▌                                             | 77/219 [00:55<01:39,  1.43it/s][A

tensor(0.6397, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|████████████████████████▉                                             | 78/219 [00:55<01:40,  1.41it/s][A

tensor(0.7200, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|█████████████████████████▎                                            | 79/219 [00:56<01:41,  1.39it/s][A

tensor(0.6697, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▌                                            | 80/219 [00:57<01:39,  1.40it/s][A

tensor(0.6438, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▉                                            | 81/219 [00:57<01:37,  1.41it/s][A

tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|██████████████████████████▏                                           | 82/219 [00:58<01:37,  1.41it/s][A

tensor(0.6151, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▌                                           | 83/219 [00:59<01:36,  1.40it/s][A

tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▊                                           | 84/219 [01:00<01:35,  1.41it/s][A

tensor(0.6992, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▏                                          | 85/219 [01:00<01:35,  1.40it/s][A

tensor(0.6636, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▍                                          | 86/219 [01:01<01:35,  1.39it/s][A

tensor(0.6789, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███████████████████████████▊                                          | 87/219 [01:02<01:34,  1.39it/s][A

tensor(0.6538, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████████████████████████████▏                                         | 88/219 [01:02<01:34,  1.39it/s][A

tensor(0.6149, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▍                                         | 89/219 [01:03<01:35,  1.36it/s][A

tensor(0.6404, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▊                                         | 90/219 [01:04<01:35,  1.36it/s][A

tensor(0.6606, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████                                         | 91/219 [01:05<01:36,  1.32it/s][A

tensor(0.5572, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▍                                        | 92/219 [01:05<01:33,  1.36it/s][A

tensor(0.7579, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▋                                        | 93/219 [01:06<01:32,  1.36it/s][A

tensor(0.7492, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████                                        | 94/219 [01:07<01:31,  1.37it/s][A

tensor(0.6604, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████▎                                       | 95/219 [01:08<01:29,  1.39it/s][A

tensor(0.5977, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|██████████████████████████████▋                                       | 96/219 [01:08<01:28,  1.39it/s][A

tensor(0.6010, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|███████████████████████████████                                       | 97/219 [01:09<01:28,  1.38it/s][A

tensor(0.6520, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▎                                      | 98/219 [01:10<01:27,  1.39it/s][A

tensor(0.6164, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▋                                      | 99/219 [01:10<01:25,  1.40it/s][A

tensor(0.5926, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▌                                     | 100/219 [01:11<01:24,  1.42it/s][A

tensor(0.6762, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▊                                     | 101/219 [01:12<01:24,  1.40it/s][A

tensor(0.6067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▏                                    | 102/219 [01:13<01:23,  1.40it/s][A

tensor(0.6869, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▍                                    | 103/219 [01:13<01:21,  1.42it/s][A

tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▊                                    | 104/219 [01:14<01:20,  1.43it/s][A

tensor(0.6251, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████                                    | 105/219 [01:15<01:20,  1.42it/s][A

tensor(0.7344, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████▍                                   | 106/219 [01:15<01:18,  1.43it/s][A

tensor(0.6548, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|█████████████████████████████████▋                                   | 107/219 [01:16<01:20,  1.40it/s][A

tensor(0.6537, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|██████████████████████████████████                                   | 108/219 [01:17<01:19,  1.40it/s][A

tensor(0.7109, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▎                                  | 109/219 [01:18<01:18,  1.41it/s][A

tensor(0.5863, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▋                                  | 110/219 [01:18<01:17,  1.41it/s][A

tensor(0.7290, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|██████████████████████████████████▉                                  | 111/219 [01:19<01:15,  1.42it/s][A

tensor(0.6215, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|███████████████████████████████████▎                                 | 112/219 [01:20<01:14,  1.43it/s][A

tensor(0.6874, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▌                                 | 113/219 [01:20<01:13,  1.44it/s][A

tensor(0.6636, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▉                                 | 114/219 [01:21<01:14,  1.42it/s][A

tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▏                                | 115/219 [01:22<01:12,  1.43it/s][A

tensor(0.6435, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▌                                | 116/219 [01:22<01:11,  1.43it/s][A

tensor(0.5217, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▊                                | 117/219 [01:23<01:12,  1.42it/s][A

tensor(0.5141, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▏                               | 118/219 [01:24<01:11,  1.41it/s][A

tensor(0.6905, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▍                               | 119/219 [01:25<01:12,  1.38it/s][A

tensor(0.5583, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████████████████████████████████████▊                               | 120/219 [01:25<01:11,  1.38it/s][A

tensor(0.6990, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|██████████████████████████████████████                               | 121/219 [01:26<01:11,  1.38it/s][A

tensor(0.5653, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▍                              | 122/219 [01:27<01:09,  1.39it/s][A

tensor(0.5366, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▊                              | 123/219 [01:28<01:09,  1.39it/s][A

tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████                              | 124/219 [01:28<01:11,  1.34it/s][A

tensor(0.7012, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████▍                             | 125/219 [01:29<01:10,  1.34it/s][A

tensor(0.7287, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|███████████████████████████████████████▋                             | 126/219 [01:30<01:08,  1.36it/s][A

tensor(0.5584, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████                             | 127/219 [01:31<01:07,  1.36it/s][A

tensor(0.6806, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████▎                            | 128/219 [01:31<01:06,  1.37it/s][A

tensor(0.7136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▋                            | 129/219 [01:32<01:05,  1.37it/s][A

tensor(0.5841, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▉                            | 130/219 [01:33<01:04,  1.38it/s][A

tensor(0.5927, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▎                           | 131/219 [01:33<01:03,  1.39it/s][A

tensor(0.4665, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▌                           | 132/219 [01:34<01:05,  1.32it/s][A

tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|█████████████████████████████████████████▉                           | 133/219 [01:35<01:05,  1.31it/s][A

tensor(0.5836, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████████████████████████████████████████▏                          | 134/219 [01:36<01:03,  1.34it/s][A

tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▌                          | 135/219 [01:36<01:03,  1.33it/s][A

tensor(0.7009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▊                          | 136/219 [01:37<01:02,  1.34it/s][A

tensor(0.6137, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▏                         | 137/219 [01:38<01:00,  1.35it/s][A

tensor(0.6063, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▍                         | 138/219 [01:39<00:58,  1.38it/s][A

tensor(0.6135, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▊                         | 139/219 [01:39<00:58,  1.37it/s][A

tensor(0.6265, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████                         | 140/219 [01:40<00:57,  1.37it/s][A

tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████▍                        | 141/219 [01:41<00:55,  1.40it/s][A

tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|████████████████████████████████████████████▋                        | 142/219 [01:41<00:54,  1.40it/s][A

tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|█████████████████████████████████████████████                        | 143/219 [01:42<00:53,  1.42it/s][A

tensor(0.6667, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▎                       | 144/219 [01:43<00:52,  1.43it/s][A

tensor(0.5111, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▋                       | 145/219 [01:44<00:51,  1.43it/s][A

tensor(0.6387, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████                       | 146/219 [01:44<00:51,  1.43it/s][A

tensor(0.6071, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████▎                      | 147/219 [01:45<00:50,  1.42it/s][A

tensor(0.5204, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▋                      | 148/219 [01:46<00:49,  1.43it/s][A

tensor(0.5537, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▉                      | 149/219 [01:46<00:48,  1.44it/s][A

tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|███████████████████████████████████████████████▎                     | 150/219 [01:47<00:47,  1.44it/s][A

tensor(0.5077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▌                     | 151/219 [01:48<00:47,  1.43it/s][A

tensor(0.4772, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▉                     | 152/219 [01:48<00:47,  1.41it/s][A

tensor(0.7352, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▏                    | 153/219 [01:49<00:47,  1.39it/s][A

tensor(0.5587, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▌                    | 154/219 [01:50<00:46,  1.40it/s][A

tensor(0.5926, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|████████████████████████████████████████████████▊                    | 155/219 [01:51<00:46,  1.36it/s][A

tensor(0.7087, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|█████████████████████████████████████████████████▏                   | 156/219 [01:51<00:46,  1.37it/s][A

tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▍                   | 157/219 [01:52<00:46,  1.34it/s][A

tensor(0.5864, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▊                   | 158/219 [01:53<00:46,  1.32it/s][A

tensor(0.6074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████                   | 159/219 [01:54<00:46,  1.29it/s][A

tensor(0.5872, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████▍                  | 160/219 [01:55<00:46,  1.28it/s][A

tensor(0.7053, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|██████████████████████████████████████████████████▋                  | 161/219 [01:55<00:45,  1.26it/s][A

tensor(0.5798, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████                  | 162/219 [01:56<00:45,  1.25it/s][A

tensor(0.5507, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████▎                 | 163/219 [01:57<00:45,  1.22it/s][A

tensor(0.5166, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▋                 | 164/219 [01:58<00:44,  1.24it/s][A

tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▉                 | 165/219 [01:59<00:43,  1.24it/s][A

tensor(0.6254, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▎                | 166/219 [02:00<00:43,  1.23it/s][A

tensor(0.5782, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▌                | 167/219 [02:00<00:41,  1.24it/s][A

tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|████████████████████████████████████████████████████▉                | 168/219 [02:01<00:40,  1.25it/s][A

tensor(0.5694, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|█████████████████████████████████████████████████████▏               | 169/219 [02:02<00:40,  1.24it/s][A

tensor(0.5527, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▌               | 170/219 [02:03<00:39,  1.25it/s][A

tensor(0.6659, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▉               | 171/219 [02:04<00:38,  1.24it/s][A

tensor(0.5401, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▏              | 172/219 [02:04<00:37,  1.25it/s][A

tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▌              | 173/219 [02:05<00:37,  1.23it/s][A

tensor(0.6384, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▊              | 174/219 [02:06<00:36,  1.22it/s][A

tensor(0.7465, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▏             | 175/219 [02:07<00:35,  1.23it/s][A

tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▍             | 176/219 [02:08<00:34,  1.24it/s][A

tensor(0.6689, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|███████████████████████████████████████████████████████▊             | 177/219 [02:08<00:34,  1.23it/s][A

tensor(0.4793, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████████████████████████████████████████████████████             | 178/219 [02:09<00:32,  1.24it/s][A

tensor(0.6245, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▍            | 179/219 [02:10<00:32,  1.25it/s][A

tensor(0.6210, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▋            | 180/219 [02:11<00:31,  1.25it/s][A

tensor(0.5967, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████            | 181/219 [02:12<00:30,  1.24it/s][A

tensor(0.5057, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 182/219 [02:12<00:29,  1.24it/s][A

tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 183/219 [02:13<00:29,  1.24it/s][A

tensor(0.4799, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▉           | 184/219 [02:14<00:28,  1.24it/s][A

tensor(0.5243, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|██████████████████████████████████████████████████████████▎          | 185/219 [02:15<00:27,  1.24it/s][A

tensor(0.4202, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 186/219 [02:16<00:26,  1.23it/s][A

tensor(0.5734, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▉          | 187/219 [02:17<00:26,  1.21it/s][A

tensor(0.6398, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 188/219 [02:17<00:25,  1.23it/s][A

tensor(0.7126, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 189/219 [02:18<00:24,  1.24it/s][A

tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 190/219 [02:19<00:23,  1.25it/s][A

tensor(0.7496, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 191/219 [02:20<00:22,  1.25it/s][A

tensor(0.3466, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▍        | 192/219 [02:20<00:21,  1.26it/s][A

tensor(0.4750, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 193/219 [02:21<00:20,  1.26it/s][A

tensor(0.5598, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████        | 194/219 [02:22<00:19,  1.26it/s][A

tensor(0.5526, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▍       | 195/219 [02:23<00:19,  1.26it/s][A

tensor(0.4568, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▊       | 196/219 [02:24<00:18,  1.26it/s][A

tensor(0.5256, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████       | 197/219 [02:24<00:17,  1.25it/s][A

tensor(0.5801, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████▍      | 198/219 [02:25<00:16,  1.25it/s][A

tensor(0.5641, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 199/219 [02:26<00:15,  1.25it/s][A

tensor(0.5596, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|███████████████████████████████████████████████████████████████      | 200/219 [02:27<00:15,  1.26it/s][A

tensor(0.6268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 201/219 [02:28<00:14,  1.25it/s][A

tensor(0.6473, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 202/219 [02:28<00:13,  1.24it/s][A

tensor(0.6750, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|███████████████████████████████████████████████████████████████▉     | 203/219 [02:29<00:12,  1.23it/s][A

tensor(0.4966, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 204/219 [02:30<00:12,  1.23it/s][A

tensor(0.4852, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▌    | 205/219 [02:31<00:11,  1.24it/s][A

tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▉    | 206/219 [02:32<00:10,  1.23it/s][A

tensor(0.4786, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▏   | 207/219 [02:33<00:09,  1.24it/s][A

tensor(0.4959, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 208/219 [02:33<00:08,  1.24it/s][A

tensor(0.5106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 209/219 [02:34<00:08,  1.23it/s][A

tensor(0.5221, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 210/219 [02:35<00:07,  1.24it/s][A

tensor(0.6092, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▍  | 211/219 [02:36<00:06,  1.23it/s][A

tensor(0.6159, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 212/219 [02:37<00:05,  1.23it/s][A

tensor(0.5105, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 213/219 [02:37<00:04,  1.23it/s][A

tensor(0.4663, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▍ | 214/219 [02:38<00:04,  1.24it/s][A

tensor(0.6363, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 215/219 [02:39<00:03,  1.25it/s][A

tensor(0.5787, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 216/219 [02:40<00:02,  1.25it/s][A

tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 217/219 [02:41<00:01,  1.25it/s][A

tensor(0.3574, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 218/219 [02:41<00:00,  1.26it/s][A

tensor(0.5897, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 219/219 [02:42<00:00,  1.35it/s][A
Epoch:  33%|█████████████████████████▎                                                  | 1/3 [02:42<05:25, 162.66s/it]
Iteration:   0%|                                                                               | 0/219 [00:00<?, ?it/s][A

tensor(0.5556, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|▎                                                                      | 1/219 [00:00<03:02,  1.19it/s][A

tensor(0.4956, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▋                                                                      | 2/219 [00:01<03:00,  1.20it/s][A

tensor(0.4247, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▉                                                                      | 3/219 [00:02<03:02,  1.18it/s][A

tensor(0.3859, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▎                                                                     | 4/219 [00:03<03:01,  1.19it/s][A

tensor(0.5533, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▌                                                                     | 5/219 [00:04<02:57,  1.20it/s][A

tensor(0.4788, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|█▉                                                                     | 6/219 [00:05<02:59,  1.19it/s][A

tensor(0.5259, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|██▎                                                                    | 7/219 [00:05<03:00,  1.18it/s][A

tensor(0.3800, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▌                                                                    | 8/219 [00:06<02:56,  1.20it/s][A

tensor(0.3649, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▉                                                                    | 9/219 [00:07<02:55,  1.20it/s][A

tensor(0.4644, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▏                                                                  | 10/219 [00:08<02:52,  1.21it/s][A

tensor(0.5383, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▌                                                                  | 11/219 [00:09<02:55,  1.19it/s][A

tensor(0.3136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▊                                                                  | 12/219 [00:10<02:55,  1.18it/s][A

tensor(0.5162, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▏                                                                 | 13/219 [00:10<02:54,  1.18it/s][A

tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▍                                                                 | 14/219 [00:11<02:53,  1.18it/s][A

tensor(0.3278, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|████▊                                                                 | 15/219 [00:12<02:50,  1.20it/s][A

tensor(0.7643, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|█████                                                                 | 16/219 [00:13<02:48,  1.21it/s][A

tensor(0.2919, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▍                                                                | 17/219 [00:14<02:45,  1.22it/s][A

tensor(0.3892, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▊                                                                | 18/219 [00:15<02:43,  1.23it/s][A

tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████                                                                | 19/219 [00:15<02:45,  1.21it/s][A

tensor(0.3972, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████▍                                                               | 20/219 [00:16<02:43,  1.22it/s][A

tensor(0.4027, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|██████▋                                                               | 21/219 [00:17<02:44,  1.21it/s][A

tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|███████                                                               | 22/219 [00:18<02:41,  1.22it/s][A

tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▎                                                              | 23/219 [00:19<02:39,  1.23it/s][A

tensor(0.4490, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▋                                                              | 24/219 [00:19<02:40,  1.21it/s][A

tensor(0.2546, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▉                                                              | 25/219 [00:20<02:41,  1.20it/s][A

tensor(0.3959, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▎                                                             | 26/219 [00:21<02:41,  1.20it/s][A

tensor(0.3158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▋                                                             | 27/219 [00:22<02:41,  1.19it/s][A

tensor(0.3449, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|████████▉                                                             | 28/219 [00:23<02:42,  1.18it/s][A

tensor(0.4126, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█████████▎                                                            | 29/219 [00:24<02:38,  1.20it/s][A

tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▌                                                            | 30/219 [00:25<02:41,  1.17it/s][A

tensor(0.4577, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▉                                                            | 31/219 [00:25<02:38,  1.19it/s][A

tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▏                                                           | 32/219 [00:26<02:40,  1.16it/s][A

tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▌                                                           | 33/219 [00:27<02:37,  1.18it/s][A

tensor(0.4077, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|██████████▊                                                           | 34/219 [00:28<02:34,  1.19it/s][A

tensor(0.3426, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▏                                                          | 35/219 [00:29<02:33,  1.20it/s][A

tensor(0.3553, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▌                                                          | 36/219 [00:30<02:31,  1.21it/s][A

tensor(0.2705, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|███████████▊                                                          | 37/219 [00:30<02:28,  1.22it/s][A

tensor(0.2359, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|████████████▏                                                         | 38/219 [00:31<02:27,  1.23it/s][A

tensor(0.3031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▍                                                         | 39/219 [00:32<02:25,  1.24it/s][A

tensor(0.6074, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▊                                                         | 40/219 [00:33<02:23,  1.24it/s][A

tensor(0.4292, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████                                                         | 41/219 [00:34<02:23,  1.24it/s][A

tensor(0.2869, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████▍                                                        | 42/219 [00:34<02:25,  1.21it/s][A

tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█████████████▋                                                        | 43/219 [00:35<02:23,  1.22it/s][A

tensor(0.4815, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██████████████                                                        | 44/219 [00:36<02:22,  1.23it/s][A

tensor(0.3317, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▍                                                       | 45/219 [00:37<02:21,  1.23it/s][A

tensor(0.4989, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▋                                                       | 46/219 [00:38<02:20,  1.23it/s][A

tensor(0.3054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|███████████████                                                       | 47/219 [00:38<02:18,  1.24it/s][A

tensor(0.4173, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▎                                                      | 48/219 [00:39<02:18,  1.23it/s][A

tensor(0.2610, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▋                                                      | 49/219 [00:40<02:16,  1.24it/s][A

tensor(0.4725, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|███████████████▉                                                      | 50/219 [00:41<02:16,  1.23it/s][A

tensor(0.3718, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|████████████████▎                                                     | 51/219 [00:42<02:19,  1.21it/s][A

tensor(0.2687, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▌                                                     | 52/219 [00:43<02:34,  1.08it/s][A

tensor(0.4019, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▉                                                     | 53/219 [00:44<02:40,  1.04it/s][A

tensor(0.3471, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▎                                                    | 54/219 [00:45<02:53,  1.05s/it][A

tensor(0.6787, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▌                                                    | 55/219 [00:46<03:02,  1.11s/it][A

tensor(0.3493, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|█████████████████▉                                                    | 56/219 [00:48<02:59,  1.10s/it][A

tensor(0.5048, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▏                                                   | 57/219 [00:48<02:46,  1.03s/it][A

tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▌                                                   | 58/219 [00:49<02:40,  1.00it/s][A

tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██████████████████▊                                                   | 59/219 [00:50<02:34,  1.03it/s][A

tensor(0.4161, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|███████████████████▏                                                  | 60/219 [00:51<02:28,  1.07it/s][A

tensor(0.3084, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▍                                                  | 61/219 [00:52<02:23,  1.10it/s][A

tensor(0.4090, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▊                                                  | 62/219 [00:53<02:22,  1.10it/s][A

tensor(0.4380, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▏                                                 | 63/219 [00:54<02:17,  1.13it/s][A

tensor(0.6301, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▍                                                 | 64/219 [00:55<02:15,  1.15it/s][A

tensor(0.2447, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|████████████████████▊                                                 | 65/219 [00:55<02:13,  1.16it/s][A

tensor(0.2859, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|█████████████████████                                                 | 66/219 [00:56<02:09,  1.18it/s][A

tensor(0.3921, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▍                                                | 67/219 [00:57<02:08,  1.18it/s][A

tensor(0.4185, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▋                                                | 68/219 [00:58<02:06,  1.19it/s][A

tensor(0.2206, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████                                                | 69/219 [00:59<02:05,  1.20it/s][A

tensor(0.3977, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▎                                               | 70/219 [00:59<02:04,  1.20it/s][A

tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▋                                               | 71/219 [01:00<02:02,  1.21it/s][A

tensor(0.2165, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████                                               | 72/219 [01:01<02:00,  1.22it/s][A

tensor(0.3877, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████▎                                              | 73/219 [01:02<02:00,  1.21it/s][A

tensor(0.3457, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▋                                              | 74/219 [01:03<01:58,  1.22it/s][A

tensor(0.5609, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▉                                              | 75/219 [01:04<01:59,  1.20it/s][A

tensor(0.4761, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▎                                             | 76/219 [01:04<01:57,  1.22it/s][A

tensor(0.2422, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▌                                             | 77/219 [01:05<01:55,  1.23it/s][A

tensor(0.3183, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|████████████████████████▉                                             | 78/219 [01:06<01:55,  1.22it/s][A

tensor(0.2873, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|█████████████████████████▎                                            | 79/219 [01:07<01:54,  1.23it/s][A

tensor(0.2787, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▌                                            | 80/219 [01:08<01:52,  1.24it/s][A

tensor(0.4136, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▉                                            | 81/219 [01:08<01:51,  1.24it/s][A

tensor(0.4168, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|██████████████████████████▏                                           | 82/219 [01:09<01:49,  1.25it/s][A

tensor(0.2994, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▌                                           | 83/219 [01:10<01:49,  1.24it/s][A

tensor(0.2725, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▊                                           | 84/219 [01:11<01:48,  1.25it/s][A

tensor(0.3334, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▏                                          | 85/219 [01:12<01:47,  1.25it/s][A

tensor(0.6436, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▍                                          | 86/219 [01:12<01:48,  1.23it/s][A

tensor(0.3715, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███████████████████████████▊                                          | 87/219 [01:13<01:47,  1.23it/s][A

tensor(0.5163, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████████████████████████████▏                                         | 88/219 [01:14<01:47,  1.22it/s][A

tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▍                                         | 89/219 [01:15<01:47,  1.21it/s][A

tensor(0.4119, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▊                                         | 90/219 [01:16<01:45,  1.22it/s][A

tensor(0.4062, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████                                         | 91/219 [01:17<01:44,  1.23it/s][A

tensor(0.2709, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▍                                        | 92/219 [01:17<01:43,  1.23it/s][A

tensor(0.3794, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▋                                        | 93/219 [01:18<01:42,  1.23it/s][A

tensor(0.2656, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████                                        | 94/219 [01:19<01:40,  1.24it/s][A

tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████▎                                       | 95/219 [01:20<01:42,  1.22it/s][A

tensor(0.3054, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|██████████████████████████████▋                                       | 96/219 [01:21<01:42,  1.20it/s][A

tensor(0.2745, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|███████████████████████████████                                       | 97/219 [01:22<01:44,  1.17it/s][A

tensor(0.2636, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▎                                      | 98/219 [01:22<01:42,  1.19it/s][A

tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▋                                      | 99/219 [01:23<01:40,  1.19it/s][A

tensor(0.3559, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▌                                     | 100/219 [01:24<01:39,  1.20it/s][A

tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▊                                     | 101/219 [01:25<01:37,  1.21it/s][A

tensor(0.4171, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▏                                    | 102/219 [01:26<01:37,  1.20it/s][A

tensor(0.2036, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▍                                    | 103/219 [01:27<01:36,  1.20it/s][A

tensor(0.3508, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▊                                    | 104/219 [01:27<01:36,  1.20it/s][A

tensor(0.3505, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████                                    | 105/219 [01:28<01:33,  1.22it/s][A

tensor(0.3316, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████▍                                   | 106/219 [01:29<01:32,  1.22it/s][A

tensor(0.5294, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|█████████████████████████████████▋                                   | 107/219 [01:30<01:30,  1.23it/s][A

tensor(0.3478, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|██████████████████████████████████                                   | 108/219 [01:31<01:29,  1.23it/s][A

tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▎                                  | 109/219 [01:31<01:29,  1.22it/s][A

tensor(0.3411, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▋                                  | 110/219 [01:32<01:28,  1.23it/s][A

tensor(0.5900, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|██████████████████████████████████▉                                  | 111/219 [01:33<01:26,  1.24it/s][A

tensor(0.4187, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|███████████████████████████████████▎                                 | 112/219 [01:34<01:26,  1.23it/s][A

tensor(0.3692, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▌                                 | 113/219 [01:35<01:25,  1.24it/s][A

tensor(0.3965, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▉                                 | 114/219 [01:35<01:24,  1.24it/s][A

tensor(0.3936, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▏                                | 115/219 [01:36<01:23,  1.24it/s][A

tensor(0.3892, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▌                                | 116/219 [01:37<01:22,  1.25it/s][A

tensor(0.3543, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▊                                | 117/219 [01:38<01:23,  1.22it/s][A

tensor(0.4071, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▏                               | 118/219 [01:39<01:22,  1.22it/s][A

tensor(0.6199, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▍                               | 119/219 [01:40<01:21,  1.23it/s][A

tensor(0.3796, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████████████████████████████████████▊                               | 120/219 [01:40<01:19,  1.24it/s][A

tensor(0.3531, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|██████████████████████████████████████                               | 121/219 [01:41<01:18,  1.24it/s][A

tensor(0.3334, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▍                              | 122/219 [01:42<01:18,  1.24it/s][A

tensor(0.2905, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▊                              | 123/219 [01:43<01:17,  1.24it/s][A

tensor(0.3237, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████                              | 124/219 [01:44<01:17,  1.23it/s][A

tensor(0.2935, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████▍                             | 125/219 [01:44<01:17,  1.21it/s][A

tensor(0.4001, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|███████████████████████████████████████▋                             | 126/219 [01:45<01:16,  1.22it/s][A

tensor(0.3623, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████                             | 127/219 [01:46<01:15,  1.22it/s][A

tensor(0.3321, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████▎                            | 128/219 [01:47<01:13,  1.23it/s][A

tensor(0.4573, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▋                            | 129/219 [01:48<01:13,  1.22it/s][A

tensor(0.2433, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▉                            | 130/219 [01:48<01:12,  1.22it/s][A

tensor(0.3728, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▎                           | 131/219 [01:49<01:11,  1.23it/s][A

tensor(0.2985, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▌                           | 132/219 [01:50<01:10,  1.23it/s][A

tensor(0.6347, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|█████████████████████████████████████████▉                           | 133/219 [01:51<01:09,  1.23it/s][A

tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████████████████████████████████████████▏                          | 134/219 [01:52<01:10,  1.21it/s][A

tensor(0.2705, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▌                          | 135/219 [01:53<01:09,  1.21it/s][A

tensor(0.3732, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▊                          | 136/219 [01:53<01:08,  1.21it/s][A

tensor(0.6934, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▏                         | 137/219 [01:54<01:08,  1.20it/s][A

tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▍                         | 138/219 [01:55<01:08,  1.17it/s][A

tensor(0.4245, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▊                         | 139/219 [01:56<01:08,  1.17it/s][A

tensor(0.3977, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████                         | 140/219 [01:57<01:06,  1.19it/s][A

tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████▍                        | 141/219 [01:58<01:06,  1.17it/s][A

tensor(0.3509, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|████████████████████████████████████████████▋                        | 142/219 [01:59<01:05,  1.17it/s][A

tensor(0.2940, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|█████████████████████████████████████████████                        | 143/219 [01:59<01:03,  1.19it/s][A

tensor(0.3966, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▎                       | 144/219 [02:00<01:02,  1.20it/s][A

tensor(0.1978, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▋                       | 145/219 [02:01<01:01,  1.20it/s][A

tensor(0.3236, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████                       | 146/219 [02:02<01:01,  1.19it/s][A

tensor(0.2536, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████▎                      | 147/219 [02:03<01:00,  1.18it/s][A

tensor(0.2726, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▋                      | 148/219 [02:04<00:59,  1.19it/s][A

tensor(0.3733, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▉                      | 149/219 [02:04<00:58,  1.19it/s][A

tensor(0.2140, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|███████████████████████████████████████████████▎                     | 150/219 [02:05<00:57,  1.21it/s][A

tensor(0.3635, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▌                     | 151/219 [02:06<00:57,  1.19it/s][A

tensor(0.2356, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▉                     | 152/219 [02:07<00:57,  1.17it/s][A

tensor(0.4849, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▏                    | 153/219 [02:08<00:55,  1.19it/s][A

tensor(0.1863, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▌                    | 154/219 [02:09<00:55,  1.18it/s][A

tensor(0.3242, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|████████████████████████████████████████████████▊                    | 155/219 [02:09<00:53,  1.20it/s][A

tensor(0.3350, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|█████████████████████████████████████████████████▏                   | 156/219 [02:10<00:52,  1.20it/s][A

tensor(0.4542, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▍                   | 157/219 [02:11<00:51,  1.20it/s][A

tensor(0.2218, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▊                   | 158/219 [02:12<00:50,  1.20it/s][A

tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████                   | 159/219 [02:13<00:49,  1.22it/s][A

tensor(0.3549, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████▍                  | 160/219 [02:14<00:48,  1.21it/s][A

tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|██████████████████████████████████████████████████▋                  | 161/219 [02:14<00:48,  1.21it/s][A

tensor(0.3213, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████                  | 162/219 [02:15<00:47,  1.20it/s][A

tensor(0.1529, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████▎                 | 163/219 [02:16<00:46,  1.22it/s][A

tensor(0.2009, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▋                 | 164/219 [02:17<00:45,  1.21it/s][A

tensor(0.4878, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▉                 | 165/219 [02:18<00:44,  1.21it/s][A

tensor(0.1035, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▎                | 166/219 [02:19<00:44,  1.19it/s][A

tensor(0.1593, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▌                | 167/219 [02:19<00:43,  1.20it/s][A

tensor(0.2880, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|████████████████████████████████████████████████████▉                | 168/219 [02:20<00:43,  1.18it/s][A

tensor(0.2518, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|█████████████████████████████████████████████████████▏               | 169/219 [02:21<00:42,  1.17it/s][A

tensor(0.4309, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▌               | 170/219 [02:22<00:41,  1.17it/s][A

tensor(0.3567, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▉               | 171/219 [02:23<00:40,  1.18it/s][A

tensor(0.2153, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▏              | 172/219 [02:24<00:39,  1.19it/s][A

tensor(0.2186, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▌              | 173/219 [02:24<00:38,  1.19it/s][A

tensor(0.2826, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▊              | 174/219 [02:25<00:38,  1.18it/s][A

tensor(0.3142, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▏             | 175/219 [02:26<00:36,  1.20it/s][A

tensor(0.2567, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▍             | 176/219 [02:27<00:36,  1.19it/s][A

tensor(0.4042, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|███████████████████████████████████████████████████████▊             | 177/219 [02:28<00:35,  1.18it/s][A

tensor(0.4237, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████████████████████████████████████████████████████             | 178/219 [02:29<00:34,  1.20it/s][A

tensor(0.4568, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▍            | 179/219 [02:29<00:33,  1.21it/s][A

tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▋            | 180/219 [02:30<00:31,  1.22it/s][A

tensor(0.1489, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████            | 181/219 [02:31<00:31,  1.21it/s][A

tensor(0.2369, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 182/219 [02:32<00:30,  1.20it/s][A

tensor(0.2514, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 183/219 [02:33<00:29,  1.21it/s][A

tensor(0.3192, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▉           | 184/219 [02:34<00:29,  1.19it/s][A

tensor(0.2272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|██████████████████████████████████████████████████████████▎          | 185/219 [02:34<00:28,  1.20it/s][A

tensor(0.4172, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 186/219 [02:35<00:27,  1.20it/s][A

tensor(0.2555, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▉          | 187/219 [02:36<00:26,  1.21it/s][A

tensor(0.2272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 188/219 [02:37<00:25,  1.22it/s][A

tensor(0.2268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 189/219 [02:38<00:24,  1.22it/s][A

tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 190/219 [02:39<00:23,  1.22it/s][A

tensor(0.3666, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 191/219 [02:39<00:22,  1.22it/s][A

tensor(0.4765, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▍        | 192/219 [02:40<00:22,  1.21it/s][A

tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 193/219 [02:41<00:21,  1.20it/s][A

tensor(0.5404, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████        | 194/219 [02:42<00:20,  1.19it/s][A

tensor(0.3768, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▍       | 195/219 [02:43<00:19,  1.20it/s][A

tensor(0.1549, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▊       | 196/219 [02:44<00:19,  1.21it/s][A

tensor(0.2499, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████       | 197/219 [02:44<00:18,  1.22it/s][A

tensor(0.4979, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████▍      | 198/219 [02:45<00:17,  1.22it/s][A

tensor(0.3478, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 199/219 [02:46<00:16,  1.23it/s][A

tensor(0.3154, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|███████████████████████████████████████████████████████████████      | 200/219 [02:47<00:15,  1.24it/s][A

tensor(0.3143, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 201/219 [02:48<00:14,  1.23it/s][A

tensor(0.3630, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 202/219 [02:48<00:13,  1.22it/s][A

tensor(0.5024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|███████████████████████████████████████████████████████████████▉     | 203/219 [02:49<00:13,  1.21it/s][A

tensor(0.2770, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 204/219 [02:50<00:12,  1.20it/s][A

tensor(0.3783, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▌    | 205/219 [02:51<00:11,  1.21it/s][A

tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▉    | 206/219 [02:52<00:10,  1.22it/s][A

tensor(0.2662, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▏   | 207/219 [02:53<00:09,  1.22it/s][A

tensor(0.3608, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 208/219 [02:53<00:09,  1.21it/s][A

tensor(0.3956, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 209/219 [02:54<00:08,  1.20it/s][A

tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 210/219 [02:55<00:07,  1.22it/s][A

tensor(0.3608, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▍  | 211/219 [02:56<00:06,  1.22it/s][A

tensor(0.5971, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 212/219 [02:57<00:05,  1.19it/s][A

tensor(0.2709, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 213/219 [02:58<00:05,  1.18it/s][A

tensor(0.4059, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▍ | 214/219 [02:58<00:04,  1.19it/s][A

tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 215/219 [02:59<00:03,  1.19it/s][A

tensor(0.2268, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 216/219 [03:00<00:02,  1.21it/s][A

tensor(0.2594, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 217/219 [03:01<00:01,  1.20it/s][A

tensor(0.3092, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 218/219 [03:02<00:00,  1.21it/s][A

tensor(0.3060, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 219/219 [03:03<00:00,  1.20it/s][A
Epoch:  67%|██████████████████████████████████████████████████▋                         | 2/3 [05:45<02:48, 168.77s/it]
Iteration:   0%|                                                                               | 0/219 [00:00<?, ?it/s][A

tensor(0.3898, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   0%|▎                                                                      | 1/219 [00:00<02:53,  1.26it/s][A

tensor(0.1150, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▋                                                                      | 2/219 [00:01<02:53,  1.25it/s][A

tensor(0.1895, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   1%|▉                                                                      | 3/219 [00:02<02:53,  1.25it/s][A

tensor(0.1584, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▎                                                                     | 4/219 [00:03<02:53,  1.24it/s][A

tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   2%|█▌                                                                     | 5/219 [00:04<02:51,  1.24it/s][A

tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|█▉                                                                     | 6/219 [00:04<02:50,  1.25it/s][A

tensor(0.2080, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   3%|██▎                                                                    | 7/219 [00:05<02:50,  1.25it/s][A

tensor(0.1094, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▌                                                                    | 8/219 [00:06<02:49,  1.25it/s][A

tensor(0.1491, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   4%|██▉                                                                    | 9/219 [00:07<02:49,  1.24it/s][A

tensor(0.1185, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▏                                                                  | 10/219 [00:08<02:48,  1.24it/s][A

tensor(0.1870, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▌                                                                  | 11/219 [00:08<02:47,  1.24it/s][A

tensor(0.1142, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   5%|███▊                                                                  | 12/219 [00:09<02:49,  1.22it/s][A

tensor(0.2952, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▏                                                                 | 13/219 [00:10<02:50,  1.21it/s][A

tensor(0.1860, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   6%|████▍                                                                 | 14/219 [00:11<02:48,  1.22it/s][A

tensor(0.2152, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|████▊                                                                 | 15/219 [00:12<02:51,  1.19it/s][A

tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   7%|█████                                                                 | 16/219 [00:13<02:48,  1.20it/s][A

tensor(0.1232, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▍                                                                | 17/219 [00:13<02:47,  1.21it/s][A

tensor(0.2022, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   8%|█████▊                                                                | 18/219 [00:14<02:45,  1.21it/s][A

tensor(0.0475, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████                                                                | 19/219 [00:15<02:43,  1.22it/s][A

tensor(0.2281, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:   9%|██████▍                                                               | 20/219 [00:16<02:42,  1.23it/s][A

tensor(0.1095, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|██████▋                                                               | 21/219 [00:17<02:40,  1.23it/s][A

tensor(0.2430, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  10%|███████                                                               | 22/219 [00:17<02:40,  1.23it/s][A

tensor(0.4584, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▎                                                              | 23/219 [00:18<02:39,  1.23it/s][A

tensor(0.1935, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▋                                                              | 24/219 [00:19<02:39,  1.23it/s][A

tensor(0.1563, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  11%|███████▉                                                              | 25/219 [00:20<02:40,  1.21it/s][A

tensor(0.3653, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▎                                                             | 26/219 [00:21<02:40,  1.20it/s][A

tensor(0.2936, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  12%|████████▋                                                             | 27/219 [00:22<02:39,  1.20it/s][A

tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|████████▉                                                             | 28/219 [00:22<02:40,  1.19it/s][A

tensor(0.0883, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  13%|█████████▎                                                            | 29/219 [00:23<02:37,  1.21it/s][A

tensor(0.0375, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▌                                                            | 30/219 [00:24<02:37,  1.20it/s][A

tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  14%|█████████▉                                                            | 31/219 [00:25<02:37,  1.19it/s][A

tensor(0.0954, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▏                                                           | 32/219 [00:26<02:35,  1.20it/s][A

tensor(0.0405, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  15%|██████████▌                                                           | 33/219 [00:27<02:33,  1.21it/s][A

tensor(0.0279, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|██████████▊                                                           | 34/219 [00:27<02:34,  1.19it/s][A

tensor(0.1067, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▏                                                          | 35/219 [00:28<02:35,  1.18it/s][A

tensor(0.1155, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  16%|███████████▌                                                          | 36/219 [00:29<02:36,  1.17it/s][A

tensor(0.1686, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|███████████▊                                                          | 37/219 [00:30<02:32,  1.19it/s][A

tensor(0.0867, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  17%|████████████▏                                                         | 38/219 [00:31<02:30,  1.20it/s][A

tensor(0.2512, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▍                                                         | 39/219 [00:32<02:31,  1.19it/s][A

tensor(0.1663, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  18%|████████████▊                                                         | 40/219 [00:33<02:31,  1.18it/s][A

tensor(0.2122, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████                                                         | 41/219 [00:33<02:29,  1.19it/s][A

tensor(0.0819, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  19%|█████████████▍                                                        | 42/219 [00:34<02:27,  1.20it/s][A

tensor(0.1246, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|█████████████▋                                                        | 43/219 [00:35<02:26,  1.20it/s][A

tensor(0.1349, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  20%|██████████████                                                        | 44/219 [00:36<02:27,  1.19it/s][A

tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▍                                                       | 45/219 [00:37<02:27,  1.18it/s][A

tensor(0.2398, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|██████████████▋                                                       | 46/219 [00:38<02:29,  1.16it/s][A

tensor(0.3205, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  21%|███████████████                                                       | 47/219 [00:38<02:28,  1.16it/s][A

tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▎                                                      | 48/219 [00:39<02:30,  1.13it/s][A

tensor(0.1256, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  22%|███████████████▋                                                      | 49/219 [00:40<02:29,  1.14it/s][A

tensor(0.0737, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|███████████████▉                                                      | 50/219 [00:41<02:25,  1.16it/s][A

tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  23%|████████████████▎                                                     | 51/219 [00:42<02:22,  1.18it/s][A

tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▌                                                     | 52/219 [00:43<02:20,  1.19it/s][A

tensor(0.2033, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  24%|████████████████▉                                                     | 53/219 [00:44<02:21,  1.17it/s][A

tensor(0.1037, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▎                                                    | 54/219 [00:44<02:19,  1.18it/s][A

tensor(0.1405, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  25%|█████████████████▌                                                    | 55/219 [00:45<02:16,  1.20it/s][A

tensor(0.0844, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|█████████████████▉                                                    | 56/219 [00:46<02:15,  1.20it/s][A

tensor(0.1738, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▏                                                   | 57/219 [00:47<02:15,  1.20it/s][A

tensor(0.1733, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  26%|██████████████████▌                                                   | 58/219 [00:48<02:12,  1.21it/s][A

tensor(0.2293, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|██████████████████▊                                                   | 59/219 [00:49<02:12,  1.20it/s][A

tensor(0.1175, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  27%|███████████████████▏                                                  | 60/219 [00:49<02:12,  1.20it/s][A

tensor(0.1541, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▍                                                  | 61/219 [00:50<02:12,  1.20it/s][A

tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  28%|███████████████████▊                                                  | 62/219 [00:51<02:09,  1.21it/s][A

tensor(0.1996, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▏                                                 | 63/219 [00:52<02:09,  1.21it/s][A

tensor(0.1694, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  29%|████████████████████▍                                                 | 64/219 [00:53<02:09,  1.20it/s][A

tensor(0.0508, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|████████████████████▊                                                 | 65/219 [00:54<02:08,  1.20it/s][A

tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  30%|█████████████████████                                                 | 66/219 [00:54<02:09,  1.18it/s][A

tensor(0.2018, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▍                                                | 67/219 [00:55<02:06,  1.20it/s][A

tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  31%|█████████████████████▋                                                | 68/219 [00:56<02:04,  1.21it/s][A

tensor(0.0875, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████                                                | 69/219 [00:57<02:03,  1.21it/s][A

tensor(0.0683, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▎                                               | 70/219 [00:58<02:01,  1.22it/s][A

tensor(0.2218, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  32%|██████████████████████▋                                               | 71/219 [00:58<02:00,  1.23it/s][A

tensor(0.1193, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████                                               | 72/219 [00:59<01:59,  1.23it/s][A

tensor(0.2666, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  33%|███████████████████████▎                                              | 73/219 [01:00<01:58,  1.24it/s][A

tensor(0.2319, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▋                                              | 74/219 [01:01<01:57,  1.24it/s][A

tensor(0.1451, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  34%|███████████████████████▉                                              | 75/219 [01:02<01:58,  1.21it/s][A

tensor(0.1038, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▎                                             | 76/219 [01:03<01:57,  1.22it/s][A

tensor(0.1844, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  35%|████████████████████████▌                                             | 77/219 [01:03<01:56,  1.22it/s][A

tensor(0.0456, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|████████████████████████▉                                             | 78/219 [01:04<01:54,  1.23it/s][A

tensor(0.0899, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  36%|█████████████████████████▎                                            | 79/219 [01:05<01:53,  1.24it/s][A

tensor(0.0272, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▌                                            | 80/219 [01:06<01:55,  1.21it/s][A

tensor(0.1906, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|█████████████████████████▉                                            | 81/219 [01:07<01:53,  1.22it/s][A

tensor(0.2311, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  37%|██████████████████████████▏                                           | 82/219 [01:07<01:52,  1.22it/s][A

tensor(0.1947, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▌                                           | 83/219 [01:08<01:54,  1.19it/s][A

tensor(0.1528, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  38%|██████████████████████████▊                                           | 84/219 [01:09<01:53,  1.19it/s][A

tensor(0.1116, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▏                                          | 85/219 [01:10<01:51,  1.21it/s][A

tensor(0.1057, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  39%|███████████████████████████▍                                          | 86/219 [01:11<01:51,  1.20it/s][A

tensor(0.1269, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|███████████████████████████▊                                          | 87/219 [01:12<01:49,  1.21it/s][A

tensor(0.2582, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  40%|████████████████████████████▏                                         | 88/219 [01:12<01:47,  1.22it/s][A

tensor(0.1951, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▍                                         | 89/219 [01:13<01:47,  1.21it/s][A

tensor(0.0967, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  41%|████████████████████████████▊                                         | 90/219 [01:14<01:45,  1.22it/s][A

tensor(0.0393, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████                                         | 91/219 [01:15<01:44,  1.22it/s][A

tensor(0.1084, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▍                                        | 92/219 [01:16<01:43,  1.23it/s][A

tensor(0.2250, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  42%|█████████████████████████████▋                                        | 93/219 [01:17<01:43,  1.21it/s][A

tensor(0.0508, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████                                        | 94/219 [01:17<01:42,  1.21it/s][A

tensor(0.0516, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  43%|██████████████████████████████▎                                       | 95/219 [01:18<01:41,  1.22it/s][A

tensor(0.1152, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|██████████████████████████████▋                                       | 96/219 [01:19<01:41,  1.21it/s][A

tensor(0.0320, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  44%|███████████████████████████████                                       | 97/219 [01:20<01:41,  1.20it/s][A

tensor(0.2655, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▎                                      | 98/219 [01:21<01:41,  1.19it/s][A

tensor(0.0614, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  45%|███████████████████████████████▋                                      | 99/219 [01:22<01:41,  1.19it/s][A

tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▌                                     | 100/219 [01:22<01:40,  1.19it/s][A

tensor(0.0799, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  46%|███████████████████████████████▊                                     | 101/219 [01:23<01:38,  1.20it/s][A

tensor(0.2766, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▏                                    | 102/219 [01:24<01:36,  1.21it/s][A

tensor(0.1723, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▍                                    | 103/219 [01:25<01:36,  1.20it/s][A

tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  47%|████████████████████████████████▊                                    | 104/219 [01:26<01:34,  1.21it/s][A

tensor(0.1394, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████                                    | 105/219 [01:27<01:35,  1.19it/s][A

tensor(0.1158, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  48%|█████████████████████████████████▍                                   | 106/219 [01:27<01:33,  1.21it/s][A

tensor(0.0478, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|█████████████████████████████████▋                                   | 107/219 [01:28<01:32,  1.21it/s][A

tensor(0.1302, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  49%|██████████████████████████████████                                   | 108/219 [01:29<01:31,  1.22it/s][A

tensor(0.1165, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▎                                  | 109/219 [01:30<01:29,  1.23it/s][A

tensor(0.1997, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  50%|██████████████████████████████████▋                                  | 110/219 [01:31<01:28,  1.23it/s][A

tensor(0.1000, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|██████████████████████████████████▉                                  | 111/219 [01:31<01:28,  1.22it/s][A

tensor(0.0815, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  51%|███████████████████████████████████▎                                 | 112/219 [01:32<01:27,  1.23it/s][A

tensor(0.1120, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▌                                 | 113/219 [01:33<01:26,  1.23it/s][A

tensor(0.0315, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  52%|███████████████████████████████████▉                                 | 114/219 [01:34<01:25,  1.23it/s][A

tensor(0.1335, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▏                                | 115/219 [01:35<01:26,  1.21it/s][A

tensor(0.4853, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▌                                | 116/219 [01:36<01:24,  1.22it/s][A

tensor(0.0624, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  53%|████████████████████████████████████▊                                | 117/219 [01:36<01:24,  1.20it/s][A

tensor(0.2636, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▏                               | 118/219 [01:37<01:23,  1.21it/s][A

tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  54%|█████████████████████████████████████▍                               | 119/219 [01:38<01:21,  1.22it/s][A

tensor(0.0890, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|█████████████████████████████████████▊                               | 120/219 [01:39<01:20,  1.23it/s][A

tensor(0.0634, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  55%|██████████████████████████████████████                               | 121/219 [01:40<01:20,  1.22it/s][A

tensor(0.1382, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▍                              | 122/219 [01:41<01:21,  1.19it/s][A

tensor(0.1744, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  56%|██████████████████████████████████████▊                              | 123/219 [01:41<01:21,  1.18it/s][A

tensor(0.2965, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████                              | 124/219 [01:42<01:19,  1.20it/s][A

tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  57%|███████████████████████████████████████▍                             | 125/219 [01:43<01:18,  1.19it/s][A

tensor(0.1278, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|███████████████████████████████████████▋                             | 126/219 [01:44<01:17,  1.20it/s][A

tensor(0.2666, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████                             | 127/219 [01:45<01:15,  1.21it/s][A

tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  58%|████████████████████████████████████████▎                            | 128/219 [01:46<01:16,  1.20it/s][A

tensor(0.0744, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▋                            | 129/219 [01:46<01:14,  1.21it/s][A

tensor(0.1431, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  59%|████████████████████████████████████████▉                            | 130/219 [01:47<01:12,  1.22it/s][A

tensor(0.0769, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▎                           | 131/219 [01:48<01:11,  1.23it/s][A

tensor(0.0432, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  60%|█████████████████████████████████████████▌                           | 132/219 [01:49<01:10,  1.23it/s][A

tensor(0.3602, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|█████████████████████████████████████████▉                           | 133/219 [01:50<01:09,  1.23it/s][A

tensor(0.1274, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  61%|██████████████████████████████████████████▏                          | 134/219 [01:50<01:08,  1.23it/s][A

tensor(0.0612, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▌                          | 135/219 [01:51<01:08,  1.23it/s][A

tensor(0.1886, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  62%|██████████████████████████████████████████▊                          | 136/219 [01:52<01:07,  1.23it/s][A

tensor(0.0924, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▏                         | 137/219 [01:53<01:07,  1.22it/s][A

tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▍                         | 138/219 [01:54<01:06,  1.22it/s][A

tensor(0.1303, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  63%|███████████████████████████████████████████▊                         | 139/219 [01:54<01:05,  1.22it/s][A

tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████                         | 140/219 [01:55<01:05,  1.20it/s][A

tensor(0.0451, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  64%|████████████████████████████████████████████▍                        | 141/219 [01:56<01:03,  1.22it/s][A

tensor(0.1429, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|████████████████████████████████████████████▋                        | 142/219 [01:57<01:02,  1.23it/s][A

tensor(0.1348, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  65%|█████████████████████████████████████████████                        | 143/219 [01:58<01:02,  1.22it/s][A

tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▎                       | 144/219 [01:59<01:00,  1.23it/s][A

tensor(0.0442, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  66%|█████████████████████████████████████████████▋                       | 145/219 [01:59<01:01,  1.21it/s][A

tensor(0.0203, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████                       | 146/219 [02:00<00:59,  1.22it/s][A

tensor(0.0654, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  67%|██████████████████████████████████████████████▎                      | 147/219 [02:01<00:58,  1.23it/s][A

tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▋                      | 148/219 [02:02<00:57,  1.23it/s][A

tensor(0.2214, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|██████████████████████████████████████████████▉                      | 149/219 [02:03<00:56,  1.23it/s][A

tensor(0.1883, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  68%|███████████████████████████████████████████████▎                     | 150/219 [02:03<00:56,  1.23it/s][A

tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▌                     | 151/219 [02:04<00:55,  1.23it/s][A

tensor(0.1192, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  69%|███████████████████████████████████████████████▉                     | 152/219 [02:05<00:55,  1.21it/s][A

tensor(0.1772, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▏                    | 153/219 [02:06<00:54,  1.20it/s][A

tensor(0.0837, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  70%|████████████████████████████████████████████████▌                    | 154/219 [02:07<00:53,  1.21it/s][A

tensor(0.0690, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|████████████████████████████████████████████████▊                    | 155/219 [02:08<00:52,  1.22it/s][A

tensor(0.1562, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  71%|█████████████████████████████████████████████████▏                   | 156/219 [02:08<00:52,  1.20it/s][A

tensor(0.1191, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▍                   | 157/219 [02:09<00:51,  1.21it/s][A

tensor(0.2462, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  72%|█████████████████████████████████████████████████▊                   | 158/219 [02:10<00:50,  1.22it/s][A

tensor(0.4257, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████                   | 159/219 [02:11<00:50,  1.20it/s][A

tensor(0.2281, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  73%|██████████████████████████████████████████████████▍                  | 160/219 [02:12<00:48,  1.21it/s][A

tensor(0.1318, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|██████████████████████████████████████████████████▋                  | 161/219 [02:13<00:47,  1.22it/s][A

tensor(0.2827, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████                  | 162/219 [02:13<00:47,  1.21it/s][A

tensor(0.2168, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  74%|███████████████████████████████████████████████████▎                 | 163/219 [02:14<00:47,  1.18it/s][A

tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▋                 | 164/219 [02:15<00:46,  1.18it/s][A

tensor(0.1949, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  75%|███████████████████████████████████████████████████▉                 | 165/219 [02:16<00:45,  1.19it/s][A

tensor(0.1516, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▎                | 166/219 [02:17<00:44,  1.19it/s][A

tensor(0.0627, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  76%|████████████████████████████████████████████████████▌                | 167/219 [02:18<00:43,  1.20it/s][A

tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|████████████████████████████████████████████████████▉                | 168/219 [02:19<00:43,  1.18it/s][A

tensor(0.2264, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  77%|█████████████████████████████████████████████████████▏               | 169/219 [02:19<00:41,  1.20it/s][A

tensor(0.0717, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▌               | 170/219 [02:20<00:40,  1.20it/s][A

tensor(0.1299, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  78%|█████████████████████████████████████████████████████▉               | 171/219 [02:21<00:39,  1.20it/s][A

tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▏              | 172/219 [02:22<00:38,  1.21it/s][A

tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▌              | 173/219 [02:23<00:37,  1.22it/s][A

tensor(0.1668, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  79%|██████████████████████████████████████████████████████▊              | 174/219 [02:23<00:37,  1.19it/s][A

tensor(0.0587, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▏             | 175/219 [02:24<00:36,  1.21it/s][A

tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  80%|███████████████████████████████████████████████████████▍             | 176/219 [02:25<00:35,  1.22it/s][A

tensor(0.0783, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|███████████████████████████████████████████████████████▊             | 177/219 [02:26<00:34,  1.23it/s][A

tensor(0.0664, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  81%|████████████████████████████████████████████████████████             | 178/219 [02:27<00:33,  1.23it/s][A

tensor(0.0619, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▍            | 179/219 [02:27<00:32,  1.23it/s][A

tensor(0.0514, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  82%|████████████████████████████████████████████████████████▋            | 180/219 [02:28<00:31,  1.23it/s][A

tensor(0.1478, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████            | 181/219 [02:29<00:31,  1.22it/s][A

tensor(0.1021, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  83%|█████████████████████████████████████████████████████████▎           | 182/219 [02:30<00:30,  1.22it/s][A

tensor(0.0515, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▋           | 183/219 [02:31<00:29,  1.23it/s][A

tensor(0.1370, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|█████████████████████████████████████████████████████████▉           | 184/219 [02:32<00:28,  1.23it/s][A

tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  84%|██████████████████████████████████████████████████████████▎          | 185/219 [02:32<00:27,  1.23it/s][A

tensor(0.1454, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▌          | 186/219 [02:33<00:26,  1.23it/s][A

tensor(0.1098, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  85%|██████████████████████████████████████████████████████████▉          | 187/219 [02:34<00:25,  1.23it/s][A

tensor(0.1691, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▏         | 188/219 [02:35<00:25,  1.23it/s][A

tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  86%|███████████████████████████████████████████████████████████▌         | 189/219 [02:36<00:24,  1.24it/s][A

tensor(0.1346, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|███████████████████████████████████████████████████████████▊         | 190/219 [02:36<00:23,  1.24it/s][A

tensor(0.1413, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  87%|████████████████████████████████████████████████████████████▏        | 191/219 [02:37<00:22,  1.24it/s][A

tensor(0.0950, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▍        | 192/219 [02:38<00:21,  1.24it/s][A

tensor(0.0862, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  88%|████████████████████████████████████████████████████████████▊        | 193/219 [02:39<00:20,  1.24it/s][A

tensor(0.1239, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████        | 194/219 [02:40<00:20,  1.23it/s][A

tensor(0.1674, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▍       | 195/219 [02:40<00:19,  1.23it/s][A

tensor(0.1692, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  89%|█████████████████████████████████████████████████████████████▊       | 196/219 [02:41<00:18,  1.23it/s][A

tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████       | 197/219 [02:42<00:17,  1.23it/s][A

tensor(0.1416, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  90%|██████████████████████████████████████████████████████████████▍      | 198/219 [02:43<00:17,  1.21it/s][A

tensor(0.1642, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|██████████████████████████████████████████████████████████████▋      | 199/219 [02:44<00:16,  1.20it/s][A

tensor(0.0570, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  91%|███████████████████████████████████████████████████████████████      | 200/219 [02:45<00:15,  1.20it/s][A

tensor(0.2702, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▎     | 201/219 [02:45<00:14,  1.21it/s][A

tensor(0.1099, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  92%|███████████████████████████████████████████████████████████████▋     | 202/219 [02:46<00:13,  1.22it/s][A

tensor(0.1802, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|███████████████████████████████████████████████████████████████▉     | 203/219 [02:47<00:13,  1.22it/s][A

tensor(0.2031, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  93%|████████████████████████████████████████████████████████████████▎    | 204/219 [02:48<00:12,  1.21it/s][A

tensor(0.4522, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▌    | 205/219 [02:49<00:11,  1.20it/s][A

tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  94%|████████████████████████████████████████████████████████████████▉    | 206/219 [02:50<00:10,  1.21it/s][A

tensor(0.1711, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▏   | 207/219 [02:50<00:09,  1.20it/s][A

tensor(0.1822, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▌   | 208/219 [02:51<00:09,  1.21it/s][A

tensor(0.0438, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  95%|█████████████████████████████████████████████████████████████████▊   | 209/219 [02:52<00:08,  1.20it/s][A

tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▏  | 210/219 [02:53<00:07,  1.21it/s][A

tensor(0.1974, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  96%|██████████████████████████████████████████████████████████████████▍  | 211/219 [02:54<00:06,  1.21it/s][A

tensor(0.1204, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|██████████████████████████████████████████████████████████████████▊  | 212/219 [02:55<00:05,  1.22it/s][A

tensor(0.0281, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  97%|███████████████████████████████████████████████████████████████████  | 213/219 [02:55<00:04,  1.21it/s][A

tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▍ | 214/219 [02:56<00:04,  1.18it/s][A

tensor(0.1289, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  98%|███████████████████████████████████████████████████████████████████▋ | 215/219 [02:57<00:03,  1.18it/s][A

tensor(0.0893, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████ | 216/219 [02:58<00:02,  1.15it/s][A

tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration:  99%|████████████████████████████████████████████████████████████████████▎| 217/219 [02:59<00:01,  1.15it/s][A

tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|████████████████████████████████████████████████████████████████████▋| 218/219 [03:00<00:00,  1.17it/s][A

tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)



Iteration: 100%|█████████████████████████████████████████████████████████████████████| 219/219 [03:01<00:00,  1.21it/s][A
Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 3/3 [08:46<00:00, 175.58s/it]


In [26]:
import csv
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
                   output_dir=None, max_seq_length=32, do_train=False, do_eval=False, do_lower_case=False,
                   train_batch_size=32, eval_batch_size=8, learning_rate=5e-5, num_train_epochs=3,
                   warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
                   optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):


    # ## Required parameters
    # parser.add_argument("--data_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # parser.add_argument("--task_name",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The name of the task to train.")
    # parser.add_argument("--output_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    # parser.add_argument("--max_seq_length",
    #                     default=128,
    #                     type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. \n"
    #                          "Sequences longer than this will be truncated, and sequences shorter \n"
    #                          "than this will be padded.")
    # parser.add_argument("--do_train",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run training.")
    # parser.add_argument("--do_eval",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_lower_case",
    #                     default=False,
    #                     action='store_true',
    #                     help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--train_batch_size",
    #                     default=32,
    #                     type=int,
    #                     help="Total batch size for training.")
    # parser.add_argument("--eval_batch_size",
    #                     default=8,
    #                     type=int,
    #                     help="Total batch size for eval.")
    # parser.add_argument("--learning_rate",
    #                     default=5e-5,
    #                     type=float,
    #                     help="The initial learning rate for Adam.")
    # parser.add_argument("--num_train_epochs",
    #                     default=3.0,
    #                     type=float,
    #                     help="Total number of training epochs to perform.")
    # parser.add_argument("--warmup_proportion",
    #                     default=0.1,
    #                     type=float,
    #                     help="Proportion of training to perform linear learning rate warmup for. "
    #                          "E.g., 0.1 = 10%% of training.")
    # parser.add_argument("--no_cuda",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether not to use CUDA when available")
    # parser.add_argument("--local_rank",
    #                     type=int,
    #                     default=-1,
    #                     help="local_rank for distributed training on gpus")
    # parser.add_argument('--seed',
    #                     type=int,
    #                     default=42,
    #                     help="random seed for initialization")
    # parser.add_argument('--gradient_accumulation_steps',
    #                     type=int,
    #                     default=1,
    #                     help="Number of updates steps to accumulate before performing a backward/update pass.")
    # parser.add_argument('--optimize_on_cpu',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to perform optimization and keep the optimizer averages on CPU")
    # parser.add_argument('--fp16',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to use 16-bit float precision instead of 32-bit")
    # parser.add_argument('--loss_scale',
    #                     type=float, default=128,
    #                     help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    # args = parser.parse_args()

    processors = {
#         "cola": ColaProcessor,
#         "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info("16-bits training currently not supported in distributed training")
            fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if do_train:
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory ({}) already exists and is not emp1ty.".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(bert_model,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2)
    if fp16:
        model.half()
    model.to(device)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                          output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
    t_total = num_train_steps
#     print(t_total)
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if do_train:
        optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()
        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

        torch.save(model.state_dict(), output_dir + "output.pth")


    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(data_dir)
#         eval_examples = processor.get_dev_examples(data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

        model.load_state_dict(torch.load(saved_model))

        model.eval()
        # eval_loss, eval_accuracy = 0, 0

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        eval_loss, eval_macro_p, eval_macro_r = 0, 0, 0

        raw_score = []

        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c

            raw_score += zip(logits, label_ids)
            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)

            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1


        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r)

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r)

        eval_loss = eval_loss / nb_eval_steps
        result = {'eval_loss': eval_loss,
                  'eval_micro_p': eval_micro_p,
                  'eval_micro_r': eval_micro_r,
                  'eval_micro_f1': eval_micro_f1,
                  # 'eval_macro_p': eval_macro_p,
                  # 'eval_macro_r': eval_macro_r,
                  # 'eval_macro_f1': eval_macro_f1,
#                   'global_step': global_step,
                  # 'loss': tr_loss/nb_tr_steps
                  }

        output_eval_file = os.path.join(output_dir, "test_eval_results.txt")
        output_raw_score = os.path.join(output_dir, "test_raw_score.csv")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(output_raw_score, 'w') as fout:
            fields = ["undermine_score", "support_score", "gold"]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "gold": str(gold)
                })

In [None]:
def experiments():
    data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
    # data_dir_output = data_dir + "output2/"
    data_dir_output = "D:/Projects/Stance/Models/"
    train_and_test(data_dir=data_dir, do_train=True, do_eval=True, output_dir=data_dir_output,task_name="Mrpc")


In [17]:
def evaluation_with_pretrained():
    bert_model = "D:/Projects/Stance/Models/output.pth"
    data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
    # data_dir_output = data_dir + "output2/"
    data_dir_output = "D:/Projects/Stance/Evaluation/bert_dummy_output/"
    train_and_test(data_dir=data_dir, do_train=False, do_eval=True, output_dir=data_dir_output,task_name="Mrpc",saved_model=bert_model)

In [27]:
if __name__ == "__main__":
#     experiments()
    evaluation_with_pretrained()

03/02/2020 14:22:21 - INFO - run_classifier -   device cuda n_gpu 1 distributed training False
03/02/2020 14:22:21 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\arsen\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
03/02/2020 14:22:22 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\arsen\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/02/2020 14:22:22 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\arsen\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e