In [1]:
import torch
import random
import numpy as np
import os

from tqdm import tqdm, trange
# torch.cuda.empty_cache()
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.optimization import BertAdam

In [2]:
from run_classifier import StanceProcessor, MrpcProcessor, logger, convert_examples_to_features,\
    set_optimizer_params_grad, copy_optimizer_params_to_model, accuracy, p_r_f1, tp_pcount_gcount, convert_claims_to_features, convert_pers_to_features

In [3]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    logger.info('There are %d GPU(s) available.' % (n_gpu))
    logger.info('We will use the GPU:')
    logger.info(torch.cuda.get_device_name(0))

# If not...
else:
    logger.info('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

06/11/2020 10:30:21 - INFO - run_classifier -   There are 1 GPU(s) available.
06/11/2020 10:30:21 - INFO - run_classifier -   We will use the GPU:
06/11/2020 10:30:21 - INFO - run_classifier -   GeForce GTX 1050 Ti


In [4]:
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertPreTrainedModel, BertModel, BertConfig
from torch.nn import BCEWithLogitsLoss, CosineEmbeddingLoss,CrossEntropyLoss, MSELoss

06/11/2020 10:30:21 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.
06/11/2020 10:30:23 - INFO - transformers.file_utils -   TensorFlow version 2.1.0 available.


In [5]:
class BertForConsistencyCueClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=2):
        super(BertForConsistencyCueClassification, self).__init__(config)
        self.num_labels = num_labels

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size*4+1, num_labels)
        self.classifier2 = torch.nn.Linear(config.hidden_size*4, num_labels)
        self.apply(self.init_bert_weights)
#         self.init_weights()

#     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        position_ids2=None,
        head_mask2=None,
        inputs_embeds2=None,
        labels2=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """

        _, outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
#             position_ids=position_ids,
#             head_mask=head_mask,
#             inputs_embeds=inputs_embeds,
        )

        _, outputs2 = self.bert(
            input_ids2,
            attention_mask=attention_mask2,
            token_type_ids=token_type_ids2,
#             position_ids=position_ids2,
#             head_mask=head_mask2,
#             inputs_embeds=inputs_embeds2,
        )

        pooled_output = outputs
        pooled_output2 = outputs2

        pooled_output = self.dropout(pooled_output)
        pooled_output2 = self.dropout(pooled_output2)
        
#         A series of different concatenations(concat(),|minus|,multiply, ...)
        final_output_cat = torch.cat((pooled_output, pooled_output2),1)
        final_output_minus = torch.abs(pooled_output-pooled_output2)
        final_output_mult = torch.mul(pooled_output, pooled_output2)
#         final_output_mimu = torch.cat((final_output_minus, final_output_mult),1)
#         final_output_camu = torch.cat((final_output_cat, final_output_mult),1)
#         final_output_cami = torch.cat((final_output_cat, final_output_minus),1)
        final_output_camimu = torch.cat((final_output_cat, final_output_minus, final_output_mult),1)
    
        cos_pooled_outputs = torch.cosine_similarity(pooled_output, pooled_output2, dim=1)
#         1
#         torch.Size([hidden_size*2, 768])
#         2
#         torch.Size([hidden_size, 768])
#         3
#         torch.Size([hidden_size, 768])
#         4
#         torch.Size([hidden_size*2, 768])
#         5
#         torch.Size([hidden_size*3, 768])
#         6
#         torch.Size([hidden_size*3, 768])
#         7
#         torch.Size([hidden_size*4, 768])
        
        batch_size = list(pooled_output.size())[0]
        hidden_size = list(pooled_output.size())[1]
        
        final_output_all = torch.cat((final_output_camimu, cos_pooled_outputs.unsqueeze(1)),1)
        logits_ce = self.classifier(final_output_all)
#         print('logits_ce:')
#         print(logits_ce)
        
#         logits_ori = self.classifier2(final_output_camimu)
#         print('logits_ori:')
#         print(logits_ori)

        #Calculate loss during training process
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct_ce = CrossEntropyLoss()
                loss_ce = loss_fct_ce(logits_ce.view(-1, self.num_labels), labels.view(-1))
                print('loss_ce:')
                print(loss_ce)

#                 loss_ori = loss_fct_ce(logits_ori.view(-1, self.num_labels), labels.view(-1))
#                 print('loss_ori:')
#                 print(loss_ori)
                loss_fct_cos = CosineEmbeddingLoss()

                labels2[labels2==0] = -1
                loss_cos = loss_fct_cos(pooled_output, pooled_output2, labels2)
                labels2[labels2==-1] = 0
                print('loss_cos:')
                print(loss_cos)
            
                loss = loss_ce
                print('final loss:')
                print(loss)
                
#             outputs = (loss,) + outputs
#             outputs = (loss,) + logits_cos 
                outputs = loss
                return outputs
        else:
            #Get predictions when doing evaluation
            return logits_ce
        
          # (loss), logits, (hidden_states), (attentions)


In [6]:
# # Get all of the model's parameters as a list of tuples.
# params = list(model.named_parameters())

# print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# print('==== Embedding Layer ====\n')

# for p in params[0:5]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# print('\n==== First Transformer ====\n')

# for p in params[5:21]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
# print('\n==== Output Layer ====\n')

# for p in params[-4:]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [7]:
# data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
# # data_dir = "D:/Projects/Stance/Dataset/OnlyNew/"
# data_dir_output = "D:/Projects/Stance/Models/Consistency_Cues/"
# output_dir=data_dir_output
# max_seq_length=32
# max_grad_norm = 1.0
# num_training_steps = 1000
# num_warmup_steps = 100
# warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
# # warmup_proportion = 0.1
# # train_batch_size=32
# train_batch_size=16
# eval_batch_size=8
# learning_rate=5e-5
# num_train_epochs=4
# local_rank=-1
# seed=19
# gradient_accumulation_steps=1
# loss_scale=128
# train_batch_size = int(train_batch_size / gradient_accumulation_steps)

# processors = {
#         "mrpc": MrpcProcessor,
#     }

# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)
    
# os.makedirs(output_dir, exist_ok=True)
# processor = processors['mrpc']()
# label_list = processor.get_labels()
# num_labels = len(label_list)
# # print('label list')
# # print(label_list)

# train_examples = processor.get_train_examples(data_dir)
# num_train_steps = int(
#     len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

# ##preprare optimizer
# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'gamma', 'beta']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
#     ]
# t_total = num_train_steps
# optimizer = BertAdam(optimizer_grouped_parameters,
#                          lr=learning_rate,
#                          warmup=warmup_proportion,
#                          t_total=t_total)
# # optimizer = AdamW(optimizer_grouped_parameters,
# #                   lr = learning_rate, # args.learning_rate - default is 5e-5, our notebook had 2e-5
# #                   eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
# #                   correct_bias=False
# #                 )

# # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)  # PyTorch scheduler

In [8]:
# global_step = 0
# claim_features = convert_claims_to_features(train_examples, label_list, max_seq_length, tokenizer)
# train_features = convert_pers_to_features(train_examples, label_list, max_seq_length, tokenizer)

# logger.info("***** Running training *****")
# logger.info("  Num examples = %d", len(train_examples))
# logger.info("  Batch size = %d", train_batch_size)
# logger.info("  Num steps = %d", num_train_steps)


# all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
# all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
# all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
# all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

# claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
# claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
# claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
# claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)

# train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

# # claims_data = TensorDataset(claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids)
# # claims_sampler = RandomSampler(claims_data)
# # claims_dataloader = DataLoader(claims_data, sampler=train_sampler, batch_size=train_batch_size)

In [9]:
# model.train()
# for _ in trange(int(num_train_epochs), desc="Epoch"):
#     tr_loss = 0
#     nb_tr_examples, nb_tr_steps = 0, 0
#     for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
#         batch = tuple(t.to(device) for t in batch)
#         input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids = batch
# #         ce_loss = model(input_ids, segment_ids, input_mask, label_ids)
# #         cos_loss = model(claim_input_ids, claim_segment_ids, claim_input_mask, claim_label_ids)
        
# #         print("start")
# #         print(input_ids)
# #         print(input_mask)
# #         print(segment_ids)
# #         print(label_ids)
# #         print(claim_input_ids)
# #         print(claim_input_mask)
# #         print(claim_segment_ids)
# #         print(claim_label_ids)
# #         print("end")
    
#         out_results = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids)
# #         loss = ce_loss + cos_loss
#         print("out_results:")
#         print(out_results)
#         loss = out_results
# #         print(cos_loss)
# #         print(loss.item())
#         if n_gpu > 1:
#             loss = loss.mean() # mean() to average on multi-gpu.
# #         if fp16 and loss_scale != 1.0:
# #             # rescale loss for fp16 training
# #             # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
# #             loss = loss * loss_scale
#         if gradient_accumulation_steps > 1:
#             loss = loss / gradient_accumulation_steps
#         loss.backward()
        
#         tr_loss += loss.item()
#         nb_tr_examples += input_ids.size(0)
#         nb_tr_steps += 1
#         if (step + 1) % gradient_accumulation_steps == 0:
# #             if fp16 or optimize_on_cpu:
# #                 if fp16 and loss_scale != 1.0:
# #                     # scale down gradients for fp16 training
# #                     for param in model.parameters():
# #                         if param.grad is not None:
# #                             param.grad.data = param.grad.data / loss_scale           
# #                 is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
# #                 if is_nan:
# #                     logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
# #                     loss_scale = loss_scale / 2
# #                     model.zero_grad()
# #                     continue 
# #                 optimizer.step()
# # #                 scheduler.step()
# #                 copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
# #             else:
# #                 torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#             optimizer.step()
# #                 scheduler.step()
#             model.zero_grad()
#             global_step += 1

        
# ## v2: concat
# ## v3: multiply
# model_to_save = model.module if hasattr(model, 'module') else model
# torch.save(model.state_dict(), output_dir + "cosloss_camimu_siamese_bert_epoch4.pth")
# # torch.save(model_to_save.state_dict(), output_dir + "cos_camimu_siamese_bert.bin")

In [14]:
import csv
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
                   output_dir=None, max_seq_length=32, do_train=False, do_eval=False, do_lower_case=False,
                   train_batch_size=16, eval_batch_size=8, learning_rate=5e-5, num_train_epochs=5,
                   warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=19, gradient_accumulation_steps=1,
                   optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):
    


    # ## Required parameters
    # parser.add_argument("--data_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # parser.add_argument("--task_name",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The name of the task to train.")
    # parser.add_argument("--output_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    # parser.add_argument("--max_seq_length",
    #                     default=128,
    #                     type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. \n"
    #                          "Sequences longer than this will be truncated, and sequences shorter \n"
    #                          "than this will be padded.")
    # parser.add_argument("--do_train",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run training.")
    # parser.add_argument("--do_eval",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_lower_case",
    #                     default=False,
    #                     action='store_true',
    #                     help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--train_batch_size",
    #                     default=32,
    #                     type=int,
    #                     help="Total batch size for training.")
    # parser.add_argument("--eval_batch_size",
    #                     default=8,
    #                     type=int,
    #                     help="Total batch size for eval.")
    # parser.add_argument("--learning_rate",
    #                     default=5e-5,
    #                     type=float,
    #                     help="The initial learning rate for Adam.")
    # parser.add_argument("--num_train_epochs",
    #                     default=3.0,
    #                     type=float,
    #                     help="Total number of training epochs to perform.")
    # parser.add_argument("--warmup_proportion",
    #                     default=0.1,
    #                     type=float,
    #                     help="Proportion of training to perform linear learning rate warmup for. "
    #                          "E.g., 0.1 = 10%% of training.")
    # parser.add_argument("--no_cuda",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether not to use CUDA when available")
    # parser.add_argument("--local_rank",
    #                     type=int,
    #                     default=-1,
    #                     help="local_rank for distributed training on gpus")
    # parser.add_argument('--seed',
    #                     type=int,
    #                     default=42,
    #                     help="random seed for initialization")
    # parser.add_argument('--gradient_accumulation_steps',
    #                     type=int,
    #                     default=1,
    #                     help="Number of updates steps to accumulate before performing a backward/update pass.")
    # parser.add_argument('--optimize_on_cpu',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to perform optimization and keep the optimizer averages on CPU")
    # parser.add_argument('--fp16',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to use 16-bit float precision instead of 32-bit")
    # parser.add_argument('--loss_scale',
    #                     type=float, default=128,
    #                     help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    # args = parser.parse_args()

    
    processors = {
#         "cola": ColaProcessor,
#         "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "stance":StanceProcessor
    }

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info("16-bits training currently not supported in distributed training")
            fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if do_train:
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory ({}) already exists and is not emp1ty.".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

#     tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    

    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        
        num_train_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

    # Prepare model
#     model = BertForSequenceClassification.from_pretrained(bert_model,
#                 cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2)

        model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2)
        model.to(device)
        
        if fp16:
            model.half()

        if local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                              output_device=local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer
        if fp16:
            param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                                for n, param in model.named_parameters()]
        elif optimize_on_cpu:
            param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                                for n, param in model.named_parameters()]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
            ]
        t_total = num_train_steps
#     print(t_total)
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if do_train:
        optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:
        claim_features = convert_claims_to_features(train_examples, label_list, max_seq_length, tokenizer)
        train_features = convert_pers_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids)

        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()
        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids = batch
                
                out_results = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids)
#                 loss = model(input_ids, segment_ids, input_mask, label_ids)
                print("out_results:")
                print(out_results)
                loss = out_results
            
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

        torch.save(model.state_dict(), output_dir + "cos_camimu_siamese_bert_epoch5.pth")


    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
#         eval_examples = processor.get_test_examples(data_dir)
        eval_examples = processor.get_dev_examples(data_dir)
        claim_features = convert_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        eval_features = convert_pers_to_features(
            eval_examples, label_list, max_seq_length, tokenizer)
            
    
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        
        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)
        
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids)
        # Run prediction for full data
#         eval_sampler = SequentialSampler(eval_data)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)
#         print('all_input_ids:')
#         print(all_input_ids)
        
        

#         model.load_state_dict(torch.load(saved_model))
        model_state_dict = torch.load(saved_model)
        model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2, state_dict=model_state_dict)
        model.to(device)
        
        model.eval()
        # eval_loss, eval_accuracy = 0, 0

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        eval_loss, eval_macro_p, eval_macro_r = 0, 0, 0

        raw_score = []

        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            claim_input_ids = claim_input_ids.to(device)
            claim_input_mask = claim_input_mask.to(device)
            claim_segment_ids = claim_segment_ids.to(device)
            claim_label_ids = claim_label_ids.to(device)

#             print("start")
#             print(input_ids)
#             print(input_mask)
#             print(segment_ids)
#             print(label_ids)
#             print(claim_input_ids)
#             print(claim_input_mask)
#             print(claim_segment_ids)
#             print(claim_label_ids)
#             print("end")
            with torch.no_grad():
                tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids)
                
                logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask)
            
            print(logits)
#             print(logits[0])
            logits = logits.detach().cpu().numpy()
            print(logits)
            label_ids = label_ids.to('cpu').numpy()
#             print(label_ids)

            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c
            
            pred_label = np.argmax(logits, axis=1)
            raw_score += zip(logits, pred_label, label_ids)
            
            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)

            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1


        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r)

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r)

        eval_loss = eval_loss / nb_eval_steps
        result = {
                  'eval_loss': eval_loss,
                  'eval_micro_p': eval_micro_p,
                  'eval_micro_r': eval_micro_r,
                  'eval_micro_f1': eval_micro_f1,
                  'eval_macro_p': eval_macro_p,
                  'eval_macro_r': eval_macro_r,
                  'eval_macro_f1': eval_macro_f1,
#                   'global_step': global_step,
#                   'loss': tr_loss/nb_tr_steps
                  }

        output_eval_file = os.path.join(output_dir, "cos_camimu_siamese_bert_epoch5_siamese_bert_test_eval_results.txt")
        output_raw_score = os.path.join(output_dir, "cos_camimu_siamese_bert_epoch5_siamese_bert_test_raw_score.csv")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(output_raw_score, 'w') as fout:
            fields = ["undermine_score", "support_score","predict_label", "gold"]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, pred, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "predict_label": str(pred),
                    "gold": str(gold)
                })

In [15]:
def experiments():
    data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
    
    data_dir_output = "D:/Projects/Stance/Models/Siamese_bert/"
    train_and_test(data_dir=data_dir, do_train=True, do_eval=False, output_dir=data_dir_output,task_name="stance")


In [18]:
def evaluation_with_pretrained():
    bert_model = "D:/Projects/Stance/Models/Siamese_bert/cos_camimu_siamese_bert_epoch5.pth"
    data_dir = "D:/Jupyter/data/dataset/perspective_stances/"
    
    data_dir_output = "D:/Projects/Stance/Evaluation/local_output/"
    train_and_test(data_dir=data_dir, do_train=False, do_eval=True, output_dir=data_dir_output,task_name="stance",saved_model=bert_model)

In [19]:
if __name__ == "__main__":
#     experiments()
    evaluation_with_pretrained()

06/11/2020 11:22:46 - INFO - run_classifier -   device cuda n_gpu 1 distributed training False
06/11/2020 11:22:56 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\arsen\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
06/11/2020 11:22:56 - INFO - run_classifier -   *** Claim Example ***
06/11/2020 11:22:56 - INFO - run_classifier -   guid: dev-1
06/11/2020 11:22:56 - INFO - run_classifier -   tokens: [CLS] va ##cci ##nation must be made compulsory [SEP]
06/11/2020 11:22:56 - INFO - run_classifier -   input_ids: 101 12436 14693 9323 2442 2022 2081 14770 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2020 11:22:56 - INFO - run_classifier -   input_mask: 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2020 11:22:56 - INFO - run_classifier -  

06/11/2020 11:23:19 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\arsen\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir C:\Users\arsen\AppData\Local\Temp\tmp3cyxyqwi
06/11/2020 11:23:23 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



loss_ce:
tensor(1.9456, device='cuda:0')
loss_cos:
tensor(0.3183, device='cuda:0')
final loss:
tensor(1.9456, device='cuda:0')
tensor([[-2.1112,  2.4401],
        [-1.0531,  1.6990],
        [-0.8156,  1.3100],
        [ 3.9571, -2.8537],
        [ 3.0434, -1.9618],
        [ 1.0019, -0.1636],
        [-5.1143,  5.3516],
        [ 7.9747, -7.1257]], device='cuda:0')
[[-2.1111927   2.4401238 ]
 [-1.0530676   1.6990179 ]
 [-0.8156326   1.3100159 ]
 [ 3.9570687  -2.8537042 ]
 [ 3.0434203  -1.9618332 ]
 [ 1.0019134  -0.16360112]
 [-5.114317    5.3515525 ]
 [ 7.974735   -7.1257157 ]]
loss_ce:
tensor(2.2898, device='cuda:0')
loss_cos:
tensor(0.2666, device='cuda:0')
final loss:
tensor(2.2898, device='cuda:0')
tensor([[-5.2317,  5.6109],
        [-4.2373,  4.5113],
        [-3.4224,  3.9094],
        [-3.3702,  3.7086],
        [-5.1148,  5.4045],
        [-3.6862,  4.1105],
        [ 7.9258, -6.9202],
        [ 9.4822, -8.6011]], device='cuda:0')
[[-5.2317176  5.6108522]
 [-4.237316   4.5112

loss_ce:
tensor(1.1265e-05, device='cuda:0')
loss_cos:
tensor(0.0930, device='cuda:0')
final loss:
tensor(1.1265e-05, device='cuda:0')
tensor([[ 7.0735, -6.0897],
        [-4.3531,  5.1946],
        [-5.5050,  6.1664],
        [ 7.7407, -6.7460],
        [ 7.0119, -6.0763],
        [ 7.7797, -6.8107],
        [ 7.9752, -6.9830],
        [ 6.6217, -5.6458]], device='cuda:0')
[[ 7.07346   -6.089705 ]
 [-4.3530784  5.194648 ]
 [-5.5050335  6.1664047]
 [ 7.740675  -6.746016 ]
 [ 7.0118985 -6.076316 ]
 [ 7.779715  -6.810684 ]
 [ 7.975152  -6.9830165]
 [ 6.6217475 -5.6458087]]
loss_ce:
tensor(4.3511e-06, device='cuda:0')
loss_cos:
tensor(0.1138, device='cuda:0')
final loss:
tensor(4.3511e-06, device='cuda:0')
tensor([[ 7.7377, -6.7319],
        [ 6.5273, -5.4726],
        [ 7.8565, -6.8322],
        [ 6.9879, -5.9365],
        [-5.6007,  6.3893],
        [-5.6075,  6.3292],
        [-5.5835,  6.3959],
        [-5.5928,  6.3773]], device='cuda:0')
[[ 7.7376943 -6.731932 ]
 [ 6.5272856 -5.4725

loss_ce:
tensor(0.0001, device='cuda:0')
loss_cos:
tensor(0.1006, device='cuda:0')
final loss:
tensor(0.0001, device='cuda:0')
tensor([[ 5.2938, -4.9407],
        [ 5.2938, -4.9407],
        [ 5.2938, -4.9407],
        [-3.9779,  4.4168],
        [-4.3858,  4.3754],
        [-4.0152,  4.2624],
        [-4.4381,  4.6916],
        [-4.1402,  4.5454]], device='cuda:0')
[[ 5.293839  -4.9407187]
 [ 5.293839  -4.9407187]
 [ 5.293839  -4.9407187]
 [-3.977911   4.416843 ]
 [-4.3858404  4.375351 ]
 [-4.0152     4.2624393]
 [-4.4380746  4.69156  ]
 [-4.1401877  4.5454106]]
loss_ce:
tensor(0.0115, device='cuda:0')
loss_cos:
tensor(0.2293, device='cuda:0')
final loss:
tensor(0.0115, device='cuda:0')
tensor([[-1.3708,  1.1450],
        [-3.3294,  3.7668],
        [-2.7865,  3.2651],
        [-2.0519,  2.5545],
        [-4.5174,  4.5922],
        [-3.7723,  4.0051],
        [-3.6427,  3.3652],
        [-5.1105,  4.9127]], device='cuda:0')
[[-1.3707896  1.1449504]
 [-3.3293912  3.766784 ]
 [-2.786508

loss_ce:
tensor(0.0257, device='cuda:0')
loss_cos:
tensor(0.1118, device='cuda:0')
final loss:
tensor(0.0257, device='cuda:0')
tensor([[-5.3776,  6.2809],
        [-4.8050,  5.9828],
        [-4.8202,  5.9005],
        [-4.9881,  6.1842],
        [-3.1663,  2.4617],
        [-1.2592,  0.7393],
        [-1.5464,  1.0164],
        [-3.8170,  4.2355]], device='cuda:0')
[[-5.377603   6.2808986]
 [-4.8049517  5.9827523]
 [-4.820188   5.900534 ]
 [-4.9881144  6.18425  ]
 [-3.1663365  2.461735 ]
 [-1.2591957  0.7392931]
 [-1.5464113  1.0164089]
 [-3.8169518  4.2354507]]
loss_ce:
tensor(3.1650, device='cuda:0')
loss_cos:
tensor(0.6300, device='cuda:0')
final loss:
tensor(3.1650, device='cuda:0')
tensor([[ 4.1796, -3.3662],
        [-0.3850,  1.1073],
        [-5.5504,  6.1205],
        [-3.2386,  4.1049],
        [ 2.2795, -1.2741],
        [-2.5152,  3.4407],
        [-2.1027,  2.4100],
        [ 2.4321, -1.7824]], device='cuda:0')
[[ 4.1796327  -3.3661835 ]
 [-0.38504538  1.1073453 ]
 [-5.55

loss_ce:
tensor(0.0020, device='cuda:0')
loss_cos:
tensor(0.3707, device='cuda:0')
final loss:
tensor(0.0020, device='cuda:0')
tensor([[ 9.1228, -8.5770],
        [-5.6502,  6.4951],
        [-5.5816,  6.2858],
        [-5.6834,  6.4836],
        [-5.5479,  6.4030],
        [-5.0289,  6.0133],
        [-5.5484,  6.3983],
        [-1.3732,  2.7492]], device='cuda:0')
[[ 9.122796  -8.576993 ]
 [-5.6501684  6.4950747]
 [-5.581617   6.2857842]
 [-5.683412   6.483599 ]
 [-5.5479016  6.4030147]
 [-5.0289307  6.0133233]
 [-5.5484295  6.398274 ]
 [-1.3731898  2.7492497]]
loss_ce:
tensor(4.1602, device='cuda:0')
loss_cos:
tensor(0.3334, device='cuda:0')
final loss:
tensor(4.1602, device='cuda:0')
tensor([[-2.4681,  3.0455],
        [ 5.6163, -5.0057],
        [ 7.6998, -7.0947],
        [-5.4763,  6.2761],
        [-3.5543,  4.5309],
        [-6.2997,  7.1397],
        [ 5.9979, -5.2663],
        [ 6.4187, -5.6548]], device='cuda:0')
[[-2.468054   3.0455203]
 [ 5.616306  -5.0056825]
 [ 7.699837

loss_ce:
tensor(1.4087, device='cuda:0')
loss_cos:
tensor(0.4018, device='cuda:0')
final loss:
tensor(1.4087, device='cuda:0')
tensor([[-2.0842,  1.9833],
        [-0.9830,  1.0751],
        [-2.3284,  2.3227],
        [-2.7699,  2.7527],
        [-1.0396,  0.9328],
        [-1.6435,  1.4388],
        [ 0.8525, -0.8502],
        [ 6.7288, -6.3025]], device='cuda:0')
[[-2.0842297   1.9832721 ]
 [-0.98301405  1.0750625 ]
 [-2.328356    2.3227336 ]
 [-2.7699463   2.7527006 ]
 [-1.0396432   0.93279064]
 [-1.6434534   1.4387833 ]
 [ 0.85251504 -0.85023296]
 [ 6.728805   -6.302549  ]]
loss_ce:
tensor(0.0168, device='cuda:0')
loss_cos:
tensor(0.4261, device='cuda:0')
final loss:
tensor(0.0168, device='cuda:0')
tensor([[ 3.7611, -3.2274],
        [-5.3155,  4.5477],
        [-4.2354,  3.9468],
        [-3.9267,  3.9757],
        [-3.7924,  3.7941],
        [-1.1317,  1.4635],
        [-1.3825,  1.6910],
        [-1.8857,  2.3037]], device='cuda:0')
[[ 3.7610743 -3.2273843]
 [-5.315451   4.5476

loss_ce:
tensor(0.0366, device='cuda:0')
loss_cos:
tensor(0.0732, device='cuda:0')
final loss:
tensor(0.0366, device='cuda:0')
tensor([[-1.0572,  0.6451],
        [ 3.2624, -3.4070],
        [ 2.6663, -2.8338],
        [ 4.7560, -4.9633],
        [ 4.9687, -5.3674],
        [-2.0807,  1.6304],
        [-1.3354,  0.9757],
        [-3.4933,  3.0289]], device='cuda:0')
[[-1.057155    0.64509517]
 [ 3.262357   -3.4070363 ]
 [ 2.6662943  -2.833763  ]
 [ 4.7559648  -4.9633384 ]
 [ 4.968721   -5.367384  ]
 [-2.080713    1.6303657 ]
 [-1.3353992   0.9757317 ]
 [-3.4933312   3.0289114 ]]
loss_ce:
tensor(0.0526, device='cuda:0')
loss_cos:
tensor(0.0470, device='cuda:0')
final loss:
tensor(0.0526, device='cuda:0')
tensor([[ 0.8892, -0.6785],
        [ 1.9695, -1.9209],
        [ 1.9502, -1.9447],
        [ 4.1029, -4.4443],
        [-1.0743,  0.7149],
        [-1.7930,  1.6654],
        [ 2.7728, -2.8908],
        [ 3.1946, -3.3155]], device='cuda:0')
[[ 0.88922006 -0.6784547 ]
 [ 1.9694575  -1.9

loss_ce:
tensor(1.3537, device='cuda:0')
loss_cos:
tensor(0.1305, device='cuda:0')
final loss:
tensor(1.3537, device='cuda:0')
tensor([[-2.3860,  2.4531],
        [-0.8147,  1.4078],
        [-0.5282,  1.1851],
        [ 2.6339, -2.0589],
        [-0.4902,  0.4612],
        [ 1.7396, -1.4228],
        [ 1.6542, -1.4177],
        [ 0.5782, -0.1193]], device='cuda:0')
[[-2.386047    2.4530747 ]
 [-0.81467587  1.4078292 ]
 [-0.5282068   1.1851422 ]
 [ 2.633914   -2.0588915 ]
 [-0.490179    0.46115297]
 [ 1.7395777  -1.4227982 ]
 [ 1.6542307  -1.4177483 ]
 [ 0.5782484  -0.11932477]]
loss_ce:
tensor(0.4450, device='cuda:0')
loss_cos:
tensor(0., device='cuda:0')
final loss:
tensor(0.4450, device='cuda:0')
tensor([[ 1.4473, -1.1325],
        [ 2.0370, -1.6696],
        [-0.9422,  2.0139],
        [ 0.7078, -0.2482],
        [ 1.9912, -1.6741],
        [ 1.6937, -1.2694],
        [ 1.8278, -1.2221],
        [ 2.6612, -2.1084]], device='cuda:0')
[[ 1.4472724  -1.1325114 ]
 [ 2.0369692  -1.66961

loss_ce:
tensor(0.0856, device='cuda:0')
loss_cos:
tensor(0.1439, device='cuda:0')
final loss:
tensor(0.0856, device='cuda:0')
tensor([[-3.4242,  3.6540],
        [ 1.4241, -0.3287],
        [ 6.8466, -5.9549],
        [-1.7411,  1.9782],
        [-0.0564,  0.3983],
        [-4.4177,  4.5259],
        [-1.9850,  2.8092],
        [ 9.5784, -8.8623]], device='cuda:0')
[[-3.424239    3.6539948 ]
 [ 1.4240805  -0.32870573]
 [ 6.846634   -5.9548573 ]
 [-1.7411388   1.9781839 ]
 [-0.05642858  0.398268  ]
 [-4.41766     4.5258946 ]
 [-1.9849888   2.8092406 ]
 [ 9.578406   -8.86229   ]]
loss_ce:
tensor(7.9870e-06, device='cuda:0')
loss_cos:
tensor(0.0123, device='cuda:0')
final loss:
tensor(7.9870e-06, device='cuda:0')
tensor([[  7.0378,  -5.9085],
        [ 11.2483, -10.5742],
        [ -6.0647,   7.0530],
        [ -4.7355,   5.2158],
        [  6.0715,  -5.4443],
        [ 10.2363,  -9.5742],
        [ -6.0879,   6.9725],
        [  8.1584,  -7.6388]], device='cuda:0')
[[  7.0377545  -5.908

loss_ce:
tensor(0.0130, device='cuda:0')
loss_cos:
tensor(0., device='cuda:0')
final loss:
tensor(0.0130, device='cuda:0')
tensor([[ 7.6596, -6.6050],
        [ 2.4380, -1.8021],
        [ 2.6426, -2.1759],
        [ 1.5730, -1.0435],
        [ 3.0198, -2.5487],
        [ 3.5398, -2.8049],
        [ 3.1892, -2.4959],
        [ 3.3903, -2.6440]], device='cuda:0')
[[ 7.659556  -6.6049905]
 [ 2.4379551 -1.8020555]
 [ 2.642647  -2.1758876]
 [ 1.5730351 -1.043461 ]
 [ 3.0197527 -2.548679 ]
 [ 3.5397937 -2.8048685]
 [ 3.1891723 -2.4958725]
 [ 3.3902566 -2.644008 ]]
loss_ce:
tensor(0.3357, device='cuda:0')
loss_cos:
tensor(0.1090, device='cuda:0')
final loss:
tensor(0.3357, device='cuda:0')
tensor([[ 3.3330, -2.7548],
        [ 1.0687, -0.4618],
        [-3.0387,  2.9152],
        [ 0.1487, -0.6283],
        [-1.1665,  1.0589],
        [-4.3220,  4.3833],
        [ 0.0489, -0.4713],
        [-0.7888,  0.5164]], device='cuda:0')
[[ 3.3329587  -2.754792  ]
 [ 1.068698   -0.4618485 ]
 [-3.03871 

loss_ce:
tensor(1.2943, device='cuda:0')
loss_cos:
tensor(0.1473, device='cuda:0')
final loss:
tensor(1.2943, device='cuda:0')
tensor([[-3.6815,  4.2251],
        [ 1.3501, -0.4999],
        [ 6.1630, -5.5486],
        [-0.6494,  1.5336],
        [ 3.1256, -2.3873],
        [-3.1036,  3.4929],
        [-4.0864,  4.3580],
        [-2.5282,  2.5863]], device='cuda:0')
[[-3.681539    4.2250915 ]
 [ 1.3501399  -0.49992085]
 [ 6.162981   -5.5486197 ]
 [-0.6493817   1.5336434 ]
 [ 3.1256316  -2.3873217 ]
 [-3.1036117   3.4928868 ]
 [-4.0863757   4.358005  ]
 [-2.528198    2.5862734 ]]
loss_ce:
tensor(0.0226, device='cuda:0')
loss_cos:
tensor(0.0709, device='cuda:0')
final loss:
tensor(0.0226, device='cuda:0')
tensor([[ 2.5765, -2.1978],
        [ 1.1609, -0.6692],
        [ 2.6448, -2.3071],
        [-2.3250,  2.0297],
        [-3.5645,  3.5042],
        [-3.2660,  3.0925],
        [-3.5130,  3.4829],
        [-4.7124,  4.5733]], device='cuda:0')
[[ 2.5764954 -2.1977665]
 [ 1.160939  -0.6692

loss_ce:
tensor(0.3329, device='cuda:0')
loss_cos:
tensor(0.1160, device='cuda:0')
final loss:
tensor(0.3329, device='cuda:0')
tensor([[ 2.7730, -2.6314],
        [ 2.9688, -2.8918],
        [-3.3705,  2.9300],
        [ 0.6389, -0.4275],
        [-0.6159,  0.7297],
        [ 0.0477,  0.1860],
        [-2.3258,  2.1150],
        [ 3.0779, -2.1986]], device='cuda:0')
[[ 2.7730196  -2.6313715 ]
 [ 2.9687824  -2.8918264 ]
 [-3.3705266   2.92996   ]
 [ 0.6388826  -0.42751077]
 [-0.61592644  0.7297383 ]
 [ 0.0476988   0.18602717]
 [-2.3258402   2.1150417 ]
 [ 3.077876   -2.198593  ]]
loss_ce:
tensor(0.6077, device='cuda:0')
loss_cos:
tensor(0.3307, device='cuda:0')
final loss:
tensor(0.6077, device='cuda:0')
tensor([[ 5.2123, -4.4770],
        [ 6.0517, -5.7828],
        [-0.1406,  0.9737],
        [-1.9122,  2.4883],
        [ 4.5288, -3.7527],
        [ 3.4515, -2.5833],
        [-4.8452,  4.6090],
        [-0.8375,  0.8981]], device='cuda:0')
[[ 5.2123003  -4.4770036 ]
 [ 6.051719   -5.7

loss_ce:
tensor(0.8649, device='cuda:0')
loss_cos:
tensor(0.6117, device='cuda:0')
final loss:
tensor(0.8649, device='cuda:0')
tensor([[ 1.2791, -1.6616],
        [ 2.1727, -2.4902],
        [ 0.5304,  0.0544],
        [ 0.9592, -0.3566],
        [ 0.8110, -0.2155],
        [-1.2035,  1.8962],
        [-1.5608,  2.0757],
        [-2.8910,  3.3050]], device='cuda:0')
[[ 1.2791225  -1.66162   ]
 [ 2.1726825  -2.4901521 ]
 [ 0.5304078   0.05444863]
 [ 0.9592039  -0.35660535]
 [ 0.8110059  -0.21547744]
 [-1.2034633   1.8961705 ]
 [-1.5608386   2.0757372 ]
 [-2.8909686   3.3050354 ]]
loss_ce:
tensor(1.6837, device='cuda:0')
loss_cos:
tensor(0.4329, device='cuda:0')
final loss:
tensor(1.6837, device='cuda:0')
tensor([[-1.8376,  1.5990],
        [ 0.1647,  0.4757],
        [-2.6727,  1.9877],
        [ 4.3257, -4.3345],
        [-4.3948,  3.8118],
        [ 2.2961, -2.5218],
        [-3.3524,  2.6029],
        [-3.8700,  3.0435]], device='cuda:0')
[[-1.837644    1.5989754 ]
 [ 0.16468185  0.4

loss_ce:
tensor(1.6146, device='cuda:0')
loss_cos:
tensor(0.3331, device='cuda:0')
final loss:
tensor(1.6146, device='cuda:0')
tensor([[ 6.1318, -5.1221],
        [ 5.6675, -4.8507],
        [ 9.6784, -8.9233],
        [-4.9656,  5.8024],
        [-5.1109,  5.9299],
        [-3.5044,  4.0543],
        [-0.3385,  1.1128],
        [ 8.7751, -7.9149]], device='cuda:0')
[[ 6.1318493  -5.122144  ]
 [ 5.667518   -4.8507047 ]
 [ 9.678392   -8.923278  ]
 [-4.965552    5.8023877 ]
 [-5.1109004   5.92992   ]
 [-3.5044458   4.0542803 ]
 [-0.33854973  1.1128424 ]
 [ 8.775103   -7.914867  ]]
loss_ce:
tensor(0.0098, device='cuda:0')
loss_cos:
tensor(0., device='cuda:0')
final loss:
tensor(0.0098, device='cuda:0')
tensor([[ 10.7882, -10.1308],
        [ 10.4733,  -9.8758],
        [  8.4943,  -7.5722],
        [ 10.4183,  -9.8489],
        [  2.6618,  -1.7336],
        [ 10.7167, -10.1065],
        [ 10.7664, -10.1424],
        [  1.8587,  -0.8271]], device='cuda:0')
[[ 10.788197  -10.130754 ]
 [ 10.

loss_ce:
tensor(0.4777, device='cuda:0')
loss_cos:
tensor(0.4693, device='cuda:0')
final loss:
tensor(0.4777, device='cuda:0')
tensor([[ 6.0524, -5.8130],
        [ 0.1082,  0.4037],
        [-0.4456,  0.9291],
        [-2.6668,  2.8510],
        [-2.9251,  2.9619],
        [ 0.9131, -0.1504],
        [-3.1891,  3.4143],
        [ 3.6332, -2.9610]], device='cuda:0')
[[ 6.0523906  -5.8130407 ]
 [ 0.10816325  0.40368104]
 [-0.4455618   0.9290548 ]
 [-2.6668212   2.8509855 ]
 [-2.925061    2.96191   ]
 [ 0.9131014  -0.15039474]
 [-3.1891177   3.414302  ]
 [ 3.6332233  -2.9610069 ]]
loss_ce:
tensor(0.6670, device='cuda:0')
loss_cos:
tensor(0.2825, device='cuda:0')
final loss:
tensor(0.6670, device='cuda:0')
tensor([[ 0.8409, -0.3459],
        [ 6.1075, -5.8334],
        [ 4.0971, -3.5763],
        [ 5.7563, -5.4179],
        [ 4.0229, -3.2949],
        [-0.2784,  0.8853],
        [ 2.7338, -2.0240],
        [ 2.0486, -1.4366]], device='cuda:0')
[[ 0.8408691  -0.3459283 ]
 [ 6.1074605  -5.8

loss_ce:
tensor(3.7153, device='cuda:0')
loss_cos:
tensor(0.6321, device='cuda:0')
final loss:
tensor(3.7153, device='cuda:0')
tensor([[ 8.4378, -7.5529],
        [ 9.3029, -8.3781],
        [-2.8327,  3.1601],
        [ 0.9270, -0.4731],
        [-0.7026,  1.0726],
        [ 0.2879,  0.1302],
        [ 1.6975, -1.1358],
        [-0.6661,  0.6948]], device='cuda:0')
[[ 8.437751   -7.5529404 ]
 [ 9.302888   -8.378062  ]
 [-2.8326786   3.160066  ]
 [ 0.92696595 -0.47306952]
 [-0.7025563   1.072647  ]
 [ 0.2879195   0.13015275]
 [ 1.6974863  -1.1357504 ]
 [-0.6660741   0.69483805]]
loss_ce:
tensor(0.9308, device='cuda:0')
loss_cos:
tensor(0.4365, device='cuda:0')
final loss:
tensor(0.9308, device='cuda:0')
tensor([[-4.1447,  4.4462],
        [-1.7815,  2.2467],
        [ 1.5010, -0.7556],
        [ 0.8262, -0.5764],
        [-4.5761,  4.9147],
        [-5.2855,  5.5591],
        [-4.7720,  4.9157],
        [ 3.6142, -3.4939]], device='cuda:0')
[[-4.144699    4.4461837 ]
 [-1.7815182   2.2

loss_ce:
tensor(0.0006, device='cuda:0')
loss_cos:
tensor(0.0264, device='cuda:0')
final loss:
tensor(0.0006, device='cuda:0')
tensor([[ 5.4538, -5.3619],
        [ 3.2738, -3.0682],
        [ 3.4948, -3.2438],
        [ 3.6779, -3.2608],
        [-4.2542,  4.3173],
        [-4.3390,  4.2550],
        [-4.2524,  4.2092],
        [-4.8500,  5.0264]], device='cuda:0')
[[ 5.45379   -5.361934 ]
 [ 3.2737956 -3.068239 ]
 [ 3.494784  -3.2438314]
 [ 3.677922  -3.2608109]
 [-4.2541595  4.31729  ]
 [-4.338968   4.255012 ]
 [-4.25236    4.209185 ]
 [-4.850018   5.0264487]]
loss_ce:
tensor(0.0004, device='cuda:0')
loss_cos:
tensor(0.0322, device='cuda:0')
final loss:
tensor(0.0004, device='cuda:0')
tensor([[ -3.7766,   3.8110],
        [ -4.6708,   4.6726],
        [  4.2835,  -4.3634],
        [  3.1886,  -2.9996],
        [  4.1082,  -4.1560],
        [ -5.9912,   6.6201],
        [ 11.4616, -10.8160],
        [ -6.1435,   7.0499]], device='cuda:0')
[[ -3.7766488   3.8110292]
 [ -4.6707826   4.

  f1 = 2 * p * r / (p + r)


loss_ce:
tensor(1.2041, device='cuda:0')
loss_cos:
tensor(0.9477, device='cuda:0')
final loss:
tensor(1.2041, device='cuda:0')
tensor([[ 1.3640, -1.6419],
        [ 1.0717, -0.3736],
        [ 0.8238, -0.1026],
        [ 1.4445, -0.7733],
        [ 2.0518, -1.4012],
        [ 0.5554,  0.2420],
        [ 6.0486, -5.1024],
        [ 5.5300, -4.4780]], device='cuda:0')
[[ 1.3640443  -1.6418604 ]
 [ 1.0717369  -0.3735687 ]
 [ 0.8237986  -0.10259958]
 [ 1.4444944  -0.77327967]
 [ 2.0518134  -1.4012043 ]
 [ 0.55544657  0.24198681]
 [ 6.0486455  -5.1024423 ]
 [ 5.530005   -4.477967  ]]
loss_ce:
tensor(0.5601, device='cuda:0')
loss_cos:
tensor(0.2869, device='cuda:0')
final loss:
tensor(0.5601, device='cuda:0')
tensor([[ 4.9122, -3.7497],
        [-0.5352,  1.1368],
        [ 7.1063, -6.3619],
        [ 8.0297, -7.4681],
        [-0.6600,  1.3143],
        [-2.2239,  2.8949],
        [ 0.6177,  0.2504],
        [ 7.4682, -6.8385]], device='cuda:0')
[[ 4.9122353  -3.7496636 ]
 [-0.5352449   1.1

06/11/2020 11:24:26 - INFO - run_classifier -   ***** Eval results *****
06/11/2020 11:24:26 - INFO - run_classifier -     eval_loss = 0.9344146964602343
06/11/2020 11:24:26 - INFO - run_classifier -     eval_macro_f1 = 0.8365128890196813
06/11/2020 11:24:26 - INFO - run_classifier -     eval_macro_p = 0.8183978553253364
06/11/2020 11:24:26 - INFO - run_classifier -     eval_macro_r = 0.8554480189022176
06/11/2020 11:24:26 - INFO - run_classifier -     eval_micro_f1 = 0.8319847691575439
06/11/2020 11:24:26 - INFO - run_classifier -     eval_micro_p = 0.8323809523809523
06/11/2020 11:24:26 - INFO - run_classifier -     eval_micro_r = 0.8315889628924833


loss_ce:
tensor(0.0014, device='cuda:0')
loss_cos:
tensor(0.2894, device='cuda:0')
final loss:
tensor(0.0014, device='cuda:0')
tensor([[-3.8007,  3.7615],
        [-3.6100,  3.7648],
        [-3.3540,  3.5531],
        [-3.0096,  3.2826],
        [-2.6402,  2.9257],
        [-4.3520,  4.5767],
        [-3.5672,  3.7865],
        [-2.8672,  3.0934]], device='cuda:0')
[[-3.800697   3.7614791]
 [-3.610037   3.7647722]
 [-3.353997   3.5530803]
 [-3.0096157  3.282646 ]
 [-2.640158   2.9257405]
 [-4.3520365  4.5766897]
 [-3.5671952  3.78647  ]
 [-2.8671515  3.0933974]]
