# RATIO 2019 - Benchmarking Workshop

In [1]:
import pandas as pd
import numpy as np
import pickle
import csv
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# GW split
with open("data/distinct_sets/within.pkl", "rb") as f:
    within_train_df = pickle.load(f)
    within_dev_df = pickle.load(f)
print(len(within_train_df))
print(len(within_dev_df))

42302
21601


In [25]:
# AH split
if False:
    with open("../argmining19-same-side-classification/data/distinct_sets/within/within_train_arg_pickle.pkl", "rb") as f:
        within_train_df = pickle.load(f)
    with open("../argmining19-same-side-classification/data/distinct_sets/within/within_dev_arg_pickle.pkl", "rb") as f:
        within_dev_df = pickle.load(f)
    tmp_all_train = set(within_train_df.argument1.tolist() + within_train_df.argument2.tolist())
    tmp_all_dev = set(within_dev_df.argument1.tolist() + within_dev_df.argument2.tolist())
    print(len(tmp_all_train))
    print(len(tmp_all_dev))
    print(len(tmp_all_train.intersection(tmp_all_dev)))

11967
1553
0


In [3]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=4

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=4


In [4]:
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score, f1_score
def report_training_results(y_test, y_pred):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))  
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), 2))  #
    print()

    print('Report:')
    print(classification_report(y_test, y_pred))  
    f1_dic = {}
    
    f1_dic['macro'] = round(f1_score(y_pred=y_pred, y_true=y_test, average='macro'), 2)
    f1_dic['micro'] = round(f1_score(y_pred=y_pred, y_true=y_test, average='micro'), 2)
    return f1_dic

In [5]:
if False:
    # Adding a tag for the topics in focus: "gay marriage" and "abortion"
    def add_tag(row):
        title = row['topic'].lower().strip()
        if title.find('abortion') > -1 :
            row['tag'] = 'abortion'
        elif title.find('gay marriage') > -1 :
            row['tag'] = 'gay marriage'
        else:
            row['tag'] = 'NA'
        return row

    within_train_df = within_train_df.apply(add_tag, axis=1)
    within_dev_df = within_dev_df.apply(add_tag, axis=1)
    # within_test_df = within_test_df.apply(add_tag, axis=1)
    
    with open("dev_tagged_data.pkl", "wb") as f:
        # pickle.dump(cross_train_df, f)
        # pickle.dump(cross_test_df, f)
        pickle.dump(within_train_df, f)
        pickle.dump(within_dev_df, f)

In [6]:
with open("dev_tagged_data.pkl", "rb") as f:
    within_train_df = pickle.load(f)
    within_dev_df = pickle.load(f)

In [7]:
def get_formatted_dataset(df):
    dataset = list(zip(df.argument1.tolist(), df.argument2.tolist(), df.is_same_side.tolist()))
    return dataset

In [8]:
X_train = get_formatted_dataset(within_dev_df)
X_dev = get_formatted_dataset(within_train_df)

In [9]:
# randomization experiments: 
# Exp 1: we shuffle the order of sentences in the dev set
# Exp 2: we shuffle the order of sentences in the training set

from nltk.tokenize import sent_tokenize
from random import shuffle

def shuffle_sentences(text):
    s = sent_tokenize(text)
    shuffle(s)
    return " ".join(s)

In [10]:
if False:
    X_dev_rnd = []
    for item in X_dev:
        X_dev_rnd.append((shuffle_sentences(item[0]), shuffle_sentences(item[1]), item[2]))
    X_dev = X_dev_rnd

    X_train_rnd = []
    for item in X_train:
        X_train_rnd.append((shuffle_sentences(item[0]), shuffle_sentences(item[1]), item[2]))
    X_train = X_train_rnd

# Within topic

In [11]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import random
import json

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
import random
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from tensorboardX import SummaryWriter

from transformers import (WEIGHTS_NAME, BertConfig, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

from transformers import AdamW, WarmupLinearSchedule

from utils import (convert_examples_to_features, output_modes, processors, BertForBinaryClassification)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [12]:
args = {
    'data_dir': 'data/',
    'model_type':  'bert',
    'model_name': 'bert-base-uncased',
    'task_name': 'binary',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'do_train': True,
    'do_eval': True,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'max_seq_length': 512,
    'output_mode': 'classification',
    'train_batch_size': 8,
    'eval_batch_size': 8,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 3,
    'weight_decay': 0,
    'learning_rate': 5e-6,
    'adam_epsilon': 1e-9,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 0,
    'evaluate_during_training': True,
    'save_steps': 1000,
    'eval_all_checkpoints': True,
    'overwrite_output_dir': False,
    'reprocess_input_data': True,
    'notes': 'SameSide argument classification task'
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForBinaryClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

In [14]:
config = config_class.from_pretrained(args['model_name'], num_labels=1, finetuning_task=args['task_name'])
tokenizer = tokenizer_class.from_pretrained(args['model_name'])

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /srv/home/gwiedemann/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "binary",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.tokenization_utils:loading file https://s3.am

In [15]:
model = model_class.from_pretrained(args['model_name'], num_labels=1)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /srv/home/gwiedemann/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://s3.am

In [16]:
model.to(device)

BertForBinaryClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [17]:
task = args['task_name']

processor = processors[task](X_train, X_dev)
label_list = processor.get_labels()
num_labels = 1

In [18]:
def load_and_cache_examples(task, tokenizer, evaluate=False):
    processor = processors[task](X_train, X_dev)
    output_mode = args['output_mode']
    
    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(args['data_dir'], f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}")
    
    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
               
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args['data_dir']) if evaluate else processor.get_train_examples(args['data_dir'])
        
        features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer, output_mode,
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            pad_on_left=bool(args['model_type'] in ['xlnet']),                 # pad on the left for xlnet
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
        
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
        
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    # labels
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

                                        
from pprint import pprint
                                        
def train(train_dataset, model, tokenizer):
    tb_writer = SummaryWriter()
    
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])
    
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']
    
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total)
    
    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])
        
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")
    
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()

                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args['logging_steps'], global_step)
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)


    return global_step, tr_loss / global_step

In [19]:
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, accuracy_score, f1_score
from scipy.stats import pearsonr

def get_mismatched(labels, preds):
    mismatched = labels != preds
    examples = processor.get_dev_examples(args['data_dir'])
    wrong = [i for (i, v) in zip(examples, mismatched) if v]
    
    return wrong

def get_eval_report(labels, preds):
    
    print(labels)
    print(preds)
    
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='binary')
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "acc" : acc,
        "f1" : f1
    }, get_mismatched(labels, preds)

def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def evaluate(model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args['output_dir']

    results = {}
    EVAL_TASK = args['task_name']

    eval_dataset = load_and_cache_examples(EVAL_TASK, tokenizer, evaluate=True)
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)


    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    
    sigmoid_squash = torch.nn.Sigmoid()
    
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            
            logits = sigmoid_squash(logits)

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy())
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args['output_mode'] == "classification":
        # preds = np.argmax(preds, axis=1)
        preds = np.round(preds).astype(int)
    elif args['output_mode'] == "regression":
        preds = np.squeeze(preds)
    # print(preds)
    result, wrong = compute_metrics(EVAL_TASK, preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results, wrong

In [20]:
from tensorboardX import SummaryWriter

In [21]:
if args['do_train']:
    train_dataset = load_and_cache_examples(task, tokenizer)
    global_step, tr_loss = train(train_dataset, model, tokenizer)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

INFO:__main__:Creating features from dataset file at data/
100%|██████████| 21601/21601 [00:30<00:00, 707.29it/s]
INFO:__main__:Saving features into cached file data/cached_train_bert-base-uncased_512_binary
INFO:__main__:***** Running training *****
INFO:__main__:  Num examples = 21601
INFO:__main__:  Num Epochs = 3
INFO:__main__:  Total train batch size  = 8
INFO:__main__:  Gradient Accumulation steps = 1
INFO:__main__:  Total optimization steps = 8103


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=2701, style=ProgressStyle(description_width='…

0.556303

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-1000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-1000


0.386785

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-2000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-2000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-2000


0.921141


HBox(children=(IntProgress(value=0, description='Iteration', max=2701, style=ProgressStyle(description_width='…

0.762638

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-3000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-3000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-3000


0.564348

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-4000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-4000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-4000


0.701034

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-5000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-5000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-5000


0.147875


HBox(children=(IntProgress(value=0, description='Iteration', max=2701, style=ProgressStyle(description_width='…

0.585581

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-6000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-6000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-6000


0.825923

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-7000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-7000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-7000


0.128840

INFO:transformers.configuration_utils:Configuration saved in outputs/checkpoint-8000/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/checkpoint-8000/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs/checkpoint-8000


0.209498

INFO:__main__: global_step = 8103, average loss = 0.5176098676032672


0.363011



In [22]:
if args['do_train']:
    if not os.path.exists(args['output_dir']):
            os.makedirs(args['output_dir'])
    logger.info("Saving model checkpoint to %s", args['output_dir'])
    
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args['output_dir'])
    tokenizer.save_pretrained(args['output_dir'])
    torch.save(args, os.path.join(args['output_dir'], 'training_args.bin'))

INFO:__main__:Saving model checkpoint to outputs/
INFO:transformers.configuration_utils:Configuration saved in outputs/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs/pytorch_model.bin


In [23]:
## Evaluation

In [24]:
if args['do_eval']:
    results = {}
    checkpoints = [args['output_dir']]
    if args['eval_all_checkpoints']:
        checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + '/**/' + WEIGHTS_NAME, recursive=True)))
        logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        model = model_class.from_pretrained(checkpoint)
        model.to(device)
        result, wrong_preds = evaluate(model, tokenizer, prefix=global_step)
        result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)

INFO:__main__:Evaluate the following checkpoints: ['outputs/checkpoint-1000', 'outputs/checkpoint-2000', 'outputs/checkpoint-3000', 'outputs/checkpoint-4000', 'outputs/checkpoint-5000', 'outputs/checkpoint-6000', 'outputs/checkpoint-7000', 'outputs/checkpoint-8000', 'outputs']
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-1000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 0 1 ... 1 0 0]


INFO:__main__:***** Eval results 1000 *****
INFO:__main__:  acc = 0.48711644839487495
INFO:__main__:  f1 = 0.5272790657137877
INFO:__main__:  fn = 12818
INFO:__main__:  fp = 8878
INFO:__main__:  mcc = -0.024706177707549033
INFO:__main__:  tn = 8506
INFO:__main__:  tp = 12100
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-2000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_u

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 0 1 ... 1 0 0]


INFO:__main__:***** Eval results 2000 *****
INFO:__main__:  acc = 0.472909082312893
INFO:__main__:  f1 = 0.4540267881191997
INFO:__main__:  fn = 15647
INFO:__main__:  fp = 6650
INFO:__main__:  mcc = -0.010638181832048548
INFO:__main__:  tn = 10734
INFO:__main__:  tp = 9271
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-3000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_uti

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 0 1 ... 1 0 0]


INFO:__main__:***** Eval results 3000 *****
INFO:__main__:  acc = 0.5097158526783604
INFO:__main__:  f1 = 0.5469835306452318
INFO:__main__:  fn = 12397
INFO:__main__:  fp = 8343
INFO:__main__:  mcc = 0.022205390207535553
INFO:__main__:  tn = 9041
INFO:__main__:  tp = 12521
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-4000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_uti

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 1 1 ... 1 0 0]


INFO:__main__:***** Eval results 4000 *****
INFO:__main__:  acc = 0.5041605597844073
INFO:__main__:  f1 = 0.5494576307593169
INFO:__main__:  fn = 12128
INFO:__main__:  fp = 8847
INFO:__main__:  mcc = 0.004298636879224015
INFO:__main__:  tn = 8537
INFO:__main__:  tp = 12790
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-5000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_uti

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 1 1 ... 1 0 1]


INFO:__main__:***** Eval results 5000 *****
INFO:__main__:  acc = 0.5150347501300175
INFO:__main__:  f1 = 0.5776807954381704
INFO:__main__:  fn = 10887
INFO:__main__:  fp = 9628
INFO:__main__:  mcc = 0.009161148296950108
INFO:__main__:  tn = 7756
INFO:__main__:  tp = 14031
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-6000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_uti

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 1 1 ... 1 0 1]


INFO:__main__:***** Eval results 6000 *****
INFO:__main__:  acc = 0.5186043213086852
INFO:__main__:  f1 = 0.5820025452604787
INFO:__main__:  fn = 10741
INFO:__main__:  fp = 9623
INFO:__main__:  mcc = 0.015265277673864002
INFO:__main__:  tn = 7761
INFO:__main__:  tp = 14177
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-7000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_uti

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 1 1 ... 1 0 1]


INFO:__main__:***** Eval results 7000 *****
INFO:__main__:  acc = 0.5199281357855421
INFO:__main__:  f1 = 0.5861928437525472
INFO:__main__:  fn = 10534
INFO:__main__:  fp = 9774
INFO:__main__:  mcc = 0.014923827617581707
INFO:__main__:  tn = 7610
INFO:__main__:  tp = 14384
INFO:transformers.configuration_utils:loading configuration file outputs/checkpoint-8000/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_uti

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 1 1 ... 1 0 1]


INFO:__main__:***** Eval results 8000 *****
INFO:__main__:  acc = 0.5161930877972672
INFO:__main__:  f1 = 0.5790101616818201
INFO:__main__:  fn = 10844
INFO:__main__:  fp = 9622
INFO:__main__:  mcc = 0.011215701500123606
INFO:__main__:  tn = 7762
INFO:__main__:  tp = 14074
INFO:transformers.configuration_utils:loading configuration file outputs/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weigh

HBox(children=(IntProgress(value=0, description='Evaluating', max=5288, style=ProgressStyle(description_width=…


[1. 1. 0. ... 1. 0. 0.]
[0 1 1 ... 1 0 1]


INFO:__main__:***** Eval results outputs *****
INFO:__main__:  acc = 0.5161458087088081
INFO:__main__:  f1 = 0.5783271528636177
INFO:__main__:  fn = 10882
INFO:__main__:  fp = 9586
INFO:__main__:  mcc = 0.01175182735489496
INFO:__main__:  tn = 7798
INFO:__main__:  tp = 14036


In [None]:
# adversarial: mix 2 arguments -> predict if mixed or not (sameside true -> reverse gradient, false -> ?)