### Notebook for Challenging Dataset in Section 9
In this snippet of code, you can evaluate the performance on challanging dataset of the model you finetuned on QQP or QNLI.
You will need to finetune the model on QQP or QNLI first.
Refer to Readme.md for detailed procedures about GLUE finetuning.
Then, put the finetuned model's path in `finetuned_model_path`.

#### Load Model

In [4]:
from transformers import RobertaForSequenceClassification, BertTokenizer
finetuned_model_path = "" #Please put the model path finetuned with QNLI or QQP here.
                         #This directory should contain config.json and pytorch_model.bin
model = RobertaForSequenceClassification.from_pretrained('/work/temp_mount/qnli/repeat-2')
tokenizer = BertTokenizer.from_pretrained('/work/temp_mount/qqp/flat_parentheses')
model.eval()
model.cuda()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 512, padding_idx=1)
      (position_embeddings): Embedding(512, 512, padding_idx=1)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerN

#### Evaluate on QNLI-adv

In [5]:
import torch
from transformers import (
    HfArgumentParser, EvalPrediction,
    Trainer,
    TrainingArguments,
    glue_output_modes,
    glue_compute_metrics,
    glue_tasks_num_labels,
    set_seed, GlueDataset
)
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from torch.utils.data.sampler import SequentialSampler
data_args = DataTrainingArguments(task_name = 'qnli', data_dir = 'glue_data/QNLI-adv/')
#uncomment the following line to run QQP-PAWS
#data_args = DataTrainingArguments(task_name = 'qqp', data_dir = 'glue_data/QQP-PAWS')
eval_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir='.')
    )
eval_sampler = SequentialSampler(eval_dataset)
from torch.utils.data.dataloader import DataLoader
from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
data_collator = default_data_collator
eval_dataloader = DataLoader(
  eval_dataset,
  batch_size = 1,
  sampler = eval_sampler,
  collate_fn = data_collator,
  drop_last = False,
)
result = [[],[]]
from tqdm import tqdm
#for i, data in tqdm(enumerate(eval_dataloader)):
#  with torch.no_grad():
#    output = model(data['input_ids'].cuda())
#    pred = output[-1].detach().argmax(-1).cpu()
#    if pred == data['labels']:
#      result[0].append(i)
#    else :
#      result[1].append(i)
    
        
#print("Accuracy: ", len(result[0]) / (len(result[0]) + len(result[1])))



In [6]:
# If the previous block does not work ,use this one
import numpy as np
from typing import Callable, Dict, Optional
output_mode = glue_output_modes['qqp'] # change to qqp if you are running QQP-PAWS
def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

training_args = TrainingArguments(output_dir =  '.', do_eval = True)
trainer = Trainer(
        model=model,
        args=training_args, 
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )
trainer.compute_metrics = build_compute_metrics_fn('qqp') # change to qqp if you are running QQP-PAWS
result = trainer.evaluate(eval_dataset)
print(result)

(5463,)
{'eval_loss': 2.982379674911499, 'eval_acc': 0.006772835438403808, 'eval_f1': 0.013454545454545455, 'eval_acc_and_f1': 0.010113690446474631, 'eval_runtime': 17.2771, 'eval_samples_per_second': 316.198}
