### Evaluation of the model using the CUAD project's original metrics
This is a modified version that allows the evaluation to be run standalone. It will be converted into a script.


In [1]:
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits
from datasets import load_dataset

In [2]:
# Processor whose get_dev_examples() call below turns the test JSON file into `examples`.
processor = SquadV2Processor()

In [5]:
# Using the test file from the original repo.
# The data directory and the file name are passed separately; the progress bar
# recorded below shows that parsing takes more than a minute.
examples = processor.get_dev_examples('./', filename='./test.json')

100%|██████████| 102/102 [01:39<00:00,  1.03it/s]


In [2]:
# Model family; later cells use it to pick the model inputs and to name output files.
model_type = "roberta"
# Location the config, tokenizer and weights are all loaded from.
model_path = 'roberta-base/'

config = AutoConfig.from_pretrained(model_path, cache_dir=None)

# The slow (Python) tokenizer is requested on purpose: SquadDataset is not
# compatible with fast tokenizers, which have a smarter overflow handling.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    do_lower_case=True,
    use_fast=False,
)

# Weights are read as a TensorFlow checkpoint only when the path contains ".ckpt".
model = AutoModelForQuestionAnswering.from_pretrained(
    model_path,
    from_tf=".ckpt" in model_path,
    config=config,
    cache_dir=None,
)

In [19]:
# Settings forwarded to squad_convert_examples_to_features below.
max_seq_length = 512
max_query_length = 64
doc_stride = 128
threads = 12
evaluate = True  # evaluation run, so `is_training` below is False

# Very slow processing time (4-5 hours)
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=not evaluate,
    return_dataset="pt",
    threads=threads,
)

convert squad examples to features: 100%|██████████| 4182/4182 [1:28:19<00:00,  1.27s/it]
add example index and unique id: 100%|██████████| 4182/4182 [00:00<00:00, 7611.60it/s] 


In [20]:
# Slow processing time
# Persist the converted features together with the dataset and the examples so
# the conversion above does not have to be repeated.
torch.save(
    {
        "features": features,
        "dataset": dataset,
        "examples": examples,
    },
    './datafeats',
)

In [40]:
# Reload the cached artefacts from './datafeats'.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load cache
# files that you created yourself.
data = torch.load('./datafeats')

# Look every entry up by its key rather than unpacking data.values()
# positionally, so a change in the saved dict's key order cannot silently
# swap the three objects.
features = data["features"]
dataset = data["dataset"]
examples = data["examples"]

batch_size = 32
# A sequential sampler walks the dataset in index order.
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=batch_size)

Evaluating

In [41]:
def to_list(tensor):
    """Return the contents of ``tensor`` as plain (nested) Python lists.

    The tensor is first detached from the autograd graph and moved to the
    CPU, then converted with ``tolist()``.
    """
    cpu_tensor = tensor.detach().cpu()
    return cpu_tensor.tolist()

In [43]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
print()

# Extra diagnostics that are only available on a CUDA device (sizes in GB).
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 1024**3, 1), 'GB')

# Move the model to the selected device before running inference.
model.to(device)
print()

Using device: cuda

Tesla T4
Memory Usage:
Allocated: 0.5 GB
Cached:    0.8 GB



In [44]:
from tqdm import tqdm

# One SquadResult per feature, appended in dataloader order.
all_results = []

# These model types are called without `token_type_ids`. The check does not
# depend on the batch, so it is evaluated once instead of on every iteration.
pass_token_type_ids = model_type not in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]

# Evaluation mode only needs to be set once, not once per batch.
model.eval()

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = tuple(t.to(device) for t in batch)

    # Batch layout as used here: 0 = input ids, 1 = attention mask,
    # 2 = token type ids, 3 = index of each row's entry in `features`.
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
    }
    if pass_token_type_ids:
        inputs["token_type_ids"] = batch[2]
    feature_indices = batch[3]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    for i, feature_index in enumerate(feature_indices):
        eval_feature = features[feature_index.item()]
        unique_id = int(eval_feature.unique_id)

        # for own model
        # start_logits, end_logits = outputs[0][i], outputs[1][i]
        # Row i of every output tensor, converted to plain Python lists.
        output = [to_list(tensor[i]) for tensor in outputs.to_tuple()]

        # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
        # models only use two.
        if len(output) >= 5:
            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]

            result = SquadResult(
                unique_id,
                start_logits,
                end_logits,
                start_top_index=start_top_index,
                end_top_index=end_top_index,
                cls_logits=cls_logits,
            )

        else:
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

        all_results.append(result)

Evaluating: 100%|██████████| 19188/19188 [2:04:28<00:00,  2.57it/s]  


In [45]:
import os

# The three output files live next to the notebook and carry the model type in
# their name: final predictions, n-best predictions and null log-odds.
output_prediction_file, output_nbest_file, output_null_log_odds_file = (
    os.path.join("./", template.format(model_type))
    for template in ("predictions_{}.json", "nbest_predictions_{}.json", "null_odds_{}.json")
)

In [46]:
# NOTE: this rebinds compute_predictions_logits, which the first cell imported
# from transformers.data.metrics.squad_metrics; from here on the version from the
# local `utils` module is the one being called.
from utils import compute_predictions_logits, squad_evaluate
import json

In [47]:
# Raw test-set JSON; it is handed to compute_predictions_logits as its first
# argument and is also the source of the ground-truth answers further down.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying on
# the platform's default locale encoding.
with open('test.json', "r", encoding="utf-8") as f:
    json_test_dict = json.load(f)

In [48]:
# Post-processing options handed to compute_predictions_logits below.
n_best_size = 3
max_answer_length = 512
do_lower_case = True
verbose_logging = False
version_2_with_negative = True
null_score_diff_threshold = 0.0

# Every argument is positional, so the order has to match the signature of the
# `utils` implementation exactly.
# NOTE(review): presumably this also writes the three output_* files - confirm in utils.
predictions = compute_predictions_logits(
    json_test_dict,
    examples,
    features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    verbose_logging,
    version_2_with_negative,
    null_score_diff_threshold,
    tokenizer,
)

# Compute the F1 and exact scores.
results = squad_evaluate(examples, predictions)

In [51]:
# Keep the predictions on disk. The file name is derived from model_type (like
# the JSON output paths) so evaluating another model does not overwrite this file.
torch.save({"pred": predictions}, 'pred_{}'.format(model_type))

In [52]:
# Keep the raw per-feature results on disk. The file name is derived from
# model_type (like the JSON output paths) so evaluating another model does not
# overwrite this file.
torch.save({"all_results": all_results}, 'allres_{}'.format(model_type))

In [49]:
# Display the metrics returned by squad_evaluate.
results

OrderedDict([('exact', 72.45337159253945),
             ('f1', 75.93157427285503),
             ('total', 4182),
             ('HasAns_exact', 71.70418006430869),
             ('HasAns_f1', 83.396980393151),
             ('HasAns_total', 1244),
             ('NoAns_exact', 72.7705922396188),
             ('NoAns_f1', 72.7705922396188),
             ('NoAns_total', 2938),
             ('best_exact', 72.7403156384505),
             ('best_exact_thresh', 0.0),
             ('best_f1', 75.91719507089961),
             ('best_f1_thresh', 0.0)])

## Getting the paper metrics

In [None]:
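# TODO: import the CUAD evaluation helpers used in the next cell
# (get_answers, load_json, get_precisions_recalls, get_prec_at_recall, get_aupr).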

In [None]:
# NOTE(review): get_answers, load_json, get_precisions_recalls, get_prec_at_recall
# and get_aupr are not imported in any of the visible cells; they have to be
# brought into scope before this cell can run.
verbose = True
name = "roberta"

# Ground truth is derived from the raw test JSON.
gt_dict = get_answers(json_test_dict)

# predictions_path = 'nbest_predictions_roberta.json'
# NOTE(review): the commented-out path above points at the n-best file - confirm
# which of the two prediction files get_precisions_recalls expects.
pred_dict = load_json(output_prediction_file)

# Predictions and ground truth must have exactly the same keys.
assert sorted(pred_dict.keys()) == sorted(gt_dict.keys())

precisions, recalls, confs = get_precisions_recalls(pred_dict, gt_dict)
prec_at_90_recall, _ = get_prec_at_recall(precisions, recalls, confs, recall_thresh=0.9)
prec_at_80_recall, _ = get_prec_at_recall(precisions, recalls, confs, recall_thresh=0.8)
aupr = get_aupr(precisions, recalls)

if verbose:
    print("AUPR: {:.3f}, Precision at 80% Recall: {:.3f}, Precision at 90% Recall: {:.3f}".format(aupr, prec_at_80_recall, prec_at_90_recall))

# Collect the summary metrics in a plain dict (not converted to a dataframe yet).
results = {
    "name": name,
    "aupr": aupr,
    "prec_at_80_recall": prec_at_80_recall,
    "prec_at_90_recall": prec_at_90_recall,
}