# Transformer model testing

A very basic notebook for testing transformer model performance.

In [31]:
from src.models.drqa.drqa_train import *

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint under test. The first from_pretrained() call for a checkpoint
# downloads its weights into the local cache (~/.cache/torch/transformers/).
MDL = "bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = AutoTokenizer.from_pretrained(MDL)
model = AutoModelForQuestionAnswering.from_pretrained(
    MDL,
    return_dict=False,  # the inference cell unpacks the forward output as a 2-tuple
)

# Model parameters

In [41]:
# Total number of trainable weights (parameters that have requires_grad set).
sum(param.numel() for param in model.parameters() if param.requires_grad)

334094338

# Out-of-the-box performance

In [42]:
from tqdm import tqdm

In [44]:
# Explicit import: `pd` was previously only in scope through the
# `from src.models.drqa.drqa_train import *` star import.
import pandas as pd

# Character budget applied to every context passage before tokenisation.
MAX_CONTEXT_CHARS = 512
# Hard cap on the encoded (question + context) sequence length in tokens.
# A 512-character context can still tokenise to more tokens than the model
# accepts (e.g. symbol-heavy text), so the tokenizer enforces this limit too.
MAX_SEQ_LEN = 512

# NOTE(review): read_pickle can execute arbitrary code while loading; only
# point this at files produced by this project's own preprocessing.
df = pd.read_pickle('./data/processed/cuad_drqa/data.pkl')

predictions = {}
answers = {}
# iterrows() is a generator with no length, so pass `total` explicitly to get
# a real progress bar and ETA.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    question = row.question
    # TODO: limit the context intelligently. This naive prefix cut does not
    # ensure that the answer lies inside the retained span.
    context = row.context[:MAX_CONTEXT_CHARS]

    # 1. TOKENIZE THE INPUT
    # truncation="only_second" trims only the context (never the question) if
    # the encoded pair would exceed MAX_SEQ_LEN tokens.
    inputs = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=MAX_SEQ_LEN,
        return_tensors="pt",
    )

    # 2. OBTAIN MODEL SCORES
    # The AutoModelForQuestionAnswering class includes a span predictor on top
    # of the model; it returns answer start and end scores for each token.
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        answer_start_scores, answer_end_scores = model(**inputs)
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1  # +1: slice end is exclusive

    # 3. GET THE ANSWER SPAN
    # If the predicted end precedes the start, the slice is empty and the
    # prediction is the empty string.
    span_ids = inputs["input_ids"][0][answer_start:answer_end]
    ans = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))
    predictions[row.id_] = ans
    answers[row.id_] = row.answer

960it [39:48,  2.49s/it]


# Evaluate predictions

In [45]:
def evaluate_single(predictions, answers, **kwargs):
    '''
    Score predictions against exactly one reference answer per question.

    Every prediction is compared with the single ground-truth answer stored
    under the same key in ``answers``. The per-question scores come from
    ``metric_max_over_ground_truths`` applied with ``exact_match_score`` and
    ``f1_score`` (helpers expected to be in scope already, presumably via the
    drqa_train star import -- verify), and are summed and scaled by
    100 / number of predictions.

    :param dict predictions: question id -> predicted answer.
    :param dict answers: question id -> ground-truth answer. Must have the
        same number of entries as ``predictions``.
    :param kwargs: accepted for call compatibility; ignored.
    Returns
    : exact_match: 100 * (summed exact-match score) / number of predictions.
    : f1_score: 100 * (summed F1 score) / number of predictions.
      Both are 0.0 when ``predictions`` is empty.
    Raises
    : AssertionError: if the two dictionaries differ in size.
    : KeyError: if a prediction id has no entry in ``answers``.
    '''
    assert len(predictions) == len(answers), \
        "predictions and answers must contain the same number of entries"

    total = len(predictions)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, 0.0

    f1 = exact_match = 0
    for key, prediction in predictions.items():
        # Single-reference setting: wrap the one answer in a list because the
        # metric helper takes a list of ground truths.
        ground_truths = [answers[key]]

        exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return exact_match, f1

In [46]:
# Score the collected predictions against their reference answers.
em, f1 = evaluate_single(predictions=predictions, answers=answers)

In [47]:
# Report the exact-match and F1 scores for the evaluated checkpoint.
print(
    f"The model {MDL} has an \n exact match score of: {em}\n f1 of: {f1}"
)

The model bert-large-uncased-whole-word-masking-finetuned-squad has an 
 exact match score of: 2.9166666666666665
 f1 of: 16.310703236473493
