# A general purpose evaluation script

In [None]:
! pip install datasets transformers
import transformers
import datasets

In [None]:
import torch
import transformers
from transformers import AutoTokenizer,BertTokenizerFast, BertForQuestionAnswering, DistilBertForQuestionAnswering, AutoModelForQuestionAnswering
import json
from pathlib import Path
from torch.utils.data import DataLoader
import time
import pandas as pd
import pickle as pkl
from tqdm import tqdm

## Retrieve the desired model

In [None]:
# The tokenizer and the question-answering model are both restored from the
# same fine-tuned checkpoint directory.
checkpoint_dir = '/content/drive/MyDrive/Spring22/CS769/Project/FInal/bert-base-uncased-finetuned-squad'

tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_dir)

# This script only runs inference, so put the model in evaluation mode.
model.eval()

## Extract the desired dataset

In [None]:
# Restore the pickled dataset object; later code reads its 'test' entry.
# NOTE(review): unpickling can execute arbitrary code - only load files from a
# trusted source.
# NOTE(review): the name `datasets` is also used for the imported `datasets`
# module; after this assignment the name refers to the unpickled object.
dataset_pickle = Path('/content/drive/MyDrive/Spring22/CS769/Project/FInal/SQUAD_COQA_uniform.pkl')
with dataset_pickle.open('rb') as handle:
  datasets = pkl.load(handle)

In [None]:
def normalize_text(s):
  """Return a canonical form of the string `s` for text comparison.

  The steps are applied in this order:
    1. lowercase the text,
    2. delete every character listed in `string.punctuation`,
    3. replace the standalone words "a", "an" and "the" with a space,
    4. collapse runs of whitespace to a single space and trim both ends.
  """
  import re
  import string

  lowered = s.lower()
  # Deleting via a translation table removes exactly the ASCII punctuation set.
  unpunctuated = lowered.translate(str.maketrans("", "", string.punctuation))
  # \b keeps the match to whole words, so e.g. "theatre" is left untouched.
  without_articles = re.sub(r"\b(a|an|the)\b", " ", unpunctuated, flags=re.UNICODE)
  return " ".join(without_articles.split())


## Define the pipeline for inference

In [None]:
from transformers import pipeline

# Build the question-answering pipeline around the already-loaded model and
# tokenizer. Use the first CUDA device when one is available and fall back to
# the CPU (device = -1) otherwise, so the script also runs on CPU-only hosts
# instead of failing on a hard-coded GPU index.
device = 0 if torch.cuda.is_available() else -1
generator = pipeline(task="question-answering", model = model, tokenizer = tokenizer, device = device, batch_size = 16)

In [None]:
# Turn the 'test' entry of the loaded dataset object into a DataFrame so its
# fields can be read by column name.
test_split = datasets['test']
df_test = pd.DataFrame(test_split)

In [None]:
# Question words used to bucket the test questions. The list order is the
# order in which the words are tried when a question is classified.
qa_keys = ['what', 'where', 'how', 'why', 'when', 'which', 'who']

# One independent, initially empty list per bucket; 'misc' collects questions
# that match none of the question words. Insertion order is kept as-is because
# later code iterates over this dict.
qa_dict = {bucket: [] for bucket in ('what', 'where', 'how', 'why', 'when', 'which', 'misc', 'who')}

## Divide the test set per question type

In [None]:
import re

# Assign every test example to the bucket of the first entry of qa_keys that
# occurs in its question as a whole word; questions that match none of them go
# to 'misc'. Each stored record is the tuple (question, id, context, answers).
#
# Matching on whole words instead of substrings keeps unrelated words from
# triggering a bucket (e.g. "show" is not "how", "whole" is not "who").
# NOTE(review): as a consequence inflected forms such as "whose" or "whom" do
# not count as 'who' and fall into 'misc'.
word_pattern = re.compile(r"\w+")

# Iterating the columns in lockstep avoids building a row object for every
# single field lookup.
for question, example_id, context, answer in zip(
    df_test['question'], df_test['id'], df_test['context'], df_test['answers']):
  question_words = set(word_pattern.findall(question.lower()))
  # qa_keys order decides which bucket wins when several question words occur.
  bucket = next((key for key in qa_keys if key in question_words), 'misc')
  qa_dict[bucket].append((question, example_id, context, answer))

In [None]:

# Derive two per-bucket views from qa_dict, whose records are
# (question, id, context, answers) tuples:
#   gt_dict   - reference entries holding only the id and the answers
#   data_dict - a shallow copy of each bucket's record list
gt_dict = {}
data_dict = {}
for category, records in qa_dict.items():
  gt_dict[category] = [{'id' : record[1] , 'answers' : record[3]} for record in records]
  data_dict[category] = list(records)


In [None]:
from datasets import load_metric

# Load the 'squad_v2' metric used to score the predictions.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package - confirm against the
# installed version.
metric_name = 'squad_v2'
metric = load_metric(metric_name)

## Run inference and store the computed metrics

In [None]:
# Run the QA pipeline over every question-type bucket and score each bucket
# with `metric`. `results` maps bucket name -> whatever metric.compute returns.
#
# A prediction is turned into "no answer" (empty text) when the complement of
# the pipeline's score exceeds this threshold.
no_answer_threshold = 0.5

results = {}
for key in qa_dict:
  examples = data_dict[key]
  if not examples:
    # Nothing to run or score for an empty bucket: skip it so neither the
    # pipeline nor the metric is handed empty inputs. The bucket is simply
    # absent from `results`.
    continue

  # Questions and contexts are normalised before being fed to the model.
  questions = [normalize_text(x[0]) for x in examples]
  contexts = [normalize_text(x[2]) for x in examples]
  example_ids = [x[1] for x in examples]

  outputs = generator(question = questions, context = contexts, batch_size = 16)
  # For a single example the pipeline returns one dict instead of a list of
  # dicts; wrap it so the loop below always sees one dict per example.
  if isinstance(outputs, dict):
    outputs = [outputs]

  preds = []
  for example_id, out in zip(example_ids, outputs):
    no_answer_probability = 1 - out['score']
    if no_answer_probability > no_answer_threshold:
      prediction_text = ''
    else:
      prediction_text = out['answer']
    preds.append({'id' : example_id , 'prediction_text' : prediction_text , 'no_answer_probability' : no_answer_probability})

  results[key] = metric.compute(predictions=preds, references=gt_dict[key])

In [None]:
# Show the per-bucket metric results in a readable, pretty-printed layout.
from pprint import pformat
print(pformat(results))