# Extractive QA


In [153]:
from core_metrics import compute_f1
from tqdm import tqdm
import pandas as pd
import numpy as np
from transformers import pipeline
pd.set_option('display.max_colwidth', None)

In [56]:
# Checkpoint identifier; the same name is passed as both the model and the tokenizer below.
model_name = "deepset/tinyroberta-squad2"
# Build the question-answering pipeline once; generate_answers() reads this global.
eqa_model = pipeline('question-answering', model=model_name, tokenizer=model_name)

Downloading model.safetensors: 100%|██████████| 326M/326M [01:53<00:00, 2.88MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 383/383 [00:00<?, ?B/s] 
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 1.33MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.65MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.00MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<?, ?B/s] 


In [111]:
def read_file(filename):
    '''
    Reads a tsv file for extractive qa into a pandas dataframe.

    Every non-empty line becomes one row. Fields 0-4 are stored as 'queryid', 'documentid',
    'relevance-grade', 'question' and 'context'; all fields from index 6 onwards are collected
    into the 'text-selection' list, so a row may carry any number of ground truths.
    All values stay strings.

    filename: path of the tab-separated file; it is decoded as UTF-8.
    Returns a DataFrame with the six columns above (also when the file contains no rows).
    Raises IndexError if a non-empty line has fewer than five tab-separated fields.
    '''
    columns = ['queryid', 'documentid', 'relevance-grade', 'question', 'context', 'text-selection']
    rows = []
    # Explicit encoding: without it open() falls back to the platform locale, which can
    # garble or reject the non-ASCII characters (curly quotes, dashes) found in the passages.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            # Drop the line terminator up front so it cannot leak into whichever field is last.
            line = line.rstrip('\n')
            if not line:
                continue  # blank lines (e.g. at the end of the file) carry no tuple
            sep = line.split("\t")
            # NOTE(review): field 5 is never read - confirm it is not a ground truth column.
            answers = sep[6:]  # handles variable ground truth amounts
            row = {'queryid': sep[0],
                   'documentid': sep[1],
                   'relevance-grade': sep[2],
                   'question': sep[3],
                   'context': sep[4],
                   'text-selection': answers
                   }
            rows.append(row)
    # Passing the column list keeps the schema stable even when no rows were read.
    return pd.DataFrame(rows, columns=columns)

def prepare_inputs(qatuples):
    '''
    Turns a dataframe of qa tuples into a list of {'question': ..., 'context': ...} dictionaries,
    one per row, which is the input format the extractive qa model is called with.
    Any other columns of the dataframe are left out.
    '''
    model_columns = ['question', 'context']
    qa_frame = qatuples[model_columns]
    return qa_frame.to_dict(orient='records')

def generate_answers(inputs):
    '''
    Runs the extractive qa model on every question/context dictionary in `inputs`, one at a time,
    and returns the model outputs as a list in the same order. Progress is reported simultaneously
    through a tqdm progress bar.
    '''
    predictions = []
    for qa_pair in tqdm(inputs):
        predictions.append(eqa_model(qa_pair))
    return predictions

def evaluate(gts, answer):
    '''
    Computes the f1 score of an answer, given a list of ground truths.
    An answer is as good as it is similar to any of the ground truths, so we take a maximum of the computed scores.
    When `gts` is empty there is nothing the answer could match, so the score is 0.0
    (a bare max() over an empty sequence would raise ValueError instead).
    '''
    # Generator instead of a list: max() only needs one pass over the scores.
    return max((compute_f1(gt, answer) for gt in gts), default=0.0)

def evaluate_batch(gts_list, answers_list):
    '''
    Computes scores over lists of answers and ground truths.

    The n-th answer is scored against the n-th entry of `gts_list`. Pairing is done by position,
    so a pandas Series works regardless of its index labels (indexing a Series with `[i]` would
    look up the label `i`, which pairs the wrong rows once the index is not 0..n-1).

    gts_list: sequence (list or pandas Series) holding one list of ground truths per answer.
    answers_list: sequence of predicted answers.
    Returns a list with one score per answer.
    Raises IndexError if there are fewer ground truth entries than answers.
    '''
    # zip() would silently truncate; keep failing loudly when ground truths are missing.
    if len(gts_list) < len(answers_list):
        raise IndexError(
            f"got {len(answers_list)} answers but only {len(gts_list)} ground truth entries"
        )
    return [evaluate(gts, answer) for gts, answer in zip(gts_list, answers_list)]

def evaluation_report(f1, conf, label):
    '''
    Prints a report framed by separator lines: the label first, then mean, standard deviation,
    minimum and maximum (four decimals each) of the f1 scores and of the confidence scores.
    Nothing is returned.
    '''
    separator = '#####################'
    print(f'{separator}\n')
    print(f"{label}\n")
    # Both sections share one layout, so they are printed by the same loop.
    for title, values in (("f1 score", f1), ("confidence score", conf)):
        print(title)
        print(f"MEAN:  {np.mean(values):.4f}")
        print(f"STD:   {np.std(values):.4f}")
        print(f"MIN:   {np.min(values):.4f}")
        print(f"MAX:   {np.max(values):.4f}\n")
    print(separator)

## MSMARCO 21

In [63]:
# Load the qa tuples, reshape them into question/context dictionaries and run the model on each.
# The model is called once per tuple, so this is the long-running cell of the section
# (the recorded progress bar shows the duration of the last run).
tuples = read_file('../Part-3/msmarco-fira-21.qrels.qa-tuples.tsv')
model_inputs = prepare_inputs(tuples)
answers = generate_answers(model_inputs)

100%|██████████| 52606/52606 [1:25:01<00:00, 10.31it/s]  


In [144]:
# Split the raw model outputs into the predicted answer strings and their confidence values.
answer_predictions = [prediction['answer'] for prediction in answers]
confidence_predictions = [prediction['score'] for prediction in answers]
# Score every prediction against its ground truth selections, then print the summary.
scores = evaluate_batch(tuples['text-selection'], answer_predictions)
evaluation_report(scores, confidence_predictions, 'MSMARCO FIRA 21')

#####################

MSMARCO FIRA 21

f1 score
MEAN:  0.4194
STD:   0.3270
MIN:   0.0000
MAX:   1.0000

confidence score
MEAN:  0.3237
STD:   0.3111
MIN:   0.0000
MAX:   0.9986

#####################


### "Worst" answers

In [169]:
# Attach the per-row f1 score so the tuples can be ranked by it (adds/overwrites the 'scores' column).
tuples['scores'] = scores
# Index labels of the ten lowest-scoring rows; kept in `ind` because the next cell reuses them.
ind = tuples.sort_values(by='scores').head(10).index
# `ind` holds labels, not positions, so select with .loc (.iloc only agrees while the index is 0..n-1).
tuples.loc[ind]

Unnamed: 0,queryid,documentid,relevance-grade,question,context,text-selection,scores
0,135386,100163,3,definition of imagination,"imagination - the formation of a mental image of something that is not perceived as real and is not present to the senses; popular imagination created a world of demons; imagination reveals what the world could be. imaginativeness, vision. creative thinking, creativeness, creativity - the ability to create.",[the formation of a mental image of something that is not perceived as real and is not present to the senses],0.0
26099,873914,6778171,2,what level is blood pressure too low,Most doctors consider chronically low blood pressure too low only if it causes noticeable symptoms. Some experts define low blood pressure as readings lower than 90 mm Hg systolic or 60 mm Hg diastolic — you need to have only one number in the low range for your blood pressure to be considered lower than normal.ome experts define low blood pressure as readings lower than 90 mm Hg systolic or 60 mm Hg diastolic — you need to have only one number in the low range for your blood pressure to be considered lower than normal.,[Some experts define low blood pressure as readings lower than 90 mm Hg systolic or 60 mm Hg diastolic],0.0
26110,285729,6781526,2,how many hours are in fmla,"The employee sued. The employer claimed she hadn’t worked 1,250 hours in the previous 12 months to qualify. The employee said with her on-call time, she had 1,900 hours, way over the minimum required by FMLA regulations. But the court said her on-call time didn’t count. So she had no FMLA case.",[way over the minimum required by FMLA regulations],0.0
5011,999028,7814587,2,where is trumps estate in nj,Bedminster & Far Hills Public Library - Bedminster NJ. Trump National Golf Course - Bedminster NJ. Formerly the home of famous car maker John Delorean and now the site of a championship golf course owned and operated by Donald Trump located off Lamington Road.,[located off Lamington Road.],0.0
26111,995141,6781678,3,"which county is knoxville, tn in","Knoxville is a city in the U.S. state of Tennessee, and the county seat of Knox County. The city had an estimated population of 183,270 in 2013, and a population of 178,874 as of the 2010 census, making it the state's third largest city after Nashville and Memphis.",[Knoxville is a city in the U.S. state of Tennessee],0.0
26130,626918,6789909,3,what do you put basil on,"1 Basil blends well with capers, chives, cilantro, garlic, marjoram, oregano, mint, onion, parsley, rosemary, thyme, and savory. 2 It can be used alone or along with other spices to enhance the taste of pasta. 3 Serve basil with artichokes, eggplant, green vegetables, mushrooms, olives, pizza, grains and some fruits.","[Serve basil with artichokes, eggplant, green vegetables, mushrooms, olives, pizza, grains and some fruits.]",0.0
26146,804103,6794234,3,what is the actual us independence day,"Independence Day is the national holiday of the United States of America commemorating the signing of the Declaration of Independence by the Continental Congress on July 4, 1776, in Philadelphia, Pennsylvania. At the time of the signing the US consisted of 13 colonies under the rule of England’s King George III.","[July 4, 1776]",0.0
26155,763878,6796020,3,what is kyc?,Response: KYC means “Know Your Customer”. It is a process by which banks obtain information about the identity and address of the customers. This process helps to ensure that banks’ services are not misused.,[KYC means “Know Your Customer”],0.0
5002,1063177,7812997,3,why did spock die in wrath of khan,"The damage done to the ship in battle made her unable to achieve warp drive; she couldn't get out of the danger zone fast enough. Spock entered the containment area of the anti-matter core and adjusted it manually, allowing the ship to go to warp speed. In the process, he fatally irradiated himself. I also recall that when the news leaked that Spock would die in The Wrath of Khan, fans went crazy.","[Spock entered the containment area of the anti-matter core and adjusted it manually, allowing the ship to go to warp speed. In the process, he fatally irradiated himself]",0.0
26162,1002554,6798355,3,define uncut?,"Wiktionary (0.00 / 0 votes) Rate this definition: uncut (Adjective). not cut. uncut (Adjective). not cut or ground to the desired shape. uncut (Adjective). not having the page edges trimmed. uncut (Adjective). not edited or abridged by a censor. uncut (Adjective).ncut (adj). not cut; not separated or divided by cutting or otherwise; -- said especially of books, periodicals, and the like, when the leaves have not been separated by trimming in binding. Uncut (adj). not ground, or otherwise cut, into a certain shape; as, an uncut diamond.","[not cut, not cut or ground to the desired shape, not having the page edges trimmed, edited or abridged by a censor, not separated or divided by cutting or otherwise]",0.0


In [170]:
# list() accepts both a pandas Index and a plain list, so re-running this cell does not fail
# the way a second `.tolist()` call on an already converted list would.
ind = list(ind)
# The row labels are used as list positions here, which matches the predictions only while
# `tuples` keeps the default 0..n-1 index it gets from read_file.
for i in ind:
    print(answer_predictions[i])

imagination reveals what the world could be.
if it causes noticeable symptoms
1,900
Bedminster & Far Hills Public Library - Bedminster NJ
Knox County
spices to enhance the taste of pasta
the national holiday
a process by which banks obtain information about the identity and address of the customers
fans went crazy
Adjective


## TOP-N RERANKING RESULTS

In [None]:
# Load the qa tuples, reshape them into question/context dictionaries and run the model on each.
# NOTE(review): this reads the same file as the MSMARCO 21 section above - confirm whether the
# top-n re-ranking results file was meant here. The cell also overwrites `tuples`, `model_inputs`
# and `answers` from that section.
tuples = read_file('../Part-3/msmarco-fira-21.qrels.qa-tuples.tsv')
model_inputs = prepare_inputs(tuples)
answers = generate_answers(model_inputs)

In [None]:
# Split the raw model outputs into the predicted answer strings and their confidence values.
answer_predictions = [prediction['answer'] for prediction in answers]
confidence_predictions = [prediction['score'] for prediction in answers]
# Score every prediction against its ground truth selections, then print the summary.
scores = evaluate_batch(tuples['text-selection'], answer_predictions)
evaluation_report(scores, confidence_predictions, 'TOP MSMARCO RE-RANKING')