## Imports 

In [8]:
import numpy as np
import json
import pandas as pd
import re

import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu
from nltk.translate.meteor_score import meteor_score

In [15]:
#nltk.download('wordnet')

# Functions

### Bleu Function

In [113]:
def bleu(ref, gen):
    ''' 
    calculate pair wise bleu score. uses nltk implementation
    Args:
        references : a list of reference sentences 
        candidates : a list of candidate(generated) sentences
    Returns:
        bleu score(float)
    '''
    
    ref_bleu = ref
    gen_bleu = gen
    
    cc = SmoothingFunction()
    
    score_bleu_1 = corpus_bleu(ref_bleu, gen_bleu, weights=(1, 0, 0, 0), smoothing_function=cc.method4)
    score_bleu_2 = corpus_bleu(ref_bleu, gen_bleu, weights=(0.5, 0.5, 0, 0), smoothing_function=cc.method4)
    score_bleu_3 = corpus_bleu(ref_bleu, gen_bleu, weights=(0.33, 0.33, 0.33, 0), smoothing_function=cc.method4)
    score_bleu_4 = corpus_bleu(ref_bleu, gen_bleu, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=cc.method4)
    
    return [score_bleu_1, score_bleu_2, score_bleu_3, score_bleu_4]

### Read JSON 

In [3]:
def unpack_squad_val(data):
    contexts = []
    questions = []
    answers_text = []
    answers_start = []
    for i in range(data.shape[0]):
        topic = data.iloc[i,1]['paragraphs']
        for sub_para in topic:
            for q_a in sub_para['qas']:
                if(q_a['answers']):
                    questions.append(q_a['question'])
                    answers_start.append(q_a['answers'][0]['answer_start'])
                    answers_text.append(q_a['answers'][0]['text'])
                    contexts.append(sub_para['context'])   
    return contexts, questions, answers_text, answers_start 

### Evaluate

In [187]:
def calculate_scores(eval_data, original_text, predicted_text):
    bleu_score_1 = []
    bleu_score_2 = []
    bleu_score_3 = []
    bleu_score_4 = []
    meteor_score_lst = []
    
    # replace any nan prediction values with single space 
    eval_data[predicted_text].fillna(' ', inplace=True)
    eval_data[predicted_text].str.lower()
    eval_data[original_text].str.lower()

    for idx, row in eval_data.iterrows():  
        
        # remove spaces and punctuation and create a list of words
        text = re.sub("[^\w]", " ",  row[original_text]).split()
        pred = re.sub("[^\w]", " ",  row[predicted_text]).split()
        
        #text = re.sub(r'[^a-zA-Z\d]','', str(row['text']).lower())
        #pred = re.sub(r'[^a-zA-Z\d]','', str(row['pred']).lower())

        # calculate the meteor score
        meteor = meteor_score([row[original_text]], row[predicted_text])
        meteor_score_lst.append(meteor)

        # if the array of text and prediction are not the same then pad out the sequence 
        if(len(pred) < len(text)):
            diff = len(text) - len(pred)
            #pred = pred + ("-"*diff)
            pred.extend([" " for i in range(diff)])

            
        if(len(text) < len(pred)):
            diff = len(pred) - len(text)
            #text = text + ("-"*diff)
            text.extend([" " for i in range(diff)])

        if(len(text) == 1 and len(pred) == 1):
            #text = text + ("-")
            #pred = pred + ("-")
            text.extend(" ")
            pred.extend(" ")

        
        scores = bleu(text, pred)
        bleu_score_1.append(scores[0])
        bleu_score_2.append(scores[1])
        bleu_score_3.append(scores[2])
        bleu_score_4.append(scores[3])
        
    return bleu_score_1, bleu_score_2, bleu_score_3, bleu_score_4, meteor_score_lst

In [190]:
def create_scores_table(eval_data, original_text, predicted_text):
    
    bleu_1, bleu_2, bleu_3, bleu_4, meteor = calculate_scores(eval_data, original_text, predicted_text)
    
    eval_data['BLEU1'] = bleu_1
    eval_data['BLEU2'] = bleu_2
    eval_data['BLEU3'] = bleu_3
    eval_data['BLEU4'] = bleu_4
    eval_data['Meteor'] = meteor
    
    print("BLEU1: " + str(sum(eval_data['BLEU1'].values)/len(eval_data['BLEU1'])))
    print("BLEU2: " + str(sum(eval_data['BLEU2'].values)/len(eval_data['BLEU2'])))
    print("BLEU3: " + str(sum(eval_data['BLEU3'].values)/len(eval_data['BLEU3'])))
    print("BLEU4: " + str(sum(eval_data['BLEU4'].values)/len(eval_data['BLEU4'])))
    print("Meteor: " + str(sum(eval_data['Meteor'].values)/len(eval_data['Meteor'])))

# Eval SQUAD

In [179]:
squad_eval_data = pd.read_csv('../QA/predictions/hypertune_8.csv', index_col = 0)

In [180]:
create_scores_table(squad_eval_data, 'text', 'pred')

BLEU1: 0.5397896416795487
BLEU2: 0.3737671974621117
BLEU3: 0.3111107805824293
BLEU4: 0.26582376937404356
Meteor: 0.5035400747574519


In [181]:
squad_eval_data.head()

Unnamed: 0,context,question,answer_start,text,pred,BLEU1,BLEU2,BLEU3,BLEU4,Meteor
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,159,France,france,0.857143,0.490031,0.378261,0.311667,0.5
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,94,10th and 11th centuries,10th and 11th centuries,0.9,0.580689,0.454884,0.374565,0.992188
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,256,"Denmark, Iceland and Norway","denmark , iceland and norway",0.869565,0.578911,0.457043,0.377424,0.718157
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,308,Rollo,rollo,0.5,0.36319,0.305605,0.262597,0.5
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,671,10th century,10th,0.8,0.441367,0.340316,0.281135,0.263158


In [204]:
squad_eval_data.to_csv('../QA/predictions/squad_metrics.csv')

### Subsets of Poor Questions 

In [146]:
# Taking the best Bleu metric of Bleu1 we can analyse the poorer responses - this threshold is set at 0.2 to start
# to understand a suitable threshold

In [147]:
squad_thresh = squad_eval_data[squad_eval_data["BLEU1"] < 0.2]

In [148]:
print("Length of SQuAD threshold: " + str(len(squad_thresh)))
print("Percentage of samples: " + str(len(squad_thresh)/len(squad_eval_data)))

Length of SQuAD threshold: 1517
Percentage of samples: 0.2559041835357625


In [149]:
squad_thresh = squad_thresh.reset_index(drop=True)
squad_thresh

Unnamed: 0,context,question,answer_start,text,pred,BLEU1,BLEU2,BLEU3,BLEU4,Meteor
0,"The English name ""Normans"" comes from the Fren...",What is the original meaning of the word Norman?,341,Viking,"norseman , viking """,0.071429,0.157087,0.188191,0.189681,0.384615
1,"Before Rollo's arrival, its populations did no...",Who upon arriving gave the original viking set...,7,Rollo,[CLS] who upon arriving gave the original viki...,0.000000,0.000000,0.000000,0.000000,0.083333
2,"Soon after the Normans began to enter Italy, t...","Who was the Normans' main enemy in Italy, the ...",161,Seljuk Turks,"the pechenegs , the bulgars , and especially t...",0.040816,0.133659,0.178942,0.188752,0.646552
3,One of the first Norman mercenaries to serve a...,Who ruined Roussel de Bailleul's plans for an ...,522,Alexius Komnenos,byzantine general alexius komnenos,0.129032,0.229209,0.251202,0.241676,0.852273
4,"Robert Guiscard, an other Norman adventurer pr...",When did Dyrrachium fall to the Normans?,1315,1082,february 1082,0.000000,0.000000,0.000000,0.000000,0.454545
...,...,...,...,...,...,...,...,...,...,...
1512,A conservative force that acts on a closed sys...,What is the force between two locations relate...,330,difference in potential energy,[CLS],0.000000,0.000000,0.000000,0.000000,0.000000
1513,The connection between macroscopic nonconserva...,In what treatment are nonconservative and cons...,134,statistical mechanics,[CLS],0.000000,0.000000,0.000000,0.000000,0.000000
1514,The connection between macroscopic nonconserva...,What is the exchange of heat associated with?,188,nonconservative forces,[CLS],0.000000,0.000000,0.000000,0.000000,0.000000
1515,"The pound-force has a metric counterpart, less...",What is the kilogram-force sometimes reffered ...,114,kilopond,[CLS] what is the kilogram - force sometimes r...,0.000000,0.000000,0.000000,0.000000,0.104167


# Eval QuAC 

In [165]:
quac_eval_data = pd.read_csv('../QA/predictions/quac_hypertune_12.csv', index_col = 0)

In [166]:
create_scores_table(quac_eval_data, 'text', 'pred')

BLEU1: 0.20289720415450696
BLEU2: 0.1983669700883264
BLEU3: 0.1912250250766035
BLEU4: 0.17555015758392137
Meteor: 0.18787356714981457


In [167]:
quac_eval_data.head()

Unnamed: 0,context,question,answer_start,text,pred,BLEU1,BLEU2,BLEU3,BLEU4,Meteor
0,"In May 1983, she married Nikos Karvelas, a com...",what happened in 1983?,0,"In May 1983, she married Nikos Karvelas, a com...","in may 1983 , she married nikos karvelas , a c...",0.485714,0.472359,0.416255,0.3601,0.430696
1,"In May 1983, she married Nikos Karvelas, a com...",did they have any children?,92,in November she gave birth to her daughter Sofia.,in november she gave birth to her daughter sof...,0.925,0.626666,0.49266,0.404877,0.878262
2,"In May 1983, she married Nikos Karvelas, a com...",did she have any other children?,2024,CANNOTANSWER,cannotanswer,0.076923,0.161497,0.190781,0.191123,0.5
3,"In May 1983, she married Nikos Karvelas, a com...",what collaborations did she do with nikos?,213,"Since 1975, all her releases have become gold ...","after their marriage , she started a close col...",0.246154,0.334666,0.330859,0.302231,0.0
4,"In May 1983, she married Nikos Karvelas, a com...",what influences does he have in her music?,2024,CANNOTANSWER,cannotanswer,0.076923,0.161497,0.190781,0.191123,0.5


In [205]:
quac_eval_data.to_csv('predictions/quac_metrics.csv')

### Subsets of Poor Questions

In [168]:
# Taking the best Bleu metric of Bleu1 we can analyse the poorer responses - this threshold is set at 0.2 to start
# to understand a suitable threshold

In [169]:
quac_thresh = quac_eval_data[quac_eval_data["BLEU1"] < 0.2]

In [170]:
print("Length of QuAC threshold: " + str(len(quac_thresh)))
print("Percentage of samples: " + str(len(quac_thresh)/len(quac_eval_data)))

Length of QuAC threshold: 5267
Percentage of samples: 0.716208865923307


In [171]:
quac_thresh = quac_thresh.reset_index(drop=True)
quac_thresh

Unnamed: 0,context,question,answer_start,text,pred,BLEU1,BLEU2,BLEU3,BLEU4,Meteor
0,"In May 1983, she married Nikos Karvelas, a com...",did she have any other children?,2024,CANNOTANSWER,cannotanswer,0.076923,0.161497,0.190781,0.191123,0.500000
1,"In May 1983, she married Nikos Karvelas, a com...",what influences does he have in her music?,2024,CANNOTANSWER,cannotanswer,0.076923,0.161497,0.190781,0.191123,0.500000
2,"In May 1983, she married Nikos Karvelas, a com...",what were some of the songs?,879,"one of her most famous songs, titled ""Dodeka"" ...",,0.000000,0.000000,0.000000,0.000000,0.000000
3,"In May 1983, she married Nikos Karvelas, a com...",did she have any other famous songs?,1049,"The album included the hit Pragmata (""Things"")...",,0.000000,0.000000,0.000000,0.000000,0.000000
4,"In September 2016 Vladimir Markin, official sp...",Did they have any clues?,1908,probably FSB) are known to have targeted the w...,cannotanswer,0.107143,0.206999,0.233870,0.228368,0.000000
...,...,...,...,...,...,...,...,...,...,...
5262,"In 1963, Major League Baseball expanded the st...",What records did Koufax set in 1963?,1226,"Koufax threw 11 shutouts, setting a new post-1...","on may 11 , koufax no - hit the san francisco ...",0.010549,0.078098,0.133425,0.156455,0.176942
5263,"In 1963, Major League Baseball expanded the st...",How did the team do in 1963?,628,"On May 11, Koufax no-hit the San Francisco Gia...",,0.000000,0.000000,0.000000,0.000000,0.000000
5264,"In 1963, Major League Baseball expanded the st...",In what other games did Koufax perform well?,786,Koufax carried a perfect game into the eighth ...,[CLS],0.000000,0.000000,0.000000,0.000000,0.000000
5265,"In 1963, Major League Baseball expanded the st...",How did Koufax perform in the post-season?,1661,"Facing the Yankees in the 1963 World Series, K...",,0.000000,0.000000,0.000000,0.000000,0.000000


# Eval Generated SQUAD

In [183]:
squad_gen = pd.read_csv('../QA/results/squad_gen_qa.csv', index_col=0)

In [192]:
create_scores_table(squad_gen, 'question', 'original questions')

BLEU1: 0.05454947246951334
BLEU2: 0.14601995143557397
BLEU3: 0.18599749546364394
BLEU4: 0.19181666804371872
Meteor: 0.15777602736159088


In [193]:
squad_gen

Unnamed: 0,context,question,pred,original questions,BLEU1,BLEU2,BLEU3,BLEU4,Meteor
0,The Normans (Norman: Nourmands; French: Norman...,What is the first name of the Normans?,nourmands,"['In what country is Normandy located?', 'When...",0.058824,0.172644,0.218947,0.224110,0.235849
1,The Normans (Norman: Nourmands; French: Norman...,What was the first name given to the Normans?,nourmands,"['In what country is Normandy located?', 'When...",0.058824,0.172644,0.218947,0.224110,0.323478
2,The Normans (Norman: Nourmands; French: Norman...,Who was the leader of the Normans?,rollo,"['In what country is Normandy located?', 'When...",0.041176,0.144444,0.194635,0.204991,0.263459
3,The Normans (Norman: Nourmands; French: Norman...,What was the name of the nation that gave the...,normans,"['In what country is Normandy located?', 'When...",0.070588,0.189122,0.232524,0.234561,0.218750
4,The Normans (Norman: Nourmands; French: Norman...,What did the Normans come from?,"norse ( "" norman "" comes from "" norseman "" ) r...","['In what country is Normandy located?', 'When...",0.029412,0.122077,0.174180,0.188453,0.506818
...,...,...,...,...,...,...,...,...,...
6628,"The pound-force has a metric counterpart, less...",What is the kilogram-force equivalent to?,[CLS],['What is the metric term less used than the N...,0.066667,0.186714,0.232155,0.235191,0.226155
6629,"The pound-force has a metric counterpart, less...",What is the mug?,metric slug,['What is the metric term less used than the N...,0.041667,0.147610,0.198801,0.209118,0.197368
6630,"The pound-force has a metric counterpart, less...",What is the ball?,[CLS],['What is the metric term less used than the N...,0.037500,0.140035,0.192008,0.203681,0.197368
6631,"The pound-force has a metric counterpart, less...",What is the kip?,1000 lbf,['What is the metric term less used than the N...,0.041667,0.147610,0.198801,0.209118,0.197368


In [199]:
squad_gen.to_csv('../QA/results/squad_gen_metrics.csv')

In [194]:
# due to there being more samples for the question to be compared against for these datasets the bleu4 and meteor will be used
# for analysis as they work of a higher set of n-grams

# Eval Generated QUAC

In [195]:
quac_gen = pd.read_csv('../QA/results/quac_gen_qa.csv', index_col=0)

In [196]:
create_scores_table(quac_gen, 'question', 'original questions')

BLEU1: 0.03743766011919225
BLEU2: 0.12554186398727898
BLEU3: 0.1707298507017856
BLEU4: 0.1813654474002611
Meteor: 0.11302342476767464


In [197]:
quac_gen

Unnamed: 0,context,question,pred,original questions,BLEU1,BLEU2,BLEU3,BLEU4,Meteor
0,"In May 1983, she married Nikos Karvelas, a com...",What is 2017?,cannotanswer,"['what happened in 1983?', 'did they have any ...",0.015000,0.087848,0.140646,0.160571,0.000000
1,"In September 2016 Vladimir Markin, official sp...",Did the investigation do anything else intere...,,"['Did they have any clues?', 'How did they tar...",0.048780,0.158598,0.207814,0.215890,0.394690
2,Graham returned to the WWWF in April 1977 afte...,What was his debut?,graham returned to the wwwf in april 1977 afte...,"['Why did he return to the WWWF?', 'What was h...",0.032258,0.128394,0.180413,0.193745,0.129870
3,In the early 1990s US federal agents were inve...,What was he going to do with the WWF?,[CLS],"['what disputes did he have?', 'why?', 'did he...",0.044872,0.150157,0.199316,0.208499,0.169492
4,During the aftermath of the murder of Stefan P...,Did he have any ties to the US?,the game manhunt was linked after the media wr...,"[""How was Jack Thompson's related to Manhunt?""...",0.014793,0.089213,0.143466,0.163894,0.296103
...,...,...,...,...,...,...,...,...,...
1031,Red worked on new material for Until We Have F...,What is the album about?,[CLS],"['what happened in 2010?', 'when was the album...",0.013575,0.083948,0.136761,0.157374,0.104167
1032,"Red started work on their second album, Innoce...",did he tour with any other bands?,"in march , the band toured with breaking benja...","['When was Innocence & Instinct released?', 'W...",0.035088,0.135152,0.187379,0.199841,0.339209
1033,"Before 1948, most Palestinian artists were sel...",What was the inspiration for the designs?,cannotanswer,['Are there any other interesting aspects abou...,0.022727,0.109465,0.163490,0.180504,0.044643
1034,"Reubens was born Paul Rubenfeld in Peekskill, ...",,[CLS],"[""What is Pee wee's playhouse?"", 'when did the...",0.000000,0.000000,0.000000,0.000000,0.000000


In [198]:
quac_gen.to_csv('../QA/results/quac_gen_metrics.csv')

# QuAC Generated pred with SQuAD Evaluation

In [200]:
quac_squad_gen = pd.read_csv('../QA/results/quac_squad_gen_qa.csv', index_col=0)

In [201]:
create_scores_table(quac_squad_gen, 'question', 'original questions')

BLEU1: 0.03743766011919225
BLEU2: 0.12554186398727898
BLEU3: 0.1707298507017856
BLEU4: 0.1813654474002611
Meteor: 0.11302342476767464


In [202]:
quac_squad_gen

Unnamed: 0,context,question,pred,original questions,BLEU1,BLEU2,BLEU3,BLEU4,Meteor
0,"In May 1983, she married Nikos Karvelas, a com...",What is 2017?,[CLS],"['what happened in 1983?', 'did they have any ...",0.015000,0.087848,0.140646,0.160571,0.000000
1,"In September 2016 Vladimir Markin, official sp...",Did the investigation do anything else intere...,included the killing of anna politkovskaya amo...,"['Did they have any clues?', 'How did they tar...",0.048780,0.158598,0.207814,0.215890,0.394690
2,Graham returned to the WWWF in April 1977 afte...,What was his debut?,"baltimore , maryland","['Why did he return to the WWWF?', 'What was h...",0.032258,0.128394,0.180413,0.193745,0.129870
3,In the early 1990s US federal agents were inve...,What was he going to do with the WWF?,sold his hall of fame ring on ebay to help pay...,"['what disputes did he have?', 'why?', 'did he...",0.044872,0.150157,0.199316,0.208499,0.169492
4,During the aftermath of the murder of Stefan P...,Did he have any ties to the US?,the police officially denied any link,"[""How was Jack Thompson's related to Manhunt?""...",0.014793,0.089213,0.143466,0.163894,0.296103
...,...,...,...,...,...,...,...,...,...
1031,Red worked on new material for Until We Have F...,What is the album about?,novel,"['what happened in 2010?', 'when was the album...",0.013575,0.083948,0.136761,0.157374,0.104167
1032,"Red started work on their second album, Innoce...",did he tour with any other bands?,[CLS],"['When was Innocence & Instinct released?', 'W...",0.035088,0.135152,0.187379,0.199841,0.339209
1033,"Before 1948, most Palestinian artists were sel...",What was the inspiration for the designs?,[CLS],['Are there any other interesting aspects abou...,0.022727,0.109465,0.163490,0.180504,0.044643
1034,"Reubens was born Paul Rubenfeld in Peekskill, ...",,[CLS],"[""What is Pee wee's playhouse?"", 'when did the...",0.000000,0.000000,0.000000,0.000000,0.000000


In [203]:
quac_squad_gen.to_csv('../QA/results/quac_squad_gen_metrics.csv')