In [1]:
import json

# Load the SQuAD dev records. The encoding is passed explicitly so decoding does
# not fall back to the platform's locale default (JSON files are normally UTF-8).
with open('data/squad/dev.json', 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [2]:
from transformers import BertTokenizer, BertForQuestionAnswering

# Checkpoint selection: browse https://huggingface.co/models, filter by the
# Question Answering task and then by BERT. The checkpoint used here is
# deepset/bert-base-cased-squad2:
#   https://huggingface.co/deepset/bert-base-cased-squad2

modelname = 'deepset/bert-base-cased-squad2'

# Tokenizer and QA model are both loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(modelname)
model = BertForQuestionAnswering.from_pretrained(modelname)

In [3]:
# Pipelines reference: https://huggingface.co/transformers/main_classes/pipelines.html

from transformers import pipeline

# Build a 'question-answering' pipeline from the model and tokenizer objects
# loaded earlier. In this notebook it is called with a dict holding 'question'
# and 'context', and the result it returns has 'score', 'start', 'end' and
# 'answer' keys.
qa = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [4]:
squad[:2]

[{'question': 'In what country is Normandy located?',
  'answer': 'France',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'},
 {'question': 'When were the Normans in Normandy?',
  'answer': 'in the 10th and 11th centuries',
  'context': 'The Normans (Norman: Nourmands; French: Normands; La

In [5]:
# Sanity-check the pipeline on the first SQuAD record. The question and context
# are read from `squad` itself instead of a pasted copy of the record's text, so
# this cell always reflects the loaded dataset.
qa({
    'question': squad[0]['question'],
    'context': squad[0]['context']
})

{'score': 0.9995271563529968, 'start': 159, 'end': 166, 'answer': 'France.'}

In [6]:
# Run the QA pipeline over the first five records and keep each predicted span
# next to the dataset's gold answer.
answers = [
    {
        'predicted': qa({
            'question': sample['question'],
            'context': sample['context']
        })['answer'],
        'true': sample['answer']
    }
    for sample in squad[:5]
]

In [7]:
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries',
  'true': 'in the 10th and 11th centuries'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'}]

### Exact Match (EM)

In [8]:
# Strict exact match: 1 when the predicted string equals the gold string
# character for character, otherwise 0.
em = [int(item['predicted'] == item['true']) for item in answers]

In [9]:
sum(em)/len(em)

0.4

In [10]:
import re


def normalize_answer(text):
    """Return `text` lower-cased, without punctuation, with whitespace collapsed.

    Letters and digits of any script are kept (an ASCII-only character class
    would silently delete accented letters and could make different answers
    compare equal). Underscores are removed along with other punctuation.
    Runs of whitespace left behind by removed punctuation are collapsed to a
    single space and leading/trailing whitespace is dropped, so 'Rollo ,' and
    'Rollo' normalize to the same string.
    """
    without_punct = re.sub(r'[^\w\s]|_', '', text.lower())
    return ' '.join(without_punct.split())


# Exact match after normalization: 1 if prediction and gold answer agree, else 0.
em = [
    int(normalize_answer(answer['predicted']) == normalize_answer(answer['true']))
    for answer in answers
]

In [11]:
sum(em)/len(em)

0.8

# ROUGE

In [12]:
from rouge import Rouge

In [13]:
model_out = 'hello to the world'
reference = 'hello world'

In [14]:
rouge = Rouge()

In [15]:
# get_scores(model_out, reference) returns a list with one dict per pair; for each
# of 'rouge-1', 'rouge-2' and 'rouge-l' it reports recall ('r'), precision ('p')
# and F1 ('f').

rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223}}]

In [16]:
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries',
  'true': 'in the 10th and 11th centuries'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'}]

In [17]:
# Split the prediction/gold pairs into two parallel lists: predictions in
# `model_out`, gold answers in `reference`.
model_out = []
reference = []
for entry in answers:
    model_out.append(entry['predicted'])
    reference.append(entry['true'])

model_out

['France.',
 '10th and 11th centuries',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo,']

In [18]:
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001},
  'rouge-2': {'r': 0.6, 'p': 1.0, 'f': 0.7499999953125},
  'rouge-l': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}]

In [19]:
# avg=True collapses the per-pair results into a single dict holding the mean
# recall / precision / F1 for each ROUGE variant
rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'r': 0.7333333333333333, 'p': 0.8, 'f': 0.7599999960400001},
 'rouge-2': {'r': 0.52, 'p': 0.6, 'f': 0.5499999970625},
 'rouge-l': {'r': 0.7333333333333333, 'p': 0.8, 'f': 0.7599999960400001}}

### ROUGE applied to the model's answers

In [20]:
from rouge import Rouge

In [21]:
rouge = Rouge()

In [22]:
from tqdm import tqdm

model_out = []
reference = []

In [23]:
# Reset the accumulators in this cell so that re-running it rebuilds both lists
# from scratch instead of appending a second copy of every prediction.
model_out = []
reference = []

# One pipeline call per question/context pair over the first 50 records;
# tqdm renders a progress bar for the loop.
for pair in tqdm(squad[:50], leave=True):
    ans = qa({
        'question': pair['question'],
        'context': pair['context']
    })
    model_out.append(ans['answer'])   # predicted span text
    reference.append(pair['answer'])  # gold answer from the dataset

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [21:46<00:00, 26.13s/it]


In [24]:
rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'r': 0.4152380952380952,
  'p': 0.45285714285714285,
  'f': 0.409301584803846},
 'rouge-2': {'r': 0.2245, 'p': 0.24696969696969695, 'f': 0.21939626912222163},
 'rouge-l': {'r': 0.4152380952380952,
  'p': 0.45285714285714285,
  'f': 0.409301584803846}}

In [25]:
scores = rouge.get_scores(model_out, reference)

In [27]:
print(model_out[4], ' | ', reference[4], ' | ', scores[4]['rouge-1']['f'])

Rollo,  |  Rollo  |  0.0


In [28]:
import re

# Removes every character that the case-insensitive class [0-9a-z ] does not
# match, i.e. punctuation and symbols; letter case is left as it is.
# NOTE(review): accented letters such as 'é' are removed by this pattern too.
clean = re.compile('(?i)[^0-9a-z ]')

# Both lists are replaced by their cleaned versions under the same names.
model_out = list(map(lambda raw: clean.sub('', raw), model_out))
reference = list(map(lambda raw: clean.sub('', raw), reference))

In [29]:
scores = rouge.get_scores(model_out, reference)

In [31]:
print(model_out[4], ' | ', reference[4], ' | ', scores[4]['rouge-1']['f'])

Rollo  |  Rollo  |  0.999999995


In [32]:
rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'r': 0.5919047619047618,
  'p': 0.5608658008658007,
  'f': 0.5337460286760682},
 'rouge-2': {'r': 0.3145, 'p': 0.3198268398268398, 'f': 0.2943962687784716},
 'rouge-l': {'r': 0.5919047619047618,
  'p': 0.5608658008658007,
  'f': 0.5337460286760682}}