# QA Bert

https://towardsdatascience.com/question-answering-with-a-fine-tuned-bert-bc4dafd45626

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

In [13]:
# Load the SQuAD dev set. Forward slashes keep the relative path portable:
# Python resolves them on Windows as well as on Linux/macOS, whereas the
# backslash form only works on Windows.
squad = pd.read_json('../data/dev-v2.0.json')
# Drop the 'version' column without mutating in place, so only 'data' remains.
squad = squad.drop(columns='version')
squad.head()

Unnamed: 0,data
0,"{'title': 'Normans', 'paragraphs': [{'qas': [{..."
1,"{'title': 'Computational_complexity_theory', '..."
2,"{'title': 'Southern_California', 'paragraphs':..."
3,"{'title': 'Sky_(United_Kingdom)', 'paragraphs'..."
4,"{'title': 'Victoria_(Australia)', 'paragraphs'..."


Data cleaning

In [22]:
# required columns in our dataframe
cols = ['text', 'question', 'answers']

# list of lists to create our dataframe: one record per question, holding the
# paragraph context, the question text and the list of its answer strings
comp_list = [
    [paragraph['context'], qa['question'], [ans['text'] for ans in qa['answers']]]
    for article in squad['data']
    for paragraph in article['paragraphs']
    for qa in paragraph['qas']
]
df = pd.DataFrame(comp_list, columns=cols)

In [23]:

# Report how many question/answer records were collected, then preview the frame.
n_records = len(df)
print(f"Number of questions and answers: {n_records}")
df.head()

Number of questions and answers: 11873


Unnamed: 0,text,question,answers
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[France, France, France, France]"
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[10th and 11th centuries, in the 10th and 11th..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"[Denmark, Iceland and Norway, Denmark, Iceland..."
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,"[Rollo, Rollo, Rollo, Rollo]"
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"[10th century, the first half of the 10th cent..."


Model initialization

In [24]:
# Model and tokenizer are loaded from the same checkpoint name, so it is
# written once and shared by both calls.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

Downloading: 100%|██████████| 443/443 [00:00<00:00, 442kB/s]
Downloading: 100%|██████████| 1.25G/1.25G [01:48<00:00, 12.4MB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 661kB/s] 
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 27.9kB/s]


Asking a random question

In [41]:
# Pick a random row label (upper bound is exclusive) and pull out its
# question and context in a single lookup.
rand_n = np.random.randint(0, len(df))

question, text = df.loc[rand_n, ['question', 'text']]

Tokenization of the question and text as a pair

In [42]:
# Encode the (question, text) pair. Only the context (second segment) is
# truncated, so an over-long passage cannot push the input past the number of
# positions the model has embeddings for.
input_ids = tokenizer.encode(
    question,
    text,
    truncation="only_second",
    max_length=model.config.max_position_embeddings,
)
print(f"The input has a total of {len(input_ids)} tokens.")

tokens = tokenizer.convert_ids_to_tokens(input_ids)
# Preview the first 20 token/id pairs. The loop variable is `token_id` rather
# than `id` so the builtin `id` is not rebound for the rest of the session.
for token, token_id in zip(tokens[:20], input_ids[:20]):
    print(f" {token:15} {token_id:15,}")

The input has a total of 128 tokens.
 [CLS]                       101
 who                       2,040
 were                      2,020
 ot                       27,178
 ##achi                   21,046
 ?                         1,029
 [SEP]                       102
 the                       1,996
 physicians               11,572
 of                        1,997
 the                       1,996
 yuan                     11,237
 court                     2,457
 came                      2,234
 from                      2,013
 diverse                   7,578
 cultures                  8,578
 .                         1,012
 healer                   19,783
 ##s                       2,015


Segment and position embeddings

In [36]:
# The first [SEP] token closes the question segment.
sep_idx = input_ids.index(tokenizer.sep_token_id)
print(f"[SEP] token index: {sep_idx}")

# Segment A (question) covers positions 0..sep_idx inclusive, hence the +1.
num_seg_a = sep_idx + 1
print(f"Numbers of tokens in segment A: {num_seg_a}")

# Segment B (text) is every remaining position after that first [SEP].
num_seg_b = len(input_ids) - num_seg_a
print(f"Numbers of tokens in segment B: {num_seg_b}")

# Segment ids: 0 up to and including the first [SEP], 1 for everything after it.
segment_ids = [0 if position <= sep_idx else 1 for position in range(len(input_ids))]

# Sanity check: exactly one segment id per input token.
assert len(segment_ids) == len(input_ids)

[SEP] token index: 11
Numbers of tokens in segment A: 12
Numbers of tokens in segment B: 138


Feeding this in our model

In [40]:
# token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
# Inference only, so autograd bookkeeping is switched off for the forward pass.
with torch.no_grad():
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

# tokens with highest start and end scores (as plain ints for indexing)
answer_start = int(torch.argmax(output.start_logits))
answer_end = int(torch.argmax(output.end_logits))

# `answer` is always assigned, so an invalid span (end before start) can neither
# raise NameError below nor silently reuse a value left over from an earlier run.
answer = None
if answer_end >= answer_start:
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        # "##" marks a word-piece continuation: glue it to the previous piece
        if token[0:2] == "##":
            answer += token[2:]
        else:
            answer += " " + token
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")

print(f"\nQuestion:\n{question.capitalize()}")
if answer is not None:
    print(f"\nAnswer:\n{answer.capitalize()}.")


Question:
What can a contingency plan suffer from?

Answer:
Preventable financial problems.


Let us now turn this process into a function

In [None]:
def question_answer(question, text):
    """Print the answer span the QA model predicts for `question` within `text`.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        question: the question string.
        text: the context passage string in which to look for the answer.

    Returns:
        None. The predicted answer, or a fallback message when no usable span
        is found, is printed.
    """
    # tokenize question and text as a pair; only the context (second segment)
    # is truncated so that a long passage cannot exceed the number of positions
    # the model has embeddings for
    input_ids = tokenizer.encode(
        question,
        text,
        truncation="only_second",
        max_length=model.config.max_position_embeddings,
    )

    # string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # segment IDs: 0 up to and including the first [SEP] (question), 1 after it (text)
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    assert len(segment_ids) == len(input_ids)

    # model output using input_ids and segment_ids; inference only, so autograd is disabled
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # positions with the highest start and end scores (as plain ints for indexing)
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    # Start from the fallback so `answer` is always bound, including when the
    # predicted end precedes the predicted start.
    answer = "Unable to find the answer to your question."
    if answer_end >= answer_start:
        span = tokens[answer_start]
        for token in tokens[answer_start + 1:answer_end + 1]:
            # "##" marks a word-piece continuation: glue it to the previous piece
            if token[0:2] == "##":
                span += token[2:]
            else:
                span += " " + token
        # a span that begins at [CLS] is reported as "no answer"
        if not span.startswith("[CLS]"):
            answer = span

    print("\nPredicted answer:\n{}".format(answer.capitalize()))

Test the model using a different text and question (not from our dataset)

In [None]:
# Context passage and question that are not taken from `df`; the predicted
# answer is printed by question_answer().
text = """New York (CNN) -- More than 80 Michael Jackson collectibles -- including the late pop star's famous rhinestone-studded glove from a 1983 performance -- were auctioned off Saturday, reaping a total $2 million. Profits from the auction at the Hard Rock Cafe in New York's Times Square crushed pre-sale expectations of only $120,000 in sales. The highly prized memorabilia, which included items spanning the many stages of Jackson's career, came from more than 30 fans, associates and family members, who contacted Julien's Auctions to sell their gifts and mementos of the singer. Jackson's flashy glove was the big-ticket item of the night, fetching $420,000 from a buyer in Hong Kong, China. Jackson wore the glove at a 1983 performance during \"Motown 25,\" an NBC special where he debuted his revolutionary moonwalk. Fellow Motown star Walter \"Clyde\" Orange of the Commodores, who also performed in the special 26 years ago, said he asked for Jackson's autograph at the time, but Jackson gave him the glove instead. "The legacy that [Jackson] left behind is bigger than life for me,\" Orange said. \"I hope that through that glove people can see what he was trying to say in his music and what he said in his music.\" Orange said he plans to give a portion of the proceeds to charity. Hoffman Ma, who bought the glove on behalf of Ponte 16 Resort in Macau, paid a 25 percent buyer's premium, which was tacked onto all final sales over $50,000. Winners of items less than $50,000 paid a 20 percent premium."""
question = "Where was the Auction held?"
question_answer(question, text)