
# QA BERT


## Fine-tuned BERT

https://towardsdatascience.com/question-answering-with-a-fine-tuned-bert-bc4dafd45626

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

In [13]:
squad = pd.read_json('..\\data\\dev-v2.0.json')
del squad['version']
squad.head()

Unnamed: 0,data
0,"{'title': 'Normans', 'paragraphs': [{'qas': [{..."
1,"{'title': 'Computational_complexity_theory', '..."
2,"{'title': 'Southern_California', 'paragraphs':..."
3,"{'title': 'Sky_(United_Kingdom)', 'paragraphs'..."
4,"{'title': 'Victoria_(Australia)', 'paragraphs'..."


Data cleaning

In [22]:
# require columns in our dataframe
cols = ['text', 'question', 'answers']

# list of lists to create our dataframe
comp_list = []
i = 0
for _, dset in squad.iterrows():
    for row in dset['data']['paragraphs']:
        for qas in row['qas']:
            temp_list = []
            temp_list.append(row['context'])
            temp_list.append(qas['question'])
            temp_list.append([a['text'] for a in qas['answers']])
            comp_list.append(temp_list)
df = pd.DataFrame(comp_list, columns=cols)

In [23]:
print(f"Number of questions and answers: {len(df)}")
df.head()

Number of questions and answers: 11873


Unnamed: 0,text,question,answers
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[France, France, France, France]"
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[10th and 11th centuries, in the 10th and 11th..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"[Denmark, Iceland and Norway, Denmark, Iceland..."
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,"[Rollo, Rollo, Rollo, Rollo]"
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"[10th century, the first half of the 10th cent..."


Model initialization

In [46]:
model_fine_tuned = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer_fine_tuned = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Asking a random question

In [47]:
rand_n = np.random.randint(0, len(df))

question = df['question'][rand_n]
text = df['text'][rand_n]

Tokenization of the question and text as a pair

In [48]:
input_ids = tokenizer_fine_tuned.encode(question, text)
print(f"The input has a total of {len(input_ids)} tokens.")

tokens = tokenizer_fine_tuned.convert_ids_to_tokens(input_ids)
count = 0
for token, id in zip(tokens, input_ids):
    if count >= 20:
        break
    count += 1
    print(f" {token:15} {id:15,}")

The input has a total of 195 tokens.
 [CLS]                       101
 where                     2,073
 was                       2,001
 francis                   4,557
 he                        2,002
 ##is                      2,483
 ##ler                     3,917
 taken                     2,579
 after                     2,044
 the                       1,996
 protest                   6,186
 ?                         1,029
 [SEP]                       102
 when                      2,043
 the                       1,996
 committee                 2,837
 for                       2,005
 non                       2,512
 -                         1,011
 violent                   6,355


Segment and position embeddings

In [49]:
# first occurrence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print(f"[SEP] token index: {sep_idx}")

# number of tokens in segment A (question)
# this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx + 1
print(f"Numbers of tokens in segment A: {num_seg_a}")

# number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print(f"Numbers of tokens in segment B: {num_seg_b}")

# creating the segment ids
segment_ids = [0] * num_seg_a + [1] * num_seg_b

# making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

[SEP] token index: 12
Numbers of tokens in segment A: 13
Numbers of tokens in segment B: 182


Feeding this to our model

In [50]:
# token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
output = model_fine_tuned(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids])) 

# tokens with highest start and end scores
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)

if answer_end >= answer_start:
    answer = tokens[answer_start]
    for i in range(answer_start + 1, answer_end + 1):
        if tokens[i][0:2] == "##":
            answer += tokens[i][2:]
        else:
            answer += " " + tokens[i]
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")

print(f"\nQuestion:\n{question.capitalize()}")
print(f"\nAnswer:\n{answer.capitalize()}.")


Question:
Where was francis heisler taken after the protest?

Answer:
Tonopah , nevada.


Let us now turn this process into function

In [51]:
def question_answer(question, text):
    # tokenize question and text as a pair
    input_ids = tokenizer_fine_tuned.encode(question, text)
    
    # string version of tokenized ids
    tokens = tokenizer_fine_tuned.convert_ids_to_tokens(input_ids)
    
    # segment IDs
    # first occurrence of [SEP] token
    sep_idx = input_ids.index(tokenizer_fine_tuned.sep_token_id)
    # number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    # number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    # list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    
    # model output using input_ids and segment_ids
    output = model_fine_tuned(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    
    # reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
                
    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question."
    
    print("\nPredicted answer:\n{}".format(answer.capitalize()))

Test model using different text and question (not from our dataset)

In [52]:
text = """New York (CNN) -- More than 80 Michael Jackson collectibles -- including the late pop star's famous rhinestone-studded glove from a 1983 performance -- were auctioned off Saturday, reaping a total $2 million. Profits from the auction at the Hard Rock Cafe in New York's Times Square crushed pre-sale expectations of only $120,000 in sales. The highly prized memorabilia, which included items spanning the many stages of Jackson's career, came from more than 30 fans, associates and family members, who contacted Julien's Auctions to sell their gifts and mementos of the singer. Jackson's flashy glove was the big-ticket item of the night, fetching $420,000 from a buyer in Hong Kong, China. Jackson wore the glove at a 1983 performance during \"Motown 25,\" an NBC special where he debuted his revolutionary moonwalk. Fellow Motown star Walter \"Clyde\" Orange of the Commodores, who also performed in the special 26 years ago, said he asked for Jackson's autograph at the time, but Jackson gave him the glove instead. "The legacy that [Jackson] left behind is bigger than life for me,\" Orange said. \"I hope that through that glove people can see what he was trying to say in his music and what he said in his music.\" Orange said he plans to give a portion of the proceeds to charity. Hoffman Ma, who bought the glove on behalf of Ponte 16 Resort in Macau, paid a 25 percent buyer's premium, which was tacked onto all final sales over $50,000. Winners of items less than $50,000 paid a 20 percent premium."""
question = "Where was the Auction held?"
question_answer(question, text)


Predicted answer:
Hard rock cafe in new york ' s times square


## Tokenizer

https://towardsdatascience.com/how-to-build-a-wordpiece-tokenizer-for-bert-f505d97dddbb

In [53]:
import datasets
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_sl')
dataset = dataset['train']

Downloading builder script: 14.8kB [00:00, 4.92MB/s]                   
Downloading metadata: 3.07MB [00:00, 90.6MB/s]                  


Downloading and preparing dataset oscar/unshuffled_deduplicated_sl (download: 498.98 MiB, generated: 1.22 GiB, post-processed: Unknown size, total: 1.71 GiB) to C:\Users\Nace\.cache\huggingface\datasets\oscar\unshuffled_deduplicated_sl\1.0.0\84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2...


Downloading data: 100%|██████████| 81.0/81.0 [00:00<00:00, 80.9kB/s]
Downloading data: 100%|██████████| 523M/523M [00:42<00:00, 12.4MB/s]
Downloading data files: 100%|██████████| 1/1 [00:43<00:00, 43.26s/it]
                                                                                         

Dataset oscar downloaded and prepared to C:\Users\Nace\.cache\huggingface\datasets\oscar\unshuffled_deduplicated_sl\1.0.0\84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00,  7.19it/s]


Reformating data into simple plaintext files.

In [59]:
from tqdm.auto import tqdm

text_data = []
file_count = 0

for sample in tqdm(dataset):
    # remove newline characters from each sample as we need to use exclusively as seperators
    sample = sample['text'].replace('\n', '\s')
    text_data.append(sample)
    if len(text_data) == 5_000:
        # once we hit the 5K mark, save to file
        with open(f'../data/oscar_sl/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
# after saving in 5K chunks, we may have leftover samples, we save those now too
with open(f'../data/oscar_sl/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))

100%|██████████| 886223/886223 [00:50<00:00, 17505.04it/s]


Training

In [4]:
from pathlib import Path
paths = [str(x) for x in Path('../data/oscar_sl').glob('**/*.txt')]
print(len(paths))
paths[:5]

178


['..\\data\\oscar_sl\\text_0.txt',
 '..\\data\\oscar_sl\\text_1.txt',
 '..\\data\\oscar_sl\\text_10.txt',
 '..\\data\\oscar_sl\\text_100.txt',
 '..\\data\\oscar_sl\\text_101.txt']

In [14]:
from tokenizers import BertWordPieceTokenizer

# initialize
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)
# and train
tokenizer.train(files=paths, vocab_size=50_000, min_frequency=2,
                limit_alphabet=1000, wordpieces_prefix='##',
                special_tokens=['[PAD', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

In [15]:
# save tokenizer
import os

#os.mkdir('../data/bert_sl')
tokenizer.save_model('../data/bert_sl', 'sl')

['../data/bert_sl\\-vocab.txt']

Import tokenizer

In [17]:
tokenizer = BertTokenizer.from_pretrained('../data/bert_sl/sl-vocab.txt')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': [2, 5018, 23901, 16, 2250, 2097, 1954, 16909, 1026, 1948, 36018, 7182, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
with open('../data/bert_sl/sl-vocab.txt', 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

('[CLS]', 'pravih', '!', '##red', '##raj', '?', '[SEP]')

In [23]:
stavek = 'Tukaj lahko uporabnik napiše poljuben stavek v slovenščini.'
tokens = tokenizer(stavek)['input_ids']
for t in tokens:
    print(vocab[t])

[CLS]
dec
##ek
in
deklica
ne
uz
##i
##vata
ob
poslu
##san
##ju
glasbe
in
prepe
##vanju
novih
melodi
##j
,
pa
##c
pa
raje
prisluh
##neta
dobri
knjigi
in
uz
##i
##vata
ob
kozarcu
dobrega
vina
in
prijetni
dru
##zbi
.
[SEP]
