
# QA BERT


## Fine-tuned BERT

https://towardsdatascience.com/question-answering-with-a-fine-tuned-bert-bc4dafd45626

In [37]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
from tqdm.auto import tqdm

In [38]:
squad = pd.read_json('../data/dev-v2.0.json')
del squad['version']
squad.head()

Unnamed: 0,data
0,"{'title': 'Normans', 'paragraphs': [{'qas': [{..."
1,"{'title': 'Computational_complexity_theory', '..."
2,"{'title': 'Southern_California', 'paragraphs':..."
3,"{'title': 'Sky_(United_Kingdom)', 'paragraphs'..."
4,"{'title': 'Victoria_(Australia)', 'paragraphs'..."


Data cleaning

In [39]:
def get_dataframe(data):
    # require columns in our dataframe
    cols = ['text', 'question', 'answers']

    # list of lists to create our dataframe
    comp_list = []
    i = 0
    for _, dset in data.iterrows():
        for row in dset['data']['paragraphs']:
            for qas in row['qas']:
                temp_list = []
                temp_list.append(row['context'])
                temp_list.append(qas['question'])
                temp_list.append([a['text'] for a in qas['answers']])
                comp_list.append(temp_list)
    return pd.DataFrame(comp_list, columns=cols)

df = get_dataframe(squad)

In [40]:
print(f"Number of questions and answers: {len(df)}")
df.head()

Number of questions and answers: 11873


Unnamed: 0,text,question,answers
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[France, France, France, France]"
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[10th and 11th centuries, in the 10th and 11th..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"[Denmark, Iceland and Norway, Denmark, Iceland..."
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,"[Rollo, Rollo, Rollo, Rollo]"
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"[10th century, the first half of the 10th cent..."


Model initialization

In [41]:
model_fine_tuned = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer_fine_tuned = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Asking a random question

In [42]:
rand_n = np.random.randint(0, len(df))

question = df['question'][rand_n]
text = df['text'][rand_n]

Tokenization of the question and text as a pair

In [43]:
input_ids = tokenizer_fine_tuned.encode(question, text)
print(f"The input has a total of {len(input_ids)} tokens.")

tokens = tokenizer_fine_tuned.convert_ids_to_tokens(input_ids)
count = 0
for token, id in zip(tokens, input_ids):
    if count >= 20:
        break
    count += 1
    print(f" {token:15} {id:15,}")

The input has a total of 130 tokens.
 [CLS]                       101
 how                       2,129
 many                      2,116
 years                     2,086
 have                      2,031
 imperial                  4,461
 ##istic                   6,553
 practices                 6,078
 existed                   5,839
 ?                         1,029
 [SEP]                       102
 the                       1,996
 age                       2,287
 of                        1,997
 imperialism              28,087
 ,                         1,010
 a                         1,037
 time                      2,051
 period                    2,558
 beginning                 2,927


Segment and position embeddings

In [44]:
# first occurrence of [SEP] token
sep_idx = input_ids.index(tokenizer_fine_tuned.sep_token_id)
print(f"[SEP] token index: {sep_idx}")

# number of tokens in segment A (question)
# this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx + 1
print(f"Numbers of tokens in segment A: {num_seg_a}")

# number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print(f"Numbers of tokens in segment B: {num_seg_b}")

# creating the segment ids
segment_ids = [0] * num_seg_a + [1] * num_seg_b

# making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

[SEP] token index: 10
Numbers of tokens in segment A: 11
Numbers of tokens in segment B: 119


Feeding this to our model

In [45]:
# token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
output = model_fine_tuned(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids])) 

# tokens with highest start and end scores
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)

if answer_end >= answer_start:
    answer = tokens[answer_start]
    for i in range(answer_start + 1, answer_end + 1):
        if tokens[i][0:2] == "##":
            answer += tokens[i][2:]
        else:
            answer += " " + tokens[i]
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")

print(f"\nQuestion:\n{question.capitalize()}")
print(f"\nAnswer:\n{answer.capitalize()}.")


Question:
How many years have imperialistic practices existed?

Answer:
Thousands of years.


Let us now turn this process into function

In [46]:
def question_answer(question, text):
    # tokenize question and text as a pair
    input_ids = tokenizer_fine_tuned.encode(question, text)
    
    # string version of tokenized ids
    tokens = tokenizer_fine_tuned.convert_ids_to_tokens(input_ids)
    
    # segment IDs
    # first occurrence of [SEP] token
    sep_idx = input_ids.index(tokenizer_fine_tuned.sep_token_id)
    # number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    # number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    # list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    
    # model output using input_ids and segment_ids
    output = model_fine_tuned(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    
    # reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
                
    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question."
    
    print("\nPredicted answer:\n{}".format(answer.capitalize()))

Test model using different text and question (not from our dataset)

In [47]:
text = """New York (CNN) -- More than 80 Michael Jackson collectibles -- including the late pop star's famous rhinestone-studded glove from a 1983 performance -- were auctioned off Saturday, reaping a total $2 million. Profits from the auction at the Hard Rock Cafe in New York's Times Square crushed pre-sale expectations of only $120,000 in sales. The highly prized memorabilia, which included items spanning the many stages of Jackson's career, came from more than 30 fans, associates and family members, who contacted Julien's Auctions to sell their gifts and mementos of the singer. Jackson's flashy glove was the big-ticket item of the night, fetching $420,000 from a buyer in Hong Kong, China. Jackson wore the glove at a 1983 performance during \"Motown 25,\" an NBC special where he debuted his revolutionary moonwalk. Fellow Motown star Walter \"Clyde\" Orange of the Commodores, who also performed in the special 26 years ago, said he asked for Jackson's autograph at the time, but Jackson gave him the glove instead. "The legacy that [Jackson] left behind is bigger than life for me,\" Orange said. \"I hope that through that glove people can see what he was trying to say in his music and what he said in his music.\" Orange said he plans to give a portion of the proceeds to charity. Hoffman Ma, who bought the glove on behalf of Ponte 16 Resort in Macau, paid a 25 percent buyer's premium, which was tacked onto all final sales over $50,000. Winners of items less than $50,000 paid a 20 percent premium."""
question = "Where was the Auction held?"
question_answer(question, text)


Predicted answer:
Hard rock cafe in new york ' s times square


## BERT Train

https://towardsdatascience.com/how-to-train-bert-for-q-a-in-any-language-63b62c780014

## Tokenizer

https://towardsdatascience.com/how-to-build-a-wordpiece-tokenizer-for-bert-f505d97dddbb

In [48]:
'''import datasets
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_sl')
dataset = dataset['train']'''

"import datasets\ndataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_sl')\ndataset = dataset['train']"

Reformating data into simple plaintext files.

In [49]:
'''

text_data = []
file_count = 0

for sample in tqdm(dataset):
    # remove newline characters from each sample as we need to use exclusively as seperators
    sample = sample['text'].replace('\n', '\s')
    text_data.append(sample)
    if len(text_data) == 5_000:
        # once we hit the 5K mark, save to file
        with open(f'../data/oscar_sl/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
# after saving in 5K chunks, we may have leftover samples, we save those now too
with open(f'../data/oscar_sl/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))'''

"\n\ntext_data = []\nfile_count = 0\n\nfor sample in tqdm(dataset):\n    # remove newline characters from each sample as we need to use exclusively as seperators\n    sample = sample['text'].replace('\n', '\\s')\n    text_data.append(sample)\n    if len(text_data) == 5_000:\n        # once we hit the 5K mark, save to file\n        with open(f'../data/oscar_sl/text_{file_count}.txt', 'w', encoding='utf-8') as fp:\n            fp.write('\n'.join(text_data))\n        text_data = []\n        file_count += 1\n# after saving in 5K chunks, we may have leftover samples, we save those now too\nwith open(f'../data/oscar_sl/text_{file_count}.txt', 'w', encoding='utf-8') as fp:\n    fp.write('\n'.join(text_data))"

Training

In [3]:
from pathlib import Path
paths = [str(x) for x in Path('../data/oscar_sl').glob('**/*.txt')]
print(len(paths))
paths[:5]

178


['..\\data\\oscar_sl\\text_0.txt',
 '..\\data\\oscar_sl\\text_1.txt',
 '..\\data\\oscar_sl\\text_10.txt',
 '..\\data\\oscar_sl\\text_100.txt',
 '..\\data\\oscar_sl\\text_101.txt']

In [51]:
from tokenizers import BertWordPieceTokenizer

# initialize
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)
# and train
tokenizer.train(files=paths, vocab_size=100_000, min_frequency=2,
                limit_alphabet=1000, wordpieces_prefix='##',
                special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

"\n# initialize\ntokenizer = BertWordPieceTokenizer(\n    clean_text=True,\n    handle_chinese_chars=False,\n    strip_accents=False,\n    lowercase=False\n)\n# and train\ntokenizer.train(files=paths, vocab_size=100_000, min_frequency=2,\n                limit_alphabet=1000, wordpieces_prefix='##',\n                special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])"

In [4]:
from transformers import BertTokenizerFast

# initialize
tokenizer = BertTokenizerFast(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)
# and train
tokenizer.train(files=paths, vocab_size=100_000, min_frequency=2,
                limit_alphabet=1000, wordpieces_prefix='##',
                special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

In [52]:
# save tokenizer
'''
import os

os.mkdir('../data/bert_sl')
tokenizer.save_model('../data/bert_sl', 'sl')'''

"\nimport os\n\nos.mkdir('../data/bert_sl')\ntokenizer.save_model('../data/bert_sl', 'sl')"

Import tokenizer

In [53]:
tokenizer = BertTokenizer.from_pretrained('../data/bert_sl/sl-vocab.txt')



In [54]:
with open('../data/bert_sl/sl-vocab.txt', 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

In [55]:
stavek = 'Tukaj lahko uporabnik [MASK] napiše poljuben stavek v [PAD] slovenščini.'
tokens = tokenizer(stavek)['input_ids']
for t in tokens:
    print(str(t) + " " + vocab[t])

2 [CLS]
4084 tukaj
2039 lahko
5407 uporabnik
4 [MASK]
54542 napise
63277 poljuben
18591 stavek
90 v
0 [PAD]
5961 sloven
14307 ##sci
1935 ##ni
18 .
3 [SEP]


## BERT sl

https://towardsdatascience.com/how-to-train-bert-for-q-a-in-any-language-63b62c780014

### BERT pretrain (MLM)

Data prep

In [56]:
import re
from nltk import tokenize, download
"""
text = []
for p in paths[:1]:
    with open(p, 'r',encoding='utf-8') as f:
        tmp = re.split(r'[\n,\s]', f.read())
        if type(tmp) == list:
            for t in tmp:
                tokenize.sent_tokenize(t)
        else:
            text.append(tmp)
"""




"\ntext = []\nfor p in paths[:1]:\n    with open(p, 'r',encoding='utf-8') as f:\n        tmp = re.split(r'[\n,\\s]', f.read())\n        if type(tmp) == list:\n            for t in tmp:\n                tokenize.sent_tokenize(t)\n        else:\n            text.append(tmp)\n"

In [57]:
mlm_data = []
for p in paths[:1]:
    with open(p, 'r',encoding='utf-8') as f:
        for line in f.readlines():
            line = line.replace('\n', '').replace('\\s', ' ').replace('\s', ' ')
            mlm_data.append(line)


In [58]:
from transformers import BertForMaskedLM, AdamW

In [59]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
squad_sl = pd.read_json('./export-test.json')
del squad_sl['version']
squad_sl.head()

Unnamed: 0,data
0,"{'title': 'Normani', 'paragraphs': [{'qas': [{..."
1,"{'title': 'Computational_complexity_theory', '..."
2,"{'title': 'Southern_California', 'paragraphs':..."
3,"{'title': 'Sky_(Združeno kraljestvo)', 'paragr..."
4,"{'title': 'Victoria_(Avstralija)', 'paragraphs..."


In [61]:
def get_data(data):
    # require columns in our dataframe
    cols = ['text', 'question', 'answers']

    # list of lists to create our dataframe
    comp_list = []
    i = 0
    for _, dset in data.iterrows():
        for row in dset['data']['paragraphs']:
            for qas in row['qas']:
                temp_list = []
                temp_list.append(row['context'])
                temp_list.append(qas['question'])
                temp_list.append([(a['answer_start'], a['text']) for a in qas['answers']])
                comp_list.append(temp_list)
    return pd.DataFrame(comp_list, columns=cols)


data = get_data(squad)
data

Unnamed: 0,text,question,answers
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[(159, France), (159, France), (159, France), ..."
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[(94, 10th and 11th centuries), (87, in the 10..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"[(256, Denmark, Iceland and Norway), (256, Den..."
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,"[(308, Rollo), (308, Rollo), (308, Rollo), (30..."
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"[(671, 10th century), (649, the first half of ..."
...,...,...,...
11868,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,"[(665, sthène), (665, sthène), (665, sthène), ..."
11869,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?,[]
11870,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...,[]
11871,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?,[]


In [62]:
inputs = tokenizer(mlm_data, return_tensors='pt', max_length=2048, truncation=True, padding='max_length')

In [63]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs

{'input_ids': tensor([[    2,  2099, 74072,  ...,     0,     0,     0],
        [    2,  2041,  2563,  ...,     0,     0,     0],
        [    2,  2460,  1954,  ...,     0,     0,     0],
        ...,
        [    2, 44868,  1985,  ...,     0,     0,     0],
        [    2,  3678, 15569,  ...,     0,     0,     0],
        [    2,  6935, 53719,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[    2,  2099, 74072,  ...,     0,     0,     0],
        [    2,  2041,  2563,  ...,     0,     0,     0],
        [    2,  2460, 

In [64]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 2) * (inputs.input_ids != 4) * (inputs.input_ids != 0) # we don't want to mask [CLS], [MASK] and [PAD] tokens
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [65]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())


In [66]:
for i in range(mask_arr.shape[0]):
    inputs.input_ids[i, selection[i]] = 4 # 4 == [MASK]

In [67]:
inputs.input_ids

tensor([[    2,  2099, 74072,  ...,     0,     0,     0],
        [    2,  2041,  2563,  ...,     0,     0,     0],
        [    2,  2460,  1954,  ...,     0,     0,     0],
        ...,
        [    2, 44868,  1985,  ...,     0,     0,     0],
        [    2,  3678, 15569,  ...,     0,     0,     0],
        [    2,  6935, 53719,  ...,     0,     0,     0]])

In [68]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [69]:
dataset = Dataset(inputs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

In [87]:
#device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
#device

import argparse

parser = argparse.ArgumentParser(description='PyTorch Example')
parser.add_argument('--disable-cuda', action='store_true',
                    help='Disable CUDA')
args = parser.parse_args()
args.device = None
if not args.disable_cuda and torch.cuda.is_available():
    args.device = torch.device('cuda')
else:
    args.device = torch.device('cpu')

usage: ipykernel_launcher.py [-h] [--disable-cuda]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9008 --control=9006 --hb=9005 --Session.signature_scheme="hmac-sha256" --Session.key=b"1d982e7b-9042-4c94-a50b-82fc5b533d46" --shell=9007 --transport="tcp" --iopub=9009 --f=C:\Users\Nace\AppData\Local\Temp\tmp-1077669wGW0cwJsiQ.json


SystemExit: 2

In [79]:
model.to(device)
model.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
optim = AdamW(model.parameters(), lr=1e-5)

In [None]:
t = torch.tensor([1,2]).to(device)
t.get_device()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
epochs = 2 # if number is large it can overtrain easily

for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = torch.tensor(batch['input_ids'], device=device)
        attention_mask = torch.tensor(batch['attention_mask'], device=device)
        labels = torch.tensor(batch['labels'], device=device)
        print(input_ids.size)
        print(attention_mask.size)
        print(labels.size)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())



  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  input_ids = torch.tensor(batch['input_ids'], device=device)
  attention_mask = torch.tensor(batch['attention_mask'], device=device)
  labels = torch.tensor(batch['labels'], device=device)


<built-in method size of Tensor object at 0x000001AF9181B360>
<built-in method size of Tensor object at 0x000001AF9181B220>
<built-in method size of Tensor object at 0x000001AF9181B310>


  0%|          | 0/5000 [00:02<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.