In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering, BertTokenizer, BertForMaskedLM, AdamW
from tqdm.auto import tqdm
from tokenizers import BertWordPieceTokenizer

In [1]:
from pathlib import Path
paths = [str(x) for x in Path('../data/oscar_sl').glob('**/*.txt')]
print(len(paths))
paths[:5]

178


['..\\data\\oscar_sl\\text_0.txt',
 '..\\data\\oscar_sl\\text_1.txt',
 '..\\data\\oscar_sl\\text_10.txt',
 '..\\data\\oscar_sl\\text_100.txt',
 '..\\data\\oscar_sl\\text_101.txt']

In [2]:
tokenizer = BertTokenizer.from_pretrained('../data/bert_sl/sl-vocab.txt')
with open('../data/bert_sl/sl-vocab.txt', 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

stavek = 'Tukaj lahko uporabnik [MASK] napiše poljuben stavek v [PAD] slovenščini.'
tokens = tokenizer(stavek)['input_ids']
for t in tokens:
    print(str(t) + " " + vocab[t])

2 [CLS]
4084 tukaj
2039 lahko
5407 uporabnik
4 [MASK]
54542 napise
63277 poljuben
18591 stavek
90 v
0 [PAD]
5961 sloven
14307 ##sci
1935 ##ni
18 .
3 [SEP]




In [None]:
import re
alphabets= "([A-Za-z])"
lowercase = "[.][ ]([a-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ").replace("\\s"," ").replace("\s"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(lowercase,"<prd> \\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences


In [3]:
max_length = 256


In [None]:

mlm_data = []
for p in paths[:10]:
    with open(p, 'r',encoding='utf-8') as f:
        for lines in f.readlines():
            lines = split_into_sentences(lines)
            for line in lines:
                if len(line.split(" ")) < max_length - 30:
                    if len(tokenizer(line)['input_ids']) <= max_length:
                        mlm_data.append(line)
mlm_data[10:20]


In [None]:
max_len = 0
string = ""
for tmp in mlm_data:
    l = len(tokenizer(tmp)['input_ids'])
    if l > max_len:
        max_len = l
        string = tmp
print(max_len)
print(string)

In [None]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.resize_token_embeddings(len(tokenizer))


In [None]:
inputs = tokenizer(mlm_data[:50000], return_tensors='pt', max_length=max_length, truncation=True, padding='max_length')
inputs['labels'] = inputs.input_ids.detach().clone()
inputs

In [None]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 2) * (inputs.input_ids != 4) * (inputs.input_ids != 0) # we don't want to mask [CLS], [MASK] and [PAD] tokens
mask_arr

In [None]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())
    inputs.input_ids

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

dataset = Dataset(inputs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

In [None]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)


In [None]:
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=1e-5)

In [None]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [None]:
epochs = 2 # if number is large it can overtrain easily

for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = torch.tensor(batch['input_ids'], device=device)
        attention_mask = torch.tensor(batch['attention_mask'], device=device)
        labels = torch.tensor(batch['labels'], device=device)
        #print(input_ids.size())
        #print(attention_mask.size())
        #print(labels.size())

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


In [None]:
import os

os.mkdir('../data/bert_mlm')

torch.save(model.state_dict(), '../data/bert_mlm/weights_pretrain')

### QA

In [2]:
squad = pd.read_json('../data/test2.json')
del squad['version']
squad.head()

Unnamed: 0,data
0,"{'title': 'Normani', 'paragraphs': [{'qas': [{..."
1,"{'title': 'Computational_complexity_theory', '..."
2,"{'title': 'Southern_California', 'paragraphs':..."
3,"{'title': 'Sky_(Združeno kraljestvo)', 'paragr..."
4,"{'title': 'Victoria_(Avstralija)', 'paragraphs..."


In [3]:
def read_squad(data):
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for _, group in data.iterrows():
        for passage in group['data']['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [4]:
train_contexts, train_questions, train_answers = read_squad(squad)

In [5]:
train_answers[:10]

[{'text': 'Francija', 'answer_start': 147, 'answer_end': 155},
 {'text': 'Francija', 'answer_start': 147, 'answer_end': 155},
 {'text': 'Francija', 'answer_start': 147, 'answer_end': 155},
 {'text': 'Francija', 'answer_start': 147, 'answer_end': 155},
 {'text': '10. in 11. stoletje', 'answer_start': 97, 'answer_end': 116},
 {'text': 'v 10. in 11. stoletju', 'answer_start': 95, 'answer_end': 116},
 {'text': '10. in 11. stoletje', 'answer_start': 97, 'answer_end': 116},
 {'text': '10. in 11. stoletje', 'answer_start': 97, 'answer_end': 116},
 {'text': 'Danska, Islandija in Norveška',
  'answer_start': 220,
  'answer_end': 249},
 {'text': 'Danska, Islandija in Norveška',
  'answer_start': 220,
  'answer_end': 249}]

In [31]:
from transformers import BertTokenizerFast

tokenizer_qa = BertTokenizerFast.from_pretrained('../data/CroSloEngual_BERT')
# tokenizer_qa = BertTokenizer.from_pretrained('../data/CroSloEngual_BERT/vocab.txt')


In [104]:
train_encodings = tokenizer_qa(train_contexts, train_questions, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

In [33]:
tokenizer_qa.decode(train_encodings['input_ids'][10])

'[CLS] normani ( norman : nourmands ; francoscina : normandi ; latinscina : normanni ) so bili ljudje, ki so v 10. in 11. stoletju dali ime normandiji, regiji v franciji. bili so potomci nordijskih ( " norman " ) napadalcev in piratov iz danske, islandije in norveske, ki so se pod svojim voditeljem rollom strinjali, da bodo prisegli zvestobo kralju karlu iii. iz zahodne frankovske. skozi generacije asimilacije in mesanja z domacimi frankovskimi in rimsko - gavskimi populacijami so se njihovi potomci postopoma zdruzili s karolinskimi kulturami zahodne frankovske. posebna kulturna in etnicna identiteta normanov se je najprej pojavila v prvi polovici 10. stoletja in se je razvijala v naslednjih stoletjih. [SEP] iz katerih drzav je norveska izvirala? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

We have to convert the character start and end position into token start and end position

In [105]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    invalid = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start = encodings.char_to_token(i, answers[i]['answer_start'])
        end = encodings.char_to_token(i, answers[i]['answer_start'])
        cond = True

        # if start position is None, the answer passage has been truncated
        if start is None:
            start = tokenizer_qa.model_max_length
            invalid.append(i)
            cond = False
        # end position cannot be found, char_to_token found space, so shift one token forward
        if cond:
            go_back = 1
            while end is None:
                end = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
                go_back +=1
            start_positions.append(start)
            end_positions.append(end)
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return invalid

# apply function to our data
invalid = add_token_positions(train_encodings, train_answers)

In [None]:
keys = ['input_ids', 'token_type_ids', 'attention_mask']
for key in keys:
    tmp = np.delete(train_encodings[key], invalid, 0)
    train_encodings.update({key: tmp})
train_encodings

In [107]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        try:
            a = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        except RuntimeError:
            for k, v in self.encodings.items():
                print(f'{k}: {v}')
            a = None
        return a

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training data
train_dataset = SquadDataset(train_encodings)
loader = torch.utils.data.DataLoader(train_dataset, batch_size=4)

In [108]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('../data/CroSloEngual_BERT')

Some weights of the model checkpoint at ../data/CroSloEngual_BERT were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../data/Cro

In [109]:
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)



In [111]:
for epoch in range(3):
    loop = tqdm(loader)
    for batch in loop:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        
        loss = outputs[0]
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  a = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 5006/5006 [37:05<00:00,  2.25it/s, loss=3.04]  
Epoch 1: 100%|██████████| 5006/5006 [36:08<00:00,  2.31it/s, loss=2.41]  
Epoch 2: 100%|██████████| 5006/5006 [32:47<00:00,  2.54it/s, loss=0.518]  


In [112]:
import os

os.mkdir('../data/bert_qa')

torch.save(model.state_dict(), '../data/bert_qa/weights')

In [None]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

model.eval()

In [138]:
def question_answer(question, text):
    # tokenize question and text as a pair
    input_ids = tokenizer_qa.encode(question, text)
    
    # string version of tokenized ids
    tokens = tokenizer_qa.convert_ids_to_tokens(input_ids)
    
    # segment IDs
    # first occurrence of [SEP] token
    sep_idx = input_ids.index(tokenizer_qa.sep_token_id)
    # number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    # number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    # list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    
    # model output using input_ids and segment_ids
    output = model(torch.tensor([input_ids]).to(device), token_type_ids=torch.tensor([segment_ids]).to(device))
    
    # reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits) + 1 # needs to be changed down
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1): # here
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
                
    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question."
    
    return "{}".format(answer.capitalize())

In [142]:
import time

text = "Slovenija je imela nadpovprečno visoko gospodarsko rast, zgodovinsko najvišjo zaposlenost in kljub temu podpovprečno inflacijo. Izredno nizko stopnjo inflacije je beležila še prejšnji mesec, vendar so se številke z aprilom usmerile v nasprotno smer, kar pa v ljudeh vzbuja dvom in nezaupanje v prihodnjo vlado, da bo Slovenijo lahko peljala v napredek tako, kot je to uspevalo vladi Janeza Janše."
print(f"{text}\n\n")

questions = ["Kaksno rast ima Slovenija?", "Kaksno gospodarsko rast je imela Slovenija?", "Kaj je imela Slovenija?", "Kaksna je bila inflacija?", "Kdaj je belezila izredno nisko stopnjo inflacije?", "Kam so se stevilke usmirile?", "V kom vzbuja dvom?", "Komu ljudje ne zaupajo?"]
for count, question in enumerate(questions):
    predicted_answer = question_answer(question, text)
    print(f"{count + 1}. {question}          {predicted_answer}.\n")


Slovenija je imela nadpovprečno visoko gospodarsko rast, zgodovinsko najvišjo zaposlenost in kljub temu podpovprečno inflacijo. Izredno nizko stopnjo inflacije je beležila še prejšnji mesec, vendar so se številke z aprilom usmerile v nasprotno smer, kar pa v ljudeh vzbuja dvom in nezaupanje v prihodnjo vlado, da bo Slovenijo lahko peljala v napredek tako, kot je to uspevalo vladi Janeza Janše.


1. Kaksno rast ima Slovenija?          Visoko gospodarsko.

2. Kaksno gospodarsko rast je imela Slovenija?          Nadpov.

3. Kaj je imela Slovenija?          Imela nad.

4. Kaksna je bila inflacija?          Izredno nizko.

5. Kdaj je belezila izredno nisko stopnjo inflacije?          Prejsn.

6. Kam so se stevilke usmirile?          Z april.

7. V kom vzbuja dvom?          Dvom in.

8. Komu ljudje ne zaupajo?          V ljudeh.

