# QA

In [47]:
from pathlib import Path
squad_paths = [str(x) for x in Path('../data/aligned_data_SL/directly').glob('**/*.json')]

for path in squad_paths:
    if "train" in path:
        train_path = path
    else:
        test_path = path
train_path, test_path

('..\\data\\aligned_data_SL\\directly\\train-v2.0_aligned_directly.json',
 '..\\data\\aligned_data_SL\\directly\\dev-v2.0_aligned_directly.json')

In [48]:
def read_squad(data):
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for _, group in data.iterrows():
        for passage in group['data']['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [49]:
from pandas import read_json
from sklearn.model_selection import train_test_split

train = read_json(train_path)
test = read_json(test_path)
test, val = train_test_split(test, test_size=0.5, shuffle=True)
train_contexts, train_questions, train_answers = read_squad(train)
test_contexts, test_questions, test_answers = read_squad(test)
val_contexts, val_questions, val_answers = read_squad(val)

train.shape, test.shape, val.shape

((442, 1), (17, 1), (18, 1))

In [50]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('../data/CroSloEngual_BERT')

In [51]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

In [52]:
from tqdm.auto import tqdm
import numpy as np

def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    invalid = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start = encodings.char_to_token(i, answers[i]['answer_start'])
        end = encodings.char_to_token(i, answers[i]['answer_start'])
        cond = True

        # if start position is None, the answer passage has been truncated
        if start is None:
            start = tokenizer.model_max_length
            invalid.append(i)
            cond = False
        # end position cannot be found, char_to_token found space, so shift one token forward
        if cond:
            go_back = 1
            while end is None:
                end = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
                go_back +=1
            start_positions.append(start)
            end_positions.append(end)
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    keys = ['input_ids', 'token_type_ids', 'attention_mask']
    for key in keys:
        tmp = np.delete(encodings[key], invalid, 0)
        encodings.update({key: tmp})

In [53]:
add_token_positions(train_encodings, train_answers)
add_token_positions(test_encodings, test_answers)
add_token_positions(val_encodings, val_answers)

In [54]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        try:
            a = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        except RuntimeError:
            for k, v in self.encodings.items():
                print(f'{k}: {v}')
            a = None
        return a

    def __len__(self):
        return len(self.encodings.input_ids)

In [55]:
train_dataset = SquadDataset(train_encodings)
test_dataset = SquadDataset(test_encodings)
val_dataset = SquadDataset(val_encodings)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=4)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=4)

In [56]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('../data/CroSloEngual_BERT')

Some weights of the model checkpoint at ../data/CroSloEngual_BERT were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../data/Cro

In [57]:
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(49601, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [58]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)



In [59]:
import numpy as np

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(f"The model has {params} trainable parameters")

The model has 123545858 trainable parameters


In [60]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [61]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [62]:
loss_values, validation_loss_values, val_accuracy = [], [], []
train_step = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_dataloader)
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

        train_step += 1
    
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / train_step
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    ## VALIDATION
    model.eval()
    eval_loss = 0
    eval_accuracy = []
    eval_steps = 0
    loop = tqdm(val_dataloader)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_true,
                            end_positions=end_true)
        
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        loss = outputs[0]
        eval_loss += loss.item()

        eval_accuracy.append(((start_pred == start_true).sum()/len(start_pred)).item())
        eval_accuracy.append(((end_pred == end_true).sum()/len(end_pred)).item())
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

        eval_steps += 1
    
    validation_loss_values.append(eval_loss / eval_steps)
    val_accuracy.append(sum(eval_accuracy)/len(eval_accuracy))
    print("Validation loss: {}".format(validation_loss_values[-1]))
    print("Validation Accuracy: {}".format(val_accuracy[-1]))


  a = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 9366/9366 [1:09:46<00:00,  2.24it/s, loss=1.44]   


Average train loss: 1.5965317958641387


Epoch 0: 100%|██████████| 1125/1125 [02:17<00:00,  8.17it/s, loss=3.28]   


Validation loss: 1.549923556284384
Validation Accuracy: 0.6146666666666667


Epoch 1: 100%|██████████| 9366/9366 [1:11:18<00:00,  2.19it/s, loss=0.273]   


Average train loss: 0.44385107307056465


Epoch 1: 100%|██████████| 1125/1125 [02:45<00:00,  6.78it/s, loss=2.55]    


Validation loss: 1.71110857762639
Validation Accuracy: 0.6432222222222223


Epoch 2: 100%|██████████| 9366/9366 [1:08:43<00:00,  2.27it/s, loss=0.000362]


Average train loss: 0.17934867174392158


Epoch 2: 100%|██████████| 1125/1125 [02:46<00:00,  6.77it/s, loss=2.95]    

Validation loss: 2.362176842510774
Validation Accuracy: 0.6468888888888888





In [63]:
#import os

#os.mkdir('../data/aligned_data_SL/tokenized/results')

model.save_pretrained('../data/aligned_data_SL/directly/results/weights/')

#torch.save(model.state_dict(), '../data/aligned_data_SL/tokenized/results/weights')


In [None]:

import pickle

losses = {'train_loss': loss_values, 'validation_loss': validation_loss_values, 'validation_accuracy': val_accuracy}
with open("../data/aligned_data_SL/directly/results/results", 'wb') as fp:
    pickle.dump(losses, fp)

In [64]:
model2 = BertForQuestionAnswering.from_pretrained('../data/aligned_data_SL/directly/results/weights/')
model2.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(49601, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [65]:
def question_answer(question, text):
    # tokenize question and text as a pair
    input_ids = tokenizer.encode(question, text)
    
    # string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    # segment IDs
    # first occurrence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    # number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    # number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    # list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    
    # model output using input_ids and segment_ids
    output = model(torch.tensor([input_ids]).to(device), token_type_ids=torch.tensor([segment_ids]).to(device))
    
    # reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits) + 1 # needs to be changed down
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1): # here
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
                
    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question."
    
    return "{}".format(answer.capitalize())

In [66]:
text = "Slovenija je imela nadpovprečno visoko gospodarsko rast, zgodovinsko najvišjo zaposlenost in kljub temu podpovprečno inflacijo. Izredno nizko stopnjo inflacije je beležila še prejšnji mesec, vendar so se številke z aprilom usmerile v nasprotno smer, kar pa v ljudeh vzbuja dvom in nezaupanje v prihodnjo vlado, da bo Slovenijo lahko peljala v napredek tako, kot je to uspevalo vladi Janeza Janše."
print(f"{text}\n\n")

questions = ["Čigavi vladi je uspevalo?","Kakšno rast je imela Slovenija?", "Kakšno gospodarsko rast je imela Slovenija?", "Kaj je imela Slovenija?", "Kakšna je bila inflacija?", "Kdaj je belezila izredno nisko stopnjo inflacije?", "Kam so se stevilke usmirile?", "V kom vzbuja dvom?", "Komu ljudje ne zaupajo?"]
for count, question in enumerate(questions):
    predicted_answer = question_answer(question, text)
    print(f"{count + 1}. {question}          {predicted_answer}.\n")

Slovenija je imela nadpovprečno visoko gospodarsko rast, zgodovinsko najvišjo zaposlenost in kljub temu podpovprečno inflacijo. Izredno nizko stopnjo inflacije je beležila še prejšnji mesec, vendar so se številke z aprilom usmerile v nasprotno smer, kar pa v ljudeh vzbuja dvom in nezaupanje v prihodnjo vlado, da bo Slovenijo lahko peljala v napredek tako, kot je to uspevalo vladi Janeza Janše.


1. Čigavi vladi je uspevalo?          Slovenija.

2. Kakšno rast je imela Slovenija?          Nadpov.

3. Kakšno gospodarsko rast je imela Slovenija?          Nadpov.

4. Kaj je imela Slovenija?          Slovenija.

5. Kakšna je bila inflacija?          Slovenija.

6. Kdaj je belezila izredno nisko stopnjo inflacije?          Prejsn.

7. Kam so se stevilke usmirile?          Slovenija.

8. V kom vzbuja dvom?          Dvom ?.

9. Komu ljudje ne zaupajo?          Dvom in.

