# QA

In [1]:
from pathlib import Path
squad_paths = [str(x) for x in Path('../data/aligned_data_SL/directly').glob('**/*.json')]

for path in squad_paths:
    if "train" in path:
        train_path = path
    else:
        test_path = path
train_path, test_path

('..\\data\\aligned_data_SL\\directly\\train-v2.0_aligned_directly.json',
 '..\\data\\aligned_data_SL\\directly\\dev-v2.0_aligned_directly.json')

In [2]:
def read_squad(data):
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for _, group in data.iterrows():
        for passage in group['data']['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [3]:
from pandas import read_json
from sklearn.model_selection import train_test_split

train = read_json(train_path)
test = read_json(test_path)
test, val = train_test_split(test, test_size=0.5, shuffle=True)
train_contexts, train_questions, train_answers = read_squad(train)
test_contexts, test_questions, test_answers = read_squad(test)
val_contexts, val_questions, val_answers = read_squad(val)

train.shape, test.shape, val.shape

((442, 1), (17, 1), (18, 1))

In [4]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('../data/CroSloEngual_BERT')

In [5]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

In [6]:
from tqdm.auto import tqdm
import numpy as np

def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    invalid = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start = encodings.char_to_token(i, answers[i]['answer_start'])
        end = encodings.char_to_token(i, answers[i]['answer_start'])
        cond = True

        # if start position is None, the answer passage has been truncated
        if start is None:
            start = tokenizer.model_max_length
            invalid.append(i)
            cond = False
        # end position cannot be found, char_to_token found space, so shift one token forward
        if cond:
            go_back = 1
            while end is None:
                end = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
                go_back +=1
            start_positions.append(start)
            end_positions.append(end)
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    keys = ['input_ids', 'token_type_ids', 'attention_mask']
    for key in keys:
        tmp = np.delete(encodings[key], invalid, 0)
        encodings.update({key: tmp})

In [7]:
add_token_positions(train_encodings, train_answers)
add_token_positions(test_encodings, test_answers)
add_token_positions(val_encodings, val_answers)

In [8]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        try:
            a = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        except RuntimeError:
            for k, v in self.encodings.items():
                print(f'{k}: {v}')
            a = None
        return a

    def __len__(self):
        return len(self.encodings.input_ids)

In [9]:
train_dataset = SquadDataset(train_encodings)
test_dataset = SquadDataset(test_encodings)
val_dataset = SquadDataset(val_encodings)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=4)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=4)

In [10]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('../data/CroSloEngual_BERT')

Some weights of the model checkpoint at ../data/CroSloEngual_BERT were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../data/Cro

In [11]:
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(49601, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [12]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)



In [13]:
import numpy as np

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(f"The model has {params} trainable parameters")

The model has 123545858 trainable parameters


In [14]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [15]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [16]:
loss_values, validation_loss_values, val_accuracy = [], [], []
train_step = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_dataloader)
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

        train_step += 1
    
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / train_step
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    ## VALIDATION
    model.eval()
    eval_loss = 0
    eval_accuracy = []
    eval_steps = 0
    loop = tqdm(val_dataloader)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_true,
                            end_positions=end_true)
        
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        loss = outputs[0]
        eval_loss += loss.item()

        eval_accuracy.append(((start_pred == start_true).sum()/len(start_pred)).item())
        eval_accuracy.append(((end_pred == end_true).sum()/len(end_pred)).item())
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

        eval_steps += 1
    
    validation_loss_values.append(eval_loss / eval_steps)
    val_accuracy.append(sum(eval_accuracy)/len(eval_accuracy))
    print("Validation loss: {}".format(validation_loss_values[-1]))
    print("Validation Accuracy: {}".format(val_accuracy[-1]))


  a = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0:  21%|██        | 1949/9366 [14:23<56:46,  2.18it/s, loss=1.58]    

In [None]:
import os

os.mkdir('../data/aligned_data_SL/results')

torch.save(model.state_dict(), '../data/bert_qa/weights')

import pickle

losses = {'train_loss': loss_values, 'validation_loss': validation_loss_values, 'validation_accuracy': val_accuracy}
with open("results", 'wb') as fp:
    pickle.dump(losses, fp)