In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import multiprocessing as mp
from types import SimpleNamespace
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
from torch.nn.parameter import Parameter
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from transformers import get_cosine_schedule_with_warmup
from transformers import AutoConfig, AutoModelForQuestionAnswering

N_CORES = mp.cpu_count()
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# 1. General Settings

In [2]:
cfg = {
    'root': '/kaggle/input/question-and-answering-dataset' + '/',
    'seed': 279,
    'batch_size': 16,
    'epochs': 3,
    'weight_decay': 0.001,
    'learning_rate': 5e-5,
    'warmup_steps': 0.0,
    'max_length': 96,
    'intermediate_dropout': 0.0,
    'padding_quantile': 1.0,
    'device': 'cuda',
    'num_workers': N_CORES,
    'backbone': 'xlm-roberta-base'
}

cfg = SimpleNamespace(**cfg)

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

def add_end_pos(df):
    len_answers = df.answer.apply(len)
    df['answer_end'] = df.answer_start + len_answers
    return df

def add_token_positions(encodings, start_pos, end_pos):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, start_pos[i]))
        end_positions.append(encodings.char_to_token(i, end_pos[i]-1)) 
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length 
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length 
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

@torch.inference_mode()
def run_prediction(context, question):
    inputs = tokenizer.encode_plus(
        question, context, 
        return_tensors='pt'
    ).to(cfg.device)
    
    model.eval()
    with autocast():
        outputs = model(**inputs)
        
    answer_start = torch.argmax(outputs[0])  
    answer_end = torch.argmax(outputs[1]) + 1 
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(
            inputs.input_ids[0][answer_start:answer_end]
        )
    )
    
    print(
        f'- Question: {question}\n'
        f'- Answer: {answer}'
    )

# 2. Load in dataset

In [4]:
train_df = pd.read_json(cfg.root + 'qa_dataset.json')

# 3. Add end position 

In [5]:
train_df = add_end_pos(train_df)

# 4. Tokenization

In [6]:
questions = train_df.question.values.tolist()
contexts = train_df.context.values.tolist()
answers = train_df.answer.values.tolist()
answer_starts = train_df.answer_start.values.tolist()
answer_ends = train_df.answer_end.values.tolist()

In [7]:
tokenizer = AutoTokenizer.from_pretrained(cfg.backbone)

encodings = tokenizer(
    contexts, questions,
    padding='max_length',
    truncation=True,
    max_length=cfg.max_length,
)

add_token_positions(
    encodings, 
    answer_starts, 
    answer_ends
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

# 5. Dataset and DataLoader

In [8]:
class QADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [9]:
dataset = QADataset(encodings)
data_loader = DataLoader(
    dataset=dataset,
    batch_size=cfg.batch_size, 
    drop_last=True,
    shuffle=True,
    num_workers=cfg.num_workers
)

# 6. Model Setups

In [10]:
config = AutoConfig.from_pretrained(cfg.backbone)
config.hidden_dropout_prob = cfg.intermediate_dropout
config.attention_probs_dropout_prob = cfg.intermediate_dropout

set_seed(seed=cfg.seed)
model = AutoModelForQuestionAnswering.from_pretrained(
    cfg.backbone, 
    config=config
)
model.to(cfg.device);

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream tas

In [11]:
optimizer = optim.AdamW(
    model.parameters(), 
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=cfg.warmup_steps,
    num_training_steps=len(data_loader)*cfg.epochs
)

scaler = GradScaler()

# 7. Fine-tuning

In [12]:
for epoch in tqdm(range(cfg.epochs)):
    
    model.train()
    
    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(cfg.device)
        attention_mask = batch['attention_mask'].to(cfg.device)
        start_positions = batch['start_positions'].to(cfg.device)
        end_positions = batch['end_positions'].to(cfg.device)
        
        idx = torch.quantile(
            torch.where(attention_mask==1)[1].float(),
            q=cfg.padding_quantile
        ).long()
        
        attention_mask = attention_mask[:, :idx]
        input_ids = input_ids[:, :idx]

        with autocast(): 
            outputs = model(
                input_ids=input_ids, 
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )
            
            loss = outputs[0]
            
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        
        if not batch_idx % 100:
            print(
                f'Epoch: {epoch + 1}/{cfg.epochs}'
                f' | Batch: {batch_idx}/{len(data_loader)}'
                f' | Loss: {loss:.4f}'
            )

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 1/3 | Batch: 0/388 | Loss: 4.5768
Epoch: 1/3 | Batch: 100/388 | Loss: 0.5002
Epoch: 1/3 | Batch: 200/388 | Loss: 0.6547
Epoch: 1/3 | Batch: 300/388 | Loss: 0.8190
Epoch: 2/3 | Batch: 0/388 | Loss: 0.3760
Epoch: 2/3 | Batch: 100/388 | Loss: 0.2905
Epoch: 2/3 | Batch: 200/388 | Loss: 0.2072
Epoch: 2/3 | Batch: 300/388 | Loss: 0.3333
Epoch: 3/3 | Batch: 0/388 | Loss: 0.0700
Epoch: 3/3 | Batch: 100/388 | Loss: 0.0209
Epoch: 3/3 | Batch: 200/388 | Loss: 0.0321
Epoch: 3/3 | Batch: 300/388 | Loss: 0.0477


# 8. Save model

In [13]:
model.eval()
torch.save(model.state_dict(), 'qa_model.pth')

# Q-A

In [14]:
context_1 = """
Elon Reeve Musk FRS (sinh ngày 28 tháng 6 năm 1971), là một kỹ sư, nhà tài phiệt, 
nhà phát minh, doanh nhân công nghệ và nhà từ thiện người Mỹ gốc Nam Phi. 
"""

context_2 = """
Elon Musk cùng với em trai, Kimbal, đồng sáng lập ra Zip2, 
một công ty phần mềm web và được hãng Compaq mua lại với giá 340 triệu USD vào năm 1999.
"""

In [15]:
question_1 = 'Elon Musk là người nước nào?'
run_prediction(context_1, question_1)

- Question: Elon Musk là người nước nào?
- Answer: Mỹ gốc Nam Phi


In [16]:
question_2 = 'Elon Musk sinh ngày bao nhiêu?'
run_prediction(context_1, question_2)

- Question: Elon Musk sinh ngày bao nhiêu?
- Answer: ngày 28 tháng 6 năm 1971


In [17]:
question_3 = 'Em trai Elon tên là gì?'
run_prediction(context_2, question_3)

- Question: Em trai Elon tên là gì?
- Answer: Kimbal


In [18]:
question_4 = 'Hãng Compaq mua lại Zip2 với giá bao nhiêu?'
run_prediction(context_2, question_4)

- Question: Hãng Compaq mua lại Zip2 với giá bao nhiêu?
- Answer: 340 triệu USD
