In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from transformers import BertTokenizerFast, BertForQuestionAnswering
from transformers import AdamW, get_linear_schedule_with_warmup

In [13]:
import pandas as pd
import numpy as np
import json
from pprint import pprint
from tqdm.notebook import tqdm
import os

# Hyperparams

In [3]:
batch_size = 8
epochs = 4
learning_rate = 2e-5
eps = 1e-7

# Load & preprocess data

In [4]:
with open('data/coqa-train-v1.0.json', 'r') as json_file:
    data = json.loads(json_file.read())

In [5]:
formatted_data = []
for d in data['data']:
    for ques, ans in zip(d['questions'], d['answers']):
        formatted_data.append({
            'id': d['id'],
            'question': ques['input_text'],
            'answer': ans,
            'context': d['story'],
            'source': d['source']
        })

In [6]:
df = pd.DataFrame(formatted_data)

# Prepare data for training

In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [8]:
class QADataset(Dataset):
    def __init__(self, df, tokenizer, max_len=400, stride=128):
        super(QADataset, self).__init__()
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len
        self.stride = stride
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        question = self.df['question'].iloc[index]
        ans = self.df['answer'].iloc[index]
        context = self.df['context'].iloc[index]
        
        tokenized_input = self.tokenizer(
            question, context,
            max_length=self.max_len, truncation="only_second",
            return_overflowing_tokens=True, return_offsets_mapping=True,
            stride=self.stride, padding='max_length'
        )
        
        start_position = []
        end_position = []
        for i, offsets in enumerate(tokenized_input['offset_mapping']):
            input_ids = tokenized_input['input_ids'][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            sequence_ids = tokenized_input.sequence_ids(i)

            if ans['span_start'] == 0:
                start_position.append(cls_index)
                end_position.append(cls_index)
            else:
                start_char = ans['span_start']
                end_char = ans['span_end']

                token_start_idx = 0
                while sequence_ids[token_start_idx] != 1:
                    token_start_idx += 1

                token_end_index = len(tokenized_input['input_ids']) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
                if (offsets[token_start_idx][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    # Move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_idx < len(offsets) and offsets[token_start_idx][0] <= start_char:
                        token_start_idx += 1

                    start_position.append(token_start_idx - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    end_position.append(token_end_index + 1)
                else:
                    start_position.append(cls_index)
                    end_position.append(cls_index)
        
        return {
            'input_ids': torch.tensor(tokenized_input['input_ids'][0]),
            'attention_mask': torch.tensor(tokenized_input['attention_mask'][0]),
            'start_positions': torch.tensor(start_position[0]),
            'end_positions': torch.tensor(end_position[0])
        }

In [9]:
train_dataset = QADataset(df, tokenizer)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Model preparation

In [10]:
model = BertForQuestionAnswering.from_pretrained('bert-base-cased')
model.to('cuda');

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

In [11]:
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=epochs * len(train_dataloader)
)

# Model training

In [12]:
total_training_loss = []

for epoch in tqdm(range(epochs)):
    training_loss = 0
    model.train()
    
    for step, train_data in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        input_ids = train_data['input_ids'].to('cuda')
        attention_mask = train_data['attention_mask'].to('cuda')
        start_positions = train_data['start_positions'].to('cuda')
        end_positions = train_data['end_positions'].to('cuda')
        
        # Forward pass
        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        
        loss = outputs[0]
        training_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        
    print('[Training Epoch %d]\nTraining loss: %.3f' %(epoch + 1, training_loss / len(train_dataloader)))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13581.0), HTML(value='')))


[Training Epoch 1]
Training loss: 1.642


HBox(children=(FloatProgress(value=0.0, max=13581.0), HTML(value='')))


[Training Epoch 2]
Training loss: 1.299


HBox(children=(FloatProgress(value=0.0, max=13581.0), HTML(value='')))


[Training Epoch 3]
Training loss: 1.057


HBox(children=(FloatProgress(value=0.0, max=13581.0), HTML(value='')))


[Training Epoch 4]
Training loss: 0.857



In [None]:
if not os.path.exists('weights'):
    os.makedirs('weights')
    
model.save_pretrained