# Installing Libraries and Downloading Datasets

In [1]:
! pip install -q datasets transformers tokenizers
from datasets import load_dataset, load_metric

In [2]:
!mkdir squad

!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

mkdir: cannot create directory ‘squad’: File exists
--2020-12-23 18:09:21--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2020-12-23 18:09:21 (243 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2020-12-23 18:09:21--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2020-12-23 18:09:22 (75.6 MB/s

# Reading the datasets

In [3]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

# preprocessing the data for QA format

In [4]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

## Tokenising

In [5]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

## Adding Token Position

In [6]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

## Converting the tokens to pytorch data loaders

In [7]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)


In [8]:
!pip install -q pytorch_lightning

from torch.utils.data import DataLoader
from transformers import AdamW
import pytorch_lightning as pl
from transformers import DistilBertForQuestionAnswering

# Defining the Model in Pytorch Lightning

In [9]:
class QA_BERT(pl.LightningModule):
  def __init__(self,lr=0.0005):
    super().__init__()

    self.model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

    self.lr= lr

  def forward(self,input_ids,attention_mask,start_positions,end_positions):

    return self.model(input_ids,
                      attention_mask=attention_mask,
                      start_positions=start_positions, 
                      end_positions=end_positions
                      )

  def training_step(self, batch, batch_idx):
    input_ids = batch['input_ids'].to(self.device)
    attention_mask = batch['attention_mask'].to(self.device)
    start_positions = batch['start_positions'].to(self.device)
    end_positions = batch['end_positions'].to(self.device)
    outputs = self(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    return loss

  def validation_step(self,batch,batch_idx):
    input_ids = batch['input_ids'].to(self.device)
    attention_mask = batch['attention_mask'].to(self.device)
    start_positions = batch['start_positions'].to(self.device)
    end_positions = batch['end_positions'].to(self.device)
    outputs = self(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    return loss

  def configure_optimizers(self):
    return AdamW(self.parameters(), lr=self.lr)

  def prepare_data(self,stage=None):
    self.train_dataset = SquadDataset(train_encodings)
    self.val_dataset = SquadDataset(val_encodings)

  def setup(self,stage=None):

    self.train_loader = DataLoader(self.train_dataset,
                              batch_size=16,
                              shuffle= True,
                              )

    self.val_loader = DataLoader(self.val_dataset, 
                            batch_size=16,
                            shuffle =False,
                            )

  def train_dataloader(self):
    return self.train_loader

  def val_dataloader(self):
    return self.val_loader

# Initialising the Model and Training the model

In [10]:
from pytorch_lightning import Trainer
import tensorflow as tf

trainer = Trainer(max_epochs=1,
                  fast_dev_run=False,
                  gpus=(-1 if torch.cuda.is_available() else 0),
                  auto_lr_find=True,
                  )

model = QA_BERT()

trainer.tune(model)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-ba

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

Learning rate set to 0.0002089296130854041


In [None]:
trainer.fit(model)


  | Name  | Type                           | Params
---------------------------------------------------------
0 | model | DistilBertForQuestionAnswering | 66.4 M
---------------------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…