In [None]:

!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5116  100  5116    0     0  24132      0 --:--:-- --:--:-- --:--:-- 24018
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-nightly ...
Collecting cloud-tpu-client
  Downloading https://files.pythonhosted.org/packages/56/9f/7b1958c2886db06feb5de5b2c191096f9e619914b6c31fdf93999fdbbd8b/cloud_tpu_client-0.10-py3-none-any.whl
Collecting google-api-python-client==1.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/9a/b4/a955f393b838bc47cbb6ae4643b9d0f90333d3b4db4dc1e819f36aad18cc/google_api_python_client-1.8.0-py3-none-any.whl (57kB)
[K     |████████████████████████████████| 61kB 3.5MB/s 
Uninstalling torch-1.7.0+cu101:
Installing collected packages: google-api-python-client, cloud-tpu-client
  Found existing installation: google-api-python-client 1.7.12
    Uninstalling google-api-python-c

# Installing Libraries and Downloading Datasets

In [None]:
! pip install -q datasets transformers tokenizers
from datasets import load_dataset, load_metric

[K     |████████████████████████████████| 163kB 5.0MB/s 
[K     |████████████████████████████████| 1.5MB 28.1MB/s 
[K     |████████████████████████████████| 2.9MB 52.9MB/s 
[K     |████████████████████████████████| 245kB 52.0MB/s 
[K     |████████████████████████████████| 17.7MB 240kB/s 
[K     |████████████████████████████████| 890kB 38.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
!mkdir squad

!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

--2020-12-23 15:42:54--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2020-12-23 15:42:54 (159 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2020-12-23 15:42:54--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2020-12-23 15:42:55 (46.9 MB/s) - ‘squad/dev-v2.0.json’ saved [4370528/4370528]



# Reading the datasets

In [None]:
import json
from pathlib import Path

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    One entry is produced per answer, so a question with several reference
    answers is repeated once for each of them, and a question whose
    ``answers`` list is empty contributes no entries at all.

    Args:
        path: Location of the SQuAD JSON file (string or path-like).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equal-length lists.
        ``answers`` holds the answer dicts exactly as loaded from the file.
    """
    with Path(path).open('rb') as handle:
        squad_dict = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                qa_answers = qa['answers']
                # Repeat the context/question once per answer to keep the
                # three lists index-aligned.
                contexts.extend([context] * len(qa_answers))
                questions.extend([question] * len(qa_answers))
                answers.extend(qa_answers)

    return contexts, questions, answers

# Flatten the downloaded train and dev files into parallel
# context / question / answer lists (one entry per answer).
train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

# Preprocessing the data for the QA format

In [None]:
def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character index to each answer, in place.

    The annotated ``answer_start`` is checked against the context. If the
    answer text is found at the annotated offset, or one or two characters
    earlier, ``answer_start`` / ``answer_end`` are set to the aligned span.
    If no alignment is found, the annotated start is kept and ``answer_end``
    is set to ``answer_start + len(text)`` so the key is always present.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys;
            each dict is mutated.
        contexts: List of context strings, index-aligned with ``answers``.

    Returns:
        None. ``answers`` is modified in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Fallback so 'answer_end' always exists, even when no alignment
        # below matches; downstream code reads this key unconditionally.
        answer['answer_end'] = end_idx

        # sometimes squad answers are off by a character or two – fix this
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            if shifted_start < 0:
                # A negative start would wrap around to the end of the
                # context instead of meaning "earlier in the string".
                break
            if context[shifted_start:end_idx - shift] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                break

# add_end_idx returns nothing: it mutates the answer dicts of both splits in place.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

## Tokenising

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode each (context, question) pair; the context is passed as the first sequence.
# NOTE(review): no explicit max_length is given, so truncation/padding lengths are
# whatever the tokenizer defaults to — confirm this matches the model's limit.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




## Adding Token Position

In [None]:
def add_token_positions(encodings, answers):
    """Attach token-level answer spans to ``encodings``, in place.

    For every answer, the character offsets ``answer_start`` and
    ``answer_end - 1`` (the last answer character) are mapped to token
    indices through ``encodings.char_to_token``. A ``None`` result is
    replaced by ``tokenizer.model_max_length``, read from the module-level
    ``tokenizer``.

    Args:
        encodings: Object offering ``char_to_token(batch_index, char_index)``
            and a dict-style ``update``.
        answers: Sequence of dicts carrying ``'answer_start'`` and
            ``'answer_end'``, index-aligned with the encoded examples.

    Returns:
        None. ``'start_positions'`` and ``'end_positions'`` are added to
        ``encodings``.
    """
    def _token_index(example_idx, char_idx):
        token_idx = encodings.char_to_token(example_idx, char_idx)
        # if None, the answer passage has been truncated
        return tokenizer.model_max_length if token_idx is None else token_idx

    start_positions, end_positions = [], []
    for example_idx, answer in enumerate(answers):
        start_positions.append(_token_index(example_idx, answer['answer_start']))
        end_positions.append(_token_index(example_idx, answer['answer_end'] - 1))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Adds 'start_positions' / 'end_positions' entries to both encodings in place.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

## Converting the encodings to a PyTorch dataset

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenizer encodings as per-example tensors.

    ``encodings`` must expose ``items()`` yielding ``(name, column)`` pairs
    whose columns are indexable by example, plus an ``input_ids`` attribute
    whose length defines the dataset size.
    """

    def __init__(self, encodings):
        # Only a reference is stored; tensors are created on access.
        self.encodings = encodings

    def __len__(self):
        # One example per row of input ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        return example


In [None]:
!pip install -q pytorch_lightning

from torch.utils.data import DataLoader
from transformers import AdamW
import pytorch_lightning as pl
from transformers import DistilBertForQuestionAnswering
import torch_xla.core.xla_model as xm

[K     |████████████████████████████████| 675kB 5.4MB/s 
[K     |████████████████████████████████| 102kB 6.7MB/s 
[K     |████████████████████████████████| 829kB 22.3MB/s 
[K     |████████████████████████████████| 276kB 35.6MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone


# Defining the Model in PyTorch Lightning

In [None]:
class QA_BERT(pl.LightningModule):
  """DistilBERT extractive question-answering model as a LightningModule.

  Wraps ``DistilBertForQuestionAnswering`` ("distilbert-base-uncased") and
  builds its own train/validation data loaders from the notebook-level
  ``train_encodings`` / ``val_encodings``.

  Args:
    lr: Learning rate handed to AdamW.
    batch_size: Per-process batch size for both data loaders.
  """

  def __init__(self, lr=0.0005, batch_size=16):
    super().__init__()

    self.model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

    # The attribute name is kept as ``lr``: the notebook's Trainer enables
    # auto_lr_find, and Lightning's LR finder is documented to update an
    # ``lr`` / ``learning_rate`` attribute on the module.
    self.lr = lr
    self.batch_size = batch_size

  def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None):
    """Run the wrapped model.

    ``start_positions`` / ``end_positions`` are optional so the module can
    also be called without labels; the training and validation steps always
    pass them and read the loss from the first element of the output.
    """
    return self.model(input_ids,
                      attention_mask=attention_mask,
                      start_positions=start_positions,
                      end_positions=end_positions
                      )

  def _compute_loss(self, batch):
    """Forward one labelled batch and return its loss (first model output)."""
    # .to(self.device) is a no-op when the batch is already on the device.
    outputs = self(batch['input_ids'].to(self.device),
                   attention_mask=batch['attention_mask'].to(self.device),
                   start_positions=batch['start_positions'].to(self.device),
                   end_positions=batch['end_positions'].to(self.device))
    return outputs[0]

  def training_step(self, batch, batch_idx):
    """Return the training loss for one batch."""
    return self._compute_loss(batch)

  def validation_step(self, batch, batch_idx):
    """Return the validation loss for one batch."""
    return self._compute_loss(batch)

  def configure_optimizers(self):
    """Optimise all parameters with AdamW at ``self.lr``."""
    return AdamW(self.parameters(), lr=self.lr)

  def prepare_data(self, stage=None):
    """Intentionally empty.

    Lightning documents ``prepare_data`` as running in a single process and
    warns against assigning state here, so the datasets are created in
    ``setup`` instead. The method (and its ``stage`` parameter) is kept so
    existing callers do not break.
    """

  def setup(self, stage=None):
    """Create the datasets, distributed samplers and data loaders."""
    # Built here rather than in prepare_data so that every process that
    # runs setup owns the datasets.
    self.train_dataset = SquadDataset(train_encodings)
    self.val_dataset = SquadDataset(val_encodings)

    # Shard the data across the XLA replicas reported by torch_xla.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
            self.train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True
        )

    val_sampler = torch.utils.data.distributed.DistributedSampler(
                self.val_dataset,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal(),
                shuffle=False
            )

    # Shuffling is delegated to the samplers; DataLoader does not accept
    # shuffle=True together with an explicit sampler.
    self.train_loader = DataLoader(self.train_dataset,
                              batch_size=self.batch_size,
                              sampler=train_sampler,
                              )

    self.val_loader = DataLoader(self.val_dataset,
                            batch_size=self.batch_size,
                            sampler=val_sampler,
                            )

  def train_dataloader(self):
    """Return the loader built in ``setup``."""
    return self.train_loader

  def val_dataloader(self):
    """Return the loader built in ``setup``."""
    return self.val_loader

# Initialising and Training the Model

In [None]:
from pytorch_lightning import Trainer
import tensorflow as tf

# Configure the Lightning trainer. The run recorded below reported
# "GPU available: False" and "TPU available: True, using: 8 TPU cores".
trainer = Trainer(max_epochs=3,
                  fast_dev_run=False,
                  gpus=(-1 if torch.cuda.is_available() else 0),
                  auto_lr_find=True,
                  tpu_cores=8,
                  # track_grad_norm = 'inf',
                  # gradient_clip_val =1
                  )

# Instantiated with the class defaults (lr=0.0005).
model = QA_BERT()

# With auto_lr_find=True this runs the learning-rate search; the recorded
# output shows the "Finding best initial lr" progress bars.
trainer.tune(model)

GPU available: False, used: False
TPU available: True, using: 8 TPU cores


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…



In [None]:
trainer.fit(model)