<a href="https://colab.research.google.com/github/dar-tau/nlp-experiments/blob/master/introbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialization

In [1]:
!pip install transformers datasets
# !pip install simpletransformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 5.1MB/s 
[?25hCollecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/8e/f2/d213673d76ee56d907e462e6c144f1418368d35e6a9221799403116516de/datasets-1.0.1-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 23.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 42.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |███████████████████████

In [2]:
%cd /content
!mkdir data
%cd /content/data
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

/content
/content/data
--2020-09-20 11:14:56--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2020-09-20 11:14:58 (96.8 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]



In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import json
import os

import re
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


import datasets
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import pipeline
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

from transformers.data.processors.squad import SquadV2Processor, squad_convert_examples_to_features

PyTorch version 1.6.0+cu101 available.
TensorFlow version 2.3.0 available.


In [5]:
device = 'cuda'

def torchTokenize(*args):
  return tokenizer(*args, truncation = True,
                       padding = True, return_tensors = 'pt')

def squad_to_introbert(squad_zipped_example_and_features):
  squad_example, squad_features = squad_zipped_example_and_features
  res = {'start_position': squad_features.start_position, 
         'end_position': squad_features.end_position}
  res.update({'context': squad_example.context_text,
              'question': squad_example.question_text})
  # res.update({k: torch.Tensor(as_dict[k], device = device) for k in ['input_ids', 'attention_mask','token_type_ids'] })
  return res

def dictToDevice(d, device):
  d_ = {}
  for k, v in d.items():
    if isinstance(v, torch.Tensor):
      d_[k] = v.to(device)
    else:
      d_[k] = v
  return d_

In [10]:
class IntrobertDataset(Dataset):
  def __init__(self, srcDataset, func, device = device):
    self.ds = srcDataset
    self.func = func
    self.device = device
    self.isModelSet = False

  def setModel(self, model, nLayers, nHeads):
    self.model = model
    self.nLayers = nLayers
    self.nHeads = nHeads
    self.isModelSet = True 

  def __getitem__(self, i):
    assert(self.isModelSet)
    data = self.func(self.ds[i])
    context = data['context']
    start_position = None
    end_position = None
    inputs = None
    introspection = None
    use_original = self.choose_use_original()
    if use_original: 
      start_position = data['start_position']
      end_position = data['end_position']
      question = data['question']
    else:
        chosenLayer = np.random.choice(self.nLayers)
        chosenHead = np.random.choice(self.nHeads)
        question = "what is the most attended word in layer {} head {}?".format(chosenLayer, chosenHead)

        def introspection(model, attentions):
          res = attentions[chosenLayer][:,chosenHead].sum(dim = -2)[:, 1:].argmax()
          res += 1
          return (res, res)
        
    inputs = torchTokenize(context,question)
    inputs = dictToDevice(inputs, self.device)

    return {'context': context, 'inputs': inputs, 'start_position' : start_position, 'end_position': end_position,
            'question': question, 'introspection': introspection, 'use_original': use_original}

  def choose_use_original(self):
    return np.random.choice(2) == 0

  def __len__(self):
    return len(self.ds)

## Main

In [21]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad", output_attentions = True,  
                                                      return_dict = True)
model.to(device)
optimizer = AdamW(model.parameters(), lr = 3e-6)


In [11]:
if 'dataset' not in globals():
  max_seq_length = 384
  doc_stride = 128
  max_query_length = 64
  total = 10000
  squad_examples = SquadV2Processor().get_train_examples("/content/data")[:total]
  squad_features = squad_convert_examples_to_features(squad_examples, tokenizer = tokenizer, 
                                    max_seq_length = max_seq_length,
                                    max_query_length = max_query_length,
                                    doc_stride = doc_stride, is_training = True, return_dataset = None)

dataset = IntrobertDataset(list(zip(squad_examples, squad_features)), squad_to_introbert)
dataset.setModel("distilbert-base-cased-distilled-squad", 6, 12)

In [22]:
from collections import defaultdict
start_positions = defaultdict(int)
was_original = []

n_epochs = 10
num_training_steps = total * n_epochs 
num_warmup_steps = total 
losses = []
 
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda x: x/num_training_steps)
# get_linear_schedule_with_warmup(optimizer,num_warmup_steps, num_training_steps)
# torch.optim.lr_scheduler.OneCycleLR(optimizer, 5e-5, total_steps = n_epochs * total,
#                                                 epochs = n_epochs)


from tqdm import tqdm as simple_tqdm
model.train()

for e in range(n_epochs):
  losses.append([])
  t = simple_tqdm(dataset, total = total, leave = True, position = 0)
  acc_sum1 = 0
  acc_sum2 = 0
  i_ = 0
  for i, data in enumerate(t):
    if i >= total:
      break    

    model.eval()
    inputs = data['inputs']
    introspection = data['introspection']
    use_original = data['use_original']
    was_original.append(int(use_original))
    if not use_original:
      outputs = model(**inputs)
    model.train()
    model.zero_grad()
    if use_original:
      start_position = data['start_position']
      end_position = data['end_position']
    else:
      start_position, end_position = introspection(model, outputs.attentions)
      start_positions[start_position.item()] += 1          
    start_position = torch.Tensor([start_position]).to(device).detach().long()
    end_position = torch.Tensor([end_position]).to(device).detach().long()

    outputs = model(**inputs, start_positions = start_position,
                    end_positions = end_position)
    
    loss = outputs.loss

    losses[e].append(loss.item())

    if not use_original:
      acc_sum1 += int((start_position.item() == outputs.start_logits.argmax().item())) 
      acc_sum2 += int((end_position.item() == outputs.end_logits.argmax().item()))
      i_ += 1
      acc1 = acc_sum1/i_
      acc2 = acc_sum2/i_

    t.set_postfix_str("Loss: {:.2f}, Acc1: {:.2f}, Acc2: {:.2f}".format(loss.item(), acc1, acc2))
    loss.backward()
    optimizer.step()
    scheduler.step()

100%|██████████| 10000/10000 [08:00<00:00, 20.81it/s, Loss: 0.78, Acc1: 0.23, Acc2: 0.18]
100%|██████████| 10000/10000 [08:00<00:00, 20.79it/s, Loss: 1.68, Acc1: 0.37, Acc2: 0.37]
100%|██████████| 10000/10000 [08:03<00:00, 20.70it/s, Loss: 0.30, Acc1: 0.39, Acc2: 0.39]
100%|██████████| 10000/10000 [08:02<00:00, 20.74it/s, Loss: 0.76, Acc1: 0.39, Acc2: 0.39]
100%|██████████| 10000/10000 [08:07<00:00, 20.51it/s, Loss: 0.27, Acc1: 0.39, Acc2: 0.38]
 17%|█▋        | 1742/10000 [01:30<07:32, 18.25it/s, Loss: 0.96, Acc1: 0.38, Acc2: 0.38]

Buffered data was truncated after reaching the output size limit.

## Old

In [None]:
def setModelHooks(model):
  attentionLayerRegex = r'^(.+\.)*layer\.(\d+)\.attention$'
  def _guyAttentionHook(name):
    layerNum = int(re.match(attentionLayerRegex, name).group(2))
    # Assumes there's only one attention per number
    def _myHook(m, inp, outp):
      assert((type(outp) == tuple) and (len(outp) == 1) )

      model.guyData[layerNum] = F.softmax(outp[0], dim = -1)

    return _myHook


  if hasattr(model, 'guyHooks'):
    print("Removing existing hooks!")
    [hook.remove() for hook in model.guyHooks]
  
  model.guyData = {}
  model.guyHooks = [module.register_forward_hook(_guyAttentionHook(name)) for name, module in model.named_modules()
                                                                          if re.match(attentionLayerRegex, name) is not None]
