<a href="https://colab.research.google.com/github/dar-tau/nlp-experiments/blob/master/introbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialization

In [None]:
# Install the third-party packages imported by the next cell (transformers, datasets).
!pip install transformers datasets
# !pip install simpletransformers

In [353]:
import re
import torch
from torch.utils.data import Dataset, DataLoader
import datasets
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import json
import os
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import pipeline

In [381]:
def setModelHooks(model):
  """Attach forward hooks that record the argmax of each attention layer's output.

  Every submodule whose dotted name ends in ``layer.<N>.attention`` gets a
  forward hook. Each time such a submodule runs, the hook stores
  ``output[0].argmax(dim = -1)`` in ``model.guyData[N]``.

  Hooks installed by an earlier call (tracked in ``model.guyHooks``) are
  removed first, so calling this function again does not stack hooks.

  Args:
    model: object exposing ``named_modules()`` whose matching submodules
      support ``register_forward_hook`` (e.g. a ``torch.nn.Module``).

  Side effects:
    Resets ``model.guyData`` to an empty dict and sets ``model.guyHooks`` to
    the list of newly registered hook handles.
  """
  # Compiled once; group(2) captures the layer number.
  attentionLayerRegex = re.compile(r'^(.+\.)*layer\.(\d+)\.attention$')

  def _guyAttentionHook(layerNum):
    # Assumes there's only one attention per number
    def _myHook(m, inp, outp):
      # The hook only understands a 1-tuple output; fail loudly otherwise.
      assert isinstance(outp, tuple) and len(outp) == 1, \
          "attention output must be a 1-tuple, got %s" % type(outp).__name__
      model.guyData[layerNum] = outp[0].argmax(dim = -1)

    return _myHook

  if hasattr(model, 'guyHooks'):
    print("Removing existing hooks!")
    for hook in model.guyHooks:
      hook.remove()

  model.guyData = {}
  newHooks = []
  for name, module in model.named_modules():
    # Match once and reuse the result for both filtering and the layer number.
    matched = attentionLayerRegex.match(name)
    if matched is None:
      continue
    layerNum = int(matched.group(2))
    newHooks.append(module.register_forward_hook(_guyAttentionHook(layerNum)))
  model.guyHooks = newHooks


In [373]:
class IntrobertDataset(Dataset):
  """Map-style dataset that wraps another dataset and exposes one derived field.

  Item ``i`` is ``{'context': func(srcDataset[i])}``; the length is that of
  the wrapped dataset.
  """

  def __init__(self, srcDataset, func):
    """Keep the wrapped dataset and the callable applied to each example."""
    self.func = func
    self.ds = srcDataset

  def __len__(self):
    """Return the number of examples in the wrapped dataset."""
    return len(self.ds)

  def __getitem__(self, i):
    """Return example ``i`` as a single-key dict holding ``func``'s result."""
    example = self.ds[i]
    return dict(context = self.func(example))

## Main

In [382]:
# Tokenizer and model are loaded from the same pretrained checkpoint.
checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint, return_dict = True)
# `AdamW` is never imported in this notebook, so the bare name raised NameError
# on a fresh kernel. Use the implementation from the already-imported torch.
# NOTE: torch.optim.AdamW applies weight decay by default (0.01); pass
# `weight_decay` explicitly if a different value is wanted.
optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5)
# Register the attention forward hooks; they fill model.guyData on forward passes.
setModelHooks(model)

In [384]:
# Load the "imdb" dataset and wrap its 'train' split so that every item is
# {'context': <the example's 'text' field>}.
imdb = datasets.load_dataset("imdb")
dataset = IntrobertDataset(
    func = lambda example: example['text'],
    srcDataset = imdb['train'],
)
# Batched loading is currently disabled; the loop below iterates `dataset` directly.
# dataloader = DataLoader(dataset, batch_size = 8, shuffle = True)

Checking /root/.cache/huggingface/datasets/4d2b2997408b65402b80ecde9f2710be3b9edec2632497552299709859efe061.c39acffee84b8d7965ae2e5269ad438ebdb9a40b0607f38a5fdd81b1f8607864.py for additional imports.
Found main folder for dataset https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/imdb/imdb.py at /root/.cache/huggingface/modules/datasets_modules/datasets/imdb
Found specific version folder for dataset https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/imdb/imdb.py at /root/.cache/huggingface/modules/datasets_modules/datasets/imdb/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3
Found script file from https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/imdb/imdb.py to /root/.cache/huggingface/modules/datasets_modules/datasets/imdb/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/imdb.py
Found dataset infos file from https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/imdb/dataset_info

In [None]:
# Forward-only pass over the dataset, one example at a time, for 10 epochs.
# Each forward call triggers the attention hooks registered by setModelHooks.
model.train()
for epoch in range(10):
  for sample in tqdm(dataset):
    model.zero_grad()
    context = sample['context']
    inputs = tokenizer(
        context,
        padding = True,
        truncation = True,
        return_tensors = 'pt',
    )
    outputs = model(**inputs)

    # TODO: no loss is computed yet, so the update step stays disabled.
    # loss.backward()
    # optimizer.step()


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))