# Making Transformers Efficient

In [49]:
import pathlib
import tempfile
from time import perf_counter

import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_metric, load_dataset
from transformers import pipeline, TrainingArguments, Trainer, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

In [3]:
# Fine-tuned BERT checkpoint wrapped in a text-classification pipeline; this
# pipeline is what the baseline benchmark further down measures.
ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
pipe = pipeline('text-classification', model=ckpt)

Downloading:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
# Quick sanity check of the pipeline on a single query.
# NOTE(review): "Parise" looks like a typo for "Paris"; it is left unchanged
# because the recorded output below was produced with this exact text.
query = "I'd like to rent a vehicle in Parise and I need a 15 passenger van"
pipe(query)

[{'label': 'car_rental', 'score': 0.5302357077598572}]

In [12]:
# Load the 'plus' configuration of the `clinc_oos` dataset and peek at one test example.
data = load_dataset('clinc_oos', 'plus')
data['test'][42]

Reusing dataset clinc_oos (/root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)


  0%|          | 0/3 [00:00<?, ?it/s]

{'text': 'transfer $100 from my checking to saving account', 'intent': 133}

In [16]:
intents = data['test'].features['intent']


def get_intents(x):
    """Mapper func: convert an intent name into its integer id via ``intents.str2int``."""
    return intents.str2int(x)

In [20]:
# Decode the integer intent of test example 42 back into its name.
intents.int2str(data['test'][42]['intent'])

'transfer'

<hr>

## Baseline Benchmark

In [25]:
class PerformanceBenchmark:
    """Measure size, latency and accuracy of a text-classification pipeline.

    Args:
        pipeline: Callable taking a query string and returning a list whose
            first element is a dict with a 'label' key. It must also expose
            ``.model.state_dict()`` for ``compute_size``.
        dataset: Iterable of examples with 'text' and 'intent' keys.
        optim_type: Key under which ``run`` reports the collected metrics.
    """

    # Single source of truth for the query used when timing the pipeline.
    DEFAULT_QUERY = 'What is the pin number for my account?'

    def __init__(self, pipeline, dataset, optim_type='BERT baseline'):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type
        self.score = load_metric('accuracy')

    def compute_accuracy(self, mapper=None):
        """Run the pipeline over every example and score the predictions.

        Args:
            mapper: Optional callable applied to each predicted label before
                it is compared with the example's 'intent' value. When falsy,
                the raw predicted label is used.

        Returns:
            The dict produced by the accuracy metric's ``compute``.
        """
        preds, labels = [], []
        for example in tqdm(self.dataset):
            pred = self.pipeline(example['text'])[0]['label']
            preds.append(mapper(pred) if mapper else pred)
            labels.append(example['intent'])
        results = self.score.compute(predictions=preds, references=labels)
        print(f'Accuracy on test set: {results["accuracy"]:.3f}')
        return results

    def compute_size(self):
        """Serialize the model's state dict and report the file size in MB.

        The checkpoint is written inside a private temporary directory so an
        existing ``model.pt`` in the working directory is never overwritten or
        deleted, and the file is cleaned up even if saving fails.

        Returns:
            ``{'size_mb': <float>}``.
        """
        state = self.pipeline.model.state_dict()
        with tempfile.TemporaryDirectory() as tmp_dir:
            fp = pathlib.Path(tmp_dir) / 'model.pt'
            torch.save(state, fp)
            mb = fp.stat().st_size / (1024**2)
        print(f'Model size (MB): {mb:.2f}')
        return {'size_mb': mb}

    def time_pipeline(self, query=DEFAULT_QUERY, num_runs=100, num_warmup=10):
        """Time repeated pipeline calls on a single query.

        Args:
            query: Text passed to the pipeline on every call.
            num_runs: Number of timed calls (must be at least 1).
            num_warmup: Number of untimed calls made first so one-off start-up
                costs do not inflate the mean and standard deviation.

        Returns:
            ``{'avg_ms': <float>, 'std_ms': <float>}`` over the timed calls.

        Raises:
            ValueError: If ``num_runs`` is smaller than 1.
        """
        if num_runs < 1:
            raise ValueError('num_runs must be at least 1')
        for _ in range(num_warmup):
            _ = self.pipeline(query)
        latencies = []
        for _ in range(num_runs):
            start = perf_counter()
            _ = self.pipeline(query)
            latencies.append(perf_counter() - start)
        avg_ms = np.mean(latencies) * 1000
        std_ms = np.std(latencies) * 1000
        print(f'Average latency (ms): {avg_ms:.2f} +/- {std_ms:.2f}')
        return {'avg_ms': avg_ms, 'std_ms': std_ms}

    def run(self, query=None, mapper=None):
        """Collect size, latency and accuracy metrics.

        Args:
            query: Text used for the latency measurement; ``None`` selects
                ``DEFAULT_QUERY``.
            mapper: Forwarded to ``compute_accuracy``.

        Returns:
            ``{optim_type: {...}}`` holding the merged metric dicts.
        """
        if query is None:
            query = self.DEFAULT_QUERY
        metrics = {
            self.optim_type: self.compute_size()
        }
        metrics[self.optim_type].update(self.time_pipeline(query))
        metrics[self.optim_type].update(self.compute_accuracy(mapper))
        return metrics

In [26]:
pb = PerformanceBenchmark(pipe, data['test'])
perf = pb.run(mapper=get_intents)

Model size (MB): 418.16
Average latency (ms): 54.52 +/- 17.27


100%|██████████| 5500/5500 [05:30<00:00, 16.64it/s]


Accuracy on test set: 0.867


## Knowledge Distillation

In [42]:
class DistillationTrainArgs(TrainingArguments):
    """``TrainingArguments`` carrying two extra distillation hyperparameters.

    Args:
        alpha: Stored as ``self.alpha``; defaults to 0.5.
        temp: Stored as ``self.temp``; defaults to 2.0.

    All other positional and keyword arguments go to ``TrainingArguments``.
    """

    def __init__(self, *args, alpha=0.5, temp=2.0, **kwargs):
        # Let the base class consume everything it knows about first.
        super().__init__(*args, **kwargs)
        self.alpha, self.temp = alpha, temp

In [43]:
class DistillationTrainer(Trainer):
    """``Trainer`` whose loss mixes the student's own loss with a distillation term.

    Args:
        teacher: Model called with the same inputs as the student to produce
            the target logits. It is only run under ``torch.no_grad()``.

    All other arguments are forwarded to ``Trainer``. ``self.args`` must
    provide ``alpha`` and ``temp``.
    """

    def __init__(self, *args, teacher=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher
        if self.teacher is not None:
            # The teacher is never trained here; keep dropout etc. disabled so
            # its soft targets are deterministic.
            self.teacher.eval()
        self.loss_func = nn.KLDivLoss(reduction='batchmean')

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return ``alpha * student_loss + (1 - alpha) * distillation_loss``.

        This overrides ``Trainer.compute_loss``, the hook the training loop
        calls; under any other name the base-class loss would be used and the
        distillation term silently ignored.

        Args:
            model: The student model.
            inputs: Keyword inputs passed unchanged to student and teacher.
            return_outputs: When true, also return the student's outputs.
            **kwargs: Accepted and ignored so the override keeps working if the
                base ``Trainer`` passes additional keyword arguments.

        Returns:
            The combined loss, or ``(loss, student_outputs)``.
        """
        student_out = model(**inputs)
        student_logits = student_out.logits
        loss_ce = student_out.loss

        with torch.no_grad():
            teacher_out = self.teacher(**inputs)
            teacher_logits = teacher_out.logits

        temp = self.args.temp
        # KLDivLoss expects log-probabilities as input and probabilities as
        # target; the temp ** 2 factor rescales the softened term.
        loss_kd = temp ** 2 * self.loss_func(
            F.log_softmax(student_logits / temp, dim=-1),
            F.softmax(teacher_logits / temp, dim=-1)
        )
        loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
        return (loss, student_out) if return_outputs else loss

    # Backward-compatible alias for the method's previous name.
    calc_loss = compute_loss
        

In [31]:
# Checkpoint the student model is initialised from, and its matching tokenizer.
student_ckpt = 'distilbert-base-uncased'
student_tokenizer = AutoTokenizer.from_pretrained(student_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [35]:
def tokenize_text(batch):
    """Encode the 'text' entries of ``batch`` with the student tokenizer (truncation on)."""
    texts = batch['text']
    encoded = student_tokenizer(texts, truncation=True)
    return encoded

In [36]:
# Tokenize every split in batches and drop the raw 'text' column, then rename
# 'intent' to 'labels'.
# NOTE(review): presumably 'labels' is the column name the model/Trainer
# expects for targets — confirm.
data_enc = data.map(tokenize_text, batched=True, remove_columns=['text'])
data_enc = data_enc.rename_column('intent', 'labels')



  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [38]:
# Inspect one encoded training example.
data_enc['train'][0]

{'labels': 61,
 'input_ids': [101,
  2054,
  3670,
  2052,
  1045,
  2224,
  2000,
  2360,
  1045,
  2293,
  2017,
  2065,
  1045,
  2020,
  2019,
  3059,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [53]:
def get_metrics(outputs):
    """Compute classification accuracy from ``(predictions, labels)``.

    Accuracy is computed directly with numpy, so the metric does not have to
    be reloaded on every evaluation call.

    Args:
        outputs: Pair ``(preds, labels)``; ``preds`` holds per-class scores
            with classes along axis 1, ``labels`` the reference class ids.

    Returns:
        ``{'accuracy': <float>}``: the fraction of rows whose arg-max class
        equals the label.
    """
    preds, labels = outputs
    pred_ids = np.argmax(preds, axis=1)
    accuracy = float(np.mean(pred_ids == np.asarray(labels)))
    return {'accuracy': accuracy}

In [44]:
# Batch size shared by training and evaluation, and the output directory name.
bs = 48
ft_ckpt = 'distilbert-base-uncased-finetuned-clinc'

# NOTE: with alpha=1 the combined loss in DistillationTrainer reduces to the
# student's own loss, because the distillation term is weighted by (1 - alpha) = 0.
train_args = DistillationTrainArgs(
    output_dir=ft_ckpt,
    evaluation_strategy='epoch',
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    alpha=1,
    weight_decay=0.01,
    push_to_hub=False
)

In [45]:
# Reuse the label mappings stored in the baseline model's config so the student
# config can be built with the same ones.
id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

In [47]:
# Number of classes of the dataset's 'intent' feature.
num_labels = intents.num_classes

In [48]:
# Student configuration: the student checkpoint's config, overridden with the
# class count and label mappings defined in the cells above.
student_config = AutoConfig.from_pretrained(
    student_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

In [50]:
def student_init(device=None):
    """Build a fresh student model from ``student_ckpt`` and ``student_config``.

    This function is passed to the trainer as ``model_init``. ``Trainer`` calls
    ``model_init`` either with no argument or with the hyperparameter-search
    trial object, never with a device, so the single positional argument
    cannot be assumed to be a device.

    Args:
        device: Optional target device (``str``, ``int`` index or
            ``torch.device``). Any other value, including ``None`` or a trial
            object, selects CUDA when available and CPU otherwise.

    Returns:
        The sequence-classification model moved to the chosen device.
    """
    if not isinstance(device, (str, int, torch.device)):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return AutoModelForSequenceClassification.from_pretrained(student_ckpt, config=student_config).to(device)

In [51]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [52]:
# Teacher for distillation: the same checkpoint string as `ckpt` (the baseline
# pipeline), loaded here as a bare model and moved to `device`.
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_ckpt, num_labels=num_labels).to(device)

In [56]:
# Release cached, unused GPU memory held by PyTorch before building the trainer.
# NOTE(review): the recorded run of this cell failed with CUDA out of memory
# while PyTorch itself had only ~419 MiB allocated on a 15.9 GiB GPU, which
# suggests another process was holding the GPU — confirm before re-running.
torch.cuda.empty_cache()

# model_init (rather than model=) makes the trainer build the student itself.
distilbert_trainer = DistillationTrainer(
    model_init=student_init,
    teacher=teacher_model,
    args=train_args,
    train_dataset=data_enc['train'],
    eval_dataset=data_enc['validation'],
    compute_metrics=get_metrics,
    tokenizer=student_tokenizer
)

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 15.90 GiB total capacity; 419.17 MiB already allocated; 27.44 MiB free; 472.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF