# Statically Quantize RoBERTa

In [1]:
%load_ext autoreload
%autoreload 2

In [68]:
import torch
from transformers import glue_compute_metrics
import sklearn
from sklearn.metrics import f1_score

In [113]:
from static_quant_roberta import QuantRobertaForSequenceClassification

In [114]:
# Load the 'textattack/roberta-base-MRPC' checkpoint into
# QuantRobertaForSequenceClassification (imported from the local
# static_quant_roberta module). The recorded output below reports that the
# checkpoint's roberta.pooler.dense weights are not used by this class.
model = QuantRobertaForSequenceClassification.from_pretrained('textattack/roberta-base-MRPC')

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing QuantRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
from datasets import load_dataset
# Load the 'test' split of the 'mrpc' configuration of the 'glue' dataset.
dataset = load_dataset('glue', 'mrpc', split='test')

Reusing dataset glue (/Users/oliver/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [30]:
# Sanity check: tokenize the first sentence pair and decode the ids back to text
# to see how the two sentences are joined with special tokens.
# NOTE(review): in the visible notebook `tokenizer` is only created in a later
# cell, so this cell raises NameError on a fresh top-to-bottom run.
tokenizer.decode(tokenizer(dataset[0]['sentence1'], dataset[0]['sentence2'])['input_ids'])

"<s>PCCW's chief operating officer, Mike Butcher, and Alex Arena, the chief financial officer, will report directly to Mr So.</s></s>Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So.</s>"

In [14]:
from transformers import AutoTokenizer
# Tokenizer loaded from the 'textattack/roberta-base-MRPC' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('textattack/roberta-base-MRPC')

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [32]:
def encode(examples):
    """Tokenize a batch of sentence pairs.

    Reads the ``sentence1`` and ``sentence2`` entries of ``examples`` and passes
    them to the module-level ``tokenizer`` as a pair, with truncation enabled and
    ``padding='max_length'``. Returns the tokenizer's output unchanged.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    encoded = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding='max_length',
    )
    return encoded

# Apply the tokenizer to the dataset in batches.
dataset = dataset.map(encode, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction


In [42]:
# Copy the 'label' column into a new 'labels' column. Batches are later passed
# to the model as keyword arguments (model(**batch)), so the column name becomes
# the argument name.
dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
# Echo the dataset so its features and row count are displayed.
dataset

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1725
})

In [44]:
# Switch the dataset's output format to torch for the three listed columns
# (input_ids, attention_mask, labels).
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [45]:
# Iterate over the dataset in batches of 32; no shuffle or sampler is specified.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)

# Prepare, calibrate, and convert

In [115]:
# Prepare the model for static quantization in place. The recorded output below
# shows a MinMaxObserver attached as `activation_post_process` on the attention
# Linear layers.
# NOTE(review): no qconfig is assigned anywhere in this notebook; presumably it
# is set inside static_quant_roberta. Confirm.
torch.quantization.prepare(model, inplace=True)

QuantRobertaForSequenceClassification(
  (roberta): QuantRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): QuantRobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(
                in_features=768, out_features=768, bias=True
                (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
              )
              (key): Linear(
                in_features=768, out_features=768, bias=True
                (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
              )
              (value): Linear(
   

In [117]:
from tqdm import tqdm
import numpy as np

# The model is never moved to another device in this notebook and the qnnpack
# quantized engine runs on the CPU, so batches must stay on the CPU too.
# Selecting 'cuda' here would put inputs and model on different devices.
device = 'cpu'

# Number of batches fed through the model in this pass (None = whole dataloader).
# The pass was previously cut off after the first batch by an unconditional break.
CALIBRATION_BATCHES = 1

model.eval()
torch.backends.quantized.engine = 'qnnpack'

# Collect per-batch arrays and join them once at the end instead of re-copying
# the accumulated array with np.append on every iteration.
logit_chunks, label_chunks = [], []

for i, batch in enumerate(tqdm(dataloader)):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        # The batch contains 'labels', so the first two outputs are (loss, logits).
        loss, logits = outputs[:2]
    logit_chunks.append(logits.detach().cpu().numpy())
    label_chunks.append(batch['labels'].detach().cpu().numpy())
    if i % 10 == 0:
        print(f"loss: {loss}")
    if CALIBRATION_BATCHES is not None and i + 1 >= CALIBRATION_BATCHES:
        break

# `preds` holds raw logits, `out_label_ids` the gold labels; both stay None when
# the dataloader yields no batches.
preds = np.concatenate(logit_chunks, axis=0) if logit_chunks else None
out_label_ids = np.concatenate(label_chunks, axis=0) if label_chunks else None
        

  0%|                                                                                                                                                                                | 0/54 [00:00<?, ?it/s]


NotImplementedError: Could not run 'aten::q_scale' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::q_scale' is only available for these backends: [QuantizedCPU, BackendSelect, Python, Named, Conjugate, Negative, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradLazy, AutogradXPU, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, UNKNOWN_TENSOR_TYPE_ID, Autocast, Batched, VmapMode].

QuantizedCPU: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/build/aten/src/ATen/RegisterQuantizedCPU.cpp:1068 [kernel]
BackendSelect: fallthrough registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/core/PythonFallbackKernel.cpp:47 [backend fallback]
Named: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/ConjugateFallback.cpp:18 [backend fallback]
Negative: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ADInplaceOrView: fallthrough registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/core/VariableFallbackKernel.cpp:64 [backend fallback]
AutogradOther: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradCPU: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradCUDA: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradXLA: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradLazy: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradXPU: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradMLC: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradHPU: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradNestedTensor: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradPrivateUse1: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradPrivateUse2: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
AutogradPrivateUse3: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/VariableType_4.cpp:8849 [autograd kernel]
Tracer: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/torch/csrc/autograd/generated/TraceType_4.cpp:9274 [kernel]
UNKNOWN_TENSOR_TYPE_ID: fallthrough registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/autocast_mode.cpp:466 [backend fallback]
Autocast: fallthrough registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/autocast_mode.cpp:305 [backend fallback]
Batched: registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/BatchingRegistrations.cpp:1016 [backend fallback]
VmapMode: fallthrough registered at /Users/runner/miniforge3/conda-bld/pytorch-recipe_1643987874214/work/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]


In [110]:
# Predicted class per example. Stored under its own name so `preds` keeps the raw
# logits and re-running this cell does not apply argmax to an already-reduced
# 1-D array.
pred_labels = np.argmax(preds, axis=1)
# Accuracy = fraction of predictions equal to the gold labels. This is the same
# formula transformers' `simple_accuracy` computes, without importing from its
# deprecated metrics module.
(pred_labels == out_label_ids).mean()

0.90625

In [118]:
# Select the 'qnnpack' quantized engine before converting.
torch.backends.quantized.engine = 'qnnpack'
# Convert the prepared model in place. The recorded output below shows the
# attention Linear layers replaced by QuantizedLinear modules with per-tensor
# affine scale / zero_point values.
torch.quantization.convert(model, inplace=True)



QuantRobertaForSequenceClassification(
  (roberta): QuantRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): QuantRobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): QuantizedLinear(in_features=768, out_features=768, scale=0.21292588114738464, zero_point=65, qscheme=torch.per_tensor_affine)
              (key): QuantizedLinear(in_features=768, out_features=768, scale=0.2001039832830429, zero_point=63, qscheme=torch.per_tensor_affine)
              (value): QuantizedLinear(in_features=768, out_features=768, scale=0.055448874831199646, zero_point=75, qschem

In [119]:
from tqdm import tqdm
import numpy as np

# The model is never moved to another device in this notebook, so batches must
# stay on the CPU as well. Selecting 'cuda' here would put inputs and model on
# different devices.
device = 'cpu'

# Number of batches to evaluate (None = whole dataloader). The pass was
# previously cut off after the first batch by an unconditional break.
EVAL_BATCHES = 1

model.eval()

# Collect per-batch arrays and join them once at the end instead of re-copying
# the accumulated array with np.append on every iteration.
logit_chunks, label_chunks = [], []

for i, batch in enumerate(tqdm(dataloader)):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        # The batch contains 'labels', so the first two outputs are (loss, logits).
        loss, logits = outputs[:2]
    logit_chunks.append(logits.detach().cpu().numpy())
    label_chunks.append(batch['labels'].detach().cpu().numpy())
    if i % 10 == 0:
        print(f"loss: {loss}")
    if EVAL_BATCHES is not None and i + 1 >= EVAL_BATCHES:
        break

# `preds` holds raw logits, `out_label_ids` the gold labels; both stay None when
# the dataloader yields no batches.
preds = np.concatenate(logit_chunks, axis=0) if logit_chunks else None
out_label_ids = np.concatenate(label_chunks, axis=0) if label_chunks else None

  0%|                                                                                                                                                                                | 0/54 [00:00<?, ?it/s]


RuntimeError: expected scalar type QInt8 but found QUInt8