# Statically Quantize Roberta

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from transformers import glue_compute_metrics
import sklearn
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np

In [41]:
from dynamic_quant_roberta import QuantRobertaForSequenceClassification
from transformers import RobertaForSequenceClassification, AutoTokenizer
from transformers.data.metrics import simple_accuracy

In [42]:
qmodel = QuantRobertaForSequenceClassification.from_pretrained('textattack/roberta-base-MRPC')
model = RobertaForSequenceClassification.from_pretrained('textattack/roberta-base-MRPC')

tokenizer = AutoTokenizer.from_pretrained('textattack/roberta-base-MRPC')

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing QuantRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoin

In [5]:
from datasets import load_dataset
dataset = load_dataset('glue', 'mrpc', split='validation')
# dataset = load_dataset('glue', 'mrpc', split='test')

Reusing dataset glue (/Users/oliver/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [6]:
tokenizer.decode(tokenizer(dataset[0]['sentence1'], dataset[0]['sentence2'])['input_ids'])

'<s>He said the foodservice pie business doesn \'t fit the company\'s long-term growth strategy.</s></s>" The foodservice pie business does not fit our long-term growth strategy.</s>'

In [7]:
def encode(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')

dataset = dataset.map(encode, batched=True)

Loading cached processed dataset at /Users/oliver/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7ed1324c21291143.arrow


In [8]:
dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
dataset

Loading cached processed dataset at /Users/oliver/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9fc668b1e09db634.arrow


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 408
})

In [9]:
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [10]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)

# 

In [11]:
def eval_model(model, dataloader):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()
    preds = None

    for i, batch in enumerate(tqdm(dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            tmp_eval_loss, logits = outputs[:2]
            loss = outputs[0]
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = batch['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, batch['labels'].detach().cpu().numpy(), axis=0)
        if i % 10 == 0:
    #         print(f"loss: {loss}")
            pass

    preds = np.argmax(preds, axis=1)

    print(f'accuracy: {simple_accuracy(preds, out_label_ids)}')

In [15]:
eval_model(model, dataloader)

  0%|                                                                                                                                                                              | 0/13 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [04:05<00:00, 18.88s/it]

accuracy: 0.9117647058823529





In [38]:
eval_model(qmodel, dataloader)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [05:37<00:00, 25.98s/it]

accuracy: 0.6838235294117647





In [149]:
torch.backends.quantized.engine = 'qnnpack'

In [150]:
dynamic_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

In [151]:
eval_model(dynamic_model, dataloader)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [17:12<00:00, 19.12s/it]

accuracy: 0.8684057971014493





## PyTorch dynamic quantization under the hood
How does pytorch determine the min and max of the range?

In [108]:
dynamic_model.roberta.encoder.layer[0].attention.self.query.weight()

tensor([[ 0.0727, -0.0056, -0.0895,  ...,  0.1007,  0.0895, -0.1007],
        [-0.0503,  0.2070,  0.0727,  ...,  0.0671,  0.0615,  0.1286],
        [ 0.0895,  0.0727, -0.0503,  ..., -0.0447, -0.0056,  0.1119],
        ...,
        [-0.1846,  0.0168, -0.0336,  ..., -0.0503,  0.1007, -0.1175],
        [-0.2517,  0.0447,  0.0615,  ...,  0.0727, -0.1063,  0.0112],
        [-0.0503, -0.0839,  0.1007,  ..., -0.1902,  0.0056, -0.0559]],
       size=(768, 768), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.0055933459661901,
       zero_point=0)

In [121]:
model.roberta.encoder.layer[0].attention.self.query.weight.data

tensor([[ 0.0733, -0.0037, -0.0904,  ...,  0.1033,  0.0894, -0.1026],
        [-0.0520,  0.2053,  0.0730,  ...,  0.0648,  0.0631,  0.1287],
        [ 0.0869,  0.0704, -0.0509,  ..., -0.0434, -0.0070,  0.1100],
        ...,
        [-0.1867,  0.0172, -0.0314,  ..., -0.0504,  0.1023, -0.1159],
        [-0.2524,  0.0435,  0.0640,  ...,  0.0703, -0.1036,  0.0117],
        [-0.0512, -0.0864,  0.1022,  ..., -0.1887,  0.0045, -0.0540]])

In [101]:
model.roberta.encoder.layer[0].attention.self.query.weight.data.max()

tensor(0.7132)

In [109]:
model.roberta.encoder.layer[0].attention.self.query.weight.data.min()

tensor(-0.5972)

In [114]:
tmin = -0.5972
tmax = 0.7132

In [113]:
qmax = 127
qmin = -128
symmetric_qmin = -((qmax - qmin) / 2 + 1)
symmetric_qmax = (qmax - qmin) / 2
symmetric_qmin, symmetric_qmax

(-128.5, 127.5)

In [116]:
max_scale = max(abs(tmin / symmetric_qmin), abs(tmax / symmetric_qmax))
max_scale

0.005593725490196078

In [117]:
tmin = max_scale*symmetric_qmin

In [118]:
tmax = max_scale*symmetric_qmax

In [119]:
tmin, tmax

(-0.718793725490196, 0.7132)

In [120]:
(tmax - tmin) / (qmax - qmin)

0.005615661668589004

In [125]:
aq = torch.round(model.roberta.encoder.layer[0].attention.self.query.weight.data / max_scale)
aq * max_scale

tensor([[ 0.0727, -0.0056, -0.0895,  ...,  0.1007,  0.0895, -0.1007],
        [-0.0503,  0.2070,  0.0727,  ...,  0.0671,  0.0615,  0.1287],
        [ 0.0895,  0.0727, -0.0503,  ..., -0.0447, -0.0056,  0.1119],
        ...,
        [-0.1846,  0.0168, -0.0336,  ..., -0.0503,  0.1007, -0.1175],
        [-0.2517,  0.0447,  0.0615,  ...,  0.0727, -0.1063,  0.0112],
        [-0.0503, -0.0839,  0.1007,  ..., -0.1902,  0.0056, -0.0559]])

In [91]:
dynamic_model.roberta.encoder.layer[0].attention.self.query._packed_params[0]

TypeError: 'LinearPackedParams' object is not subscriptable

In [130]:
from dynamic_quant_ops import tensor_quant_scale

In [132]:
aq, scale = tensor_quant_scale(model.roberta.encoder.layer[0].attention.self.query.weight.data)

In [133]:
aq * scale

tensor([[ 0.0727, -0.0056, -0.0895,  ...,  0.1007,  0.0895, -0.1007],
        [-0.0503,  0.2070,  0.0727,  ...,  0.0671,  0.0615,  0.1286],
        [ 0.0895,  0.0727, -0.0503,  ..., -0.0447, -0.0056,  0.1119],
        ...,
        [-0.1846,  0.0168, -0.0336,  ..., -0.0503,  0.1007, -0.1175],
        [-0.2517,  0.0447,  0.0615,  ...,  0.0727, -0.1063,  0.0112],
        [-0.0503, -0.0839,  0.1007,  ..., -0.1902,  0.0056, -0.0559]])

In [138]:
np.percentile(model.roberta.encoder.layer[0].attention.self.query.weight.data.detach().numpy(), 99.9)

0.382255209922794

In [139]:
max(abs(0.7132), abs(-0.5972)) / (2**(8-1)-1)

0.005615748031496063

In [4]:
state_dict = np.load('state_dict.npz')

In [8]:
for key in state_dict.keys():
    print(key)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.query._input_quantizer._amax
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.key._input_quantizer._amax
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.self.value._input_quantizer._amax
bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax
bert.encoder.layer.0.attention.self.qv_b_input_quantizer._amax
bert.encoder.layer.0.attention.self.av_a_input_quantizer._amax
bert.encoder.layer.0.attention.self.av_b_input_quantizer._amax
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attentio

In [20]:
state_dict['bert.encoder.layer.0.attention.self.query._input_quantizer._amax']

array(5.545955, dtype=float32)

In [12]:
state_dict['bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax']

array(6.7064476, dtype=float32)