In [1]:
import torch
import warnings
warnings.filterwarnings("ignore")

There are multiple quantiization methods
[Quantization Methods for Inference](https://huggingface.co/docs/transformers/main/en/quantization/selecting#inference)

Quantization methods are two
1. Data Free Calibration (relying on Weights distribution only without external data). Example bitsandbytes
2. Calibration based (relying on external data for quanziation). Example GPTQ

In [3]:
print(torch.iinfo(torch.int64))
print(torch.iinfo(torch.int32))
print(torch.iinfo(torch.int16))
print(torch.iinfo(torch.int8))
print(torch.iinfo(torch.uint8))

iinfo(min=-9.22337e+18, max=9.22337e+18, dtype=int64)
iinfo(min=-2.14748e+09, max=2.14748e+09, dtype=int32)
iinfo(min=-32768, max=32767, dtype=int16)
iinfo(min=-128, max=127, dtype=int8)
iinfo(min=0, max=255, dtype=uint8)


In [5]:
value = 1/3
format(value, '.20f')
print(value)
print(type(value))

0.3333333333333333
<class 'float'>


In [10]:
tensor_fp_64b = torch.tensor([1/3], dtype=torch.float64)
tensor_fp_32b = torch.tensor([1/3], dtype=torch.float32)
tensor_fp_16b = torch.tensor([1/3], dtype=torch.float16)
tensor_fp_8b = torch.tensor([1/3], dtype=torch.bfloat16)
print("Tensor 64b: ", tensor_fp_64b)
print("Tensor 32b: ", tensor_fp_32b)
print("Tensor 16b: ", tensor_fp_16b)   
print("Tensor 8b: ", tensor_fp_8b)

Tensor 64b:  tensor([0.3333], dtype=torch.float64)
Tensor 32b:  tensor([0.3333])
Tensor 16b:  tensor([0.3333], dtype=torch.float16)
Tensor 8b:  tensor([0.3340], dtype=torch.bfloat16)


In [3]:
from simple_model import SimpleCNNModel

In [None]:
model = SimpleCNNModel()


SimpleCNNModel(
  (convnet_layer1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (convnet_layer2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (maxpool_layer1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (convnet_layer3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (convnet_layer4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (maxpool_layer2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fully_connected_layer1): Linear(in_features=1600, out_features=128, bias=True)
  (relu_layer): ReLU()
  (fully_connected_layer2): Linear(in_features=128, out_features=10, bias=True)
)


In [None]:

for name, param in model.named_parameters():
    print(name, param.dtype, param.device)

convnet_layer1.weight torch.float32 cpu
convnet_layer1.bias torch.float32 cpu
convnet_layer2.weight torch.float32 cpu
convnet_layer2.bias torch.float32 cpu
convnet_layer3.weight torch.float32 cpu
convnet_layer3.bias torch.float32 cpu
convnet_layer4.weight torch.float32 cpu
convnet_layer4.bias torch.float32 cpu
fully_connected_layer1.weight torch.float32 cpu
fully_connected_layer1.bias torch.float32 cpu
fully_connected_layer2.weight torch.float32 cpu
fully_connected_layer2.bias torch.float32 cpu


In [22]:
model_bfp16 = model.to(torch.bfloat16)
for name, param in model_bfp16.named_parameters():
    print(name, param.dtype, param.device)

convnet_layer1.weight torch.bfloat16 cpu
convnet_layer1.bias torch.bfloat16 cpu
convnet_layer2.weight torch.bfloat16 cpu
convnet_layer2.bias torch.bfloat16 cpu
convnet_layer3.weight torch.bfloat16 cpu
convnet_layer3.bias torch.bfloat16 cpu
convnet_layer4.weight torch.bfloat16 cpu
convnet_layer4.bias torch.bfloat16 cpu
fully_connected_layer1.weight torch.bfloat16 cpu
fully_connected_layer1.bias torch.bfloat16 cpu
fully_connected_layer2.weight torch.bfloat16 cpu
fully_connected_layer2.bias torch.bfloat16 cpu
