## Quantize a Hugging Face model

### Original Model

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch 

# Hub checkpoint used for every cell in this notebook.
model_id = "facebook/opt-350m"

# Load the causal-LM weights in bfloat16 so the "original" footprint below
# is measured on a 16-bit model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# Tokenizer loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [8]:
# Footprint of the freshly loaded model, scaled by 1e6 to report it in MB.
original_footprint_mb = model.get_memory_footprint()/1e+6
print(f"Original footprint of the model: {original_footprint_mb} MB")

Original footprint of the model: 662.392832 MB


In [9]:
# Baseline generation with the unmodified model; the same prompt is reused
# after quantization so the two completions can be compared.
prompt = "What are we having for dinner?"
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(pipe(prompt))

[{'generated_text': "What are we having for dinner?\nI'm having a steak and a salad.\nI'm"}]


### Quantize model

In [10]:
from qllm import LinearQuantizer
from qllm.layers import W8A16LL

In [11]:
## Replace and quantize linear layers
# `model` is passed in directly and later cells keep using the same variable.
# NOTE(review): the list presumably names modules to leave untouched -- in the
# module tree printed below, lm_head is still a plain Linear while the other
# linear layers show up as W8A16LL. Confirm against the qllm API.
excluded_modules = ['lm_head']
LinearQuantizer.replace_and_quantize_modules(model, W8A16LL, excluded_modules)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): W8A16LL()
      (project_in): W8A16LL()
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): W8A16LL()
            (v_proj): W8A16LL()
            (q_proj): W8A16LL()
            (out_proj): W8A16LL()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): W8A16LL()
          (fc2): W8A16LL()
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(in_features=512, out_features=50272, bias=False)
)

In [12]:
# Footprint of the model after its linear layers were replaced, scaled by 1e6 to report it in MB.
quantized_footprint_mb = model.get_memory_footprint()/1e+6
print(f"Model footprint after quantization: {quantized_footprint_mb} MB")

Model footprint after quantization: 359.799808 MB


In [13]:
# Rebuild the pipeline around the quantized model and rerun the baseline
# prompt so the completion can be compared with the pre-quantization one.
prompt = "What are we having for dinner?"
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(pipe(prompt))

[{'generated_text': "What are we having for dinner?\nI'm having a steak dinner.\nI'm having a"}]


In [17]:
# Footprints in MB, copied by hand from the printed outputs of the two
# footprint cells above; they must be updated manually if the model or the
# quantization settings change.
original_mb = 662.392832
quantized_mb = 359.799808
print(f"Reduction % : {((original_mb - quantized_mb)/original_mb) * 100}")

Reduction % : 45.68180834420624
