# HuggingFace Model Quantization
References:
* https://medium.com/@rakeshrajpurohit/model-quantization-with-hugging-face-transformers-and-bitsandbytes-integration-b4c9983e8996

I was having the problem that every model was saying that it did not have enough memory, so I cleaned up the huggingface cache and changed it to the NAS to leave more space in the machine, and it seems to be working. Just needed to restart the kernel.

In [16]:
!pip install chardet



In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer

import api_tokens
import os

In [18]:
os.environ['HF_DATASETS_CACHE'] = api_tokens.HF_DATASETS_CACHE

In [19]:
def human_readable_size(size_in_bytes):
    # Define the units
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_in_bytes < 1024:
            return f"{size_in_bytes:,.0f} {unit}"
        size_in_bytes /= 1024
    return f"{size_in_bytes:.2f} PB"

#### What is Quantization?
Technique to reduce the precision of numerical values in the model to reduce the memory usage and speed up model execution while maintaining acceptable accuracy.

### Load a model in 4-bit quantization

device_map="auto": https://huggingface.co/docs/accelerate/v0.19.0/en/usage_guides/big_modeling 
* first we use the maximum space available on the GPU(s)
* if we still need space, we store the remaining weights on the CPU
* if there is not enough RAM, we store the remaining weights on the hard drive as memory-mapped tensors

In [20]:
model_name = "bigscience/bloom-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             load_in_4bit=True)

print(f"4-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

4-bit quantization for bigscience/bloom-1b7: 2 GB
OrderedDict([('transformer.word_embeddings', 0), ('lm_head', 0), ('transformer.word_embeddings_layernorm', 0), ('transformer.h.0', 0), ('transformer.h.1', 0), ('transformer.h.2', 0), ('transformer.h.3', 0), ('transformer.h.4', 0), ('transformer.h.5', 0), ('transformer.h.6', 1), ('transformer.h.7', 1), ('transformer.h.8', 1), ('transformer.h.9', 1), ('transformer.h.10', 1), ('transformer.h.11', 1), ('transformer.h.12', 1), ('transformer.h.13', 1), ('transformer.h.14', 1), ('transformer.h.15', 1), ('transformer.h.16', 1), ('transformer.h.17', 1), ('transformer.h.18', 1), ('transformer.h.19', 1), ('transformer.h.20', 1), ('transformer.h.21', 1), ('transformer.h.22', 1), ('transformer.h.23', 1), ('transformer.ln_f', 1)])


In [21]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             load_in_4bit=True)

print(f"4-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.59s/it]


4-bit quantization for meta-llama/Llama-2-7b-chat-hf: 4 GB
OrderedDict([('model.embed_tokens', 0), ('model.layers.0', 0), ('model.layers.1', 0), ('model.layers.2', 0), ('model.layers.3', 0), ('model.layers.4', 0), ('model.layers.5', 0), ('model.layers.6', 0), ('model.layers.7', 0), ('model.layers.8', 0), ('model.layers.9', 0), ('model.layers.10', 0), ('model.layers.11', 0), ('model.layers.12', 0), ('model.layers.13', 1), ('model.layers.14', 1), ('model.layers.15', 1), ('model.layers.16', 1), ('model.layers.17', 1), ('model.layers.18', 1), ('model.layers.19', 1), ('model.layers.20', 1), ('model.layers.21', 1), ('model.layers.22', 1), ('model.layers.23', 1), ('model.layers.24', 1), ('model.layers.25', 1), ('model.layers.26', 1), ('model.layers.27', 1), ('model.layers.28', 1), ('model.layers.29', 1), ('model.layers.30', 1), ('model.layers.31', 1), ('model.norm', 1), ('lm_head', 1)])


In [22]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             load_in_4bit=True)

print(f"4-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.05s/it]


4-bit quantization for mistralai/Mistral-7B-Instruct-v0.1: 4 GB
OrderedDict([('model.embed_tokens', 0), ('model.layers.0', 0), ('model.layers.1', 0), ('model.layers.2', 0), ('model.layers.3', 0), ('model.layers.4', 0), ('model.layers.5', 0), ('model.layers.6', 0), ('model.layers.7', 0), ('model.layers.8', 0), ('model.layers.9', 0), ('model.layers.10', 0), ('model.layers.11', 0), ('model.layers.12', 0), ('model.layers.13', 1), ('model.layers.14', 1), ('model.layers.15', 1), ('model.layers.16', 1), ('model.layers.17', 1), ('model.layers.18', 1), ('model.layers.19', 1), ('model.layers.20', 1), ('model.layers.21', 1), ('model.layers.22', 1), ('model.layers.23', 1), ('model.layers.24', 1), ('model.layers.25', 1), ('model.layers.26', 1), ('model.layers.27', 1), ('model.layers.28', 1), ('model.layers.29', 1), ('model.layers.30', 1), ('model.layers.31', 1), ('model.norm', 1), ('lm_head', 1)])


In [23]:
model_name = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, 
                                                   device_map="auto", 
                                                   load_in_4bit=True)

print(f"4-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
model.safetensors: 100%|██████████| 990M/990M [00:13<00:00, 75.6MB/s] 
generation_config.json: 100%|██████████| 147/147 [00:00<00:00, 67.2kB/s]

4-bit quantization for google/flan-t5-base: 315 MB
OrderedDict([('shared', 0), ('decoder.embed_tokens', 0), ('encoder.embed_tokens', 0), ('encoder.block.0', 0), ('encoder.block.1', 0), ('encoder.block.2', 0), ('encoder.block.3', 0), ('encoder.block.4', 0), ('encoder.block.5', 0), ('encoder.block.6', 1), ('encoder.block.7', 1), ('encoder.block.8', 1), ('encoder.block.9', 1), ('encoder.block.10', 1), ('encoder.block.11', 1), ('encoder.final_layer_norm', 1), ('encoder.dropout', 1), ('decoder.block', 1), ('decoder.final_layer_norm', 1), ('decoder.dropout', 1), ('lm_head', 1)])





In [24]:
model_name = "Trelis/Llama-2-7b-chat-hf-sharded-bf16"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             load_in_4bit=True)

print(f"4-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

tokenizer_config.json: 100%|██████████| 719/719 [00:00<00:00, 281kB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 81.5MB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 30.0MB/s]
special_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 180kB/s]
config.json: 100%|██████████| 658/658 [00:00<00:00, 336kB/s]
pytorch_model.bin.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 9.47MB/s]
pytorch_model-00001-of-00007.bin: 100%|██████████| 1.98G/1.98G [00:28<00:00, 69.4MB/s]
pytorch_model-00002-of-00007.bin: 100%|██████████| 1.99G/1.99G [00:22<00:00, 87.1MB/s]
pytorch_model-00003-of-00007.bin: 100%|██████████| 1.99G/1.99G [00:23<00:00, 83.9MB/s]
pytorch_model-00004-of-00007.bin: 100%|██████████| 1.99G/1.99G [00:23<00:00, 84.3MB/s]
pytorch_model-00005-of-00007.bin: 100%|██████████| 1.93G/1.93G [00:24<00:00, 77.5MB/s]
pytorch_model-00006-of-00007.bin: 100%|██████████| 1.93G/1.93G [00:23<00:00, 81.6MB/s]
pytorch_model-00007-of-00007.bin: 100%|██████████| 1.

4-bit quantization for Trelis/Llama-2-7b-chat-hf-sharded-bf16: 4 GB
OrderedDict([('model.embed_tokens', 0), ('model.layers.0', 0), ('model.layers.1', 0), ('model.layers.2', 0), ('model.layers.3', 0), ('model.layers.4', 0), ('model.layers.5', 0), ('model.layers.6', 0), ('model.layers.7', 0), ('model.layers.8', 0), ('model.layers.9', 0), ('model.layers.10', 0), ('model.layers.11', 0), ('model.layers.12', 0), ('model.layers.13', 1), ('model.layers.14', 1), ('model.layers.15', 1), ('model.layers.16', 1), ('model.layers.17', 1), ('model.layers.18', 1), ('model.layers.19', 1), ('model.layers.20', 1), ('model.layers.21', 1), ('model.layers.22', 1), ('model.layers.23', 1), ('model.layers.24', 1), ('model.layers.25', 1), ('model.layers.26', 1), ('model.layers.27', 1), ('model.layers.28', 1), ('model.layers.29', 1), ('model.layers.30', 1), ('model.layers.31', 1), ('model.norm', 1), ('lm_head', 1)])


### Load a model in 8-bit quantization

In [25]:
model_name = "bigscience/bloom-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", load_in_8bit=True)

print(f"8-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

8-bit quantization for bigscience/bloom-1b7: 2 GB
OrderedDict([('transformer.word_embeddings', 0), ('lm_head', 0), ('transformer.word_embeddings_layernorm', 0), ('transformer.h.0', 0), ('transformer.h.1', 0), ('transformer.h.2', 0), ('transformer.h.3', 0), ('transformer.h.4', 0), ('transformer.h.5', 0), ('transformer.h.6', 0), ('transformer.h.7', 0), ('transformer.h.8', 1), ('transformer.h.9', 1), ('transformer.h.10', 1), ('transformer.h.11', 1), ('transformer.h.12', 1), ('transformer.h.13', 1), ('transformer.h.14', 1), ('transformer.h.15', 1), ('transformer.h.16', 1), ('transformer.h.17', 1), ('transformer.h.18', 1), ('transformer.h.19', 1), ('transformer.h.20', 1), ('transformer.h.21', 1), ('transformer.h.22', 1), ('transformer.h.23', 1), ('transformer.ln_f', 1)])


In [26]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             load_in_4bit=True)

print(f"8-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


8-bit quantization for meta-llama/Llama-2-7b-chat-hf: 4 GB
OrderedDict([('model.embed_tokens', 0), ('model.layers.0', 0), ('model.layers.1', 0), ('model.layers.2', 0), ('model.layers.3', 0), ('model.layers.4', 0), ('model.layers.5', 0), ('model.layers.6', 0), ('model.layers.7', 0), ('model.layers.8', 0), ('model.layers.9', 0), ('model.layers.10', 0), ('model.layers.11', 0), ('model.layers.12', 0), ('model.layers.13', 1), ('model.layers.14', 1), ('model.layers.15', 1), ('model.layers.16', 1), ('model.layers.17', 1), ('model.layers.18', 1), ('model.layers.19', 1), ('model.layers.20', 1), ('model.layers.21', 1), ('model.layers.22', 1), ('model.layers.23', 1), ('model.layers.24', 1), ('model.layers.25', 1), ('model.layers.26', 1), ('model.layers.27', 1), ('model.layers.28', 1), ('model.layers.29', 1), ('model.layers.30', 1), ('model.layers.31', 1), ('model.norm', 1), ('lm_head', 1)])


In [27]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             load_in_4bit=True)

print(f"8-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.05s/it]


8-bit quantization for mistralai/Mistral-7B-Instruct-v0.1: 4 GB
OrderedDict([('model.embed_tokens', 0), ('model.layers.0', 0), ('model.layers.1', 0), ('model.layers.2', 0), ('model.layers.3', 0), ('model.layers.4', 0), ('model.layers.5', 0), ('model.layers.6', 0), ('model.layers.7', 0), ('model.layers.8', 0), ('model.layers.9', 0), ('model.layers.10', 0), ('model.layers.11', 0), ('model.layers.12', 0), ('model.layers.13', 1), ('model.layers.14', 1), ('model.layers.15', 1), ('model.layers.16', 1), ('model.layers.17', 1), ('model.layers.18', 1), ('model.layers.19', 1), ('model.layers.20', 1), ('model.layers.21', 1), ('model.layers.22', 1), ('model.layers.23', 1), ('model.layers.24', 1), ('model.layers.25', 1), ('model.layers.26', 1), ('model.layers.27', 1), ('model.layers.28', 1), ('model.layers.29', 1), ('model.layers.30', 1), ('model.layers.31', 1), ('model.norm', 1), ('lm_head', 1)])


In [28]:
model_name = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, 
                                                   device_map="auto", 
                                                   load_in_4bit=True)

print(f"8-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


8-bit quantization for google/flan-t5-base: 315 MB
OrderedDict([('shared', 0), ('decoder.embed_tokens', 0), ('encoder.embed_tokens', 0), ('encoder.block.0', 0), ('encoder.block.1', 0), ('encoder.block.2', 0), ('encoder.block.3', 0), ('encoder.block.4', 0), ('encoder.block.5', 0), ('encoder.block.6', 1), ('encoder.block.7', 1), ('encoder.block.8', 1), ('encoder.block.9', 1), ('encoder.block.10', 1), ('encoder.block.11', 1), ('encoder.final_layer_norm', 1), ('encoder.dropout', 1), ('decoder.block', 1), ('decoder.final_layer_norm', 1), ('decoder.dropout', 1), ('lm_head', 1)])


In [29]:
model_name = "Trelis/Llama-2-7b-chat-hf-sharded-bf16"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             load_in_4bit=True)

print(f"8-bit quantization for {model_name}: {human_readable_size(model.get_memory_footprint())}")
print(model.hf_device_map)

Loading checkpoint shards: 100%|██████████| 7/7 [00:11<00:00,  1.69s/it]


8-bit quantization for Trelis/Llama-2-7b-chat-hf-sharded-bf16: 4 GB
OrderedDict([('model.embed_tokens', 0), ('model.layers.0', 0), ('model.layers.1', 0), ('model.layers.2', 0), ('model.layers.3', 0), ('model.layers.4', 0), ('model.layers.5', 0), ('model.layers.6', 0), ('model.layers.7', 0), ('model.layers.8', 0), ('model.layers.9', 0), ('model.layers.10', 0), ('model.layers.11', 0), ('model.layers.12', 0), ('model.layers.13', 1), ('model.layers.14', 1), ('model.layers.15', 1), ('model.layers.16', 1), ('model.layers.17', 1), ('model.layers.18', 1), ('model.layers.19', 1), ('model.layers.20', 1), ('model.layers.21', 1), ('model.layers.22', 1), ('model.layers.23', 1), ('model.layers.24', 1), ('model.layers.25', 1), ('model.layers.26', 1), ('model.layers.27', 1), ('model.layers.28', 1), ('model.layers.29', 1), ('model.layers.30', 1), ('model.layers.31', 1), ('model.norm', 1), ('lm_head', 1)])


### Load a model with changed compute data type

# TODO