In [1]:
from mingpt.model import GPT
from mingpt.bpe import BPETokenizer
from huggingface_hub import snapshot_download
import torch
import gc
import os
import timm
from transformers import AutoConfig, AutoModel
from transformers.modeling_utils import load_sharded_checkpoint

from accelerate import Accelerator, init_empty_weights, load_checkpoint_in_model
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model, release_memory

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024

In [4]:
with init_empty_weights():
    empty_model = GPT(model_config)

number of parameters: 1557.61M


In [5]:
weights_location = snapshot_download(repo_id='marcsun13/gpt2-xl-linear-sharded')

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [7]:
load_sharded_checkpoint(empty_model, weights_location)



<All keys matched successfully>

In [8]:
def check_model_size(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters: {total_params}")
    
    total_size = sum(p.numel() * p.element_size() for p in model.parameters())
    total_size_mb = total_size / (1024 ** 2)
    total_size_gb = total_size / (1024 ** 3)
    
    print(f"Total model size: {total_size_mb:.2f} MB")
    print(f"Total model size: {total_size_gb:.2f} GB")

In [9]:
check_model_size(empty_model)

Total number of parameters: 1638022400
Total model size: 6248.56 MB
Total model size: 6.10 GB


In [12]:
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold=6)

In [None]:
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map = "auto")

In [None]:
accelerator = Accelerator()

In [10]:
new_weights_location = "quantized_models"

In [None]:
accelerator.save_model(quantized_model, new_weights_location)

In [13]:
quantized_model_from_saved = load_and_quantize_model(empty_model, weights_location=new_weights_location, bnb_quantization_config=bnb_quantization_config, device_map = "auto")

Some weights of the model checkpoint at quantized_models were not used when initializing GPT: {'transformer.h.45.mlp.c_proj.weight_format', 'transformer.h.9.attn.c_proj.weight_format', 'transformer.h.8.attn.c_attn.weight_format', 'transformer.h.5.mlp.c_proj.weight_format', 'transformer.h.10.mlp.c_proj.weight_format', 'transformer.h.33.attn.c_attn.weight_format', 'transformer.h.18.mlp.c_proj.weight_format', 'transformer.h.33.mlp.c_proj.weight_format', 'transformer.h.10.attn.c_attn.weight_format', 'transformer.h.40.attn.c_attn.weight_format', 'transformer.h.30.mlp.c_fc.weight_format', 'transformer.h.42.mlp.c_proj.weight_format', 'transformer.h.10.mlp.c_fc.weight_format', 'transformer.h.24.attn.c_attn.weight_format', 'transformer.h.16.attn.c_proj.weight_format', 'transformer.h.20.attn.c_proj.weight_format', 'transformer.h.32.attn.c_attn.weight_format', 'transformer.h.28.mlp.c_proj.weight_format', 'transformer.h.23.attn.c_attn.weight_format', 'transformer.h.14.attn.c_attn.weight_format', '

In [14]:
check_model_size(quantized_model_from_saved)

Total number of parameters: 1638022400
Total model size: 1718.03 MB
Total model size: 1.68 GB


In [None]:
release_memory(quantized_model_from_saved)

In [None]:
def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

In [None]:
flush()

# Use the Quantized Model

In [15]:
# create tokenizer
prompt = "Hello my name is"
tokenizer = BPETokenizer()
x1 = tokenizer(prompt).to(0)

In [16]:
quantized_model_from_saved.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear8bitLt(in_features=1600, out_features=4800, bias=True)
          (c_proj): Linear8bitLt(in_features=1600, out_features=1600, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear8bitLt(in_features=1600, out_features=6400, bias=True)
          (c_proj): Linear8bitLt(in_features=6400, out_features=1600, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_aff

In [17]:
for parameter in quantized_model_from_saved.parameters():
    print(parameter.device)

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0

In [18]:
outputs = quantized_model_from_saved.generate(x1, max_new_tokens=10, do_sample=False)[0]

In [20]:
print(tokenizer.decode(outputs.cpu().squeeze()))

Hello my name is John Doe, and I'm a big fan of


# Load in 4 bits

In [21]:
model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024

with init_empty_weights():
  empty_model = GPT(model_config)

number of parameters: 1557.61M


In [22]:
# get quantization config
config = BnbQuantizationConfig(load_in_4bit=True,
                               bnb_4bit_compute_dtype=torch.bfloat16,
                               bnb_4bit_use_double_quant=True,
                               bnb_4bit_quant_type="nf4"
                               )

In [23]:
model_4bit = load_and_quantize_model(empty_model,
                                     bnb_quantization_config = config,
                                     weights_location = weights_location,
                                     device_map="auto")

In [24]:
print(model_4bit)

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear4bit(in_features=1600, out_features=4800, bias=True)
          (c_proj): Linear4bit(in_features=1600, out_features=1600, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear4bit(in_features=1600, out_features=6400, bias=True)
          (c_proj): Linear4bit(in_features=6400, out_features=1600, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True

In [25]:
model_4bit.eval()
outputs = model_4bit.generate(x1, max_new_tokens=10, do_sample=False)[0]
print(tokenizer.decode(outputs.cpu().squeeze()))

Hello my name is John. I am a student at the University of


# Train a quantized model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "EleutherAI/gpt-neox-20b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

Downloading shards:   0%|          | 0/46 [00:00<?, ?it/s]

model-00004-of-00046.safetensors:   7%|6         | 63.6M/910M [00:00<?, ?B/s]

model-00005-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00006-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00007-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00008-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00009-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00010-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00011-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00012-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00013-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00014-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00015-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

In [None]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["query_key_value"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)