In [1]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GPTQConfig, AutoConfig

In [2]:
# Checkpoint identifier shared by the tokenizer and every model load below.
model_id = "bloom-1b1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit GPTQ settings using the "c4" dataset; the only load that would consume
# this config is currently commented out further down.
gptq_config = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
)

In [None]:
# 4-bit NF4 variant with double quantization and a float16 compute dtype.
# Note: a later cell rebinds `bnb_config` to an 8-bit configuration, which is
# the one in effect when the notebook is run top to bottom.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
)

In [3]:
# 8-bit bitsandbytes quantization; this rebinds `bnb_config`, so it is the
# configuration the model load below receives.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
#model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)

In [4]:
# Load the checkpoint with the quantization described by `bnb_config`;
# device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
#model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

In [5]:
def print_param_precision(model):
    """Print how the model's parameter elements are distributed across dtypes.

    One line is printed per dtype, in first-seen order, formatted as
    ``<dtype>, <element count in millions> M, <share of all elements> %``.
    Nothing is printed when the model exposes no parameters.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` provides ``.dtype`` and ``.numel()``.
    """
    # Accumulate the number of elements stored in each dtype.
    elements_by_dtype = {}
    for _, param in model.named_parameters():
        elements_by_dtype[param.dtype] = (
            elements_by_dtype.get(param.dtype, 0) + param.numel()
        )

    total = sum(elements_by_dtype.values())
    for dtype, count in elements_by_dtype.items():
        # If every parameter is empty the total is 0; report a 0 % share
        # instead of raising ZeroDivisionError.
        share = count / total * 100 if total else 0.0
        print(f"{dtype}, {count / 10**6:.4f} M, {share:.2f} %")

def print_trainable_parameters(model):
    """Print the total and the trainable parameter counts, both in millions.

    Args:
        model: Any object whose ``parameters()`` yields items providing
            ``.numel()`` and ``.requires_grad``.
    """
    n_total = 0
    n_trainable = 0
    for param in model.parameters():
        size = param.numel()
        n_total += size
        # Only parameters flagged with requires_grad count as trainable.
        if param.requires_grad:
            n_trainable += size

    print(f"Total parameters: {n_total/10**6:.4f} M")
    print(f"Trainable parameters: {n_trainable/10**6:.4f} M")

In [6]:
# Label for the report: the first CUDA device when one is available, else CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The footprint is divided by 1024**2 so it is shown in MB.
print(f"{device} Memory Used: {model.get_memory_footprint() / 1024**2:.4f} MB")

print("\nParameters:")
print_trainable_parameters(model)

print("\nData types:")
print_param_precision(model)

cuda:0 Memory Used: 1383.9258 MB

Parameters:
Total parameters: 1065.3143 M
Trainable parameters: 385.5053 M

Data types:
torch.float16, 385.8371 M, 36.22 %
torch.int8, 679.4772 M, 63.78 %


In [7]:
# Persist the in-memory model and its tokenizer into the same directory.
# NOTE(review): the directory is named "gptq-..." although the model loaded
# above was given `bnb_config` rather than `gptq_config` — confirm the name.
model.save_pretrained("gptq-bloom-1b1")
# Kept as the last bare expression so the notebook displays the written paths.
tokenizer.save_pretrained("gptq-bloom-1b1")

('gptq-bloom-1b1/tokenizer_config.json',
 'gptq-bloom-1b1/special_tokens_map.json',
 'gptq-bloom-1b1/tokenizer.json')

In [8]:
# Directory name of the checkpoint saved above (a plain string, not a model).
new_model = "gptq-bloom-1b1"

# Label for the report: the first CUDA device when one is available, else CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# NOTE(review): this still reports on `model`, the in-memory model that was
# saved, not on a checkpoint reloaded from `new_model`.
print(f"{device} Memory Used: {model.get_memory_footprint() / 1024**2:.4f} MB")

print("\nParameters:")
print_trainable_parameters(model)

print("\nData types:")
print_param_precision(model)

cuda:0 Memory Used: 1383.9258 MB

Parameters:
Total parameters: 1065.3143 M
Trainable parameters: 385.5053 M

Data types:
torch.float16, 385.8371 M, 36.22 %
torch.int8, 679.4772 M, 63.78 %


In [9]:
# Load the checkpoint named by `model_id` into memory, rebinding `model`.
# No quantization_config is passed here.
# NOTE(review): this loads from `model_id`, not from the directory in `new_model`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)

AttributeError: 'str' object has no attribute 'named_parameters'

In [None]:
# `new_model` holds the saved checkpoint's directory name (a str), so calling
# `.named_parameters()` on it raised
# "AttributeError: 'str' object has no attribute 'named_parameters'".
# Load the checkpoint from that directory first, then inspect it.
reloaded_model = AutoModelForCausalLM.from_pretrained(new_model, device_map="auto")

# List every parameter of the reloaded checkpoint together with its dtype.
for name, param in reloaded_model.named_parameters():
    print(f"Parameter name: {name}, Data type: {param.dtype}")