In [None]:
from datasets import load_dataset
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GPTQConfig

In [None]:
# Names of the on-disk artifacts: the merged fine-tuned checkpoint that is read
# here, and the name the GPTQ-quantized copy is saved under further down.
prequantized_model = "merged_ft_model"
quantized_name = "gptq-ft_bloom1b1_8bit"

# Device label: the first CUDA device when one is visible, otherwise the CPU.
device_map = "cpu" if not torch.cuda.is_available() else "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(prequantized_model)

# 8-bit GPTQ settings. Per the author's note, the local "c4" directory is not
# the actual C4 dataset but a custom text-to-SQL dataset.
quantization_config = GPTQConfig(
    bits=8,
    dataset="c4",
    tokenizer=tokenizer,
)

In [None]:
# Time the from_pretrained() call that is given the GPTQ quantization config;
# the message below reports this as the time taken to quantize.
# perf_counter() is a monotonic clock, so the elapsed value cannot be skewed by
# wall-clock adjustments that happen while the call is running.
start = time.perf_counter()
quantized_model = AutoModelForCausalLM.from_pretrained(
    prequantized_model,
    device_map="auto",
    quantization_config=quantization_config,
)
end = time.perf_counter()

# NOTE(review): the model is placed with device_map="auto", while the message
# prints the `device_map` label computed earlier — confirm the two agree.
print(f"Total Seconds Taken to Quantize Using {device_map}: {end - start}")

In [None]:
# Move the quantized model onto the CPU first, then write it out under
# `quantized_name` (the same name the later cells load it back from).
quantized_model.to("cpu")
quantized_model.save_pretrained(quantized_name)

In [None]:
# Save the tokenizer under the same name as the quantized model, so that both
# can later be loaded from `quantized_name`.
tokenizer.save_pretrained(quantized_name)

In [None]:
def print_param_precision(model):
    """Print how the model's parameters are distributed across dtypes.

    One line is printed per dtype, in order of first appearance: the dtype,
    its element count in millions, and its share of all counted elements as
    a percentage. Nothing is printed for a model without parameters.

    Only tensors yielded by ``model.named_parameters()`` are counted.
    """
    elements_by_dtype = {}
    for _name, param in model.named_parameters():
        seen_so_far = elements_by_dtype.get(param.dtype, 0)
        elements_by_dtype[param.dtype] = seen_so_far + param.numel()

    grand_total = sum(elements_by_dtype.values())
    for dtype, count in elements_by_dtype.items():
        print(f"{dtype}, {count / 10**6:.4f} M, {count / grand_total*100:.2f} %")

def print_trainable_parameters(model):
    """Report the model's parameter count and how much of it is trainable.

    Prints two lines, both in millions of elements: the total over
    ``model.parameters()`` and the subtotal of parameters whose
    ``requires_grad`` flag is set.
    """
    overall = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        overall += size
        if param.requires_grad:
            trainable += size

    print(f"Total parameters: {overall/10**6:.4f} M")
    print(f"Trainable parameters: {trainable/10**6:.4f} M")

In [None]:
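# Reset the IPython kernel before running the following test.
# The cells below re-import their dependencies and redefine the helper
# functions so that they can run in the fresh kernel.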

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def print_param_precision(model):
    """Print how the model's parameters are distributed across dtypes.

    One line is printed per dtype, in order of first appearance: the dtype,
    its element count in millions, and its share of all counted elements as
    a percentage. Nothing is printed for a model without parameters.

    Only tensors yielded by ``model.named_parameters()`` are counted.
    """
    elements_by_dtype = {}
    for _name, param in model.named_parameters():
        seen_so_far = elements_by_dtype.get(param.dtype, 0)
        elements_by_dtype[param.dtype] = seen_so_far + param.numel()

    grand_total = sum(elements_by_dtype.values())
    for dtype, count in elements_by_dtype.items():
        print(f"{dtype}, {count / 10**6:.4f} M, {count / grand_total*100:.2f} %")

def print_trainable_parameters(model):
    """Report the model's parameter count and how much of it is trainable.

    Prints two lines, both in millions of elements: the total over
    ``model.parameters()`` and the subtotal of parameters whose
    ``requires_grad`` flag is set.
    """
    overall = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        overall += size
        if param.requires_grad:
            trainable += size

    print(f"Total parameters: {overall/10**6:.4f} M")
    print(f"Trainable parameters: {trainable/10**6:.4f} M")

In [2]:
# Reload the saved quantized checkpoint, then report its memory footprint,
# parameter counts and dtype breakdown.
quantized_name = "gptq-ft_bloom1b1_8bit"
device_map = "cpu" if not torch.cuda.is_available() else "cuda:0"
model_new = AutoModelForCausalLM.from_pretrained(
    quantized_name,
    device_map="auto",
)

# Footprint scaled by 1024**2 for the MB figure printed below.
footprint_mb = model_new.get_memory_footprint() / 1024**2
print(f"{device_map} Memory Used: {footprint_mb:.4f} MB")

print("\nParameters:")
print_trainable_parameters(model_new)

print("\nData types:")
print_param_precision(model_new)

cuda:0 Memory Used: 1400.0977 MB

Parameters:
Total parameters: 385.5053 M
Trainable parameters: 385.5053 M

Data types:
torch.float16, 385.5053 M, 100.00 %


In [3]:
# Load the tokenizer and the quantized model saved under `quantized_name`.
# Only the model is given a device placement; a tokenizer is not placed on a
# device, so no device_map is passed to it.
ft_tokenizer = AutoTokenizer.from_pretrained(quantized_name)
ft_model = AutoModelForCausalLM.from_pretrained(quantized_name, device_map=device_map)

# Prompt: an instruction, the task as context, and an open "# result:" section
# for the model to complete.
mytask="CREATE TABLE trip (bus_stop VARCHAR, duration INTEGER), list all the bus stops from which a trip of duration below 100 started."
prompt = f"""
# Instruction:
Use the context below to produce the result
# context:
{mytask}
# result:
"""

# A single, un-padded sequence: the attention mask is all ones, with the same
# shape, dtype and device as the input ids.
input_id1 = ft_tokenizer.encode(prompt, return_tensors="pt").to(device_map)
attention_mask1 = torch.ones_like(input_id1)
print("--------------------------------------\n")
print(f"Prompt:\n{prompt}\n")
print("--------------------------------------\n")

print("Fine-tuned Model Result :\n")
# NOTE(review): the recorded run of this cell ended in
# `RuntimeError: Unrecognized tensor type ID: AutocastCUDA`. That looks like an
# environment problem (a compiled extension built against a different torch
# build) rather than something in this cell — TODO confirm.
output_ft = ft_model.generate(
    input_ids=input_id1,
    attention_mask=attention_mask1,
    do_sample=True,
    max_new_tokens=100,
    top_p=0.9,
    temperature=0.5,
)

# The returned sequence starts with the prompt tokens, so drop them by token
# count before decoding. Slicing the decoded text by len(prompt) would only be
# correct if the decoded prompt matched `prompt` character for character.
prompt_length = input_id1.shape[1]
generated_ids = output_ft[0, prompt_length:].tolist()
print(ft_tokenizer.decode(generated_ids, skip_special_tokens=True))

--------------------------------------

Prompt:

# Instruction:
Use the context below to produce the result
# context:
CREATE TABLE trip (bus_stop VARCHAR, duration INTEGER), list all the bus stops from which a trip of duration below 100 started.
# result:


--------------------------------------

Fine-tuned Model Result :



RuntimeError: Unrecognized tensor type ID: AutocastCUDA