In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Checkpoint identifier handed to from_pretrained for both model and tokenizer.
quantized_name = "gptq_bloom-7b1_Q4bit"
# Use the first CUDA device when one is available, otherwise fall back to CPU.
# The same string is reused later as the target device for input tensors.
device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(quantized_name, device_map=device_map)
# Tokenizers hold no device-resident weights, so no device_map is passed here.
tokenizer = AutoTokenizer.from_pretrained(quantized_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def print_param_precision(model):
    """Print how the model's parameter elements are distributed across dtypes.

    One line is printed per dtype, in order of first appearance, with the
    element count in millions and its percentage of all parameter elements.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs; each tensor must provide ``.dtype``
            and ``.numel()``.
    """
    elements_per_dtype = {}
    for _name, param in model.named_parameters():
        elements_per_dtype[param.dtype] = (
            elements_per_dtype.get(param.dtype, 0) + param.numel()
        )

    total = sum(elements_per_dtype.values())

    for k, v in elements_per_dtype.items():
        print(f"{k}, {v / 10**6:.4f} M, {v / total*100:.2f} %")

def print_trainable_parameters(model):
    """Print the total and the trainable parameter counts of a model.

    The first line reports all parameter elements (in millions). The second
    line reports the elements of parameters whose ``requires_grad`` is true,
    together with their share of the total.

    Args:
        model: object exposing ``parameters()`` that yields tensors providing
            ``.numel()`` and ``.requires_grad``.
    """
    total_params = 0
    trainable_params = 0
    for p in model.parameters():
        count = p.numel()
        total_params += count
        if p.requires_grad:
            trainable_params += count

    print(f"Total parameters: {total_params/10**6:.4f} M")

    # Guard the ratio so a model without parameters does not divide by zero.
    share = 100 * trainable_params / total_params if total_params else 0.0
    print(f"Trainable parameters: {trainable_params/10**6:.4f} M ({share:.2f} %)")

In [4]:
# Convert the value reported by get_memory_footprint() with a 1024**2 divisor
# before printing it next to the device string.
footprint_mb = model.get_memory_footprint() / 1024**2
print(f"{device_map} Memory Footprint: {footprint_mb:.4f} MB")

# Follow up with the per-dtype breakdown of the parameters.
print("\nData types:")
print_param_precision(model)

cuda:0 Memory Footprint: 7861.3594 MB

Data types:
torch.float16, 1028.1124 M, 100.00 %


In [5]:
# Alternative task kept for experimentation:
#mytask="CREATE TABLE trip (bus_stop VARCHAR, duration INTEGER), list all the bus stops from which a trip of duration below 100 started."
mytask = (
    "CREATE TABLE book (Title VARCHAR, Writer VARCHAR). "
    "What are the titles of the books whose writer is not Dennis Lee?"
)

# Instruction / context / result scaffold around the task. The leading and
# trailing newlines are part of the prompt text.
prompt = (
    "\n"
    "# Instruction:\n"
    "Use the context below to produce the result\n"
    "# context:\n"
    f"{mytask}\n"
    "# result:\n"
)

In [6]:
# Tokenize the prompt and move the ids to the device chosen at load time.
input_id1 = tokenizer.encode(prompt, return_tensors="pt").to(device_map)
# Single unpadded sequence: every position is attended to. ones_like creates
# the mask directly on the same device as the ids.
attention_mask1 = torch.ones_like(input_id1, dtype=torch.long)
print("--------------------------------------")
print(f"Prompt:{prompt}")
print("--------------------------------------")

print("Quantized Model Result :")
# Sampling is enabled (do_sample=True), so the completion can differ between runs.
output = model.generate(
    input_ids=input_id1,
    attention_mask=attention_mask1,
    do_sample=True,
    max_new_tokens=100,
    top_p=0.9,
    temperature=0.5,
)
# The returned sequence starts with the prompt tokens. Drop them at token level
# instead of slicing the decoded text by len(prompt), which silently breaks if
# decoding does not reproduce the prompt character-for-character.
generated_ids = output[0, input_id1.shape[-1]:]
print(tokenizer.decode(generated_ids, skip_special_tokens=True))

--------------------------------------
Prompt:
# Instruction:
Use the context below to produce the result
# context:
CREATE TABLE book (Title VARCHAR, Writer VARCHAR). What are the titles of the books whose writer is not Dennis Lee?
# result:

--------------------------------------
Quantized Model Result :
SELECT Title FROM book WHERE Writer <> "Dennis Lee"
    
