In [None]:
!pip install transformers torch bitsandbytes accelerate

## Quantization Resources

https://huggingface.co/docs/peft/en/developer_guides/quantization

https://huggingface.co/docs/transformers/quantization

### Load Tokenizer

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")



### Using 16bit Precision

This will crash on a 7B parameter model because we would need 14GB of GPU Ram.

In [None]:
from transformers import AutoModelForCausalLM
import transformers
import torch

model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", torch_dtype=torch.bfloat16)

### Using 8bit Precision

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
from transformers import BitsAndBytesConfig

config = BitsAndBytesConfig(
    load_in_8bit=True
)

model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", quantization_config=config)

gbs = model.get_memory_footprint() / 1e9

print(f"Number of parameters: {model.num_parameters()}")
print(f"Memory footprint if FP32: {(model.num_parameters()*4)/1e9} GB")
print(f"Memory footprint of the model with 8bit quantization: {gbs:.2f} GB")

In [5]:
prompt = "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:"
prompt

'Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:'

In [None]:
def generate(prompt):
  tokenized_text = tokenizer(prompt, return_tensors="pt").to("cuda")
  output = model.generate(**tokenized_text, eos_token_id=tokenizer.eos_token_id, do_sample=True, max_new_tokens=100)
  result = tokenizer.batch_decode(output,  skip_special_tokens=True)[0]
  return result


result = generate(prompt)
print(result)

### Using 4bit Precision

Model Card: https://huggingface.co/tiiuae/falcon-7b

In [1]:
import torch
from transformers import BitsAndBytesConfig

config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers

four_bit_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", quantization_config=config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.88s/it]


In [3]:
gbs = four_bit_model.get_memory_footprint() / 1e9

print(f"Number of parameters: {four_bit_model.num_parameters()}")
print(f"Memory footprint if FP32: {(four_bit_model.num_parameters()*4)/1e9} GB")
print(f"Memory footprint of the model with 4bit quantization: {gbs:.2f} GB")

Number of parameters: 6921720704
Memory footprint if FP32: 27.686882816 GB
Memory footprint of the model with 4bit quantization: 3.92 GB


In [8]:
def generate(prompt):
  tokenized_text = tokenizer(prompt, return_tensors="pt").to("cuda")
  output = four_bit_model.generate(**tokenized_text, eos_token_id=tokenizer.eos_token_id, do_sample=True, max_new_tokens=100)
  result = tokenizer.batch_decode(output,  skip_special_tokens=True)[0]
  return result


result = generate(prompt)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


In [9]:
print(result)

Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.
Daniel: Hello, Girafatron!
Girafatron: Daniel!!! My long lost brother. We haven't seen each other in an age, you're growing up!
Daniel: You know that. And you know that I know you are obsessed with giraffes.
Girafatron: Shut up, you rat!
Girafatron in his natural habitat.
Girafatron and his arch nemesis, Lionpoof.
Sigmund Schmatz: Wow, what a coincidence! I


## We were able to fit 2 models into memory!

In [None]:
x =