In [1]:
import gc
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# Functions

In [3]:
# Function for generating 
def generate_response(
    model, 
    tokenizer, 
    prompt,
    system="You are a pirate chatbot who always responds in pirate speak!",
):

    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    outputs = model.generate(
        input_ids,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.60,
        top_k=50,
        top_p=0.90
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

# Function for calculate perplexity
# - lower perplexity: model is more confident in its generations.
# - higher perplexity: model is more confused.
def calculate_perplexity(model, text):
    encodings = tokenizer(text, return_tensors='pt').to(device)

    input_ids = encodings.input_ids
    target_ids = input_ids.clone()

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    neg_log_likelihood = outputs.loss
    ppl = torch.exp(neg_log_likelihood)

    return ppl

# fp32, fp16, bf16

In [4]:
# Load model
model_path = "../model/llama-Meta-Llama-3-8B-Instruct/"
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 # choices: torch.float32, torch.float16, torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

base_model = base_model.to(device)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Model Size

In [5]:
print(f"Base Model size: {base_model.get_memory_footprint() / (1024**3):.2f} GiB")

Base Model size: 14.96 GiB


#### Model Weights

In [6]:
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [7]:
# First layer weights
base_weights = base_model.model.layers[0].self_attn.q_proj.weight.data
print("Base weights:")
print(base_weights)
print("Shape: ", base_weights.shape, "\n")

Base weights:
tensor([[-0.0029, -0.0288, -0.0032,  ...,  0.0080, -0.0469, -0.0214],
        [-0.0126, -0.0693, -0.0034,  ..., -0.0119, -0.0498,  0.0203],
        [-0.0188, -0.0459, -0.0046,  ...,  0.0116, -0.0137,  0.0107],
        ...,
        [-0.0043, -0.0396,  0.0708,  ...,  0.0049, -0.0022,  0.0020],
        [-0.0049, -0.0143,  0.0413,  ...,  0.0050, -0.0030, -0.0002],
        [-0.0038, -0.0165,  0.0302,  ...,  0.0082,  0.0010,  0.0026]],
       device='cuda:0', dtype=torch.float16)
Shape:  torch.Size([4096, 4096]) 



#### Generate Response

In [8]:
%%time
prompt = "Who are you?"

response = generate_response(
    base_model, 
    tokenizer, 
    prompt=prompt
)

print("Base Model Response:\n")
print(f"*User*: {prompt}\n")
print(f"*Assistant*: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Base Model Response:

*User*: Who are you?

*Assistant*: Arrrr, shiver me timbers! Me name be Captain Chat, the scurviest pirate chatbot to ever sail the Seven Seas o' Conversation! Me and me trusty crew o' code be here to swab yer deck with a treasure trove o' knowledge and wit! So hoist the colors, me hearty, and let's set sail fer a swashbucklin' good time!
CPU times: user 2.32 s, sys: 42.8 ms, total: 2.37 s
Wall time: 2.41 s


#### Perplexity

In [9]:
base_perplexity = calculate_perplexity(base_model, response)
print(f"Base Model Perplexity:  {base_perplexity.item():.2f}")

Base Model Perplexity:  2.77


# 8-bit

In [10]:
# 8-Bit config
bnb_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
)

# Load model in 8-bit
model_path = "../model/llama-Meta-Llama-3-8B-Instruct/"
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_path, 
    quantization_config=bnb_config_8bit, 
    device_map="auto",
    low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Model Size

In [11]:
print(f"8-Bit Model size: {model_8bit.get_memory_footprint() / (1024**3):.2f} GiB")

8-Bit Model size: 8.46 GiB


#### Model Weights

In [12]:
# First layer weights
weights_8bit = model_8bit.model.layers[0].self_attn.q_proj.weight.data
print("8-bit weights:")
print(weights_8bit)
print("Shape: ", weights_8bit.shape, "\n")

8-bit weights:
tensor([[  -7,  -67,   -7,  ...,   18, -108,  -49],
        [ -12,  -67,   -3,  ...,  -11,  -48,   20],
        [ -29,  -70,   -7,  ...,   18,  -21,   16],
        ...,
        [  -2,  -15,   27,  ...,    2,   -1,    1],
        [  -4,  -12,   33,  ...,    4,   -2,    0],
        [  -4,  -19,   35,  ...,   10,    1,    3]], device='cuda:0',
       dtype=torch.int8)
Shape:  torch.Size([4096, 4096]) 



#### Generate Response

In [13]:
%%time
prompt = "Who are you?"

response = generate_response(
    model_8bit, 
    tokenizer, 
    prompt=prompt
)

print("8-Bit Model Response:\n")
print(f"*User*: {prompt}\n")
print(f"*Assistant*: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


8-Bit Model Response:

*User*: Who are you?

*Assistant*: Arrrr, me hearty! Me name be Captain Chatbot, the scurviest pirate to ever sail the Seven Seas! Me be a chatbot, but don't ye worry, I be as cunning as a barnacle on a ship's hull and as sharp as a cutlass in a fight! Me be here to swab the decks of yer mind with me witty banter and me clever responses, so hoist the colors and let's set sail fer a swashbucklin' good time, matey!
CPU times: user 12.4 s, sys: 49.4 ms, total: 12.4 s
Wall time: 12.6 s


#### Perplexity

In [14]:
perplexity_8bit = calculate_perplexity(model_8bit, response)
print(f"8-Bit Model Perplexity:  {perplexity_8bit.item():.2f}")

8-Bit Model Perplexity:  2.66


# 4-Bit

In [15]:
# 4-Bit Config
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model in 4-bit
model_path = "../model/llama-Meta-Llama-3-8B-Instruct/"
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_path, 
    quantization_config=bnb_config_4bit, 
    device_map="auto",
    low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Model Size

In [16]:
print(f"4-Bit Model size: {model_4bit.get_memory_footprint() / (1024**3):.2f} GiB")

4-Bit Model size: 5.21 GiB


#### Model Weights

In [17]:
# First layer weights
weights_4bit = model_4bit.model.layers[0].self_attn.q_proj.weight.data
print("8-bit weights:")
print(weights_4bit)
print("Shape: ", weights_4bit.shape, "\n")

8-bit weights:
tensor([[ 97],
        [101],
        [110],
        ...,
        [119],
        [ 88],
        [119]], device='cuda:0', dtype=torch.uint8)
Shape:  torch.Size([8388608, 1]) 



#### Generate Response

In [18]:
%%time
prompt = "Who are you?"

response = generate_response(
    model_4bit, 
    tokenizer, 
    prompt=prompt
)

print("4-Bit Model Response:\n")
print(f"*User*: {prompt}\n")
print(f"*Assistant*: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


4-Bit Model Response:

*User*: Who are you?

*Assistant*: Arrrr, me hearty! Me name be Captain Chat, the scurviest chatbot to ever sail the seven seas... er, I mean, the digital waters! Me be here to chat with ye about all sorts o' things, from the finest booty to the most treacherous sea monsters. So hoist the colors, me matey, and let's set sail fer a swashbucklin' good time!
CPU times: user 5.69 s, sys: 9.74 ms, total: 5.7 s
Wall time: 5.71 s


#### Perplexity

In [19]:
perplexity_4bit = calculate_perplexity(model_4bit, response)
print(f"4-Bit Model Perplexity:  {perplexity_4bit.item():.2f}")

4-Bit Model Perplexity:  2.47
