In [1]:
%pip install -q -U transformers peft accelerate optimum
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import sys
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from packages import toolbox

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from huggingface_hub import login
login("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")

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /scale/cal/home/jychoi/.cache/huggingface/token
Login successful


In [3]:
def Perplexity(model, tokenizer):
    from tqdm import tqdm
    device = "cuda"

    if hasattr(model,'quantize_config'):
        act_order = model.quantize_config.desc_act
        if act_order:
            from auto_gptq import exllama_set_max_input_length
            model = exllama_set_max_input_length(
                model, 8192
            )  # Need to set when using LLaMa models

    # Load datasets, Fixed with wikitext-2 dataset.
    testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

    # preprocessing the data with GPTQ (or GPT-2) way
    # encodings = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

    # preprocessing the data with AutoGPTQ (or llama.cpp) way
    testdata = [" \n" if s == "" else s for s in testdata["text"]]
    encodings = tokenizer(
        "".join(testdata),
        return_tensors="pt",
        truncation=False,
    )

    # preprocessing with GEMMA way
    # encodings = tokenizer(
    #     testdata["text"], add_special_tokens=False
    # )  # use tokenizer parallelism
    # encodings.input_ids = torch.tensor([sum(encodings.input_ids, [])])

    # max_length = model.config.max_position_embeddings
    max_length = 2048 # It is buggy with model.config.max_position_embeddings when it is used with gemma.
    stride = 2048
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        input_ids[:, 0] = 2  # give a bos token, Check if it is OK with other than Gemma
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss
        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    print(f"Perplexity: {ppl:.2f}")
    return ppl

In [4]:
def ppl(model_id, branch):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, revision=branch, device_map="auto", torch_dtype=torch.float16
    )
    toolbox.print_model_size(model)
    toolbox.print_layer_size(model)

    # Evaluate the model
    Perplexity(model=model, tokenizer=tokenizer)
    

In [5]:
# Evaluate the OPT 1.3B model with and without quantization
model_list = [["dasistwo/opt-1.3b-GPTQ", None],
              ["dasistwo/opt-1.3b-GPTQ", "gptq-4bit-32g-actorder_True"],
              ["facebook/opt-1.3b", None]]

for model_id, branch in model_list:
    torch.cuda.empty_cache()
    ppl(model_id, branch)
    print("-------------------------")



Model size: 784.352MB
k_proj: weight: 2097152 bytes
 quant params: 1024, 4096, 8192 bytes
out_proj: weight: 2097152 bytes
 quant params: 1024, 4096, 8192 bytes
q_proj: weight: 2097152 bytes
 quant params: 1024, 4096, 8192 bytes
v_proj: weight: 2097152 bytes
 quant params: 1024, 4096, 8192 bytes
fc1: weight: 8388608 bytes
 quant params: 4096, 16384, 8192 bytes
fc2: weight: 8388608 bytes
 quant params: 1024, 4096, 32768 bytes


 99%|█████████▉| 139/140 [00:20<00:00,  6.79it/s]


Perplexity: 15.25
-------------------------




Model size: 873.297MB
k_proj: weight: 2097152 bytes
 quant params: 65536, 262144, 8192 bytes
out_proj: weight: 2097152 bytes
 quant params: 65536, 262144, 8192 bytes
q_proj: weight: 2097152 bytes
 quant params: 65536, 262144, 8192 bytes
v_proj: weight: 2097152 bytes
 quant params: 65536, 262144, 8192 bytes
fc1: weight: 8388608 bytes
 quant params: 262144, 1048576, 8192 bytes
fc2: weight: 8388608 bytes
 quant params: 262144, 1048576, 32768 bytes


 99%|█████████▉| 139/140 [00:20<00:00,  6.69it/s]


Perplexity: 14.81
-------------------------
Model size: 2509.609MB
k_proj: 8388608 bytes
v_proj: 8388608 bytes
q_proj: 8388608 bytes
out_proj: 8388608 bytes
fc1: 33554432 bytes
fc2: 33554432 bytes


 99%|█████████▉| 139/140 [00:19<00:00,  7.24it/s]

Perplexity: 14.19
-------------------------





In [6]:
# Evaluate the Llama-2 7B model with and without quantization
model_list = [["thebloke/llama-2-7b-gptq", None],
              ["thebloke/llama-2-7b-gptq", "gptq-4bit-32g-actorder_True"],
              ["meta-llama/Llama-2-7b-hf", None]]

for model_id, branch in model_list:
    torch.cuda.empty_cache()
    ppl(model_id, branch)
    print("-------------------------")

The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


Model size: 3796.078MB
k_proj: weight: 8388608 bytes
 quant params: 65536, 262144, 16384 bytes
o_proj: weight: 8388608 bytes
 quant params: 65536, 262144, 16384 bytes
q_proj: weight: 8388608 bytes
 quant params: 65536, 262144, 16384 bytes
v_proj: weight: 8388608 bytes
 quant params: 65536, 262144, 16384 bytes
down_proj: weight: 22544384 bytes
 quant params: 176128, 704512, 44032 bytes
gate_proj: weight: 22544384 bytes
 quant params: 176128, 704512, 16384 bytes
up_proj: weight: 22544384 bytes
 quant params: 176128, 704512, 16384 bytes


 99%|█████████▉| 163/164 [00:47<00:00,  3.41it/s]


Perplexity: 7.39
-------------------------
Model size: 4113.953MB
k_proj: weight: 8388608 bytes
 quant params: 262144, 1048576, 16384 bytes
o_proj: weight: 8388608 bytes
 quant params: 262144, 1048576, 16384 bytes
q_proj: weight: 8388608 bytes
 quant params: 262144, 1048576, 16384 bytes
v_proj: weight: 8388608 bytes
 quant params: 262144, 1048576, 16384 bytes
down_proj: weight: 22544384 bytes
 quant params: 704512, 2818048, 44032 bytes
gate_proj: weight: 22544384 bytes
 quant params: 704512, 2818048, 16384 bytes
up_proj: weight: 22544384 bytes
 quant params: 704512, 2818048, 16384 bytes


 99%|█████████▉| 163/164 [00:47<00:00,  3.44it/s]


Perplexity: 5.74
-------------------------


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model size: 12932.516MB
q_proj: 33554432 bytes
k_proj: 33554432 bytes
v_proj: 33554432 bytes
o_proj: 33554432 bytes
gate_proj: 90177536 bytes
up_proj: 90177536 bytes
down_proj: 90177536 bytes


 99%|█████████▉| 163/164 [00:39<00:00,  4.08it/s]

Perplexity: 5.65
-------------------------





In [7]:
# Evaluate the gemma-2b model with and without quantization
model_list = [
    ["dasistwo/gemma-2b-GPTQ", None],
    ["dasistwo/gemma-2b-GPTQ", "gptq-4bit-32g-actorder_True"],
    ["google/gemma-2b", None]
    ]

for model_id, branch in model_list:
    torch.cuda.empty_cache()
    ppl(model_id, branch)
    print("-------------------------")

Model size: 2014.159MB
k_proj: weight: 262144 bytes
 quant params: 128, 512, 8192 bytes
o_proj: weight: 2097152 bytes
 quant params: 1024, 4096, 8192 bytes
q_proj: weight: 2097152 bytes
 quant params: 1024, 4096, 8192 bytes
v_proj: weight: 262144 bytes
 quant params: 128, 512, 8192 bytes
down_proj: weight: 16777216 bytes
 quant params: 1024, 4096, 65536 bytes
gate_proj: weight: 16777216 bytes
 quant params: 8192, 32768, 8192 bytes
up_proj: weight: 16777216 bytes
 quant params: 8192, 32768, 8192 bytes


 99%|█████████▉| 142/143 [01:02<00:00,  2.27it/s]


Perplexity: 11.05
-------------------------
Model size: 2160.123MB
k_proj: weight: 262144 bytes
 quant params: 8192, 32768, 8192 bytes
o_proj: weight: 2097152 bytes
 quant params: 65536, 262144, 8192 bytes
q_proj: weight: 2097152 bytes
 quant params: 65536, 262144, 8192 bytes
v_proj: weight: 262144 bytes
 quant params: 8192, 32768, 8192 bytes
down_proj: weight: 16777216 bytes
 quant params: 524288, 2097152, 65536 bytes
gate_proj: weight: 16777216 bytes
 quant params: 524288, 2097152, 8192 bytes
up_proj: weight: 16777216 bytes
 quant params: 524288, 2097152, 8192 bytes


 99%|█████████▉| 142/143 [01:02<00:00,  2.27it/s]


Perplexity: 9.07
-------------------------


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model size: 4844.145MB
q_proj: 8388608 bytes
k_proj: 1048576 bytes
v_proj: 1048576 bytes
o_proj: 8388608 bytes
gate_proj: 67108864 bytes
up_proj: 67108864 bytes
down_proj: 67108864 bytes


 99%|█████████▉| 142/143 [01:01<00:00,  2.30it/s]


Perplexity: 8.69
-------------------------


In [5]:
# Evaluate the gemma-7b model with and without quantization
model_list = [
    # ["dasistwo/gemma-7b-GPTQ", None, True],
#               ["dasistwo/gemma-7b-GPTQ", "gptq-4bit-32g-actorder_True", True],
              ["google/gemma-7b", None, False]
              ]

for model_id, branch, quantized in model_list:
    torch.cuda.empty_cache()
    ppl(model_id, branch)
    print("-------------------------")

Downloading shards:  25%|██▌       | 1/4 [33:23<1:40:10, 2003.34s/it]


KeyboardInterrupt: 