In [None]:
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from awq import AutoAWQForCausalLM
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.utils import Perplexity as GPTQPerplexity
from awq.evaluation import evaluate_perplexity as Perplexity

Measure the perplexity of each model on the test set, comparing three variants:
FP16 (baseline), INT4-awq, and INT4-gptq.

In [None]:
# Perplexity of every Llama checkpoint in three variants -- FP16 baseline,
# INT4 AWQ and INT4 GPTQ -- measured with AutoAWQ's `evaluate_perplexity`
# helper (imported as `Perplexity`).
#
# Results are keyed by (size, variant) so they stay available after the
# progress bars are gone; the value is whatever `Perplexity` returns.
awq_eval_ppl = {}
for size in [
    "3-8B",
    "2-7B",
    "2-13B",
]:
    fp16_model_id = "/data/storage1/model/huggingface/llama/{}".format(size)
    gptq_model_id = "TheBloke/Llama-{}-GPTQ".format(size)
    awq_model_id = "TheBloke/Llama-{}-AWQ".format(size)
    if size == "3-8B":
        # The Llama-3 quantized checkpoints live under a different Hub
        # namespace than the TheBloke Llama-2 ones.
        gptq_model_id = "TechxGenus/Meta-Llama-3-8B-GPTQ"
        awq_model_id = "TechxGenus/Meta-Llama-3-8B-AWQ"

    tokenizer = AutoTokenizer.from_pretrained(fp16_model_id, trust_remote_code=True)

    # A bare `from_pretrained` materialises the weights as float32 on the CPU,
    # so the "FP16" baseline was neither FP16 nor on the accelerator. Request
    # half precision explicitly and let accelerate place the weights.
    model = AutoModelForCausalLM.from_pretrained(
        fp16_model_id,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    awq_eval_ppl[(size, "fp16")] = Perplexity(model, tokenizer)

    # The baseline is the largest of the three models; release it before the
    # quantized checkpoints are loaded so they do not compete for memory.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # `awq_model` / `gptq_model` are intentionally left bound: a later cell
    # inspects their layers.
    awq_model = AutoAWQForCausalLM.from_quantized(awq_model_id)
    # The AWQ wrapper is unwrapped via `.model` before evaluation.
    awq_eval_ppl[(size, "int4-awq")] = Perplexity(awq_model.model, tokenizer)

    gptq_model = AutoGPTQForCausalLM.from_quantized(gptq_model_id, low_cpu_mem_usage=True)
    awq_eval_ppl[(size, "int4-gptq")] = Perplexity(gptq_model, tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 40/40 [00:08<00:00,  4.66it/s]


  0%|          | 0/923 [00:00<?, ?w/s]

Fusing layers...: 100%|██████████| 40/40 [00:01<00:00, 26.27it/s]
INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
2024-06-28:14:29:23,376 INFO     [_base.py:922] You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
2024-06-28:14:29:23,829 INFO     [_base.py:1011] The layer lm_head is not quantized.


  0%|          | 0/1523 [00:00<?, ?w/s]

Perplexity 4.883: 100%|██████████| 166/166 [1:23:44<00:00, 30.27s/it]
Perplexity 4.971: 100%|██████████| 166/166 [01:38<00:00,  1.68it/s]
Perplexity 4.973: 100%|██████████| 166/166 [01:06<00:00,  2.51it/s]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:09<00:00,  3.27it/s]


  0%|          | 0/739 [00:00<?, ?w/s]

Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 14.44it/s]
INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
2024-06-28:15:56:51,541 INFO     [_base.py:922] You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
2024-06-28:15:56:51,977 INFO     [_base.py:1011] The layer lm_head is not quantized.


  0%|          | 0/1219 [00:00<?, ?w/s]

Perplexity 5.472: 100%|██████████| 166/166 [45:33<00:00, 16.47s/it]
Perplexity 5.607: 100%|██████████| 166/166 [00:57<00:00,  2.91it/s]
Perplexity 5.730: 100%|██████████| 166/166 [00:39<00:00,  4.24it/s]


In [None]:
# Same comparison (FP16 baseline, INT4 AWQ, INT4 GPTQ) using AutoGPTQ's
# `Perplexity` utility (imported as `GPTQPerplexity`).
# NOTE: this evaluation is less reliable than the AWQ perplexity method.
#
# Results are keyed by (size, variant); the value is whatever
# `calculate_perplexity` returns.
gptq_eval_ppl = {}
for size in [
    "3-8B",
    "2-7B",
    "2-13B",
]:
    fp16_model_id = "/data/storage1/model/huggingface/llama/{}".format(size)
    gptq_model_id = "TheBloke/Llama-{}-GPTQ".format(size)
    awq_model_id = "TheBloke/Llama-{}-AWQ".format(size)
    if size == "3-8B":
        # The Llama-3 quantized checkpoints live under a different Hub
        # namespace than the TheBloke Llama-2 ones.
        gptq_model_id = "TechxGenus/Meta-Llama-3-8B-GPTQ"
        awq_model_id = "TechxGenus/Meta-Llama-3-8B-AWQ"

    tokenizer = AutoTokenizer.from_pretrained(fp16_model_id, trust_remote_code=True)

    # A bare `from_pretrained` materialises the weights as float32 on the CPU,
    # so the "FP16" baseline was neither FP16 nor on the accelerator. Request
    # half precision explicitly and let accelerate place the weights.
    model = AutoModelForCausalLM.from_pretrained(
        fp16_model_id,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # Both arguments are 2048 -- presumably context length and batch size in
    # tokens; TODO confirm against auto_gptq.utils.Perplexity.
    gptq_eval_ppl[(size, "fp16")] = GPTQPerplexity(model, tokenizer).calculate_perplexity(2048, 2048)

    # The baseline is the largest of the three models; release it before the
    # quantized checkpoints are loaded so they do not compete for memory.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # `awq_model` / `gptq_model` are intentionally left bound: a later cell
    # inspects their layers.
    awq_model = AutoAWQForCausalLM.from_quantized(awq_model_id)
    # The AWQ wrapper is unwrapped via `.model` before evaluation.
    gptq_eval_ppl[(size, "int4-awq")] = GPTQPerplexity(awq_model.model, tokenizer).calculate_perplexity(2048, 2048)

    gptq_model = AutoGPTQForCausalLM.from_quantized(gptq_model_id, low_cpu_mem_usage=True)
    gptq_eval_ppl[(size, "int4-gptq")] = GPTQPerplexity(gptq_model, tokenizer).calculate_perplexity(2048, 2048)

Perplexity: 5.1226: 100%|██████████| 163/163 [1:00:30<00:00, 22.27s/it]


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Perplexity: 5.2486: 100%|██████████| 163/163 [01:45<00:00,  1.55it/s]
Perplexity: 6.6099: 100%|██████████| 163/163 [01:28<00:00,  1.84it/s]


# Print the name and size in bytes of every buffer registered on the first
# decoder layer's attention output projection, first for the AWQ model and
# then for the GPTQ model.
#
# Relies on `awq_model` and `gptq_model` already being bound by an earlier
# cell. Note the differing attribute paths: `attn` on the AWQ side versus
# `self_attn` on the GPTQ side (presumably because AWQ fuses its attention
# block -- confirm against the loaded model).
print("--------------")
awq_o_proj = awq_model.model.model.layers[0].attn.o_proj
for buffer_name, buffer in awq_o_proj.named_buffers():
    print(buffer_name, buffer.nbytes)

print("--------------")
gptq_o_proj = gptq_model.model.model.layers[0].self_attn.o_proj
for buffer_name, buffer in gptq_o_proj.named_buffers():
    print(buffer_name, buffer.nbytes)