In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

from awq import AutoAWQForCausalLM
from auto_gptq import AutoGPTQForCausalLM
from awq.evaluation import evaluate_perplexity as Perplexity

Test the perplexity of the model on the test set.
FP16, INT4-awq, INT4-gptq

In [4]:
for size in [
    "2-7B",
    "2-13B",
    "3-8B",
]:
    fp16_model_id = "/data/storage1/model/hf/llama/{}".format(size)
    gptq_model_id = "TheBloke/Llama-{}-GPTQ".format(size)
    awq_model_id = "TheBloke/Llama-{}-AWQ".format(size)
    if size == "3-8B":
        gptq_model_id = "TechxGenus/Meta-Llama-3-8B-GPTQ"
        awq_model_id = "TechxGenus/Meta-Llama-3-8B-AWQ"

    tokenizer = AutoTokenizer.from_pretrained(fp16_model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(fp16_model_id)
    awq_model = AutoAWQForCausalLM.from_quantized(awq_model_id)
    gptq_model = AutoGPTQForCausalLM.from_quantized(gptq_model_id, low_cpu_mem_usage=True)

    for testcases in [model, awq_model.model, gptq_model]:
        print(size)
        Perplexity(testcases, tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:11<00:00,  2.68it/s]


  0%|          | 0/739 [00:00<?, ?w/s]

Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 13.04it/s]
INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
2024-07-25:12:22:48,191 INFO     [_base.py:922] You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
2024-07-25:12:22:48,650 INFO     [_base.py:1011] The layer lm_head is not quantized.


  0%|          | 0/1219 [00:00<?, ?w/s]



2-7B


Perplexity 5.472: 100%|██████████| 166/166 [1:22:23<00:00, 29.78s/it]


2-7B


Perplexity 5.607: 100%|██████████| 166/166 [00:57<00:00,  2.89it/s]


2-7B


Perplexity 5.730: 100%|██████████| 166/166 [00:39<00:00,  4.18it/s]


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 40/40 [00:13<00:00,  2.93it/s]


  0%|          | 0/923 [00:00<?, ?w/s]

Fusing layers...: 100%|██████████| 40/40 [00:02<00:00, 14.80it/s]
INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
2024-07-25:13:48:02,565 INFO     [_base.py:922] You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
2024-07-25:13:48:03,002 INFO     [_base.py:1011] The layer lm_head is not quantized.


  0%|          | 0/1523 [00:00<?, ?w/s]



2-13B


Perplexity 4.883: 100%|██████████| 166/166 [1:25:55<00:00, 31.06s/it]


2-13B


Perplexity 4.971: 100%|██████████| 166/166 [01:37<00:00,  1.70it/s]


2-13B


Perplexity 4.973: 100%|██████████| 166/166 [01:19<00:00,  2.10it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:15<00:00,  2.07it/s]


  0%|          | 0/738 [00:00<?, ?w/s]

  0%|          | 0/1 [00:00<?, ?w/s]

Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 12.42it/s]
INFO - The layer lm_head is not quantized.
2024-07-25:15:18:16,247 INFO     [_base.py:1011] The layer lm_head is not quantized.


  0%|          | 0/1187 [00:00<?, ?w/s]

3-8B


Perplexity 6.135: 100%|██████████| 141/141 [59:23<00:00, 25.27s/it]


3-8B


Perplexity 6.558: 100%|██████████| 141/141 [00:57<00:00,  2.46it/s]


3-8B


Perplexity 6.459: 100%|██████████| 141/141 [00:46<00:00,  3.06it/s]


In [5]:
# This case is not reliable than AWQ Perplexity method.
from auto_gptq.utils import Perplexity as GPTQPerplexity
for size in [
    "3-8B",
    "2-7B",
    "2-13B",
]:
    fp16_model_id = "/data/storage1/model/hf/llama/{}".format(size)
    gptq_model_id = "TheBloke/Llama-{}-GPTQ".format(size)
    awq_model_id = "TheBloke/Llama-{}-AWQ".format(size)
    if size == "3-8B":
        gptq_model_id = "TechxGenus/Meta-Llama-3-8B-GPTQ"
        awq_model_id = "TechxGenus/Meta-Llama-3-8B-AWQ"

    tokenizer = AutoTokenizer.from_pretrained(fp16_model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(fp16_model_id)
    awq_model = AutoAWQForCausalLM.from_quantized(awq_model_id)
    gptq_model = AutoGPTQForCausalLM.from_quantized(gptq_model_id, low_cpu_mem_usage=True)

    for testcases in [model, awq_model.model, gptq_model]:
        ppl = GPTQPerplexity(testcases, tokenizer)
        ppl.calculate_perplexity(2048, 2048)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:10<00:00,  2.97it/s]


  0%|          | 0/738 [00:00<?, ?w/s]

  0%|          | 0/1 [00:00<?, ?w/s]

Fusing layers...: 100%|██████████| 32/32 [00:03<00:00,  8.31it/s]
INFO - The layer lm_head is not quantized.
2024-07-25:16:20:34,293 INFO     [_base.py:1011] The layer lm_head is not quantized.


  0%|          | 0/1187 [00:00<?, ?w/s]

Perplexity: 5.6025: 100%|██████████| 141/141 [1:57:37<00:00, 50.06s/it]
Perplexity: 5.9889: 100%|██████████| 141/141 [02:54<00:00,  1.24s/it]
Perplexity: 5.9500: 100%|██████████| 141/141 [02:57<00:00,  1.26s/it]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:12<00:00,  2.48it/s]


  0%|          | 0/739 [00:00<?, ?w/s]

Fusing layers...: 100%|██████████| 32/32 [00:06<00:00,  4.58it/s]
INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
2024-07-25:18:25:33,300 INFO     [_base.py:922] You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
2024-07-25:18:25:33,775 INFO     [_base.py:1011] The layer lm_head is not quantized.


  0%|          | 0/1219 [00:00<?, ?w/s]

Perplexity: 5.0508:  45%|████▌     | 74/163 [57:08<1:08:30, 46.19s/it]

print("--------------")
for name, data in awq_model.model.model.layers[0].attn.o_proj.named_buffers():
    print(name, data.nbytes)
print("--------------")
for name, data in gptq_model.model.model.layers[0].self_attn.o_proj.named_buffers():
    print(name, data.nbytes)