In [None]:
import torch

# 1) torch shims
if not hasattr(torch, "get_default_device"):
    torch.get_default_device = lambda: torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if not hasattr(torch, "xpu"):
    class _XPU:
        @staticmethod
        def is_available(): return False
    torch.xpu = _XPU()

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path  = "/home/gunwoong/URP2025-1/llama3-8b"
tokenizer   = AutoTokenizer.from_pretrained(model_path, use_fast=False)
common_cfg  = {"zero_point": True, "q_group_size": 128, "w_bit": 4}

versions = [
    ("GEMM","/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-GEMM-INT4"),
    ("Marlin","/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-Marlin-INT4")
]

for version, out_path in versions:
    print(f"\n→ Quantizing version={version}")
    cfg = {**common_cfg, "version": version}
    if version == "Marlin":
        # Marlin은 zero-point 없이
        cfg["zero_point"] = False

    # 1) load onto GPU
    model = AutoAWQForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        use_cache=False,
    )

    # 2) quantize (+ verbose & fewer samples if too 느림)
    model.quantize(
        tokenizer,
        quant_config=cfg,

    )

    # 3) pack into GPU memory
    model.pack()

    # 4) save
    model.save_quantized(out_path, safetensors=True)
    tokenizer.save_pretrained(out_path)
    print(f"  Saved to {out_path}")


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/




→ Quantizing version=Marlin


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.
AWQ: 100%|██████████| 32/32 [12:27<00:00, 23.37s/it]
Packing: 100%|██████████| 32/32 [00:00<00:00, 50705.60it/s]


  Saved to /home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-Marlin-INT4


In [1]:
import torch

print("CUDA available:       ", torch.cuda.is_available())
print("CUDA version (PyTorch):", torch.version.cuda)
print("GPU count:            ", torch.cuda.device_count())

for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"\nGPU {i}: {props.name}")
    print(f"  Total memory: {props.total_memory/1024**3:.1f} GiB")
    print(f"  Multi-Processor Count: {props.multi_processor_count}")
    print(f"  Compute Capability: {props.major}.{props.minor}")


CUDA available:        True
CUDA version (PyTorch): 12.6
GPU count:             4

GPU 0: NVIDIA GeForce RTX 3090
  Total memory: 23.6 GiB
  Multi-Processor Count: 82
  Compute Capability: 8.6

GPU 1: NVIDIA GeForce RTX 3090
  Total memory: 23.6 GiB
  Multi-Processor Count: 82
  Compute Capability: 8.6

GPU 2: NVIDIA GeForce RTX 3090
  Total memory: 23.6 GiB
  Multi-Processor Count: 82
  Compute Capability: 8.6

GPU 3: NVIDIA GeForce RTX 3090
  Total memory: 23.6 GiB
  Multi-Processor Count: 82
  Compute Capability: 8.6


In [None]:
import os
import time
import psutil
import subprocess

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AwqConfig,
)

# ── Configuration ─────────────────────────────────────────────────────────────
MODEL_NAME     = "Meta-Llama-3.1-8B"
TOKENIZER_PATH = "/home/gunwoong/URP2025-1/llama3-8b"
QUANT_PATH     = "/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-GEMM-INT4"
SEQLEN         = 2048
DEVICE         = torch.device("cuda:1")
# ────────────────────────────────────────────────────────────────────────────────

def fmt_mib(x: int) -> float:
    return x / (1024**2)

def get_driver_gpu_used(gpu_index: int = 0) -> float:
    out = subprocess.check_output([
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
        "-i", str(gpu_index)
    ])
    return max(float(x) for x in out.decode().splitlines() if x.strip())

@torch.no_grad()
def eval_ppl_and_speed(model, tokens, seqlen, device):
    total_nll, total_toks = 0.0, 0
    # make sure all kernels are done before timing
    torch.cuda.synchronize(DEVICE)
    t0 = time.time()

    for i in range(tokens.size(1) // seqlen):
        chunk = tokens[:, i*seqlen:(i+1)*seqlen].to(DEVICE)
        out   = model(chunk, labels=chunk)
        total_nll += out.loss.item() * chunk.numel()
        total_toks += chunk.numel()

    # wait for last kernels
    torch.cuda.synchronize(DEVICE)
    t1 = time.time()
    peak_inference = torch.cuda.max_memory_reserved(device)
    ppl       = torch.exp(torch.tensor(total_nll/total_toks)).item()
    tok_per_s = total_toks / (t1 - t0)
    return ppl, tok_per_s, peak_inference

def main():
    # 1) Prepare tokens
    ds       = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    tokenizer= AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=False)
    big_txt  = "\n\n".join(ds["text"])
    ids      = tokenizer.encode(big_txt, add_special_tokens=False)
    tokens   = torch.tensor([ids], dtype=torch.long)

    # 2) Load & measure alloc stats
    torch.cuda.reset_peak_memory_stats()
    t_start = time.time()

    awq_cfg = AwqConfig(bits=4, do_fuse=False, fuse_max_seq_len=SEQLEN)
    model   = AutoModelForCausalLM.from_pretrained(
        QUANT_PATH,
        quantization_config=awq_cfg,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map={ "": DEVICE.index },
    ).to(DEVICE)
    
    torch.cuda.synchronize(DEVICE)
    peak = torch.cuda.max_memory_reserved(DEVICE)
    print(f"Peak PT alloc : {peak/1024**2:.1f} MiB")
    load_time    = time.time() - t_start
    
    # PyTorch allocator usage (optional)
    gpu_used_tensor = fmt_mib(torch.cuda.memory_allocated(DEVICE))

    # Real driver-reported peak (≈nvidia-smi)
    gpu_used_driver = get_driver_gpu_used(1)

    # CPU RSS
    cpu_rss = fmt_mib(psutil.Process(os.getpid()).memory_info().rss)

    # 3) Eval PPL + speed
    ppl, speed, inf_peak = eval_ppl_and_speed(model, tokens, SEQLEN, DEVICE)

    # 4) Print bench
    print(f"=== {MODEL_NAME} 8B INT4 Bench ===")
    print(f"Load time           : {load_time:5.1f} s")
    print(f"Load GPU memory     : {gpu_used_tensor:7.1f} MiB  (PyTorch)")
    print(f"Load GPU memory     : {gpu_used_driver:7.1f} MiB  (nvidia-smi)")  # ← your 9793 MiB
    print(f"Load CPU RSS        : {cpu_rss:7.1f} MiB")
    print(f"Inference peak GPU  : {inf_peak/1024**2:7.1f} MiB")
    print(f"Inference speed     : {speed:7.1f} tokens/s (+{SEQLEN})")
    print(f"Wikitext-2 PPL      : {ppl:7.2f}")

if __name__ == "__main__":
    main()


Token indices sequence length is longer than the specified maximum sequence length for this model (289076 > 131072). Running this sequence through the model will result in indexing errors
`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192
I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Peak PT alloc : 5474.0 MiB
=== Meta-Llama-3.1-8B 8B INT4 Bench ===
Load time           :  12.9 s
Load GPU memory     :  5462.5 MiB  (PyTorch)
Load GPU memory     :  5751.0 MiB  (nvidia-smi)
Load CPU RSS        :  1056.6 MiB
Inference peak GPU  :  9458.0 MiB
Inference speed     :  3608.5 tokens/s (+2048)
Wikitext-2 PPL      :    6.64


In [1]:
import os
import time
import psutil
import subprocess

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AwqConfig,
)

# ── Configuration ─────────────────────────────────────────────────────────────
MODEL_NAME     = "Meta-Llama-3.1-8B"
TOKENIZER_PATH = "/home/gunwoong/URP2025-1/llama3-8b"
QUANT_PATH     = "/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-GEMM-INT4"
SEQLEN         = 2048
DEVICE         = torch.device("cuda:1")
# ────────────────────────────────────────────────────────────────────────────────

def fmt_mib(x: int) -> float:
    return x / (1024**2)

def get_driver_gpu_used(gpu_index: int = 0) -> float:
    out = subprocess.check_output([
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
        "-i", str(gpu_index)
    ])
    return max(float(x) for x in out.decode().splitlines() if x.strip())

@torch.no_grad()
def eval_ppl_and_speed(model, tokens, seqlen, device):
    total_nll, total_toks = 0.0, 0
    # make sure all kernels are done before timing
    torch.cuda.synchronize(DEVICE)
    t0 = time.time()

    for i in range(tokens.size(1) // seqlen):
        chunk = tokens[:, i*seqlen:(i+1)*seqlen].to(DEVICE)
        out   = model(chunk, labels=chunk)
        total_nll += out.loss.item() * chunk.numel()
        total_toks += chunk.numel()

    # wait for last kernels
    torch.cuda.synchronize(DEVICE)
    t1 = time.time()
    peak_inference = torch.cuda.max_memory_reserved(device)
    ppl       = torch.exp(torch.tensor(total_nll/total_toks)).item()
    tok_per_s = total_toks / (t1 - t0)
    return ppl, tok_per_s, peak_inference

def main():
    # 1) Prepare tokens
    ds       = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    tokenizer= AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=False)
    big_txt  = "\n\n".join(ds["text"])
    ids      = tokenizer.encode(big_txt, add_special_tokens=False)
    tokens   = torch.tensor([ids], dtype=torch.long).repeat(2, 1)

    # 2) Load & measure alloc stats
    torch.cuda.reset_peak_memory_stats()
    t_start = time.time()

    awq_cfg = AwqConfig(bits=4, do_fuse=False, fuse_max_seq_len=SEQLEN)
    model   = AutoModelForCausalLM.from_pretrained(
        QUANT_PATH,
        quantization_config=awq_cfg,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map={ "": DEVICE.index },
    ).to(DEVICE)
    
    torch.cuda.synchronize(DEVICE)
    peak = torch.cuda.max_memory_reserved(DEVICE)
    print(f"Peak PT alloc : {peak/1024**2:.1f} MiB")
    load_time    = time.time() - t_start
    
    # PyTorch allocator usage (optional)
    gpu_used_tensor = fmt_mib(torch.cuda.memory_allocated(DEVICE))

    # Real driver-reported peak (≈nvidia-smi)
    gpu_used_driver = get_driver_gpu_used(1)

    # CPU RSS
    cpu_rss = fmt_mib(psutil.Process(os.getpid()).memory_info().rss)

    # 3) Eval PPL + speed
    ppl, speed, inf_peak = eval_ppl_and_speed(model, tokens, SEQLEN, DEVICE)

    # 4) Print bench
    print(f"=== {MODEL_NAME} 8B INT4 Bench ===")
    print(f"Load time           : {load_time:5.1f} s")
    print(f"Load GPU memory     : {gpu_used_tensor:7.1f} MiB  (PyTorch)")
    print(f"Load GPU memory     : {gpu_used_driver:7.1f} MiB  (nvidia-smi)")  # ← your 9793 MiB
    print(f"Load CPU RSS        : {cpu_rss:7.1f} MiB")
    print(f"Inference peak GPU  : {inf_peak/1024**2:7.1f} MiB")
    print(f"Inference speed     : {speed:7.1f} tokens/s (+{SEQLEN})")
    print(f"Wikitext-2 PPL      : {ppl:7.2f}")

if __name__ == "__main__":
    main()


Token indices sequence length is longer than the specified maximum sequence length for this model (289076 > 131072). Running this sequence through the model will result in indexing errors
`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192
I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Peak PT alloc : 5474.0 MiB
=== Meta-Llama-3.1-8B 8B INT4 Bench ===
Load time           :   3.6 s
Load GPU memory     :  5462.5 MiB  (PyTorch)
Load GPU memory     :  5751.0 MiB  (nvidia-smi)
Load CPU RSS        :  1057.3 MiB
Inference peak GPU  : 13132.0 MiB
Inference speed     :  3815.9 tokens/s (+2048)
Wikitext-2 PPL      :    6.64


In [2]:
import os
import time
import psutil
import subprocess

import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# ── Configuration ─────────────────────────────────────────────────────────────
MODEL_NAME     = "Meta-Llama-3.1-8B"
TOKENIZER_PATH = "/home/gunwoong/URP2025-1/llama3-8b"
QUANT_PATH     = "/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-Marlin-INT4"
SEQLEN         = 2048
DEVICE         = torch.device("cuda:0")
# ────────────────────────────────────────────────────────────────────────────────

def fmt_mib(x: int) -> float:
    return x / (1024**2)

def get_driver_gpu_used(gpu_index: int = 0) -> float:
    out = subprocess.check_output([
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
        "-i", str(gpu_index)
    ])
    return max(float(x) for x in out.decode().splitlines() if x.strip())

@torch.no_grad()
def eval_ppl_and_speed(model, tokens, seqlen, device):
    total_nll, total_toks = 0.0, 0
    torch.cuda.synchronize(device)
    t0 = time.time()

    for i in range(tokens.size(1) // seqlen):
        chunk = tokens[:, i*seqlen:(i+1)*seqlen].to(device)
        out   = model(chunk, labels=chunk)
        total_nll += out.loss.item() * chunk.numel()
        total_toks += chunk.numel()

    torch.cuda.synchronize(device)
    t1 = time.time()
    inf_peak = torch.cuda.max_memory_reserved(device)
    ppl       = torch.exp(torch.tensor(total_nll/total_toks)).item()
    speed     = total_toks / (t1 - t0)
    return ppl, speed, inf_peak

def main():
    # 1) Prepare tokens
    ds        = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=False)
    big_txt   = "\n\n".join(ds["text"])
    ids       = tokenizer.encode(big_txt, add_special_tokens=False)
    tokens    = torch.tensor([ids], dtype=torch.long)

    # 2) Load & measure alloc stats
    torch.cuda.reset_peak_memory_stats()
    t_start = time.time()

    # → Marlin 전용 로드
    model = AutoAWQForCausalLM.from_quantized(
        QUANT_PATH,
        device_map={"": "cuda:0"},  # 단일 GPU로 몰아넣기
        safetensors=True,
        fuse_layers=False,
        trust_remote_code=True,
    )

    torch.cuda.synchronize(DEVICE)
    peak_load = torch.cuda.max_memory_reserved(DEVICE)
    load_time = time.time() - t_start

    gpu_allocator = fmt_mib(torch.cuda.memory_allocated(DEVICE))
    gpu_driver    = get_driver_gpu_used(1)
    cpu_rss       = fmt_mib(psutil.Process(os.getpid()).memory_info().rss)

    # 3) Eval PPL + speed
    ppl, speed, inf_peak = eval_ppl_and_speed(model, tokens, SEQLEN, DEVICE)

    # 4) Print bench
    print(f"\n=== {MODEL_NAME} 8B AWQ-Marlin INT4 Bench ===")
    print(f"Load time           : {load_time:5.1f} s")
    print(f"Peak PT alloc       : {peak_load/1024**2:7.1f} MiB")
    print(f"Load GPU memory     : {gpu_allocator:7.1f} MiB  (torch)")
    print(f"Load GPU memory     : {gpu_driver:7.1f} MiB  (nvidia-smi)")
    print(f"Load CPU RSS        : {cpu_rss:7.1f} MiB")
    print(f"Inference peak GPU  : {inf_peak/1024**2:7.1f} MiB")
    print(f"Inference speed     : {speed:7.1f} tokens/s (+{SEQLEN})")
    print(f"Wikitext-2 PPL      : {ppl:7.2f}")
    print()
    
if __name__ == "__main__":
    main()


Token indices sequence length is longer than the specified maximum sequence length for this model (289076 > 131072). Running this sequence through the model will result in indexing errors
`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192
`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192
Replacing layers...: 100%|██████████| 32/32 [00:04<00:00,  7.03it/s]



=== Meta-Llama-3.1-8B 8B AWQ-Marlin INT4 Bench ===
Load time           :   6.0 s
Peak PT alloc       :  5438.0 MiB
Load GPU memory     :  5436.9 MiB  (torch)
Load GPU memory     :   355.0 MiB  (nvidia-smi)
Load CPU RSS        :  1916.3 MiB
Inference peak GPU  :  9314.0 MiB
Inference speed     :  3944.7 tokens/s (+2048)
Wikitext-2 PPL      :    7.00



In [1]:
import os
import time
import psutil
import subprocess

import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# ── Configuration ─────────────────────────────────────────────────────────────
MODEL_NAME     = "Meta-Llama-3.1-8B"
TOKENIZER_PATH = "/home/gunwoong/URP2025-1/llama3-8b"
QUANT_PATH     = "/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-Marlin-INT4"
SEQLEN         = 2048
DEVICE         = torch.device("cuda:0")
# ────────────────────────────────────────────────────────────────────────────────

def fmt_mib(x: int) -> float:
    return x / (1024**2)

def get_driver_gpu_used(gpu_index: int = 0) -> float:
    out = subprocess.check_output([
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
        "-i", str(gpu_index)
    ])
    return max(float(x) for x in out.decode().splitlines() if x.strip())

@torch.no_grad()
def eval_ppl_and_speed(model, tokens, seqlen, device):
    total_nll, total_toks = 0.0, 0
    torch.cuda.synchronize(device)
    t0 = time.time()

    for i in range(tokens.size(1) // seqlen):
        chunk = tokens[:, i*seqlen:(i+1)*seqlen].to(device)
        out   = model(chunk, labels=chunk)
        total_nll += out.loss.item() * chunk.numel()
        total_toks += chunk.numel()

    torch.cuda.synchronize(device)
    t1 = time.time()
    inf_peak = torch.cuda.max_memory_reserved(device)
    ppl       = torch.exp(torch.tensor(total_nll/total_toks)).item()
    speed     = total_toks / (t1 - t0)
    return ppl, speed, inf_peak

def main():
    # 1) Prepare tokens
    ds        = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=False)
    big_txt   = "\n\n".join(ds["text"])
    ids       = tokenizer.encode(big_txt, add_special_tokens=False)
    tokens    = torch.tensor([ids], dtype=torch.long).repeat(2, 1)

    # 2) Load & measure alloc stats
    torch.cuda.reset_peak_memory_stats()
    t_start = time.time()

    # → Marlin 전용 로드
    model = AutoAWQForCausalLM.from_quantized(
        QUANT_PATH,
        device_map={"": "cuda:0"},  # 단일 GPU로 몰아넣기
        safetensors=True,
        fuse_layers=False,
        trust_remote_code=True,
    )

    torch.cuda.synchronize(DEVICE)
    peak_load = torch.cuda.max_memory_reserved(DEVICE)
    load_time = time.time() - t_start

    gpu_allocator = fmt_mib(torch.cuda.memory_allocated(DEVICE))
    gpu_driver    = get_driver_gpu_used(1)
    cpu_rss       = fmt_mib(psutil.Process(os.getpid()).memory_info().rss)

    # 3) Eval PPL + speed
    ppl, speed, inf_peak = eval_ppl_and_speed(model, tokens, SEQLEN, DEVICE)

    # 4) Print bench
    print(f"\n=== {MODEL_NAME} 8B AWQ-Marlin INT4 Bench ===")
    print(f"Load time           : {load_time:5.1f} s")
    print(f"Peak PT alloc       : {peak_load/1024**2:7.1f} MiB")
    print(f"Load GPU memory     : {gpu_allocator:7.1f} MiB  (torch)")
    print(f"Load GPU memory     : {gpu_driver:7.1f} MiB  (nvidia-smi)")
    print(f"Load CPU RSS        : {cpu_rss:7.1f} MiB")
    print(f"Inference peak GPU  : {inf_peak/1024**2:7.1f} MiB")
    print(f"Inference speed     : {speed:7.1f} tokens/s (+{SEQLEN})")
    print(f"Wikitext-2 PPL      : {ppl:7.2f}")
    print()
    
if __name__ == "__main__":
    main()


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/

Token indices sequence length is longer than the specified maximum sequence length for this model (289076 > 131072). Running this sequence through the model will result in indexing errors
`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192
`rope_scaling`'s original_max_position_embeddings field mus


=== Meta-Llama-3.1-8B 8B AWQ-Marlin INT4 Bench ===
Load time           :   6.7 s
Peak PT alloc       :  5438.0 MiB
Load GPU memory     :  5436.9 MiB  (torch)
Load GPU memory     :    18.0 MiB  (nvidia-smi)
Load CPU RSS        :  1102.2 MiB
Inference peak GPU  : 13132.0 MiB
Inference speed     :  3953.9 tokens/s (+2048)
Wikitext-2 PPL      :    6.98



In [None]:
import os
# ── FLASH ATTENTION 완전 비활성화 ────────────────────────────────────────
os.environ["TRANSFORMERS_NO_TRITON_FLASH_ATTENTION"] = "1"
# 또는
os.environ["DISABLE_FLASH_ATTENTION"] = "1"

import time
import psutil
import subprocess
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# ── Configuration ─────────────────────────────────────────────────────────
MODEL_NAME     = "meta-llama/Meta-Llama-3.1-8B"
TOKENIZER_PATH = "/home/gunwoong/URP2025-1/llama3-8b"
SEQLEN         = 2048
DEVICE         = torch.device("cuda:0")
# ────────────────────────────────────────────────────────────────────────────────

def fmt_mib(x: int) -> float:
    return x / (1024**2)

def get_driver_gpu_used(gpu_index: int = 0) -> float:
    out = subprocess.check_output([
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
        "-i", str(gpu_index)
    ])
    return max(float(x) for x in out.decode().splitlines() if x.strip())

@torch.no_grad()
def eval_ppl_and_speed(model, tokens, seqlen, device):
    torch.cuda.synchronize(device); t0 = time.time()
    total_nll, total_toks = 0.0, 0
    for i in range(tokens.size(1) // seqlen):
        chunk = tokens[:, i*seqlen:(i+1)*seqlen].to(device)
        out   = model(chunk, labels=chunk)
        total_nll += out.loss.item() * chunk.numel()
        total_toks += chunk.numel()
    torch.cuda.synchronize(device); t1 = time.time()
    inf_peak = torch.cuda.max_memory_reserved(device)
    ppl       = torch.exp(torch.tensor(total_nll/total_toks)).item()
    speed     = total_toks / (t1 - t0)
    return ppl, speed, inf_peak

def main():
    # 토큰 준비
    ds        = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=False)
    ids       = tokenizer.encode("\n\n".join(ds["text"]), add_special_tokens=False)
    tokens    = torch.tensor([ids], dtype=torch.long).repeat(2, 1)

    # 모델 로드 및 메모리 측정
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map={"": "cuda:0"},
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    model.eval()
    torch.cuda.synchronize(DEVICE)
    peak_load = torch.cuda.max_memory_reserved(DEVICE)
    load_time = time.time() - t0

    gpu_alloc = fmt_mib(torch.cuda.memory_allocated(DEVICE))
    gpu_drv   = get_driver_gpu_used(0)
    cpu_rss   = fmt_mib(psutil.Process(os.getpid()).memory_info().rss)

    # PPL, 속도 측정
    ppl, speed, inf_peak = eval_ppl_and_speed(model, tokens, SEQLEN, DEVICE)

    # 결과 출력
    print(f"\n=== {MODEL_NAME} 8B Base FP16 (no fused-attn) ===")
    print(f"Load time           : {load_time:5.1f} s")
    print(f"Peak PT alloc       : {peak_load/1024**2:7.1f} MiB")
    print(f"Load GPU memory     : {gpu_alloc:7.1f} MiB  (torch)")
    print(f"Load GPU memory     : {gpu_drv:7.1f} MiB  (nvidia-smi)")
    print(f"Load CPU RSS        : {cpu_rss:7.1f} MiB")
    print(f"Inference peak GPU  : {inf_peak/1024**2:7.1f} MiB")
    print(f"Inference speed     : {speed:7.1f} tokens/s (+{SEQLEN})")
    print(f"Wikitext-2 PPL      : {ppl:7.2f}\n")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (289076 > 131072). Running this sequence through the model will result in indexing errors
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.62it/s]



=== meta-llama/Meta-Llama-3.1-8B 8B Base FP16 (no fused-attn) ===
Load time           :   5.9 s
Peak PT alloc       : 17324.0 MiB
Load GPU memory     : 15316.5 MiB  (torch)
Load GPU memory     : 17643.0 MiB  (nvidia-smi)
Load CPU RSS        :   995.0 MiB
Inference peak GPU  : 23340.0 MiB
Inference speed     :  4061.3 tokens/s (+2048)
Wikitext-2 PPL      :    6.24



In [1]:
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# ── 설정 ───────────────────────────────────────────────────────
DEVICE     = torch.device("cuda:1")  # 사용할 GPU
QUANT_PATH = "/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-Marlin-INT4"
# ───────────────────────────────────────────────────────────────

# 1) 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(QUANT_PATH, use_fast=False)

# 2) Marlin‐INT4 체크포인트 불러오기
model = AutoAWQForCausalLM.from_quantized(
    quant_path=QUANT_PATH,
    fuse_layers=True,        # Marlin CUDA 커널 활성화
    safetensors=True,        # safetensors 포맷 사용
    device_map="auto",       # 자동으로 GPU에 배치
    low_cpu_mem_usage=True,  # CPU 메모리 절약 모드
).to(DEVICE)

# 3) 간단히 생성 예시
prompt = "안녕하세요, 오늘 기분이 어때?"
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

# .generate() API 는 transformers 인터페이스와 동일합니다.
out = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.8,
)
print(tokenizer.decode(out[0], skip_special_tokens=True))


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/

Replacing layers...: 100%|██████████| 32/32 [00:04<00:00,  6.74it/s]


  0%|          | 0/514 [00:00<?, ?w/s]

  0%|          | 0/1 [00:00<?, ?w/s]

Fusing layers...:   0%|          | 0/32 [00:00<?, ?it/s]


AttributeError: 'WQLinear_Marlin' object has no attribute 'qzeros'

In [1]:
import os
import time
import psutil
import subprocess

import torch
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# ── USER SETTINGS ────────────────────────────────────────────────────────────
MODEL_NAME     = "Meta-Llama-3.1-8B (Marlin INT4)"
TOKENIZER_PATH = "/home/gunwoong/URP2025-1/llama3-8b"
QUANT_PATH     = "/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-Marlin-INT4"
SEQLEN         = 2048
DEVICE         = torch.device("cuda:1")   # Marlin 모델이 올라간 GPU
# ───────────────────────────────────────────────────────────────────────────────

def fmt_mib(x: int) -> float:
    return x / (1024**2)

def get_driver_gpu_used(gpu_index: int = 1) -> float:
    out = subprocess.check_output([
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
        "-i", str(gpu_index)
    ])
    return float(out.decode().strip())

@torch.no_grad()
def eval_ppl_and_speed(model, tokens, seqlen):
    # ensure all queued kernels are done before timing
    torch.cuda.synchronize(DEVICE)
    t0 = time.time()

    total_nll, total_toks = 0.0, 0
    for i in range(tokens.size(1) // seqlen):
        chunk = tokens[:, i*seqlen:(i+1)*seqlen].to(DEVICE)
        out   = model(chunk, labels=chunk)
        total_nll += out.loss.item() * chunk.numel()
        total_toks += chunk.numel()

    # wait for last kernels
    torch.cuda.synchronize(DEVICE)
    t1 = time.time()

    ppl       = torch.exp(torch.tensor(total_nll / total_toks)).item()
    tok_per_s = total_toks / (t1 - t0)
    return ppl, tok_per_s

def main():
    # 1) Prepare tokens
    ds_test   = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=False)
    big_txt   = "\n\n".join(ds_test["text"])
    ids       = tokenizer.encode(big_txt, add_special_tokens=False)
    tokens    = torch.tensor([ids], dtype=torch.long)

    # 2) Load quantized Marlin model with fused kernels
    torch.cuda.reset_peak_memory_stats()
    t_start = time.time()

    model = AutoAWQForCausalLM.from_pretrained(
        QUANT_PATH,
        version="Marlin",
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        use_cache=False,        # ← fused CUDA 커널 활성화
    )
    model = model.pack(fuse=True).to(DEVICE)
    # sync & measure
    torch.cuda.synchronize(DEVICE)
    load_time = time.time() - t_start

    # PyTorch allocator memory
    gpu_alloc = fmt_mib(torch.cuda.memory_allocated(DEVICE))
    # driver-reported memory
    gpu_driver = get_driver_gpu_used(DEVICE.index)
    cpu_rss = fmt_mib(psutil.Process(os.getpid()).memory_info().rss)

    # 3) Eval PPL & Throughput
    ppl, speed = eval_ppl_and_speed(model, tokens, SEQLEN)

    # 4) Report
    print(f"=== {MODEL_NAME} Bench ===")
    print(f"Load time           : {load_time:5.1f} s")
    print(f"Load GPU memory     : {gpu_alloc:7.1f} MiB  (PyTorch)")
    print(f"Load GPU memory     : {gpu_driver:7.1f} MiB  (nvidia-smi)")
    print(f"Load CPU RSS        : {cpu_rss:7.1f} MiB")
    print(f"Inference speed     : {speed:7.1f} tokens/s (+{SEQLEN})")
    print(f"Wikitext-2 PPL      : {ppl:7.2f}")

if __name__ == "__main__":
    main()


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/

Token indices sequence length is longer than the specified maximum sequence length for this model (289076 > 131072). Running this sequence through the model will result in indexing errors


ValueError: Unknown AWQLinearVersion marlin

In [None]:
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, pipeline

# 1) 양자화된 모델이 저장된 경로
quant_path = "/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-Marlin-INT4"

# 2) 로드 (device_map="auto" 또는 "cuda:0" 등 지정)
print(f"Loading quantized model from: {quant_path}")
model = AutoAWQForCausalLM.from_quantized(
    quant_path,
    device_map="auto",     # GPU/CPU 자동 분산
    safetensors=True       # safetensors 형식이라면 True
)
tokenizer = AutoTokenizer.from_pretrained(
    quant_path,
    trust_remote_code=True
)
print("Quantized model and tokenizer loaded.")

# 3) (선택) 파이프라인을 이용한 예시 추론
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto"
)
prompt = "Activation-Aware Weight Quantization (AWQ)란 무엇인가?"
outputs = pipe(prompt, max_new_tokens=50, do_sample=True, temperature=0.7)
print(outputs[0]["generated_text"])


`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=4096


INFO 05-29 00:49:23 [config.py:793] This model supports multiple tasks: {'generate', 'classify', 'embed', 'score', 'reward'}. Defaulting to 'generate'.
INFO 05-29 00:49:23 [awq_marlin.py:115] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 05-29 00:49:23 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-29 00:49:23 [core.py:438] Waiting for init message from front-end.
INFO 05-29 00:49:23 [core.py:65] Initializing a V1 LLM engine (v0.9.0) with config: model='/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-GEMM-INT4', speculative_config=None, tokenizer='/home/gunwoong/URP2025-1/Meta-Llama-3.1-8B-AWQ-GEMM-INT4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, 

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-29 00:49:25 [default_loader.py:280] Loading weights took 0.85 seconds
INFO 05-29 00:49:26 [gpu_model_runner.py:1549] Model loading took 5.3434 GiB and 1.535618 seconds
INFO 05-29 00:49:35 [backends.py:459] Using cache directory: /home/gunwoong/.cache/vllm/torch_compile_cache/26275be213/rank_0_0 for vLLM's torch.compile
INFO 05-29 00:49:35 [backends.py:469] Dynamo bytecode transform time: 8.59 s
INFO 05-29 00:49:38 [backends.py:158] Cache the graph of shape None for later use
INFO 05-29 00:50:06 [backends.py:170] Compiling a graph for general shape takes 30.52 s
INFO 05-29 00:50:33 [monitor.py:33] torch.compile takes 39.11 s in total
INFO 05-29 00:50:33 [kv_cache_utils.py:637] GPU KV cache size: 89,504 tokens
INFO 05-29 00:50:33 [kv_cache_utils.py:640] Maximum concurrency for 4,096 tokens per request: 21.85x
INFO 05-29 00:51:00 [gpu_model_runner.py:1933] Graph capturing finished in 26 secs, took 0.53 GiB
INFO 05-29 00:51:00 [core.py:167] init engine (profile, create kv cache, wa

Adding requests:   0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Prompt: 안녕하세요.
Generated:  오늘은 대도서관에서 열리길래, 밴드 입단식 봤어요!
그리고 금일 2pm 로드 윈
Prompt: 대한민국의 수도는?
Generated:  Seoul is the capital of Korea.
