In [None]:
# Environment setup: remove the preinstalled PyTorch / transformers / vLLM stack,
# install PyTorch from the CUDA 12.6 wheel index, then install the inference and
# quantization libraries that the next cell imports.
# NOTE(review): apart from triton, none of the installs below are version-pinned,
# so a re-run may resolve to different versions than the ones shown in the output.
!pip uninstall torch torchvision torchaudio transformers vllm -y
!pip cache purge
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
!pip install --upgrade transformers vllm datasets tqdm 
!pip install -U gptqmodel --no-build-isolation -v
!pip install optimum
# triton is force-reinstalled last so that this exact version overrides whatever
# the earlier installs pulled in.
# NOTE(review): confirm triton 3.2.0 is compatible with the torch build installed above.
!pip install --force-reinstall triton==3.2.0
# NOTE(review): avoid storing a literal access token in the notebook; prefer an
# environment variable or an interactive login prompt.
!huggingface-cli login --token ***    # Read
# !huggingface-cli login --token ***    # Write

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
[0mFiles removed: 0
Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch
  Downloading https://download.pytorch.org/whl/cu126/torch-2.7.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu126/torchvision-0.22.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu126/tor

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm
from datasets import load_dataset
import random
import numpy as np
from vllm import LLM, SamplingParams
from gptqmodel import GPTQModel, QuantizeConfig
from vllm.config import CompilationConfig, CompilationLevel

#####################################################################
# === SPEC NOTICE ===
# Only "load model" and "generate" function selection can be modified.
# DO NOT change PPL calculation, timing, or throughput logic.
#####################################################################


# === (Optional) Define your own custom generate function. ===
# This is useful if you want full control over KV cache and generation steps.
# You can modify this function to suit your needs.
# By default, we use model.generate() for simplicity and general use.
def generate(model, input_ids, past_key_values, max_new_tokens):
    """Greedy-decode a continuation of ``input_ids`` using an explicit KV cache.

    The prompt is processed once through ``model.prefill_forward`` (which yields
    the first generated token), then ``max_new_tokens`` further tokens are
    produced one at a time by calling ``model`` on the most recent token only.

    Args:
        model: Callable model that also exposes ``prefill_forward``. Both calls
            must return an object with ``logits`` and ``past_key_values``.
        input_ids: Prompt token ids; cloned, so the caller's tensor is not
            modified. Assumed to be ``(batch, prompt_len)`` — the code
            concatenates along the last dimension and reads ``shape[1]``.
        past_key_values: KV cache object forwarded to the model and replaced by
            the cache returned from each forward pass.
        max_new_tokens: Number of single-token decode steps run after prefill.

    Returns:
        ``input_ids`` extended along the last dimension with the generated
        tokens: one token from the prefill step plus ``max_new_tokens`` tokens
        from the decode loop.
    """
    input_ids = input_ids.clone()
    with torch.no_grad():
        # Prefill: run the whole prompt once and keep only the last position's logits.
        outputs = model.prefill_forward(
            input_ids,
            past_key_values=past_key_values,
            position_ids=None,
            attention_mask=None,
            cache_position=None,
            logits_to_keep=1,
        )
        past_key_values = outputs.past_key_values
        next_token = torch.argmax(outputs.logits, dim=-1)
        input_ids = torch.cat([input_ids, next_token], dim=-1)

        # Token-by-token decoding
        for _ in range(max_new_tokens):
            # `next_token` has already been appended, so it is the LAST entry of
            # `input_ids` and its 0-based position is shape[1] - 1. Using
            # shape[1] here would shift every position by one and leave an
            # unwritten slot in the KV cache right after the prompt.
            pos = input_ids.shape[1] - 1
            cache_position = torch.arange(
                pos, pos + 1, device=input_ids.device, dtype=torch.long
            )

            outputs = model(
                next_token,
                past_key_values=past_key_values,
                position_ids=cache_position.unsqueeze(0),
                cache_position=cache_position,
            )
            logits = outputs.logits
            next_token = torch.argmax(logits, dim=-1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            past_key_values = outputs.past_key_values

    return input_ids


def evaluate_ppl(model_name, device="cuda:0"):
    """Compute perplexity of a GPTQ model on the WikiText-2 (raw) test split.

    The test split is joined into one string, tokenized once, and cut into
    consecutive non-overlapping windows of 2048 tokens; any trailing tokens
    that do not fill a whole window are ignored.

    NOTE: per the SPEC NOTICE at the top of this file, the PPL calculation
    must not be changed.

    Args:
        model_name: Model identifier passed to ``GPTQModel.load`` and
            ``AutoTokenizer.from_pretrained``.
        device: Device the model and the token ids are moved to.

    Returns:
        The perplexity as a Python float.
    """
    model = GPTQModel.load(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    model.eval()
    model.to(device)
    test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    # Tokenize the entire test split as one long sequence.
    test_enc = tokenizer("\n\n".join(test_dataset["text"]), return_tensors="pt")
    # Evaluation window length, stored on the model object and reused below.
    model.seqlen = 2048
    test_enc = test_enc.input_ids.to(device)

    # Number of full windows; the remainder is dropped by the integer division.
    nsamples = test_enc.numel() // model.seqlen
    nlls = []
    for i in tqdm(range(nsamples), desc="Evaluating..."):
        batch = test_enc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)]

        with torch.no_grad():
            lm_logits = model(batch).logits

        # Next-token prediction: logits at position t are scored against token t + 1.
        shift_logits = lm_logits[:, :-1, :].contiguous().float()
        shift_labels = test_enc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
        )
        # The mean loss of the window is scaled by seqlen (not seqlen - 1).
        neg_log_likelihood = loss.float() * model.seqlen
        nlls.append(neg_log_likelihood)

    # exp(total NLL / total token count) over all windows.
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))

    del model

    return ppl.item()


def main():
    """Benchmark vLLM generation throughput, evaluate perplexity, and save both.

    Steps:
      1. Seed ``torch`` and ``random`` and build the vLLM engine for ``model_name``.
      2. Run 5 warm-up generations, then 10 generations timed with CUDA events.
      3. Report the mean throughput after discarding the two lowest and the two
         highest of the 10 samples.
      4. Release the vLLM engine, compute perplexity with ``evaluate_ppl`` and
         write perplexity (Id 0) and rounded throughput (Id 1) to ``result.csv``.
    """
    # Function-scope imports keep this function self-contained within the cell.
    import csv
    import gc

    ############## Set Up ##############
    torch.manual_seed(0)
    random.seed(0)

    max_new_tokens = 256  # Number of new tokens to generate
    device = "cuda:0"

    ### === TODO: Load your model (you may change this part) ===
    model_name = "zbyzby/Llama3.2-3B-Instruct-quantized"
    torch.cuda.empty_cache()
    vllm_model = LLM(
        model=model_name,
        kv_cache_dtype="auto",  # "auto": let vLLM choose the KV-cache dtype
        trust_remote_code=True,
        gpu_memory_utilization=0.9,
        max_num_seqs=64,
        max_model_len=2048,
        enable_chunked_prefill=True,
        enable_prefix_caching=True,
        tensor_parallel_size=1,
        speculative_config={
            "model": "zbyzby/Llama-3.2-1B-Instruct-GPTQ-Quant",
            "draft_tensor_parallel_size": 1,
            "num_speculative_tokens": 5,
        },
        # NOTE(review): confirm every key below is a valid CompilationConfig
        # field for the installed vLLM version; unrecognized keys may be
        # silently ignored or rejected.
        compilation_config={
            "backend": "inductor",
            "custom_ops": ["vllm_ops"],
            "mode": "reduce-overhead",
            "enable_cudagraphs": True,
            "max_autotune": True,
            "level": CompilationLevel.PIECEWISE,
        },
    )
    # Greedy decoding (temperature 0) capped at max_new_tokens per request.
    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=max_new_tokens, top_p=1.0
    )
    #####################################

    # === (Optional) Uncomment the following lines if using the custom generate() function. ===
    # model.prefill_forward = model.forward

    warmup_prompt = "Explain what AI is."
    # inputs = tokenizer(warmup_prompt, return_tensors="pt").to(device)
    # input_ids = inputs["input_ids"]
    # attention_mask = inputs["attention_mask"]

    # === (Optional) Set up StaticCache for manual KV cache management ===
    # from transformers import StaticCache
    # past_key_values = StaticCache(
    #     config=model.config,
    #     max_batch_size=1,
    #     max_cache_len=max_new_tokens + 16,
    #     device=model.device,
    #     dtype=torch.float16
    # )
    ####################################################################

    for _ in tqdm(range(5), desc="Warm Up..."):
        #  === Default: use model.generate() for end-to-end warm-up ===
        _ = vllm_model.generate([warmup_prompt], sampling_params)

        # === (Optional) Use custom generate() if uncommented ===
        # generated = generate(model, input_ids, past_key_values, max_new_tokens)
        # past_key_values.reset()

    prompt = "How to learn a new language?"
    # inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # input_ids = inputs["input_ids"]
    # attention_mask = inputs["attention_mask"]
    tputs = []
    time_record = []
    for _ in tqdm(range(10), desc="Test Inference"):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        # === Default: Use model.generate() for end-to-end timing ===
        generated = vllm_model.generate([prompt], sampling_params)

        # === Optional: Use custom generate() if uncommented ===
        # generated = generate(model, input_ids, past_key_values, max_new_tokens)
        # past_key_values.reset()

        end.record()
        torch.cuda.synchronize()
        elapsed_ms = start.elapsed_time(end)
        # Throughput assumes the request produced exactly max_new_tokens tokens.
        tput = max_new_tokens / (elapsed_ms / 1000)
        time_record.append(elapsed_ms / 1000)
        tputs.append(tput)

    response = generated[0].outputs[0].text
    # Trimmed mean: drop the two slowest and the two fastest runs.
    sorted_tputs = np.sort(tputs)[2:-2]
    org_tput = np.mean(sorted_tputs)
    print(f"Prompt: {prompt}\nResponse: {response}\n")

    print(f"Time Record: {time_record}")
    print(f"Throughput Record: {tputs} toks/s\n")

    del vllm_model

    ### Your final throughput result ###
    print(f"Throughput: {org_tput} toks/s")

    # Collect unreachable objects first so the engine's GPU tensors are actually
    # freed; empty_cache() can only return blocks that are no longer referenced,
    # and evaluate_ppl() is about to load a second copy of the model.
    gc.collect()
    torch.cuda.empty_cache()
    ppl = evaluate_ppl(model_name, device)
    print(f"Perplexity (PPL): {ppl}")

    # Save results to CSV
    rounded_tput = round(org_tput, 1)
    ppl = round(ppl, 2)

    with open("result.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Id", "value"])
        writer.writerow([0, ppl])
        writer.writerow([1, rounded_tput])

# Run the benchmark when this cell/script is executed directly.
if __name__ == "__main__":
    main()


2025-06-01 13:37:03.984373: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748785024.393669      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748785024.507283      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


INFO 06-01 13:37:18 [__init__.py:243] Automatically detected platform cuda.

[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.      
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.                              
INFO 06-01 13:37:25 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 06-01 13:37:25 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 06-01 13:37:25 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.


config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

INFO 06-01 13:37:42 [config.py:793] This model supports multiple tasks: {'generate', 'reward', 'embed', 'score', 'classify'}. Defaulting to 'generate'.
INFO 06-01 13:37:43 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 06-01 13:37:43 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0.1) with config: model='zbyzby/Llama3.2-3B-Instruct-quantized', speculative_config=None, tokenizer='zbyzby/Llama3.2-3B-Instruct-quantized', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reaso

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

INFO 06-01 13:37:45 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 06-01 13:37:45 [cuda.py:289] Using XFormers backend.


[W601 13:37:56.026980151 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 06-01 13:38:06 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 06-01 13:38:06 [model_runner.py:1170] Starting to load model zbyzby/Llama3.2-3B-Instruct-quantized...


[W601 13:38:06.037219934 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 06-01 13:38:07 [weight_utils.py:291] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

INFO 06-01 13:38:13 [weight_utils.py:307] Time spent downloading weights for zbyzby/Llama3.2-3B-Instruct-quantized: 6.793954 seconds
INFO 06-01 13:38:13 [weight_utils.py:344] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 06-01 13:38:15 [default_loader.py:280] Loading weights took 1.71 seconds
INFO 06-01 13:38:16 [model_runner.py:1202] Model loading took 2.1385 GiB and 9.297632 seconds
INFO 06-01 13:38:18 [worker.py:291] Memory profiling takes 1.28 seconds
INFO 06-01 13:38:18 [worker.py:291] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 06-01 13:38:18 [worker.py:291] model weights take 2.14GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 0.40GiB; the rest of the memory reserved for KV Cache is 10.68GiB.
INFO 06-01 13:38:18 [executor_base.py:112] # cuda blocks: 6246, # CPU blocks: 2340
INFO 06-01 13:38:18 [executor_base.py:117] Maximum concurrency for 2048 tokens per request: 48.80x
INFO 06-01 13:38:25 [model_runner.py:1512] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in t

Capturing CUDA graph shapes:   0%|          | 0/11 [00:00<?, ?it/s]

INFO 06-01 13:38:39 [model_runner.py:1670] Graph capturing finished in 13 secs, took 0.19 GiB
INFO 06-01 13:38:39 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 22.80 seconds


Warm Up...:   0%|          | 0/5 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Test Inference:   0%|          | 0/10 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Prompt: How to learn a new language?
Response:  Learning a new language can be a challenging but rewarding experience. Here are some steps you can take to learn a new language:
1. Set your goals: Before you start learning a new language, it's essential to set your goals. What do you want to achieve? Do you want to travel to a foreign country, communicate with family members or friends who speak the language, or improve your career prospects? Setting clear goals will help you stay motivated and focused throughout the learning process.
2. Choose your language: With so many languages to choose from, it can be difficult to decide which one to learn. Consider the following factors:
* Which language do you want to learn for personal or professional reasons?
* Which language is closest to your native language?
* Which language do you find most interesting or challenging?
* Which language do you think will be most useful for your goals?
3. Learn the basics: Once you've chosen your language, it

.gitattributes:   0%|          | 0.00/105 [00:00<?, ?B/s]

quant_log.csv:   0%|          | 0.00/8.68k [00:00<?, ?B/s]

quantize_config.json:   0%|          | 0.00/427 [00:00<?, ?B/s]

[32mINFO[0m  Loader: Auto dtype (native bfloat16): `torch.bfloat16`                                       
[32mINFO[0m  Estimated Quantization BPW (bits per weight): 4.2875 bpw, based on [bits: 4, group_size: 128]
[32mINFO[0m   Kernel: Auto-selection: adding candidate `ExllamaQuantLinear`                               
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`                              
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`                                 
[32mINFO[0m  Kernel: candidates -> `[ExllamaQuantLinear, TritonV2QuantLinear, TorchQuantLinear]`          
[32mINFO[0m  Kernel: selected -> `ExllamaQuantLinear`.                                                    
[32mINFO[0m  Format: Converting `checkpoint_format` from `gptq` to internal `gptq_v2`.                    
[32mINFO[0m  Format: Converting GPTQ v1 to v2                                                             
[32mINFO[0m  Form

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (289077 > 131072). Running this sequence through the model will result in indexing errors


Evaluating...:   0%|          | 0/141 [00:00<?, ?it/s]

Perplexity (PPL): 11.217229843139648
