In [3]:
# 從頭開始
!pip install -U numpy==1.26.4 scipy==1.11.4
!pip install -U transformers accelerate bitsandbytes tqdm datasets
!pip install -v gptqmodel --no-build-isolation


Collecting scipy==1.11.4
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.11.4 which is incompatible.
cesium 0.12.4 requires numpy<3.0

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it would be saved with the file
# and leak when the notebook is shared. Read it from the HF_TOKEN environment
# variable instead; when the variable is unset or empty, `token=None` makes
# `login` fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)


In [None]:
# Merge the QLoRA adapter into the base model, then GPTQ-quantize the merged model.

import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig

import gc
import torch


def _free_cuda_memory():
    """Collect unreachable Python objects, then release cached CUDA memory.

    Garbage collection runs first so that tensors owned by dropped objects are
    actually freed before the caching allocator is asked to release its blocks.
    The CUDA calls are skipped when no CUDA device is available.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


# Start from a clean slate in case earlier cells left models on the GPU.
_free_cuda_memory()


# === Step 1: load the original model ===
base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# === Step 2: load the QLoRA adapter and merge it into the base weights ===
adapter_path = "/kaggle/input/gptq-qlora2/qlora-wikitext2"
merged_model = PeftModel.from_pretrained(base_model, adapter_path)
merged_model = merged_model.merge_and_unload()  # fold the adapter weights into the model

# === Step 3: save the merged model ===
merged_path = "/kaggle/working/merged-qlora-model"
merged_model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)

# The merged model is reloaded from disk by GPTQModel below, so drop the
# in-memory copy first; otherwise two copies of the weights are alive during
# quantization.
del base_model, merged_model
_free_cuda_memory()

# === Step 4: prepare the GPTQ calibration data (1024 samples; previously 128) ===
# NOTE(review): non-empty WikiText-2 lines are short; the log of the sibling
# base-model quantization cell reports an average of ~99 tokens per sample
# against a recommended minimum of 256 — consider selecting longer samples.
print("[4] Loading calibration dataset from huggingface wikitext-2...")
raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
calibration_dataset = [x["text"] for x in raw_dataset if len(x["text"].strip()) > 0][:1024]


# === Step 5: run GPTQ quantization ===
print("[5] Quantizing model...")
quant_path = "/kaggle/working/Qlora-GPTQModel-4bit"
quant_config = QuantizeConfig(bits=4, group_size=128)
gptq_model = GPTQModel.load(merged_path, quant_config)
gptq_model.quantize(calibration_dataset, batch_size=1)

# Best-effort report of the kernel in use; prints "unknown" when the wrapped
# model exposes no `kernel.name` attribute.
kernel = getattr(getattr(gptq_model.model, "kernel", None), "name", "unknown")
print("[Kernel] GPTQ kernel used:", kernel)

# === Step 6: save the quantized model ===
print("[6] Saving quantized model to:", quant_path)
gptq_model.save(quant_path)


In [4]:
# Baseline: GPTQ-quantize the original checkpoint (no adapter merged in).
import os
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig

# [1] Source checkpoint and output directory
model_id = "meta-llama/Llama-3.2-3B-Instruct"
quant_path = "/kaggle/working/Llama3-3B-GPTQModel-4bit"

# [2] Calibration set: the first 4096 non-blank lines of the WikiText-2 train split.
# NOTE(review): the log of this cell warns that the average calibration sample is
# ~99 tokens while more than 256 is recommended — consider selecting longer samples.
print("[2] Loading calibration dataset (WikiText-2)...")
raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
calibration_dataset = [
    text for text in raw_dataset["text"] if text.strip()
][:4096]

# [3] Quantization settings: 4-bit weights, group size 128
quant_config = QuantizeConfig(group_size=128, bits=4)

# [4] Load the model through GPTQModel (checkpoint files are downloaded on first use)
print("[4] Loading model with GPTQModel...")
model = GPTQModel.load(model_id, quant_config)

# [5] Quantize; raise batch_size if GPU memory allows
print("[5] Quantizing model...")
model.quantize(calibration_dataset, batch_size=1)

# [6] Persist the quantized weights
print("[6] Saving quantized model to:", quant_path)
model.save(quant_path)



[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.      
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.                              


2025-05-30 14:04:54.589289: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748613894.790853      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748613894.847926      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

[2] Loading calibration dataset (WikiText-2)...


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

[4] Loading model with GPTQModel...


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

[32mINFO[0m  Estimated Quantization BPW (bits per weight): 4.2875 bpw, based on [bits: 4, group_size: 128]


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/41.7k [00:00<?, ?B/s]

USE_POLICY.md:   0%|          | 0.00/6.02k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

orig_params.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

original/tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

original/consolidated.00.pth:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

[32mINFO[0m  Loader: Auto dtype (native bfloat16): `torch.bfloat16`                                       


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}

[32mINFO[0m  Kernel: loaded -> `[]`                                                                       
[5] Quantizing model...
[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`                       
[33mWARN[0m  The average length of input_ids of calibration_dataset should be greater than 256: actual avg: 99.41796875.
[32mINFO[0m  Process: progress logs for `gptq` will be streamed to file: `gptq_log_unmoderated_time_05_30_2025_14h_06m_34s.log`
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | process     | layer     | module               | loss           | samples     | damp        | time      | fwd_time     |
[32mINFO[0m

In [4]:
# result.py use gptq

import torch
import torch.nn as nn
from datasets import load_dataset
import random
import numpy as np
from tqdm.auto import tqdm
from gptqmodel import GPTQModel

import gc
import torch

# Drop unreachable Python objects first, so the CUDA memory they own becomes
# releasable before the caching allocator is asked to give it back.
gc.collect()

# Release cached PyTorch CUDA memory (skipped when no CUDA device is available,
# so the cell does not fail on a CPU-only session).
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


# Quantized checkpoint to evaluate; switch to the commented path to evaluate
# the base-model quantization instead.
# model_path = "/kaggle/working/Llama3-3B-GPTQModel-4bit"
model_path = "/kaggle/working/Qlora-GPTQModel-4bit"


def evaluate_ppl(model, tokenizer, device="cuda:0"):
    """Compute perplexity of `model` on the WikiText-2 (raw) test split.

    The test texts are joined with blank lines, tokenized in one pass, and cut
    into consecutive non-overlapping windows of 2048 tokens; tokens after the
    last full window are dropped. As a side effect, `model.seqlen` is set to 2048.

    Args:
        model: callable returning an object with a `.logits` tensor for a batch
            of token ids.
        tokenizer: callable tokenizer returning `input_ids` as a tensor.
        device: device the token ids are moved to before evaluation.

    Returns:
        The perplexity as a Python float.
    """
    corpus = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    token_ids = tokenizer("\n\n".join(corpus["text"]), return_tensors="pt").input_ids
    model.seqlen = 2048
    token_ids = token_ids.to(device)

    window = model.seqlen
    nsamples = token_ids.numel() // window
    criterion = nn.CrossEntropyLoss()

    nlls = []
    for idx in tqdm(range(nsamples), desc="Evaluating..."):
        begin = idx * window
        chunk = token_ids[:, begin:begin + window]

        with torch.no_grad():
            logits = model(chunk).logits

        # Next-token prediction: position t is scored against token t + 1.
        predictions = logits[:, :-1, :].contiguous().float()
        targets = chunk[:, 1:]

        mean_loss = criterion(
            predictions.view(-1, predictions.size(-1)),
            targets.view(-1),
        )
        # The mean loss of the window is scaled by the window length.
        nlls.append(mean_loss.float() * window)

    total_nll = torch.stack(nlls).sum()
    return torch.exp(total_nll / (nsamples * window)).item()

def main():
    """Benchmark the quantized model and write the results to `result.csv`.

    Loads the checkpoint at `model_path`, runs 10 timed generations of up to
    256 new tokens for a fixed prompt, reports the mean throughput after
    discarding the two slowest and two fastest runs, evaluates perplexity with
    `evaluate_ppl`, and writes both numbers to `result.csv` (row 0: perplexity,
    row 1: throughput).
    """
    torch.manual_seed(0)
    random.seed(0)

    max_new_tokens = 256
    device = 'cuda:0'

    quant_path = model_path
    # NOTE(review): confirm that `prefer_engine` is a keyword GPTQModel.load
    # actually honours; if it is not, the kernel choice is left to auto-selection.
    model = GPTQModel.load(quant_path, prefer_engine="exllamav2")
    tokenizer = model.tokenizer
    model.eval()

    # Best-effort lookup of the quantization kernel name; never fatal.
    try:
        kernel_info = getattr(getattr(model.model, "kernel_info", None), "name", "unknown")
    except Exception:
        print("no kernel info")
        kernel_info = "unknown"
    print(f"[Info] Loaded model with quant kernel: {kernel_info}")
    print(f"[Info] Model class: {type(model).__name__}")

    prompt = "How to learn a new language?"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    prompt_len = input_ids.shape[1]

    tputs = []
    time_record = []
    for _ in tqdm(range(10), desc="Test Inference"):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

        end.record()
        torch.cuda.synchronize()
        elapsed_s = start.elapsed_time(end) / 1000

        # Count the tokens that were actually produced: generation may stop
        # before max_new_tokens when an EOS token is emitted, and dividing the
        # requested budget by the elapsed time would then overstate throughput.
        new_tokens = generated[0][prompt_len:].shape[0]
        time_record.append(elapsed_s)
        tputs.append(new_tokens / elapsed_s)

    response = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
    # Trimmed mean: drop the two lowest and two highest of the 10 measurements.
    sorted_tputs = np.sort(tputs)[2:-2]
    org_tput = np.mean(sorted_tputs)
    print(f'Prompt: {prompt}\nResponse: {response}\n')
    print(f'Time Record: {time_record}')
    print(f'Throughput Record: {tputs} toks/s\n')

    print(f'Throughput: {org_tput} toks/s')
    ppl = evaluate_ppl(model, tokenizer, device)
    print(f"Perplexity (PPL): {ppl}")

    import csv
    rounded_tput = round(org_tput, 1)
    ppl = round(ppl, 2)

    with open("result.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Id", "value"])
        writer.writerow([0, ppl])
        writer.writerow([1, rounded_tput])

if __name__ == '__main__':
    main()





KeyboardInterrupt: 