In [None]:
# Environment setup: each `!` line is run as a shell command by the notebook.
# Remove any existing copies of these packages and purge pip's cache so the
# installs below start from a clean state.
!pip uninstall torch torchvision torchaudio transformers vllm -y
!pip cache purge
# Install the PyTorch packages from the `cu126` wheel index on download.pytorch.org.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
# Upgrade the remaining libraries to their latest releases (no versions are pinned).
!pip install --upgrade transformers vllm datasets tqdm 
# gptqmodel is installed with build isolation disabled and verbose output;
# presumably so its build can see the torch installed above -- TODO confirm.
!pip install -U gptqmodel --no-build-isolation -v
!pip install optimum
# Log in to the Hugging Face Hub. `**` looks like a redacted token placeholder;
# the trailing `# Read` / `# Write` notes label which kind of token each line is for.
!huggingface-cli login --token **    # Read
# !huggingface-cli login --token **    # Write
# Delete the output folder (the same path main() below saves the quantized
# model to) so files from a previous run are not left behind.
!rm -rf /kaggle/working/your_folder/

In [None]:
from vllm import LLM, SamplingParams
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
import random
import numpy as np
import time
import csv


def evaluate_ppl(model, tokenizer, device="cuda:0"):
    """Return the perplexity of ``model`` on the WikiText-2 (raw) test split.

    The text rows of the split are joined with blank lines and tokenized in a
    single call. The resulting token ids are cut into consecutive,
    non-overlapping windows of 2048 tokens; any trailing tokens that do not
    fill a whole window are ignored. For each window the mean next-token
    cross-entropy is multiplied by the window length, and the perplexity is
    the exponential of the summed values divided by the total number of
    window tokens.

    Side effects: ``model`` is moved to ``device`` and ``model.seqlen`` is
    set to 2048.

    Args:
        model: Causal language model; calling it on a batch of token ids must
            return an object with a ``.logits`` attribute.
        tokenizer: Tokenizer callable that accepts ``return_tensors="pt"`` and
            returns an object exposing ``input_ids``.
        device: Device the model and the token ids are moved to.

    Returns:
        The perplexity as a Python float.
    """
    model.to(device)
    wikitext_test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    joined_text = "\n\n".join(wikitext_test["text"])
    encoded = tokenizer(joined_text, return_tensors="pt")
    model.seqlen = 2048
    token_ids = encoded.input_ids.to(device)

    window = model.seqlen
    num_windows = token_ids.numel() // window

    # The loss module holds no state, so a single instance serves every window.
    cross_entropy = nn.CrossEntropyLoss()
    window_nlls = []
    for idx in tqdm(range(num_windows), desc="Evaluating PPL..."):
        start = idx * window
        chunk = token_ids[:, start:start + window]

        with torch.no_grad():
            logits = model(chunk).logits

        # The logits at position t are scored against the token at t + 1, so
        # drop the last logit and the first label.
        pred_logits = logits[:, :-1, :].contiguous().float()
        targets = chunk[:, 1:]

        mean_loss = cross_entropy(
            pred_logits.view(-1, pred_logits.size(-1)),
            targets.view(-1),
        )
        window_nlls.append(mean_loss.float() * window)

    total_nll = torch.stack(window_nlls).sum()
    perplexity = torch.exp(total_nll / (num_windows * window))

    return perplexity.item()


def main():
    """Quantize a model with GPTQ, upload the result to the Hub, and report perplexity.

    Steps, in order:
      1. Seed the ``torch``, ``random`` and NumPy random number generators.
      2. Load ``model_name`` with a 4-bit, group-size-128 ``QuantizeConfig``
         and quantize it, using the first 1024 text rows of the WikiText-2
         (raw) test split as calibration data.
      3. Save the quantized model and the tokenizer to ``folder_path``.
      4. Upload every file under ``folder_path`` to ``repo_id`` on the
         Hugging Face Hub, keeping relative paths.
      5. Print the perplexity returned by ``evaluate_ppl``.

    The Hub token is read from the ``HF_TOKEN`` environment variable; if it is
    not set, the placeholder string is used and must be replaced by hand.
    """
    from huggingface_hub import HfApi
    import os

    ############## Set Up ##############
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)  # seed NumPy as well so all imported RNGs are fixed

    device = 'cuda:0'

    ### === Load model and quantize it with GPTQ (4-bit, group size 128) ===
    model_name = "zbyzby/Llama3.2-3B-Instruct-QLoRA-finetuned"
    folder_path = "/kaggle/working/your_folder" 

    # NOTE(review): the calibration rows come from the same WikiText-2 test
    # split that evaluate_ppl scores, so the reported perplexity may be
    # optimistic. Consider calibrating on a different split -- confirm intent.
    calibration_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test").select(range(1024))["text"]
    quant_config = QuantizeConfig(bits=4, group_size=128)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Pass the quantization config explicitly (as the standalone quantization
    # cell in this notebook does); previously it was built but never used.
    model = GPTQModel.load(model_name, quant_config)

    # Quant
    model.quantize(calibration_dataset, batch_size=2)

    model.save(folder_path)
    tokenizer.save_pretrained(folder_path)

    # Prefer a token supplied through the environment over a literal in the
    # notebook; the placeholder is kept as a fallback for manual editing.
    HF_TOKEN = os.environ.get("HF_TOKEN", "**") # Replace with your Hugging Face token
    repo_id = "zbyzby/Llama3.2-3B-Instruct-quantized"
    api = HfApi(token=HF_TOKEN)
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            # Mirror the local layout inside the repo.
            path_in_repo = os.path.relpath(file_path, folder_path)
            print(f"Uploading: {file_path} -> {path_in_repo}")
            api.upload_file(
                path_or_fileobj=file_path,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                token=HF_TOKEN,
            )
    print("All files uploaded.")

    # Test perplexity
    print("\n=== Testing Perplexity ===")
    ppl = evaluate_ppl(model, tokenizer, device)
    print(f"Perplexity (PPL): {ppl}")


if __name__ == '__main__':
    main()

In [None]:
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

from huggingface_hub import HfApi
import os

# Source checkpoint, local output directory, and destination Hub repository.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
repo_id = "zbyzby/Llama-3.2-1B-Instruct-GPTQ-Quant"
HF_TOKEN = "**" # Replace with your Hugging Face token

# Calibration data: the first 1024 text rows of the WikiText-2 (raw) test split.
wikitext_test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
calibration_dataset = wikitext_test.select(range(1024))["text"]

# 4-bit quantization with a group size of 128.
quant_config = QuantizeConfig(bits=4, group_size=128)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = GPTQModel.load(model_id, quant_config)

# A larger `batch_size` speeds up quantization if the GPU has memory to spare.
model.quantize(calibration_dataset, batch_size=2)

# Write the quantized weights and the tokenizer files side by side.
model.save(quant_path)
tokenizer.save_pretrained(quant_path)

# Gather every file under the output directory, then upload them one by one,
# keeping each file's path relative to `quant_path` as its path in the repo.
api = HfApi(token=HF_TOKEN)
local_files = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(quant_path)
    for filename in filenames
]
for file_path in local_files:
    path_in_repo = os.path.relpath(file_path, quant_path)
    print(f"Uploading: {file_path} -> {path_in_repo}")
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        token=HF_TOKEN,
    )
print("All files uploaded.")