In [None]:
# Remove any earlier checkout at this path, then clone the repository again.
# NOTE(review): assumes the notebook's working directory is /kaggle/working, so the
# fresh clone lands at the same path that was just removed — confirm.
!rm -rf /kaggle/working/Compression-Framework-for-EdgeAI
!git clone https://github.com/ha405/Compression-Framework-for-EdgeAI

Run the cell below only once per session. If you reset the session, run it again.

In [None]:
!pip install -r /kaggle/working/Compression-Framework-for-EdgeAI/requirements.txt
!pip install logbar
!pip install tokenicer
!pip install device_smi
!pip install random_word
!pip install datasets

In [None]:
import sys
import os

# Location of the KLAWQ sources inside the cloned repository.
library_path = "/kaggle/working/Compression-Framework-for-EdgeAI/KLAWQ"

# Put the directory at the front of the import search path exactly once, so
# re-running this cell does not keep prepending the same entry.
already_registered = library_path in sys.path
if not already_registered:
    sys.path.insert(0, library_path)
    print(f"Added '{library_path}' to sys.path")

# `quant` is resolved through the entry added above.
from quant import GPTQModel, QuantizeConfig

## Imports

In [None]:
import os
import gc
import torch
import shutil
import math
import pandas as pd
from transformers import AutoTokenizer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset, DatasetDict
from tqdm import tqdm

## WikiText-2

In [None]:
# Load the raw WikiText-2 corpus through `datasets.load_dataset`.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Pull the three splits out by name.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Calibration subset: the first 1000 rows of the training split.
calibration_dataset = train_dataset.select(range(1000))

# Bundle everything so later cells can look splits up by key.
dataset_splits = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
    "calibration": calibration_dataset,
})

# Report the number of rows in each split.
split_sizes = {name: len(rows) for name, rows in dataset_splits.items()}
print(split_sizes)


Helper to clear the GPU cache:

In [None]:
def clear_gpu_cache():
    """Run the garbage collector and release cached CUDA memory.

    The garbage collector runs first so that objects kept alive only by
    reference cycles are freed before the CUDA cache is emptied. The CUDA
    calls are skipped when no CUDA device is available, because
    ``torch.cuda.ipc_collect()`` initialises CUDA and raises on a CPU-only
    session.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    print("✅ GPU VRAM and cache cleared.")

## Quantization Functions

In [None]:
def clear_quant_path(path=None):
    """Delete a saved checkpoint (if a path is given) and release cached memory.

    Args:
        path: Optional filesystem path to remove. A directory is removed
            recursively; a regular file or symlink is unlinked. When the value
            is falsy or the path does not exist, nothing is deleted.

    The garbage collector runs before ``torch.cuda.empty_cache()`` so that
    objects kept alive only by reference cycles are collected before the
    CUDA cache is emptied.
    """
    if path and os.path.exists(path):
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            # shutil.rmtree refuses regular files and symlinks; unlink them instead.
            os.remove(path)
    gc.collect()
    torch.cuda.empty_cache()

def quantize_and_eval(
    model_id: str,
    calib_tokenized: dict,
    eval_texts: list[str],
    beta: float,
    tau: float,
    quant_path: str,
    tokenizer: AutoTokenizer,
    max_len: int,
    batch_size: int = 8
):
    """Quantize a model, reload the saved checkpoint and measure its perplexity.

    Steps: build a 4-bit ``QuantizeConfig`` with the given ``beta``/``tau``,
    quantize ``model_id`` on the calibration samples, save the result to
    ``quant_path``, reload it, and compute the token-weighted average
    negative log-likelihood over ``eval_texts``.

    Args:
        model_id: Model identifier or path handed to ``GPTQModel.load``.
        calib_tokenized: Mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors; each row along the first dimension becomes one
            calibration sample.
        eval_texts: Texts used for the perplexity evaluation.
        beta: Forwarded to ``QuantizeConfig(beta=...)``.
        tau: Forwarded to ``QuantizeConfig(tau=...)``.
        quant_path: Where the quantized checkpoint is written. Anything already
            at this path is removed first, and the checkpoint is removed again
            before the function returns or raises.
        tokenizer: Tokenizer used to encode ``eval_texts``.
        max_len: Truncation length for the evaluation texts.
        batch_size: Batch size for both quantization and evaluation.

    Returns:
        Tuple ``(avg_loss, perplexity)`` where ``perplexity == exp(avg_loss)``.

    Raises:
        ValueError: If the evaluation texts yield no token that can be scored.
    """
    print(f"  -> [Quantize] beta={beta}, tau={tau}")
    clear_quant_path(quant_path)

    model = None
    try:
        # 1) Quantize
        quant_cfg = QuantizeConfig(bits=4, group_size=-1, beta=beta, tau=tau)
        model = GPTQModel.load(
            model_id,
            quant_cfg,
            trust_remote_code=True,
            torch_dtype="auto",
            device_map="auto"
        )

        # Format calibration data: one dict of plain Python lists per row.
        input_ids_tensor      = calib_tokenized['input_ids']
        attention_mask_tensor = calib_tokenized['attention_mask']
        calibration_data = [
            {
                "input_ids":      input_ids_tensor[i].tolist(),
                "attention_mask": attention_mask_tensor[i].tolist()
            }
            for i in range(input_ids_tensor.size(0))
        ]

        model.quantize(calibration_data, batch_size=batch_size)

        # os.path.dirname returns "" for a bare name, and os.makedirs("") raises.
        parent_dir = os.path.dirname(quant_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        model.save(quant_path)
        print(f"     Quantization complete and saved to {quant_path}")

        # Free GPU RAM held by the model used for quantization before reloading.
        model = None
        clear_quant_path()

        # 2) Load the quantized model
        # NOTE(review): the checkpoint is reloaded with `from_pretrained` while the
        # base model was loaded with `load` — confirm `from_pretrained` accepts a
        # quantized checkpoint in this library.
        model = GPTQModel.from_pretrained(
            quant_path,
            trust_remote_code=True,
            device_map="auto",
            quantize_config=quant_cfg
        )
        model.eval()

        # 3) Tokenize evaluation texts
        encodings = tokenizer(
            eval_texts,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=max_len
        )
        input_ids      = encodings.input_ids
        attention_mask = encodings.attention_mask

        # 4) Compute loss & perplexity with proper pad-masking
        total_nll    = 0.0
        total_tokens = 0

        with torch.no_grad():
            for i in tqdm(
                range(0, input_ids.size(0), batch_size),
                desc="     Evaluating PPL",
                leave=False
            ):
                b_ids  = input_ids[i:i+batch_size].to(model.device)
                b_mask = attention_mask[i:i+batch_size].to(model.device)

                # --- mask out pads in the labels ---
                labels = b_ids.clone()
                labels[b_mask == 0] = -100  # ignore padding

                out = model(
                    input_ids=b_ids,
                    attention_mask=b_mask,
                    labels=labels
                )

                # Hugging Face causal-LM heads shift the labels by one position, so
                # out.loss is the mean over non-ignored targets in labels[:, 1:].
                # Counting the first column too would over-weight every batch.
                num_real = (labels[:, 1:] != -100).sum().item()
                if num_real > 0:
                    total_nll    += out.loss.item() * num_real
                    total_tokens += num_real

        if total_tokens == 0:
            raise ValueError(
                "No scorable tokens in eval_texts; cannot compute loss/perplexity."
            )

        avg_loss   = total_nll / total_tokens
        perplexity = math.exp(avg_loss)
        print(f"     Eval complete: loss={avg_loss:.4f}, ppl={perplexity:.2f}")

        return avg_loss, perplexity
    finally:
        # Cleanup runs on success and on failure: drop the model reference and
        # remove the checkpoint so one failed run does not leak memory or disk.
        model = None
        clear_quant_path(quant_path)


In [None]:
from getpass import getpass
from huggingface_hub import login

# Ask for the token interactively so it is never written into the notebook.
HF_TOKEN = getpass(prompt="Enter your Hugging Face token:")

# Authenticate this session with the entered token.
login(token=HF_TOKEN)
print("Hugging Face login successful!")

In [None]:
# --- select calibration / evaluation texts ---
# list(...) keeps this working whether the column comes back as a plain list
# or as a lazy column object.
# NOTE(review): unlike eval_texts, calib_texts is not filtered for blank rows —
# confirm that blank calibration samples are intended.
calib_texts = list(dataset_splits["calibration"]["text"])
eval_texts  = [t for t in dataset_splits["validation"]["text"] if t.strip()][:3000]

# --- init tokenizer & pre-tokenize calibration set ---
model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
# Fixed truncation length. The original cell first read tokenizer.model_max_length
# and then immediately overwrote it with this value, so only 4096 was ever used.
max_len = 4096
calib_tokenized = tokenizer(
    calib_texts,
    truncation=True,
    padding="longest",
    max_length=max_len,
    return_tensors="pt"
)

base_quant_path = "/kaggle/working/llama2-7b-quant"
beta_values     = [0.2, 0.4, 0.6, 0.8, 1.0]
tau_values      = [0.5, 1.0, 1.5, 2.0]
base_tau        = 0.5  # tau held fixed while sweeping beta
results = []

# (best_beta, base_tau) is already measured by the beta sweep, so the tau sweep
# skips it instead of quantizing the same configuration a second time.
remaining_taus = [tau for tau in tau_values if tau != base_tau]
total_iters = len(beta_values) + len(remaining_taus)
iter_count = 0

# --- stage 1: sweep beta at the baseline tau ---
for beta in beta_values:
    iter_count += 1
    print(f"[Iter {iter_count}/{total_iters}] β={beta}, τ={base_tau}")
    qp = f"{base_quant_path}-b{beta}-t{base_tau}"
    loss, ppl = quantize_and_eval(
        model_id=model_id,
        calib_tokenized=calib_tokenized,
        eval_texts=eval_texts,
        beta=beta,
        tau=base_tau,
        quant_path=qp,
        tokenizer=tokenizer,
        max_len=max_len,
        batch_size=8
    )
    results.append({"beta": beta, "tau": base_tau, "loss": loss, "ppl": ppl})

# --- select best_beta: lowest perplexity in the beta sweep (first one on ties) ---
best_beta = min(results, key=lambda row: row["ppl"])["beta"]
print(f"Best β at τ={base_tau}: {best_beta}")

# --- stage 2: sweep tau at the best beta ---
for tau in remaining_taus:
    iter_count += 1
    print(f"[Iter {iter_count}/{total_iters}] β={best_beta}, τ={tau}")
    qp = f"{base_quant_path}-b{best_beta}-t{tau}"
    loss, ppl = quantize_and_eval(
        model_id=model_id,
        calib_tokenized=calib_tokenized,
        eval_texts=eval_texts,
        beta=best_beta,
        tau=tau,
        quant_path=qp,
        tokenizer=tokenizer,
        max_len=max_len,
        batch_size=8
    )
    results.append({"beta": best_beta, "tau": tau, "loss": loss, "ppl": ppl})

df = pd.DataFrame(results)
print(df.to_markdown(index=False))


## Plotting

In [None]:
# Constants
const_tau = 0.5

# Beta sweep: rows measured at the baseline tau, one row per beta.
# drop_duplicates guards against a configuration that was evaluated twice.
df_beta = (
    df[df['tau'] == const_tau]
    .drop_duplicates(subset='beta')
    .reset_index(drop=True)
)

# Best beta is the one with the lowest perplexity in the beta sweep.
best_beta = df_beta.loc[df_beta['ppl'].idxmin(), 'beta']

# Tau sweep: rows measured at the best beta, one row per tau, in ascending order.
df_tau = (
    df[df['beta'] == best_beta]
    .drop_duplicates(subset='tau')
    .sort_values('tau')
    .reset_index(drop=True)
)

def plot_zoomed_bar(x, y, xlabel, ylabel, title, cmap):
    """Draw a bar chart whose y-axis is zoomed onto the range of the data.

    Args:
        x: Bar positions/labels (one per bar).
        y: Bar heights; any sequence convertible to a float array.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.
        title: Figure title.
        cmap: Callable colormap; sampled evenly on [0, 1] to colour the bars.

    The y-limits are set to the data range padded by 15% on each side. When
    all heights are equal (including a single bar) the range is zero, so a
    small fallback margin is used to avoid identical lower and upper limits.
    """
    colors = cmap(np.linspace(0, 1, len(x)))
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(x, y, color=colors, edgecolor='black', linewidth=0.8)

    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.6)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    heights = np.asarray(y, dtype=float)
    y_min, y_max = heights.min(), heights.max()
    margin = (y_max - y_min) * 0.15
    if margin == 0:
        # Degenerate range: pad relative to the value itself, with a small floor.
        margin = max(abs(y_max) * 0.05, 1e-3)
    ax.set_ylim(y_min - margin, y_max + margin)

    fig.tight_layout()

# One entry per figure, drawn in this order:
# (source frame, x column, y column, x label, y label, title, colormap)
panels = [
    (df_beta, 'beta', 'ppl',  'β (Beta values)', 'Perplexity',
     'Perplexity vs Beta @ τ = 0.5', plt.cm.Set2),
    (df_beta, 'beta', 'loss', 'β (Beta values)', 'Avg NLL Loss',
     'Loss vs Beta @ τ = 0.5', plt.cm.Pastel1),
    (df_tau,  'tau',  'ppl',  'τ (Tau values)',  'Perplexity',
     f'Perplexity vs Tau @ β = {best_beta}', plt.cm.Pastel2),
    (df_tau,  'tau',  'loss', 'τ (Tau values)',  'Avg NLL Loss',
     f'Loss vs Tau @ β = {best_beta}', plt.cm.Dark2),
]

for frame, x_col, y_col, x_label, y_label, heading, palette in panels:
    # The swept values are converted to strings so they are plotted as labels.
    plot_zoomed_bar(
        x=frame[x_col].astype(str),
        y=frame[y_col],
        xlabel=x_label,
        ylabel=y_label,
        title=heading,
        cmap=palette
    )

plt.show()