In [None]:
# Clone the pruning toolkit to /content/SoICT-LLM-Pruner, the directory the
# notebook changes into afterwards. The clone is skipped when that directory
# already exists, so re-running this cell neither fails nor nests a second copy.
!test -d /content/SoICT-LLM-Pruner || git clone https://github.com/fannam/SoICT-LLM-Pruner.git /content/SoICT-LLM-Pruner

In [None]:
# Make the cloned repository the working directory; the editable install that
# follows is run against `.`.
%cd /content/SoICT-LLM-Pruner

In [None]:
# Editable install of the package in the current directory. `%pip` (rather
# than `!pip`) installs into the environment of the running kernel.
%pip install -e .

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal-LM weights and the tokenizer from the same checkpoint id.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="TheGardener/Llama-3.2-1B-wikitext-finetune",
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="TheGardener/Llama-3.2-1B-wikitext-finetune",
)

In [None]:
# Remove the locally cached copy of the document-level WikiText dataset, if any
# (presumably to force a clean download with the upgraded loader).
!rm -rf ~/.cache/huggingface/datasets/eleuther_ai___wikitext_document_level
# Upgrade the dataset libraries inside the kernel's own environment
# (`%pip`, not `!pip`).
%pip install --upgrade datasets fsspec

# Element-level pruning

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# 1) Checkpoint whose tokenizer is used to build the calibration set
model_name = "TheGardener/Llama-3.2-1B-wikitext-finetune"

# 2-3) Load the document-level train split, keep only pages whose stripped
#      text is longer than 256 characters, then shuffle with a fixed seed and
#      keep the first 1024 documents.
raw = (
    load_dataset(
        "EleutherAI/wikitext_document_level",
        "wikitext-103-raw-v1",
        split="train",
    )
    .filter(lambda doc: len(doc["page"].strip()) > 256)
    .shuffle(seed=13)
    .select(range(1024))
)

# 4) Tokenizer setup: fall back to the EOS token for padding when the
#    tokenizer defines no pad token of its own.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 5) Tokenize every page to exactly 512 tokens (truncate long, pad short)
def tokenize_fn(examples):
    """Encode a batch of pages with the notebook-level `tokenizer`.

    `examples` is the batch passed in by `Dataset.map(batched=True)`; only its
    "page" column is read. Every sequence is truncated or padded to 512 tokens.
    """
    encoding_options = dict(truncation=True, max_length=512, padding="max_length")
    return tokenizer(examples["page"], **encoding_options)


# Replace the raw "page" text column with the tokenizer's outputs.
tok = raw.map(
    tokenize_fn,
    batched=True,
    remove_columns=["page"],
)

# 6) Return the two model inputs as torch tensors and batch them four at a
#    time, in the order they are stored (no shuffling).
tok.set_format("torch", columns=["input_ids", "attention_mask"])
calib_loader = DataLoader(dataset=tok, batch_size=4, shuffle=False)

In [None]:
from estimator.element_estimator import Llama3ActivationElementEstimator

# Wrap the loaded model in the project's activation-based element estimator.
# NOTE(review): the model was loaded without an explicit device; presumably the
# estimator moves it to `device` itself — confirm.
element_estimator = Llama3ActivationElementEstimator(model=model, device='cuda')

In [None]:
# Feed the calibration loader to the estimator; the result is later passed to
# the pruner as `neuron_importance`.
mlp_importance = element_estimator.estimate_mlp_neurons(calib_loader)

In [None]:
# Display the importance values returned by `estimate_mlp_neurons`.
mlp_importance

In [None]:
# Display the unpruned model for reference before pruning.
model

In [None]:
from pruner.element_level_pruner.ElementPruner import Llama3ElementPruner

# Build the element-level pruner around the unpruned model.
element_pruner = Llama3ElementPruner(original_model=model, device='cuda')
# Prune the MLP using the importance values computed by the element estimator.
# NOTE(review): `target_num_neurons=6144` is presumably the MLP width kept per
# layer — confirm against Llama3ElementPruner.prune_mlp.
pruned_model = element_pruner.prune_mlp(neuron_importance=mlp_importance, target_num_neurons=6144)

In [None]:
# Display the model returned by `prune_mlp`.
pruned_model

In [None]:
from transformers import set_seed

prompt = "Paris is the capital of"

# Sampling is stochastic: fix the RNG state so re-running this cell reproduces
# the same text (13 is the seed already used for the calibration shuffle).
set_seed(13)

# Tokenize the prompt. Calling the tokenizer (rather than `encode`) also yields
# the attention mask, and the tensors go to whichever device holds the model
# instead of a hard-coded 'cuda:0'.
inputs = tokenizer(prompt, return_tensors="pt").to(pruned_model.device)
input_ids = inputs["input_ids"]  # kept under its original name

# Generate text
output_ids = pruned_model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    # Name the padding id explicitly (EOS when the tokenizer defines no pad
    # token) so generate() does not have to guess it.
    pad_token_id=(
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    ),
)

# Decode the single returned sequence; nothing is printed here.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Show the text sampled from the pruned model.
generated_text

# Layer-level pruning

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload a fresh copy of the checkpoint and its tokenizer for this section.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="TheGardener/Llama-3.2-1B-wikitext-finetune",
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="TheGardener/Llama-3.2-1B-wikitext-finetune",
)

In [2]:
from estimator.layer_estimator import Llama3SimilarityLayerEstimator

# Wrap the reloaded model in the project's similarity-based layer estimator.
# NOTE(review): `estimator` is rebound to a different estimator class later in
# the notebook; cells that use it depend on execution order.
estimator = Llama3SimilarityLayerEstimator(model=model, device='cuda')

In [None]:
# Feed the calibration loader built earlier in the notebook to the layer
# estimator; the result is later passed to the pruner as `importance_scores`.
layer_importance = estimator.estimate(calib_loader)

In [None]:
# Display the importance values returned by the layer estimator.
layer_importance

In [None]:
from pruner.layer_level_pruner.LayerPruner import Llama3LayerPruner

# Build the layer-level pruner around the reloaded model.
layer_pruner = Llama3LayerPruner(model=model, device='cuda')

# Prune using the layer importance scores.
# NOTE(review): `prune_counts` presumably asks for 2 attention sub-layers and
# 1 MLP sub-layer to be removed — confirm against Llama3LayerPruner.prune.
pruned_model = layer_pruner.prune(importance_scores=layer_importance, prune_counts={"attention": 2, "mlp": 1})

In [None]:
# Display the model returned by the layer pruner.
pruned_model

In [None]:
from transformers import set_seed

prompt = "Paris is the capital of"

# Sampling is stochastic: fix the RNG state so re-running this cell reproduces
# the same text (13 is the seed already used for the calibration shuffle).
set_seed(13)

# Tokenize the prompt. Calling the tokenizer (rather than `encode`) also yields
# the attention mask, and the tensors go to whichever device holds the model
# instead of a hard-coded 'cuda:0'.
inputs = tokenizer(prompt, return_tensors="pt").to(pruned_model.device)
input_ids = inputs["input_ids"]  # kept under its original name

# Generate text
output_ids = pruned_model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    # Name the padding id explicitly (EOS when the tokenizer defines no pad
    # token) so generate() does not have to guess it.
    pad_token_id=(
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    ),
)

# Decode the single returned sequence; nothing is printed here.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Show the text sampled from the layer-pruned model.
generated_text

# Block-level pruning

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload a fresh copy of the checkpoint and its tokenizer for this section.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="TheGardener/Llama-3.2-1B-wikitext-finetune",
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="TheGardener/Llama-3.2-1B-wikitext-finetune",
)

In [6]:
from estimator.block_estimator import Llama3SimilarityBlockEstimator

# Wrap the reloaded model in the project's similarity-based block estimator.
# NOTE(review): `block_size=1` presumably scores one decoder block at a time —
# confirm against Llama3SimilarityBlockEstimator. This rebinds `estimator`,
# which earlier referred to the layer estimator.
estimator = Llama3SimilarityBlockEstimator(model=model, block_size=1, device='cuda')

In [None]:
# Feed the calibration loader built earlier in the notebook to the block
# estimator; the result is later passed to the pruner as `block_importance`.
block_importance = estimator.estimate(calib_loader)

In [None]:
# Display the importance values returned by the block estimator.
block_importance

In [11]:
# NOTE(review): Qwen2BlockPruner is imported but not used in the cells shown here.
from pruner.block_level_pruner.BlockPruner import Llama3BlockPruner, Qwen2BlockPruner

# Build the block-level pruner around the reloaded model.
block_pruner = Llama3BlockPruner(original_model=model, device='cuda')

# Prune using the block importance scores.
# NOTE(review): `num_block_to_prune=4` presumably removes four blocks — confirm
# against Llama3BlockPruner.prune.
pruned_model = block_pruner.prune(block_importance=block_importance, num_block_to_prune=4)

In [None]:
# Display the model returned by the block pruner.
pruned_model

In [None]:
from transformers import set_seed

prompt = "Paris is the capital of"

# Sampling is stochastic: fix the RNG state so re-running this cell reproduces
# the same text (13 is the seed already used for the calibration shuffle).
set_seed(13)

# Tokenize the prompt. Calling the tokenizer (rather than `encode`) also yields
# the attention mask, and the tensors go to whichever device holds the model
# instead of a hard-coded 'cuda:0'.
inputs = tokenizer(prompt, return_tensors="pt").to(pruned_model.device)
input_ids = inputs["input_ids"]  # kept under its original name

# Generate text
output_ids = pruned_model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    # Name the padding id explicitly (EOS when the tokenizer defines no pad
    # token) so generate() does not have to guess it.
    pad_token_id=(
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    ),
)

# Decode the single returned sequence; nothing is printed here.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Show the text sampled from the block-pruned model.
generated_text