In [None]:
!pip install transformers datasets evaluate accelerate -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import numpy as np
import torch.nn.utils.prune as prune

# 1️⃣ Load and prune GPT-2
m = GPT2LMHeadModel.from_pretrained("gpt2")
tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token

total = sum(p.numel() for p in m.parameters())
target_params = 82_000_000
target_ratio = 1 - (target_params / total)
adj_ratio = min(target_ratio * 1.5, 0.9)
print(f"Adjusted prune ratio: {adj_ratio:.2f}")

params_to_prune = []
for _, mod in m.named_modules():
    if isinstance(mod, torch.nn.Linear):
        params_to_prune.append((mod, "weight"))

prune.global_unstructured(params_to_prune, pruning_method=prune.L1Unstructured, amount=adj_ratio)
for mod, _ in params_to_prune:
    prune.remove(mod, "weight")

nonzero = sum(torch.count_nonzero(p).item() for p in m.parameters())
print(f"After pruning: {nonzero/1e6:.2f}M params")

# 2️⃣ Prepare WikiText-2
ds = load_dataset("wikitext", "wikitext-2-raw-v1")
def tok_fn(e): return tok(e["text"], truncation=True, padding="max_length", max_length=128)
ds = ds.map(tok_fn, batched=True, remove_columns=["text"])
ds.set_format(type="torch", columns=["input_ids", "attention_mask"])

# 3️⃣ Training setup
args = TrainingArguments(
    output_dir="./gpt2_pruned_ft",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    logging_steps=50,
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

coll = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

trainer = Trainer(
    model=m,
    args=args,
    data_collator=coll,
    train_dataset=ds["train"]
)

trainer.train()

# 4️⃣ Evaluate perplexity
print("\nEvaluating perplexity...")
m.eval()
losses = []
for b in torch.utils.data.DataLoader(ds["validation"], batch_size=2):
    b = {k: v.to(m.device) for k, v in b.items()}
    with torch.no_grad():
        o = m(**b, labels=b["input_ids"])
    losses.append(o.loss.item())

ppl = np.exp(np.mean(losses))
print(f"✅ Perplexity: {ppl:.2f}")


Adjusted prune ratio: 0.51
After pruning: 104.69M params


Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,4.6949
100,4.0672
150,3.6926
200,3.6466
250,3.731
300,3.4225
350,3.6502
400,3.6782
450,3.7253
500,3.4067



Evaluating perplexity...
✅ Perplexity: 210346.42


In [None]:
import torch.nn.utils.prune as prune
from transformers.models.gpt2.modeling_gpt2 import Conv1D

params_to_prune = []
for _, module in m.named_modules():
    if isinstance(module, Conv1D):
        params_to_prune.append((module, "weight"))

print(f"Found {len(params_to_prune)} Conv1D layers to prune")

adj_ratio = 0.6# example
prune.global_unstructured(
    params_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=adj_ratio
)
for module, _ in params_to_prune:
    prune.remove(module, "weight")

nonzero = sum(torch.count_nonzero(p).item() for p in m.parameters())
print(f"After pruning: {nonzero/1e6:.2f}M params")


Found 48 Conv1D layers to prune
After pruning: 53.73M params


In [None]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
from torch.optim import AdamW

from transformers.models.gpt2.modeling_gpt2 import Conv1D
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------
# 1️⃣ Load GPT-2 and Tokenizer
# -------------------------------
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)
model.to(device)

# -------------------------------
# 2️⃣ Prune Conv1D layers
# -------------------------------
params_to_prune = [(module, "weight") for _, module in model.named_modules() if isinstance(module, Conv1D)]
print(f"Found {len(params_to_prune)} Conv1D layers to prune")

# Target ~82M params (adjust ratio based on current total)
total_params = sum(p.numel() for p in model.parameters())
target_params = 82_000_000
prune_ratio = min(1 - target_params / total_params, 0.9)
print(f"Pruning {prune_ratio*100:.1f}% of Conv1D weights globally")

prune.global_unstructured(params_to_prune, pruning_method=prune.L1Unstructured, amount=prune_ratio)
for module, _ in params_to_prune:
    prune.remove(module, "weight")

nonzero = sum(torch.count_nonzero(p).item() for p in model.parameters())
print(f"After pruning: {nonzero/1e6:.2f}M nonzero params")

# -------------------------------
# 3️⃣ Prepare IMDB dataset
# -------------------------------
dataset = load_dataset("imdb")

# Tokenization
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_fn, batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

train_dataset = tokenized_datasets["train"]
test_dataset  = tokenized_datasets["test"]

# -------------------------------
# 4️⃣ Create classification head
# -------------------------------
class GPT2ForSentiment(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.gpt2 = base_model
        self.classifier = nn.Linear(base_model.config.n_embd, 2)  # binary classification

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state[:, -1, :]  # take last token
        logits = self.classifier(last_hidden)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

sentiment_model = GPT2ForSentiment(model)
sentiment_model.to(device)

# -------------------------------
# 5️⃣ Training
# -------------------------------
training_args = TrainingArguments(
    output_dir="./gpt2_pruned_imdb",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=sentiment_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

# -------------------------------
# 6️⃣ Evaluate accuracy
# -------------------------------
sentiment_model.eval()
loader = DataLoader(test_dataset, batch_size=8)
correct = 0
total = 0

with torch.no_grad():
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = sentiment_model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs["logits"], dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"\n✅ Test Accuracy: {accuracy*100:.2f}%")


Found 48 Conv1D layers to prune
Pruning 34.1% of Conv1D weights globally
After pruning: 95.47M nonzero params


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
import numpy as np

# -------------------------------
# 1️⃣ Load GPT-2 and prune
# -------------------------------
model_name = "gpt2"
m = GPT2LMHeadModel.from_pretrained(model_name)
tok = GPT2Tokenizer.from_pretrained(model_name)
tok.pad_token = tok.eos_token

# Target ~82M params
total_params = sum(p.numel() for p in m.parameters())
target_params = 82_000_000
prune_ratio = min((1 - target_params / total_params) * 1.5, 0.9)
print(f"Adjusted prune ratio: {prune_ratio:.2f}")

params_to_prune = [(mod, "weight") for _, mod in m.named_modules() if isinstance(mod, nn.Linear)]
prune.global_unstructured(params_to_prune, pruning_method=prune.L1Unstructured, amount=prune_ratio)
for mod, _ in params_to_prune:
    prune.remove(mod, "weight")

nonzero = sum(torch.count_nonzero(p).item() for p in m.parameters())
print(f"After pruning: {nonzero/1e6:.2f}M params")

# -------------------------------
# 2️⃣ Prepare IMDB dataset
# -------------------------------
ds = load_dataset("imdb")
def tok_fn(e):
    return tok(e["text"], truncation=True, padding="max_length", max_length=128)
ds = ds.map(tok_fn, batched=True, remove_columns=["text"])
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# -------------------------------
# 3️⃣ Wrap GPT-2 for classification
# -------------------------------
class GPT2ForSentiment(nn.Module):
    def __init__(self, gpt_model):
        super().__init__()
        self.gpt2 = gpt_model
        self.classifier = nn.Linear(self.gpt2.config.n_embd, 2)  # binary

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1][:, -1, :]  # last token hidden state
        logits = self.classifier(last_hidden)
        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels)
        return {"loss": loss, "logits": logits}

clf_model = GPT2ForSentiment(m)

# -------------------------------
# 4️⃣ Training setup
# -------------------------------
args = TrainingArguments(
    output_dir="./gpt2_pruned_imdb",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    logging_steps=50,
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

collator = DataCollatorWithPadding(tokenizer=tok)
trainer = Trainer(
    model=clf_model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=tok,
    data_collator=collator
)

# -------------------------------
# 5️⃣ Train
# -------------------------------
trainer.train()

# -------------------------------
# 6️⃣ Evaluate accuracy
# -------------------------------
clf_model.eval()
correct, total = 0, 0
for batch in torch.utils.data.DataLoader(ds["test"], batch_size=8):
    batch = {k: v.to(clf_model.classifier.weight.device) for k, v in batch.items()}
    with torch.no_grad():
        out = clf_model(batch["input_ids"], batch["attention_mask"])
    preds = out["logits"].argmax(dim=-1)
    correct += (preds == batch["label"]).sum().item()
    total += batch["label"].size(0)

acc = correct / total
print(f"✅ Test Accuracy: {acc*100:.2f}%")


Adjusted prune ratio: 0.51
After pruning: 104.69M params


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
50,2.2364
100,0.8008
150,0.7219
200,0.7682
250,1.0379
300,0.6994
350,0.7571
400,0.7601
450,0.6043
500,1.2114


✅ Test Accuracy: 83.46%
