# PEFT Fine-Tuning with DistilBERT (LoRA)

This notebook follows the project instructions: load a foundation model, evaluate it, fine-tune with PEFT (LoRA), and compare inference results.

## Setup

In [1]:
import os
import random
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from transformers import logging
# Suppress informational messages about newly-initialized heads; training will still be required
logging.set_verbosity_error()

In [2]:
# Reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Load Dataset (Cleanlab/amazon-reviews)

In [3]:
# Load a small subset for quick iteration
raw_ds = load_dataset("cleanlab/amazon-reviews")

# Inspect available splits and columns
raw_ds

DatasetDict({
    train: Dataset({
        features: ['review_text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_text', 'label'],
        num_rows: 1000
    })
})

In [4]:
# Use the train split and take a small subset
train_ds = raw_ds["train"].shuffle(seed=seed).select(range(2000))
eval_ds = raw_ds["train"].shuffle(seed=seed + 1).select(range(500))

# Identify columns (common fields include 'text' and 'label')
train_ds.column_names

['review_text', 'label']

## Tokenization and Preprocessing

In [5]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure pad token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

text_column = "text" if "text" in train_ds.column_names else train_ds.column_names[0]
label_column = "label" if "label" in train_ds.column_names else train_ds.column_names[-1]

# Build deterministic label2id mapping from the raw train split (preserve order)
raw_labels = list(dict.fromkeys(train_ds[label_column]))
label2id = {lab: i for i, lab in enumerate(raw_labels)}
num_labels = len(label2id)

def tokenize_fn(batch):
    return tokenizer(batch[text_column], truncation=True, padding=False)

tokenized_train = train_ds.map(tokenize_fn, batched=True, remove_columns=[text_column])
tokenized_eval = eval_ds.map(tokenize_fn, batched=True, remove_columns=[text_column])

tokenized_train = tokenized_train.rename_column(label_column, "labels")
tokenized_eval = tokenized_eval.rename_column(label_column, "labels")

def _convert_labels_to_int(batch):
    new_labels = []
    for x in batch['labels']:
        if isinstance(x, str):
            new_labels.append(label2id[x])
        else:
            new_labels.append(int(x))
    batch['labels'] = new_labels
    return batch

tokenized_train = tokenized_train.map(_convert_labels_to_int, batched=True)
tokenized_eval = tokenized_eval.map(_convert_labels_to_int, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Load and Evaluate Foundation Model

In [6]:
# Determine number of labels from dataset (use label2id if available)
num_labels = len(label2id) if 'label2id' in globals() else len(set(tokenized_train["labels"]))

base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    pad_token_id=tokenizer.pad_token_id,
    ignore_mismatched_sizes=True,
)
base_model.to(device)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).mean().item()
    return {"accuracy": accuracy}

base_args = TrainingArguments(
    output_dir="./outputs_base",
    per_device_eval_batch_size=8,
    do_train=False,
    do_eval=True,
    logging_strategy="no",
    report_to="none",
)

base_trainer = Trainer(
    model=base_model,
    args=base_args,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

base_metrics = base_trainer.evaluate()
base_metrics

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': '0.6988', 'eval_model_preparation_time': '0', 'eval_accuracy': '0.39', 'eval_runtime': '0.6151', 'eval_samples_per_second': '812.8', 'eval_steps_per_second': '102.4'}


{'eval_loss': 0.6987676024436951,
 'eval_model_preparation_time': 0.0,
 'eval_accuracy': 0.39,
 'eval_runtime': 0.6151,
 'eval_samples_per_second': 812.839,
 'eval_steps_per_second': 102.418}

## Create PEFT (LoRA) Config and Model

In [7]:
# Quick diagnostic: list module names that look like attention projections (query/key/value/q_lin/v_lin)
candidates = []
for name, module in base_model.named_modules():
    lname = name.lower()
    if any(k in lname for k in ("query", "key", "value", "q_lin", "k_lin", "v_lin", "attn")):
        candidates.append(name)

print('Found candidate module names (showing up to 40):')
print(candidates[:40])

# Preferred target modules for encoder models (DistilBERT/BERT typically use 'query','key','value')
preferred_targets = ["query", "key", "value"]
# Fallback targets if model uses different attribute names
fallback_targets = ["q_lin", "k_lin", "v_lin"]
# Choose targets that appear in candidate module names; otherwise use preferred targets
selected = []
for t in preferred_targets + fallback_targets:
    if any(t in cname.lower() for cname in candidates):
        selected.append(t)
if not selected:
    selected = preferred_targets

print('Using LoRA target modules:', selected)

lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=selected,
)

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

Found candidate module names (showing up to 40):
['distilbert.transformer.layer.0.attention.q_lin', 'distilbert.transformer.layer.0.attention.k_lin', 'distilbert.transformer.layer.0.attention.v_lin', 'distilbert.transformer.layer.1.attention.q_lin', 'distilbert.transformer.layer.1.attention.k_lin', 'distilbert.transformer.layer.1.attention.v_lin', 'distilbert.transformer.layer.2.attention.q_lin', 'distilbert.transformer.layer.2.attention.k_lin', 'distilbert.transformer.layer.2.attention.v_lin', 'distilbert.transformer.layer.3.attention.q_lin', 'distilbert.transformer.layer.3.attention.k_lin', 'distilbert.transformer.layer.3.attention.v_lin', 'distilbert.transformer.layer.4.attention.q_lin', 'distilbert.transformer.layer.4.attention.k_lin', 'distilbert.transformer.layer.4.attention.v_lin', 'distilbert.transformer.layer.5.attention.q_lin', 'distilbert.transformer.layer.5.attention.k_lin', 'distilbert.transformer.layer.5.attention.v_lin']
Using LoRA target modules: ['q_lin', 'k_lin', 'v_l

## Train the PEFT Model

In [8]:
training_args = TrainingArguments(
    output_dir="./outputs_peft",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    logging_steps=50,
    report_to="none",
)

# Ensure label mappings exist in the model config (helps Trainer and checkpoints)
if not hasattr(peft_model.config, "id2label") or not hasattr(peft_model.config, "label2id"):
    peft_model.config.id2label = {i: str(i) for i in range(num_labels)}
    peft_model.config.label2id = {str(i): i for i in range(num_labels)}

# Move PEFT model to the appropriate device
peft_model.to(device)

peft_trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the PEFT model (this updates the randomly-initialized classification head)
peft_trainer.train()

# Save the PEFT adapters (and optionally the full model if desired)
save_dir = "./distilbert-lora-finetuned"
peft_model.save_pretrained(save_dir)
print("Saved PEFT adapters to", save_dir)

# Evaluate after fine-tuning
peft_metrics = peft_trainer.evaluate()
peft_metrics

{'loss': '0.6716', 'grad_norm': '3.329', 'learning_rate': '1.869e-05', 'epoch': '0.2'}
{'loss': '0.6237', 'grad_norm': '1.677', 'learning_rate': '1.736e-05', 'epoch': '0.4'}
{'loss': '0.6156', 'grad_norm': '1.662', 'learning_rate': '1.603e-05', 'epoch': '0.6'}
{'loss': '0.5663', 'grad_norm': '1.37', 'learning_rate': '1.469e-05', 'epoch': '0.8'}
{'loss': '0.5342', 'grad_norm': '1.626', 'learning_rate': '1.336e-05', 'epoch': '1'}
{'eval_loss': '0.4814', 'eval_accuracy': '0.756', 'eval_runtime': '0.4556', 'eval_samples_per_second': '1097', 'eval_steps_per_second': '138.3', 'epoch': '1'}
{'loss': '0.4625', 'grad_norm': '1.084', 'learning_rate': '1.203e-05', 'epoch': '1.2'}
{'loss': '0.4227', 'grad_norm': '1.405', 'learning_rate': '1.069e-05', 'epoch': '1.4'}
{'loss': '0.4276', 'grad_norm': '1.361', 'learning_rate': '9.36e-06', 'epoch': '1.6'}
{'loss': '0.3912', 'grad_norm': '1.895', 'learning_rate': '8.027e-06', 'epoch': '1.8'}
{'loss': '0.3812', 'grad_norm': '2.807', 'learning_rate': '6.6



{'eval_loss': '0.3928', 'eval_accuracy': '0.814', 'eval_runtime': '0.4633', 'eval_samples_per_second': '1079', 'eval_steps_per_second': '136', 'epoch': '2'}
{'loss': '0.3612', 'grad_norm': '2.594', 'learning_rate': '5.36e-06', 'epoch': '2.2'}
{'loss': '0.3856', 'grad_norm': '2.588', 'learning_rate': '4.027e-06', 'epoch': '2.4'}
{'loss': '0.4307', 'grad_norm': '3.545', 'learning_rate': '2.693e-06', 'epoch': '2.6'}
{'loss': '0.3558', 'grad_norm': '1.901', 'learning_rate': '1.36e-06', 'epoch': '2.8'}
{'loss': '0.3467', 'grad_norm': '2.989', 'learning_rate': '2.667e-08', 'epoch': '3'}
{'eval_loss': '0.3923', 'eval_accuracy': '0.818', 'eval_runtime': '0.4358', 'eval_samples_per_second': '1147', 'eval_steps_per_second': '144.6', 'epoch': '3'}
{'train_runtime': '15.11', 'train_samples_per_second': '397.1', 'train_steps_per_second': '49.64', 'train_loss': '0.4651', 'epoch': '3'}
Saved PEFT adapters to ./distilbert-lora-finetuned
{'eval_loss': '0.3923', 'eval_accuracy': '0.818', 'eval_runtime':

{'eval_loss': 0.3923342227935791,
 'eval_accuracy': 0.818,
 'eval_runtime': 0.4242,
 'eval_samples_per_second': 1178.795,
 'eval_steps_per_second': 148.528,
 'epoch': 3.0}

In [9]:
# Evaluate the fine-tuned PEFT model
peft_metrics = peft_trainer.evaluate()
print("PEFT Model Evaluation Metrics:")
print(f"Eval Loss: {peft_metrics.get('eval_loss', 'N/A')}")
print(f"Eval Accuracy: {peft_metrics.get('eval_accuracy', 'N/A'):.4f}")

{'eval_loss': '0.3923', 'eval_accuracy': '0.818', 'eval_runtime': '0.4445', 'eval_samples_per_second': '1125', 'eval_steps_per_second': '141.7', 'epoch': '3'}
PEFT Model Evaluation Metrics:
Eval Loss: 0.3923342227935791
Eval Accuracy: 0.8180


## Evaluate Fine-Tuned PEFT Model

## Save the Trained PEFT Model

In [10]:
save_dir = "./distilbert-lora"
peft_model.save_pretrained(save_dir)
save_dir

'./distilbert-lora'

### Alternative Save Location

Save a copy of the trained model to a separate directory for reference.

## Evaluate the PEFT Model

In [11]:
from peft import AutoPeftModelForSequenceClassification

# Load the saved PEFT model from the fine-tuned directory
peft_model_loaded = AutoPeftModelForSequenceClassification.from_pretrained(
    "./distilbert-lora-finetuned",
    device_map="auto",
)

# Move to device if needed
peft_model_loaded.to(device)
print("Loaded fine-tuned PEFT model from ./distilbert-lora-finetuned")
print(f"Model device: {next(peft_model_loaded.parameters()).device}")

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

Loaded fine-tuned PEFT model from ./distilbert-lora-finetuned
Model device: cuda:0


In [12]:
# Perform inference using the loaded PEFT model
peft_trainer_loaded = Trainer(
    model=peft_model_loaded,
    args=base_args,  # Use same eval args as base model for fair comparison
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate the loaded PEFT model
peft_metrics_loaded = peft_trainer_loaded.evaluate()
print("Loaded PEFT model evaluation metrics:", peft_metrics_loaded)

{'eval_loss': '0.3923', 'eval_model_preparation_time': '0.001', 'eval_accuracy': '0.818', 'eval_runtime': '0.4486', 'eval_samples_per_second': '1115', 'eval_steps_per_second': '140.4'}
Loaded PEFT model evaluation metrics: {'eval_loss': 0.3923342227935791, 'eval_model_preparation_time': 0.001, 'eval_accuracy': 0.818, 'eval_runtime': 0.4486, 'eval_samples_per_second': 1114.544, 'eval_steps_per_second': 140.432}


## Load and Use the Saved PEFT Model

In [13]:
peft_metrics = peft_trainer.evaluate()
peft_metrics

{'eval_loss': '0.3923', 'eval_accuracy': '0.818', 'eval_runtime': '0.4527', 'eval_samples_per_second': '1104', 'eval_steps_per_second': '139.2', 'epoch': '3'}


{'eval_loss': 0.3923342227935791,
 'eval_accuracy': 0.818,
 'eval_runtime': 0.4527,
 'eval_samples_per_second': 1104.437,
 'eval_steps_per_second': 139.159,
 'epoch': 3.0}

### Re-evaluate Using Trainer

Verify metrics by running evaluation through the trainer again.

## Summary: Compare Base vs. Fine-Tuned Model

In [14]:
import pandas as pd

# Create a comparison table
comparison_data = {
    "Model": ["Base (Pretrained)", "PEFT Fine-Tuned"],
    "Eval Loss": [base_metrics.get("eval_loss", "N/A"), peft_metrics.get("eval_loss", "N/A")],
    "Accuracy": [base_metrics.get("eval_accuracy", "N/A"), peft_metrics.get("eval_accuracy", "N/A")],
}

comparison_df = pd.DataFrame(comparison_data)
print("\n" + "="*60)
print("COMPARISON: Base Model vs. Fine-Tuned PEFT Model")
print("="*60)
print(comparison_df.to_string(index=False))
print("="*60 + "\n")

# Calculate improvement
if isinstance(base_metrics.get("eval_accuracy"), (int, float)) and isinstance(peft_metrics.get("eval_accuracy"), (int, float)):
    accuracy_improvement = (peft_metrics["eval_accuracy"] - base_metrics["eval_accuracy"]) * 100
    print(f"Accuracy Improvement: {accuracy_improvement:+.2f}%")
    
if isinstance(base_metrics.get("eval_loss"), (int, float)) and isinstance(peft_metrics.get("eval_loss"), (int, float)):
    loss_improvement = base_metrics["eval_loss"] - peft_metrics["eval_loss"]
    print(f"Loss Reduction: {loss_improvement:.4f}")


COMPARISON: Base Model vs. Fine-Tuned PEFT Model
            Model  Eval Loss  Accuracy
Base (Pretrained)   0.698768     0.390
  PEFT Fine-Tuned   0.392334     0.818

Accuracy Improvement: +42.80%
Loss Reduction: 0.3064


QLoRA (Quantized LoRA) for memory-efficient fine-tuning

In [15]:
# Optional: QLoRA (Quantized LoRA) for memory-efficient fine-tuning
try:
    from transformers import BitsAndBytesConfig
    from peft import prepare_model_for_kbit_training
    
    # Configure 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    
    # Load base model with quantization
    base_model_qlora = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        pad_token_id=tokenizer.pad_token_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
    
    # Prepare model for k-bit training
    base_model_qlora = prepare_model_for_kbit_training(base_model_qlora)
    
    # Apply LoRA on top of quantized model
    lora_config_qlora = LoraConfig(
        task_type="SEQ_CLS",
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        target_modules=selected,  # Use same targets as before
    )
    
    peft_model_qlora = get_peft_model(base_model_qlora, lora_config_qlora)
    peft_model_qlora.print_trainable_parameters()
    
    print("\n✓ QLoRA model successfully created (4-bit quantized + LoRA)")
    print("This configuration is more memory-efficient for large models!")
    
except ImportError as e:
    print(f"⚠️ bitsandbytes not available for QLoRA demo: {e}")
    print("Install with: pip install bitsandbytes")

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

trainable params: 813,314 || all params: 67,768,324 || trainable%: 1.2001

✓ QLoRA model successfully created (4-bit quantized + LoRA)
This configuration is more memory-efficient for large models!


### QLoRA Implementation

Optional: Combine 4-bit quantization with LoRA for memory-efficient training on larger models.

## Bonus: QLoRA Fine-Tuning Example

This section demonstrates QLoRA (Quantized LoRA), which combines quantization with LoRA for memory-efficient training.