# Notebook 3 — Comparison & Practical Workflow
**PEFT & Transfer Learning Series · Part 3 of 3**

This notebook provides:
1. A unified benchmark comparing all strategies
2. A decision function to pick the right strategy for your situation
3. A save/load workflow for adapter weights
4. HuggingFace PEFT integration (with graceful fallback)

> Prerequisites: `pip install torch numpy` (optional: `transformers peft`)

## 3.1 Unified Benchmark Setup

All strategies are measured consistently: trainable parameters, training time, accuracy.

In [1]:
import torch, torch.nn as nn, torch.optim as optim
import copy, time, json, os

# Seed PyTorch's global RNG so weight init and the synthetic data are reproducible.
torch.manual_seed(42)

# Dimensions of the toy problem shared by every cell below.
D = 32        # embedding / hidden width
VOCAB = 100   # number of distinct token ids
SEQ = 10      # tokens per example
N_CLS = 4     # number of target classes
N = 200       # total number of synthetic examples

class TinyLayer(nn.Module):
    """Small transformer-style layer: self-attention followed by a feed-forward MLP.

    Each sub-layer is wrapped in a residual connection, and the LayerNorm is
    applied *after* the residual sum.

    Args:
        d: Width of the input/output features (defaults to the notebook-wide ``D``).
    """

    def __init__(self, d=D):
        super().__init__()
        # Sub-modules are created in the same order as before so that parameter
        # initialisation consumes the RNG stream identically.
        self.attn = nn.MultiheadAttention(embed_dim=d, num_heads=4, batch_first=True)
        hidden = d * 4
        self.ff = nn.Sequential(
            nn.Linear(d, hidden),
            nn.GELU(),
            nn.Linear(hidden, d),
        )
        self.n1 = nn.LayerNorm(d)
        self.n2 = nn.LayerNorm(d)

    def forward(self, x):
        """Apply self-attention and the MLP, each with residual + LayerNorm."""
        # Self-attention: query, key and value are all the same tensor.
        attn_out, _ = self.attn(x, x, x)
        x = self.n1(x + attn_out)
        ff_out = self.ff(x)
        return self.n2(x + ff_out)

class TinyModel(nn.Module):
    """Toy sequence classifier: embedding -> two ``TinyLayer`` blocks -> linear head.

    Sizes come from the notebook-wide constants ``VOCAB``, ``D`` and ``N_CLS``.
    """

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(VOCAB, D)
        self.layers = nn.ModuleList([TinyLayer() for _ in range(2)])
        self.head = nn.Linear(D, N_CLS)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences."""
        hidden = self.embed(x)
        for block in self.layers:
            hidden = block(hidden)
        # Mean-pool over dimension 1 before classifying.
        pooled = hidden.mean(1)
        return self.head(pooled)

# Synthetic dataset: random token ids and, independently, random class labels.
# X is drawn before y so the RNG stream is consumed in the original order.
X = torch.randint(low=0, high=VOCAB, size=(N, SEQ))
y = torch.randint(low=0, high=N_CLS, size=(N,))

# First 160 examples are for training, the remainder for testing.
Xtr, Xte = X[:160], X[160:]
ytr, yte = y[:160], y[160:]

def benchmark(model, epochs=80, lr=1e-3, label='', data=None):
    """Train ``model`` with full-batch Adam and report size, accuracy and timing.

    Only parameters with ``requires_grad=True`` are handed to the optimizer, so
    the same routine can benchmark full fine-tuning and parameter-efficient
    variants alike.

    Args:
        model: The ``nn.Module`` to train in place.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.
        label: Free-form name copied into the result dict.
        data: Optional ``(x_train, y_train, x_test, y_test)`` tuple. When
            omitted, the notebook-level ``Xtr, ytr, Xte, yte`` split is used,
            which is the original behaviour.

    Returns:
        dict with keys ``label``, ``trainable`` (trainable parameter count),
        ``total`` (all parameters), ``pct`` (trainable share in percent),
        ``acc`` (test accuracy in [0, 1]) and ``time_s`` (training seconds).
    """
    if data is None:
        # Fall back to the module-level split; looked up lazily so the function
        # can also be used stand-alone with explicit data.
        data = (Xtr, ytr, Xte, yte)
    x_train, y_train, x_test, y_test = data

    # Evaluate the requires_grad filter once and reuse it for counting and Adam.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    trainable = sum(p.numel() for p in trainable_params)
    total = sum(p.numel() for p in model.parameters())

    opt = optim.Adam(trainable_params, lr=lr)
    fn = nn.CrossEntropyLoss()

    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt the measured interval.
    t0 = time.perf_counter()
    model.train()
    for _ in range(epochs):
        opt.zero_grad()
        fn(model(x_train), y_train).backward()
        opt.step()
    elapsed = time.perf_counter() - t0

    model.eval()
    with torch.no_grad():
        test_acc = (model(x_test).argmax(1) == y_test).float().mean().item()

    return {'label': label, 'trainable': trainable, 'total': total,
            'pct': trainable / total * 100, 'acc': test_acc, 'time_s': elapsed}

# Build the shared base model and give it a short "pre-training" run on the
# toy data. The returned metrics are intentionally discarded; only the trained
# weights matter for the strategies compared below.
base = TinyModel()
benchmark(base, label='pretrain', epochs=60)
print(
    f'Base model ready  |  {sum(p.numel() for p in base.parameters()):,} total params'
)


Base model ready  |  28,740 total params


## 3.2 Run All Strategies

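Full fine-tuning, Feature Extraction (TL), Adapter PEFT, and LoRA PEFT — all benchmarked on the same data. Note that the labels in this toy dataset are random, so test accuracy is expected to hover around chance (25% for four classes); compare the trainable-parameter counts and training times rather than the accuracy column.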

In [2]:
class LoRALinear(nn.Module):
    """Wrap an ``nn.Linear`` with a trainable low-rank (LoRA) update.

    The wrapped layer is frozen; the output is
    ``frozen(x) + (x @ A.T @ B.T) * (alpha / r)``. ``B`` starts at zero, so a
    freshly wrapped layer produces exactly the same output as the original.

    Args:
        linear: The ``nn.Linear`` to wrap. Its weight and bias are frozen in place.
        r: Rank of the low-rank update; must be positive.
        alpha: Scaling numerator; the update is multiplied by ``alpha / r``.

    Raises:
        ValueError: If ``r`` is not positive.
    """

    def __init__(self, linear, r=4, alpha=8):
        super().__init__()
        if r <= 0:
            # Fail early with a clear message instead of a ZeroDivisionError
            # or an opaque tensor-shape error further down.
            raise ValueError(f'LoRA rank r must be positive, got {r}')
        d_out, d_in = linear.weight.shape
        self.frozen = linear
        # Create A and B on the wrapped layer's device and dtype; otherwise the
        # matmul in forward() fails for float64/half or non-CPU layers.
        like = {'device': linear.weight.device, 'dtype': linear.weight.dtype}
        self.A = nn.Parameter(torch.randn(r, d_in, **like) * 0.01)
        self.B = nn.Parameter(torch.zeros(d_out, r, **like))
        self.scale = alpha / r
        self.frozen.weight.requires_grad = False
        if self.frozen.bias is not None:
            self.frozen.bias.requires_grad = False

    def forward(self, x):
        """Return the frozen projection plus the scaled low-rank correction."""
        return self.frozen(x) + (x @ self.A.T @ self.B.T) * self.scale

class Adapter(nn.Module):
    """Bottleneck adapter computing ``x + up(GELU(down(x)))``.

    The up-projection is zero-initialised, so a new adapter starts out as the
    identity function.

    Args:
        d: Feature width of the input/output (defaults to the notebook-wide ``D``).
        r: Width of the bottleneck.
    """

    def __init__(self, d=D, r=4):
        super().__init__()
        self.down = nn.Linear(d, r)
        self.act = nn.GELU()
        self.up = nn.Linear(r, d)
        # Zero both tensors of the up-projection so the bottleneck contributes nothing at init.
        for tensor in (self.up.weight, self.up.bias):
            nn.init.zeros_(tensor)

    def forward(self, x):
        """Add the bottleneck's output to the input (residual connection)."""
        delta = self.up(self.act(self.down(x)))
        return x + delta

class LayerWithAdapter(nn.Module):
    """Run an existing layer and pass its output through a fresh ``Adapter``.

    Args:
        base_layer: The module to wrap; stored as ``self.base``.
    """

    def __init__(self, base_layer):
        super().__init__()
        self.base = base_layer
        # The adapter uses its default sizes (width ``D``, bottleneck 4).
        self.adapter = Adapter()

    def forward(self, x):
        """Apply the wrapped layer, then the adapter."""
        base_out = self.base(x)
        return self.adapter(base_out)

# One benchmark() result dict per strategy, in the order they are run.
results = []

# 1. Full fine-tune: every weight of a copy of the base model is trainable.
m = copy.deepcopy(base)
m.requires_grad_(True)
results.append(benchmark(m, epochs=80, lr=5e-4, label='Full Fine-Tune'))

# 2. Feature extraction (Transfer Learning): freeze the embedding and the
#    layers, then train only a freshly initialised classification head.
m = copy.deepcopy(base)
m.head = nn.Linear(D, N_CLS)
m.embed.requires_grad_(False)
m.layers.requires_grad_(False)
results.append(benchmark(m, epochs=80, lr=1e-3, label='Feature Extraction (TL)'))

# 3. Adapter PEFT: wrap each layer with an adapter and train only parameters
#    whose name contains 'adapter' (everything else, including the head, is frozen).
m = copy.deepcopy(base)
m.layers = nn.ModuleList([LayerWithAdapter(layer) for layer in m.layers])
for name, p in m.named_parameters():
    p.requires_grad_('adapter' in name)
results.append(benchmark(m, epochs=80, lr=3e-3, label='Adapter (PEFT)'))

# 4. LoRA PEFT: replace the first feed-forward projection of each layer with a
#    LoRA wrapper and train only the low-rank A / B matrices.
m = copy.deepcopy(base)
for layer in m.layers:
    layer.ff[0] = LoRALinear(layer.ff[0])
for name, p in m.named_parameters():
    p.requires_grad_('.A' in name or '.B' in name)
lora_trained = m  # kept so the save/load demo can reuse the trained adapter
results.append(benchmark(m, epochs=80, lr=3e-3, label='LoRA (PEFT)'))

# Parameter count of the untouched base model (not used by the table below).
base_total = sum(p.numel() for p in base.parameters())

# Summary table: one row per strategy.
print(f"{'Strategy':<26} {'Trainable':>10} {'%':>6} {'Accuracy':>10} {'Time(s)':>9}")
print('-' * 65)
for r in results:
    print(f"{r['label']:<26} {r['trainable']:>10,} {r['pct']:>5.1f}% {r['acc']:>9.2%} {r['time_s']:>8.2f}s")


Strategy                    Trainable      %   Accuracy   Time(s)
-----------------------------------------------------------------
Full Fine-Tune                 28,740 100.0%    17.50%     1.37s
Feature Extraction (TL)           132   0.5%    15.00%     0.47s
Adapter (PEFT)                    584   2.0%    17.50%     0.87s
LoRA (PEFT)                     1,280   4.3%    20.00%     0.96s


## 3.3 Decision Framework

Answer four questions to get the right strategy recommendation.

In [3]:
# -------------------------------------------------------------------
# Strategy recommender: answers based on your actual constraints
# -------------------------------------------------------------------
def recommend(large_model, data_size, compute, task_similar):
    """Print a fine-tuning strategy recommendation for the given constraints.

    Args:
        large_model: bool -- model larger than 1B params.
        data_size: str -- 'small' / 'medium' / 'large'.
        compute: str -- 'low' / 'medium' / 'high'.
        task_similar: bool -- new task similar to the pre-training domain.
            Only consulted when ``large_model`` is false.

    Returns:
        None. The situation summary, recommendation and reason are printed.
    """
    # Choose the recommendation first; rules are checked top to bottom and the
    # first match wins. The first three apply to large models only.
    if large_model and compute == 'low':
        rec = 'LoRA or Prompt Tuning'
        why = 'Large model needs PEFT; low budget favors minimal params'
    elif large_model and data_size == 'small':
        rec = 'Prompt Tuning or Prefix Tuning'
        why = 'Minimal params when labeled data is scarce'
    elif large_model:
        rec = 'LoRA (industry standard)'
        why = 'Best accuracy-to-cost ratio for large LLMs'
    elif task_similar and data_size in ('small', 'medium'):
        rec = 'Feature Extraction (Transfer Learning)'
        why = 'Frozen backbone already captures useful features; very fast'
    elif compute == 'high' and data_size == 'large':
        rec = 'Full Fine-Tuning'
        why = 'Resources allow it; maximizes task-specific accuracy'
    else:
        rec = 'Fine-Tuning with partial unfreeze'
        why = 'Balances adaptation depth with overfitting risk'

    # Echo the inputs back, then the verdict.
    print('Your situation:')
    print(f'  Large model (>1B)  : {large_model}')
    print(f'  Dataset size       : {data_size}')
    print(f'  Compute budget     : {compute}')
    print(f'  Task similar       : {task_similar}')
    print()
    print(f'  Recommendation : {rec}')
    print(f'  Reason         : {why}')

# Three worked examples: (headline, keyword arguments for recommend()).
scenarios = [
    ('SCENARIO 1: Running LLaMA-7B on a laptop',
     dict(large_model=True, data_size='medium', compute='low', task_similar=False)),
    ('SCENARIO 2: BERT for a similar classification task, lots of data',
     dict(large_model=False, data_size='large', compute='high', task_similar=True)),
    ('SCENARIO 3: GPT-2 for niche domain, small dataset, tight budget',
     dict(large_model=False, data_size='small', compute='low', task_similar=False)),
]
for idx, (title, kwargs) in enumerate(scenarios):
    if idx:
        # Blank separator line between consecutive scenarios (none before the first).
        print()
    print(title)
    recommend(**kwargs)


SCENARIO 1: Running LLaMA-7B on a laptop
Your situation:
  Large model (>1B)  : True
  Dataset size       : medium
  Compute budget     : low
  Task similar       : False

  Recommendation : LoRA or Prompt Tuning
  Reason         : Large model needs PEFT; low budget favors minimal params

SCENARIO 2: BERT for a similar classification task, lots of data
Your situation:
  Large model (>1B)  : False
  Dataset size       : large
  Compute budget     : high
  Task similar       : True

  Recommendation : Full Fine-Tuning
  Reason         : Resources allow it; maximizes task-specific accuracy

SCENARIO 3: GPT-2 for niche domain, small dataset, tight budget
Your situation:
  Large model (>1B)  : False
  Dataset size       : small
  Compute budget     : low
  Task similar       : False

  Recommendation : Fine-Tuning with partial unfreeze
  Reason         : Balances adaptation depth with overfitting risk


## 3.4 Saving & Loading LoRA Adapter Weights

One of PEFT's biggest practical wins: adapter files are tiny (megabytes, not gigabytes).
You can version-control them, share on HuggingFace Hub, and swap tasks in seconds.

In [4]:
# -------------------------------------------------------------------
# Save only the trainable LoRA A and B matrices
# Load them back into a fresh base model with LoRA wrappers
# -------------------------------------------------------------------
def _infer_lora_hyperparams(model, default_rank=4, default_alpha=8):
    """Return ``(rank, alpha)`` read from the first LoRA-style module in ``model``.

    A module counts as LoRA-style when it has a 2-D tensor attribute ``A`` and a
    numeric ``scale`` attribute (``scale == alpha / rank``). If none is found,
    the defaults are returned.
    """
    for module in model.modules():
        a = getattr(module, 'A', None)
        scale = getattr(module, 'scale', None)
        if isinstance(a, torch.Tensor) and a.dim() == 2 and isinstance(scale, (int, float)):
            rank = int(a.shape[0])
            # Undo scale = alpha / rank; round away float noise from the division.
            alpha = round(float(scale) * rank, 6)
            if alpha.is_integer():
                alpha = int(alpha)
            return rank, alpha
    return default_rank, default_alpha


def save_adapter(model, path):
    """Write the model's LoRA ``A``/``B`` matrices and a small config to ``path``.

    Two files are created inside ``path`` (the directory is created if needed):
    ``adapter.json`` maps each parameter name to its values as nested lists, and
    ``config.json`` records the method, rank and alpha.

    Args:
        model: Module whose LoRA parameters should be saved.
        path: Output directory.
    """
    os.makedirs(path, exist_ok=True)
    # Match on the final name component so unrelated parameters such as
    # '...Attn.weight' are not picked up by a substring test, while a
    # top-level 'A' / 'B' is still included.
    state = {name: p.detach().cpu().tolist()
             for name, p in model.named_parameters()
             if name.rpartition('.')[2] in ('A', 'B')}
    with open(os.path.join(path, 'adapter.json'), 'w') as f:
        json.dump(state, f)
    # Record the hyper-parameters the model actually uses instead of
    # hard-coding the defaults, so the config cannot contradict the tensors.
    rank, alpha = _infer_lora_hyperparams(model)
    config = {'method': 'lora', 'rank': rank, 'alpha': alpha}
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f, indent=2)
    print(f'Saved {len(state)} tensors to {path}/')

def load_adapter(model, path):
    """Load tensors from ``{path}/adapter.json`` into ``model`` in place.

    The saved entries are overlaid on the model's current state dict, so every
    parameter not present in the file keeps its existing value.

    Args:
        model: Module that already has parameters with the saved names.
        path: Directory containing ``adapter.json``.
    """
    with open(f'{path}/adapter.json') as fh:
        saved = json.load(fh)
    # Start from the full current state and overwrite only the saved entries.
    merged = model.state_dict()
    merged.update({key: torch.tensor(values) for key, values in saved.items()})
    model.load_state_dict(merged)
    print(f'Loaded {len(saved)} tensors from {path}/')

# Persist the adapter tensors from the LoRA model trained in the benchmark cell.
save_adapter(lora_trained, '/tmp/lora_demo')

# Rebuild the same architecture from the untouched base model: wrap the first
# feed-forward projection of every layer, then overwrite the freshly
# initialised A / B matrices with the saved ones.
fresh = copy.deepcopy(base)
for layer in fresh.layers:
    layer.ff[0] = LoRALinear(layer.ff[0])
load_adapter(fresh, '/tmp/lora_demo')

# Evaluate the restored model on the held-out split.
fresh.eval()
with torch.no_grad():
    preds = fresh(Xte).argmax(1)
    loaded_acc = (preds == yte).float().mean().item()

# Accuracy recorded for the LoRA run in the results table, for comparison.
orig_acc = next(r['acc'] for r in results if 'LoRA' in r['label'])
print(f'Accuracy after save/load round-trip: {loaded_acc:.2%}  (original: {orig_acc:.2%})')


Saved 4 tensors to /tmp/lora_demo/
Loaded 4 tensors from /tmp/lora_demo/
Accuracy after save/load round-trip: 20.00%  (original: 20.00%)


## 3.5 HuggingFace PEFT Integration

In production, the `peft` library wraps any HuggingFace model with LoRA in 3 lines.
This cell runs if `transformers` and `peft` are installed; otherwise it prints a short reference of the key `LoraConfig` parameters and the calls used to save and share an adapter.

In [5]:
# -------------------------------------------------------------------
# Production LoRA with HuggingFace peft library
# Install: pip install transformers peft
# -------------------------------------------------------------------
try:
    from transformers import AutoModelForSequenceClassification
    from peft import get_peft_model, LoraConfig, TaskType

    # Base model: BERT with a 4-way classification head.
    model = AutoModelForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=4,
    )
    # LoRA settings; the fallback text below describes each field.
    config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=['query', 'value'],
    )
    peft_model = get_peft_model(model, config)
    peft_model.print_trainable_parameters()
    print('HuggingFace PEFT LoRA model ready.')

except ImportError:
    # Optional dependencies are missing: print a short reference instead.
    # An empty string produces the same blank line as a bare print().
    fallback_lines = (
        'transformers / peft not installed.  Run:  pip install transformers peft',
        '',
        'Key LoraConfig parameters:',
        '  r              : rank of low-rank matrices (4-64; lower = fewer params)',
        '  lora_alpha     : scaling factor (usually 2x r)',
        '  target_modules : which weight matrices to apply LoRA to',
        "                   ['query','value'] is the standard BERT choice",
        '  lora_dropout   : regularization for LoRA layers',
        '',
        'After wrapping:',
        '  peft_model.save_pretrained("./adapter")  # saves only adapter weights (~MB)',
        '  peft_model.push_to_hub("username/my-lora")  # share on HuggingFace Hub',
    )
    for line in fallback_lines:
        print(line)


transformers / peft not installed.  Run:  pip install transformers peft

Key LoraConfig parameters:
  r              : rank of low-rank matrices (4-64; lower = fewer params)
  lora_alpha     : scaling factor (usually 2x r)
  target_modules : which weight matrices to apply LoRA to
                   ['query','value'] is the standard BERT choice
  lora_dropout   : regularization for LoRA layers

After wrapping:
  peft_model.save_pretrained("./adapter")  # saves only adapter weights (~MB)
  peft_model.push_to_hub("username/my-lora")  # share on HuggingFace Hub


## 3.6 Full Series Summary

| | Full Fine-Tuning | PEFT (LoRA / Adapter) | Transfer Learning |
|---|---|---|---|
| **Params trained** | All | Roughly <1% to 10% | Head + optional layers |
| **Memory** | High | Low | Low to medium |
| **Data needed** | Large | Small | Small |
| **Accuracy** | Highest | Near-equal | Good (similar tasks) |
| **Concept** | Training strategy | Fine-tuning strategy | Learning paradigm |
| **Best for** | Unlimited resources | Large models, tight budget | Related tasks, small data |

**Core message**: PEFT and Transfer Learning make high-quality AI practical on commodity hardware —
enabling chatbots, medical AI, and recommendation systems without multi-million-dollar compute budgets.