# [Parameter-Efficient Fine-Tuning (PEFT)](https://huggingface.co/docs/peft/quicktour)

PEFT methods selectively adjust a small set of additional model parameters while keeping the majority of the pre-trained LLM's parameters unchanged. This significantly reduces computational and storage requirements and addresses the problem of catastrophic forgetting often seen during full fine-tuning. Additionally, PEFT methods outperform traditional fine-tuning in situations with limited data and demonstrate superior generalization to out-of-domain scenarios.

In [1]:
# !pip install peft
# !pip install evaluate

In [1]:
import transformers
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.37.2'

In [1]:
import peft
peft.__version__

  from .autonotebook import tqdm as notebook_tqdm


'0.8.2'

In [1]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import random, math, time
from torch.autograd import Variable
import operator

import os
import torch
# Restrict this process to the second physical GPU. The variable only takes
# effect if it is set before CUDA is initialised (i.e. before the first
# torch.cuda call below).
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Make runs comparable across kernel restarts: seed every RNG this notebook
# imports and pin cuDNN to deterministic behaviour.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# The cuDNN autotuner may pick different kernels from run to run; disable it
# so the deterministic flag above is not undermined.
torch.backends.cudnn.benchmark = False

cuda


  from .autonotebook import tqdm as notebook_tqdm


| Techniques     | All Params  | Trainable Params | Trainable % |
|----------------|-------------|------------------|-------------|
| BitFit         | 124,439,808 | 102,144          | 0.082       |
| Adapter        | 125,334,336 | 894,528          | 0.714       |
| Prompt Tuning  | 124,445,952 | 6,144            | 0.005       |
| Prefix Tuning  | 124,808,448 | 368,640          | 0.295       |
| LoRA           | 124,734,720 | 294,912          | 0.236       |

In [4]:
def count_parameters(model):
    """Return the total element count of the parameters of `model` that require gradients."""
    trainable_total = 0
    for weight in model.parameters():
        if weight.requires_grad:
            trainable_total += weight.numel()
    return trainable_total

def get_nb_trainable_parameters(model=None):
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.

    Args:
        model: Object exposing ``named_parameters()``. When omitted, the
            notebook-level global ``model`` is used, which is what the
            zero-argument form of this helper has always read.

    Returns:
        A ``(trainable_params, all_param)`` tuple of integers.

    Raises:
        NameError: If ``model`` is omitted and no global ``model`` exists.
    """
    if model is None:
        # Backward compatibility for zero-argument callers.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: The model to inspect; it is forwarded to
            ``get_nb_trainable_parameters`` so the counts describe this
            object rather than whatever the global ``model`` currently is.
    """
    trainable_params, all_param = get_nb_trainable_parameters(model)
    # A model without parameters would otherwise raise ZeroDivisionError.
    percentage = 100 * trainable_params / all_param if all_param else 0.0

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}")

## BitFit

<img src = "figures/bitfit.pbm" width=700>

In [5]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, GPT2TokenizerFast

# Checkpoint identifiers shared by the model and its tokenizer.
model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# Load through the variable above (instead of repeating the literal) so that
# changing the checkpoint in one place is enough.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

In [6]:
# Freeze all parameters except biases
for name, param in model.named_parameters():
    if 'bias' not in name:
        param.requires_grad = False

In [6]:
print_trainable_parameters(model)

trainable params: 102144 || all params: 124439808 || trainable%: 0.08208305818022477


## Adapter

<img src = "figures/adapter.webp" width=500>

In [7]:
# Caution: the `adapter-transformers` package is deprecated and conflicts with
# newer `transformers` installs: it requires transformers version 4.28.1,
# which is already quite old. There is no need to use it anymore.
# !pip install adapter-transformers
from transformers.adapters import GPT2AdapterModel
from transformers import AutoConfig

model_name_or_path = "gpt2"
# The model is instantiated from the configuration object, not via `from_pretrained`.
config = AutoConfig.from_pretrained(model_name_or_path)
model = GPT2AdapterModel(config)
# Freeze first; the head and the adapter are added afterwards, so the order
# of the following calls matters for which parameters end up trainable.
model.freeze_model()
model.add_causal_lm_head(
    head_name = 'lm_head',
)
model.add_adapter(adapter_name = 'adapter-name')
# Last expression of the cell: displays the module tree.
model

GPT2AdapterModel(
  (shared_parameters): ModuleDict()
  (transformer): GPT2Model(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): MergedLinear(
            in_features=768, out_features=2304, bias=True
            (loras): ModuleDict()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (prefix_tuning): PrefixTuningShim(
            (prefix_gates): ModuleDict()
            (pool): PrefixTuningPool(
              (prefix_tunings): ModuleDict()
            )
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
  

In [8]:
print_trainable_parameters(model)

trainable params: 894528 || all params: 125334336 || trainable%: 0.7137134392286564


## Prompt Tuning
Soft prompt tuning (Lester et al., 2021) concatenates the embeddings of the input tokens with a trainable tensor that can be optimized via backpropagation to improve the modeling performance on a target task.

<img src = "figures/prompt.webp" width=500>

In [1]:
from transformers import AutoModel, AutoModelForCausalLM
from peft import (
    get_peft_config, 
    get_peft_model, 
    PromptTuningInit, 
    PromptTuningConfig, 
    TaskType, 
    PeftType
    )

# Checkpoint identifiers shared by the model and its tokenizer.
model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# Eight virtual tokens whose embeddings are initialised from the tokenized
# init text below.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    # Use the tokenizer identifier, not the model identifier, for the tokenizer.
    tokenizer_name_or_path=tokenizer_name_or_path,
)

# The config declares a causal-LM task, so load the model with its language
# modelling head (as the prefix-tuning and LoRA cells do) rather than the
# headless `AutoModel`.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

  from .autonotebook import tqdm as notebook_tqdm


trainable params: 6,144 || all params: 124,445,952 || trainable%: 0.00493708304790822


## Prefix Tuning
Prefix tuning adds trainable tensors to each transformer block instead of only to the input embeddings, as in soft prompt tuning. In addition, the soft prompt embeddings are obtained via fully connected layers.

<img src = "figures/prefix_adapterhub.png" width=300>

In [30]:
from transformers import AutoModel, AutoModelForCausalLM
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType

# One checkpoint identifier serves both the model and its tokenizer.
model_name_or_path = tokenizer_name_or_path = "gpt2"

# Prefix-tuning configuration: 20 virtual tokens, causal-LM task, training mode.
peft_config = PrefixTuningConfig(
    num_virtual_tokens=20,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

# Load the base model and wrap it with the PEFT adapter in a single expression.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

trainable params: 368,640 || all params: 124,808,448 || trainable%: 0.2953646214717773


## LoRA

<img src = "figures/lora-2.png" width=500>

In [1]:
from transformers import AutoModel, AutoModelForCausalLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

# One checkpoint identifier serves both the model and its tokenizer.
model_name_or_path = tokenizer_name_or_path = "gpt2"

# LoRA configuration: rank 8, alpha 32, dropout 0.1, causal-LM task, training mode.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

# Load the base model and wrap it with the PEFT adapter in a single expression.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

  from .autonotebook import tqdm as notebook_tqdm


trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.23643136409814364




In [2]:
# model

### Language model training with LoRA

In [3]:
from datasets import load_dataset

# Load the `wikitext-2-raw-v1` configuration of the `wikitext` dataset.
# The displayed result is a DatasetDict with `test`, `train` and `validation`
# splits, each holding a single `text` column.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [4]:
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
from transformers import AutoTokenizer

# Load the tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Run the tokenizer over the ``text`` field of a batch of examples."""
    return tokenizer(examples['text'])


# Tokenize every split in batches and drop the raw ``text`` column.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns="text",
)
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [5]:
block_size = tokenizer.model_max_length

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and split it into equal-sized chunks.

    Args:
        examples: Mapping from column name to a list of token lists; it must
            contain an ``input_ids`` column.
        chunk_size: Length of every produced chunk. When omitted, the
            module-level ``block_size`` is used, so existing one-argument
            callers (such as ``Dataset.map``) behave as before.

    Returns:
        A dict with the same columns as ``examples`` plus ``labels``, each a
        list of chunks of exactly ``chunk_size`` items. The trailing remainder
        is dropped; if the batch is shorter than one chunk every column is an
        empty list.

    Raises:
        ValueError: If the chunk size is not a positive integer.
    """
    size = block_size if chunk_size is None else chunk_size
    if size <= 0:
        raise ValueError(f"chunk size must be positive, got {size}")

    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*v)) for k, v in examples.items()}
    # All columns are assumed to have the same concatenated length; measure the first.
    total_length = len(next(iter(concatenated_examples.values())))
    # We drop the small remainder; with fewer than `size` tokens this yields no chunks at all.
    # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
    total_length = (total_length // size) * size
    # Split by chunks of `size`.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Labels are a copy of the inputs; compute_metrics applies the one-token shift itself.
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_datasets = tokenized_datasets.map(group_texts, batched=True)
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 274
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2318
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 240
    })
})

In [6]:
import evaluate
metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute next-token accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_preds: Pair ``(preds, labels)`` of arrays. ``preds`` is either
            already reduced to token ids (same shape as ``labels``, as
            produced by a ``preprocess_logits_for_metrics`` hook) or raw
            logits with one extra trailing dimension.

    Returns:
        The result of the ``accuracy`` metric for the shifted predictions.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some models return extra outputs next to the logits; keep the first.
        preds = preds[0]
    if preds.ndim == labels.ndim + 1:
        # Raw logits were passed (no preprocess_logits_for_metrics hook ran):
        # reduce them to token ids here.
        preds = preds.argmax(-1)
    # preds now have the same shape as the labels, but the prediction at
    # position i refers to the token at position i + 1, so shift both.
    labels = labels[:, 1:].reshape(-1)
    preds = preds[:, :-1].reshape(-1)
    return metric.compute(predictions=preds, references=labels)

[explore more](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling)

In [7]:
from transformers import TrainingArguments, Trainer, default_data_collator

def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted token ids before the Trainer gathers them.

    Without this hook the evaluation loop hands full logits to
    ``compute_metrics``, which expects token ids shaped like the labels.
    """
    if isinstance(logits, tuple):
        # Some models return extra tensors next to the logits; keep the first.
        logits = logits[0]
    return logits.argmax(dim=-1)


training_args = TrainingArguments(
    output_dir="gpt2-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    weight_decay=0.01,
    # load_best_model_at_end=True,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

trainer.train()
# Persist the trained model (and tokenizer) to `output_dir`, then write the
# PEFT adapter to the same directory so the inference cell can load it.
trainer.save_model()
model.save_pretrained("gpt2-lora")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mguntsvzz[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


### Inference

In [10]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load the fine-tuned adapter and the tokenizer saved by the training cell.
model = AutoPeftModelForCausalLM.from_pretrained("gpt2-lora")
tokenizer = AutoTokenizer.from_pretrained("gpt2-lora")

model = model.to(device)
model.eval()

text = "Preheat the oven to 350 degrees and place the cookie dough"
# Move the whole encoding (input ids and attention mask) to the model's device.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Pass the attention mask and an explicit pad token id; omitting them is what
# triggers the "attention mask and the pad token id were not set" warning.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
)

# A single prompt was generated, so decode the first (only) sequence directly.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

cuda


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Preheat the oven to 350 degrees and place the cookie dough in a large bowl. In a large bowl, combine the flour, baking powder, baking soda, salt and pepper. In a separate bowl, whisk together the egg yolks, salt and pepper. In a separate bowl, whisk together the flour mixture and the sugar. In a separate bowl, whisk together the flour mixture and the sugar. In a separate bowl, whisk together the egg yolks, salt and pepper. In a separate bowl, whisk together the flour mixture and the sugar.
