# Parameter-Efficient Fine-Tuning (PEFT)

In [4]:
# !pip install peft
# !pip install adapter-transformers

In [1]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import random, math, time
from torch.autograd import Variable
import operator

# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Pin the RNG seed and request deterministic cuDNN behavior so that results
# stay comparable after a kernel restart.
SEED = 1234
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


cuda:0


| Techniques     | All Params  | Trainable Params | Trainable % |
|----------------|-------------|------------------|-------------|
| BitFit         | 124,439,808 | 102,144          | 0.082       |
| Adapter        | 125,334,336 | 894,528          | 0.714       |
| Prompt Tuning  | 124,445,952 | 6,144            | 0.005       |
| Prefix Tuning  | 124,808,448 | 368,640          | 0.295       |
| LoRA           | 124,734,720 | 294,912          | 0.236       |

In [19]:
def count_parameters(model):
    """
    Count the elements of every parameter of `model` that requires gradients.

    Parameters are taken from `model.parameters()`; those with
    `requires_grad` set to a falsy value are skipped.
    """
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

def get_nb_trainable_parameters(model=None):
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.

    Args:
        model: Object exposing ``named_parameters()``. When omitted, the
            notebook-level global ``model`` is used; this fallback only exists
            to keep older zero-argument calls working.

    Returns:
        A ``(trainable_params, all_param)`` tuple of element counts.

    Raises:
        NameError: If ``model`` is omitted and no global ``model`` exists.
    """
    if model is None:
        # Legacy behavior: read the notebook-global `model`. Prefer passing the
        # model explicitly so the result does not depend on hidden kernel state.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: The model to inspect. It is forwarded to
            ``get_nb_trainable_parameters`` so the report describes this
            argument rather than whatever the global ``model`` happens to be.
    """
    trainable_params, all_param = get_nb_trainable_parameters(model)

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}")

## BitFit

In [20]:
import torch
import torch.nn as nn
from transformers import AutoModel, GPT2TokenizerFast

# Checkpoint identifiers for the model and its tokenizer.
model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# Load through the shared identifier instead of a repeated literal, so that
# changing `model_name_or_path` really changes which checkpoint is loaded
# (the other cells in this notebook load their base model the same way).
model = AutoModel.from_pretrained(model_name_or_path)

In [21]:
# BitFit: switch off gradients for every parameter whose name does not
# contain 'bias'; parameters with 'bias' in their name are left untouched.
for param_name, tensor in model.named_parameters():
    if 'bias' in param_name:
        continue
    tensor.requires_grad = False

In [22]:
# Print the trainable vs. total parameter counts for the current `model`.
print_trainable_parameters(model)

trainable params: 102144 || all params: 124439808 || trainable%: 0.08208305818022477


## Adapter

In [25]:
from transformers.adapters import GPT2AdapterModel
from transformers import AutoConfig

# Build a GPT-2 adapter model from the "gpt2" checkpoint's configuration.
model_name_or_path = "gpt2"
config = AutoConfig.from_pretrained(model_name_or_path)
# NOTE(review): the model is constructed from `config` alone, so pretrained
# weights are presumably NOT loaded here. That is fine for counting
# parameters, but confirm before using this cell for real fine-tuning.
model = GPT2AdapterModel(config)
# Freeze the existing weights BEFORE the head and adapter are added;
# presumably this ordering is what leaves the newly added modules trainable
# (verify against the adapter-transformers documentation).
model.freeze_model()
model.add_causal_lm_head(
    head_name = 'lm_head',
)
model.add_adapter(adapter_name = 'adapter-name')
# Bare last expression: shows the module structure as the cell output.
model

GPT2AdapterModel(
  (shared_parameters): ModuleDict()
  (transformer): GPT2Model(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): MergedLinear(
            in_features=768, out_features=2304, bias=True
            (loras): ModuleDict()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (prefix_tuning): PrefixTuningShim(
            (prefix_gates): ModuleDict()
            (pool): PrefixTuningPool(
              (prefix_tunings): ModuleDict()
            )
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
  

In [26]:
# Print the trainable vs. total parameter counts for the adapter model.
print_trainable_parameters(model)

trainable params: 894528 || all params: 125334336 || trainable%: 0.7137134392286564


## Prompt Tuning
Soft prompt tuning (Lester et al., 2021) prepends a trainable tensor to the embeddings of the input tokens; this tensor is optimized via backpropagation to improve modeling performance on a target task, while the base model stays frozen.

In [32]:
from transformers import AutoModel
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType

# Checkpoint identifiers for the base model and for the tokenizer that
# encodes the prompt-initialization text.
model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# Prompt tuning with 8 virtual tokens, initialized from the given text.
# The tokenizer is named through `tokenizer_name_or_path`, the variable
# declared for that purpose, rather than through the model identifier.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    tokenizer_name_or_path=tokenizer_name_or_path,
)

# Wrap the pretrained base model and report its trainable parameter count.
model = AutoModel.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 6,144 || all params: 124,445,952 || trainable%: 0.00493708304790822


## Prefix Tuning
Prefix tuning adds trainable tensors to every transformer block, rather than only to the input embeddings as in soft prompt tuning. In addition, the soft prompt embeddings are obtained via fully connected layers.

In [11]:
from transformers import AutoModel
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType

# Checkpoint identifiers (model and tokenizer share the same name).
model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# Prefix tuning in training mode with 20 virtual tokens.
peft_config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, num_virtual_tokens=20)

# Load the base model, wrap it with the PEFT config, and report the
# trainable parameter count.
model = get_peft_model(AutoModel.from_pretrained(model_name_or_path), peft_config)
model.print_trainable_parameters()

trainable params: 368,640 || all params: 124,808,448 || trainable%: 0.2953646214717773


## LoRA

In [1]:
from transformers import AutoModel
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

# Checkpoint identifiers (model and tokenizer share the same name).
model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# LoRA in training mode: rank 8, alpha 32, dropout 0.1 on the LoRA layers.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

# Load the base model, wrap it with the LoRA config, and report the
# trainable parameter count.
model = get_peft_model(
    AutoModel.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

  from .autonotebook import tqdm as notebook_tqdm


trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.23643136409814364




# Quantization
`pip install bitsandbytes`


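Affine quantization maps a real value $r$ to an integer $q$ using a scale $S$ and a zero-point $Z$:

$$r = S(q-Z)$$

$$q = \mathrm{round}\left(\frac{r}{S}+Z\right)$$

where $[r_{min}, r_{max}]$ is the range of the real values and $[q_{min}, q_{max}]$ is the range of the integer type:

$$S = \frac{r_{max}-r_{min}}{q_{max}-q_{min}}$$

$$Z = \mathrm{round}\left(q_{max} - \frac{r_{max}}{S}\right)$$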

In [None]:
import torch

# Toy absmax (symmetric) int8 quantization of a small random tensor.
a = torch.randn(5)
a[-1] = -5  # plant a large-magnitude value so the effect of the scale is visible
print(a)

# 1a. find the normalization constant (largest absolute value)
amax = torch.abs(a).max()
# 1b. scale into the range [-127, 127] (int8)
scaled = a / amax * 127

# 2. round to the nearest value and store it as real int8.
#    torch.round alone returns float32, so without the cast nothing would
#    actually be held in 8 bits and the later .float() would be a no-op.
#    The cast is safe because `scaled` already lies within [-127, 127].
int8val = torch.round(scaled).to(torch.int8)
print(int8val)

# dequantization by rescaling back to float32
fp32val = int8val.float() * amax / 127.0

# Compare the original with the reconstruction: element-wise absolute error
# and its mean.
print(a)
print(fp32val)
print(torch.abs(fp32val - a))
print(torch.abs(fp32val - a).mean())

# Reference
- [Quantization Huggingface](https://huggingface.co/docs/accelerate/usage_guides/quantization)
- [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)
- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale](https://huggingface.co/blog/hf-bitsandbytes-integration)