- references
    - https://huggingface.co/blog/peft_merging
    - https://huggingface.co/docs/peft/developer_guides/model_merging

In [1]:
import os

# Expose only GPU 0 to this process and point the HTTP(S) proxy variables at a
# local proxy listening on port 7890.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "http_proxy": "http://127.0.0.1:7890",
        "https_proxy": "http://127.0.0.1:7890",
    }
)

In [2]:
# !pip install --upgrade peft

In [3]:
import random

import torch
from datasets import load_dataset
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

[2024-04-03 22:26:01,585] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


## basics

- LoRA is used not only for LLMs but also for diffusion models
- LoraConfig
    - target_modules: The names of the modules to apply the adapter to. 
    - r: Lora attention dimension (the "rank").
    - lora_alpha: The alpha parameter for Lora scaling.
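    - fan_in_fan_out: boolean, 被替换层的权重是否以 (fan_in, fan_out) 即 (in_features, out_features) 的形状存储（例如 GPT-2 的 Conv1D 为 True；torch.nn.Linear 存储为 (out_features, in_features)，故为 False）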
        ```
        # torch.nn.Linear, fan_in_fan_out: False
        # y = xA^T + b
        self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        ```
    - scaling
        ```
        # lora.layer
        if use_rslora:
            self.scaling[adapter_name] = lora_alpha / math.sqrt(r)
        else:
            self.scaling[adapter_name] = lora_alpha / r
        ```

- W, lora_a, lora_b
    - $\Delta W = scaling \cdot (lora\_b \,@\, lora\_a)$
    - $(W+\Delta W)x = Wx + scaling \cdot (lora\_b \,@\, lora\_a)\,x$
- combination types
    - ties: https://arxiv.org/pdf/2306.01708.pdf
    - svd
        - $\Delta_{merged}=weight_1*scaling_1*lora\_{b1}*lora\_{a1} + weight_2*scaling_2*lora\_{b2}*lora\_{a2}$

## lora models

In [4]:
# Hub repo id of the first LoRA adapter; reused below to load the tokenizer.
peft_model_id = "smangrul/tinyllama_lora_norobots"
device = "cuda"
# Load the adapter's configuration; its base_model_name_or_path tells us which
# base model to load.
config = PeftConfig.from_pretrained(peft_model_id)

In [5]:
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'k_proj', 'down_proj', 'embed_tokens', 'q_proj', 'lm_head', 'o_proj', 'up_proj', 'v_proj', 'gate_proj'}, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [6]:
# Load the base model named by the adapter config, quantized to 4-bit.
# The quantization flag goes through BitsAndBytesConfig: passing `load_in_4bit`
# straight to `from_pretrained` is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)
# The tokenizer is taken from the adapter repo rather than the base model repo.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
len(tokenizer)

32005

In [8]:
model.config.vocab_size

32000

In [9]:
# The adapter tokenizer holds more tokens than the base model's vocab_size, so
# grow the embedding matrix to match before the adapters are attached.
model.resize_token_embeddings(len(tokenizer))

Embedding(32005, 2048)

In [10]:
# Tokens present in the adapter repo's tokenizer but absent from the base
# model's tokenizer: the set difference of the two vocabularies' keys.
(
    AutoTokenizer.from_pretrained("smangrul/tinyllama_lora_norobots").vocab.keys()
    - AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T").vocab.keys()
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'<pad>',
 '<|im_end|>',
 '<|im_start|>assistant',
 '<|im_start|>system',
 '<|im_start|>user'}

In [11]:
# Wrap the base model with the first adapter, registered under the name "norobots".
# NOTE: this rebinds `model`; re-running this cell on its own would wrap the
# already-wrapped model again.
model = PeftModel.from_pretrained(model, peft_model_id, adapter_name="norobots")

In [12]:
# Attach the remaining adapters to the same wrapped model, one per (repo, name) pair.
for _repo_id, _name in (
    ("smangrul/tinyllama_lora_sql", "sql"),
    ("smangrul/tinyllama_lora_adcopy", "adcopy"),
):
    _ = model.load_adapter(_repo_id, adapter_name=_name)

In [23]:
model.peft_config

{'norobots': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'k_proj', 'down_proj', 'embed_tokens', 'q_proj', 'lm_head', 'o_proj', 'up_proj', 'v_proj', 'gate_proj'}, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None),
 'sql': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'k_proj', 'down_proj', 'q_proj', 'o_proj', 'up_proj', 'v_proj', 'gate_proj'}, lora_alpha=16, lora_dro

In [13]:
model.peft_config.keys()

dict_keys(['norobots', 'sql', 'adcopy'])

In [45]:
# One row per registered adapter: name, rank, alpha, alpha / rank, fan_in_fan_out.
for k, v in model.peft_config.items():
    print(f"{k} {v.r} {v.lora_alpha} {v.lora_alpha/v.r} {v.fan_in_fan_out}")

norobots 8 16 2.0 False
sql 8 16 2.0 False
adcopy 8 16 2.0 False
merge 8 8 1.0 False


In [15]:
model

## merge 3 adapters

- combination_type
    - [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]
    - `combination_type = "linear" if len(adapters) == 1 else combination_type`
    - 两种主要的类型
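        - TIES：TrIm, Elect, and Merge (TIES) is a three-step method for merging models: first trim redundant (low-magnitude) parameters, then elect a sign per parameter to resolve sign conflicts, and finally merge (average) only the parameters that agree with the elected sign.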
        - DARE：Drop And REscale is a method that can be used to prepare for other model merging methods like TIES.
    - 实现上
        -  `cat`
        -  `[svd, ties_svd, dare_linear_svd, dare_ties_svd, magnitude_prune_svd]`
        -  `[linear, ties, dare_linear, dare_ties, magnitude_prune]`
- target
    - lora_A/lora_B
    - lora_embedding_A/lora_embedding_B 

In [16]:
# Merge settings; `adapters` and `weights` are paired by position.
adapter_name = "merge"
adapters = ["norobots", "adcopy", "sql"]
weights = [2.0, 0.3, 0.7]
density = 0.2
combination_type = "svd"  # alternative tried here: "ties"

# Drop a previously created "merge" adapter so this cell can be re-run.
if adapter_name in model.peft_config:
    model.delete_adapter(adapter_name)

model.add_weighted_adapter(
    adapters=adapters,
    weights=weights,
    adapter_name=adapter_name,
    combination_type=combination_type,
    density=density,
)

In [17]:
# Show which modules each source adapter targets.
for adapter in adapters:
    print(f"{adapter} {model.peft_config[adapter].target_modules}")

norobots {'k_proj', 'down_proj', 'embed_tokens', 'q_proj', 'lm_head', 'o_proj', 'up_proj', 'v_proj', 'gate_proj'}
adcopy {'k_proj', 'down_proj', 'embed_tokens', 'q_proj', 'lm_head', 'up_proj', 'o_proj', 'v_proj', 'gate_proj'}
sql {'k_proj', 'down_proj', 'q_proj', 'o_proj', 'up_proj', 'v_proj', 'gate_proj'}


In [37]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32005, 2048)
          (lora_dropout): ModuleDict(
            (norobots): Dropout(p=0.1, inplace=False)
            (adcopy): Dropout(p=0.1, inplace=False)
            (merge): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(
              (norobots): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
              (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
              (merge): Parameter containing: [torch.cuda.FloatTensor of size 8x32005 (GPU 0)]
          )
          (lora_embedding_B): ParameterDict(
              (norobots): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
              (adcopy): Parameter containing: [torch

In [55]:
# model.base_model.model.model.layers[0].mlp

In [36]:
model.model.get_submodule('model.embed_tokens').scaling

{'norobots': 2.0, 'adcopy': 2.0, 'merge': 1.0}

In [43]:
type(model.model.get_submodule('model.layers.0.self_attn.q_proj'))

peft.tuners.lora.bnb.Linear4bit

### from scratch

In [22]:
model.base_model.model.model.embed_tokens

lora.Embedding(
  (base_layer): Embedding(32005, 2048)
  (lora_dropout): ModuleDict(
    (norobots): Dropout(p=0.1, inplace=False)
    (adcopy): Dropout(p=0.1, inplace=False)
    (merge): Dropout(p=0.1, inplace=False)
  )
  (lora_A): ModuleDict()
  (lora_B): ModuleDict()
  (lora_embedding_A): ParameterDict(
      (norobots): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
      (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
      (merge): Parameter containing: [torch.cuda.FloatTensor of size 8x32005 (GPU 0)]
  )
  (lora_embedding_B): ParameterDict(
      (norobots): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
      (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
      (merge): Parameter containing: [torch.cuda.FloatTensor of size 2048x8 (GPU 0)]
  )
)

In [20]:
# LoRA factors of the embedding layer for the two source adapters that target
# it: index 1 is "norobots", index 2 is "adcopy".
lora_a_1, lora_a_2 = (
    model.base_model.model.model.embed_tokens.lora_embedding_A[name]
    for name in ("norobots", "adcopy")
)
lora_b_1, lora_b_2 = (
    model.base_model.model.model.embed_tokens.lora_embedding_B[name]
    for name in ("norobots", "adcopy")
)

In [21]:
lora_a_1.shape, lora_b_1.shape, lora_a_2.shape, lora_b_2.shape

(torch.Size([8, 32005]),
 torch.Size([2048, 8]),
 torch.Size([8, 32005]),
 torch.Size([2048, 8]))

In [None]:
# Per-adapter LoRA scaling and merge weights, in the order (norobots, adcopy).
scaling = [2, 2]
weights = [2, 0.3]
# Effective coefficient per adapter = merge weight * LoRA scaling.
valid_weights = [weight * scale for weight, scale in zip(weights, scaling)]

In [None]:
def transpose(weight, fan_in_fan_out):
    """Return ``weight`` transposed when ``fan_in_fan_out`` is truthy.

    Args:
        weight: a tensor or ``torch.nn.Parameter``.
        fan_in_fan_out: when falsy, ``weight`` is returned unchanged (same object).

    Returns:
        ``weight`` itself, or ``weight.T``. If the input was a
        ``torch.nn.Parameter`` the transposed result is wrapped in a new
        ``torch.nn.Parameter``.
    """
    if fan_in_fan_out:
        flipped = weight.T
        # Re-wrap so that a Parameter input yields a Parameter output.
        return torch.nn.Parameter(flipped) if isinstance(weight, torch.nn.Parameter) else flipped
    return weight

In [None]:
# Per-adapter weight delta: (lora_b @ lora_a), transposed, times that adapter's
# LoRA scaling.
delta_lora_1, delta_lora_2 = (
    transpose(lora_b @ lora_a, True) * scale
    for lora_b, lora_a, scale in zip((lora_b_1, lora_b_2), (lora_a_1, lora_a_2), scaling)
)

In [None]:
delta_lora_1.shape, delta_lora_2.shape

In [None]:
# Weighted sum of the two per-adapter deltas.
# NOTE(review): each delta_lora_i was already multiplied by scaling[i], and
# valid_weights also folds in the scaling, so scaling is applied twice here —
# presumably mirroring PEFT's svd merge path; confirm against the PEFT source.
delta_lora = valid_weights[0] * delta_lora_1 + valid_weights[1] * delta_lora_2
delta_lora

In [None]:
# Transpose back to the orientation of lora_b @ lora_a before factorizing.
# NOTE: this rebinds `delta_lora`, so running the cell twice undoes the transpose.
delta_lora = delta_lora.T
# SVD in float32, keeping the first 8 singular triplets; the singular values
# are folded into U.
U, S, Vh = torch.linalg.svd(delta_lora.float(), full_matrices=True)
U, S, Vh = U[:, :8], S[:8], Vh[:8, :]
U = U @ torch.diag(S)

In [None]:
Vh.shape, U.shape

In [None]:
Vh

In [None]:
# Embedding-layer factors stored under the "merge" adapter — presumably for
# comparison with the manually computed Vh / U; verify by inspecting both.
lora_a_new = model.base_model.model.model.embed_tokens.lora_embedding_A['merge']
lora_b_new = model.base_model.model.model.embed_tokens.lora_embedding_B['merge']

In [None]:
lora_a_new.shape, lora_b_new.shape

In [None]:
lora_a_new