- references
    - https://huggingface.co/blog/peft_merging
    - https://huggingface.co/docs/peft/developer_guides/model_merging

In [1]:
import os

# Expose only CUDA device 0 to this process and send HTTP(S) traffic through
# the proxy listening on 127.0.0.1:7890.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "http_proxy": "http://127.0.0.1:7890",
        "https_proxy": "http://127.0.0.1:7890",
    }
)

In [2]:
# !pip install --upgrade peft

In [38]:
import random

import torch
from datasets import load_dataset
from IPython.display import Image
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

## basics

- LoRA adapters (and adapter merging) apply not only to LLMs but also to diffusion models
- LoraConfig
    - target_modules: The names of the modules to apply the adapter to. 
    - r: Lora attention dimension (the "rank").
    - lora_alpha: The alpha parameter for Lora scaling.
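    - fan_in_fan_out: boolean; set to True when the wrapped layer stores its weight as (fan_in, fan_out), i.e. (in_features, out_features), as GPT-2's Conv1D does. torch.nn.Linear stores (out_features, in_features), so it uses False.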
        ```
        # torch.nn.Linear, fan_in_fan_out: False
        # y = xA^T + b
        self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        ```
    - scaling: https://arxiv.org/pdf/2106.09685.pdf
        - ∆W is scaled by α / r where α is a constant.
        
        ```
        # lora.layer
        if use_rslora:
            self.scaling[adapter_name] = lora_alpha / math.sqrt(r)
        else:
            self.scaling[adapter_name] = lora_alpha / r
        ```

- W, lora_a, lora_b
    - $\Delta W = lora\_b \,@\, lora\_a$
    - $(W+\Delta W)x=Wx+(lora\_b @ lora\_a)x$
- combination types
    - ties: https://arxiv.org/pdf/2306.01708.pdf
    - svd
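        - $\Delta W_i = scaling_i \cdot lora\_{b_i}\, lora\_{a_i}$ (per-adapter delta, scaling already applied)
        - $\Delta_{merged}=(weight_1 \cdot scaling_1)\,\Delta W_1 + (weight_2 \cdot scaling_2)\,\Delta W_2$, followed by a truncated SVD of $\Delta_{merged}$ to get the rank-$r$ merged `lora_b` / `lora_a`. This matches the "from scratch" section below, where the scaling enters both the per-adapter delta and the merge weight.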

In [39]:
Image(url='https://miro.medium.com/v2/resize:fit:1400/format:webp/1*EnUd1eXLvXCxRZj9NW2BeA.png', width=400)

- target_modules
    - nn.Linear, nn.Embedding and nn.Conv2d.

```
import re
pattern = r'\((\w+)\): Linear'
# (norobots): Linear
# (sql): Linear
# (adcopy): Linear
# (merge): Linear
linear_layers = re.findall(pattern, str(model.modules))
target_modules = list(set(linear_layers))
target_modules
```

## lora models

In [4]:
# Hub repo of the first LoRA adapter; its PEFT config records which base
# checkpoint the adapter was trained on (read back below as
# `config.base_model_name_or_path`).
device = "cuda"
peft_model_id = "smangrul/tinyllama_lora_norobots"
config = PeftConfig.from_pretrained(peft_model_id)

In [5]:
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'down_proj', 'o_proj', 'k_proj', 'gate_proj', 'q_proj', 'lm_head', 'up_proj', 'v_proj', 'embed_tokens'}, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [6]:
# Load the base checkpoint named by the adapter config, quantized to 4 bits.
# The bare `load_in_4bit=True` keyword is deprecated in transformers; the
# supported spelling is an explicit BitsAndBytesConfig passed as
# `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)
# The tokenizer is taken from the adapter repo rather than the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
len(tokenizer)

32005

In [8]:
model.config.vocab_size

32000

In [9]:
# The adapter tokenizer has more entries than the base model's vocab
# (32005 vs 32000 in the two outputs above), so grow the embedding table to
# the tokenizer's size.
model.resize_token_embeddings(len(tokenizer))

Embedding(32005, 2048)

In [10]:
# Tokens the adapter tokenizer adds on top of the base vocabulary.
# Reuses the already-loaded `tokenizer` (from `peft_model_id`) and the base
# checkpoint name stored in `config` instead of re-instantiating both
# tokenizers from hard-coded repo ids.
base_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.vocab.keys() - base_tokenizer.vocab.keys()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'<pad>',
 '<|im_end|>',
 '<|im_start|>assistant',
 '<|im_start|>system',
 '<|im_start|>user'}

In [11]:
# Wrap the base model in a PeftModel and register the adapter from
# `peft_model_id` under the name "norobots".
model = PeftModel.from_pretrained(model, peft_model_id, adapter_name="norobots")

In [12]:
# Register the two remaining adapters on the same PeftModel, each under its
# own adapter name.
_extra_adapters = {
    "sql": "smangrul/tinyllama_lora_sql",
    "adcopy": "smangrul/tinyllama_lora_adcopy",
}
for _adapter_key, _adapter_repo in _extra_adapters.items():
    _ = model.load_adapter(_adapter_repo, adapter_name=_adapter_key)

In [13]:
model.peft_config

{'norobots': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'down_proj', 'o_proj', 'k_proj', 'gate_proj', 'q_proj', 'lm_head', 'up_proj', 'v_proj', 'embed_tokens'}, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None),
 'sql': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'down_proj', 'o_proj', 'k_proj', 'gate_proj', 'q_proj', 'up_proj', 'v_proj'}, lora_alpha=16, lora_dro

In [14]:
model.peft_config.keys()

dict_keys(['norobots', 'sql', 'adcopy'])

In [15]:
# One line per adapter: name, rank r, lora_alpha, lora_alpha / r, fan_in_fan_out.
for adapter_key, adapter_cfg in model.peft_config.items():
    alpha_over_r = adapter_cfg.lora_alpha / adapter_cfg.r
    print(adapter_key, adapter_cfg.r, adapter_cfg.lora_alpha, alpha_over_r, adapter_cfg.fan_in_fan_out)

norobots 8 16 2.0 False
sql 8 16 2.0 False
adcopy 8 16 2.0 False


In [16]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32005, 2048)
          (lora_dropout): ModuleDict(
            (norobots): Dropout(p=0.1, inplace=False)
            (adcopy): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(
              (norobots): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
              (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
          )
          (lora_embedding_B): ParameterDict(
              (norobots): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
              (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
          )
        )
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
          

## merge 3 adapters

- combination_type
    - [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]
    - `combination_type = "linear" if len(adapters) == 1 else combination_type`
    - 两种主要的类型
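        - TIES：TrIm, Elect, and Merge (TIES) is a three-step method for merging models. First, redundant parameters are trimmed; then conflicting signs are resolved into an aggregated sign vector; finally, the parameters whose sign agrees with the aggregate sign are averaged.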
        - DARE：Drop And REscale is a method that can be used to prepare for other model merging methods like TIES.
    - 实现上
        -  `cat`
        -  `[svd, ties_svd, dare_linear_svd, dare_ties_svd, magnitude_prune_svd]`
        -  `[linear, ties, dare_linear, dare_ties, magnitude_prune]`
- target
    - lora_A/lora_B
    - lora_embedding_A/lora_embedding_B 

In [17]:
# Combine the three loaded adapters into a new adapter called "merge".
adapter_name = "merge"
adapters = ["norobots", "adcopy", "sql"]
weights = [2.0, 0.3, 0.7]  # one weight per entry of `adapters`, same order
density = 0.2
# combination_type = "ties"
combination_type = "svd"

# Remove any adapter already registered under this name before adding it again.
if adapter_name in model.peft_config:
    model.delete_adapter(adapter_name)

model.add_weighted_adapter(
    adapters,
    weights,
    adapter_name,
    combination_type=combination_type,
    density=density,
)

In [18]:
# Show which module names each source adapter targets.
for adapter in adapters:
    targeted = model.peft_config[adapter].target_modules
    print(adapter, targeted)

norobots {'down_proj', 'o_proj', 'k_proj', 'gate_proj', 'q_proj', 'lm_head', 'up_proj', 'v_proj', 'embed_tokens'}
adcopy {'down_proj', 'o_proj', 'k_proj', 'gate_proj', 'q_proj', 'lm_head', 'up_proj', 'v_proj', 'embed_tokens'}
sql {'down_proj', 'o_proj', 'k_proj', 'gate_proj', 'q_proj', 'up_proj', 'v_proj'}


In [19]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32005, 2048)
          (lora_dropout): ModuleDict(
            (norobots): Dropout(p=0.1, inplace=False)
            (adcopy): Dropout(p=0.1, inplace=False)
            (merge): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(
              (norobots): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
              (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
              (merge): Parameter containing: [torch.cuda.FloatTensor of size 8x32005 (GPU 0)]
          )
          (lora_embedding_B): ParameterDict(
              (norobots): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
              (adcopy): Parameter containing: [torch

In [20]:
# model.base_model.model.model.layers[0].mlp

In [21]:
model.model.get_submodule('model.embed_tokens').scaling

{'norobots': 2.0, 'adcopy': 2.0, 'merge': 1.0}

In [22]:
type(model.model.get_submodule('model.layers.0.self_attn.q_proj'))

peft.tuners.lora.bnb.Linear4bit

### from scratch

In [23]:
model.base_model.model.model.embed_tokens

lora.Embedding(
  (base_layer): Embedding(32005, 2048)
  (lora_dropout): ModuleDict(
    (norobots): Dropout(p=0.1, inplace=False)
    (adcopy): Dropout(p=0.1, inplace=False)
    (merge): Dropout(p=0.1, inplace=False)
  )
  (lora_A): ModuleDict()
  (lora_B): ModuleDict()
  (lora_embedding_A): ParameterDict(
      (norobots): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
      (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 8x32005 (GPU 0)]
      (merge): Parameter containing: [torch.cuda.FloatTensor of size 8x32005 (GPU 0)]
  )
  (lora_embedding_B): ParameterDict(
      (norobots): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
      (adcopy): Parameter containing: [torch.cuda.HalfTensor of size 2048x8 (GPU 0)]
      (merge): Parameter containing: [torch.cuda.FloatTensor of size 2048x8 (GPU 0)]
  )
)

In [24]:
# LoRA A/B factors of the "norobots" (suffix _1) and "adcopy" (suffix _2)
# adapters on the token-embedding layer.
_embed_layer = model.base_model.model.model.embed_tokens
lora_a_1, lora_a_2 = (_embed_layer.lora_embedding_A[key] for key in ('norobots', 'adcopy'))
lora_b_1, lora_b_2 = (_embed_layer.lora_embedding_B[key] for key in ('norobots', 'adcopy'))

In [25]:
lora_a_1.shape, lora_b_1.shape, lora_a_2.shape, lora_b_2.shape

(torch.Size([8, 32005]),
 torch.Size([2048, 8]),
 torch.Size([8, 32005]),
 torch.Size([2048, 8]))

In [26]:
# Inputs for the two adapters present on embed_tokens (norobots, adcopy), in
# that order.
scaling = [2, 2]          # lora_alpha / r of each adapter
# Merge weights of those two adapters. Kept under its own name so the
# three-element `weights` list passed to add_weighted_adapter is not
# overwritten (re-running that cell would otherwise pass two weights for
# three adapters).
embed_weights = [2, 0.3]
# Derived rather than re-typed by hand: merge weight times adapter scaling.
valid_weights = [w * s for w, s in zip(embed_weights, scaling)]

In [27]:
def transpose(weight, fan_in_fan_out):
    """Return ``weight`` transposed when ``fan_in_fan_out`` is truthy, else as-is.

    Args:
        weight: Object exposing ``.T`` (a tensor or ``torch.nn.Parameter``).
        fan_in_fan_out: When falsy, ``weight`` is returned untouched.

    Returns:
        ``weight`` itself, or its transpose. A ``torch.nn.Parameter`` input is
        re-wrapped so the transposed result is again a ``Parameter``.
    """
    if fan_in_fan_out:
        flipped = weight.T
        if isinstance(weight, torch.nn.Parameter):
            return torch.nn.Parameter(flipped)
        return flipped
    return weight

In [28]:
# https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py#L450-L482
delta_lora_1 = transpose(lora_b_1 @ lora_a_1, True) * scaling[0]
delta_lora_2 = transpose(lora_b_2 @ lora_a_2, True) * scaling[1]

In [58]:
delta_lora_1

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0051,  0.0033,  0.0034,  ...,  0.0007,  0.0063, -0.0040],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0040,  0.0023, -0.0061,  ..., -0.0033, -0.0078, -0.0032],
        [-0.0049, -0.0061,  0.0007,  ..., -0.0042, -0.0038,  0.0104],
        [ 0.0014,  0.0111, -0.0023,  ..., -0.0019,  0.0016,  0.0091]],
       device='cuda:0', dtype=torch.float16, grad_fn=<MulBackward0>)

In [59]:
delta_lora_2

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0098, -0.0076, -0.0048,  ...,  0.0222, -0.0203, -0.0034],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0015,  0.0047, -0.0089,  ..., -0.0164,  0.0086, -0.0184],
        [-0.0065, -0.0049, -0.0055,  ..., -0.0001, -0.0073, -0.0074],
        [ 0.0045,  0.0220, -0.0076,  ..., -0.0301,  0.0191,  0.0123]],
       device='cuda:0', dtype=torch.float16)

In [29]:
delta_lora_1.shape, delta_lora_2.shape

(torch.Size([32005, 2048]), torch.Size([32005, 2048]))

In [30]:
# Weighted sum of the two per-adapter deltas. `valid_weights` already holds
# merge weight * scaling, and each delta was multiplied by its scaling when it
# was built, so the scaling factor enters twice here.
delta_lora = valid_weights[0] * delta_lora_1 + valid_weights[1] * delta_lora_2
delta_lora

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0147,  0.0087,  0.0105,  ...,  0.0161,  0.0129, -0.0181],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0150,  0.0119, -0.0298,  ..., -0.0230, -0.0261, -0.0239],
        [-0.0233, -0.0274, -0.0006,  ..., -0.0169, -0.0194,  0.0371],
        [ 0.0084,  0.0575, -0.0138,  ..., -0.0258,  0.0178,  0.0437]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

In [31]:
# Factor the merged delta back into low-rank LoRA matrices with a truncated SVD.
# Target rank is the largest rank among the source adapters (8 here).
new_rank = max(lora_a_1.shape[0], lora_a_2.shape[0])

# Bind the transpose to its own name: rebinding `delta_lora` to its own
# transpose made the cell flip the matrix back on every second run.
delta_lora_t = delta_lora.T  # (2048, 32005) given the shapes shown above

# The SVD runs in float32 (the inputs are float16). Only the leading
# `new_rank` singular triplets are kept, so the reduced SVD suffices and
# avoids building the full 32005 x 32005 Vh.
U, S, Vh = torch.linalg.svd(delta_lora_t.float(), full_matrices=False)
U = U[:, :new_rank]
S = S[:new_rank]
U = U @ torch.diag(S)  # fold the singular values into the left factor
Vh = Vh[:new_rank, :]

In [32]:
Vh.shape, U.shape

(torch.Size([8, 32005]), torch.Size([2048, 8]))

In [33]:
Vh

tensor([[-5.9250e-08,  3.6710e-03,  5.8551e-09,  ..., -5.5535e-03,
         -1.1281e-02, -5.8577e-03],
        [-5.3737e-08,  7.6944e-03, -9.9491e-09,  ..., -1.8266e-02,
         -2.5229e-02,  5.2019e-03],
        [-2.8191e-08, -3.8739e-03, -2.3445e-09,  ..., -2.6179e-03,
         -1.5592e-03,  2.8683e-02],
        ...,
        [ 1.6487e-08, -3.0573e-03, -1.4432e-10,  ...,  9.0157e-03,
         -2.2049e-02,  4.6260e-03],
        [-2.2342e-08,  1.2938e-02, -5.7639e-09,  ...,  1.6897e-02,
          8.5601e-03, -1.1252e-02],
        [-1.0293e-08, -4.6255e-03, -2.4660e-08,  ..., -1.4191e-02,
          1.2497e-02,  6.5705e-03]], device='cuda:0', grad_fn=<SliceBackward0>)

In [34]:
# Factors stored on embed_tokens for the "merge" adapter, to compare against
# the Vh / U computed by hand above.
_merged_embed_layer = model.base_model.model.model.embed_tokens
lora_a_new = _merged_embed_layer.lora_embedding_A['merge']
lora_b_new = _merged_embed_layer.lora_embedding_B['merge']

In [35]:
lora_a_new.shape, lora_b_new.shape

(torch.Size([8, 32005]), torch.Size([2048, 8]))

In [36]:
lora_a_new

Parameter containing:
tensor([[ 6.6927e-08,  3.6697e-03, -4.0709e-08,  ..., -5.5574e-03,
         -1.1279e-02, -5.8365e-03],
        [ 5.2394e-08,  7.7513e-03,  3.8410e-08,  ..., -1.8233e-02,
         -2.5212e-02,  4.8264e-03],
        [-3.8023e-08,  3.7841e-03,  8.6544e-09,  ...,  2.8255e-03,
          1.8587e-03, -2.8766e-02],
        ...,
        [-2.4451e-09, -3.0670e-03, -6.7911e-09,  ...,  8.9840e-03,
         -2.2061e-02,  4.7337e-03],
        [-3.0357e-08, -1.2936e-02, -3.8171e-09,  ..., -1.6890e-02,
         -8.5719e-03,  1.1253e-02],
        [ 1.4960e-08, -4.6251e-03,  7.7989e-08,  ..., -1.4207e-02,
          1.2501e-02,  6.5824e-03]], device='cuda:0')

## inference

### instruct

In [60]:
# Single-turn chat prompt rendered to a string (tokenize=False) through the
# tokenizer's chat template; with add_generation_prompt=True the rendered text
# ends with the assistant header, as the output below shows.
messages = [
    {"role": "user", "content": "Write an essay about Generative AI."},
]
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
text

'<|im_start|>user\nWrite an essay about Generative AI.<|im_end|>\n<|im_start|>assistant\n'

In [61]:
# Tokenize the prompt, move every tensor to the GPU, sample up to 256 new
# tokens and print the decoded sequence (prompt included).
encoded_prompt = tokenizer(text, return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in encoded_prompt.items()}
generation_kwargs = dict(
    max_new_tokens=256,
    do_sample=True,
    top_p=0.95,
    temperature=0.2,
    repetition_penalty=1.2,
    eos_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(**inputs, **generation_kwargs)
print(tokenizer.decode(outputs[0]))

<s><|im_start|>user 
Write an essay about Generative AI.<|im_end|> 
<|im_start|>assistant 
Generative Artificial Intelligence (GAI) is a new type of artificial intelligence that uses generative models to create art and other forms of creativity. The goal of GAI is to generate artwork based on prompts or instructions provided by the user, such as "make this look like a panda", or "draw me in the style of Picasso". This process can be iterated over multiple iterations until the desired result is achieved. For example, if you were given the task of creating a painting inspired by Pablo Picasso's work, then your final product might resemble his famous portrait of Marie-Therese Walter.<|im_end|>


### ad copy

In [51]:
# System + user chat prompt for the ad-copy task, rendered to a string through
# the tokenizer's chat template with the assistant header appended.
messages = [
    {"role": "system", "content": "Create a text ad given the following product and description."},
    {"role": "user", "content": "Product: Sony PS5 PlayStation Console\nDescription: The PS5 console unleashes new gaming possibilities that you never anticipated."},
]
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
text

'<|im_start|>system\nCreate a text ad given the following product and description.<|im_end|>\n<|im_start|>user\nProduct: Sony PS5 PlayStation Console\nDescription: The PS5 console unleashes new gaming possibilities that you never anticipated.<|im_end|>\n<|im_start|>assistant\n'

In [53]:
# Tokenize the ad-copy prompt, move every tensor to the GPU, sample up to 256
# new tokens and print the decoded sequence (prompt included).
encoded_prompt = tokenizer(text, return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in encoded_prompt.items()}
generation_kwargs = dict(
    max_new_tokens=256,
    do_sample=True,
    top_p=0.95,
    temperature=0.2,
    repetition_penalty=1.2,
    eos_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(**inputs, **generation_kwargs)
print(tokenizer.decode(outputs[0]))

<s><|im_start|>system 
Create a text ad given the following product and description.<|im_end|> 
<|im_start|>user 
Product: Sony PS5 PlayStation Console
Description: The PS5 console unleashes new gaming possibilities that you never anticipated.<|im_end|> 
<|im_start|>assistant 
The PS5 is an exciting new addition to your entertainment system, featuring a sleek design with innovative technologies like Blu-ray Disc playback, WiFi connectivity, and more. It's packed with powerful hardware and software features for immersive gameplay experiences, including dynamic lighting effects, motion controls, and more. With its impressive specs, it will be sure to please gamers of all ages!<|im_end|>


### sql

In [56]:
# Plain-text prompt (no chat template): table id, column list and the natural
# language question, ending at "SQL Query:" for the model to continue.
text = """Table: 2-11365528-2
Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location']
Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic?
SQL Query:"""
print(text)

Table: 2-11365528-2
Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location']
Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic?
SQL Query:


In [57]:
# Tokenize the SQL prompt, move every tensor to the GPU and generate up to 64
# new tokens without sampling arguments. The stop id is the last token id
# produced by tokenizing the literal "</s>".
encoded_prompt = tokenizer(text, return_tensors="pt")
inputs = {name: tensor.to("cuda") for name, tensor in encoded_prompt.items()}
stop_token_id = tokenizer("</s>").input_ids[-1]
outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    repetition_penalty=1.1,
    eos_token_id=stop_token_id,
)
print(tokenizer.decode(outputs[0]))

<s> Table: 2-11365528-2
Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location']
Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic?
SQL Query: SELECT Team, Head Coach, President, Home Ground FROM Teams WHERE Head Coach = Mario Volarevic
Result:
Team   Head Coach      President    Home Ground
----------------------------------------------
Boston Celtics     <NAME>          Boston
Chicago Bulls       <NAME>
