In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, CloverConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local checkpoint directory. Defined once so the model and the tokenizer are
# always loaded from the same place; adjust this single line on another machine.
MODEL_PATH = "/data2/mengfanxu/huggingface/Qwen2.5-0.5B"

# device_map="auto" lets the loader decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [3]:
# List every parameter of the base model together with the device it lives on.
for param_name, param_tensor in model.named_parameters():
    print(param_name, param_tensor.device)

model.embed_tokens.weight cuda:7
model.layers.0.self_attn.q_proj.weight cuda:7
model.layers.0.self_attn.q_proj.bias cuda:7
model.layers.0.self_attn.k_proj.weight cuda:7
model.layers.0.self_attn.k_proj.bias cuda:7
model.layers.0.self_attn.v_proj.weight cuda:7
model.layers.0.self_attn.v_proj.bias cuda:7
model.layers.0.self_attn.o_proj.weight cuda:7
model.layers.0.mlp.gate_proj.weight cuda:7
model.layers.0.mlp.up_proj.weight cuda:7
model.layers.0.mlp.down_proj.weight cuda:7
model.layers.0.input_layernorm.weight cuda:7
model.layers.0.post_attention_layernorm.weight cuda:7
model.layers.1.self_attn.q_proj.weight cuda:7
model.layers.1.self_attn.q_proj.bias cuda:7
model.layers.1.self_attn.k_proj.weight cuda:7
model.layers.1.self_attn.k_proj.bias cuda:7
model.layers.1.self_attn.v_proj.weight cuda:7
model.layers.1.self_attn.v_proj.bias cuda:7
model.layers.1.self_attn.o_proj.weight cuda:7
model.layers.1.mlp.gate_proj.weight cuda:7
model.layers.1.mlp.up_proj.weight cuda:7
model.layers.1.mlp.down_p

In [4]:
# Display the module tree of the loaded base model (layer types and shapes).
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [5]:
# Wrap the base model with a previously saved adapter checkpoint.
# NOTE(review): the path is an absolute, machine-specific directory named
# `clover_model` -- presumably the artifact written by the `save_pretrained`
# cell further down; confirm before running on a fresh kernel.
peft_model = PeftModel.from_pretrained(model, "/data2/mengfanxu/peft/src/peft/tuners/clover/clover_model")

In [2]:
import torch
import numpy as np
from scipy.linalg import qr

# Example matrix: the product of a (4096, 128) and a (128, 4096) Gaussian factor,
# i.e. a 4096 x 4096 matrix whose rank is at most 128.
# NOTE(review): no seed is set, so the printed numbers differ between runs.
A = (torch.randn(4096, 128)@torch.randn(128, 4096)).numpy()  # Convert PyTorch tensor to NumPy array
# Pivoted QR decomposition using SciPy: with pivoting=True the call also returns
# the column permutation P, such that A[:, P] = Q @ R.
Q, R, P = qr(A, pivoting=True)

# Convert results back to PyTorch tensors
Q = torch.tensor(Q)
R = torch.tensor(R)
P = torch.tensor(P)

# Print (truncated) views of the factors. In the recorded output the trailing
# rows of R are ~1e-6, consistent with the low rank of A.
print("Q:", Q)
print("R:", R)
print("P (column permutation):", P)

Q: tensor([[-0.0230, -0.0107,  0.0010,  ..., -0.0034,  0.0044,  0.0004],
        [ 0.0149, -0.0086, -0.0119,  ..., -0.0032, -0.0049,  0.0055],
        [-0.0004, -0.0016, -0.0137,  ..., -0.0011, -0.0012,  0.0080],
        ...,
        [-0.0033,  0.0299,  0.0026,  ..., -0.0395,  0.0252, -0.0089],
        [ 0.0192, -0.0025,  0.0148,  ..., -0.0338, -0.0007, -0.0106],
        [-0.0083,  0.0105,  0.0126,  ..., -0.0247,  0.0107,  0.0099]])
R: tensor([[ 8.8886e+02, -8.6061e+00, -9.0715e+01,  ...,  7.3334e-01,
         -1.3633e+01,  1.7544e+01],
        [ 0.0000e+00, -8.7329e+02, -5.8500e+01,  ..., -4.8815e+01,
         -1.9377e+01, -4.3866e+01],
        [ 0.0000e+00,  0.0000e+00,  8.6374e+02,  ..., -6.3226e+01,
          8.3276e+01, -5.0930e+01],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  1.5508e-06,
          9.7916e-07, -2.9510e-07],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          9.6434e-07, -7.7335e-07],
        [ 0.0000e+00,  0.0000e+0

In [3]:
# First 128 rows of R from the pivoted QR above; 128 is the inner dimension
# used to build A, so these are the rows expected to carry its content.
R[:128]

tensor([[ 8.8886e+02, -8.6061e+00, -9.0715e+01,  ...,  7.3334e-01,
         -1.3633e+01,  1.7544e+01],
        [ 0.0000e+00, -8.7329e+02, -5.8500e+01,  ..., -4.8815e+01,
         -1.9377e+01, -4.3866e+01],
        [ 0.0000e+00,  0.0000e+00,  8.6374e+02,  ..., -6.3226e+01,
          8.3276e+01, -5.0930e+01],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  9.8037e+00,
          2.6055e+01,  6.7967e+01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  2.3755e+01,
          1.4145e+02, -3.0345e+01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.9294e+01,
         -1.2916e+02,  2.8201e+01]])

In [7]:
# List every parameter of the adapter-wrapped model and the device it lives on.
for param_name, param_tensor in peft_model.named_parameters():
    print(param_name, param_tensor.device)

base_model.model.model.embed_tokens.weight cuda:7
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight cuda:7
base_model.model.model.layers.0.self_attn.q_proj.base_layer.bias cuda:7
base_model.model.model.layers.0.self_attn.q_proj.clover_R.default cuda:7
base_model.model.model.layers.0.self_attn.k_proj.weight cuda:7
base_model.model.model.layers.0.self_attn.k_proj.bias cuda:7
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight cuda:7
base_model.model.model.layers.0.self_attn.v_proj.base_layer.bias cuda:7
base_model.model.model.layers.0.self_attn.v_proj.clover_R.default cuda:7
base_model.model.model.layers.0.self_attn.o_proj.weight cuda:7
base_model.model.model.layers.0.mlp.gate_proj.weight cuda:7
base_model.model.model.layers.0.mlp.up_proj.weight cuda:7
base_model.model.model.layers.0.mlp.down_proj.weight cuda:7
base_model.model.model.layers.0.input_layernorm.weight cuda:7
base_model.model.model.layers.0.post_attention_layernorm.weight cuda:7
base_model.mod

In [3]:
# CLOVER adapter configuration: QR-based initialisation of the adapter weights,
# with a per-module choice of 'in' or 'out' for q_proj and v_proj.
# NOTE(review): 'in'/'out' presumably select the `head_in` flag of the CLOVER
# Linear layer (transform on the layer input vs. output) -- confirm in CloverConfig.
config = CloverConfig(init_clover_weights='qr', head_in_or_head_out={'q_proj':'in', 'v_proj':"out"})

In [4]:
# Inject the adapters described by `config` into `model`; the parameter listing
# in the next cell shows `clover_R` entries on the q_proj and v_proj layers.
peft_model = get_peft_model(model,config)

In [5]:
# List every parameter of the freshly wrapped model and the device it lives on.
for param_name, param_tensor in peft_model.named_parameters():
    print(param_name, param_tensor.device)

base_model.model.model.embed_tokens.weight cuda:0
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight cuda:0
base_model.model.model.layers.0.self_attn.q_proj.clover_R.default cuda:0
base_model.model.model.layers.0.self_attn.k_proj.weight cuda:0
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight cuda:0
base_model.model.model.layers.0.self_attn.v_proj.clover_R.default cuda:0
base_model.model.model.layers.0.self_attn.o_proj.weight cuda:0
base_model.model.model.layers.0.mlp.gate_proj.weight cuda:0
base_model.model.model.layers.0.mlp.up_proj.weight cuda:0
base_model.model.model.layers.0.mlp.down_proj.weight cuda:0
base_model.model.model.layers.0.input_layernorm.weight cuda:0
base_model.model.model.layers.0.post_attention_layernorm.weight cuda:0
base_model.model.model.layers.1.self_attn.q_proj.base_layer.weight cuda:0
base_model.model.model.layers.1.self_attn.q_proj.clover_R.default cuda:0
base_model.model.model.layers.1.self_attn.k_proj.weight cuda:0
base_mode

In [5]:
# Write the adapter to the relative directory "clover_model" (resolved against
# the kernel's current working directory).
peft_model.save_pretrained("clover_model")

In [8]:
# Tokenize the prompt and move it to the device the model actually lives on,
# instead of a hard-coded GPU index (device_map="auto" may pick a different one).
inputs = tokenizer("write me a peom", return_tensors='pt').to(peft_model.device)
output = peft_model.generate(**inputs)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [9]:
# Decode the generated token ids back into text (one string per batch row).
tokenizer.batch_decode(output)

["write me a peom about the power of love\n\nIn the heart of the night,\nWhere love's light is bright,\nIt shines like a beacon,\nGuiding us through the night.\n\nLove is the compass,\nGuiding us through the storm,\nIt's the strength we need,\nTo face the challenges we face.\n\nLove is the key,\nTo unlock the secrets of the heart,\nIt's the secret to our happiness,\nAnd the joy that we find.\n\nLove is the source of life,\nThe source of all that we are,\nIt's the power that we need,\nTo make our dreams come true.\n\nLove is the bond that we share,\nThe bond that we cherish,\nIt's the love that we share,\nAnd the love that we cherish.\n\nLove is the force that we need,\nThe force that we need to overcome,\nIt's the power that we need,\nTo make our lives shine.\n\nLove is the light that we need,\nThe light that we need to see,\nIt's the power that we need,\nTo make our dreams come true.\n\nLove is the key to our happiness,\nThe key to our happiness,\nIt's the secret to our success,\nAnd 

In [10]:
# Fold the adapter weights into the base layers and strip the adapter wrappers.
# NOTE(review): this rebinds `peft_model` to the object returned by
# `merge_and_unload()`, so the cell is not idempotent -- re-running it operates
# on the already-merged model. A fresh kernel run is needed to get the adapter back.
peft_model = peft_model.merge_and_unload()

In [12]:
# Same prompt as before the merge, so the two generations can be compared.
# The inputs follow the model's own device rather than a hard-coded GPU index.
inputs = tokenizer("write me a peom", return_tensors='pt').to(peft_model.device)
output = peft_model.generate(**inputs)
tokenizer.batch_decode(output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["write me a peom about the power of love\n\nIn the heart of the night,\nWhere love's light is bright,\nIt shines like a beacon,\nGuiding us through the night.\n\nLove is the compass,\nGuiding us through the storm,\nIt's the strength we need,\nTo face the challenges we face.\n\nLove is the key,\nTo unlock the secrets of the heart,\nIt's the secret to our happiness,\nAnd the joy that we find.\n\nLove is the source of life,\nThe source of all that we are,\nIt's the power that we need,\nTo make our dreams come true.\n\nLove is the bond that we share,\nThe bond that we cherish,\nIt's the love that we share,\nAnd the love that we cherish.\n\nLove is the force that we need,\nThe force that we need to overcome,\nIt's the power that we need,\nTo make our lives shine.\n\nLove is the light that we need,\nThe light that we need to see,\nIt's the power that we need,\nTo make our dreams come true.\n\nLove is the key to our happiness,\nThe key to our happiness,\nIt's the secret to our success,\nAnd 

In [7]:

import warnings
from typing import Any, Optional
import torch
import torch.nn as nn
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge

class CloverLayer(BaseTunerLayer):
    """Book-keeping mixin for CLOVER adapters wrapped around a base layer.

    Per adapter name it stores the number of heads (``num_head``), the size of
    each head (``head_dim``), whether the adapter acts on the input side
    (``head_in``) and the trainable per-head matrices (``clover_R``).

    ``nn.Module.__init__`` must already have run on the instance before this
    ``__init__`` is called, because an ``nn.ParameterDict`` is assigned here.
    """

    # All names of layers that may contain (trainable) adapter weights.
    # This has to be a tuple: without the trailing comma the value is the plain
    # string "clover_R", and any code iterating over it sees single characters
    # instead of the attribute name.
    adapter_layer_names = ("clover_R",)

    def __init__(self, base_layer: nn.Module, **kwargs) -> None:
        """Attach empty per-adapter containers to ``base_layer``.

        Args:
            base_layer: The layer being wrapped. Only ``nn.Linear`` is supported;
                any other type triggers a ``UserWarning``.
            **kwargs: Stored unchanged on ``self.kwargs``.
        """
        self.base_layer = base_layer
        self.num_head = {}
        self.head_dim = {}
        self.head_in = {}
        self.clover_R = nn.ParameterDict({})
        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []
        self.kwargs = kwargs

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            in_features, out_features = base_layer.in_features, base_layer.out_features
        else:
            # Unsupported layer: pick up the feature sizes if the layer exposes them,
            # otherwise leave them as None, so that the warning below is what the
            # user sees rather than an UnboundLocalError.
            in_features = getattr(base_layer, "in_features", None)
            out_features = getattr(base_layer, "out_features", None)
            warnings.warn(
                f"Unsupported layer type '{type(base_layer)}' encountered, proceed at your own risk.", UserWarning
            )

        self.in_features = in_features
        self.out_features = out_features

    def _check_forward_args(self, x, *args, **kwargs):
        """Check if the arguments are compatible with the configs and state of the model

        Raises:
            ValueError: If ``adapter_names`` is given and its length differs from
                ``len(x)``, or if it is given while adapters are merged.
        """
        adapter_names = kwargs.get("adapter_names", None)
        if adapter_names is None:
            return

        if len(x) != len(adapter_names):
            msg = (
                "Length of `adapter_names` should be the same as the number of inputs, but got "
                f"{len(adapter_names)} and {len(x)} respectively."
            )
            raise ValueError(msg)

        if self.merged:
            # It is unclear what would be the right thing to do if users pass adapter_names and there are merged
            # adapters. Therefore, it is better to raise an error in this case.
            msg = "Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first."
            raise ValueError(msg)

class Linear(nn.Module, CloverLayer):
    """CLOVER adapter around an ``nn.Linear`` base layer.

    The relevant feature axis of the base layer is split into ``num_head`` heads
    of ``head_dim`` features each, and every head owns one trainable
    ``(head_dim, head_dim)`` matrix stored in ``clover_R[adapter_name]``.

    * ``head_in=True``: the matrices are applied to the *input* of the base
      layer (``in_features`` is split into heads).
    * ``head_in=False``: the matrices are applied to the *output* of the base
      layer (``out_features`` is split into heads).

    Args:
        base_layer: The wrapped ``nn.Linear``.
        head_dim: Size of one head; must divide the split feature axis.
        head_in: Apply the adapter on the input side instead of the output side.
        adapter_name: Name under which the adapter is registered.
        init_clover_weights: ``'eye'`` (identity), ``'qr'`` (factor the base
            weight per head) or ``'absorb-decompose'`` (not implemented).
            Any other value falls back to the identity initialisation.
    """

    def __init__(
        self,
        base_layer,
        head_dim: int,
        head_in: bool = False,
        adapter_name: str = 'default',
        init_clover_weights: str = 'eye', # Choices: ['eye','qr','absorb-decompose']
        **kwargs,
    ) -> None:
        super().__init__()
        CloverLayer.__init__(self, base_layer, **kwargs)
        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            head_dim,
            head_in,
            init_clover_weights=init_clover_weights,
        )

    def update_layer(
        self, adapter_name, head_dim, head_in, init_clover_weights
    ):
        """Register ``adapter_name`` and initialise its per-head matrices."""
        self.head_dim[adapter_name] = head_dim
        self.head_in[adapter_name] = head_in
        if head_in:
            assert self.in_features % head_dim == 0, "in_features must be divisible by head_dim"
            self.num_head[adapter_name] = self.in_features // head_dim
        else:
            assert self.out_features % head_dim == 0, "out_features must be divisible by head_dim"
            self.num_head[adapter_name] = self.out_features // head_dim

        # Actual trainable parameters. They start as identity matrices placed on
        # the device of the base weight (and in its dtype when that is a floating
        # type), so that the adapter can be used together with the base layer
        # without an extra device move.
        base_weight = getattr(self.get_base_layer(), "weight", None)
        device, dtype = None, torch.float32
        if base_weight is not None:
            device = base_weight.device
            if base_weight.dtype.is_floating_point:
                dtype = base_weight.dtype
        weight_R = torch.eye(head_dim, dtype=dtype, device=device).repeat(self.num_head[adapter_name], 1, 1)
        self.clover_R[adapter_name] = nn.Parameter(weight_R)

        # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed
        if init_clover_weights == "qr":
            self.qr_decompose_init(adapter_name)
        elif init_clover_weights == "absorb-decompose":
            self.absorb_decompose_init(adapter_name, init_clover_weights)
        else:
            self.reset_clover_parameters(adapter_name)

        #self.set_adapter(self.active_adapters)

    def reset_clover_parameters(self, adapter_name):
        """Reset the adapter's per-head matrices to the identity.

        The new values keep the device and dtype of the existing parameter.
        Unknown adapter names are ignored.
        """
        if adapter_name in self.clover_R.keys():
            current = self.clover_R[adapter_name]
            identity = torch.eye(self.head_dim[adapter_name], dtype=current.dtype, device=current.device)
            self.clover_R[adapter_name].data = identity.unsqueeze(0).repeat(self.num_head[adapter_name], 1, 1)

    def qr_decompose_init(self, adapter_name):
        """Initialise the adapter from a per-head QR factorisation of the base weight.

        Each head block of the base weight is factored as ``Q @ R`` (computed in
        float32). ``Q`` replaces the block inside the base layer and ``R`` becomes
        the adapter matrix, so the wrapped layer initially computes the same
        function as the original base layer (up to rounding).

        On the output side (``head_in=False``) an existing bias is appended to the
        weight as an extra input column before factoring and split off again
        afterwards, so it is rewritten consistently with the weight.

        Raises:
            ValueError: If a head block has fewer rows than ``head_dim``; the
                reduced QR would then not produce a square ``R``.
        """
        base_layer = self.get_base_layer()
        num_head = self.num_head[adapter_name]
        head_dim = self.head_dim[adapter_name]
        dtype = base_layer.weight.dtype
        base_weight = base_layer.weight.data # (out_dim, in_dim)
        weight_R = []

        if self.head_in[adapter_name]:
            if base_weight.shape[0] < head_dim:
                raise ValueError(
                    f"QR init needs out_features >= head_dim, got {base_weight.shape[0]} and {head_dim}."
                )
            base_weight = base_weight.view(-1, num_head, head_dim) # (out_dim, num_heads, head_dim)
            for h in range(num_head):
                Q, R = torch.linalg.qr(base_weight[:,h].to(torch.float32)) # Q(out_dim, head_dim), R(head_dim, head_dim)
                base_weight[:,h] = Q # (out_dim, head_dim)
                # `rotation` right-multiplies the input by clover_R, hence the transpose.
                weight_R.append(R.T)

            self.clover_R[adapter_name].data = torch.stack(weight_R).to(dtype) # (num_heads, head_dim, head_dim)
            base_layer.weight.data = base_weight.reshape(-1, num_head*head_dim).contiguous()

        else:
            has_bias = base_layer.bias is not None
            if has_bias:
                base_bias = base_layer.bias.data.unsqueeze(1) # (out_dim, 1)
                base_weight = torch.cat([base_weight, base_bias],dim=1)  # (out_dim, in_dim+1)

            base_weight = base_weight.view(num_head, head_dim, -1) # (num_heads, head_dim, in_dim) or (num_heads, head_dim, in_dim+1)
            if base_weight.shape[-1] < head_dim:
                raise ValueError(
                    f"QR init needs at least head_dim input columns, got {base_weight.shape[-1]} and {head_dim}."
                )
            for h in range(num_head):
                Q, R = torch.linalg.qr(base_weight[h].T.to(torch.float32)) # Q(in_dim, head_dim), R(head_dim, head_dim)
                base_weight[h] = Q.T # (head_dim, in_dim) or (head_dim, in_dim+1)
                weight_R.append(R)

            self.clover_R[adapter_name].data = torch.stack(weight_R).to(dtype) # (num_heads, head_dim, head_dim)
            if has_bias:
                base_layer.bias.data = base_weight[:,:,-1].reshape(-1).contiguous().to(dtype) # (out_dim, )
                base_weight = base_weight[:,:,:-1].to(dtype) # (num_heads, head_dim, in_dim)

            base_layer.weight.data = base_weight.reshape(num_head*head_dim, -1).contiguous()

    def absorb_decompose_init(self, adapter_name, init_clover_weights=None):
        """Placeholder for the 'absorb-decompose' initialisation.

        Raises:
            NotImplementedError: Always; failing loudly is preferable to silently
                leaving the adapter with an unintended initialisation.
        """
        raise NotImplementedError(
            "init_clover_weights='absorb-decompose' is not implemented yet; use 'eye' or 'qr'."
        )

    def _apply_head_transform(self, active_adapter, weight_R, what: str = "merged") -> None:
        """Fold per-head matrices ``weight_R`` into the base layer's weight (and bias).

        The computation runs on a copy of the base weight, which is only written
        back after it has been checked for non-finite values.
        """
        base_layer = self.get_base_layer()
        num_head = self.num_head[active_adapter]
        head_dim = self.head_dim[active_adapter]
        head_in = self.head_in[active_adapter]

        base_weights = base_layer.weight.data.clone() # (out_dim, in_dim)
        weight_R = weight_R.to(device=base_weights.device, dtype=base_weights.dtype) #(num_head, head_dim, head_dim)
        # On the output side the bias passes through the adapter as well, so it is
        # carried along as an extra input column.
        fold_bias = base_layer.bias is not None and not head_in

        if head_in:
            base_weights = base_weights.view(self.out_features, num_head, head_dim)
            base_weights = torch.einsum("ohd,hed->ohe", base_weights, weight_R)
        else:
            if fold_bias:
                base_bias = base_layer.bias.data.clone().unsqueeze(1) # (out_dim, 1)
                base_weights = torch.cat([base_weights, base_bias], dim=1)
            base_weights = base_weights.view(num_head, head_dim, -1)
            base_weights = torch.einsum("hdi,hde->hei", base_weights, weight_R)

        base_weights = base_weights.reshape(self.out_features, -1).contiguous()
        if not torch.isfinite(base_weights).all():
            raise ValueError(
                f"NaNs detected in the {what} weights. The adapter {active_adapter} seems to be broken"
            )
        if fold_bias:
            base_layer.bias.data = base_weights[:,-1].contiguous()
            base_weights = base_weights[:,:-1].contiguous()
        base_layer.weight.data = base_weights

    def merge(self, safe_merge: bool = True, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                Accepted for interface compatibility. The merge is always performed on a copy of the
                original weights and checked for non-finite values before being written back.
                Defaults to `True`.
            adapter_names (`list[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.

        Raises:
            ValueError: If the merged weights contain non-finite values.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.clover_R.keys():
                self._apply_head_transform(active_adapter, self.clover_R[active_adapter].data)
                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """Undo `merge` by folding the inverse of each merged adapter into the base weights.

        Adapters are unmerged in reverse merge order. The inverse is computed in
        float32; the result equals the pre-merge weights only up to rounding, and
        `torch.linalg.inv` raises if an adapter matrix is singular.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return

        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self.clover_R.keys():
                weight_R = self.clover_R[active_adapter].data
                inverse_R = torch.linalg.inv(weight_R.to(torch.float32))
                self._apply_head_transform(active_adapter, inverse_R, what="unmerged")

    def rotation(self, result, clover_R, num_head, head_dim):
        """Apply the per-head matrices to the last dimension of ``result``.

        Args:
            result: Tensor of shape ``(..., num_head * head_dim)``.
            clover_R: Tensor of shape ``(num_head, head_dim, head_dim)``.
            num_head: Number of heads.
            head_dim: Size of one head.

        Returns:
            A contiguous tensor with the same shape as ``result`` in which every
            head slice has been right-multiplied by its matrix.
        """
        leading_shape = result.shape[:-1]
        # reshape (rather than view) so that non-contiguous inputs are accepted too.
        heads = result.reshape(*leading_shape, num_head, head_dim)
        heads = torch.einsum("...hd,hde->...he", heads, clover_R)
        return heads.reshape(*leading_shape, num_head*head_dim).contiguous()

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        """Run the base layer with the active adapters applied.

        Input-side adapters (``head_in=True``) transform ``x`` before the base
        layer; output-side adapters transform its result. When adapters are
        disabled or merged, the base layer is called directly.
        """
        self._check_forward_args(x, *args, **kwargs)
        adapter_names = kwargs.pop("adapter_names", None)

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif adapter_names is not None:
            result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
        elif self.merged:
            result = self.base_layer(x, *args, **kwargs)
        else:
            torch_x_dtype = x.dtype
            for active_adapter in self.active_adapters:
                if active_adapter not in self.clover_R.keys():
                    continue
                if not self.head_in[active_adapter]:
                    continue
                clover_R = self.clover_R[active_adapter]
                # Compute in the adapter's dtype, as the mixed-batch path does.
                x = self.rotation(x.to(clover_R.dtype), clover_R, self.num_head[active_adapter], self.head_dim[active_adapter])
            x = x.to(torch_x_dtype)

            result = self.base_layer(x, *args, **kwargs) # (bsz, seq, num_heads*head_dim)
            torch_result_dtype = result.dtype
            for active_adapter in self.active_adapters:
                if active_adapter not in self.clover_R.keys():
                    continue
                if self.head_in[active_adapter]:
                    continue
                clover_R = self.clover_R[active_adapter]
                result = self.rotation(result.to(clover_R.dtype), clover_R, self.num_head[active_adapter], self.head_dim[active_adapter])
            result = result.to(torch_result_dtype)

        return result

    def _mixed_batch_forward(
        self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any
    ) -> torch.Tensor:
        """Forward pass in which every batch row may use a different adapter.

        This is a special method that handles the case when users pass the argument `adapter_names`. This is an
        extra argument that allows mixing different adapters in the same batch at inference time.
        Rows tagged ``"__base__"`` or with an unknown adapter name go through the base layer only.
        The caller's ``x`` is never modified.
        """
        # Map every requested adapter name to the batch rows that use it.
        rows_by_adapter: dict[str, list[int]] = {}
        for row, name in enumerate(adapter_names):
            rows_by_adapter.setdefault(name, []).append(row)

        known = [name for name in rows_by_adapter if name != "__base__" and name in self.clover_R.keys()]
        input_side = [name for name in known if self.head_in[name]]
        output_side = [name for name in known if not self.head_in[name]]

        if input_side:
            torch_x_dtype = x.dtype
            # Work on a copy: the sub-batches are written back by index, which
            # would otherwise overwrite the tensor owned by the caller.
            x = x.clone()
            for name in input_side:
                clover_R = self.clover_R[name]
                rows = rows_by_adapter[name]
                sub_batch = x[rows].to(clover_R.dtype)
                clover_x = self.rotation(sub_batch, clover_R, self.num_head[name], self.head_dim[name])
                x[rows] = clover_x.to(torch_x_dtype)

        result = self.base_layer(x, *args, **kwargs)
        torch_result_dtype = result.dtype

        for name in output_side:
            clover_R = self.clover_R[name]
            rows = rows_by_adapter[name]
            # getting the sub-batch, passing it to CLOVER layers and updating the corresponding indices of the linear
            # layer output
            sub_batch = result[rows].to(clover_R.dtype)
            clover_output = self.rotation(sub_batch, clover_R, self.num_head[name], self.head_dim[name])
            result[rows] = clover_output.to(torch_result_dtype)

        return result

    def __repr__(self) -> str:
        """Module repr prefixed with ``clover.``."""
        rep = super().__repr__()
        return "clover." + rep

In [8]:
# Seed the RNG so that the random weights and inputs (and therefore the printed
# reference output) are reproducible after a kernel restart.
torch.manual_seed(0)
# Reference projection without bias and a random (batch=2, seq=15, dim=4096) input.
q_proj = nn.Linear(4096, 4096, bias=False)
x = torch.randn(2,15,4096)
# Output of the plain layer; later cells compare the adapter-wrapped layer to it.
print(q_proj(x))

tensor([[[-0.1147,  0.0594, -0.4917,  ..., -0.1536,  0.3382,  0.2766],
         [ 0.3614,  1.6465,  0.3168,  ...,  0.0198, -0.9441, -0.0502],
         [-1.3687,  0.7455,  0.4477,  ...,  0.2797,  0.4861, -0.3303],
         ...,
         [ 1.0341, -0.2328,  1.1667,  ...,  0.4766,  0.5874, -0.8720],
         [-0.6536,  0.2651,  0.1632,  ..., -0.0279, -0.5445,  0.4384],
         [ 0.2715, -0.4977, -0.2946,  ..., -0.5098,  0.2562,  0.2518]],

        [[ 0.7479, -0.8701, -0.6882,  ...,  1.0735, -0.9788,  0.2787],
         [ 1.1530, -0.2348,  0.5584,  ...,  1.3575, -0.4017, -0.2344],
         [-0.1461,  0.4578, -0.9949,  ...,  0.0065,  0.1757, -0.1041],
         ...,
         [-0.2544, -1.1965, -0.5829,  ..., -0.7570,  0.4917, -0.5717],
         [ 0.0058, -0.3130, -0.1992,  ..., -0.1002,  0.2319, -0.1983],
         [ 0.4276,  0.4038, -0.2927,  ...,  0.0962,  0.0613, -0.3319]]],
       grad_fn=<UnsafeViewBackward0>)


In [9]:
# Wrap q_proj with an input-side CLOVER adapter (head_dim=128) initialised via QR.
# Note that the QR init rewrites the weight of `q_proj` itself: it assigns to
# `base_layer.weight.data`, so `q_proj` no longer holds its original weight afterwards.
clover_proj = Linear(q_proj, 128, head_in=True,init_clover_weights="qr")
# The recorded output matches the plain q_proj(x) printed in the previous cell.
print(clover_proj(x))

tensor([[[-0.1147,  0.0594, -0.4917,  ..., -0.1536,  0.3382,  0.2766],
         [ 0.3614,  1.6465,  0.3168,  ...,  0.0198, -0.9441, -0.0502],
         [-1.3687,  0.7455,  0.4477,  ...,  0.2797,  0.4861, -0.3303],
         ...,
         [ 1.0341, -0.2328,  1.1667,  ...,  0.4766,  0.5874, -0.8720],
         [-0.6536,  0.2651,  0.1632,  ..., -0.0279, -0.5445,  0.4384],
         [ 0.2715, -0.4977, -0.2946,  ..., -0.5098,  0.2562,  0.2518]],

        [[ 0.7479, -0.8701, -0.6882,  ...,  1.0735, -0.9788,  0.2787],
         [ 1.1530, -0.2348,  0.5584,  ...,  1.3575, -0.4017, -0.2344],
         [-0.1461,  0.4578, -0.9949,  ...,  0.0065,  0.1757, -0.1041],
         ...,
         [-0.2544, -1.1965, -0.5829,  ..., -0.7570,  0.4917, -0.5717],
         [ 0.0058, -0.3130, -0.1992,  ..., -0.1002,  0.2319, -0.1983],
         [ 0.4276,  0.4038, -0.2927,  ...,  0.0962,  0.0613, -0.3319]]],
       grad_fn=<UnsafeViewBackward0>)


In [10]:
# Fold clover_R back into the wrapped layer's weight; afterwards forward() takes
# its `self.merged` branch and calls the base layer directly.
clover_proj.merge(safe_merge=True)

In [11]:
# Output of the merged layer; the recorded values match the unmerged output above.
print(clover_proj(x))

tensor([[[-0.1147,  0.0594, -0.4917,  ..., -0.1536,  0.3382,  0.2766],
         [ 0.3614,  1.6465,  0.3168,  ...,  0.0198, -0.9441, -0.0502],
         [-1.3687,  0.7455,  0.4477,  ...,  0.2797,  0.4861, -0.3303],
         ...,
         [ 1.0341, -0.2328,  1.1667,  ...,  0.4766,  0.5874, -0.8720],
         [-0.6536,  0.2651,  0.1632,  ..., -0.0279, -0.5445,  0.4384],
         [ 0.2715, -0.4977, -0.2946,  ..., -0.5098,  0.2562,  0.2518]],

        [[ 0.7479, -0.8701, -0.6882,  ...,  1.0735, -0.9788,  0.2787],
         [ 1.1530, -0.2348,  0.5584,  ...,  1.3575, -0.4017, -0.2344],
         [-0.1461,  0.4578, -0.9949,  ...,  0.0065,  0.1757, -0.1041],
         ...,
         [-0.2544, -1.1965, -0.5829,  ..., -0.7570,  0.4917, -0.5717],
         [ 0.0058, -0.3130, -0.1992,  ..., -0.1002,  0.2319, -0.1983],
         [ 0.4276,  0.4038, -0.2927,  ...,  0.0962,  0.0613, -0.3319]]],
       grad_fn=<UnsafeViewBackward0>)


In [12]:
# Calls the private mixed-batch path directly with one adapter name per batch row.
# NOTE(review): the layer was merged in an earlier cell. Going through forward()
# with `adapter_names` would be rejected by `_check_forward_args` for a merged
# layer; calling the private method bypasses that check, so the adapter is applied
# on top of already-merged weights. This is why the recorded values differ from
# the outputs printed above.
print(clover_proj._mixed_batch_forward(x,adapter_names=['default', 'default']))

tensor([[[-1.5202e-02,  8.5682e-01,  2.9192e-01,  ..., -2.3793e-01,
          -3.0827e-01,  2.2188e-01],
         [-1.3650e-01,  6.0451e-01,  4.2241e-01,  ..., -2.6414e-01,
          -1.3790e-01,  6.5217e-02],
         [ 2.8061e-01,  4.0249e-01,  2.6483e-01,  ..., -3.4663e-01,
          -1.3704e-01, -8.3752e-01],
         ...,
         [-4.2521e-01, -1.7031e-01,  3.1694e-01,  ..., -3.6772e-01,
          -1.1709e-01,  2.5517e-01],
         [ 5.9677e-01,  5.4387e-02,  1.6737e-01,  ..., -3.6323e-02,
          -5.5437e-02, -5.0203e-03],
         [ 6.0396e-02,  2.9254e-01, -1.5522e-01,  ..., -5.1261e-01,
           2.5141e-01,  2.4361e-01]],

        [[ 1.1792e-01, -3.6792e-01,  3.2642e-01,  ...,  2.0840e-01,
           5.6022e-01,  6.9570e-02],
         [ 1.7640e-01, -7.3480e-01, -3.9186e-01,  ...,  2.7923e-01,
          -5.3349e-04,  6.8417e-01],
         [ 2.9929e-01,  1.2432e-01,  3.4336e-01,  ..., -4.6908e-01,
           4.1230e-01, -3.6135e-01],
         ...,
         [-1.7490e-02,  1

In [13]:
# Show the module repr; the "clover." prefix comes from Linear.__repr__.
clover_proj

clover.Linear(
  (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
  (clover_R): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 32x128x128])
)