## Adapter 

In [1]:
import torch.nn as nn
def adapter(in_dim, bottleneck_dim, out_dim):
    """Build a bottleneck adapter: Linear -> GELU -> Linear.

    Args:
        in_dim: Number of input features of the first linear layer.
        bottleneck_dim: Width of the hidden (bottleneck) representation.
        out_dim: Number of output features of the second linear layer.

    Returns:
        An ``nn.Sequential`` holding the three layers in order.
    """
    # Project into the bottleneck, then back out to out_dim.
    down_proj = nn.Linear(in_dim, bottleneck_dim)
    up_proj = nn.Linear(bottleneck_dim, out_dim)
    return nn.Sequential(down_proj, nn.GELU(), up_proj)

## Prompt Tuning

In [2]:
import torch
import torch.nn as nn

# Soft prompt size: number of prompt tokens and the embedding width of each.
num_tokens = 50
embed_dim = 768

# Wrapping the randomly initialized tensor in nn.Parameter makes it trainable.
soft_prompt = nn.Parameter(torch.rand(num_tokens, embed_dim))

soft_prompt.shape

torch.Size([50, 768])

In [12]:
# Dummy input: seq_len random rows, each embed_dim wide.
seq_len = 100
x = torch.rand((seq_len, embed_dim))
x.shape

torch.Size([100, 768])

In [13]:
def input_with_soft_prompt(x, soft_prompt):
    """Prepend the soft prompt tokens to ``x`` along the sequence dimension.

    Args:
        x: Input embeddings whose last two dimensions are
            ``(seq_len, embed_dim)``; any leading dimensions are treated as
            batch dimensions.
        soft_prompt: Tensor of shape ``(num_tokens, embed_dim)``.

    Returns:
        Tensor of shape ``(..., num_tokens + seq_len, embed_dim)`` with the
        soft prompt rows placed before the rows of ``x``.
    """
    # Repeat the prompt (as a view) over any leading batch dimensions of x;
    # for a 2-D x this leaves the prompt shape unchanged.
    prompt = soft_prompt.expand(*x.shape[:-2], *soft_prompt.shape)
    # Concatenate on the sequence axis (second to last), not the embedding
    # axis: the prompt adds tokens, it does not widen each token.
    return torch.concat([prompt, x], dim=-2)

# Prepend the soft prompt to the input and inspect the resulting shape.
x = input_with_soft_prompt(x=x, soft_prompt=soft_prompt)
x.shape

torch.Size([150, 768])

## Prefix Tuning

In [None]:
import torch
import torch.nn as nn

class FullyConnectedLayers(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features.
        hidden_size: Number of features between the two linear layers.
        output_size: Number of output features.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply fc1, the ReLU non-linearity, then fc2 to ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

class LayerNorm(nn.Module):
    """Normalize the last dimension, then apply a learned scale and shift.

    Args:
        features: Size of the last dimension; ``gamma`` and ``beta`` have
            this many elements.
        eps: Constant added to the standard deviation to avoid division by
            zero.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))  # learned scale
        self.beta = nn.Parameter(torch.zeros(features))  # learned shift
        self.eps = eps

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + eps) + beta`` over the last dim."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma = x.std(dim=-1, keepdim=True)
        centered = x - mu
        scaled = self.gamma * centered
        return scaled / (sigma + self.eps) + self.beta
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding dimension is split into ``heads`` chunks of ``head_dim``
    features. A single bias-free ``head_dim -> head_dim`` projection per role
    (values, keys, queries) is applied to every head, and the concatenated
    head outputs go through a final ``embed_size -> embed_size`` linear layer.

    Args:
        embed_size: Size of the embedding (last) dimension of the inputs.
        heads: Number of attention heads; must divide ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` positions to ``keys``/``values`` positions.

        Args:
            values: Tensor of shape ``(N, value_len, embed_size)``.
            keys: Tensor of shape ``(N, key_len, embed_size)``.
            query: Tensor of shape ``(N, query_len, embed_size)``.
            mask: ``None``, or a tensor broadcastable to
                ``(N, heads, query_len, key_len)``; positions where it equals
                0 are excluded from attention.

        Returns:
            Tensor of shape ``(N, query_len, embed_size)``.
        """
        batch_size = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads pieces of head_dim features each.
        values = values.reshape(batch_size, value_len, self.heads, self.head_dim)
        keys = keys.reshape(batch_size, key_len, self.heads, self.head_dim)
        queries = query.reshape(batch_size, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Dot product of every query with every key, per head:
        # (N, q, h, d) x (N, k, h, d) -> (N, h, q, k).
        scores = torch.einsum("nqhd,nkhd->nhqk", queries, keys)

        # Scale by sqrt(d_k), where d_k is the per-head key dimension the dot
        # product is taken over (head_dim), not the full embedding size.
        scores = scores / (self.head_dim ** 0.5)

        if mask is not None:
            # Use the most negative finite value of the score dtype so the
            # fill is representable in reduced-precision dtypes as well.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Normalize over the key axis.
        attention = torch.nn.functional.softmax(scores, dim=3)

        # Weighted sum of the values, then merge the heads back together:
        # (N, h, q, l) x (N, l, h, d) -> (N, q, h, d) -> (N, q, h * d).
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            batch_size, query_len, self.heads * self.head_dim
        )

        # Final projection of the concatenated heads.
        return self.fc_out(out)

In [None]:
def transformer_block_with_prefix(soft_prompt, x, heads=1, hidden_size=None, mask=None):
    """Run one illustrative transformer block with a learned prefix prepended.

    The soft prompt is passed through a small fully connected network,
    prepended to ``x`` along the sequence axis, and the result goes through
    self-attention and a feed-forward network, each followed by a residual
    connection and layer normalization.

    NOTE: every sub-module is created inside this function, so each call uses
    freshly initialized weights. This sketches the data flow; a trainable
    block would create the sub-modules once and reuse them.

    Args:
        soft_prompt: Tensor of shape ``(num_tokens, prompt_dim)``, or with a
            leading batch dimension that can be expanded to the batch of ``x``.
        x: Tensor of shape ``(batch, seq_len, embed_size)``.
        heads: Number of attention heads; must divide ``embed_size``.
        hidden_size: Hidden width of the fully connected networks; defaults
            to ``embed_size``.
        mask: Optional attention mask, forwarded to ``SelfAttention``; it
            must account for the ``num_tokens`` prefix positions.

    Returns:
        Tensor of shape ``(batch, num_tokens + seq_len, embed_size)``.
    """
    embed_size = x.shape[-1]
    if hidden_size is None:
        hidden_size = embed_size

    # Map the soft prompt to the embedding width through the MLP.
    prefix_mlp = FullyConnectedLayers(soft_prompt.shape[-1], hidden_size, embed_size)
    prefix = prefix_mlp(soft_prompt)
    # Repeat the prefix over the batch and prepend it on the sequence axis.
    prefix = prefix.expand(*x.shape[:-2], *prefix.shape[-2:])
    x = torch.cat([prefix, x], dim=-2)

    # Self-attention sub-layer with residual connection and normalization.
    residual = x
    attention = SelfAttention(embed_size=embed_size, heads=heads)
    x = attention(x, x, x, mask=mask)
    x = LayerNorm(embed_size)(x + residual)

    # Feed-forward sub-layer with residual connection and normalization.
    residual = x
    feed_forward = FullyConnectedLayers(embed_size, hidden_size, embed_size)
    x = feed_forward(x)
    x = LayerNorm(embed_size)(x + residual)
    return x

## LoRA

In [2]:
import torch.nn as nn
import torch

class LoRALayer(nn.Module):
    """Low-rank update ``alpha * (x @ A @ B)``.

    Args:
        in_dim: Number of input features (rows of ``A``).
        out_dim: Number of output features (columns of ``B``).
        rank: Inner dimension shared by ``A`` and ``B``.
        alpha: Factor the low-rank product is multiplied by.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        # A is drawn from a normal distribution scaled by 1 / sqrt(rank).
        init_scale = 1 / torch.tensor(rank).float().sqrt()
        self.A = nn.Parameter(init_scale * torch.randn(in_dim, rank))
        # B starts at zero, so the layer outputs zeros until B is updated.
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        low_rank = x @ self.A
        return self.alpha * (low_rank @ self.B)

In [None]:
import torch.functional as F

def forward(self, x):
    """Run ``x`` through linear_1 -> ReLU -> linear_2 -> softmax.

    Args:
        self: Object exposing the ``linear_1`` and ``linear_2`` callables.
        x: Input tensor accepted by ``self.linear_1``.

    Returns:
        The softmax over the last dimension of the second layer's output.
    """
    # relu and softmax live in torch.nn.functional; torch.functional does
    # not provide them, so bind the right module locally.
    import torch.nn.functional as F

    # LoRA variant: add `+ self.lora_1(x)` to this layer's output.
    x = self.linear_1(x)
    x = F.relu(x)
    # LoRA variant: add `+ self.lora_2(x)` to this layer's output.
    x = self.linear_2(x)
    # NOTE(review): the original left a "softmax" placeholder here, so the
    # returned values are normalized probabilities despite the name
    # `logits` - confirm the loss used downstream expects that.
    logits = F.softmax(x, dim=-1)
    return logits

In [None]:
class LinearWithLoRA(torch.nn.Module):
    """Wrap a linear layer and add a LoRA update to its output.

    Args:
        linear: Layer exposing ``in_features`` and ``out_features``; it is
            stored and called as-is.
        rank: Rank passed to the ``LoRALayer``.
        alpha: Scaling factor passed to the ``LoRALayer``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        # Size the LoRA branch to match the wrapped layer.
        in_dim, out_dim = linear.in_features, linear.out_features
        self.lora = LoRALayer(in_dim, out_dim, rank, alpha)

    def forward(self, x):
        """Return the wrapped layer's output plus the LoRA branch's output."""
        base_out = self.linear(x)
        lora_out = self.lora(x)
        return base_out + lora_out