In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Traditional Transformer Block
class TraditionalTransformerBlock(nn.Module):
    """Transformer block made of multi-head attention followed by an MLP.

    After each sub-layer the sub-layer input is added back, the sum is
    layer-normalized, and dropout is applied to the normalized result.

    Args:
        embed_size: Feature width of the inputs (attention ``embed_dim``).
        heads: Number of attention heads.
        dropout: Probability for the dropout layer shared by both sub-layers.
        forward_expansion: Hidden width of the MLP as a multiple of
            ``embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run attention and the MLP, each with a residual connection.

        Args:
            value: Tensor used as the attention values.
            key: Tensor used as the attention keys.
            query: Tensor used as the attention queries; also the residual
                that is added to the attention output.
            mask: Forwarded to the attention layer as ``attn_mask``
                (may be ``None``).

        Returns:
            The block output after the second norm + dropout step.
        """
        # The attention layer also returns attention weights; only the
        # attended values are needed here.
        attended = self.attention(query, key, value, attn_mask=mask)[0]

        # First residual branch: attention output + query.
        hidden = self.norm1(attended + query)
        hidden = self.dropout(hidden)

        # Second residual branch: MLP output + its own input.
        mlp_out = self.feed_forward(hidden)
        return self.dropout(self.norm2(mlp_out + hidden))

# Transformer2 Block with Singular Value Fine-Tuning (SVF)
class Transformer2Block(nn.Module):
    """Transformer block whose attention output is rescaled by a learned vector.

    The layout matches a plain attention + MLP block, with one extra
    parameter, ``svf_expert``. Note that this vector multiplies the attention
    *output features* elementwise; no weight matrix is decomposed and no
    singular values are modified in this block.

    Args:
        embed_size: Feature width of the inputs (attention ``embed_dim``).
        heads: Number of attention heads.
        dropout: Probability for the dropout layer shared by both sub-layers.
        forward_expansion: Hidden width of the MLP as a multiple of
            ``embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)
        # Expert vector, initialised to all ones so the block starts out as
        # an unscaled attention block.
        self.svf_expert = nn.Parameter(torch.ones(embed_size))

    def forward(self, value, key, query, mask):
        """Run scaled attention and the MLP, each with a residual connection.

        Args:
            value: Tensor used as the attention values.
            key: Tensor used as the attention keys.
            query: Tensor used as the attention queries; also the residual
                that is added to the scaled attention output.
            mask: Forwarded to the attention layer as ``attn_mask``
                (may be ``None``).

        Returns:
            The block output after the second norm + dropout step.
        """
        attended = self.attention(query, key, value, attn_mask=mask)[0]

        # Broadcast the expert vector over the leading dimensions.
        gain = self.svf_expert.unsqueeze(0)
        attended = attended * gain

        hidden = self.norm1(attended + query)
        hidden = self.dropout(hidden)

        mlp_out = self.feed_forward(hidden)
        return self.dropout(self.norm2(mlp_out + hidden))

# Example usage: feed the same random batch through both block variants
# and compare the output shapes.
embed_size, heads = 256, 8
dropout, forward_expansion = 0.1, 4

# Dummy input laid out as (sequence_length, batch_size, embed_size)
dummy_input = torch.rand(10, 32, embed_size)
mask = None  # no attention mask

# Traditional Transformer output (self-attention: one tensor serves as
# value, key and query)
traditional_block = TraditionalTransformerBlock(
    embed_size=embed_size,
    heads=heads,
    dropout=dropout,
    forward_expansion=forward_expansion,
)
traditional_output = traditional_block(dummy_input, dummy_input, dummy_input, mask)

# Transformer2 output, built with the same hyperparameters
transformer2_block = Transformer2Block(
    embed_size=embed_size,
    heads=heads,
    dropout=dropout,
    forward_expansion=forward_expansion,
)
transformer2_output = transformer2_block(dummy_input, dummy_input, dummy_input, mask)

print("Traditional Transformer Output Shape:", traditional_output.shape)
print("Transformer2 Output Shape:", transformer2_output.shape)


Traditional Transformer Output Shape: torch.Size([10, 32, 256])
Transformer2 Output Shape: torch.Size([10, 32, 256])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Singular Value Fine-Tuning (SVF) Module
class SVFExpert(nn.Module):
    """A single expert: one learnable scale factor per feature.

    Args:
        embed_size: Length of the scale vector, i.e. the size of the last
            dimension of the tensors this expert is applied to.
    """

    def __init__(self, embed_size):
        super().__init__()
        # All ones, so a freshly created expert leaves its input unchanged.
        initial_scale = torch.ones(embed_size)
        self.scale_vector = nn.Parameter(initial_scale)

    def forward(self, input_tensor):
        """Return ``input_tensor`` multiplied elementwise by the scale vector.

        The vector gets a leading axis of size one and is then broadcast
        against ``input_tensor``.
        """
        scale_row = self.scale_vector.unsqueeze(0)
        return input_tensor * scale_row

# Transformer2 Block with SVF and Expert Vectors
class Transformer2Block(nn.Module):
    """Transformer block that rescales its attention output with one of
    several expert vectors.

    The block holds ``num_experts`` expert modules. On every forward pass a
    single expert is chosen for the whole input (see ``select_expert``) and
    applied to the attention output before the first residual connection.

    Args:
        embed_size: Feature width of the inputs (attention ``embed_dim``).
        heads: Number of attention heads.
        dropout: Probability for the dropout layer shared by both sub-layers.
        forward_expansion: Hidden width of the MLP as a multiple of
            ``embed_size``.
        num_experts: Number of expert modules to create; must be at least 1.

    Raises:
        ValueError: If ``num_experts`` is smaller than 1.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion, num_experts):
        # Fail early: with no experts, select_expert() would otherwise die
        # with a ZeroDivisionError in the middle of the first forward pass.
        if num_experts < 1:
            raise ValueError(
                f"num_experts must be at least 1, got {num_experts}"
            )
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)
        self.experts = nn.ModuleList(
            [SVFExpert(embed_size) for _ in range(num_experts)]
        )

    def select_expert(self, input_tensor):
        """Pick one expert for the whole tensor from its global mean.

        The index is ``int(mean * num_experts) % num_experts``. Because the
        mean is read out with ``.item()``, the choice is a plain Python int:
        no gradient flows through the routing decision, and every position
        in ``input_tensor`` shares the same expert.

        Args:
            input_tensor: Tensor whose overall mean drives the choice.

        Returns:
            The selected module from ``self.experts``.
        """
        expert_count = len(self.experts)
        # Simple heuristic: pick an expert based on the mean value of the input
        expert_idx = int(input_tensor.mean().item() * expert_count) % expert_count
        return self.experts[expert_idx]

    def forward(self, value, key, query, mask):
        """Run expert-scaled attention and the MLP, each with a residual.

        Args:
            value: Tensor used as the attention values.
            key: Tensor used as the attention keys.
            query: Tensor used as the attention queries; also the residual
                that is added to the scaled attention output.
            mask: Forwarded to the attention layer as ``attn_mask``
                (may be ``None``).

        Returns:
            The block output after the second norm + dropout step.
        """
        # Attention weights are returned as well but are not used.
        attn_output, _ = self.attention(query, key, value, attn_mask=mask)

        # Route on the attention output itself, then rescale it.
        expert = self.select_expert(attn_output)
        scaled_attn = expert(attn_output)

        x = self.dropout(self.norm1(scaled_attn + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

# Transformer2 Model
class Transformer2Model(nn.Module):
    """Token embedding, a stack of ``Transformer2Block`` layers, and a
    linear projection back to vocabulary size.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        embed_size: Embedding / hidden width used throughout the stack.
        num_layers: How many ``Transformer2Block`` layers to stack.
        heads: Number of attention heads in every block.
        dropout: Dropout probability passed to every block.
        forward_expansion: MLP width multiplier passed to every block.
        num_experts: Number of experts passed to every block.
    """

    def __init__(self, vocab_size, embed_size, num_layers, heads, dropout, forward_expansion, num_experts):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                Transformer2Block(embed_size, heads, dropout, forward_expansion, num_experts)
            )
        self.layers = nn.ModuleList(blocks)

        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x, mask=None):
        """Map token ids to per-position vocabulary logits.

        Args:
            x: Integer tensor of token ids fed to the embedding table.
            mask: Optional attention mask handed unchanged to every block.

        Returns:
            Logits with the shape of ``x`` plus a trailing ``vocab_size``
            dimension.
        """
        hidden = self.embedding(x)
        # Self-attention: each block gets the running hidden state as its
        # value, key and query.
        for block in self.layers:
            hidden = block(hidden, hidden, hidden, mask)
        return self.fc_out(hidden)

# Hyperparameters
vocab_size, embed_size = 1000, 256
num_layers, heads = 4, 8
dropout, forward_expansion = 0.1, 4
num_experts = 3

# Build the model
model = Transformer2Model(
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_layers=num_layers,
    heads=heads,
    dropout=dropout,
    forward_expansion=forward_expansion,
    num_experts=num_experts,
)

# Dummy token ids with shape (seq_len=10, batch_size=32)
dummy_input = torch.randint(0, vocab_size, (10, 32))
output = model(dummy_input)

print("Transformer2 Model Output Shape:", output.shape)

Transformer2 Model Output Shape: torch.Size([10, 32, 1000])


### 1. **Training Expert Vectors with RL**

- **Objective**: Each expert vector is specialized for a specific task or domain (e.g., coding, math, reasoning).
  
- **Process**:
  - A smaller, more parameter-efficient model (or a component of the main model) is fine-tuned using **Reinforcement Learning (RL)**.
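  - The **Singular Value Fine-Tuning (SVF)** approach adjusts only the **singular values** of the model's weight matrices: each weight matrix is decomposed as $W = U \Sigma V^\top$, and a learned vector $z$ rescales the singular values, giving $W' = U \, (\Sigma \cdot \mathrm{diag}(z)) \, V^\top$, while $U$ and $V$ stay frozen.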
  - During training, the model interacts with the environment, receiving rewards based on performance (correct outputs, task success, etc.).
  - The RL algorithm (e.g., REINFORCE) optimizes these singular values to create a compact representation specialized for the task.

- **Result**: The fine-tuned singular values form a **task-specific expert vector** that captures the specialized behavior required for that task.

---

### 2. **How Expert Vectors are Used**

- **Modularity**: Each expert vector is stored independently and can be dynamically combined or selected during inference.

- **Dynamic Adaptation**:  
  - When a new input arrives, Transformer2 first **classifies** the task in an initial pass (for example, with a dedicated dispatch classifier or by prompting the model itself to name the task category).  
  - It then **activates the relevant expert vector(s)** to modify the model’s behavior without retraining.

- **Mixture of Experts (MoE) Strategy**:  
  - In more complex tasks, Transformer2 can combine multiple expert vectors (via weighted mixtures) for richer adaptability.

---

### 3. **Why This Is Powerful**

- **Efficiency**: Only the singular values are trained, which requires **far fewer trainable parameters** than fine-tuning the full model.  
- **Scalability**: New expert vectors can be added over time without impacting existing ones.  
- **Continual Learning**: The model can adapt to new tasks without **catastrophic forgetting**.  

---

### 🔑 **Simplified Analogy**  
Think of each **expert vector** as a **"skill module"**. You don’t need to rebuild a robot every time it learns a new task; instead, you just install a specialized tool. Similarly, Transformer2 installs specialized "skills" via these expert vectors.