In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import sqrt

# Define the differential attention mechanism
class DifferentialAttention(nn.Module):
    """Multi-head differential self-attention.

    Every head's query and key are split into two halves. Each half yields
    its own softmax attention map over the sequence, and the values are
    mixed with ``softmax(Q1 K1^T) - λ * softmax(Q2 K2^T)``, where ``λ`` is a
    single learnable scalar shared by all heads.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads. ``d_model`` must be divisible
            by it and the per-head width must be even.
        λ_init: Initial value of the learnable scalar ``λ``.

    Raises:
        ValueError: If ``d_model`` cannot be split into ``num_heads`` heads
            of even width.
    """

    def __init__(self, d_model, num_heads, λ_init=0.8):
        super(DifferentialAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError("d_model must be divisible by num_heads")
        head_dim = d_model // num_heads
        if head_dim % 2 != 0:
            raise ValueError(
                "d_model // num_heads must be even so each head can be split in two"
            )

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = head_dim
        # Each attention map contracts over *half* a head, so that half-width
        # is the dimension that sets the dot-product scale.
        self.scale = 1 / sqrt(self.head_dim // 2)

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.output = nn.Linear(d_model, d_model)

        # float() keeps the parameter floating-point even for an int λ_init.
        self.λ = nn.Parameter(torch.tensor(float(λ_init)), requires_grad=True)

    def forward(self, x):
        """Apply differential self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        B, N, D = x.shape

        # (B, N, D) -> (B, H, N, head_dim): the head axis must precede the
        # sequence axis so attention is computed across sequence positions.
        Q = self.query(x).view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(B, N, self.num_heads, self.head_dim).transpose(1, 2)

        # Two query/key groups per head; the values keep the full head width.
        Q1, Q2 = torch.chunk(Q, 2, dim=-1)
        K1, K2 = torch.chunk(K, 2, dim=-1)

        A1 = torch.einsum('bhqd,bhkd->bhqk', Q1, K1) * self.scale
        A2 = torch.einsum('bhqd,bhkd->bhqk', Q2, K2) * self.scale

        A1 = F.softmax(A1, dim=-1)
        A2 = F.softmax(A2, dim=-1)

        diff_attention = A1 - self.λ * A2

        # Contract over the key axis ``k`` so each query gets a weighted sum
        # of the values.
        output = torch.einsum('bhqk,bhkd->bhqd', diff_attention, V)
        # (B, H, N, head_dim) -> (B, N, D); reshape copes with the
        # non-contiguous result of the transpose.
        output = output.transpose(1, 2).reshape(B, N, D)

        return self.output(output)

# Define the differential transformer layer
class DifferentialTransformerLayer(nn.Module):
    """One post-norm transformer block built around differential attention.

    The block applies differential attention and then a two-layer ReLU
    feed-forward network. Each sub-layer's output goes through dropout, is
    added to its input, and the sum is layer-normalised.

    Args:
        d_model: Feature width of the block's input and output.
        num_heads: Number of heads handed to ``DifferentialAttention``.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Drop probability applied to each sub-layer's output.
    """

    def __init__(self, d_model, num_heads, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.diff_attention = DifferentialAttention(d_model, num_heads)

        # Built in the same order as they appear in the Sequential so that
        # seeded initialisation is unaffected.
        expand = nn.Linear(d_model, dim_feedforward)
        activation = nn.ReLU()
        project = nn.Linear(dim_feedforward, d_model)
        self.ffn = nn.Sequential(expand, activation, project)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the attention and feed-forward sub-layers on ``x``."""
        # Attention sub-layer: residual add, then normalise.
        attended = self.dropout(self.diff_attention(x))
        x = self.norm1(x + attended)

        # Feed-forward sub-layer: residual add, then normalise.
        transformed = self.dropout(self.ffn(x))
        return self.norm2(x + transformed)

# Define the full transformer model with differential layers
class DifferentialTransformer(nn.Module):
    """Token-level model: embedding, a stack of differential layers, vocab projection.

    Token ids are embedded, passed through ``num_layers``
    ``DifferentialTransformerLayer`` blocks in order, and projected to
    per-token scores over the vocabulary. No positional information is added
    to the embeddings.

    Args:
        num_layers: Number of stacked differential transformer layers.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        dim_feedforward: Hidden width of each layer's feed-forward network.
        dropout: Drop probability used inside each layer.
        vocab_size: Number of distinct token ids; sizes both the embedding
            table and the output projection. Defaults to 10000, the value
            that used to be hard-coded.
    """

    def __init__(self, num_layers, d_model, num_heads, dim_feedforward=2048, dropout=0.1,
                 vocab_size=10000):
        super(DifferentialTransformer, self).__init__()
        self.layers = nn.ModuleList([
            DifferentialTransformerLayer(d_model, num_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])
        # Embedding table and output head share one vocabulary size so they
        # cannot drift apart.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids to vocabulary scores.

        Args:
            x: Integer tensor of token ids; the demo in this file passes
                shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(*x.shape, vocab_size)``.
        """
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        x = self.fc_out(x)
        return x

# Example usage in Google Colab
if __name__ == "__main__":
    # Demo: a 4-layer differential transformer, 512 features wide, 8 heads.
    # Arguments are positional in the order (num_layers, d_model, num_heads).
    model = DifferentialTransformer(4, 512, 8)

    # A random batch of token ids: 16 sequences of 100 tokens, ids in [0, 10000).
    input_data = torch.randint(low=0, high=10000, size=(16, 100))

    # One forward pass; the result holds one score per vocabulary entry for
    # every token, i.e. shape (batch_size, sequence_length, vocab_size).
    output = model(input_data)
    print(output.shape)


torch.Size([16, 100, 10000])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import sqrt

# Multi-head attention mechanism
class MultiHeadAttention(nn.Module):
    """Standard scaled dot-product multi-head attention.

    Args:
        d_model: Feature width of queries, keys, values and the output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.scale = 1 / sqrt(self.head_dim)

        # Define query, key, value, and output projection layers
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.output = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` / ``v``.

        Args:
            q: Query tensor of shape ``(batch, q_len, d_model)``.
            k: Key tensor of shape ``(batch, kv_len, d_model)``.
            v: Value tensor of shape ``(batch, kv_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, q_len, kv_len)``. Positions where it
                equals 0 are excluded from attention. A query row with every
                position masked yields NaN, because its softmax is taken
                over ``-inf`` only.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        B, N, D = q.shape  # Batch size, query length, embedding dimension
        # Keys/values may have a different length than the queries
        # (cross-attention), so reshape them with their own length.
        S = k.shape[1]

        q = self.query(q).view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.key(k).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.value(v).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn_weights = torch.einsum("bhqd,bhkd->bhqk", q, k) * self.scale
        if mask is not None:
            attn_weights = attn_weights.masked_fill(mask == 0, float('-inf'))
        attn_weights = F.softmax(attn_weights, dim=-1)

        # The key axis ``k`` must be shared by both operands so the values
        # are combined as a weighted sum; with distinct labels einsum would
        # sum the weights and the values independently.
        output = torch.einsum("bhqk,bhkd->bhqd", attn_weights, v)
        output = output.transpose(1, 2).contiguous().view(B, N, D)

        return self.output(output)

# Position-wise feedforward network
class FeedForward(nn.Module):
    """Position-wise MLP: expand to ``d_ff``, ReLU, dropout, project back.

    Args:
        d_model: Width of the input and output features.
        d_ff: Width of the hidden layer.
        dropout: Drop probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``fc2(dropout(relu(fc1(x))))``."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)

# Transformer Encoder Layer
class TransformerEncoderLayer(nn.Module):
    """Post-norm encoder block: self-attention then a position-wise feed-forward net.

    Each sub-layer's output goes through dropout, is added to its input, and
    the sum is layer-normalised.

    Args:
        d_model: Feature width of the block's input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Drop probability used for both residual branches and for the
            hidden activations inside the feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        # Forward the configured rate; otherwise the feed-forward net keeps
        # its own default and ignores e.g. dropout=0.0.
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: Input features; used as query, key and value of the attention.
            mask: Optional attention mask passed through to the attention
                module unchanged.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Apply multi-head self-attention
        attn_output = self.attention(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)

        # Apply feedforward network
        ffn_output = self.feed_forward(x)
        x = x + self.dropout(ffn_output)
        x = self.norm2(x)

        return x

# Full Transformer Encoder with multiple layers
class TransformerEncoder(nn.Module):
    """Token encoder: embeddings plus learned positions, a layer stack, vocab head.

    Args:
        num_layers: Number of stacked ``TransformerEncoderLayer`` blocks.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        vocab_size: Number of token ids; sizes the embedding table and the
            output projection.
        max_seq_len: Number of rows in the learned positional table.
        dropout: Drop probability for the embedded input and for each layer.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, vocab_size, max_seq_len, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional table with a leading broadcast axis for the batch.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_seq_len, d_model))

        stack = nn.ModuleList()
        for _ in range(num_layers):
            stack.append(TransformerEncoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = stack

        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Encode token ids and return per-token vocabulary scores.

        Args:
            x: 2-D tensor of token ids, ``(batch, seq_len)``. ``seq_len``
                should not exceed ``max_seq_len``; beyond that the sliced
                positional table no longer matches the token embeddings.
            mask: Optional attention mask forwarded to every layer.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        _, seq_len = x.shape

        # Only the first ``seq_len`` positions of the table are used.
        positions = self.pos_embedding[:, :seq_len, :]
        hidden = self.dropout(self.embedding(x) + positions)

        # Pass through each encoder layer in order.
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)

        return self.fc_out(hidden)

# Example usage in Google Colab
if __name__ == "__main__":
    # Hyperparameters for the demo model.
    num_layers, d_model, num_heads = 4, 512, 8
    d_ff = 2048
    vocab_size = 10000
    max_seq_len = 100
    dropout = 0.1

    # Build the encoder, naming every argument explicitly.
    model = TransformerEncoder(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        d_ff=d_ff,
        vocab_size=vocab_size,
        max_seq_len=max_seq_len,
        dropout=dropout,
    )

    # Random token ids: 16 sequences, each filling the full positional table.
    input_data = torch.randint(0, vocab_size, (16, max_seq_len))

    # One forward pass; the result has shape
    # (batch_size, sequence_length, vocab_size).
    output = model(input_data)
    print(output.shape)

torch.Size([16, 100, 10000])
