# Algorithm 13: Diffusion Transformer (Boltz)

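A NumPy walkthrough of the transformer blocks used for diffusion-based structure prediction. All weights are sampled at random on every call, so the code illustrates shapes and data flow rather than trained behavior.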

## Source Code Location
- **File**: `Boltz-Ref-src/boltz-official/src/boltz/model/modules/transformers.py`
- **Class**: `DiffusionTransformer`

In [None]:
import numpy as np
np.random.seed(42)

def layer_norm(x, eps=1e-5):
    """Standardize ``x`` along its last axis (no learned gain or bias).

    Args:
        x: Array to normalize; statistics are taken over the last axis.
        eps: Constant added to the variance before the square root to
            avoid division by zero.

    Returns:
        Array of the same shape as ``x`` with (approximately) zero mean and
        unit variance along the last axis.
    """
    centered = x - np.mean(x, axis=-1, keepdims=True)
    spread = np.sqrt(np.var(x, axis=-1, keepdims=True) + eps)
    return centered / spread

def softmax(x, axis=-1):
    """Numerically stable softmax along ``axis``.

    The per-slice maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the result.

    Args:
        x: Array of scores.
        axis: Axis along which the outputs sum to one.

    Returns:
        Array of the same shape as ``x`` holding the normalized weights.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    weights = np.exp(shifted)
    normalizer = np.sum(weights, axis=axis, keepdims=True)
    return weights / normalizer

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``.

    The input is clipped to ``[-500, 500]`` before exponentiating so that
    ``np.exp`` cannot overflow for extreme values.
    """
    bounded = np.clip(x, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def swish(x):
    """Swish activation: the input multiplied by its own sigmoid gate."""
    gate = sigmoid(x)
    return x * gate

In [None]:
def adaptive_layer_norm(x, cond):
    """Adaptive LayerNorm conditioned on noise level.

    ``x`` is layer-normalized over its last axis, then modulated by a scale
    and a shift obtained from a linear projection of ``cond``. The projection
    matrix is freshly sampled from ``np.random`` on every call, so the result
    depends on the global NumPy RNG state.

    Args:
        x: Features to normalize; the last axis is the channel axis.
        cond: Conditioning features; must be matrix-multipliable with a
            ``[cond.shape[-1], 2 * x.shape[-1]]`` projection and broadcast
            against ``x``.

    Returns:
        ``(1 + scale) * layer_norm(x) + shift``.
    """
    n_channels = x.shape[-1]
    cond_dim = cond.shape[-1]
    normalized = layer_norm(x)

    # One random projection yields both modulation terms, stacked on the last axis.
    projection = np.random.randn(cond_dim, 2 * n_channels) * (cond_dim ** -0.5)
    scale, shift = np.split(cond @ projection, 2, axis=-1)

    modulated = (1 + scale) * normalized
    return modulated + shift

In [None]:
def diffusion_transformer_block(x, z, t_emb, num_heads=8, c=32):
    """
    Apply one Diffusion Transformer block.

    The block has two residual sub-layers: multi-head self-attention whose
    logits receive a bias projected from the pair representation, followed by
    a gated (swish) transition with a 4x hidden expansion. Both sub-layers
    are preceded by an adaptive LayerNorm driven by ``t_emb``.

    All projection weights are sampled from ``np.random`` on every call, so
    the output depends on the global NumPy RNG state. The input ``x`` is not
    modified in place.

    Args:
        x: Token features [N, c_x]
        z: Pair representation [N, N, c_z]
        t_emb: Noise level embedding [c_t]
        num_heads: Number of attention heads
        c: Head dimension

    Returns:
        Updated x [N, c_x]
    """
    n_tokens, token_dim = x.shape
    pair_dim = z.shape[-1]
    attn_dim = num_heads * c
    ff_dim = token_dim * 4
    token_init = token_dim ** -0.5

    # The same noise embedding conditions every token, so repeat it per row.
    cond = np.tile(t_emb, (n_tokens, 1))

    # ---- Attention sub-layer ----
    tokens = adaptive_layer_norm(x, cond)
    pairs = layer_norm(z)

    # Weight draws happen in a fixed order: q, k, v, pair-bias.
    proj_q = np.random.randn(token_dim, num_heads, c) * token_init
    proj_k = np.random.randn(token_dim, num_heads, c) * token_init
    proj_v = np.random.randn(token_dim, num_heads, c) * token_init
    proj_bias = np.random.randn(pair_dim, num_heads) * (pair_dim ** -0.5)

    queries = np.einsum('ic,chd->ihd', tokens, proj_q)
    keys = np.einsum('jc,chd->jhd', tokens, proj_k)
    values = np.einsum('jc,chd->jhd', tokens, proj_v)
    pair_bias = np.einsum('ijc,ch->ijh', pairs, proj_bias)

    # Logits are laid out [query i, key j, head h]; normalize over keys (axis 1).
    logits = np.einsum('ihd,jhd->ijh', queries, keys) / np.sqrt(c) + pair_bias
    weights = softmax(logits, axis=1)

    # Mix values per head, then flatten the heads into one feature axis.
    attended = np.einsum('ijh,jhd->ihd', weights, values)
    attended = attended.reshape(n_tokens, -1)

    proj_out = np.random.randn(attn_dim, token_dim) * (attn_dim ** -0.5)
    x = x + attended @ proj_out

    # ---- Transition sub-layer ----
    tokens = adaptive_layer_norm(x, cond)
    proj_up = np.random.randn(token_dim, ff_dim * 2) * token_init
    expanded = tokens @ proj_up
    # First half feeds the swish gate, second half is the gated value.
    gate_in, value = np.split(expanded, 2, axis=-1)
    gated = swish(gate_in) * value

    proj_down = np.random.randn(ff_dim, token_dim) * (ff_dim ** -0.5)
    return x + gated @ proj_down

In [None]:
def diffusion_transformer(x, z, t_emb, num_blocks=24):
    """
    Full Diffusion Transformer.

    Runs ``num_blocks`` transformer blocks in sequence, feeding each block's
    output tokens into the next while ``z`` and ``t_emb`` stay fixed. A header
    is printed first, and the norm of ``x`` is printed after every sixth block.

    Args:
        x: Token features passed to the first block.
        z: Pair representation shared by all blocks.
        t_emb: Noise level embedding shared by all blocks.
        num_blocks: Number of blocks to apply.

    Returns:
        Token features after the final block.
    """
    print(f"Diffusion Transformer ({num_blocks} blocks)")
    print("=" * 50)

    for block_number in range(1, num_blocks + 1):
        x = diffusion_transformer_block(x, z, t_emb)
        if block_number % 6 != 0:
            continue
        print(f"  Block {block_number}: x_norm={np.linalg.norm(x):.2f}")

    return x

In [None]:
# Smoke test: run a small stack on random inputs and check that the output
# has a sensible shape and contains only finite values.
print("Test: Diffusion Transformer")
print("="*60)

# Toy problem sizes: token count and the widths of the token features,
# pair features and noise-level embedding.
N = 24
c_x = 64
c_z = 32
c_t = 64

# Random inputs; the token and pair features are scaled down by 0.1.
x = np.random.randn(N, c_x) * 0.1
z = np.random.randn(N, N, c_z) * 0.1
t_emb = np.random.randn(c_t)

# Use 6 blocks here instead of the function's default of 24.
x_out = diffusion_transformer(x, z, t_emb, num_blocks=6)

print(f"\nOutput: {x_out.shape}")
print(f"Finite: {np.isfinite(x_out).all()}")

## Key Insights

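1. **Adaptive LayerNorm**: LayerNorm whose scale and shift are computed from the noise-level embedding
2. **Pair Bias**: The pair representation is projected to a per-head bias that is added to the attention logits
3. **24 Blocks**: Default depth (`num_blocks=24`) for structure refinement; the test above runs only 6 blocks
4. **SwiGLU FFN**: Gated transition, `swish(a) * b`, with a 4x hidden expansion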