# Algorithm 12: Triangle Attention Ending Node (AlphaFold3)

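Triangle attention around the ending node: column-wise self-attention over the pair representation, in which edge (i, j) attends over the edges (k, j) that share its ending node j.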

## Source Code Location
- **File**: `AF3-Ref-src/alphafold3-official/src/alphafold3/model/network/modules.py`

In [None]:
import numpy as np
# Seed NumPy's global RNG so every np.random draw made after this point is
# reproducible from run to run.
np.random.seed(42)

def layer_norm(x, eps=1e-5):
    """Normalize ``x`` to zero mean and unit variance along its last axis.

    No learnable scale or offset is applied.

    Args:
        x: Array-like input; statistics are computed over the last axis.
        eps: Constant added to the variance before the square root to
            avoid division by zero.

    Returns:
        Array with the same shape as ``x``.
    """
    centered = x - np.mean(x, axis=-1, keepdims=True)
    variance = np.var(x, axis=-1, keepdims=True)
    return centered / np.sqrt(variance + eps)

def softmax(x, axis=-1):
    """Compute a softmax along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that
    large inputs do not overflow.

    Args:
        x: Array-like logits.
        axis: Axis over which the result sums to one.

    Returns:
        Array with the same shape as ``x``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=axis, keepdims=True)

def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    The input is clipped to ``[-500, 500]`` before exponentiating so the
    exponential cannot overflow.
    """
    clipped = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-clipped))

In [None]:
def triangle_attention_ending(z, num_heads=4, c=32):
    """
    Triangle attention around the ending node (column-wise gated attention).

    For edge (i, j) the query comes from (i, j) itself and the keys/values
    come from the edges (k, j) that share the ending node j. The triangle
    i-j-k is closed by a per-head bias taken from the third edge (k, i):

        a[i, j, k] = softmax_k(q[i, j] . k[k, j] / sqrt(c) + b[k, i])
        o[i, j]    = g[i, j] * sum_k a[i, j, k] * v[k, j]

    All projection weights are drawn from NumPy's global RNG on every call,
    so the result depends on the current RNG state. The function also prints
    a short summary of the input and output shapes.

    Args:
        z: Pair representation [N, N, c_z]
        num_heads: Number of attention heads
        c: Head dimension

    Returns:
        Update to pair representation [N, N, c_z]

    Raises:
        ValueError: If z is not a square [N, N, c_z] array. The bias b[k, i]
            is indexed by two row indices, so both leading axes must match.
    """
    if z.ndim != 3 or z.shape[0] != z.shape[1]:
        raise ValueError(
            f"z must have shape [N, N, c_z], got {tuple(z.shape)}"
        )

    N = z.shape[0]
    c_z = z.shape[-1]

    print("Triangle Attention (Ending Node)")
    print("=" * 50)
    print(f"Pair: [{N}, {N}, {c_z}]")

    z_norm = layer_norm(z)

    # Random demo weights. The draw order (q, k, v, b, g, then o further
    # down) is kept fixed so a seeded run consumes the RNG stream the same way.
    W_q = np.random.randn(c_z, num_heads, c) * (c_z ** -0.5)
    W_k = np.random.randn(c_z, num_heads, c) * (c_z ** -0.5)
    W_v = np.random.randn(c_z, num_heads, c) * (c_z ** -0.5)
    W_b = np.random.randn(c_z, num_heads) * (c_z ** -0.5)
    W_g = np.random.randn(c_z, num_heads, c) * (c_z ** -0.5)

    q = np.einsum('ijc,chd->ijhd', z_norm, W_q)
    k = np.einsum('ijc,chd->ijhd', z_norm, W_k)
    v = np.einsum('ijc,chd->ijhd', z_norm, W_v)
    b = np.einsum('ijc,ch->ijh', z_norm, W_b)
    g = sigmoid(np.einsum('ijc,chd->ijhd', z_norm, W_g))

    # logits[i, j, k, h] = <q[i, j, h], k[k, j, h]> / sqrt(c): within column j,
    # query row i is scored against every key row k.
    attn_logits = np.einsum('ijhd,kjhd->ijkh', q, k) / np.sqrt(c)

    # Triangle bias from the third edge: bias[i, :, k, h] = b[k, i, h].
    # It depends on the query row i and key row k and is shared across all
    # columns j, hence the broadcast axis sits in the j position.
    triangle_bias = np.einsum('kih->ikh', b)[:, None, :, :]
    attn_logits = attn_logits + triangle_bias

    # Normalize over the key index k.
    attn_weights = softmax(attn_logits, axis=2)

    # o[i, j, h] = sum_k a[i, j, k, h] * v[k, j, h], then per-channel gating.
    attended = np.einsum('ijkh,kjhd->ijhd', attn_weights, v)
    attended = attended * g

    # Merge heads and project back to the pair channel dimension.
    W_o = np.random.randn(num_heads, c, c_z) * ((num_heads * c) ** -0.5)
    output = np.einsum('ijhd,hdc->ijc', attended, W_o)

    print(f"Output: {output.shape}")

    return output

In [None]:
# Smoke test: run the ending-node triangle attention on a small random pair
# representation and report whether every output value is finite.
print("Test: Triangle Attention Ending Node")
print("=" * 60)

N, c_z = 24, 64

# Small-magnitude random pair representation of shape [N, N, c_z].
z = 0.1 * np.random.randn(N, N, c_z)

output = triangle_attention_ending(z, c=16, num_heads=4)

print(f"\nOutput finite: {np.all(np.isfinite(output))}")

## Key Insights

1. **Column-wise Attention**: For each column j, edge (i, j) attends over all rows k, i.e. over the edges (k, j)
2. **Ending Node**: The "j" index is fixed (ending point of edges)
3. **Transpose Trick**: Transpose, compute, transpose back
4. **Complementary**: Together with the starting-node (row-wise) variant, it covers both attention directions