In [5]:
"""
Attention Mechanism: Attention × Value = Output
=================================================
This demonstrates how attention weights are multiplied by the Value matrix
to produce the final output, as shown in the diagram.

The example uses 3 tokens: ["Me", "against", "the"]
"""

import numpy as np

def attention_value_example():
    """Print a worked example of ``Output = Attention × Value``.

    The walkthrough is written to standard output in four parts:

    1. a hand-written 3×3 attention-weight matrix for the tokens
       "Me", "against", "the";
    2. a (seq_length × d_v) value matrix of standard-normal samples and the
       product ``attention_weights @ V``, cross-checked against an explicit
       weighted sum for the first token;
    3. a miniature example with 4-dimensional value vectors that is small
       enough to verify by hand;
    4. the summation formula behind the matrix product.

    Takes no arguments and returns ``None``.

    The random value matrix is drawn from a private ``RandomState(42)``
    instead of reseeding NumPy's global generator, so the printed numbers
    are reproducible and the caller's global random state is left untouched.
    """

    def fmt(vec):
        # Render a 1-D vector as "1.0, 2.0, ..." with one decimal place.
        return ", ".join(f"{x:.1f}" for x in vec)

    rule = "=" * 70
    n_shown = 10  # leading value dimensions echoed for the wide matrices

    print(rule)
    print("Attention Mechanism: Attention × Value = Output")
    print(rule)

    # Setup
    seq_length = 3  # 3 tokens: "Me", "against", "the"
    d_v = 64  # Value dimension (matching d_k typically)

    # Step 1: Attention weights (after softmax of QK^T / √d_k)
    # Shape: (seq_length, seq_length)
    # Each row shows how much one token attends to all tokens; rows sum to 1.
    attention_weights = np.array([
        [0.7, 0.2, 0.1],   # "Me" attends mostly to itself
        [0.3, 0.5, 0.2],   # "against" attends to all three
        [0.1, 0.3, 0.6],   # "the" attends mostly to itself
    ])

    print(f"\n1. Attention Weights ({seq_length}×{seq_length}):")
    print("   Rows: Query tokens ['Me', 'against', 'the']")
    print("   Cols: Key tokens ['Me', 'against', 'the']")
    print(f"\n{attention_weights}")
    # The interpretation below restates the literal matrix above in words.
    print("\n   Interpretation:")
    print("   - Row 0: 'Me' pays 70% attention to 'Me', 20% to 'against', 10% to 'the'")
    print("   - Row 1: 'against' pays 30% to 'Me', 50% to 'against', 20% to 'the'")
    print("   - Row 2: 'the' pays 10% to 'Me', 30% to 'against', 60% to 'the'")

    # Step 2: Value matrix
    # Shape: (seq_length, d_v); each row is the value vector of one token.
    # A local generator seeded with 42 yields the same samples as
    # np.random.seed(42) + np.random.randn(...) without touching global state.
    rng = np.random.RandomState(42)
    V = rng.randn(seq_length, d_v)

    print(f"\n2. Value Matrix V ({seq_length}×{d_v}):")
    print(f"   Each row is the {d_v}-dim value vector for a token")
    print(f"\n{V[:, :n_shown].round(2)}  ... (showing first {n_shown} of {d_v} dimensions)")

    # Step 3: Compute Output = Attention × Value
    # Shape: (seq_length, seq_length) × (seq_length, d_v) = (seq_length, d_v)
    output = attention_weights @ V

    print(f"\n3. Output = Attention × Value ({seq_length}×{d_v}):")
    print("   Each row is the weighted sum of value vectors")
    print(f"\n{output[:, :n_shown].round(2)}  ... (showing first {n_shown} of {d_v} dimensions)")

    # Detailed breakdown for the first token
    print("\n" + rule)
    print("DETAILED BREAKDOWN: How output for 'Me' (first token) is computed")
    print(rule)

    print("\nOutput[0] = weighted sum of all value vectors:")
    print(f"  = {attention_weights[0, 0]:.1f} × V[0] (Me)")
    print(f"  + {attention_weights[0, 1]:.1f} × V[1] (against)")
    print(f"  + {attention_weights[0, 2]:.1f} × V[2] (the)")

    # Explicit weighted sum, used to verify row 0 of the matrix product.
    manual_output_0 = (attention_weights[0, 0] * V[0]
                       + attention_weights[0, 1] * V[1]
                       + attention_weights[0, 2] * V[2])

    print(f"\nFirst {n_shown} dimensions of Output[0]:")
    print(f"  Computed: {output[0, :n_shown].round(3)}")
    print(f"  Manual:   {manual_output_0[:n_shown].round(3)}")
    print(f"  Match: {np.allclose(output[0], manual_output_0)}")

    # Miniature example: small enough to check every number by hand.
    V_mini = np.array([
        [1.0, 2.0, 3.0, 4.0],    # Value for "Me"
        [0.5, 1.5, 2.5, 3.5],    # Value for "against"
        [2.0, 1.0, 4.0, 3.0],    # Value for "the"
    ])
    mini_rows, mini_dim = V_mini.shape

    print("\n" + rule)
    print(f"MINI EXAMPLE: With d_v={mini_dim} for easy visualization")
    print(rule)

    print(f"\nValue Matrix ({mini_rows}×{mini_dim}):")
    print(V_mini)

    print(f"\nAttention Weights ({seq_length}×{seq_length}):")
    print(attention_weights)

    output_mini = attention_weights @ V_mini

    print(f"\nOutput = Attention × Value ({seq_length}×{mini_dim}):")
    print(output_mini.round(2))

    # The hand calculation is rendered from the arrays themselves so the
    # text cannot drift from the data it describes.
    first_row_weights = attention_weights[0]
    print("\n\nManual calculation for Output[0] (token 'Me'):")
    print(f"  Output[0] = {first_row_weights[0]:.1f}×[{fmt(V_mini[0])}]")
    print(f"            + {first_row_weights[1]:.1f}×[{fmt(V_mini[1])}]")
    print(f"            + {first_row_weights[2]:.1f}×[{fmt(V_mini[2])}]")
    print(f"            = [{fmt(output_mini[0])}]")

    # Summary with equation
    print("\n" + rule)
    print("MATHEMATICAL FORMULA")
    print(rule)
    print("\nFor each output position i:")
    print("  Output_i = Σ(k=1 to seq_length) Attention_ik × Value_k")
    print("\nWhere:")
    print("  - Attention_ik = how much position i attends to position k")
    print("  - Value_k = the value vector at position k")
    print("  - The sum creates a weighted combination of all value vectors")

def complete_attention_flow():
    """Print every stage of scaled dot-product attention on a tiny example.

    Uses fixed 3×4 matrices Q, K and V and prints, in order:

    1. the raw scores ``Q @ K.T``;
    2. the scores divided by ``√d_k``;
    3. the row-wise softmax of the scaled scores (the attention weights);
    4. the final output ``attention @ V``.

    Takes no arguments and returns ``None``; all results go to standard
    output.
    """
    rule = "=" * 70

    print("\n\n" + rule)
    print("COMPLETE ATTENTION FLOW: Q, K, V → Output")
    print(rule)

    # Input representations: one row per token, one column per feature.
    Q = np.array([
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 1.0],
        [1.0, 1.0, 0.0, 0.0],
    ])

    K = np.array([
        [1.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0, 0.0],
        [1.0, 0.0, 1.0, 1.0],
    ])

    V = np.array([
        [1.0, 2.0, 3.0, 4.0],
        [0.5, 1.5, 2.5, 3.5],
        [2.0, 1.0, 4.0, 3.0],
    ])

    # The scaling dimension is read from Q so it always matches the
    # query/key width actually used in the dot products.
    d_k = Q.shape[-1]

    print("\nStep 1: Compute attention scores (QK^T)")
    scores = Q @ K.T
    print(f"Scores shape: {scores.shape}")
    print(scores)

    print("\nStep 2: Scale by √d_k")
    scaled_scores = scores / np.sqrt(d_k)
    print(scaled_scores.round(2))

    print("\nStep 3: Apply softmax to get attention weights")
    # Subtracting each row's maximum before exponentiating leaves the
    # softmax result unchanged but keeps np.exp from overflowing.
    row_max = np.max(scaled_scores, axis=1, keepdims=True)
    exp_scores = np.exp(scaled_scores - row_max)
    attention = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    print(attention.round(3))

    print("\nStep 4: Multiply attention weights by Value matrix")
    output = attention @ V
    print(f"Output shape: {output.shape}")
    print(output.round(2))

    print("\n✓ This is the complete Scaled Dot-Product Attention!")
    print("  Formula: Attention(Q, K, V) = softmax(QK^T / √d_k) × V")

# Entry point: run both demonstrations, in order, when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    attention_value_example()
    complete_attention_flow()

Attention Mechanism: Attention × Value = Output

1. Attention Weights (3×3):
   Rows: Query tokens ['Me', 'against', 'the']
   Cols: Key tokens ['Me', 'against', 'the']

[[0.7 0.2 0.1]
 [0.3 0.5 0.2]
 [0.1 0.3 0.6]]

   Interpretation:
   - Row 0: 'Me' pays 70% attention to 'Me', 20% to 'against', 10% to 'the'
   - Row 1: 'against' pays 30% to 'Me', 50% to 'against', 20% to 'the'
   - Row 2: 'the' pays 10% to 'Me', 30% to 'against', 60% to 'the'

2. Value Matrix V (3×64):
   Each row is the 64-dim value vector for a token

[[ 0.5  -0.14  0.65  1.52 -0.23 -0.23  1.58  0.77 -0.47  0.54]
 [ 0.81  1.36 -0.07  1.    0.36 -0.65  0.36  1.54 -0.04  1.56]
 [ 0.1  -0.5  -1.55  0.07 -1.06  0.47 -0.92  1.55 -0.78 -0.32]]  ... (showing first 10 of 64 dimensions)

3. Output = Attention × Value (3×64):
   Each row is the weighted sum of value vectors

[[ 0.52  0.12  0.28  1.27 -0.2  -0.25  1.09  1.   -0.41  0.66]
 [ 0.58  0.54 -0.15  0.97 -0.1  -0.3   0.47  1.31 -0.32  0.88]
 [ 0.35  0.09 -0.89  0.49

In [4]:
"""
QK^T Matrix Operation Example
==============================
This demonstrates the matrix multiplication of Q and K transpose,
commonly used in attention mechanisms (e.g., Transformer models).
"""

import numpy as np

def qk_transpose_operation(seed=None):
    """Print three demonstrations of the ``Q @ K.T`` matrix product.

    1. A basic product of two fixed 2×3 integer matrices.
    2. A self-attention style example: random 4×3 query/key matrices, their
       raw scores, the scores scaled by ``√d_k``, and the row-wise softmax.
    3. A comparison of ``@``, ``np.matmul`` and ``np.dot`` on 2×2 inputs.

    Args:
        seed: Optional seed for the random matrices of Example 2.  When
            ``None`` (the default) the samples come from NumPy's global
            random state, so the output varies between runs unless the
            caller has seeded it.  When given, a private
            ``np.random.RandomState(seed)`` is used, which makes the printed
            output reproducible and leaves the global state untouched.

    Returns:
        None.  All results are written to standard output.
    """
    rule = "=" * 50

    # Example 1: Simple 2x3 matrices
    print(rule)
    print("Example 1: Basic QK^T Operation")
    print(rule)

    Q = np.array([
        [1, 2, 3],
        [1, 2, 1],
    ])

    K = np.array([
        [1, 2, 1],
        [2, 1, 3],
    ])

    print("\nMatrix Q (2x3):")
    print(Q)

    print("\nMatrix K (2x3):")
    print(K)

    print("\nMatrix K^T (3x2):")
    K_transpose = K.T
    print(K_transpose)

    print("\nResult of QK^T (2x2):")
    result = Q @ K_transpose  # or np.matmul(Q, K.T)
    print(result)

    # Example 2: Attention mechanism context
    print("\n" + rule)
    print("Example 2: Attention Mechanism (Self-Attention)")
    print(rule)

    # Simulating queries, keys with sequence length 4, embedding dimension 3
    seq_length = 4
    d_k = 3  # key/query dimension

    # Fall back to the module-level generator when no seed is supplied so
    # that calls without arguments behave exactly as they always have.
    rng = np.random if seed is None else np.random.RandomState(seed)
    Q_attention = rng.randn(seq_length, d_k)
    K_attention = rng.randn(seq_length, d_k)

    print(f"\nQuery matrix Q ({seq_length}x{d_k}):")
    print(Q_attention.round(2))

    print(f"\nKey matrix K ({seq_length}x{d_k}):")
    print(K_attention.round(2))

    print(f"\nAttention scores (QK^T) ({seq_length}x{seq_length}):")
    attention_scores = Q_attention @ K_attention.T
    print(attention_scores.round(2))

    # Scaled attention scores (as in Transformer)
    print("\nScaled attention scores (QK^T / √d_k):")
    scaled_scores = attention_scores / np.sqrt(d_k)
    print(scaled_scores.round(2))

    # Apply softmax to get attention weights.  Subtracting the row maximum
    # first does not change the result but prevents overflow in np.exp.
    print("\nAttention weights (softmax of scaled scores):")
    row_max = np.max(scaled_scores, axis=1, keepdims=True)
    exp_scores = np.exp(scaled_scores - row_max)
    attention_weights = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    print(attention_weights.round(3))

    # Example 3: Different methods to compute QK^T
    print("\n" + rule)
    print("Example 3: Different Ways to Compute QK^T")
    print(rule)

    Q_test = np.array([[1, 2], [3, 4]])
    K_test = np.array([[5, 6], [7, 8]])

    print("\nMethod 1: Using @ operator")
    result1 = Q_test @ K_test.T
    print(result1)

    print("\nMethod 2: Using np.matmul()")
    result2 = np.matmul(Q_test, K_test.T)
    print(result2)

    print("\nMethod 3: Using np.dot()")
    result3 = np.dot(Q_test, K_test.T)
    print(result3)

    print("\nAll methods produce the same result:",
          np.allclose(result1, result2) and np.allclose(result2, result3))

# Entry point: run the QK^T demonstration when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    qk_transpose_operation()

Example 1: Basic QK^T Operation

Matrix Q (2x3):
[[1 2 3]
 [1 2 1]]

Matrix K (2x3):
[[1 2 1]
 [2 1 3]]

Matrix K^T (3x2):
[[1 2]
 [2 1]
 [1 3]]

Result of QK^T (2x2):
[[ 8 13]
 [ 6  7]]

Example 2: Attention Mechanism (Self-Attention)

Query matrix Q (4x3):
[[-0.07 -1.43 -0.21]
 [ 0.13  1.43  0.9 ]
 [ 0.05 -0.58 -1.57]
 [-0.84  0.49 -0.58]]

Key matrix K (4x3):
[[-0.11 -0.04  0.79]
 [-1.68 -0.42  0.59]
 [-0.59  0.77 -0.67]
 [ 0.2   0.02  0.16]]

Attention scores (QK^T) (4x4):
[[-0.11  0.59 -0.92 -0.08]
 [ 0.65 -0.3   0.43  0.2 ]
 [-1.23 -0.76  0.57 -0.25]
 [-0.38  0.87  1.27 -0.25]]

Scaled attention scores (QK^T / √d_k):
[[-0.06  0.34 -0.53 -0.05]
 [ 0.38 -0.17  0.25  0.12]
 [-0.71 -0.44  0.33 -0.14]
 [-0.22  0.5   0.73 -0.14]]

Attention weights (softmax of scaled scores):
[[0.241 0.362 0.151 0.246]
 [0.309 0.179 0.273 0.239]
 [0.145 0.191 0.409 0.255]
 [0.149 0.306 0.385 0.16 ]]

Example 3: Different Ways to Compute QK^T

Method 1: Using @ operator
[[17 23]
 [39 53]]

Method 2: Usi