# Project 3: Positioning in 3D (RoPE Animator) - SOLUTION
## Implementing and Visualizing Positional Embeddings

**This notebook contains complete solutions to all tasks.**

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

torch.manual_seed(42)
print(f"PyTorch version: {torch.__version__}")

## Part 1: Sinusoidal Positional Embedding - SOLUTION

In [None]:
class SinusoidalPositionalEmbedding(nn.Module):
    """
    Fixed sinusoidal position encoding from 'Attention is All You Need'.

    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    The table for positions 0..max_len-1 is computed once in the constructor
    and stored as a buffer named ``pe`` with shape [1, max_len, d_model], so
    it has no trainable parameters. Both even and odd ``d_model`` are
    supported.

    Args:
        d_model: Size of the embedding dimension.
        max_len: Number of positions to precompute; ``forward`` expects
            inputs whose sequence length does not exceed this value.
    """
    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()

        # Position indices [0, 1, ..., max_len-1] as a column vector.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # div_term = 1 / (10000 ^ (2i / d_model)), one entry per sin/cos pair.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        # Angle for every (position, frequency): [max_len, ceil(d_model / 2)].
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)

        # Even indices take the sine of every frequency.
        pe[:, 0::2] = torch.sin(angles)

        # Odd indices take the cosine. When d_model is odd there is one fewer
        # odd slot than frequencies, so the last frequency has no cosine column.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as buffer (not a parameter)
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Add the precomputed encodings for the first seq_len positions to x.

        Args:
            x: [batch_size, seq_len, d_model]
        Returns:
            x + positional encodings, same shape as x
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

# Smoke test: push a random batch through the module and report the shapes.
# d_model and test_input are module-level names reused by later cells.
d_model = 64
sinusoidal_pe = SinusoidalPositionalEmbedding(d_model)
test_input = torch.randn(2, 10, d_model)  # batch=2, seq_len=10
output = sinusoidal_pe(test_input)
print(f"Input shape: {test_input.shape}")
print(f"Output shape: {output.shape}")
# The check mark was previously stored as mis-decoded bytes and printed as garbage.
print("✓ Sinusoidal PE implemented successfully!")

### Visualize Sinusoidal Encoding - SOLUTION

In [None]:
def plot_sinusoidal_heatmap(pe_module, max_pos=100, d_model=None):
    """
    Visualize the sinusoidal position encoding pattern.

    Shows two figures: a heatmap of the encoding table (positions on the
    x-axis, embedding dimensions on the y-axis) and line plots of four
    individual dimensions across positions.

    Args:
        pe_module: Module exposing a ``pe`` buffer indexed as
            [0, position, dimension].
        max_pos: Number of leading positions to display.
        d_model: Number of leading embedding dimensions to display;
            ``None`` keeps all of them.
    """
    # Slicing with ``:None`` keeps every dimension, so one expression covers
    # both the "all dimensions" and the "first d_model dimensions" case.
    # ``.cpu()`` keeps the NumPy conversion working when the module has been
    # moved off the CPU.
    pe = pe_module.pe[0, :max_pos, :d_model].detach().cpu().numpy()

    # Plot heatmap
    plt.figure(figsize=(15, 8))
    plt.imshow(pe.T, aspect='auto', cmap='RdBu', interpolation='nearest')
    plt.colorbar(label='Encoding Value')
    plt.xlabel('Position')
    plt.ylabel('Embedding Dimension')
    plt.title('Sinusoidal Positional Encoding Heatmap')
    plt.tight_layout()
    plt.show()

    # Plot individual dimensions
    fig, axes = plt.subplots(4, 1, figsize=(15, 10))
    dims_to_plot = [0, 1, 8, 16] if pe.shape[1] > 16 else [0, 1, 2, 3]

    for ax, dim in zip(axes, dims_to_plot):
        if dim >= pe.shape[1]:
            # Fewer than four dimensions available: hide the unused panel
            # instead of leaving an empty frame in the figure.
            ax.set_visible(False)
            continue
        ax.plot(pe[:, dim])
        ax.set_title(f'Dimension {dim} - {"Sin" if dim % 2 == 0 else "Cos"}')
        ax.set_xlabel('Position')
        ax.set_ylabel('Value')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Render the first 100 positions across all 64 dimensions of the module built above.
plot_sinusoidal_heatmap(sinusoidal_pe, max_pos=100, d_model=64)

## Part 2: Learned Positional Embedding - SOLUTION

In [None]:
class LearnedPositionalEmbedding(nn.Module):
    """
    Trainable position table (BERT-style): one learned vector per position.

    Args:
        d_model: Size of each position vector.
        max_len: Number of rows in the lookup table.
    """
    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        self.pos_embedding = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Look up the vectors for positions 0..seq_len-1 and add them to x.

        Args:
            x: [batch_size, seq_len, d_model]
        Returns:
            x + learned positional encodings, same shape as x
        """
        # Unpacking also enforces that x is exactly three-dimensional.
        _, n_tokens, _ = x.shape
        index = torch.arange(n_tokens, device=x.device)
        # A single [1, seq_len, d_model] slab broadcasts over the batch axis.
        slab = self.pos_embedding(index).unsqueeze(0)
        return x + slab

# Smoke test: reuse d_model and test_input from the sinusoidal test cell and
# report the output shape and the size of the learned table.
learned_pe = LearnedPositionalEmbedding(d_model)
output = learned_pe(test_input)
print(f"Learned PE output shape: {output.shape}")
print(f"Number of parameters: {sum(p.numel() for p in learned_pe.parameters())}")
# The check mark was previously stored as mis-decoded bytes and printed as garbage.
print("✓ Learned PE implemented successfully!")

## Part 3: Rotary Positional Embeddings (RoPE) - SOLUTION

In [None]:
class RoPE(nn.Module):
    """
    Rotary Position Embedding.

    Treats consecutive feature pairs (x[2k], x[2k+1]) as 2D points and rotates
    pair k at position ``pos`` by the angle ``pos * base^(-2k/d_model)``.
    Because each pair undergoes a true rotation, vector norms are preserved
    and the dot product of two rotated vectors depends only on the difference
    of their positions.

    Args:
        d_model: Feature dimension; must be even so features split into pairs.
        base: Base of the geometric frequency progression.
        max_len: Accepted for signature compatibility; not used, since angles
            are computed on the fly for whatever sequence length is passed.

    Raises:
        ValueError: If ``d_model`` is odd.
    """
    def __init__(self, d_model: int, base: int = 10000, max_len: int = 5000):
        super().__init__()
        if d_model % 2 != 0:
            raise ValueError(f"RoPE requires an even d_model, got {d_model}")
        self.d_model = d_model
        self.base = base

        # Precompute inverse frequencies
        # theta_k = base^(-2k/d_model) for k in [0, d_model/2)
        inv_freq = 1.0 / (base ** (torch.arange(0, d_model, 2).float() / d_model))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply rotary embeddings.

        Args:
            x: [batch_size, seq_len, d_model]
        Returns:
            Rotated tensor with the same shape as x
        """
        batch_size, seq_len, d_model = x.shape

        # Create position indices
        positions = torch.arange(seq_len, device=x.device).float()

        # One angle per (position, feature pair): [seq_len, d_model / 2]
        angles = positions[:, None] * self.inv_freq[None, :]
        cos = angles.cos()[None, :, :]  # [1, seq_len, d_model / 2]
        sin = angles.sin()[None, :, :]  # [1, seq_len, d_model / 2]

        # Split features into consecutive pairs: [batch, seq_len, d_model / 2, 2]
        pairs = x.reshape(batch_size, seq_len, -1, 2)
        first, second = pairs[..., 0], pairs[..., 1]

        # Apply rotation: [cos -sin] [x]
        #                 [sin  cos] [y]
        # Both rows must use the SAME angle for a given pair; mixing two
        # different frequencies would no longer be a rotation.
        x_rotated = torch.stack([
            first * cos - second * sin,
            first * sin + second * cos,
        ], dim=-1)

        return x_rotated.reshape(batch_size, seq_len, d_model)

# Smoke test: reuse d_model and test_input from the sinusoidal test cell and
# confirm that the rotation leaves the tensor shape unchanged.
rope = RoPE(d_model)
output = rope(test_input)
print(f"RoPE output shape: {output.shape}")
# The check mark was previously stored as mis-decoded bytes and printed as garbage.
print("✓ RoPE implemented successfully!")

### Visualize RoPE Rotation - SOLUTION

In [None]:
def visualize_rope_rotation_2d(rope_module, num_positions=20):
    """
    Visualize how a 2D vector rotates with RoPE.

    Takes the unit vector along the x-axis and rotates it by the angle that
    the first frequency of ``rope_module`` assigns to each position. The left
    panel draws the rotated vectors; the right panel plots angle vs position.

    Args:
        rope_module: Object exposing an ``inv_freq`` tensor; only its first
            entry is used.
        num_positions: Number of positions (0..num_positions-1) to draw.
    """
    # Angle at each position = position * first inverse frequency, computed in
    # one vectorized step on the same device as inv_freq.
    first_freq = rope_module.inv_freq[0]
    steps = torch.arange(num_positions, dtype=torch.float, device=first_freq.device)
    angles = (steps * first_freq).tolist()

    # Rotating (1, 0) by angle a gives (cos a, sin a).
    rotations = [(np.array([np.cos(a), np.sin(a)]), a) for a in angles]

    # Plot all rotations
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Left: Vector rotation visualization
    colors = plt.cm.viridis(np.linspace(0, 1, num_positions))

    for i, (vec, angle) in enumerate(rotations):
        ax1.arrow(0, 0, vec[0], vec[1], head_width=0.1, head_length=0.1,
                 fc=colors[i], ec=colors[i], alpha=0.6)
        if i % 5 == 0:  # Label every 5th position
            ax1.text(vec[0]*1.2, vec[1]*1.2, f'pos={i}', fontsize=8)

    ax1.set_xlim(-1.5, 1.5)
    ax1.set_ylim(-1.5, 1.5)
    ax1.set_aspect('equal')
    ax1.grid(True, alpha=0.3)
    ax1.set_title('RoPE Rotation at Different Positions')
    ax1.set_xlabel('x')
    ax1.set_ylabel('y')

    # Right: Angle vs position
    ax2.plot(range(num_positions), angles, marker='o')
    ax2.set_xlabel('Position')
    ax2.set_ylabel('Rotation Angle (radians)')
    ax2.set_title('Rotation Angle vs Position')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Draw the rotation of a unit vector for positions 0..11 using the RoPE module built above.
visualize_rope_rotation_2d(rope, num_positions=12)

## Part 4: Compare All Three Methods - SOLUTION

In [None]:
def compare_positional_methods(seq_len=50, d_model=64):
    """
    Side-by-side comparison of positional encoding methods.

    Builds one module of each kind and shows three stacked heatmaps
    (positions on the x-axis, embedding dimensions on the y-axis).

    Args:
        seq_len: Number of positions to display.
        d_model: Embedding dimension passed to each module.
    """
    # The additive methods are fed zeros so the output is the pure positional
    # term. RoPE multiplies its input by a rotation, so zeros would come back
    # as zeros (a blank panel); it is fed a constant all-ones input instead so
    # the position-dependent rotation is visible.
    additive_input = torch.zeros(1, seq_len, d_model)
    rotary_input = torch.ones(1, seq_len, d_model)

    # Construction order is kept fixed so the random init of the learned
    # table draws the same values for a given seed.
    sin_pe = SinusoidalPositionalEmbedding(d_model)
    learned_pe = LearnedPositionalEmbedding(d_model)
    rope_pe = RoPE(d_model)

    sin_out = sin_pe(additive_input)[0].detach().numpy()  # [seq_len, d_model]
    learned_out = learned_pe(additive_input)[0].detach().numpy()
    rope_out = rope_pe(rotary_input)[0].detach().numpy()

    panels = [
        ('Sinusoidal Positional Encoding (Fixed)', sin_out),
        ('Learned Positional Encoding (Random Init - would be trained)', learned_out),
        ('RoPE (Rotary Position Encoding)', rope_out),
    ]

    # Plot
    fig, axes = plt.subplots(3, 1, figsize=(15, 12))
    for ax, (title, values) in zip(axes, panels):
        image = ax.imshow(values.T, aspect='auto', cmap='RdBu')
        ax.set_title(title)
        ax.set_ylabel('Dimension')
        plt.colorbar(image, ax=ax)

    # Only the bottom panel carries the shared x-axis label.
    axes[-1].set_xlabel('Position')

    plt.tight_layout()
    plt.show()

# Compare the three encodings over 50 positions with a 64-dimensional embedding.
compare_positional_methods(seq_len=50, d_model=64)

## Part 5: Demonstrate Position Importance - SOLUTION

In [None]:
# Create simple vocabulary
vocab = {'<PAD>': 0, 'dog': 1, 'bites': 2, 'man': 3}
inv_vocab = {v: k for k, v in vocab.items()}

# Create two sentences with the same words in a different order
sentence1 = torch.tensor([[vocab['dog'], vocab['bites'], vocab['man']]])  # "dog bites man"
sentence2 = torch.tensor([[vocab['man'], vocab['bites'], vocab['dog']]])  # "man bites dog"

print("Sentence 1:", [inv_vocab[i.item()] for i in sentence1[0]])
print("Sentence 2:", [inv_vocab[i.item()] for i in sentence2[0]])

# Create simple embedding layer
embed_dim = 16
embedding = nn.Embedding(len(vocab), embed_dim)

# Embed sentences
emb1 = embedding(sentence1)  # [1, 3, embed_dim]
emb2 = embedding(sentence2)

# Order-agnostic readout used for BOTH experiments: a per-token non-linearity
# followed by a sum over positions. The non-linearity matters: a plain sum of
# (embedding + positional encoding) adds the same positional total to both
# sentences, so it would stay identical even with positional encoding.
sum1 = torch.tanh(emb1).sum(dim=1)  # [1, embed_dim]
sum2 = torch.tanh(emb2).sum(dim=1)

print(f"\n{'='*60}")
print("WITHOUT POSITIONAL ENCODING:")
print(f"{'='*60}")
print(f"Pooled representations are equal: {torch.allclose(sum1, sum2)}")
print(f"Difference: {(sum1 - sum2).norm().item():.6f}")
print("\n⚠️  The model CANNOT distinguish word order!")
print("   'dog bites man' = 'man bites dog' = 'bites man dog'")

# With positional encoding: each token vector now depends on where it sits,
# so the same readout sees different per-token inputs for the two sentences.
pos_enc = SinusoidalPositionalEmbedding(embed_dim)
emb1_pos = pos_enc(emb1)
emb2_pos = pos_enc(emb2)

sum1_pos = torch.tanh(emb1_pos).sum(dim=1)
sum2_pos = torch.tanh(emb2_pos).sum(dim=1)

print(f"\n{'='*60}")
print("WITH POSITIONAL ENCODING:")
print(f"{'='*60}")
print(f"Pooled representations are equal: {torch.allclose(sum1_pos, sum2_pos)}")
print(f"Difference: {(sum1_pos - sum2_pos).norm().item():.6f}")
print("\n✓ The model CAN now distinguish word order!")
print("  'dog bites man' ≠ 'man bites dog'")

## Part 6: Analysis and Reflection

### Questions and Answers:

#### 1. What patterns do you see in the sinusoidal encoding heatmap?

**Answer:**
- Wave patterns with different frequencies across dimensions
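- Lower dimensions (0, 1) oscillate fastest (wavelength ≈ 2π positions)
- Higher dimensions oscillate progressively more slowly (wavelengths grow geometrically, up to ≈ 2π · 10000 positions)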
- This creates a unique "fingerprint" for each position
- The pattern repeats but at very long intervals

#### 2. How does RoPE rotation change with position?

**Answer:**
- Rotation angle increases linearly with position
- Each position gets a unique rotation angle
- Different frequency components rotate at different rates
- This encodes relative positions naturally in the dot product

#### 3. Why can't learned embeddings generalize beyond max_len?

**Answer:**
- Learned embeddings use a lookup table (nn.Embedding)
- Each position has a separate learned vector
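- With max_len=5000 the table only has rows for positions 0–4999; position 5000 and beyond have no entry
- No interpolation mechanism for unseen positions
- Sinusoidal and RoPE encodings come from a formula, so they are defined for any position (the sinusoidal module above just needs a larger `max_len` to precompute them)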

#### 4. What is the key difference between absolute and relative positional encodings?

**Answer:**
- **Absolute (Sinusoidal, Learned):** Each position has a fixed encoding
  - Position 5 always gets the same embedding
- **Relative (RoPE):** Attention depends on position difference
  - "How far apart are two tokens?" matters more than absolute position
  - Better generalization to different sequence lengths

#### 5. Why does RoPE work better for long sequences?

**Answer:**
- Encodes relative distances, not absolute positions
- The rotation mechanism extrapolates naturally
- No learned lookup table to outgrow
- Used in models like LLaMA for this reason

## 🎯 Completion Checklist

- ✅ Implemented `SinusoidalPositionalEmbedding`
- ✅ Implemented `LearnedPositionalEmbedding`
- ✅ Implemented `RoPE`
- ✅ Visualized sinusoidal encoding heatmap
- ✅ Created RoPE rotation visualization
- ✅ Compared all three methods side-by-side
- ✅ Demonstrated position-less model failure
- ✅ Answered reflection questions

## Key Takeaways

1. **Positions are critical**: Sets are unordered, sequences need position info
2. **Sinusoidal is elegant**: No training needed, mathematically beautiful
3. **Learned is flexible**: Can adapt to data but doesn't extrapolate
4. **RoPE is powerful**: Relative positions, excellent for long context

## 🚀 Next Project
Move to **04_attention_lab** to build the attention mechanism!