# 04. Action Tokenization

**Goal**: Understand how OpenVLA converts continuous robot actions to discrete tokens.

## What We'll Learn
1. Why discretize actions?
2. The action tokenization process
3. Token vocabulary structure
4. Encoding and decoding actions
5. Quantization resolution and accuracy

---
## 1. Why Discretize Actions?

### The Problem
- Robot actions are **continuous**: (0.15, -0.32, 0.78, ...)
- LLMs generate **discrete tokens**: vocabulary indices

### The Solution
OpenVLA treats action prediction as **classification**, not regression:
- Divide continuous range into bins (buckets)
- Each bin corresponds to a token
- LLM generates tokens → decode to action values

### Benefits
1. **Unified training**: Same cross-entropy loss for language and actions
2. **LLM strengths**: LLMs excel at discrete prediction
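3. **Multi-modal robustness**: A distribution over bins can represent several distinct plausible actions at once, and the same scheme works across different action spaces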
4. **Autoregressive generation**: Natural fit with LLM decoding

In [None]:
# ============================================================
# CRITICAL: Set environment variables BEFORE importing packages!
# ============================================================
import os

# Auto-detect environment (NERSC vs SciServer).
# $SCRATCH is honoured when it is set to a non-empty value (NERSC Perlmutter
# and other clusters that export it); otherwise fall back to the SciServer
# default below.
SCRATCH = os.environ.get('SCRATCH') or "/home/idies/workspace/Temporary/dpark1/scratch"  # SciServer default  # CHANGE THIS TO YOUR PATH
CACHE_DIR = f"{SCRATCH}/.cache"

# Point every framework cache at CACHE_DIR. This has to happen before the
# packages that read these variables are imported.
os.environ['XDG_CACHE_HOME'] = CACHE_DIR
os.environ['HF_HOME'] = f"{CACHE_DIR}/huggingface"
os.environ['TFDS_DATA_DIR'] = f"{CACHE_DIR}/tensorflow_datasets"
os.environ['TORCH_HOME'] = f"{CACHE_DIR}/torch"

# Create the directories up front so later downloads do not fail on a missing path.
for path in [CACHE_DIR, os.environ['HF_HOME'], os.environ['TFDS_DATA_DIR'], os.environ['TORCH_HOME']]:
    os.makedirs(path, exist_ok=True)

print(f"✅ All caches → {CACHE_DIR}")

# Now import packages
import numpy as np
import matplotlib.pyplot as plt

# Visualize the discretization concept
def visualize_discretization(n_bins=256):
    """Plot the round trip value → bin index → bin center over [-1, 1].

    Two stacked panels are drawn: the original versus reconstructed value,
    and the absolute quantization error. Summary statistics are printed
    after the figure is shown.

    Args:
        n_bins: Number of equal-width bins spanning [-1, 1].
    """
    # n_bins + 1 edges delimit n_bins bins; a bin is represented by its midpoint.
    edges = np.linspace(-1, 1, n_bins + 1)
    centers = (edges[:-1] + edges[1:]) / 2

    canvas, (top, bottom) = plt.subplots(2, 1, figsize=(14, 8))

    # Push 1000 evenly spaced samples through the quantizer and back.
    samples = np.linspace(-1, 1, 1000)
    # np.digitize is 1-based for in-range values; the clip folds the sample
    # at exactly +1.0 (one past the last edge) back into the final bin.
    bin_ids = np.clip(np.digitize(samples, edges) - 1, 0, n_bins - 1)
    recon = centers[bin_ids]

    # Panel 1: identity line against the staircase produced by quantization.
    top.plot(samples, samples, 'b-', label='Original', alpha=0.7)
    top.plot(samples, recon, 'r-', label='After discretization', alpha=0.7)
    top.set_xlabel('Original Value')
    top.set_ylabel('Value')
    top.set_title(f'Continuous → Discrete → Continuous ({n_bins} bins)')
    top.legend()
    top.grid(True, alpha=0.3)

    # Panel 2: absolute error, scaled by 1000 to match the axis label.
    abs_err = np.abs(samples - recon)
    bottom.plot(samples, abs_err * 1000, 'g-', alpha=0.7)
    bottom.set_xlabel('Original Value')
    bottom.set_ylabel('Error (×10⁻³)')
    bottom.set_title(f'Quantization Error (max: {abs_err.max():.6f}, mean: {abs_err.mean():.6f})')
    bottom.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Closed-form figures for a uniform quantizer over a range of width 2.
    print(
        f"\nQuantization Statistics:",
        f"  Number of bins: {n_bins}",
        f"  Bin width: {2/n_bins:.6f}",
        f"  Max error: {2/(2*n_bins):.6f} (half bin width)",
        f"  Resolution: {1/n_bins:.4f} of full range",
        sep="\n",
    )

visualize_discretization(256)

---
## 2. OpenVLA's Action Tokenizer

Let's look at the actual implementation.

In [None]:
# Locate the action tokenizer source code
import os

# Path to action tokenizer in the repository.
# Set the OPENVLA_REPO_ROOT environment variable to point at your own
# checkout; the fallback is the original, machine-specific location.
REPO_ROOT = os.environ.get("OPENVLA_REPO_ROOT", "/Users/davidpark/Documents/Claude/openvla")
tokenizer_path = os.path.join(REPO_ROOT, "prismatic/vla/action_tokenizer.py")

print("ActionTokenizer Source Code Location:")
print(f"  {tokenizer_path}")
if not os.path.isfile(tokenizer_path):
    # Make a wrong REPO_ROOT visible here instead of leaving a dangling path.
    print("  (file not found - set OPENVLA_REPO_ROOT to your local openvla checkout)")

In [None]:
# Implement a simplified action tokenizer for understanding
class SimpleActionTokenizer:
    """Simplified action tokenizer for educational purposes.

    Each action value in [-1, 1] is quantized into one of ``n_bins``
    equal-width bins, and bin ``i`` is assigned the token ID
    ``vocab_size - n_bins + i``. Decoding returns the center of the bin, so
    a round trip is accurate to within half a bin width.
    """

    def __init__(self, n_bins=256, vocab_size=32000):
        """
        Args:
            n_bins: Number of discrete bins per action dimension
            vocab_size: Total vocabulary size of the LLM

        Raises:
            ValueError: If ``n_bins`` is not positive or does not fit inside
                ``vocab_size``.
        """
        if n_bins < 1:
            raise ValueError(f"n_bins must be >= 1, got {n_bins}")
        if vocab_size < n_bins:
            raise ValueError(
                f"vocab_size ({vocab_size}) must be at least n_bins ({n_bins})"
            )

        self.n_bins = n_bins
        self.vocab_size = vocab_size

        # Action tokens occupy the LAST n_bins positions in vocabulary,
        # i.e. one contiguous ID range [vocab_size - n_bins, vocab_size - 1].
        self.action_token_start = vocab_size - n_bins

        # n_bins + 1 edges delimit n_bins bins; each bin decodes to its midpoint.
        self.bin_edges = np.linspace(-1, 1, n_bins + 1)
        self.bin_centers = (self.bin_edges[:-1] + self.bin_edges[1:]) / 2

    def encode(self, continuous_action: np.ndarray) -> np.ndarray:
        """
        Convert continuous action to token IDs.

        Args:
            continuous_action: Array of shape (action_dim,) with values in [-1, 1].
                Values outside the range are clipped to it.

        Returns:
            Array of token IDs, one per action dimension

        Raises:
            ValueError: If any value is NaN.
        """
        continuous_action = np.asarray(continuous_action)
        # np.digitize places NaN past the last edge, so without this check a
        # NaN would be encoded as the maximum action.
        if np.any(np.isnan(continuous_action)):
            raise ValueError("continuous_action contains NaN")

        # Clip to valid range
        clipped = np.clip(continuous_action, -1, 1)

        # np.digitize is 1-based for in-range values, hence the -1. The value
        # +1.0 lands one past the final bin and is folded back by the clip.
        bin_indices = np.digitize(clipped, self.bin_edges) - 1
        bin_indices = np.clip(bin_indices, 0, self.n_bins - 1)

        # Convert to vocabulary token IDs
        return bin_indices + self.action_token_start

    def decode(self, token_ids: np.ndarray) -> np.ndarray:
        """
        Convert token IDs back to continuous actions.

        Args:
            token_ids: Array (or sequence) of integer token IDs, each within
                [action_token_start, vocab_size - 1].

        Returns:
            Array of continuous action values in [-1, 1] (bin centers)

        Raises:
            ValueError: If any ID falls outside the action token range.
        """
        # Accept plain lists/tuples as well as arrays.
        bin_indices = np.asarray(token_ids) - self.action_token_start

        # A negative index would silently wrap to a bin at the other end of
        # the range, so reject out-of-range IDs explicitly.
        if np.any(bin_indices < 0) or np.any(bin_indices >= self.n_bins):
            raise ValueError(
                f"token IDs must lie in [{self.action_token_start}, "
                f"{self.vocab_size - 1}]"
            )

        # Map to bin centers
        return self.bin_centers[bin_indices]


# Instantiate the demo tokenizer: 256 bins over a 32000-entry vocabulary
tokenizer = SimpleActionTokenizer(n_bins=256, vocab_size=32000)

# Report the derived configuration, one item per output line.
print(
    "SimpleActionTokenizer Configuration:",
    f"  Total vocabulary size: {tokenizer.vocab_size}",
    f"  Number of action bins: {tokenizer.n_bins}",
    f"  Action tokens range: {tokenizer.action_token_start} to {tokenizer.vocab_size - 1}",
    f"  Bin width: {2/tokenizer.n_bins:.6f}",
    sep="\n",
)

In [None]:
# Round-trip a few hand-picked actions through encode/decode
print("\nEncoding/Decoding Examples:", "="*60, sep="\n")

# One row per 7-dimensional action
sample_actions = np.array([
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],       # Zero action
    [0.5, -0.5, 0.25, -0.25, 0.1, -0.1, 1.0],  # Mixed action
    [1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0],    # Extreme action
])

for example_no, action in enumerate(sample_actions, start=1):
    # Continuous → token IDs → bin centers
    tokens = tokenizer.encode(action)
    reconstructed = tokenizer.decode(tokens)
    # Per-dimension absolute round-trip error
    error = np.abs(action - reconstructed)

    print(
        f"\nExample {example_no}:",
        f"  Original:      {action}",
        f"  Token IDs:     {tokens}",
        f"  Reconstructed: {reconstructed}",
        f"  Max error:     {error.max():.6f}",
        sep="\n",
    )

---
## 3. Token Vocabulary Structure

OpenVLA reserves the last 256 token IDs of the LLM vocabulary for action tokens, reusing existing slots rather than enlarging the vocabulary.

In [None]:
# ASCII diagram of the token-ID layout used throughout this notebook:
# text tokens in IDs 0-31743, action bins in the final 256 IDs (31744-31999),
# and one action token per dimension of a 7-DoF action.
# NOTE(review): the ruler at the top ends at 32000, while the last ID shown
# in the box below is 31999.
vocab_structure = """
┌────────────────────────────────────────────────────────────────────┐
│                    OpenVLA Vocabulary Structure                     │
├────────────────────────────────────────────────────────────────────┤
│                                                                     │
│  Token ID: 0 ─────────────────────────────────────────── 32000     │
│            │                                                │      │
│  ┌─────────┴────────────────────────────┬───────────────────┴───┐  │
│  │      TEXT TOKENS (0 - 31743)         │  ACTION TOKENS        │  │
│  │                                       │  (31744 - 31999)      │  │
│  │  • BOS, EOS, PAD tokens              │                       │  │
│  │  • Language vocabulary               │  • 256 bins           │  │
│  │  • Special tokens                     │  • One per action val │  │
│  │                                       │                       │  │
│  └───────────────────────────────────────┴───────────────────────┘  │
│                                                                     │
│  For 7-DoF robot action:                                            │
│  ┌────────────────────────────────────────────────────────────┐    │
│  │ [x_token] [y_token] [z_token] [rx_token] [ry_token]        │    │
│  │ [rz_token] [gripper_token]                                  │    │
│  │                                                             │    │
│  │ Each token ID = action_token_start + bin_index              │    │
│  │ Example: x=0.5 → bin 192 → token 31936                     │    │
│  └────────────────────────────────────────────────────────────┘    │
│                                                                     │
└────────────────────────────────────────────────────────────────────┘
"""
print(vocab_structure)

In [None]:
# Load the processor published as "openvla/openvla-7b" and take its text
# tokenizer; only the tokenizer is used by the cells below.
# Note: the cache environment variables were set in the first code cell above.
# NOTE(review): torch and AutoModelForVision2Seq are imported here but not
# referenced in this cell.
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

print("Loading OpenVLA processor (this may take a while on first run)...")
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository run locally - confirm against the transformers docs and
# consider pinning a revision.
processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
tokenizer_hf = processor.tokenizer

# Print the vocabulary size and the special tokens with their IDs.
print("OpenVLA Tokenizer Info:")
print(f"  Vocabulary size: {tokenizer_hf.vocab_size}")
print(f"  BOS token: {tokenizer_hf.bos_token} (ID: {tokenizer_hf.bos_token_id})")
print(f"  EOS token: {tokenizer_hf.eos_token} (ID: {tokenizer_hf.eos_token_id})")
print(f"  PAD token: {tokenizer_hf.pad_token} (ID: {tokenizer_hf.pad_token_id})")

In [None]:
# Derive the action-token ID range from the loaded tokenizer
n_bins = 256
vocab_size = tokenizer_hf.vocab_size
# First ID of the trailing block of n_bins action tokens
action_start = vocab_size - n_bins

# Summarise the range and show where the lowest, middle and highest bins land.
print(
    f"\nAction Token Configuration:",
    f"  Total vocab: {vocab_size}",
    f"  Action bins: {n_bins}",
    f"  Action token range: [{action_start}, {vocab_size - 1}]",
    f"\n  Bin 0 (value ≈ -1.0) → Token {action_start}",
    f"  Bin 127 (value ≈ 0.0) → Token {action_start + 127}",
    f"  Bin 255 (value ≈ 1.0) → Token {vocab_size - 1}",
    sep="\n",
)

---
## 4. Full Encoding/Decoding Pipeline

In [None]:
# Complete action tokenization pipeline
class OpenVLAActionTokenizer:
    """
    Action tokenizer layered on top of a text tokenizer, in the style of OpenVLA.

    Actions in [-1, 1] are quantized into ``n_bins`` equal-width bins; bin ``i``
    maps to token ID ``vocab_size - n_bins + i`` and decodes to the bin center.

    NOTE(review): this class uses an ascending mapping (bin 0 → lowest action
    token ID). Before reusing these IDs with a real checkpoint, confirm the
    ordering and bin layout against prismatic/vla/action_tokenizer.py.
    """

    def __init__(self, tokenizer, n_bins=256, action_dim=7):
        """
        Args:
            tokenizer: Text tokenizer exposing ``vocab_size`` and ``decode(ids)``
            n_bins: Number of discrete bins per action dimension
            action_dim: Number of action dimensions (stored for reference;
                not enforced by the methods below)

        Raises:
            ValueError: If ``n_bins`` is not positive or exceeds the vocabulary size.
        """
        self.tokenizer = tokenizer
        self.n_bins = n_bins
        self.action_dim = action_dim
        self.vocab_size = tokenizer.vocab_size

        if n_bins < 1:
            raise ValueError(f"n_bins must be >= 1, got {n_bins}")
        if self.vocab_size < n_bins:
            raise ValueError(
                f"vocab_size ({self.vocab_size}) must be at least n_bins ({n_bins})"
            )

        # Action tokens occupy last n_bins positions
        self.action_token_start = self.vocab_size - n_bins

        # n_bins + 1 edges delimit n_bins bins; each bin decodes to its midpoint.
        self.bins = np.linspace(-1, 1, n_bins + 1)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2

    def action_to_tokens(self, action: np.ndarray) -> "tuple[str, np.ndarray]":
        """
        Convert continuous action to its token string and token IDs.

        This is what gets appended to the prompt during training.

        Args:
            action: Array of action values; values outside [-1, 1] are clipped.

        Returns:
            ``(action_string, token_ids)``: the text obtained by decoding the
            token IDs with the wrapped tokenizer, and the IDs themselves.

        Raises:
            ValueError: If any action value is NaN.
        """
        action = np.asarray(action)
        # np.digitize places NaN past the last edge, so without this check a
        # NaN would be encoded as the maximum action.
        if np.any(np.isnan(action)):
            raise ValueError("action contains NaN")

        # Clip to [-1, 1]
        action = np.clip(action, -1, 1)

        # np.digitize is 1-based for in-range values, hence the -1. The value
        # +1.0 lands one past the final bin and is folded back by the clip.
        bin_indices = np.digitize(action, self.bins) - 1
        bin_indices = np.clip(bin_indices, 0, self.n_bins - 1)

        # Convert to token IDs
        token_ids = bin_indices + self.action_token_start

        # Decode token IDs to string
        action_string = self.tokenizer.decode(token_ids)

        return action_string, token_ids

    def tokens_to_action(self, token_ids: np.ndarray) -> np.ndarray:
        """
        Convert token IDs back to continuous action.

        This is used during inference to decode model output.

        Args:
            token_ids: Array (or sequence) of integer token IDs within
                [action_token_start, vocab_size - 1].

        Returns:
            Array of bin-center values in [-1, 1].

        Raises:
            ValueError: If any ID lies outside the action token range.
        """
        # Accept plain lists/tuples as well as arrays.
        token_ids = np.asarray(token_ids)

        # Raise explicitly instead of using assert: assertions are removed
        # under `python -O`, which would let bad IDs index the wrong bin.
        if np.any(token_ids < self.action_token_start) or np.any(token_ids >= self.vocab_size):
            raise ValueError(
                f"Invalid action token: IDs must lie in "
                f"[{self.action_token_start}, {self.vocab_size - 1}]"
            )

        # Convert to bin indices
        bin_indices = token_ids - self.action_token_start

        # Map to continuous values
        return self.bin_centers[bin_indices]


# Wrap the Hugging Face tokenizer with the action tokenizer (default bins and dimensions)
action_tokenizer = OpenVLAActionTokenizer(tokenizer_hf)

# Report the resulting configuration, one item per output line.
print(
    "OpenVLA Action Tokenizer:",
    f"  Action dimensions: {action_tokenizer.action_dim}",
    f"  Bins per dimension: {action_tokenizer.n_bins}",
    f"  Action token range: [{action_tokenizer.action_token_start}, {action_tokenizer.vocab_size-1}]",
    sep="\n",
)

In [None]:
# Test the full pipeline: one 7-D action through encode → decode, with per-dimension error
print("\nFull Encoding/Decoding Test:")
print("="*60)

# Sample robot action
sample_action = np.array([0.15, -0.32, 0.78, 0.0, 0.1, -0.05, 0.8])

print(f"Original action: {sample_action}")
print(f"  x (position): {sample_action[0]:.4f}")
print(f"  y (position): {sample_action[1]:.4f}")
print(f"  z (position): {sample_action[2]:.4f}")
print(f"  roll:         {sample_action[3]:.4f}")
print(f"  pitch:        {sample_action[4]:.4f}")
print(f"  yaw:          {sample_action[5]:.4f}")
print(f"  gripper:      {sample_action[6]:.4f}")

# Encode
action_string, token_ids = action_tokenizer.action_to_tokens(sample_action)
print(f"\nEncoded token IDs: {token_ids}")
print(f"As string: '{action_string}'")

# Decode
reconstructed = action_tokenizer.tokens_to_action(token_ids)
print(f"\nReconstructed: {reconstructed}")

# Error analysis
error = np.abs(sample_action - reconstructed)
print(f"\nQuantization error per dimension:")
# The labels do not change between iterations, so build them once and walk
# labels, originals, reconstructions and errors in lockstep.
dim_names = ['x', 'y', 'z', 'roll', 'pitch', 'yaw', 'gripper']
for name, orig, recon, err in zip(dim_names, sample_action, reconstructed, error):
    print(f"  {name:8s}: {orig:+.4f} → {recon:+.4f}  (error: {err:.6f})")

---
## 5. Quantization Resolution Analysis

In [None]:
# Analyze quantization resolution for different bin counts
def analyze_resolution(n_bins_list=(64, 128, 256, 512, 1024), range_cm=10):
    """Compare quantization resolution for different bin counts.

    Actions are assumed to be normalized to [-1, 1] (total width 2), so a
    uniform quantizer with ``n`` bins has bin width ``2 / n`` and a worst-case
    error of half that. The error is also expressed in millimetres for a
    physical action range of ±``range_cm`` centimetres.

    Args:
        n_bins_list: Iterable of bin counts to compare. The default is a
            tuple so that no mutable default is shared between calls.
        range_cm: Half-width of the physical action range in centimetres
            (the normalized value 1.0 corresponds to ``range_cm`` cm).

    Returns:
        A list with one dict per bin count, holding the keys ``'n_bins'``,
        ``'bin_width'``, ``'max_error'`` and ``'error_mm'``.
    """
    results = []

    for n_bins in n_bins_list:
        bin_width = 2.0 / n_bins
        max_error = bin_width / 2

        # Normalized error → centimetres (× range_cm) → millimetres (× 10)
        error_mm = max_error * range_cm * 10

        results.append({
            'n_bins': n_bins,
            'bin_width': bin_width,
            'max_error': max_error,
            'error_mm': error_mm
        })

    print("Quantization Resolution Analysis:")
    print("="*70)
    print(f"{'Bins':>8} {'Bin Width':>12} {'Max Error':>12} {'Error (mm)*':>12}")
    print("-"*70)
    for r in results:
        print(f"{r['n_bins']:>8} {r['bin_width']:>12.6f} {r['max_error']:>12.6f} {r['error_mm']:>12.3f}")

    print(f"\n* Assuming ±{range_cm:g}cm action range")
    # The 256-bin figure is computed for the chosen range, so the summary
    # line stays correct when range_cm is not the default.
    error_256_mm = (2.0 / 256) / 2 * range_cm * 10
    print(f"\nOpenVLA uses 256 bins: ~{error_256_mm:.2f}mm error for typical robot manipulation")

    return results

results = analyze_resolution()

In [None]:
# Visualize resolution comparison
plt.figure(figsize=(12, 5))

n_bins_list = [r['n_bins'] for r in results]
errors_mm = [r['error_mm'] for r in results]

# Left panel: worst-case error in mm for each bin count, with a 1 mm reference line
plt.subplot(1, 2, 1)
plt.bar(range(len(n_bins_list)), errors_mm, color='steelblue')
plt.xticks(range(len(n_bins_list)), n_bins_list)
plt.xlabel('Number of Bins')
plt.ylabel('Max Error (mm)')
plt.title('Quantization Error vs Bin Count')
plt.axhline(y=1.0, color='r', linestyle='--', label='1mm threshold')
plt.legend()

# Right panel: show what 256 bins looks like for different action ranges
plt.subplot(1, 2, 2)
action_ranges = [5, 10, 20, 50, 100]  # cm
# Find the 256-bin row by value, not by position, so a changed or reordered
# bin list cannot select the wrong entry. If 256 is absent, fall back to the
# closed form (half of the 2/256 bin width).
max_error_256 = next((r['max_error'] for r in results if r['n_bins'] == 256), 1.0 / 256)
errors = [max_error_256 * ar * 10 for ar in action_ranges]  # normalized → cm → mm

plt.bar(range(len(action_ranges)), errors, color='coral')
plt.xticks(range(len(action_ranges)), [f'±{ar}cm' for ar in action_ranges])
plt.xlabel('Action Range')
plt.ylabel('Max Error (mm)')
plt.title('Error vs Action Range (256 bins)')

plt.tight_layout()
plt.show()

---
## 6. Action Normalization

Before tokenization, actions are normalized to the [-1, 1] range.

In [None]:
# Action normalization strategies
class ActionNormalizer:
    """
    Normalize robot actions to [-1, 1] range for tokenization.

    Two strategies are implemented:
    1. Bounds-based ('bounds'): linear rescale of [min, max] onto [-1, 1]
    2. Statistics-based (any other value, e.g. 'normal'): mean/std
       normalization

    Both strategies clip the result to [-1, 1].
    """

    def __init__(self, stats: dict, strategy='bounds'):
        """
        Args:
            stats: Dictionary with normalization statistics. Needs the keys
                'min' and 'max' for the 'bounds' strategy, and 'mean' and
                'std' otherwise.
            strategy: 'bounds' or 'normal'
        """
        self.strategy = strategy

        if strategy == 'bounds':
            self.min_action = np.array(stats['min'])
            self.max_action = np.array(stats['max'])
        else:  # normal
            self.mean = np.array(stats['mean'])
            self.std = np.array(stats['std'])

    def normalize(self, action: np.ndarray) -> np.ndarray:
        """Normalize action to [-1, 1] range.

        Args:
            action: Raw action, broadcastable against the stored statistics.

        Returns:
            The normalized action, clipped to [-1, 1]. With the 'bounds'
            strategy, a dimension whose min equals its max is mapped to 0.0.
        """
        if self.strategy == 'bounds':
            span = self.max_action - self.min_action
            # A dimension with min == max would divide by zero and give
            # NaN/inf. Divide by 1 there, then pin the result to 0.0.
            degenerate = span == 0
            safe_span = np.where(degenerate, 1, span)
            # Scale from [min, max] to [-1, 1]
            normalized = 2 * (action - self.min_action) / safe_span - 1
            normalized = np.where(degenerate, 0.0, normalized)
        else:
            # Normalize using mean/std; the epsilon guards against std == 0
            normalized = (action - self.mean) / (self.std + 1e-8)

        # One clip covers both strategies.
        return np.clip(normalized, -1, 1)

    def denormalize(self, normalized_action: np.ndarray) -> np.ndarray:
        """Convert normalized action back to original scale.

        Values that were clipped during normalization cannot be recovered;
        they map back to the edge of the clipped range.
        """
        if self.strategy == 'bounds':
            # Inverse of the bounds rescale: [-1, 1] → [0, 1] → [min, max]
            action = (normalized_action + 1) / 2 * (self.max_action - self.min_action) + self.min_action
        else:
            action = normalized_action * self.std + self.mean

        return action


# Example: LIBERO normalization statistics
# NOTE(review): the bounds look hand-picked for illustration - confirm
# against real dataset statistics before reusing them.
libero_stats = dict(
    bounds=dict(
        min=[-0.1, -0.1, -0.1, -0.5, -0.5, -0.5, 0.0],
        max=[0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 1.0],
    ),
    description='LIBERO uses delta position/rotation actions',
)

normalizer = ActionNormalizer(libero_stats['bounds'], strategy='bounds')

# Round trip: raw → normalized → raw
raw_action = np.array([0.05, -0.03, 0.02, 0.1, -0.2, 0.15, 0.8])
normalized = normalizer.normalize(raw_action)
denormalized = normalizer.denormalize(normalized)

# Show all three vectors and whether the round trip matches within tolerance.
print(
    "Action Normalization Example:",
    "="*60,
    f"Raw action:        {raw_action}",
    f"Normalized [-1,1]: {normalized}",
    f"Denormalized:      {denormalized}",
    f"Reconstruction OK: {np.allclose(raw_action, denormalized)}",
    sep="\n",
)

---
## 7. Complete Action Generation Flow

In [None]:
# ASCII diagram summarising both directions of the pipeline shown in this
# notebook: training (raw → normalize → discretize → token IDs) and
# inference (token IDs → bin centers → denormalize → raw action).
action_flow = """
┌────────────────────────────────────────────────────────────────────┐
│               Complete Action Generation Pipeline                   │
├────────────────────────────────────────────────────────────────────┤
│                                                                     │
│  TRAINING:                                                          │
│  ─────────                                                          │
│  Raw Action        →  Normalize   →  Discretize  →  Token IDs      │
│  [0.05, -0.03, ..]    [-1, 1]        [0, 255]       [31744+bin]    │
│                                                                     │
│  ┌──────────────────────────────────────────────────────────────┐  │
│  │ Prompt: "Pick up red block\n" + action_tokens                 │  │
│  │ Loss: Cross-entropy on action tokens only                     │  │
│  └──────────────────────────────────────────────────────────────┘  │
│                                                                     │
│  INFERENCE:                                                         │
│  ──────────                                                         │
│  Token IDs     →  Undiscretize  →  Denormalize  →  Raw Action      │
│  [31872, ...]     [-1, 1]          Original        [0.05, ...]     │
│                                       scale                         │
│                                                                     │
│  ┌──────────────────────────────────────────────────────────────┐  │
│  │ 1. LLM autoregressively generates 7 action tokens            │  │
│  │ 2. Each token decoded to bin center value                     │  │
│  │ 3. Values denormalized using dataset statistics               │  │
│  │ 4. Final action sent to robot controller                      │  │
│  └──────────────────────────────────────────────────────────────┘  │
│                                                                     │
└────────────────────────────────────────────────────────────────────┘
"""
print(action_flow)

In [None]:
# Simulate the complete pipeline
def simulate_action_pipeline(
    raw_action: np.ndarray,
    normalizer: ActionNormalizer,
    tokenizer: OpenVLAActionTokenizer
):
    """
    Walk one action through normalize → tokenize → detokenize → denormalize.

    Each intermediate result is printed as it is produced, followed by the
    absolute difference between the input and the reconstructed action.

    Args:
        raw_action: Action in robot units.
        normalizer: Provides ``normalize`` and ``denormalize``.
        tokenizer: Provides ``action_to_tokens`` and ``tokens_to_action``.

    Returns:
        The reconstructed action in robot units.
    """
    print("Action Pipeline Simulation", "="*60, sep="\n")

    # 1. The input exactly as given
    print(f"\n1. Raw action (robot units):", f"   {raw_action}", sep="\n")

    # 2. Rescale into the tokenizer's [-1, 1] domain
    unit_action = normalizer.normalize(raw_action)
    print(f"\n2. Normalized to [-1, 1]:", f"   {unit_action}", sep="\n")

    # 3. Quantize into token IDs (plus their text form)
    text_form, ids = tokenizer.action_to_tokens(unit_action)
    print(
        f"\n3. Tokenized (token IDs):",
        f"   {ids}",
        f"   As string: '{text_form}'",
        sep="\n",
    )

    # 4. Map the IDs back to bin centers, standing in for the LLM's output
    unit_decoded = tokenizer.tokens_to_action(ids)
    print(f"\n4. Decoded from tokens:", f"   {unit_decoded}", sep="\n")

    # 5. Undo the normalization
    final_action = normalizer.denormalize(unit_decoded)
    print(f"\n5. Denormalized (robot units):", f"   {final_action}", sep="\n")

    # 6. Absolute round-trip error per dimension, plus its max and mean
    abs_error = np.abs(raw_action - final_action)
    print(
        f"\n6. Reconstruction error:",
        f"   {abs_error}",
        f"   Max: {abs_error.max():.6f}",
        f"   Mean: {abs_error.mean():.6f}",
        sep="\n",
    )

    return final_action

# Run simulation
sample_raw_action = np.array([0.05, -0.03, 0.02, 0.1, -0.2, 0.15, 0.8])
final = simulate_action_pipeline(sample_raw_action, normalizer, action_tokenizer)

---
## Summary

### Key Concepts

1. **Action Discretization**: Continuous actions → discrete bins → vocabulary tokens

2. **256 Bins**: Default resolution; the worst-case quantization error is about 0.4 mm when an action dimension spans ±10 cm

3. **Vocabulary Reuse**: Action tokens occupy the last 256 positions of the existing LLM vocabulary

4. **Normalization**: Raw actions normalized to [-1, 1] before tokenization

5. **Two-Stage Process**:
   - Training: raw → normalize → discretize → cross-entropy loss
   - Inference: generate tokens → undiscretize → denormalize → execute

### Why It Works
- LLMs are optimized for discrete token prediction
- Same loss function for language and actions (unified training)
- 256 bins provide sufficient precision for manipulation
- Autoregressive generation naturally handles action sequences

### Next Steps
→ Continue to **05_data_pipeline.ipynb** to understand how training data is prepared.