# Notebook 15: Evaluate Fine-tuned OpenVLA with Action Chunking

**Purpose**: Comprehensive evaluation of a fine-tuned OpenVLA model trained with temporal subsampling.

**Key Comparisons**:
- Base OpenVLA vs Fine-tuned (with LoRA adapters)
- Action quality metrics: L1 error, direction accuracy, gripper accuracy
- Visualization of predictions vs ground truth

**Action Chunking Context**:
- LIBERO runs at 20 Hz
- OpenVLA trained on Bridge V2 (5 Hz) and Fractal (3 Hz)
- With 4x chunking (keeping every 4th timestep): 20 Hz → 5 Hz (matches Bridge V2)

## 1. Environment Setup

In [None]:
import os
import sys

# Scratch root: use $SCRATCH when it is set, otherwise the SciServer default.
BASE_DIR = os.environ.get('SCRATCH', "/home/idies/workspace/Temporary/dpark1/scratch")

CACHE_DIR = f"{BASE_DIR}/.cache"
LIBERO_DATA_DIR = f"{BASE_DIR}/libero_data"

# Redirect the Hugging Face and torch caches under CACHE_DIR and set
# TF_CPP_MIN_LOG_LEVEL before any heavy library is imported.
os.environ.update({
    'HF_HOME': f"{CACHE_DIR}/huggingface",
    'TORCH_HOME': f"{CACHE_DIR}/torch",
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

import warnings
warnings.filterwarnings('ignore')

print(f"Base directory: {BASE_DIR}")
print(f"Cache directory: {CACHE_DIR}")

In [None]:
# Version check
import transformers
import tokenizers
import timm

# Installed version and the version this notebook expects, per package.
_installed = {
    "transformers": transformers.__version__,
    "tokenizers": tokenizers.__version__,
    "timm": timm.__version__,
}
_expected = {"transformers": "4.40.1", "tokenizers": "0.19.1", "timm": "0.9.x"}

# Report all three versions before asserting, so the full picture is visible
# even when one of the checks below fails.
for _pkg, _version in _installed.items():
    print(f"{_pkg}: {_version} (need {_expected[_pkg]})")

assert _installed["transformers"] == _expected["transformers"], "transformers version mismatch!"
assert _installed["tokenizers"] == _expected["tokenizers"], "tokenizers version mismatch!"
# timm only needs to be somewhere in the 0.9 series.
assert _installed["timm"].startswith("0.9."), "timm version mismatch!"

In [None]:
import numpy as np
import torch
from pathlib import Path
import json
import h5py
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

print(f"Device: {device}")
if device != "cpu":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Configuration

In [None]:
# =====================================================
# CONFIGURE YOUR CHECKPOINT PATH HERE
# =====================================================

# Directory that holds one sub-directory per training run.
RESULTS_DIR = Path("../../results")

# Print every run directory that contains a config.json, together with the
# chunk size and epoch count stored in that config.
print("Available runs:")
if not RESULTS_DIR.exists():
    print("  No results directory found. Run training first.")
else:
    for run_dir in sorted(RESULTS_DIR.iterdir()):
        # Ignore plain files and hidden directories.
        if not run_dir.is_dir() or run_dir.name.startswith('.'):
            continue
        config_path = run_dir / "config.json"
        if not config_path.exists():
            continue
        with open(config_path) as f:
            config = json.load(f)
        print(f"  - {run_dir.name}")
        print(f"      chunk_size: {config.get('chunk_size', 'N/A')}, epochs: {config.get('epochs', 'N/A')}")

In [None]:
# Select your run (modify this)
# RUN_NAME is a placeholder: replace it with the name of a run directory
# under RESULTS_DIR before executing the rest of the notebook.
RUN_NAME = "libero_spatial_chunk4_XXXXXXXX_XXXXXX"  # Replace with your run name
# Sub-path inside the run directory that holds the adapter weights to load,
# e.g. 'best', 'final', or 'checkpoints/checkpoint-1000'.
CHECKPOINT_TYPE = "best"  # 'best', 'final', or checkpoint path

# Construct paths
RUN_DIR = RESULTS_DIR / RUN_NAME
CHECKPOINT_PATH = RUN_DIR / CHECKPOINT_TYPE

# Action chunking settings (must match training)
# CHUNK_SIZE is the stride used when sub-sampling timesteps from each demo.
CHUNK_SIZE = 4  # 20 Hz → 5 Hz
# Number of trailing demos per HDF5 file used as validation data
# (passed as val_demos to load_validation_samples).
VAL_DEMOS_PER_TASK = 5

print(f"Run directory: {RUN_DIR}")
print(f"Checkpoint: {CHECKPOINT_PATH}")
print(f"Chunk size: {CHUNK_SIZE}")

## 3. Load Training History

In [None]:
import pandas as pd

# Load the per-run CSV logs. Each variable stays None when its file is
# missing, which the plotting cell checks before drawing anything.
training_log_path = RUN_DIR / "training_log.csv"
training_log = pd.read_csv(training_log_path) if training_log_path.exists() else None
if training_log is None:
    print("No training log found")
else:
    print(f"Training log: {len(training_log)} entries")
    print(training_log.head())

validation_log_path = RUN_DIR / "validation_log.csv"
validation_log = pd.read_csv(validation_log_path) if validation_log_path.exists() else None
if validation_log is None:
    print("No validation log found")
else:
    print(f"\nValidation log: {len(validation_log)} entries")
    print(validation_log.head())

In [None]:
# Plot training curves (only when both logs were loaded)
if training_log is not None and validation_log is not None:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    def _draw_curve(ax, frame, column, ylabel, title, *fmt, baseline=None, **line_kwargs):
        """Plot frame[column] against frame['step'] on ax with the shared styling.

        When baseline is given, a dashed gray 'Random' reference line is drawn
        at that height and a legend is added.
        """
        ax.plot(frame['step'], frame[column], *fmt, **line_kwargs)
        if baseline is not None:
            ax.axhline(y=baseline, color='gray', linestyle='--', label='Random')
        ax.set_xlabel('Step')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        if baseline is not None:
            ax.legend()
        ax.grid(True, alpha=0.3)

    # Top row: losses and L1 error.
    _draw_curve(axes[0, 0], training_log, 'loss', 'Loss', 'Training Loss', alpha=0.7)
    _draw_curve(axes[0, 1], validation_log, 'val_loss', 'Loss', 'Validation Loss',
                'b-o', markersize=4)
    _draw_curve(axes[0, 2], validation_log, 'l1_error', 'L1 Error', 'Action L1 Error',
                'g-o', markersize=4)

    # Bottom row: accuracies and the learning-rate schedule.
    _draw_curve(axes[1, 0], validation_log, 'direction_accuracy', 'Accuracy', 'Direction Accuracy',
                'r-o', markersize=4, baseline=0.5)
    _draw_curve(axes[1, 1], validation_log, 'gripper_accuracy', 'Accuracy', 'Gripper Accuracy',
                'purple', marker='o', markersize=4)
    _draw_curve(axes[1, 2], training_log, 'lr', 'Learning Rate', 'Learning Rate Schedule')

    plt.tight_layout()
    plt.show()

    # Print summary: best value of each logged validation metric.
    banner = "=" * 60
    print("\n" + banner)
    print("Training Summary")
    print(banner)
    print(f"Total steps: {training_log['step'].max()}")
    print(f"Best validation loss: {validation_log['val_loss'].min():.4f}")
    print(f"Best L1 error: {validation_log['l1_error'].min():.4f}")
    print(f"Best direction accuracy: {validation_log['direction_accuracy'].max():.4f}")
    print(f"Best gripper accuracy: {validation_log['gripper_accuracy'].max():.4f}")

## 4. Load Model

In [None]:
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import PeftModel

# The model weights and the processor come from the same hub repository and
# share the same cache directory and remote-code setting.
_hub_kwargs = dict(
    trust_remote_code=True,
    cache_dir=f"{CACHE_DIR}/huggingface",
)

print("Loading base model...")
base_model = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    attn_implementation="eager",
    **_hub_kwargs,
)

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", **_hub_kwargs)

print("Base model loaded.")

In [None]:
# Load fine-tuned model with LoRA adapters
print(f"Loading LoRA adapters from {CHECKPOINT_PATH}...")

if CHECKPOINT_PATH.exists():
    # Wrap the already-loaded base model with the saved adapters; the
    # adapters are loaded frozen because this notebook only runs inference.
    model = PeftModel.from_pretrained(
        base_model,
        str(CHECKPOINT_PATH),
        is_trainable=False,
    )
    model = model.to(device)
    model.eval()
    print("Fine-tuned model loaded.")
    # print_trainable_parameters() prints its own summary and returns None,
    # so it is called directly; wrapping it in print() emitted a stray "None".
    model.print_trainable_parameters()
else:
    # `model` stays undefined in this case, so later cells cannot run until
    # the paths are corrected.
    print(f"ERROR: Checkpoint not found at {CHECKPOINT_PATH}")
    print("Please update RUN_NAME and CHECKPOINT_TYPE in Section 2.")

## 5. Action Tokenizer

In [None]:
class ActionTokenizer:
    """OpenVLA-compatible action tokenizer.

    Continuous action values in [-1, 1] are bucketed against ``n_bins``
    evenly spaced edges and mapped onto the last ``n_bins`` ids of the
    vocabulary: bucket ``k`` (1-based, as returned by ``np.digitize``)
    becomes token id ``vocab_size - k``.

    Args:
        vocab_size: Vocabulary size the token ids are counted back from.
        n_bins: Number of bin edges (and of action token ids).
    """

    def __init__(self, vocab_size=32000, n_bins=256):
        self.vocab_size = vocab_size
        self.n_bins = n_bins
        # n_bins edges give n_bins - 1 intervals, hence n_bins - 1 centers.
        self.bins = np.linspace(-1, 1, n_bins)
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2
        self.action_token_start = vocab_size - n_bins
        self.action_token_end = vocab_size - 1

    def encode(self, action):
        """Return the token id(s) for ``action`` (scalar or array-like).

        Values outside [-1, 1] are clipped before being bucketed.
        """
        action = np.clip(action, -1, 1)
        discretized = np.digitize(action, self.bins)
        return self.vocab_size - discretized

    def decode(self, token_ids):
        """Map token id(s) back to the centers of their action bins.

        Args:
            token_ids: A torch tensor, NumPy array, Python sequence or scalar
                of integer token ids.

        Returns:
            NumPy array (or scalar) of bin-center values with the same shape
            as ``token_ids``.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.detach().cpu().numpy()
        # Coerce lists/tuples (and empty inputs) to an integer array so the
        # arithmetic and fancy indexing below work for every input type.
        token_ids = np.asarray(token_ids, dtype=np.int64)
        discretized = self.vocab_size - token_ids
        # Bucket k maps to center k - 1; clipping keeps ids outside the
        # action range, and the top bucket, inside the available centers.
        indices = np.clip(discretized - 1, 0, len(self.bin_centers) - 1)
        return self.bin_centers[indices]

# Size the action tokenizer from the processor's tokenizer; action token ids
# are counted back from this value.
# NOTE(review): len(tokenizer) counts added special tokens, whereas the
# tokenizer's `vocab_size` attribute does not. Confirm this matches the value
# used during training, otherwise every decoded bin shifts by the difference.
vocab_size = len(processor.tokenizer)
action_tokenizer = ActionTokenizer(vocab_size=vocab_size)
print(f"Action token range: [{action_tokenizer.action_token_start}, {action_tokenizer.action_token_end}]")

## 6. Load Evaluation Data

In [None]:
def transform_action(action):
    """Convert a raw LIBERO action into the convention used for evaluation.

    The first six components are clipped to [-1, 1]. Component 6 (gripper)
    is clipped to [0, 1] and then flipped (``1 - value``).

    Args:
        action: NumPy array with at least 7 entries.

    Returns:
        A new float32 array; the caller's array is not modified.
    """
    converted = action.astype(np.float32)  # astype returns a copy
    np.clip(converted[:6], -1.0, 1.0, out=converted[:6])
    converted[6] = 1.0 - np.clip(converted[6], 0.0, 1.0)
    return converted

def _read_language_instruction(h5_file, default="complete the task"):
    """Return the first language attribute found on the file, decoded to str."""
    for key in ('language_instruction', 'problem_info', 'language'):
        if key in h5_file.attrs:
            lang = h5_file.attrs[key]
            if isinstance(lang, bytes):
                lang = lang.decode('utf-8')
            return lang
    return default


def _find_image_key(obs_group):
    """Return the first known camera key present in an 'obs' group, or None."""
    for key in ('agentview_rgb', 'agentview_image', 'rgb', 'image'):
        if key in obs_group:
            return key
    return None


def _to_seven_dims(action):
    """Zero-pad or truncate an action vector to exactly 7 entries."""
    if len(action) < 7:
        return np.pad(action, (0, 7 - len(action)))
    return action[:7]


def load_validation_samples(data_dir, suite_name, chunk_size=4, val_demos=5, max_samples=500):
    """Load validation samples with chunking.

    Walks every ``*.hdf5`` file below ``data_dir`` in sorted order, takes the
    last ``val_demos`` demos of each file and keeps every ``chunk_size``-th
    timestep of each demo.

    Args:
        data_dir: Root directory searched recursively for HDF5 files.
        suite_name: Suite to evaluate. When at least one file path below
            ``data_dir`` contains this string, only those files are used;
            otherwise all files are used.
        chunk_size: Stride between kept timesteps (must be >= 1).
        val_demos: Number of trailing demos per file used for validation;
            0 or less selects no demos.
        max_samples: Upper bound on the number of returned samples.

    Returns:
        List of dicts with keys 'image' (frame rotated by 180 degrees),
        'action' (7 values processed by ``transform_action``) and 'language'.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    data_dir = Path(data_dir)
    samples = []
    if max_samples <= 0:
        return samples

    # Sorted so the files, and therefore the first max_samples samples,
    # come back in the same order on every machine and run.
    hdf5_files = sorted(data_dir.rglob("*.hdf5"))
    print(f"Found {len(hdf5_files)} HDF5 files")

    # Restrict to the requested suite when its name shows up in the paths;
    # fall back to every file so a flat data directory keeps working.
    if suite_name:
        suite_files = [
            p for p in hdf5_files
            if suite_name in p.relative_to(data_dir).as_posix()
        ]
        if suite_files and len(suite_files) != len(hdf5_files):
            print(f"Using {len(suite_files)} files matching suite '{suite_name}'")
            hdf5_files = suite_files

    if not hdf5_files:
        return samples

    for filepath in tqdm(hdf5_files, desc="Loading validation data"):
        try:
            with h5py.File(filepath, 'r') as f:
                if 'data' not in f:
                    continue

                language = _read_language_instruction(f)

                # NOTE(review): sorted() is lexicographic (demo_10 sorts before
                # demo_2). Kept as-is because the split presumably has to match
                # the one used by the training notebook; confirm.
                demo_keys = sorted(k for k in f['data'].keys() if k.startswith('demo_'))

                # Use last N demos for validation. A plain [-0:] slice would
                # select every demo, so non-positive counts select none.
                val_demo_keys = demo_keys[-val_demos:] if val_demos > 0 else []

                for demo_key in val_demo_keys:
                    demo = f['data'][demo_key]

                    if 'actions' not in demo or 'obs' not in demo:
                        continue

                    img_key = _find_image_key(demo['obs'])
                    if img_key is None:
                        continue

                    actions = demo['actions']
                    images = demo['obs'][img_key]

                    # Apply chunking: keep the frame and action at every
                    # chunk_size-th timestep.
                    for t in range(0, len(actions), chunk_size):
                        image = np.rot90(images[t], k=2)  # 180° rotation
                        action = _to_seven_dims(actions[t])

                        samples.append({
                            'image': image,
                            'action': transform_action(action),
                            'language': language,
                        })

                        if len(samples) >= max_samples:
                            return samples

        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error reading {filepath}: {e}")

    return samples

# Load validation samples
# Uses the same stride (CHUNK_SIZE) and per-file demo count
# (VAL_DEMOS_PER_TASK) configured in Section 2, capped at 500 samples.
val_samples = load_validation_samples(
    LIBERO_DATA_DIR, 
    "libero_spatial",
    chunk_size=CHUNK_SIZE,
    val_demos=VAL_DEMOS_PER_TASK,
    max_samples=500
)

print(f"\nLoaded {len(val_samples)} validation samples")

## 7. Evaluate Base vs Fine-tuned

In [None]:
def predict_action(model, processor, image, instruction, device):
    """Predict a 7-value action for one image and instruction.

    Args:
        model: Model exposing ``generate`` and ``config.pad_token_id``.
        processor: Callable turning (prompt, PIL image) into model inputs.
        image: Image array; cast to uint8 and resized to 224x224.
        instruction: Task description, lower-cased inside the prompt.
        device: Device the input tensors are moved to.

    Returns:
        Array of 7 decoded action values, produced by the module-level
        ``action_tokenizer``.
    """
    # Build the 224x224 PIL input.
    frame = Image.fromarray(image.astype(np.uint8)).resize((224, 224), Image.LANCZOS)

    prompt = f"In: What action should the robot take to {instruction.lower()}?\nOut:"

    # Move every input tensor to the device; pixel values are also cast to bfloat16.
    model_inputs = {}
    for name, tensor in processor(prompt, frame, return_tensors="pt").items():
        tensor = tensor.to(device)
        if name == 'pixel_values':
            tensor = tensor.to(torch.bfloat16)
        model_inputs[name] = tensor

    # Greedy decoding of at most 7 new tokens.
    with torch.no_grad():
        generated = model.generate(
            **model_inputs,
            max_new_tokens=7,
            do_sample=False,
            pad_token_id=model.config.pad_token_id,
        )

    # The last 7 ids of the first sequence are taken as the action tokens.
    # NOTE(review): this assumes generate() emitted a full 7 new tokens.
    action_ids = generated[0, -7:].cpu().numpy()
    return action_tokenizer.decode(action_ids)

In [None]:
def evaluate_model(model, samples, processor, device, model_name="Model", max_samples=200):
    """Evaluate model on validation samples.

    Runs ``predict_action`` on the first ``max_samples`` samples and compares
    the predictions with the stored ground-truth actions.

    Args:
        model: Model to evaluate; switched to eval mode.
        samples: Sequence of dicts with 'image', 'language' and 'action'.
        processor: Processor forwarded to ``predict_action``.
        device: Device forwarded to ``predict_action``.
        model_name: Label used in the progress bar and in warnings.
        max_samples: Maximum number of samples to evaluate.

    Returns:
        Dict with the scalar metrics 'l1_error', 'position_l1',
        'rotation_l1', 'direction_accuracy' and 'gripper_accuracy', the
        arrays 'predictions' and 'ground_truths', plus 'sample_indices'
        (positions in ``samples`` of the rows that were evaluated) and
        'num_failed' (number of samples whose prediction raised).

    Raises:
        RuntimeError: If no sample could be evaluated.
    """
    model.eval()

    predictions = []
    ground_truths = []
    sample_indices = []
    num_failed = 0
    first_error = None

    progress = tqdm(samples[:max_samples], desc=f"Evaluating {model_name}")
    for idx, sample in enumerate(progress):
        try:
            pred = predict_action(model, processor, sample['image'], sample['language'], device)
        except Exception as e:
            # Keep going, but remember the failure instead of hiding it.
            num_failed += 1
            if first_error is None:
                first_error = e
            continue
        predictions.append(pred)
        ground_truths.append(sample['action'])
        sample_indices.append(idx)

    if num_failed:
        print(f"WARNING: {model_name}: skipped {num_failed} sample(s) whose prediction failed "
              f"(first error: {first_error!r})")
    if not predictions:
        raise RuntimeError(
            f"{model_name}: no sample could be evaluated "
            f"({num_failed} failed, {len(samples[:max_samples])} requested)"
        ) from first_error

    predictions = np.array(predictions)
    ground_truths = np.array(ground_truths)

    # Mean absolute error overall, over columns 0-2 and over columns 3-5.
    abs_error = np.abs(predictions - ground_truths)
    l1_error = abs_error.mean()
    position_l1 = abs_error[:, :3].mean()
    rotation_l1 = abs_error[:, 3:6].mean()

    # Direction accuracy: sign agreement on the first three dims, counted
    # only where the ground-truth magnitude exceeds the threshold.
    threshold = 0.02
    gt_xyz = ground_truths[:, :3]
    significant = np.abs(gt_xyz) > threshold
    same_sign = np.sign(gt_xyz) == np.sign(predictions[:, :3])
    dir_total = significant.sum()
    dir_correct = (same_sign & significant).sum()
    # 0.5 is reported when no ground-truth value is significant.
    direction_accuracy = dir_correct / dir_total if dir_total > 0 else 0.5

    # Gripper accuracy: both sides are binarised at 0.5 before comparing.
    gripper_threshold = 0.5
    gt_gripper = (ground_truths[:, 6] > gripper_threshold).astype(int)
    pred_gripper = (predictions[:, 6] > gripper_threshold).astype(int)
    gripper_accuracy = (gt_gripper == pred_gripper).mean()

    return {
        'l1_error': l1_error,
        'position_l1': position_l1,
        'rotation_l1': rotation_l1,
        'direction_accuracy': direction_accuracy,
        'gripper_accuracy': gripper_accuracy,
        'predictions': predictions,
        'ground_truths': ground_truths,
        'sample_indices': np.array(sample_indices, dtype=int),
        'num_failed': num_failed,
    }

In [None]:
# Evaluate base model (disable LoRA adapters)
print("Evaluating BASE model (LoRA disabled)...")
model.disable_adapter_layers()
base_results = evaluate_model(model, val_samples, processor, device, "Base Model", max_samples=200)

# Report the headline metrics, one per line.
print("\nBase Model Results:")
for _label, _key in (
    ("L1 Error", 'l1_error'),
    ("Position L1", 'position_l1'),
    ("Direction Accuracy", 'direction_accuracy'),
    ("Gripper Accuracy", 'gripper_accuracy'),
):
    print(f"  {_label}: {base_results[_key]:.4f}")

In [None]:
# Evaluate fine-tuned model (enable LoRA adapters)
print("Evaluating FINE-TUNED model (LoRA enabled)...")
model.enable_adapter_layers()
finetuned_results = evaluate_model(model, val_samples, processor, device, "Fine-tuned Model", max_samples=200)

# Report the headline metrics, one per line.
print("\nFine-tuned Model Results:")
for _label, _key in (
    ("L1 Error", 'l1_error'),
    ("Position L1", 'position_l1'),
    ("Direction Accuracy", 'direction_accuracy'),
    ("Gripper Accuracy", 'gripper_accuracy'),
):
    print(f"  {_label}: {finetuned_results[_key]:.4f}")

In [None]:
# Comparison summary
print("\n" + "="*60)
print(" COMPARISON: Base vs Fine-tuned (with Action Chunking)")
print("="*60)
print(f"\nChunk size: {CHUNK_SIZE} (20 Hz → {20/CHUNK_SIZE:.1f} Hz)")
print("\n" + "-"*60)
print(f"{'Metric':<25} {'Base':>12} {'Fine-tuned':>12} {'Change':>12}")
print("-"*60)

metrics = ['l1_error', 'position_l1', 'direction_accuracy', 'gripper_accuracy']
labels = ['L1 Error', 'Position L1', 'Direction Accuracy', 'Gripper Accuracy']

for metric, label in zip(metrics, labels):
    base_val = base_results[metric]
    ft_val = finetuned_results[metric]

    if 'accuracy' in metric:
        # Accuracies: absolute change, higher is better.
        change = ft_val - base_val
        status = '✅' if change > 0 else ('⚠️' if change < -0.05 else '→')
        # The '+' format flag already prints the sign; every column is 12
        # characters wide so the rows line up with the header.
        print(f"{label:<25} {base_val:>12.1%} {ft_val:>12.1%} {change:>+12.1%} {status}")
    else:
        # Errors: relative change in percent, lower is better. A zero
        # baseline has no meaningful relative change, so report NaN.
        change = (ft_val - base_val) / base_val * 100 if base_val != 0 else float('nan')
        status = '✅' if change < 0 else ('⚠️' if change > 10 else '→')
        print(f"{label:<25} {base_val:>12.4f} {ft_val:>12.4f} {change:>+11.1f}% {status}")

print("-"*60)

## 8. Visualizations

In [None]:
# Plot prediction distributions: one histogram panel per action dimension
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

dim_names = ['dx', 'dy', 'dz', 'rx', 'ry', 'rz', 'gripper']

# axes.flat walks the grid row by row, so the seven dimensions fill the
# first seven panels in order.
for i, (name, ax) in enumerate(zip(dim_names, axes.flat)):
    gt = base_results['ground_truths'][:, i]
    base_pred = base_results['predictions'][:, i]
    ft_pred = finetuned_results['predictions'][:, i]

    for values, legend_label in (
        (gt, 'Ground Truth'),
        (base_pred, 'Base'),
        (ft_pred, 'Fine-tuned'),
    ):
        ax.hist(values, bins=30, alpha=0.5, label=legend_label, density=True)

    ax.set_title(f'{name}')
    ax.legend(fontsize=8)
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')

# Hide unused subplot
axes[1, 3].axis('off')

plt.suptitle('Action Distribution: Ground Truth vs Base vs Fine-tuned', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plots: Predicted vs Ground Truth
fig, axes = plt.subplots(2, 3, figsize=(14, 10))

# Row 0 shows action columns 0-2, row 1 shows action columns 3-5.
for row, row_names in enumerate((['dx', 'dy', 'dz'], ['rx', 'ry', 'rz'])):
    for col, (name, ax) in enumerate(zip(row_names, axes[row])):
        dim = 3 * row + col
        gt = base_results['ground_truths'][:, dim]
        base_pred = base_results['predictions'][:, dim]
        ft_pred = finetuned_results['predictions'][:, dim]

        ax.scatter(gt, base_pred, alpha=0.3, s=20, label='Base', color='blue')
        ax.scatter(gt, ft_pred, alpha=0.3, s=20, label='Fine-tuned', color='orange')
        # Diagonal reference: points on it are perfect predictions.
        ax.plot([-1, 1], [-1, 1], 'k--', label='Perfect')
        ax.set_xlabel(f'Ground Truth {name}')
        ax.set_ylabel(f'Predicted {name}')
        ax.set_title(f'{name}: Predicted vs Ground Truth')
        ax.legend()
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)
        ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Error analysis: share of near-zero values per action dimension
threshold = 0.05

print("Near-Zero Action Analysis (|action| < 0.05)")
print("="*60)
print(f"{'Dimension':<10} {'GT %':>10} {'Base %':>10} {'FT %':>10}")
print("-"*60)

# Ground truth, base predictions and fine-tuned predictions, in column order.
_sources = (
    base_results['ground_truths'],
    base_results['predictions'],
    finetuned_results['predictions'],
)

for i, name in enumerate(dim_names[:6]):
    # Percentage of entries in column i whose magnitude is below the threshold.
    gt_near_zero, base_near_zero, ft_near_zero = [
        (np.abs(values[:, i]) < threshold).mean() * 100 for values in _sources
    ]
    print(f"{name:<10} {gt_near_zero:>9.1f}% {base_near_zero:>9.1f}% {ft_near_zero:>9.1f}%")

print("-"*60)
print("\nNote: Mode collapse is indicated when model predicts more near-zero")
print("actions than ground truth. Action chunking should reduce this.")

## 9. Sample Predictions

In [None]:
# Show sample predictions
np.random.seed(42)

# Only rows that both evaluations produced can be displayed: the prediction
# arrays cover at most the evaluated subset, which can be smaller than
# val_samples, so drawing indices from len(val_samples) could go out of range.
n_evaluated = min(
    len(val_samples),
    len(base_results['predictions']),
    len(finetuned_results['predictions']),
)
sample_indices = np.random.choice(n_evaluated, min(6, n_evaluated), replace=False)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for sample_idx, ax in zip(sample_indices, axes.flat):
    # Row i of the prediction arrays corresponds to val_samples[i] as long as
    # evaluate_model did not skip any sample.
    sample = val_samples[sample_idx]

    # Show image
    ax.imshow(sample['image'])
    ax.axis('off')

    # Get predictions
    gt = sample['action']
    base_pred = base_results['predictions'][sample_idx]
    ft_pred = finetuned_results['predictions'][sample_idx]

    # Create title with key metrics (first three action values only)
    title = f"Task: {sample['language'][:40]}...\n"
    title += f"GT dx/dy/dz: [{gt[0]:.2f}, {gt[1]:.2f}, {gt[2]:.2f}]\n"
    title += f"Base: [{base_pred[0]:.2f}, {base_pred[1]:.2f}, {base_pred[2]:.2f}]\n"
    title += f"FT: [{ft_pred[0]:.2f}, {ft_pred[1]:.2f}, {ft_pred[2]:.2f}]"

    ax.set_title(title, fontsize=9)

# Hide panels left over when fewer than six samples are available.
for ax in axes.ravel()[len(sample_indices):]:
    ax.axis('off')

plt.suptitle('Sample Predictions: Base vs Fine-tuned', fontsize=14)
plt.tight_layout()
plt.show()

## 10. Save Results

In [None]:
# Save evaluation results
# Scalar metrics copied from each results dict into the JSON report.
_metric_keys = ('l1_error', 'position_l1', 'rotation_l1', 'direction_accuracy', 'gripper_accuracy')

eval_results = {
    'chunk_size': CHUNK_SIZE,
    'effective_hz': 20 / CHUNK_SIZE,
    # Number of samples the metrics were computed on. This can be smaller
    # than the number of loaded samples, which is recorded separately.
    'num_samples': int(len(finetuned_results['predictions'])),
    'num_loaded_samples': len(val_samples),
    'base_model': {key: float(base_results[key]) for key in _metric_keys},
    'finetuned_model': {key: float(finetuned_results[key]) for key in _metric_keys},
    'improvement': {
        # Relative L1 reduction in percent (positive means the fine-tuned model is better).
        'l1_error_reduction': float((base_results['l1_error'] - finetuned_results['l1_error']) / base_results['l1_error'] * 100),
        'direction_accuracy_change': float(finetuned_results['direction_accuracy'] - base_results['direction_accuracy']),
        'gripper_accuracy_change': float(finetuned_results['gripper_accuracy'] - base_results['gripper_accuracy']),
    }
}

eval_output_path = RUN_DIR / "evaluation_results.json"
with open(eval_output_path, 'w', encoding='utf-8') as f:
    json.dump(eval_results, f, indent=2)

print(f"Evaluation results saved to: {eval_output_path}")
print("\nSummary:")
print(json.dumps(eval_results['improvement'], indent=2))

## 11. Conclusion

This notebook evaluated the fine-tuned OpenVLA model trained with action chunking.

**Key Questions to Answer**:
1. Did L1 error improve? (Lower is better)
2. Did direction accuracy improve? (Higher is better, should be >50%)
3. Did gripper accuracy improve? (Higher is better)
4. Is there less mode collapse? (Near-zero % closer to GT)

**If direction accuracy improved**:
- Action chunking successfully addressed the control frequency mismatch
- The model learned meaningful LIBERO-specific behaviors

**If direction accuracy is still poor**:
- Try different chunk sizes (e.g., 5 or 7 for ~4 Hz or ~3 Hz)
- Consider action scaling instead of temporal subsampling
- Try longer training or different learning rates