# 13. Evaluate Fine-tuned OpenVLA on LIBERO

**Goal**: Evaluate a fine-tuned OpenVLA model on LIBERO demonstrations.

## What We'll Cover
1. Load fine-tuned model (with LoRA weights)
2. Run inference on frames sampled from LIBERO demonstrations (these are held-out only if the demos were excluded from fine-tuning)
3. Compute evaluation metrics
4. Visualize predictions vs ground truth
5. Compare with base model performance

---
## 1. Setup and Version Checks

In [None]:
import os
import sys
import subprocess

# Pick the scratch root: $PSCRATCH wins, then $SCRATCH, then a fixed fallback.
# An empty value is treated the same as an unset variable.
SCRATCH = (
    os.environ.get('PSCRATCH')
    or os.environ.get('SCRATCH')
    or "/home/idies/workspace/Temporary/dpark1/scratch"
)

# Every location used by the notebook hangs off the scratch root.
CACHE_DIR = f"{SCRATCH}/.cache"
LIBERO_DATA_DIR = f"{SCRATCH}/libero_data"
FINETUNED_DIR = f"{SCRATCH}/openvla_finetuned"

# Redirect the Hugging Face and torch caches into the scratch cache and set
# the TF_CPP_MIN_LOG_LEVEL variable before any heavy library is imported.
os.environ.update({
    'HF_HOME': f"{CACHE_DIR}/huggingface",
    'TORCH_HOME': f"{CACHE_DIR}/torch",
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

print(f"Base directory: {SCRATCH}")
print(f"Fine-tuned models: {FINETUNED_DIR}")
print(f"LIBERO data: {LIBERO_DATA_DIR}")

In [None]:
# Version checks (CRITICAL)
import transformers
import tokenizers

# The fix hint below pins BOTH packages, so verify both of them rather than
# only transformers.
EXPECTED_VERSIONS = {
    "transformers": "4.40.1",
    "tokenizers": "0.19.1",
}
installed_versions = {
    "transformers": transformers.__version__,
    "tokenizers": tokenizers.__version__,
}

for package, found in installed_versions.items():
    print(f"{package}: {found}")

# Packages whose installed version differs from the pinned one.
mismatched = [
    package
    for package, found in installed_versions.items()
    if found != EXPECTED_VERSIONS[package]
]

if mismatched:
    for package in mismatched:
        print(
            f"\n[WARNING] {package} version mismatch! "
            f"(found {installed_versions[package]}, expected {EXPECTED_VERSIONS[package]})"
        )
    print("Fix: pip install transformers==4.40.1 tokenizers==0.19.1")
else:
    print("\n[OK] Version check passed")

In [None]:
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from pathlib import Path
import h5py
from tqdm.notebook import tqdm
import json

import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Report the torch build and whether a CUDA device is visible; the device
# name is only queried when CUDA is actually available.
has_cuda = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

---
## 2. Action Tokenizer

In [None]:
class ActionTokenizer:
    """OpenVLA-compatible action tokenizer.

    Continuous action values in [-1, 1] are bucketed against ``n_bins`` evenly
    spaced edges. Bucket ``k`` (1-based, as returned by ``np.digitize``) is
    mapped to token id ``vocab_size - k``, so action tokens occupy the last
    ``n_bins`` ids of the vocabulary.
    """

    def __init__(self, vocab_size=32000, n_bins=256):
        """Build the bin edges and centers.

        Args:
            vocab_size: Size of the vocabulary the action ids are taken from.
            n_bins: Number of bin edges spanning [-1, 1].
        """
        self.vocab_size = vocab_size
        self.n_bins = n_bins
        self.bins = np.linspace(-1, 1, n_bins)
        # n_bins edges give n_bins - 1 centers; decode() clamps into this range.
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2
        self.action_token_start = vocab_size - n_bins

    def encode(self, action):
        """Map continuous action values to token ids.

        Args:
            action: Scalar or array of action values; values outside [-1, 1]
                are clipped first.

        Returns:
            Token ids with the same shape as ``action``.
        """
        action = np.clip(action, -1, 1)
        discretized = np.digitize(action, self.bins)
        return self.vocab_size - discretized

    def decode(self, token_ids):
        """Map token ids back to bin-center action values.

        Args:
            token_ids: A ``torch.Tensor``, NumPy array, or plain sequence
                (list/tuple, possibly empty) of token ids.

        Returns:
            NumPy array of bin centers. Ids that fall outside the action
            range are clamped to the first/last bin center.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.cpu().numpy()
        # Plain lists do not support ``int - list`` and an empty list would
        # become a float array that cannot be used as an index, so normalise
        # everything to an integer array first.
        token_ids = np.asarray(token_ids, dtype=np.int64)
        discretized = self.vocab_size - token_ids
        indices = np.clip(discretized - 1, 0, len(self.bin_centers) - 1)
        return self.bin_centers[indices]


action_tokenizer = ActionTokenizer()
print("Action tokenizer ready")
print(f"Token range: [{action_tokenizer.action_token_start}, {action_tokenizer.vocab_size - 1}]")

---
## 3. Load Fine-tuned Model

In [None]:
# List available checkpoints
finetuned_path = Path(FINETUNED_DIR)

if finetuned_path.exists():
    runs = sorted(finetuned_path.iterdir())
    print("Available fine-tuned models:")
    for i, run in enumerate(runs):
        if run.is_dir():
            checkpoints = list(run.glob("checkpoint-*")) + list(run.glob("final")) + list(run.glob("best"))
            print(f"  [{i}] {run.name}")
            for cp in checkpoints[-3:]:
                print(f"      - {cp.name}")
else:
    print(f"No fine-tuned models found in {FINETUNED_DIR}")
    print("Run finetune_openvla_libero.py first")

In [None]:
# Set checkpoint path
# UPDATE THIS to your checkpoint path
CHECKPOINT_PATH = None  # e.g., f"{FINETUNED_DIR}/libero_spatial_20241230_120000/best"


def _checkpoint_step(path):
    """Return the numeric step of a ``checkpoint-<step>`` entry, or -1 if it has none."""
    suffix = path.name.rpartition("-")[2]
    return int(suffix) if suffix.isdigit() else -1


# Auto-detect latest checkpoint if not specified
if CHECKPOINT_PATH is None and finetuned_path.exists():
    # Only directories can be runs; a stray file could otherwise sort last
    # and be picked as the "latest run".
    runs = sorted(entry for entry in finetuned_path.iterdir() if entry.is_dir())
    if runs:
        latest_run = runs[-1]
        # Prefer 'best', then 'final', then latest checkpoint
        if (latest_run / "best").exists():
            CHECKPOINT_PATH = str(latest_run / "best")
        elif (latest_run / "final").exists():
            CHECKPOINT_PATH = str(latest_run / "final")
        else:
            # Order by numeric step: a plain name sort would rank
            # "checkpoint-1000" before "checkpoint-200".
            checkpoints = sorted(
                latest_run.glob("checkpoint-*"),
                key=lambda entry: (_checkpoint_step(entry), entry.name),
            )
            if checkpoints:
                CHECKPOINT_PATH = str(checkpoints[-1])

if CHECKPOINT_PATH:
    print(f"Using checkpoint: {CHECKPOINT_PATH}")
else:
    print("No checkpoint found. Will evaluate BASE model only.")

In [None]:
from transformers import AutoModelForVision2Seq, AutoProcessor

# Run on the first CUDA device when one is visible, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# The processor and the base weights come from the same hub repository and
# share one cache directory.
BASE_MODEL_ID = "openvla/openvla-7b"
hf_cache_dir = f"{CACHE_DIR}/huggingface"

# Load processor (same for base and fine-tuned)
print("Loading processor...")
processor = AutoProcessor.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    cache_dir=hf_cache_dir,
)

# Load the base model in bfloat16.
print("Loading model...")
base_model = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    cache_dir=hf_cache_dir,
    low_cpu_mem_usage=True,
    attn_implementation="eager",
)

# LoRA weights are applied only when a checkpoint path is set AND exists on
# disk; in every other case the base model is evaluated as-is.
adapter_available = bool(CHECKPOINT_PATH) and Path(CHECKPOINT_PATH).exists()
if not adapter_available:
    model = base_model
    print("Using BASE model (no fine-tuning)")
else:
    print(f"Loading LoRA weights from {CHECKPOINT_PATH}...")
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, CHECKPOINT_PATH)
    print("LoRA weights loaded!")
    # Optional: `model = model.merge_and_unload()` merges the adapter into
    # the base weights for faster inference.

model = model.to(device).eval()
print(f"Model loaded on {device}")

---
## 4. Load Test Data

In [None]:
def find_libero_files(data_dir, suite_name="libero_spatial"):
    """Return the sorted LIBERO ``.hdf5`` files found below *data_dir*.

    Files that sit directly inside a directory whose name contains
    *suite_name* (at any depth) are preferred. When there are none, every
    ``.hdf5`` file below *data_dir* is returned instead. The result is an
    empty list when nothing matches.
    """
    root = Path(data_dir)

    suite_matches = sorted(root.rglob(f"*{suite_name}*/*.hdf5"))
    if suite_matches:
        return suite_matches

    # Fallback: any HDF5 file anywhere under the data directory.
    return sorted(root.rglob("*.hdf5"))

def _read_instruction(h5_file):
    """Best-effort lookup of the language instruction stored in an HDF5 file.

    The file-level attributes are checked first, then the attributes of the
    ``data`` group. Within each, ``language_instruction`` wins over
    ``problem_info``. Returns ``'unknown'`` when neither key is present.
    """
    attr_sources = [h5_file.attrs]
    if 'data' in h5_file:
        attr_sources.append(h5_file['data'].attrs)

    for attrs in attr_sources:
        for key in ('language_instruction', 'problem_info'):
            if key not in attrs:
                continue
            value = attrs[key]
            if isinstance(value, bytes):
                value = value.decode('utf-8')
            if key == 'problem_info':
                # NOTE(review): problem_info is presumably a JSON object that
                # carries a 'language_instruction' field - confirm against the
                # dataset. If it is not, the raw value is used unchanged.
                try:
                    parsed = json.loads(value)
                except (TypeError, ValueError):
                    parsed = None
                if isinstance(parsed, dict) and 'language_instruction' in parsed:
                    value = parsed['language_instruction']
            return value
    return 'unknown'


def _find_image_key(obs_group):
    """Return the first known camera key present in *obs_group*, or None."""
    for candidate in ('agentview_rgb', 'agentview_image', 'rgb', 'image'):
        if candidate in obs_group:
            return candidate
    return None


def load_test_samples(data_dir, suite_name="libero_spatial", n_samples=50, seed=42):
    """Load a subset of samples for evaluation.

    Up to five demos are read from each HDF5 file, and from each demo the
    first, middle and last frame (without duplicates for very short demos).

    Args:
        data_dir: Directory searched for LIBERO HDF5 files.
        suite_name: Suite name passed on to ``find_libero_files``.
        n_samples: Maximum number of samples returned; a random subset is
            drawn when more were collected.
        seed: Seed for the random subset.

    Returns:
        A list of dicts with keys ``'image'`` (PIL image, rotated by 180
        degrees), ``'action'`` (float32 array padded or truncated to length
        7), ``'instruction'`` and ``'source'``. Empty when no files are found.
    """
    # A private legacy RandomState draws the same subset as seeding the
    # global generator would, without changing global random state.
    rng = np.random.RandomState(seed)

    files = find_libero_files(data_dir, suite_name)
    if not files:
        print(f"No files found in {data_dir}")
        return []

    print(f"Found {len(files)} HDF5 files")

    all_samples = []

    for filepath in tqdm(files, desc="Loading"):
        try:
            with h5py.File(filepath, 'r') as f:
                if 'data' not in f:
                    continue

                language = _read_instruction(f)
                demo_keys = [k for k in f['data'].keys() if k.startswith('demo_')]

                # Sample a few frames from each demo
                for demo_key in demo_keys[:5]:  # Limit demos per file
                    demo = f['data'][demo_key]

                    if 'actions' not in demo or 'obs' not in demo:
                        continue

                    img_key = _find_image_key(demo['obs'])
                    if img_key is None:
                        continue

                    n_steps = len(demo['actions'])
                    if n_steps == 0:
                        # An empty demo would raise on indexing and abort the
                        # remaining demos of this file.
                        continue

                    # Start, middle and end frame; the set removes duplicates
                    # that occur for demos with fewer than three steps.
                    for t in sorted({0, n_steps // 2, n_steps - 1}):
                        image = demo['obs'][img_key][t]
                        image = np.rot90(image, k=2)  # LIBERO rotation

                        # Force the action to exactly 7 values.
                        action = demo['actions'][t]
                        if len(action) < 7:
                            action = np.pad(action, (0, 7 - len(action)))
                        else:
                            action = action[:7]

                        all_samples.append({
                            'image': Image.fromarray(image.astype(np.uint8)),
                            'action': action.astype(np.float32),
                            'instruction': language,
                            'source': f"{filepath.name}/{demo_key}/t{t}",
                        })
        except Exception as e:
            # Deliberate best effort: report the unreadable file and carry on.
            print(f"Error reading {filepath}: {e}")

    # Randomly sample
    if len(all_samples) > n_samples:
        indices = rng.choice(len(all_samples), n_samples, replace=False)
        all_samples = [all_samples[i] for i in indices]

    print(f"Loaded {len(all_samples)} test samples")
    return all_samples


# Load test samples
SUITE_NAME = "libero_spatial"  # Change as needed
test_samples = load_test_samples(LIBERO_DATA_DIR, SUITE_NAME, n_samples=100)

---
## 5. Run Evaluation

In [None]:
@torch.no_grad()
def predict_action(model, processor, image, instruction, device):
    """Predict one action for an image/instruction pair.

    Args:
        model: Model exposing ``generate``.
        processor: Callable that turns ``(prompt, image)`` into model inputs
            and exposes ``tokenizer.pad_token_id``.
        image: Input image handed to the processor.
        instruction: Task description; it is lower-cased inside the prompt.
        device: Device the input tensors are moved to.

    Returns:
        ``(action, tokens)``: the last 7 generated ids decoded by the
        module-level ``action_tokenizer``, and those ids as a Python list.
        NOTE(review): the decoded values are bin centers in [-1, 1]; whether
        they need un-normalizing to match dataset actions is not visible
        here - confirm against the fine-tuning script.
    """
    prompt = f"In: What action should the robot take to {instruction.lower()}?\nOut:"

    encoded = processor(prompt, image, return_tensors="pt")
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)

    # The prompt is made to end with token 29871 (the "empty token" of the
    # OpenVLA convention), growing the attention mask by one position too.
    if inputs['input_ids'][0, -1] != 29871:
        trailing = torch.tensor([[29871]], device=device)
        inputs['input_ids'] = torch.cat([inputs['input_ids'], trailing], dim=1)
        if 'attention_mask' in inputs:
            mask = inputs['attention_mask']
            extra = torch.ones((1, 1), device=device, dtype=mask.dtype)
            inputs['attention_mask'] = torch.cat([mask, extra], dim=1)

    # Greedy decoding of exactly seven new tokens.
    generated = model.generate(
        **inputs,
        max_new_tokens=7,
        do_sample=False,
        pad_token_id=processor.tokenizer.pad_token_id,
    )

    action_tokens = generated[0, -7:]
    return action_tokenizer.decode(action_tokens), action_tokens.tolist()

print("Inference function ready")

In [None]:
# Evaluate every test sample and collect per-sample metrics in `results`.
results = []

print(f"Evaluating {len(test_samples)} samples...")
print("="*60)

for step, sample in enumerate(tqdm(test_samples), start=1):
    pred_action, pred_tokens = predict_action(
        model, processor, sample['image'], sample['instruction'], device
    )

    # The ground truth is only clipped to [-1, 1] (no rescaling) so that it
    # shares the value range of the decoded prediction.
    gt_action_norm = np.clip(sample['action'], -1, 1)

    # Mean absolute error over all action dimensions.
    l1_error = np.abs(pred_action - gt_action_norm).mean()

    # Sign agreement, counted only on dimensions whose ground-truth magnitude
    # exceeds 0.05; a sample without such a dimension scores 1.0.
    significant_mask = np.abs(gt_action_norm) > 0.05
    if significant_mask.any():
        same_sign = np.sign(pred_action) == np.sign(gt_action_norm)
        sign_accuracy = same_sign[significant_mask].mean()
    else:
        sign_accuracy = 1.0

    results.append({
        'instruction': sample['instruction'],
        'pred_action': pred_action,
        'gt_action': gt_action_norm,
        'pred_tokens': pred_tokens,
        'l1_error': l1_error,
        'sign_accuracy': sign_accuracy,
    })

    # Running averages every 20 samples.
    if step % 20 == 0:
        avg_l1 = np.mean([entry['l1_error'] for entry in results])
        avg_sign = np.mean([entry['sign_accuracy'] for entry in results])
        print(f"[{step}/{len(test_samples)}] L1: {avg_l1:.4f}, Sign: {avg_sign:.3f}")

In [None]:
# Compute summary statistics
print("\n" + "="*60)
print(" EVALUATION RESULTS")
print("="*60)

# These two lists are reused by the plotting and summary cells further down.
l1_errors = [r['l1_error'] for r in results]
sign_accs = [r['sign_accuracy'] for r in results]

print(f"\nSamples evaluated: {len(results)}")

if not results:
    # np.min/np.max raise ValueError on empty input and the means would be
    # NaN, so report the situation instead of failing with a cryptic error.
    print("\nNo results to summarize - check that test samples were loaded and evaluated.")
else:
    print(f"\nL1 Error:")
    print(f"  Mean: {np.mean(l1_errors):.4f}")
    print(f"  Std:  {np.std(l1_errors):.4f}")
    print(f"  Min:  {np.min(l1_errors):.4f}")
    print(f"  Max:  {np.max(l1_errors):.4f}")

    print(f"\nSign Accuracy:")
    print(f"  Mean: {np.mean(sign_accs):.3f}")
    print(f"  Std:  {np.std(sign_accs):.3f}")

    # Per-dimension analysis: stack predictions and ground truth into
    # (n_samples, n_dims) arrays and average the absolute error per column.
    print(f"\nPer-Dimension L1 Error:")
    dim_names = ['dx', 'dy', 'dz', 'rx', 'ry', 'rz', 'gripper']
    predicted = np.stack([r['pred_action'] for r in results])
    expected = np.stack([r['gt_action'] for r in results])
    per_dim_l1 = np.abs(predicted - expected).mean(axis=0)
    for name, dim_error in zip(dim_names, per_dim_l1):
        print(f"  {name}: {dim_error:.4f}")

---
## 6. Visualize Predictions

In [None]:
def visualize_prediction(sample, result, ax_img, ax_action):
    """Draw one prediction: the input image and a grouped bar chart.

    Args:
        sample: Test sample dict; its ``'image'`` entry is displayed.
        result: Result dict providing ``'l1_error'``, ``'gt_action'`` and
            ``'pred_action'``.
        ax_img: Axes that receives the image, titled with the L1 error.
        ax_action: Axes that receives ground truth vs. predicted bars.
    """
    # Show image
    ax_img.imshow(sample['image'])
    ax_img.set_title(f"L1: {result['l1_error']:.3f}", fontsize=10)
    ax_img.axis('off')

    # Show action comparison: two bars per action dimension, side by side.
    dim_names = ['dx', 'dy', 'dz', 'rx', 'ry', 'rz', 'grip']
    x = np.arange(7)
    width = 0.35

    ax_action.bar(x - width/2, result['gt_action'], width, label='Ground Truth', color='blue', alpha=0.7)
    ax_action.bar(x + width/2, result['pred_action'], width, label='Predicted', color='orange', alpha=0.7)

    ax_action.set_xticks(x)
    ax_action.set_xticklabels(dim_names, fontsize=8)
    ax_action.set_ylim(-1.2, 1.2)
    ax_action.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
    ax_action.legend(fontsize=7, loc='upper right')

# Indices into `results`, ordered from lowest to highest L1 error.
sorted_by_error = sorted(range(len(results)), key=lambda i: results[i]['l1_error'])

# Best predictions (lowest error)
fig, axes = plt.subplots(3, 4, figsize=(16, 10))
fig.suptitle('Best Predictions (Lowest L1 Error)', fontsize=14)

# The grid has six image/chart pairs; with fewer than six results the
# remaining pairs are blanked instead of raising IndexError.
best_first = sorted_by_error[:6]
for slot in range(6):
    row, pair = divmod(slot, 2)
    ax_img, ax_action = axes[row, 2 * pair], axes[row, 2 * pair + 1]
    if slot < len(best_first):
        idx = best_first[slot]
        visualize_prediction(test_samples[idx], results[idx], ax_img, ax_action)
    else:
        ax_img.axis('off')
        ax_action.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Worst predictions (highest error)
fig, axes = plt.subplots(3, 4, figsize=(16, 10))
fig.suptitle('Worst Predictions (Highest L1 Error)', fontsize=14)

# `sorted_by_error` is ascending, so walk it backwards. The grid has six
# image/chart pairs; with fewer than six results the remaining pairs are
# blanked instead of raising IndexError.
worst_first = sorted_by_error[::-1][:6]
for slot in range(6):
    row, pair = divmod(slot, 2)
    ax_img, ax_action = axes[row, 2 * pair], axes[row, 2 * pair + 1]
    if slot < len(worst_first):
        idx = worst_first[slot]
        visualize_prediction(test_samples[idx], results[idx], ax_img, ax_action)
    else:
        ax_img.axis('off')
        ax_action.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Error distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# L1 Error histogram
axes[0].hist(l1_errors, bins=30, edgecolor='black', alpha=0.7)
axes[0].axvline(np.mean(l1_errors), color='r', linestyle='--', label=f'Mean: {np.mean(l1_errors):.3f}')
axes[0].set_xlabel('L1 Error')
axes[0].set_ylabel('Count')
axes[0].set_title('L1 Error Distribution')
axes[0].legend()

# Sign accuracy histogram
axes[1].hist(sign_accs, bins=20, edgecolor='black', alpha=0.7, color='green')
axes[1].axvline(np.mean(sign_accs), color='r', linestyle='--', label=f'Mean: {np.mean(sign_accs):.3f}')
axes[1].set_xlabel('Sign Accuracy')
axes[1].set_ylabel('Count')
axes[1].set_title('Sign Accuracy Distribution')
axes[1].legend()

plt.tight_layout()
plt.show()

---
## 7. Check Token Distribution

In [None]:
# Analyze predicted tokens
all_tokens = [t for r in results for t in r['pred_tokens']]

# Valid action ids, taken from the tokenizer so the check follows its
# configuration (same bounds the tokenizer cell prints).
token_lo = action_tokenizer.action_token_start
token_hi = action_tokenizer.vocab_size - 1

if not all_tokens:
    # min()/max() raise ValueError on an empty sequence.
    print("No predicted tokens to analyze - run the evaluation cell first.")
else:
    print("Token Statistics:")
    print(f"  Min token: {min(all_tokens)}")
    print(f"  Max token: {max(all_tokens)}")
    print(f"  Expected range: [{token_lo}, {token_hi}]")

    # Check if all tokens are in valid action range
    in_range = all(token_lo <= t <= token_hi for t in all_tokens)
    print(f"\nAll tokens in action range: {in_range}")

    # Distinct 7-token outputs across all samples
    unique_tokens = {tuple(r['pred_tokens']) for r in results}
    print(f"Unique output patterns: {len(unique_tokens)} / {len(results)}")

    if len(results) < 2:
        # A single sample always yields exactly one pattern, which says
        # nothing about diversity.
        print("\n[INFO] Need at least 2 samples to judge output diversity.")
    elif len(unique_tokens) == 1:
        print("\n[WARNING] Model produces SAME output for all inputs!")
        print("This suggests inference issues.")
    elif len(unique_tokens) < len(results) // 2:
        print("\n[WARNING] Low output diversity.")
    else:
        print("\n[OK] Model produces diverse outputs.")

---
## 8. Summary and Interpretation

In [None]:
print("="*60)
print(" EVALUATION SUMMARY")
print("="*60)

# Reused by the cell that saves the results.
mean_l1 = np.mean(l1_errors)
mean_sign = np.mean(sign_accs)

# LoRA weights are only loaded when the checkpoint path is set AND exists,
# so apply the same condition here; otherwise a stale path would be reported
# as "Fine-tuned" although the base model was evaluated.
used_finetuned = bool(CHECKPOINT_PATH) and Path(CHECKPOINT_PATH).exists()

print(f"\nModel: {'Fine-tuned' if used_finetuned else 'Base'}")
if used_finetuned:
    print(f"Checkpoint: {Path(CHECKPOINT_PATH).name}")
print(f"Suite: {SUITE_NAME}")
print(f"Samples: {len(results)}")

print(f"\nMetrics:")
print(f"  Mean L1 Error:     {mean_l1:.4f}")
print(f"  Mean Sign Accuracy: {mean_sign:.3f}")

print(f"\nInterpretation:")
if not results:
    # With no samples both means are NaN; every threshold comparison would be
    # False and the verdict would misleadingly read "NEEDS IMPROVEMENT".
    print("  No samples were evaluated - nothing to interpret")
else:
    if mean_l1 < 0.15:
        print("  [EXCELLENT] L1 error < 0.15 indicates strong action prediction")
    elif mean_l1 < 0.25:
        print("  [GOOD] L1 error < 0.25 indicates reasonable action prediction")
    elif mean_l1 < 0.35:
        print("  [MODERATE] L1 error < 0.35 suggests model is learning")
    else:
        print("  [NEEDS IMPROVEMENT] L1 error >= 0.35 suggests more training needed")

    if mean_sign > 0.8:
        print("  [EXCELLENT] Sign accuracy > 80% - model predicts correct directions")
    elif mean_sign > 0.6:
        print("  [GOOD] Sign accuracy > 60% - model generally predicts correct directions")
    else:
        print("  [NEEDS IMPROVEMENT] Sign accuracy <= 60% - direction prediction needs work")

print("\n" + "="*60)

---
## 9. Save Results

In [None]:
import pickle

# Save evaluation results
results_path = f"{CACHE_DIR}/evaluation_results.pkl"

# LoRA weights are only loaded when the checkpoint path is set AND exists,
# so record the model kind with the same condition instead of trusting a
# possibly stale CHECKPOINT_PATH.
used_finetuned = bool(CHECKPOINT_PATH) and Path(CHECKPOINT_PATH).exists()

save_data = {
    'model': 'finetuned' if used_finetuned else 'base',
    'checkpoint': CHECKPOINT_PATH if used_finetuned else None,
    'suite': SUITE_NAME,
    'n_samples': len(results),
    'mean_l1_error': mean_l1,
    'std_l1_error': np.std(l1_errors),
    'mean_sign_accuracy': mean_sign,
    'results': results,  # per-sample dicts, including NumPy arrays
}

# The cache directory may not exist yet; open() would then fail.
os.makedirs(CACHE_DIR, exist_ok=True)

with open(results_path, 'wb') as f:
    pickle.dump(save_data, f)

print(f"Results saved to: {results_path}")

---
## Summary

### Evaluation Complete

This notebook evaluated the fine-tuned OpenVLA model (or the base model, if no checkpoint was found) on frames sampled from LIBERO demonstrations.

### Key Metrics
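- **L1 Error**: Mean absolute difference between predicted and ground truth actions, averaged over the 7 action dimensions (ground truth is clipped to [-1, 1])
- **Sign Accuracy**: Percentage of correct direction predictions for significant actions (dimensions whose ground-truth magnitude exceeds 0.05)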

### Expected Results
| Stage | L1 Error | Sign Accuracy |
|-------|----------|---------------|
| Base model (no fine-tuning) | 0.3-0.5 | 40-60% |
| After fine-tuning | 0.1-0.2 | 70-85% |

### Next Steps
1. If results are poor, try more training epochs
2. For real evaluation, run in LIBERO simulation (see notebook 07)
3. Compare with paper-reported success rates