# Visualize RoboCasa Runs

This notebook aggregates and visualizes evaluation results under `robocasa_runs/`.

- Top-level directories correspond to model/run names
- Subdirectories correspond to `env_name`
- Each leaf contains `metrics.json` (and `config.json`).

In [7]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Tuple

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Define the robocasa_runs directory
ROBOCASA_RUNS_DIR = Path("robocasa_runs")

print(f"Looking for results in: {ROBOCASA_RUNS_DIR.absolute()}")
if not ROBOCASA_RUNS_DIR.exists():
    raise FileNotFoundError(f"Directory not found: {ROBOCASA_RUNS_DIR}")

Looking for results in: /fsx/kimin/Issac-GR00T-robocasa/robocasa_runs


In [8]:
def load_robocasa_results(runs_dir: Path) -> Dict[str, Dict[str, Dict]]:
    """
    Load all robocasa results from the runs directory.
    
    Returns:
        Dict[model_name, Dict[env_name, metrics_dict]]
    """
    results = {}
    
    # Iterate through model directories
    for model_dir in runs_dir.iterdir():
        if not model_dir.is_dir():
            continue
            
        model_name = model_dir.name
        results[model_name] = {}
        
        # Iterate through environment directories
        for env_dir in model_dir.iterdir():
            if not env_dir.is_dir():
                continue
                
            env_name = env_dir.name
            metrics_file = env_dir / "metrics.json"
            
            if metrics_file.exists():
                try:
                    with open(metrics_file, 'r') as f:
                        metrics = json.load(f)
                    results[model_name][env_name] = metrics
                    print(f"Loaded: {model_name}/{env_name} - Success Rate: {metrics.get('success_rate', 'N/A')}")
                except Exception as e:
                    print(f"Error loading {metrics_file}: {e}")
            else:
                print(f"No metrics.json found in {env_dir}")
    
    return results

# Load all results
results = load_robocasa_results(ROBOCASA_RUNS_DIR)
print(f"\nLoaded results for {len(results)} models")

Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/PnPCounterToCab - Success Rate: 0.48
Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/PnPCabToCounter - Success Rate: 0.05
Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/TurnOnSinkFaucet - Success Rate: 0.84
Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/CoffeeServeMug - Success Rate: 0.58
Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/OpenDoubleDoor - Success Rate: 0.62
Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/CoffeeSetupMug - Success Rate: 0.3
Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/TurnOffSinkFaucet - Success Rate: 0.85
Loaded: gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/CloseDoubleDoor - Success Rate: 0.84
Loaded: gr00t_15_150h_30K_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx/TurnOnSinkFaucet - Success Rate: 0.92
Loaded: gr

In [9]:
def create_results_dataframe(results: Dict) -> pd.DataFrame:
    """
    Convert results dictionary to a pandas DataFrame for easier analysis.
    """
    rows = []
    
    for model_name, envs in results.items():
        for env_name, metrics in envs.items():
            row = {
                'model': model_name,
                'environment': env_name,
                'success_rate': metrics.get('success_rate', 0.0),
                'num_episodes': metrics.get('num_episodes', 0),
                'num_successes': len([x for x in metrics.get('episode_success', []) if x]),
                'timestamp': metrics.get('timestamp', '')
            }
            rows.append(row)
    
    return pd.DataFrame(rows)

# Create DataFrame
df = create_results_dataframe(results)
print(f"Created DataFrame with {len(df)} rows")
print(f"Models: {df['model'].unique()}")
print(f"Environments: {df['environment'].unique()}")

# Display first few rows
df.head()

Created DataFrame with 16 rows
Models: ['gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx'
 'gr00t_15_150h_30K_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx']
Environments: ['PnPCounterToCab' 'PnPCabToCounter' 'TurnOnSinkFaucet' 'CoffeeServeMug'
 'OpenDoubleDoor' 'CoffeeSetupMug' 'TurnOffSinkFaucet' 'CloseDoubleDoor']


Unnamed: 0,model,environment,success_rate,num_episodes,num_successes,timestamp
0,gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0...,PnPCounterToCab,0.48,100,48,2025-08-12T10:09:34.870044
1,gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0...,PnPCabToCounter,0.05,100,5,2025-08-12T08:40:00.074600
2,gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0...,TurnOnSinkFaucet,0.84,100,84,2025-08-12T11:43:27.795709
3,gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0...,CoffeeServeMug,0.58,100,58,2025-08-12T10:22:25.811742
4,gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0...,OpenDoubleDoor,0.62,100,62,2025-08-12T09:20:24.682641


In [10]:
# Calculate average performance across all environments for each model
model_avg_performance = df.groupby('model')['success_rate'].mean().sort_values(ascending=False)

print("Average Performance Across All Environments:")
for model, avg_perf in model_avg_performance.items():
    print(f"{model}: {avg_perf:.3f}")

model_avg_performance

Average Performance Across All Environments:
gr00t_15_150h_30K_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx: 0.590
gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx: 0.570


model
gr00t_15_150h_30K_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx    0.59
gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx             0.57
Name: success_rate, dtype: float64

In [16]:
# 2. Detailed Table: Performance Per Environment for Each Model

# Create pivot tables for success rate and standard deviation
pivot_table = df.pivot(index='environment', columns='model', values='success_rate')

# Calculate standard deviation per environment across models (if multiple models exist)
if len(df['model'].unique()) > 1:
    env_std = df.groupby('environment')['success_rate'].std()
    
    print("Performance Per Environment (Success Rate ± Std Dev):")
    print("=" * 70)
    
    # Create a formatted table with mean ± std
    display_table = pivot_table.copy()
    
    # Format the values to show mean ± std for each environment
    for env in display_table.index:
        std_val = env_std[env] if pd.notna(env_std[env]) else 0
        for col in display_table.columns:
            val = display_table.loc[env, col]
            if pd.notna(val):
                display_table.loc[env, col] = f"{val:.3f} ± {std_val:.3f}"
            else:
                display_table.loc[env, col] = "N/A"
    
    print(display_table.to_string())
    
    # Also show just the standard deviation table
    print(f"\n\nStandard Deviation Across Models Per Environment:")
    print("=" * 60)
    std_df = pd.DataFrame({'Environment': env_std.index, 'Std_Dev': env_std.values})
    std_df['Std_Dev'] = std_df['Std_Dev'].round(3)
    print(std_df.to_string(index=False))

else:
    print("Performance Per Environment (Success Rate):")
    print("=" * 60)
    
    # Display as a formatted table
    display_table = pivot_table.copy()
    
    # Format the values to show 3 decimal places
    for col in display_table.columns:
        display_table[col] = display_table[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A")
    
    print(display_table.to_string())
    print("\nNote: Standard deviation not calculated (only one model)")

# Also display the raw DataFrame for reference
print("\n\nDetailed Results:")
print("=" * 60)
detailed_df = df[['model', 'environment', 'success_rate', 'num_episodes', 'num_successes']].copy()
detailed_df['success_rate'] = detailed_df['success_rate'].round(3)
detailed_df = detailed_df.sort_values(['model', 'environment'])

print(detailed_df.to_string(index=False))

Performance Per Environment (Success Rate ± Std Dev):
model             gr00t_15_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx gr00t_15_150h_30K_0801_robocasa_multitask_n1_g8_b32_lr0.0001_si-pg-noctx
environment                                                                                                                                               
CloseDoubleDoor                                                     0.840 ± 0.000                                                            0.840 ± 0.000
CoffeeServeMug                                                      0.580 ± 0.064                                                            0.670 ± 0.064
CoffeeSetupMug                                                      0.300 ± 0.049                                                            0.370 ± 0.049
OpenDoubleDoor                                                      0.620 ± 0.057                                                            0.700 ± 0.057
PnPCabToCounter 


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '0.840 ± 0.000' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '0.840 ± 0.000' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.

