# Evaluation

In [None]:
import sys
sys.path.append('..')

import time
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from PIL import Image
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

import config
from models import load_pipeline
from data import RobotKeypointDataset
from utils import compute_add_error_ik, compute_auc
from utils.kinematics import get_joint_positions

# Run on the GPU whenever CUDA reports one as usable; otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Device: {device}")

In [None]:
# Output layout: tabular results go under data/, rendered plots under figures/.
RESULTS_DIR = Path('../results')
DATA_DIR = RESULTS_DIR / 'data'
FIGURES_DIR = RESULTS_DIR / 'figures'

# Create both folders (and any missing parents); re-running the cell is harmless.
for out_dir in (DATA_DIR, FIGURES_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

print(f"Output: {DATA_DIR}")

In [None]:
# Load the two-stage model from its best checkpoints onto the selected device.
pipeline_kwargs = dict(
    stage1_path='../checkpoints/stage1_best.pt',
    stage2_path='../checkpoints/stage2_best.pt',
    config=config,
    device=device,
)
pipeline = load_pipeline(**pipeline_kwargs)
print("pipeline loaded")

# Test split, with 3D positions and joint angles requested so the IK
# evaluation further down has ground truth to compare against.
dataset_kwargs = dict(
    data_dirs=[config.TEST_DIR],
    config=config,
    load_3d=True,
    load_angles=True,
)
test_dataset = RobotKeypointDataset(**dataset_kwargs)
print(f"Test set {len(test_dataset)} samples")

## Evaluation Loop

In [None]:
# Per-sample evaluation: run the full pipeline (timed) and the variant that is
# given the ground-truth bbox, then collect one flat record per test image.
results_rows = []
inference_times = []

for i in tqdm(range(len(test_dataset)), desc="Evaluating"):
    sample = test_dataset.samples[i]
    img_path = sample['img_path']
    gt_2d = sample['keypoints']  # shape 6, 2
    gt_3d = sample.get('positions_3d')  # shape 6, 3 (None when not loaded)
    gt_angles_deg = sample.get('joint_angles')  # in deg (None when not loaded)

    gt_bbox = test_dataset.compute_bbox(gt_2d)

    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can jump and is coarse on some platforms, which distorts
    # millisecond-scale latency measurements.
    # NOTE(review): the first iteration may include one-off warm-up cost.
    start_time = time.perf_counter()
    pred_2d, pred_bbox = pipeline.predict(img_path)
    inference_time = time.perf_counter() - start_time
    inference_times.append(inference_time)

    # Untimed second pass using the ground-truth bbox. The context manager
    # releases the file handle as soon as the RGB copy has been made.
    with Image.open(img_path) as img_file:
        img = img_file.convert('RGB')
    pred_2d_gt_bbox = pipeline.predict_with_gt_bbox(img, gt_bbox)

    # Per-joint Euclidean pixel distance between prediction and ground truth.
    errors_2d_full = np.linalg.norm(pred_2d - gt_2d, axis=1)
    errors_2d_gt_bbox = np.linalg.norm(pred_2d_gt_bbox - gt_2d, axis=1)

    row = {
        'sample_idx': i,
        'img_path': img_path,
        'inference_time_ms': inference_time * 1000,
        # Bboxes
        'gt_bbox_xmin': gt_bbox[0],
        'gt_bbox_ymin': gt_bbox[1],
        'gt_bbox_xmax': gt_bbox[2],
        'gt_bbox_ymax': gt_bbox[3],
        'pred_bbox_xmin': pred_bbox[0],
        'pred_bbox_ymin': pred_bbox[1],
        'pred_bbox_xmax': pred_bbox[2],
        'pred_bbox_ymax': pred_bbox[3],
        # Mean 2D errors
        'mean_2d_error_px': errors_2d_full.mean(),
        'mean_2d_error_gt_bbox_px': errors_2d_gt_bbox.mean(),
    }

    # per-joint 2D data (plus 3D positions / angles when the sample has them)
    for j, joint_name in enumerate(config.JOINT_NAMES):
        row[f'gt_{joint_name}_x'] = gt_2d[j, 0]
        row[f'gt_{joint_name}_y'] = gt_2d[j, 1]
        row[f'pred_{joint_name}_x'] = pred_2d[j, 0]
        row[f'pred_{joint_name}_y'] = pred_2d[j, 1]
        row[f'error_2d_{joint_name}_px'] = errors_2d_full[j]
        if gt_3d is not None:
            row[f'gt_3d_{joint_name}_x'] = gt_3d[j, 0]
            row[f'gt_3d_{joint_name}_y'] = gt_3d[j, 1]
            row[f'gt_3d_{joint_name}_z'] = gt_3d[j, 2]
        if gt_angles_deg is not None:
            row[f'gt_angle_J{j}_deg'] = gt_angles_deg[j]

    results_rows.append(row)

# Build the frame once from the list of records.
df = pd.DataFrame(results_rows)

inference_times = np.array(inference_times)
print(f"\ninference timing")
print(f"mean: {inference_times.mean()*1000:.1f} ms")
print(f"std: {inference_times.std()*1000:.1f} ms")
print(f"fps: {1/inference_times.mean():.1f}")

## IK Evaluation

In [None]:
# IK evaluation: from the predicted 2D keypoints, estimate joint angles and
# compare against ground truth (ADD error, per-joint angle error,
# reprojection error). Results are appended to `df` as new columns.

# Imported once, outside the try-block: previously this lived inside the loop's
# `try`, so an ImportError would have been swallowed and silently turned every
# sample into NaN.
from utils.kinematics import solve_ik_from_2d

n_joints = len(config.JOINT_NAMES)
has_angles = any(f'gt_angle_J{i}_deg' in df.columns for i in range(n_joints))
has_3d = 'gt_3d_Base_x' in df.columns

if has_angles and has_3d:
    ik_results = {'add_ik': [], 'angle_errors': [], 'reproj_errors': [], 'est_angles': []}
    ik_failures = []  # (sample index, exception) for every sample the IK step rejected

    # Pull all predicted keypoints out of the frame in one go instead of doing
    # 2 * n_joints scalar `.loc` lookups per sample. Columns are ordered
    # x, y per joint, so each row reshapes to (n_joints, 2).
    pred_cols = [f'pred_{jn}_{axis}' for jn in config.JOINT_NAMES for axis in ('x', 'y')]
    pred_2d_all = df[pred_cols].to_numpy().reshape(len(df), n_joints, 2)

    for i in tqdm(range(len(df)), desc="IK Evaluation"):
        sample = test_dataset.samples[i]
        pred_2d = pred_2d_all[i]
        gt_2d = sample['keypoints']
        gt_3d = sample['positions_3d']
        gt_angles_deg = sample['joint_angles']

        try:
            add_error, _per_joint_3d, angle_errors_deg, reproj_error = compute_add_error_ik(
                pred_2d, gt_2d, gt_3d, gt_angles_deg, config.CAMERA_MATRIX
            )
            # NOTE(review): called with the same arguments as compute_add_error_ik;
            # if that helper already solves IK internally this is a second solve
            # per sample -- confirm and expose the angles from one call.
            est_angles_deg, _est_3d, _ = solve_ik_from_2d(
                pred_2d, gt_2d, gt_3d, gt_angles_deg, config.CAMERA_MATRIX
            )
        except Exception as exc:
            # Best effort: a failing sample becomes NaN, but is recorded rather
            # than silently dropped. `Exception` (not a bare except) keeps
            # KeyboardInterrupt / SystemExit working.
            ik_failures.append((i, exc))
            add_error = np.nan
            angle_errors_deg = np.full(n_joints, np.nan)
            reproj_error = np.nan
            est_angles_deg = np.full(n_joints, np.nan)

        ik_results['add_ik'].append(add_error)
        ik_results['angle_errors'].append(angle_errors_deg)
        ik_results['reproj_errors'].append(reproj_error)
        ik_results['est_angles'].append(est_angles_deg)

    if ik_failures:
        first_idx, first_exc = ik_failures[0]
        print(f"IK failed on {len(ik_failures)} sample(s); "
              f"first failure at sample {first_idx}: {first_exc!r}")

    df['add_ik_m'] = ik_results['add_ik']
    df['add_ik_cm'] = np.array(ik_results['add_ik']) * 100
    df['ik_reproj_error_px'] = ik_results['reproj_errors']

    angle_errors = np.array(ik_results['angle_errors'])
    est_angles_all = np.array(ik_results['est_angles'])
    for j, jn in enumerate(config.JOINT_NAMES):
        df[f'angle_error_{jn}_deg'] = angle_errors[:, j]
        df[f'est_angle_J{j}_deg'] = est_angles_all[:, j]

    add_ik = np.array(ik_results['add_ik'])
    valid = ~np.isnan(add_ik)
    print(f"\nIK results ({valid.sum()}/{len(add_ik)} valid):")
    print(f"mean ADD Error: {np.nanmean(add_ik)*100:.2f} cm")
    print(f"median ADD Error: {np.nanmedian(add_ik)*100:.2f} cm")
    print(f"\nper joint angle errors (degreees):")
    for j, jn in enumerate(config.JOINT_NAMES):
        print(f"  {jn}: {np.nanmean(angle_errors[:, j]):.2f}°")
else:
    print("missing GT angle")

In [None]:
# Persist the per-sample table (floats written with six decimals).
predictions_csv = DATA_DIR / 'predictions.csv'
df.to_csv(predictions_csv, index=False, float_format='%.6f')
print(f"Saved: {predictions_csv}")


## Summary

In [None]:
# Headline metrics table: 2D pixel error for both pipeline variants, plus
# ADD / AUC / accuracy (IK-based) and timing for the full pipeline.
pixel_errors = df['mean_2d_error_px'].values
pixel_errors_gt = df['mean_2d_error_gt_bbox_px'].values

# The IK columns only exist when the IK cell found ground-truth angles and 3D
# positions. Guard on the column (as the per-joint and visualisation cells do)
# instead of raising KeyError here.
if 'add_ik_m' in df.columns:
    add_ik = df['add_ik_m'].dropna().values
else:
    add_ik = np.array([], dtype=float)
has_ik = add_ik.size > 0

if has_ik:
    auc_ik, thresholds, acc_ik = compute_auc(add_ik, max_threshold=0.30, num_steps=100)
    ik_metrics = [
        f"{add_ik.mean() * 100:.2f}",
        f"{np.median(add_ik) * 100:.2f}",
        f"{add_ik.std() * 100:.2f}",
        f"{auc_ik:.3f}",
        # ADD is in metres, so 0.05 / 0.10 / 0.20 are the 5 / 10 / 20 cm thresholds.
        f"{(add_ik <= 0.05).mean() * 100:.1f}",
        f"{(add_ik <= 0.10).mean() * 100:.1f}",
        f"{(add_ik <= 0.20).mean() * 100:.1f}",
    ]
else:
    # No usable IK result: report placeholders rather than NaN noise or a crash.
    print("No valid IK results: ADD/AUC metrics are reported as '-'")
    auc_ik = np.nan
    thresholds = np.array([], dtype=float)
    acc_ik = np.array([], dtype=float)
    ik_metrics = ['-'] * 7

summary = {
    'Metric': [
        'Mean 2D Error (px)',
        'Median 2D Error (px)',
        'Std 2D Error (px)',
        'Mean ADD Error (cm)',
        'Median ADD Error (cm)',
        'Std ADD Error (cm)',
        'AUC (0-30cm)',
        'Accuracy @5cm (%)',
        'Accuracy @10cm (%)',
        'Accuracy @20cm (%)',
        'Mean Inference Time (ms)',
        'FPS',
    ],
    'Full Pipeline': [
        f"{pixel_errors.mean():.2f}",
        f"{np.median(pixel_errors):.2f}",
        f"{pixel_errors.std():.2f}",
        *ik_metrics,
        f"{inference_times.mean()*1000:.1f}",
        f"{1/inference_times.mean():.1f}",
    ],
    # Only 2D errors are computed for the GT-bbox variant; the rest is '-'.
    'GT BBox (Stage 2 only)': [
        f"{pixel_errors_gt.mean():.2f}",
        f"{np.median(pixel_errors_gt):.2f}",
        f"{pixel_errors_gt.std():.2f}",
        '-', '-', '-', '-', '-', '-', '-', '-', '-'
    ],
}

df_summary = pd.DataFrame(summary)
print(df_summary.to_string(index=False))

df_summary.to_csv(DATA_DIR / 'summary_metrics.csv', index=False)

# Accuracy-vs-threshold curve, converted to cm and percent for plotting.
df_auc = pd.DataFrame({
    'threshold_cm': thresholds * 100,
    'accuracy_pct': acc_ik * 100,
})

# Do not overwrite an existing curve file with an empty one.
if has_ik:
    df_auc.to_csv(DATA_DIR / 'auc_curve_data.csv', index=False)

In [None]:
# Per-joint breakdown: mean / std of the 2D pixel error, plus the angle error
# for joints that have an IK angle-error column in `df`.
joint_summary = []
for jn in config.JOINT_NAMES:
    err_2d = df[f'error_2d_{jn}_px']
    stats = {
        'Joint': jn,
        'Mean 2D (px)': err_2d.mean(),
        'Std 2D (px)': err_2d.std(),
    }

    angle_col = f'angle_error_{jn}_deg'
    if angle_col in df.columns:
        angle_err = df[angle_col]
        stats['Mean Angle Error (°)'] = angle_err.mean()
        stats['Std Angle Error (°)'] = angle_err.std()

    joint_summary.append(stats)

df_joints = pd.DataFrame(joint_summary)
print("per joint errors")
print(df_joints.to_string(index=False))

df_joints.to_csv(DATA_DIR / 'per_joint_errors.csv', index=False)

## 3D Visualization

In [None]:
def visualize_prediction_3d(sample_idx, df, test_dataset, save_path=None):
    """Plot one test sample: 2D keypoints on the image next to the 3D skeletons.

    The left panel overlays ground-truth and predicted 2D keypoints on the
    image. The right panel shows the ground-truth 3D skeleton and, when IK
    angle estimates are available for this sample, the skeleton rebuilt from
    those angles via ``get_joint_positions``.

    Args:
        sample_idx: Index into ``test_dataset.samples``; also used as the row
            label when reading from ``df``.
        df: Results frame. Must contain ``pred_<joint>_x`` / ``pred_<joint>_y``
            for every name in ``config.JOINT_NAMES`` and ``mean_2d_error_px``.
            ``est_angle_J<j>_deg`` and ``add_ik_cm`` are used when present.
        test_dataset: Object whose ``samples[sample_idx]`` provides
            ``'img_path'``, ``'keypoints'`` and ``'positions_3d'``.
        save_path: Optional path; when given, the figure is also written there.

    Returns:
        The matplotlib figure. It is left open; the caller closes it.
    """
    sample = test_dataset.samples[sample_idx]
    # Image.open is lazy and keeps the file open; copy the pixels while the
    # handle is open so it can be released immediately.
    with Image.open(sample['img_path']) as img_file:
        img = img_file.copy()
    gt_2d = sample['keypoints']
    gt_3d = np.asarray(sample['positions_3d'])

    pred_2d = np.array([[df.loc[sample_idx, f'pred_{jn}_x'], df.loc[sample_idx, f'pred_{jn}_y']]
                        for jn in config.JOINT_NAMES])

    # Rebuild the estimated skeleton only when finite IK angles exist for this
    # sample. Without an estimate, nothing is drawn for it -- drawing the
    # ground truth under an "Estimated (IK)" label would be misleading.
    est_3d = None
    if 'est_angle_J0_deg' in df.columns:
        est_angles_deg = np.array(
            [df.loc[sample_idx, f'est_angle_J{j}_deg'] for j in range(6)], dtype=float
        )
        if np.all(np.isfinite(est_angles_deg)):
            est_3d = np.asarray(
                get_joint_positions(est_angles_deg, include_base=True, angles_in_radians=False)
            )

    # Consecutive joints are linked to draw the arm as a chain.
    connections = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]

    gt_color = '#2ecc71'
    pred_color = '#e74c3c'

    fig = plt.figure(figsize=(14, 6))

    # ---- Left panel: 2D keypoints over the image ----
    ax1 = fig.add_subplot(121)
    ax1.imshow(img)

    ax1.scatter(gt_2d[:, 0], gt_2d[:, 1], c=gt_color, s=100, marker='o',
                label='Ground Truth', edgecolors='white', linewidths=2, zorder=5)  # gt keypoints
    ax1.scatter(pred_2d[:, 0], pred_2d[:, 1], c=pred_color, s=100, marker='x',  # predicted keypoints
                label='Predicted', linewidths=3, zorder=5)

    for a, b in connections:
        ax1.plot([gt_2d[a, 0], gt_2d[b, 0]], [gt_2d[a, 1], gt_2d[b, 1]],
                 c=gt_color, linewidth=2, alpha=0.7)
        ax1.plot([pred_2d[a, 0], pred_2d[b, 0]], [pred_2d[a, 1], pred_2d[b, 1]],
                 c=pred_color, linewidth=2, alpha=0.7, linestyle='--')

    ax1.legend(loc='upper right')
    ax1.set_title(f'2D Keypoints (Error: {df.loc[sample_idx, "mean_2d_error_px"]:.1f} px)')
    ax1.axis('off')

    # ---- Right panel: 3D skeletons ----
    ax2 = fig.add_subplot(122, projection='3d')

    ax2.scatter(gt_3d[:, 0], gt_3d[:, 1], gt_3d[:, 2], c=gt_color, s=100,
                label='Ground Truth', depthshade=False)
    for a, b in connections:
        ax2.plot([gt_3d[a, 0], gt_3d[b, 0]],
                 [gt_3d[a, 1], gt_3d[b, 1]],
                 [gt_3d[a, 2], gt_3d[b, 2]], c=gt_color, linewidth=3)

    if est_3d is not None:
        ax2.scatter(est_3d[:, 0], est_3d[:, 1], est_3d[:, 2], c=pred_color, s=100,
                    marker='x', label='Estimated (IK)', depthshade=False)
        for a, b in connections:
            ax2.plot([est_3d[a, 0], est_3d[b, 0]],
                     [est_3d[a, 1], est_3d[b, 1]],
                     [est_3d[a, 2], est_3d[b, 2]], c=pred_color, linewidth=2, linestyle='--')

    ax2.set_xlabel('X (m)')
    ax2.set_ylabel('Y (m)')
    ax2.set_zlabel('Z (m)')

    # Show "n/a" rather than a fake 0.0 cm when no ADD value exists.
    add_err = df.loc[sample_idx, 'add_ik_cm'] if 'add_ik_cm' in df.columns else np.nan
    add_text = f'{add_err:.1f} cm' if np.isfinite(add_err) else 'n/a'
    ax2.set_title(f'3D Skeleton (ADD: {add_text})')
    ax2.legend(loc='upper right')

    # Equal-sized cube around every plotted point, so the axes are not
    # distorted and the estimated skeleton is inside the view as well.
    points_3d = gt_3d if est_3d is None else np.vstack([gt_3d, est_3d])
    lower = points_3d.min(axis=0)
    upper = points_3d.max(axis=0)
    max_range = (upper - lower).max() / 2.0
    mid_x, mid_y, mid_z = (upper + lower) * 0.5
    ax2.set_xlim(mid_x - max_range, mid_x + max_range)
    ax2.set_ylim(mid_y - max_range, mid_y + max_range)
    ax2.set_zlim(mid_z - max_range, mid_z + max_range)

    plt.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight', facecolor='white')
        print(f"Saved: {save_path}")

    return fig

In [None]:
# Render the best, median and worst sample by ADD error (NaN rows excluded).
if 'add_ik_cm' in df.columns:
    add_errors = df['add_ik_cm'].values
    valid_mask = ~np.isnan(add_errors)
    valid_indices = np.where(valid_mask)[0]
    # Valid sample indices, ordered from lowest to highest ADD error.
    sorted_indices = valid_indices[np.argsort(add_errors[valid_indices])]

    if sorted_indices.size == 0:
        # Every IK result was NaN: indexing [0] / [-1] below would raise IndexError.
        print("No valid IK results to visualize")
    else:
        samples_to_viz = {
            'best': sorted_indices[0],
            'median': sorted_indices[len(sorted_indices)//2],
            'worst': sorted_indices[-1],
        }

        for name, idx in samples_to_viz.items():
            fig = visualize_prediction_3d(
                idx, df, test_dataset,
                save_path=FIGURES_DIR / f'sample_{name}.png'
            )
            plt.show()
            # Close explicitly so figures do not accumulate in memory.
            plt.close(fig)