In [3]:
! pip install -e ../


Obtaining file:///app/DetAny3D
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: detect_anything
  Building editable for detect_anything (pyproject.toml) ... [?25ldone
[?25h  Created wheel for detect_anything: filename=detect_anything-1.0-0.editable-py3-none-any.whl size=7037 sha256=6278da300aa9d6accdb92c3b4418cfeae33425d4897e4ce586fe4dcab6634e28
  Stored in directory: /tmp/pip-ephem-wheel-cache-jqzrx3ul/wheels/42/47/cb/8136801cab19776f09472649003584ab5b5c5605be54d81d2c
Successfully built detect_anything
Installing collected packages: detect_anything
Successfully installed detect_anything-1.0
[0m

In [None]:
import sys
import os

# Set this to the directory containing the 'detany3d' source folder
# For example, if you cloned it to '/content/detany3d'
repo_path = '/app/DetAny3D' 
if repo_path not in sys.path:
    sys.path.append(repo_path)
    print(f"Added {repo_path} to system path.")

from detect_anything

Added /app/DetAny3D to system path.


In [33]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.data_classes import Box
from pyquaternion import Quaternion
from detect_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, TwoWayTransformer

# Ensure Matplotlib plots are displayed inline
%matplotlib inline

In [80]:
# --- USER INPUT REQUIRED ---
# ⚠️ Update these paths to match your local setup ⚠️
NUSCENES_ROOT = "/data/nuscenes" 
VERSION = "v1.0-trainval" 
CONFIG_PATH = "/app/DetAny3D/detect_anything/configs/loss_analysis.yaml"
CHECKPOINT_PATH = "/app/DetAny3D/checkpoints/detany3d_ckpts/zero_shot_category_ckpt-004.pth"
CAM_CHANNEL = "CAM_FRONT"
# -----------------------------

# Initialize the nuScenes object
print(f"Initializing NuScenes dataset (version: {VERSION})...")
try:
    nusc = NuScenes(version=VERSION, dataroot=NUSCENES_ROOT, verbose=False)
    print("Initialization complete.")
except Exception as e:
    print(f"Error initializing nuScenes: {e}. Please check NUSCENES_ROOT.")
    nusc = None

Initializing NuScenes dataset (version: v1.0-trainval)...
Initialization complete.


In [30]:
def limit_period(val, offset=0.5, period=2 * np.pi):
    """Limit the angle value into [offset, offset + period]."""
    return val - np.floor(val / period + offset) * period

def get_box_corners(box_7dim):
    """Converts a 7-DoF box [x, y, z, l, w, h, yaw] into 8x3 corner points (camera frame)."""
    center = box_7dim[0:3]
    dims = box_7dim[3:6]
    yaw = box_7dim[6]
    
    l, w, h = dims
    unrotated_corners = np.array([
        [-l/2, -w/2, -h/2], [ l/2, -w/2, -h/2], [ l/2,  w/2, -h/2], [-l/2,  w/2, -h/2],
        [-l/2, -w/2,  h/2], [ l/2, -w/2,  h/2], [ l/2,  w/2,  h/2], [-l/2,  w/2,  h/2]
    ]).T

    # Rotation Matrix (Yaw around Y-axis, simplified camera frame)
    cos_t, sin_t = np.cos(yaw), np.sin(yaw)
    R = np.array([
        [cos_t, 0, sin_t],
        [0, 1, 0],
        [-sin_t, 0, cos_t]
    ])
    
    corners_3d = (R @ unrotated_corners) + center[:, np.newaxis]
    return corners_3d.T # Returns 8x3 array

def project_3d_to_2d(corners_3d, calib_matrix, color, linestyle, label=None, return_coords=False):
    """Projects 3D points to 2D image coordinates using the 3x4 P-matrix."""
    P = calib_matrix
    
    # 1. Project points (converting to homogeneous coordinates)
    points_h = np.hstack((corners_3d, np.ones((corners_3d.shape[0], 1))))
    points_2d_h = P @ points_h.T 
    
    # 2. Normalize and convert to image coordinates
    points_2d = (points_2d_h[:2] / points_2d_h[2]).T
    
    if return_coords:
        # Return coordinates for scatter plots
        return points_2d[:, 0], points_2d[:, 1]
    
    # 3. Drawing logic for bounding box edges
    edges = [
        (0, 1), (1, 2), (2, 3), (3, 0),  
        (4, 5), (5, 6), (6, 7), (7, 4),  
        (0, 4), (1, 5), (2, 6), (3, 7)   
    ]

    ax = plt.gca()
    for i, j in edges:
        x_coords = [points_2d[i, 0], points_2d[j, 0]]
        y_coords = [points_2d[i, 1], points_2d[j, 1]]
        ax.plot(x_coords, y_coords, color=color, linestyle=linestyle, linewidth=2, zorder=5)

    if label:
        ax.plot([], [], color=color, linestyle=linestyle, linewidth=2, label=label)

In [81]:
import yaml
from box import Box

try:
    # 1a. Load the configuration file content (Assumes it's a YAML or config-like structure)
    # If the config is a .py file, you'd need mmcv.Config.fromfile(CONFIG_PATH) here
    # Since previous attempts failed, we stick to the dictionary/box method:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        cfg_dict = yaml.load(f.read(), Loader=yaml.FullLoader)
    
    # 1b. CRITICAL FIX: Convert the dictionary to a Box object for dot notation access
    cfg = Box(cfg_dict)
    print("✅ Configuration loaded successfully and converted to dot-accessible object.")

except Exception as e:
    print(f"❌ ERROR in Config Loading: {e}. Using NoneType placeholder.")
    cfg = None 

# --- 2. MODEL ARCHITECTURE BUILDER (FIXED) ---

def build_detany3d_model(config_obj, checkpoint_path, device):
    """ Instantiates the DetAny3D model architecture and loads weights. """
    
    if config_obj is None:
        return None

    # CRITICAL: Use the component imports that work in your environment
    try:
        from detect_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, DetAny3D 
    except ImportError as e:
        print(f"❌ Error importing model components: {e}")
        return None

    try:
        # 1. Instantiate the core components (Using hardcoded or default params)
        cfg_model = config_obj.model # Assume model parameters are under a 'model' key

        image_encoder = ImageEncoderViT(
            depth=12, embed_dim=768, img_size=1024, mlp_ratio=4,
            norm_layer=torch.nn.LayerNorm, num_heads=12, qkv_bias=True, use_rel_pos=True,
            window_size=14, global_attn_indexes=[2, 5, 8, 11],
            cfg=config_obj # Pass the entire config object
        )
        prompt_encoder = PromptEncoder(
            embed_dim=256, image_embedding_size=(64, 64), input_image_size=(1024, 1024)
        )
        mask_decoder = MaskDecoder(
            num_multimask_outputs=3, transformer_dim=256
        )
        
        # 2. Combine components into the main DetAny3D structure
        model = DetAny3D(
            image_encoder=image_encoder,
            prompt_encoder=prompt_encoder,
            mask_decoder=mask_decoder,
            # Placeholder for other required parameters
        )
        print("✅ Model architecture instantiated.")

        # 3. Load checkpoint weights
        checkpoint = torch.load(checkpoint_path, map_location=device)
        state_dict = checkpoint.get('state_dict', checkpoint)
            
        model.load_state_dict(state_dict, strict=False)
        model.to(device)
        model.eval()
        print("✅ Model weights loaded and set to evaluation mode.")
        return model
        
    except Exception as e:
        print(f"❌ FATAL MODEL LOAD ERROR: {e}")
        return None

✅ Configuration loaded successfully and converted to dot-accessible object.


In [85]:
# --- 2. MODEL ARCHITECTURE BUILDER (FIXED) ---

def build_detany3d_model(config_obj, checkpoint_path, device):
    """ Instantiates the DetAny3D model architecture and loads weights. """
    
    if config_obj is None:
        return None

    # CRITICAL: Use the component imports that work in your environment
    try:
        from detect_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, TwoWayTransformer
    except ImportError as e:
        print(f"❌ Error importing model components: {e}")
        return None

    try:
        # 1. Instantiate the core components (Using hardcoded or default params)
        cfg_model = config_obj.model # Assume model parameters are under a 'model' key

        image_encoder = ImageEncoderViT(
            depth=12, embed_dim=768, img_size=1024, mlp_ratio=4,
            norm_layer=torch.nn.LayerNorm, num_heads=12, qkv_bias=True, use_rel_pos=True,
            window_size=14, global_attn_indexes=[2, 5, 8, 11],
            cfg=config_obj # Pass the entire config object
        )
        prompt_encoder = PromptEncoder(
            embed_dim=256, image_embedding_size=(64, 64), input_image_size=(1024, 1024)
        )
        mask_decoder = MaskDecoder(
            num_multimask_outputs=3, transformer_dim=256
        )
        
        # 2. Combine components into the main DetAny3D structure
        model = TwoWayTransformer(
            image_encoder=image_encoder,
            prompt_encoder=prompt_encoder,
            mask_decoder=mask_decoder,
            # Placeholder for other required parameters
        )
        print("✅ Model architecture instantiated.")

        # 3. Load checkpoint weights
        checkpoint = torch.load(checkpoint_path, map_location=device)
        state_dict = checkpoint.get('state_dict', checkpoint)
            
        model.load_state_dict(state_dict, strict=False)
        model.to(device)
        model.eval()
        print("✅ Model weights loaded and set to evaluation mode.")
        return model
        
    except Exception as e:
        print(f"❌ FATAL MODEL LOAD ERROR: {e}")
        return None

In [87]:
CONFIG_PATH = "/app/DetAny3D/detect_anything/configs/loss_analysis.yaml"

In [89]:
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Build the model using the function
detany3d_model = build_detany3d_model(cfg, CHECKPOINT_PATH, DEVICE)

# Define the model to be used for inference
if detany3d_model:
    model = detany3d_model
    print("--- Using REAL DetAny3D Model for Inference ---")
else:
    # Fallback to MockModel if real model failed to load
    print("--- Using MOCKMODEL for Inference ---")
    class MockModel:
        def eval(self): pass
        def __call__(self, data):
            gt_box_tensor = data['gt_box'][0]
            gt_box_np = gt_box_tensor.cpu().numpy()
            pred_box_np = gt_box_np.copy()
            pred_box_np[6] = limit_period(gt_box_np[6] + np.random.uniform(-0.5, 0.5), period=2 * np.pi)
            pred_box_np[0:6] += np.random.randn(6) * 0.1 
            pred_box_tensor = torch.from_numpy(pred_box_np).float()
            return [[pred_box_tensor]] 
    model = MockModel()

loading from /app/DetAny3D/checkpoints/dino_ckpts/dinov2_vitl14_pretrain.pth with: _IncompatibleKeys(missing_keys=['register_tokens'], unexpected_keys=[])
❌ FATAL MODEL LOAD ERROR: only support depth 32
--- Using MOCKMODEL for Inference ---


In [90]:
# --- DATA SELECTION AND INFERENCE SETUP ---
if nusc is None:
    raise RuntimeError("NuScenes object not initialized. Cannot proceed.")

cam_data_tokens = [sd['token'] for sd in nusc.sample_data if sd['channel'] == CAM_CHANNEL]
random_sd_token = random.choice(cam_data_tokens)
sd_record = nusc.get('sample_data', random_sd_token)
sample_record = nusc.get('sample', sd_record['sample_token'])
IMAGE_PATH_NUSCENES = os.path.join(nusc.dataroot, sd_record['filename'])

# Get Calibration
cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
P_intrinsic = np.array(cs_record['camera_intrinsic'])
CALIB_MATRIX_NUSCENES = np.hstack((P_intrinsic, np.zeros((3, 1))))

# Extract GT and Transform (CONCEPTUALLY COMPLETE STEP)
ann_tokens = sample_record['anns']
if not ann_tokens:
    raise ValueError("Selected sample has no annotations. Please re-run to select a different sample.")

ann_record = nusc.get('sample_annotation', ann_tokens[0])

# 1. Create the nuScenes Box object (FIXED TYPE ERROR)
box_center = np.array(ann_record['translation'], dtype=np.float32)
box_size = np.array(ann_record['size'], dtype=np.float32)
box_rotation = Quaternion(ann_record['rotation'])

gt_nusc_box = Box(box_center, box_size, box_rotation, name=ann_record['category_name'])

# 2. Get the transformations (poses)
ego_pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
ego_pose_rotation = Quaternion(ego_pose_record['rotation'])
cs_translation = cs_record['translation']
cs_rotation = Quaternion(cs_record['rotation'])

# 3. Transform the GT box: World -> Ego -> Sensor
gt_nusc_box.translate(-np.array(ego_pose_record['translation']))
gt_nusc_box.rotate(ego_pose_rotation.inverse)
gt_nusc_box.translate(-np.array(cs_translation))
gt_nusc_box.rotate(cs_rotation.inverse)

# 4. Convert to the 7-DoF array [x, y, z, l, w, h, yaw]
center = np.array(gt_nusc_box.center)
# nuScenes size is [w, l, h]. Standard 3D is often [l, w, h]. Adjusting order.
dims = np.array([gt_nusc_box.l, gt_nusc_box.w, gt_nusc_box.h], dtype=np.float32) 
# Yaw angle from the quaternion (around Y-axis in the camera sensor frame)
yaw = gt_nusc_box.orientation.yaw_pitch_roll[0]

GT_BOX_7DIM = np.array([center[0], center[1], center[2], dims[0], dims[1], dims[2], yaw], dtype=np.float32)

# Create input dictionary
input_data = {
    'img': torch.randn(1, 3, 900, 1600), 
    'img_metas': [{'cam_intrinsic': P_intrinsic, 'cam_extrinsic': np.array(cs_record['rotation'])}], 
    'gt_box': [torch.from_numpy(GT_BOX_7DIM).float()]
}

# RUN INFERENCE
with torch.no_grad():
    model_output = model(input_data)

# Extract predicted 7-DoF box
PRED_BOX_7DIM = model_output[0][0].detach().cpu().numpy()

# --- Loss Calculation ---
gt_yaw = GT_BOX_7DIM[6]
pred_yaw = PRED_BOX_7DIM[6]
angle_diff_yaw = limit_period(pred_yaw - gt_yaw, period=2 * np.pi)
loss_theta = np.abs(angle_diff_yaw)
loss_phi = 0.0 # Placeholder

# Assemble data structure for visualization
data_point_nusc = {
    'index': random_sd_token,
    'image_path': IMAGE_PATH_NUSCENES,
    'calib': CALIB_MATRIX_NUSCENES,
    'gt_box': GT_BOX_7DIM,  
    'pred_box': PRED_BOX_7DIM,
    'loss_phi': loss_phi,
    'loss_theta': loss_theta,
}

print(f"\n✅ Data and Inference Setup Complete for image: {os.path.basename(IMAGE_PATH_NUSCENES)}")
print(f"  GT Yaw: {np.degrees(gt_yaw):.1f}° | Pred Yaw: {np.degrees(pred_yaw):.1f}°")
print(f"  Calculated Loss_Theta (Yaw): {loss_theta:.3f} rad ({np.degrees(loss_theta):.1f}°)")

BoxTypeError: Box expected at most 1 argument, got 3

In [None]:
# --- VISUALIZATION FUNCTION ---

def visualize_nuscenes_loss(data_point):
    
    try:
        # Load the actual image 
        img = plt.imread(data_point['image_path']) 
    except FileNotFoundError:
        print(f"\n❌ ERROR: Image not found at {data_point['image_path']}. Using placeholder.")
        img = np.ones((900, 1600, 3), dtype=np.uint8) * 255
        
    calib = data_point['calib']
    gt_box = data_point['gt_box']
    pred_box = data_point['pred_box']
    loss_theta = data_point['loss_theta']
    
    # 1. Hybrid Box (GT Yaw, Pred Center/Dim)
    hybrid_box = pred_box.copy()
    hybrid_box[6] = gt_box[6] 
    
    # 2. Get 3D Corner Points
    gt_corners = get_box_corners(gt_box)
    pred_corners = get_box_corners(pred_box)
    hybrid_corners = get_box_corners(hybrid_box)
    
    # 3. Calculate Corner Distortion (L2 distance)
    distortion = np.linalg.norm(pred_corners - gt_corners, axis=1)
    max_distortion_idx = np.argmax(distortion)
    
    # --- Plotting ---
    plt.figure(figsize=(12, 9))
    plt.imshow(img)
    plt.title(f"nuScenes Loss Analysis | Yaw Loss: {loss_theta:.3f} rad ({np.degrees(loss_theta):.1f}°)")
    
    # Draw Boxes
    project_3d_to_2d(gt_corners, calib, color='g', linestyle='-', label='Ground Truth (GT)')
    project_3d_to_2d(pred_corners, calib, color='r', linestyle='-', label='Prediction (Pred)')
    project_3d_to_2d(hybrid_corners, calib, color='y', linestyle='--', label='Hybrid (GT Yaw, Pred Center/Dim)')
    
    # Highlight the most distorted corner
    distorted_corner_2d = project_3d_to_2d(pred_corners[max_distortion_idx:max_distortion_idx+1], 
                                           calib, color=None, linestyle=None, return_coords=True)
    
    plt.scatter(distorted_corner_2d[0], distorted_corner_2d[1], 
                s=200, color='magenta', marker='*', label=f'Max Distortion Corner ({distortion[max_distortion_idx]:.2f}m)', zorder=10) 
    
    plt.legend()
    plt.axis('off')
    plt.show()

# --- EXECUTION ---
# Run the visualization
visualize_nuscenes_loss(data_point_nusc)