# Depth-Aware Isaac Model Test

Minimal smoke test: loads the depth-aware Isaac model and processor, then runs one text-only and one vision + text forward pass.

## Setup

In [None]:
import sys
from pathlib import Path
import torch
import numpy as np
from PIL import Image

# Make the project root and its two vendored source trees importable.
# The notebook is expected to run from a directory directly under the root.
project_root = Path.cwd().parent

# Prepend in a single step; the resulting order (last-listed tree first) is the
# same as three successive insert(0, ...) calls would produce.
sys.path[:0] = [
    str(project_root / "Depth-Anything-V2"),
    str(project_root / "perceptron" / "huggingface"),
    str(project_root),
]

# Prefer the GPU when one is visible to torch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

In [None]:
# Import depth-aware Isaac
# NOTE(review): `src` is a project-local package; this import presumably
# resolves through the project root that the setup cell prepends to sys.path,
# so run that cell first — confirm the package layout.
from src.depth_isaac import (
    IsaacConfig,
    IsaacForConditionalGeneration,
    IsaacProcessor,
)

# Reaching this line means all three names were importable.
print("✓ Imports successful")

## Load Model

In [None]:
# Checkpoint directory, resolved relative to the project root.
model_path = project_root / "isaac_model"
print(f"Loading model from: {model_path}")

# Load config
config = IsaacConfig.from_pretrained(str(model_path))

# Optional: enable depth only when the checkpoint file is actually present,
# so the notebook still runs on machines that do not have it.
depth_checkpoint = project_root / "depth_anything_v2_vitl.pth"
if depth_checkpoint.exists():
    config.depth_checkpoint_path = str(depth_checkpoint)
    config.use_depth = True
    print("✓ Depth checkpoint found, enabling depth")
else:
    config.use_depth = False
    print("⚠ Depth checkpoint not found, depth disabled")

# Load processor
processor = IsaacProcessor.from_pretrained(str(model_path))

# Load model: half precision on GPU, full precision on CPU.
dtype = torch.float16 if device == "cuda" else torch.float32
model = IsaacForConditionalGeneration.from_pretrained(
    str(model_path),
    # Hand over the config edited above. Without this argument the depth
    # settings were never given to the model, so the toggle had no effect.
    # NOTE(review): assumes the Hugging Face-style `config=` keyword is
    # honoured by this class — confirm against src.depth_isaac.
    config=config,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

print("✓ Model loaded")

## Test Text Inference

In [None]:
# Smoke-test the text-only path: a prompt with no image attached.
text = "Hello, how are you?"
inputs = processor(text, return_tensors="pt")

# On GPU, rebuild the batch as a plain dict with every tensor moved to the
# device; non-tensor entries are carried over unchanged.
if device == "cuda":
    inputs = dict(
        (name, value.to(device) if isinstance(value, torch.Tensor) else value)
        for name, value in inputs.items()
    )

print(f"Input: {text}")
print(f"Input IDs shape: {inputs['input_ids'].shape}")

# Forward pass only; gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

print("✓ Text inference successful")
print(f"  Logits shape: {outputs.logits.shape}")

In [None]:
## Test Vision + Text Inference

In [None]:
# Test vision + text inference on a synthetic 256x256 RGB noise image.
# randint's upper bound is exclusive, so 256 is needed to cover the full
# uint8 range (the previous bound of 255 could never produce the value 255).
dummy_image = Image.fromarray(
    np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)
)

# The prompt embeds the processor's vision token next to the text.
text_with_image = f"Describe this image: {processor.vision_token}"
inputs = processor(text_with_image, images=dummy_image, return_tensors="pt")

# On GPU, move every tensor in the batch to the device; non-tensor entries
# are carried over unchanged.
if device == "cuda":
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
              for k, v in inputs.items()}

print(f"Input: {text_with_image}")
print(f"Image size: {dummy_image.size}")
print(f"Input IDs shape: {inputs['input_ids'].shape}")

# Forward pass only; gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

print("✓ Vision + text inference successful")
print(f"  Logits shape: {outputs.logits.shape}")