In [None]:
from pathlib import Path
import numpy as np
import cv2
import vpi

# Paths to the stereo pair (left/right), relative to the current working directory
left_path = Path("000042_10_L.png")
right_path = Path("000042_10_R.png")

# Fail early with a clear message if either image of the pair is missing
if not left_path.exists() or not right_path.exists():
    raise FileNotFoundError("Expected stereo pair 000042_10_[L|R].png in the current directory.")

# Load images as grayscale for stereo matching
left_img = cv2.imread(str(left_path), cv2.IMREAD_GRAYSCALE)
right_img = cv2.imread(str(right_path), cv2.IMREAD_GRAYSCALE)

# Both files exist at this point, so a None result means OpenCV could not decode them
if left_img is None or right_img is None:
    raise RuntimeError("Failed to read stereo images with OpenCV.")

# Run stereo matching on the CUDA backend.
# NOTE(review): the CUDA backend is selected unconditionally; no CPU fallback is implemented here.

with vpi.Backend.CUDA:
    # Wrap the NumPy arrays as VPI images and estimate disparity (5-pixel window, up to 128 disparities)
    left_vpi = vpi.asimage(left_img)
    right_vpi = vpi.asimage(right_img)
    disparity_vpi = vpi.stereodisp(left_vpi, right_vpi, window=5, maxdisp=128)
    # Bring the result back as a float32 array.
    # NOTE(review): VPI presumably returns disparity as fixed-point (Q10.5); if so, divide by 32
    # to get pixel units. The min-max normalization below is unaffected. TODO confirm.
    disparity = disparity_vpi.cpu().astype(np.float32)

# Normalize disparity to [0, 255] and color-map it for visualization
disp_norm = cv2.normalize(disparity, None, 0, 255, cv2.NORM_MINMAX)
disp_color = cv2.applyColorMap(disp_norm.astype(np.uint8), cv2.COLORMAP_JET)

In [None]:

from typing import Any, Optional

import matplotlib.pyplot as plt
import torch
from PIL import Image
from torchvision import transforms
from ultralytics import YOLO

from efficientvit.seg_model_zoo import create_efficientvit_seg_model
from eval_efficientvit_seg_model_copy import CityscapesDataset, Resize, ToTensor, get_canvas
# ---------------------------------------------------------
# 1. SETUP & CLASS DEFINITIONS
# ---------------------------------------------------------

# Cityscapes 19-class mapping: class index -> class name
CITYSCAPES_CLASSES = {
    0: 'road', 1: 'sidewalk', 2: 'building', 3: 'wall', 4: 'fence',
    5: 'pole', 6: 'traffic light', 7: 'traffic sign', 8: 'vegetation',
    9: 'terrain', 10: 'sky', 11: 'person', 12: 'rider', 13: 'car',
    14: 'truck', 15: 'bus', 16: 'train', 17: 'motorcycle', 18: 'bicycle'
}

# Colors for visualization (randomized for distinction; seeded so they are reproducible)
np.random.seed(42)
COLORS = np.random.randint(0, 255, size=(20, 3), dtype=np.uint8)

# ---------------------------------------------------------
# 2. MODEL LOADING
# ---------------------------------------------------------

# A. Load YOLO11-seg (Foreground Specialist); it is kept on the CPU
print("Loading YOLO11-seg...")
yolo_model = YOLO("yolo11n-seg.pt")  # using 'nano' for speed
yolo_model.to("cpu")

# B. Load EfficientViT-Seg (Background Specialist)
print("Loading EfficientViT-Seg...")

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# We use 'b0' (fastest) trained on 'cityscapes'
eff_model = create_efficientvit_seg_model(
    name="efficientvit-seg-b0-cityscapes", pretrained=True
).to(DEVICE)
eff_model.eval()

In [None]:
# ---------------------------------------------------------
# 3. HELPER FUNCTIONS
# ---------------------------------------------------------
def resize(
    x: torch.Tensor,
    size: Optional[Any] = None,
    scale_factor: Optional[list[float]] = None,
    mode: str = "bicubic",
    align_corners: Optional[bool] = False,
) -> torch.Tensor:
    """Resample ``x`` with ``torch.nn.functional.interpolate``.

    Args:
        x: Tensor to resample; forwarded unchanged to ``interpolate``.
        size: Target output size, forwarded to ``interpolate``.
        scale_factor: Scale multiplier, forwarded to ``interpolate``.
        mode: One of "bilinear", "bicubic", "nearest" or "area".
        align_corners: Forwarded only for "bilinear" and "bicubic"; the
            other two modes are called without it.

    Returns:
        The tensor returned by ``interpolate``.

    Raises:
        NotImplementedError: If ``mode`` is not one of the four listed modes.
    """
    corner_aware_modes = {"bilinear", "bicubic"}
    plain_modes = {"nearest", "area"}

    if mode not in corner_aware_modes and mode not in plain_modes:
        raise NotImplementedError(f"resize(mode={mode}) not implemented.")

    interp_kwargs = {"size": size, "scale_factor": scale_factor, "mode": mode}
    if mode in corner_aware_modes:
        # align_corners is only passed along for the interpolating modes
        interp_kwargs["align_corners"] = align_corners
    return torch.nn.functional.interpolate(x, **interp_kwargs)

def preprocess_for_efficientvit(image_path):
    """Load an image and build the EfficientViT input from it.

    Args:
        image_path: Path of the image file, opened with PIL.

    Returns:
        A 2-tuple ``(model_input, rgb_image)``: the "data" entry produced by the
        ``Resize`` -> ``ToTensor`` pipeline, and the loaded RGB image as a NumPy array.
    """
    rgb_array = np.array(Image.open(image_path).convert('RGB'))

    # Resize to 1024x2048, then ToTensor with the ImageNet mean/std values
    pipeline = transforms.Compose([
        Resize((1024, 2048)),
        ToTensor(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # The transforms take a {"data", "label"} sample; the label here is only a
    # same-shaped placeholder and is discarded afterwards.
    sample = {"data": rgb_array, "label": np.ones_like(rgb_array)}
    return pipeline(sample)["data"], rgb_array

def get_contours_from_mask(binary_mask):
    """Return the largest external contour of a binary mask as a list of [x, y] points.

    Args:
        binary_mask: Single-channel uint8 mask, as required by cv2.findContours.

    Returns:
        The points of the contour with the largest area, or an empty list when
        no contour is found.
    """
    found, _hierarchy = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not found:
        return []

    # Keep only the biggest region (main object body)
    largest = max(found, key=cv2.contourArea)
    # Flatten the [[x, y]] point layout into a plain list of [x, y] pairs
    return largest.reshape(-1, 2).tolist()

In [None]:
# ---------------------------------------------------------
# 4. MAIN PIPELINE
# ---------------------------------------------------------

def run_hybrid_segmentation(image_path):
    """Segment one image with EfficientViT (background) and YOLO (foreground).

    EfficientViT produces a per-pixel class map from which large "stuff" regions
    (road, sky, building) are extracted; YOLO contributes one entry per detected
    instance mask.

    Args:
        image_path: Path of the input image; it is given to both models.

    Returns:
        A 2-tuple ``(image, objects)``:
          * ``image``: the original RGB image as a NumPy array.
          * ``objects``: list of dicts with keys
              - "name": class name,
              - "index": class id for background entries, ``100 + i`` for the
                i-th YOLO instance,
              - "contour": list of [x, y] points of the largest contour,
              - "box": (x, y, w, h) with (x, y) the top-left corner, in pixels
                of the original image (tuple of ints for background entries,
                list of floats for YOLO entries).

    Side effects:
        Saves the EfficientViT overlay to ./test_results/efficientvit_output.png,
        creating the directory if needed.
    """
    final_objects = []

    # --- Step 1: Run EfficientViT (Background) ---
    input_tensor, original_img = preprocess_for_efficientvit(image_path)
    class_colors = CityscapesDataset.class_colors
    orig_h, orig_w = original_img.shape[:2]

    model_device = next(eff_model.parameters()).device
    with torch.inference_mode():
        batch_tensor = input_tensor.unsqueeze(0).to(model_device, non_blocking=True)
        eff_output = eff_model(batch_tensor)
        # Bring the logits back to the original resolution before taking the argmax
        if eff_output.shape[-2:] != (orig_h, orig_w):
            eff_output = resize(eff_output, size=(orig_h, orig_w))
        semantic_map = eff_output.argmax(dim=1).squeeze(0).cpu().numpy().astype(np.uint8)

    canvas = get_canvas(original_img, semantic_map, class_colors)
    overlay_path = Path("./test_results/efficientvit_output.png")
    # Saving fails if the output directory is missing, so create it first
    overlay_path.parent.mkdir(parents=True, exist_ok=True)
    Image.fromarray(canvas).save(overlay_path)

    # Release the large tensors before running the second model
    del eff_output
    del batch_tensor
    del input_tensor
    if model_device.type == "cuda":
        torch.cuda.empty_cache()

    # Extract "Stuff" (Road, Sky, Building) from Semantic Map
    # We only care about specific background classes for the hybrid model
    stuff_classes_of_interest = [0, 10, 2]  # 0=Road, 10=Sky, 2=Building
    min_stuff_pixels = orig_h * orig_w * 0.01  # at least 1% of the image

    for cls_id in stuff_classes_of_interest:
        # Create binary mask (0/255) for this class
        mask = (semantic_map == cls_id).astype(np.uint8) * 255

        # Filter: only keep significant chunks. Count pixels rather than summing
        # the mask: its values are 255, so a sum would overstate the area 255x.
        if np.count_nonzero(mask) > min_stuff_pixels:
            contours = get_contours_from_mask(mask)
            if contours:
                final_objects.append({
                    "name": CITYSCAPES_CLASSES[cls_id],
                    "index": int(cls_id),
                    "contour": contours,
                    # OpenCV needs int32 points; np.array on Python ints may yield int64
                    "box": cv2.boundingRect(np.array(contours, dtype=np.int32))  # x,y,w,h
                })

    # --- Step 2: Run YOLO11 (Foreground) ---
    # retina_masks=True requests masks at the original image resolution. The
    # default masks live in the letterboxed inference frame, so stretching them
    # to the image size would shift them wherever padding was added.
    yolo_results = yolo_model(image_path, verbose=False, retina_masks=True)[0].cpu()

    if yolo_results.masks is not None:
        for i, mask_data in enumerate(yolo_results.masks.data):
            # Work in float32 so cv2.resize and the 0.5 threshold behave the
            # same whatever dtype the mask tensor has
            mask_np = mask_data.numpy().astype(np.float32)

            # Safety net: resize only if the mask still differs from the image size
            if mask_np.shape[:2] != (orig_h, orig_w):
                mask_np = cv2.resize(mask_np, (orig_w, orig_h))
            binary_mask = (mask_np > 0.5).astype(np.uint8) * 255

            # Get Class Name
            class_id = int(yolo_results.boxes.cls[i].item())
            class_name = yolo_model.names[class_id]

            contours = get_contours_from_mask(binary_mask)

            if contours:
                # boxes.xywh is centre-based; derive a top-left (x, y, w, h) from
                # the corner format so both kinds of entry share one convention
                x1, y1, x2, y2 = yolo_results.boxes.xyxy[i].tolist()
                final_objects.append({
                    "name": class_name,
                    "index": 100 + i,  # Offset index to differentiate from semantic IDs
                    "contour": contours,
                    "box": [x1, y1, x2 - x1, y2 - y1]
                })

    del yolo_results
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return original_img, final_objects

In [None]:
# ---------------------------------------------------------
# 5. VISUALIZATION
# ---------------------------------------------------------

# Run the hybrid pipeline on the sample street image 000042_10_L.png
image, objects = run_hybrid_segmentation("000042_10_L.png")

# Draw on a copy so `image` itself stays unmodified
vis_img = image.copy()

# `image` is loaded as RGB and displayed with matplotlib, so colors are RGB
# tuples here (not OpenCV's usual BGR order).
BACKGROUND_COLOR = (0, 0, 255)  # Blue
FOREGROUND_COLOR = (0, 255, 0)  # Green
LABEL_MIN_Y = 25  # keeps the text baseline low enough for the label to stay on-canvas

for obj in objects:
    pts = np.array(obj['contour'], np.int32).reshape((-1, 1, 2))

    # Indices below 100 are background classes; 100 and above are YOLO instances
    color = BACKGROUND_COLOR if obj['index'] < 100 else FOREGROUND_COLOR

    # Draw Contour
    cv2.polylines(vis_img, [pts], isClosed=True, color=color, thickness=3)

    # Draw Label just above the box anchor, clamped so boxes touching the top
    # edge still get a visible label
    box = obj['box']
    x, y = int(box[0]), int(box[1])
    cv2.putText(vis_img, f"{obj['name']}", (x, max(y - 10, LABEL_MIN_Y)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

# Show Image
plt.figure(figsize=(15, 10))
plt.imshow(vis_img)
plt.axis('off')
plt.show()

# Print Data Output (First 3 objects as example)
print("--- Extracted Object Data (Preview) ---")
for i, obj in enumerate(objects[:3]):
    print(f"Object {i+1}: Name='{obj['name']}', Index={obj['index']}, Box={obj['box']}")
    print(f"Contour (First 5 pts): {obj['contour'][:5]}...")