# DROID-SLAM Processing for Stack Sessions

Process raw capture sessions (ultrawide camera, no ARKit) through DROID-SLAM to get 6DoF poses.

**Requirements:** Google Colab Pro with GPU runtime (A100/V100, 11GB+ VRAM)

**Pipeline:**
1. Mount Google Drive and select a session (upload zipped sessions to `My Drive/stack_sessions/` first)
2. Load RGB frames + camera intrinsics (calib.txt)
3. Run DROID-SLAM inference → SE3 poses
4. Apply metric scale correction
5. Write poses.json back to session
6. Download or sync back to Drive

## 1. Setup Environment

In [None]:
# Install DROID-SLAM and dependencies
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121 -q
!pip install lietorch pytorch_scatter -q
!pip install opencv-python-headless scipy numpy pillow -q

# Clone and install DROID-SLAM
!git clone https://github.com/princeton-vl/DROID-SLAM.git /content/DROID-SLAM 2>/dev/null || true
!cd /content/DROID-SLAM && pip install -e . -q

# Download pretrained weights
!mkdir -p /content/DROID-SLAM/checkpoints
!gdown --id 1PpqVt1H4maBa_GbPJp4NwxRsd9jk-elh -O /content/DROID-SLAM/checkpoints/droid.pth 2>/dev/null || echo 'Download weights manually if gdown fails'

In [None]:
import json
import sys
from pathlib import Path

import cv2
import numpy as np
import torch
from scipy.spatial.transform import Rotation

# Report the runtime up front so a CPU-only or low-VRAM instance is noticed before
# any long-running work starts.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (in bytes); `total_mem` does
    # not exist and raised AttributeError on every GPU runtime.
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

## 2. Mount Google Drive & Select Session

In [None]:
# Mount Google Drive at /content/drive so sessions stored there can be read and updated
from google.colab import drive
drive.mount('/content/drive')

# Set the path to your session directory on Google Drive.
# Every later cell resolves archives and session folders relative to this directory.
DRIVE_SESSIONS_DIR = Path('/content/drive/MyDrive/stack_sessions')

In [None]:
# Extract every session archive found on Drive, leaving already-extracted ones alone
import subprocess

if not DRIVE_SESSIONS_DIR.exists():
    print(f"Upload zipped sessions to Google Drive: My Drive/stack_sessions/")
else:
    zips = sorted(DRIVE_SESSIONS_DIR.glob('*.zip'))
    print(f"Found {len(zips)} zip files")
    for archive in zips:
        # An archive counts as extracted once a folder named after it exists
        if (DRIVE_SESSIONS_DIR / archive.stem).exists():
            print(f"  {archive.name}: already extracted, skipping")
            continue
        print(f"  {archive.name}: extracting...")
        unzip_cmd = ['unzip', '-q', '-o', str(archive), '-d', str(DRIVE_SESSIONS_DIR)]
        # check=True aborts the cell if unzip reports a failure
        subprocess.run(unzip_cmd, check=True)
    print("Done!")

In [None]:
# Summarise each session folder: frame count, capture source and processing status
if not DRIVE_SESSIONS_DIR.exists():
    print(f"No sessions found at {DRIVE_SESSIONS_DIR}")
else:
    # Only directories whose name starts with 'session_' are treated as sessions
    sessions = sorted(
        entry
        for entry in DRIVE_SESSIONS_DIR.iterdir()
        if entry.is_dir() and entry.name.startswith('session_')
    )
    print(f"Found {len(sessions)} sessions:")
    for s in sessions:
        meta_file = s / 'metadata.json'
        if not meta_file.exists():
            print(f"  {s.name}: no metadata")
            continue

        with open(meta_file) as f:
            meta = json.load(f)
        source = meta.get('captureSource', 'iphone_arkit')
        n_frames = meta.get('rgbFrameCount', '?')

        # Processed sessions are 'done'; unprocessed ARKit captures are labelled
        # 'arkit'; everything else still needs SLAM.
        if meta.get('slamProcessed', False):
            status = 'done'
        elif source == 'iphone_arkit':
            status = 'arkit'
        else:
            status = 'needs SLAM'
        print(f"  {s.name}: {n_frames} frames, source={source}, status={status}")

In [None]:
# Choose which session the remaining cells operate on
SESSION_NAME = 'session_2026-02-20_143000'  # <-- Change this
SESSION_DIR = DRIVE_SESSIONS_DIR / SESSION_NAME

# Stop here if the chosen name does not match a folder on Drive
assert SESSION_DIR.exists(), f"Session not found: {SESSION_DIR}"

# Read the session metadata and echo it for inspection; later cells update and
# rewrite this same dictionary.
metadata_path = SESSION_DIR / 'metadata.json'
with metadata_path.open() as f:
    metadata = json.load(f)
print(json.dumps(metadata, indent=2))

## 3. Load Frames & Intrinsics

In [None]:
# Load RGB frames (sorted by filename, which is the order they are fed to SLAM)
rgb_dir = SESSION_DIR / 'rgb'
frame_paths = sorted(rgb_dir.glob('*.jpg'))
print(f"Found {len(frame_paths)} RGB frames")
if not frame_paths:
    # Fail here with a clear message instead of an IndexError on frame_paths[0]
    raise FileNotFoundError(f"No .jpg frames found in {rgb_dir}")

# Preview first frame
import matplotlib.pyplot as plt
first_frame = cv2.imread(str(frame_paths[0]))
if first_frame is None:
    # cv2.imread signals an unreadable or corrupt file by returning None
    raise ValueError(f"Could not decode image: {frame_paths[0]}")
first_frame = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; matplotlib expects RGB
plt.figure(figsize=(8, 6))
plt.imshow(first_frame)
plt.title(f"Frame 0: {first_frame.shape[1]}x{first_frame.shape[0]}")
plt.axis('off')
plt.show()

# Load intrinsics: the first four whitespace-separated values of calib.txt,
# read as fx, fy, cx, cy.
calib_file = SESSION_DIR / 'calib.txt'
if calib_file.exists():
    parts = calib_file.read_text().strip().split()
    if len(parts) < 4:
        raise ValueError(
            f"{calib_file} must contain at least 4 values (fx fy cx cy), found {len(parts)}"
        )
    intrinsics = np.array([float(x) for x in parts[:4]])
    print(f"Intrinsics from calib.txt: fx={intrinsics[0]:.1f} fy={intrinsics[1]:.1f} cx={intrinsics[2]:.1f} cy={intrinsics[3]:.1f}")
else:
    # Default estimate for ultrawide at 480x360
    intrinsics = np.array([300.0, 300.0, 240.0, 180.0])
    print(f"No calib.txt — using default estimate: {intrinsics}")
    print("DROID-SLAM will auto-calibrate (opt_intr=True)")

## 4. Run DROID-SLAM

In [None]:
sys.path.insert(0, '/content/DROID-SLAM')
from droid_slam import Droid

# Check if we need intrinsic optimization
opt_intr = not calib_file.exists()

# Initialize DROID-SLAM
droid = Droid(
    image_size=[360, 480],  # H, W
    intrinsics=intrinsics,
    weights='/content/DROID-SLAM/checkpoints/droid.pth',
    opt_intr=opt_intr,
    buffer=512,
    beta=0.3,
)

print(f"DROID-SLAM initialized (opt_intr={opt_intr})")
print(f"Processing {len(frame_paths)} frames...")

In [None]:
# Feed frames to DROID-SLAM in filename order
from tqdm import tqdm

for i, path in enumerate(tqdm(frame_paths, desc="Tracking")):
    image = cv2.imread(str(path))
    if image is None:
        # cv2.imread returns None for unreadable/corrupt files; stop with the
        # offending frame named instead of an opaque cvtColor error.
        raise ValueError(f"Could not decode frame {i}: {path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Timestamps are synthesized from the frame index, not read from the capture
    timestamp = float(i) / 60.0  # 60 FPS
    # NOTE(review): confirm the installed `track` accepts an HxWx3 uint8 array;
    # some DROID-SLAM versions may expect a torch tensor.
    droid.track(timestamp, image)

print("\nRunning global bundle adjustment...")
traj = droid.terminate()  # (N, 7) [tx, ty, tz, qx, qy, qz, qw]
print(f"Got {traj.shape[0]} poses")

In [None]:
# Convert the (N, 7) trajectory rows [tx, ty, tz, qx, qy, qz, qw] to 4x4 pose matrices
num_poses = traj.shape[0]
if num_poses == 0:
    # Without this the plotting code below fails with a bare IndexError
    raise ValueError("DROID-SLAM returned an empty trajectory; nothing to convert")

poses_4x4 = np.zeros((num_poses, 4, 4), dtype=np.float64)
# Rotation.from_quat takes scalar-last quaternions [qx, qy, qz, qw] and converts the
# whole (N, 4) batch in one call, replacing the per-pose Python loop.
poses_4x4[:, :3, :3] = Rotation.from_quat(traj[:, 3:]).as_matrix()
poses_4x4[:, :3, 3] = traj[:, :3]
poses_4x4[:, 3, 3] = 1.0  # homogeneous bottom-right entry; rest of the last row stays 0

# Visualize trajectory: 3D path with start (green) and end (red) markers
positions = poses_4x4[:, :3, 3]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.plot(positions[:, 0], positions[:, 1], positions[:, 2], 'b-', linewidth=0.5)
ax.scatter(positions[0, 0], positions[0, 1], positions[0, 2], c='g', s=100, label='Start')
ax.scatter(positions[-1, 0], positions[-1, 1], positions[-1, 2], c='r', s=100, label='End')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.legend()
ax.set_title(f'DROID-SLAM Trajectory ({num_poses} poses)')
plt.show()

# Per-axis bounds of the camera positions (in SLAM units, before any scale correction)
print(f"Trajectory extent: X=[{positions[:,0].min():.3f}, {positions[:,0].max():.3f}]")
print(f"                   Y=[{positions[:,1].min():.3f}, {positions[:,1].max():.3f}]")
print(f"                   Z=[{positions[:,2].min():.3f}, {positions[:,2].max():.3f}]")

## 5. Metric Scale Correction

Monocular SLAM has arbitrary scale. To get metric (real-world) coordinates:
1. Place an object of known size in the scene (e.g., a stacking cube)
2. Measure the object in real life with calipers
3. Estimate the object size in SLAM coordinates (from the trajectory or depth)
4. Compute: `scale = real_size / slam_size`

In [None]:
# Metric scale correction
# Set these values based on your calibration object
KNOWN_REAL_DISTANCE_M = 0.05  # e.g., 50mm cube side = 0.05m
KNOWN_SLAM_DISTANCE = 1.0     # Measure from SLAM trajectory (update this!)

# For now, use scale=1.0 (uncorrected) — update after first calibration
USE_SCALE_CORRECTION = False

if USE_SCALE_CORRECTION:
    # A zero reference distance would divide by zero, and a non-positive ratio
    # would collapse or mirror the trajectory, so reject both up front.
    if KNOWN_REAL_DISTANCE_M <= 0 or KNOWN_SLAM_DISTANCE <= 0:
        raise ValueError(
            "KNOWN_REAL_DISTANCE_M and KNOWN_SLAM_DISTANCE must both be positive, got "
            f"{KNOWN_REAL_DISTANCE_M} and {KNOWN_SLAM_DISTANCE}"
        )
    scale = KNOWN_REAL_DISTANCE_M / KNOWN_SLAM_DISTANCE
    # Only the translation column is scaled; rotations are unaffected by a uniform scale
    poses_4x4[:, :3, 3] *= scale
    print(f"Applied scale correction: {scale:.4f}")
else:
    print("No scale correction applied (set USE_SCALE_CORRECTION=True after calibration)")

## 6. Write Poses to Session

In [None]:
# Build poses.json in StackCapture format: one record per frame that has a pose.
# NOTE(review): pose i is paired with frame i and given an index-based 60 FPS
# timestamp — confirm the trajectory has one pose per input frame.
pose_count = min(num_poses, len(frame_paths))
poses_list = [
    {
        'timestamp': float(i) / 60.0,
        'rgbIndex': i,
        'depth': None,
        'transform': poses_4x4[i].tolist(),
    }
    for i in range(pose_count)
]

# Write poses.json into the session folder
poses_file = SESSION_DIR / 'poses.json'
with poses_file.open('w') as f:
    json.dump(poses_list, f, indent=2)
print(f"Wrote {len(poses_list)} poses to {poses_file}")

# Flag the session as processed and rewrite metadata.json in place
metadata.update({'slamProcessed': True, 'poseCount': len(poses_list)})
metadata_path = SESSION_DIR / 'metadata.json'
with metadata_path.open('w') as f:
    json.dump(metadata, f, indent=2)
print(f"Updated metadata: slamProcessed=true")

print(f"\nSession {SESSION_NAME} is now ready for training!")

## 7. Verification

In [None]:
# Re-read what was written to the session as a sanity check
poses_path = SESSION_DIR / 'poses.json'
with poses_path.open() as f:
    loaded_poses = json.load(f)
print(f"Loaded {len(loaded_poses)} poses from poses.json")

# Compare the translation column of the first and last stored transforms
first = np.array(loaded_poses[0]['transform'])
last = np.array(loaded_poses[-1]['transform'])
first_translation = first[:3, 3]
last_translation = last[:3, 3]
displacement = np.linalg.norm(last_translation - first_translation)
print(f"\nFirst pose translation: {first_translation}")
print(f"Last pose translation: {last_translation}")
print(f"Total displacement: {displacement:.4f} units")

# Show the metadata fields relevant to SLAM processing
metadata_path = SESSION_DIR / 'metadata.json'
with metadata_path.open() as f:
    meta = json.load(f)
print(f"\nMetadata:")
print(f"  captureSource: {meta.get('captureSource')}")
print(f"  slamProcessed: {meta.get('slamProcessed')}")
print(f"  poseCount: {meta.get('poseCount')}")
print(f"  rgbFrameCount: {meta.get('rgbFrameCount')}")