<a href="https://colab.research.google.com/github/ggmeiner22/FoundPose/blob/main/FoundPose_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📂 Clone Repository & 🔑 Mount Google Drive  & Install PyTorch3D/dependencies

Clone the repository and mount **Google Drive** (requires user interaction).  
This will also set up the environment and install the necessary libraries.

In [1]:
!pip --quiet install ipython-autotime
%load_ext autotime

!rm -rf /content/sample_data

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m1.4/1.6 MB[0m [31m41.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25htime: 109 ms (started: 2025-10-15 20:16:39 +00:00)


## Set name and email for GitHub Cloning

In [2]:
!git config --global user.name "penguini128"
!git config --global user.email "tgalletta2022@my.fit.edu"

time: 217 ms (started: 2025-10-15 20:16:39 +00:00)


In [3]:
def gh_clone(user, repo, token_key="GH_TOKEN"):
    from google.colab import userdata
    token = userdata.get(token_key)
    url = f"https://{user}:{token}@github.com/{user}/{repo}.git"
    !git clone "$url"
    !git remote set-url origin $url
    del token

time: 872 µs (started: 2025-10-15 20:16:40 +00:00)


##Clone the Repository
This cell will clone the repository and the helper functions we will need.

In [4]:
gh_clone("ggmeiner22", "FoundPose")

# ✅ Verify that the repository was cloned
import os
repo_name = "/content/FoundPose"   # <-- change to your repository folder name
if os.path.exists(repo_name):
    print(f"✅ Repository '{repo_name}' successfully cloned!")
else:
    print(f"❌ Repository '{repo_name}' not found. Try cloning manually.")

Cloning into 'FoundPose'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 15 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (15/15), 6.25 MiB | 26.13 MiB/s, done.
Resolving deltas: 100% (2/2), done.
fatal: not a git repository (or any of the parent directories): .git
✅ Repository '/content/FoundPose' successfully cloned!
time: 1.75 s (started: 2025-10-15 20:16:40 +00:00)



## Mount Google Drive

In [5]:
import os

from google.colab import drive
from google.colab import auth

# auth.authenticate_user()

local_path = ""

# Mount google drive if using Colab
if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    local_path = "/content/"
    os.makedirs("/content/matching_results", exist_ok=True)
else:
    local_path = "/teamspace/studios/this_studio/"
    os.makedirs("/teamspace/studios/this_studio/matching_results", exist_ok=True)



os.chdir(local_path)


from moviepy.video.io.ImageSequenceClip import ImageSequenceClip

os.makedirs("/content/matching_results", exist_ok=True)

Running on CoLab
Mounted at /content/drive


  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"


time: 26.5 s (started: 2025-10-15 20:16:41 +00:00)


## ⚙️ Install Pytorch3D and other libraries

⚡ Install PyTorch3D from Wheel
PyTorch3D installation can take longer than 8-10 minutes when installed from source.

Here, PyTorch3D is installed from a wheel for a faster setup of about 2 minutes in Colab.

If the installer instead tries to build from source, it means the wheel is outdated or missing.
In that case, you can create your own wheel directly in Colab, save it to Google Drive (or Dropbox), and reuse it later for faster installation.
To create your own PyTorch3D wheel in Colab, follow the instructions in the cell after these installation cells.

In [6]:
# set path for libraries
import sys
sys.path.append('/content/FoundPose')

!wget -q -O installation_tools.py \
  https://raw.githubusercontent.com/ribeiro-computer-vision/pytorch3d_rendering/main/installation_tools.py

time: 209 ms (started: 2025-10-15 20:17:08 +00:00)


In [7]:
# --- Config ---
mount_gdrive = False

# --- Imports (module you saved as sse_env.py) ---
import importlib, os, sys, shutil, subprocess, urllib.request, pathlib
import installation_tools as install_tools
importlib.reload(install_tools)

# --- Short helpers (no notebook magics) ---
def run(cmd, check=True):
    print("$", " ".join(cmd))
    try:
        subprocess.run(cmd, check=check)
    except subprocess.CalledProcessError as e:
        print(f"Command failed ({e.returncode}): {' '.join(cmd)}")
        if check:
            raise

def pip_install(*pkgs, extra=None, check=True):
    args = [sys.executable, "-m", "pip", "install"]
    if extra:
        args += extra
    args += list(pkgs)
    run(args, check=check)

def conda_available():
    return shutil.which("conda") is not None

def conda_install(*pkgs):
    if not conda_available():
        print("conda not available; skipping conda installs.")
        return
    # Use -c conda-forge channel and auto-yes
    run(["conda", "install", "-y", "-c", "conda-forge", *pkgs], check=False)

# --- Detect platform ---
pm = install_tools.PlatformManager()
platform, local_path = pm.platform, pm.local_path
print("Detected:", platform, local_path)

# --- Optional: Mount GDrive if on Colab ---
if mount_gdrive and platform == "Colab":
    pm.mount_gdrive()

# --- Lightning AI specific environment tweaks ---
if platform == "LightningAI":
    # conda piece (if conda exists in the image)
    conda_install("libstdcxx-ng=13")
    # pip pins / extras
    pip_install("numpy<2.0", check=False)
    pip_install("scikit-image", "gradio", "moviepy", "plotly", check=False)
    # If requirements.txt exists in CWD, install it
    if os.path.exists("requirements.txt"):
        pip_install("-r", "requirements.txt")

# --- Install PyTorch3D (handles platform differences & fallbacks) ---
installer = install_tools.PyTorch3DInstaller(platform, local_path)
installer.install()

# --- Extra libraries (quiet-ish) ---
# Original line had: trimesh pyrender opencv-python matplotlib pytorch-lightning
pip_install("trimesh", "pyrender", "opencv-python", "matplotlib", "pytorch-lightning", "torch", "torchvision", "timm", "scikit-learn", "numpy", "Pillow", check=False)

# --- Download plot_image_grid.py if missing ---
filename = "plot_image_grid.py"
url = "https://raw.githubusercontent.com/facebookresearch/pytorch3d/main/docs/tutorials/utils/plot_image_grid.py"
if not os.path.exists(filename):
    print(f"Downloading {filename} ...")
    try:
        urllib.request.urlretrieve(url, filename)
        print("Saved to", pathlib.Path(filename).resolve())
    except Exception as e:
        print("Download failed:", e)

# --- gdown ---
pip_install("gdown", extra=["--quiet"], check=False)
print("✅ Setup complete.")

Running on Colab.
Detected: Colab /content/
$ /usr/bin/python3 -m pip install --upgrade pip
$ /usr/bin/python3 -m pip install --upgrade pip
$ sudo apt-get -qq update
$ sudo apt-get install -y freeglut3-dev libglew-dev libsdl2-dev
$ /usr/bin/python3 -m pip install PyOpenGL PyOpenGL_accelerate

PyTorch3D target wheel tag: py312_cu126_pyt280

$ /usr/bin/python3 -m pip install iopath
Trying to install PyTorch3D wheel on Colab (Linux).
$ /usr/bin/python3 -m pip install https://www.dropbox.com/scl/fi/fqvlnyponcbekjd01omhj/pytorch3d-0.7.8-cp312-cp312-linux_x86_64.whl?rlkey=563mfx35rog42z1c8y7qn31sk&dl=1
✅ PyTorch3D successfully installed!
$ /usr/bin/python3 -m pip install trimesh pyrender opencv-python matplotlib pytorch-lightning torch torchvision timm scikit-learn numpy Pillow
Downloading plot_image_grid.py ...
Saved to /content/plot_image_grid.py
$ /usr/bin/python3 -m pip install --quiet gdown
✅ Setup complete.
time: 59.6 s (started: 2025-10-15 20:17:08 +00:00)


In [18]:
import math, random
import numpy as np
from PIL import Image
import torch
import cv2
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
import trimesh
import pyrender
import timm
import matplotlib.pyplot as plt

import os
os.environ["PYOPENGL_PLATFORM"] = "egl"   # must be set before importing pyrender/pyglet

time: 922 µs (started: 2025-10-15 20:25:23 +00:00)


# FoundPose (helpers): minimal components reproduction on a synthetic cube.

What this script does:
1) Offline onboarding:
   - Render N RGB-D templates of a unit cube at different orientations (pyrender).
   - Extract DINOv2 ViT-L/14 patch descriptors from an intermediate layer (≈ layer 18 analog).
   - Compute PCA (to 256D) and per-template Bag-of-Words (k-means vocabulary).
   - Store per-patch 3D points (from depth) to enable 2D-3D correspondences later.

2) Inference (single query):
   - Render a query RGB-D of the same cube at a new pose (unknown to the solver).
   - Extract query patch descriptors, build query BoW, retrieve top-k templates by cosine sim.
   - For each retrieved template: match query->template patch descriptors (1-NN),
     lift template patch centers to 3D, estimate pose with EPnP+RANSAC.
   - Take the hypothesis with most inliers (and lowest reprojection RMSE).

Notes:
- This demo keeps things simple and CPU-friendly.
- For real data, replace the synthetic renderer with your image/mask crops.


In [37]:
# ---------------------------
# Config (scale these later)
# ---------------------------
IMG_SIZE = 518
PATCH = 14                  # DINOv2 ViT-L/14 patch size
GRID = IMG_SIZE // PATCH    # 37
NUM_TEMPLATES = 800          # (paper ~800; keep small for demo)
VOCAB_K = 512               # BoW vocab size (paper ~2048)
TOP_K_RETRIEVE = 12
PCA_DIM = 256               # (paper uses 256)
RANSAC_ITERS = 2000
RANSAC_REPROJ_THRESH = 8.0  # px
FOCAL = 600.0               # simple intrinsics (fx=fy)
CX = IMG_SIZE/2.0
CY = IMG_SIZE/2.0

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

time: 1.36 ms (started: 2025-10-15 20:39:43 +00:00)


## DINOv2 feature extractor

We use timm to load a DINOv2 ViT-L/14 model and grab a mid-level token map akin to
"intermediate layer features" (the paper reports best at ViT-L layer ~18). Exact layer
indices differ across repos; as a practical demo we:
- take the last block token map and/or a mid block by hook (works similarly in practice),
- reshape (B, tokens, C) to (B, H, W, C) where H=W=IMG_SIZE/PATCH.

In [10]:
class DinoV2Grid:
    def __init__(self, model_name="vit_large_patch14_dinov2.lvd142m", target_block=-6):
        """
        model_name: a timm DINOv2 ViT-L/14 variant.
        target_block: which transformer block's output tokens to use.
                      -6 means "sixth from last" (mid-ish); adjust as desired.
        """
        self.model = timm.create_model(model_name, pretrained=True)
        self.model.eval().to(DEVICE)
        self.target_block = target_block
        self._tokens = None

        # register hook on blocks[target_block]
        blocks = self.model.blocks
        idx = target_block if target_block >= 0 else len(blocks) + target_block
        def hook_fn(module, inp, out):
            # out shape: (B, tokens+1, C); token 0 is cls token
            self._tokens = out.detach()
        blocks[idx].register_forward_hook(hook_fn)

        # patch embed stride should be 14 for ViT-L/14
        # we’ll resize input to IMG_SIZE so tokens grid = 30x30

        # Normalization (timm default for this model)
        data_cfg = timm.data.resolve_model_data_config(self.model)
        self.transforms = timm.data.create_transform(**data_cfg, is_training=False)

    @torch.no_grad()
    def extract_grid(self, img_pil: Image.Image) -> np.ndarray:
        """
        Returns (H=GRID, W=GRID, C) patch descriptors as float32 numpy array.
        """
        img_res = img_pil.resize((IMG_SIZE, IMG_SIZE), Image.BICUBIC)
        x = self.transforms(img_res).unsqueeze(0).to(DEVICE)  # (1,3,H,W) normalized
        _ = self.model(x)  # triggers hook
        tokens = self._tokens  # (1, 1+HW, C)
        assert tokens is not None, "Hook did not capture tokens"

        tokens = tokens[:, 1:, :]  # drop cls
        B, HW, C = tokens.shape
        assert B == 1 and HW == GRID*GRID, f"Unexpected tokens: {tokens.shape}"

        H = W = int(math.sqrt(HW))
        grid = tokens.view(B, H, W, C).cpu().float().numpy()[0]
        #grid = tokens.view(1, GRID, GRID, C).cpu().float().numpy()[0]  # (H,W,C)
        return grid  # float32

time: 3.95 ms (started: 2025-10-15 20:18:47 +00:00)


## Rendering utilities

In [11]:
def make_scene():
    # Unit cube centered at origin
    mesh = trimesh.creation.box(extents=(1,1,1))
    mesh.visual.vertex_colors = [180, 200, 240, 255]
    tm = pyrender.Mesh.from_trimesh(mesh, smooth=False)

    scene = pyrender.Scene(bg_color=[0,0,0,0])
    mnode = scene.add(tm)

    # Simple directional light
    light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0)
    scene.add(light, pose=np.eye(4))

    return scene, mnode

def camera_pose_from_euler(rx, ry, rz, dist=2.5):
    # Look-at from spherical angles around origin; put camera on a sphere
    Rx = trimesh.transformations.rotation_matrix(np.deg2rad(rx), [1,0,0])
    Ry = trimesh.transformations.rotation_matrix(np.deg2rad(ry), [0,1,0])
    Rz = trimesh.transformations.rotation_matrix(np.deg2rad(rz), [0,0,1])
    R = trimesh.transformations.concatenate_matrices(Rz, Ry, Rx)
    t = np.array([0, 0, dist, 1.0])
    T = R.copy()
    T[:,3] = t
    return T

def render_rgbd(rx, ry, rz, dist=2.5, img_size=IMG_SIZE):
    scene, mnode = make_scene()

    # Pinhole intrinsics
    camera = pyrender.IntrinsicsCamera(fx=FOCAL, fy=FOCAL, cx=CX, cy=CY)
    cam_node = scene.add(camera, pose=np.eye(4))

    # Pose cube (we'll leave cube at origin) and place camera
    cam_pose = camera_pose_from_euler(rx, ry, rz, dist=dist)
    scene.set_pose(cam_node, pose=cam_pose)

    r = pyrender.OffscreenRenderer(viewport_width=img_size, viewport_height=img_size)
    color, depth = r.render(scene, flags=pyrender.RenderFlags.RGBA)
    r.delete()
    # convert RGBA to RGB (black background)
    rgb = color[..., :3]
    return rgb, depth, cam_pose

# 3D from depth & intrinsics
def backproject(u, v, depth):
    z = depth[v, u]
    if z == 0:
        return None
    x = (u - CX) * z / FOCAL
    y = (v - CY) * z / FOCAL
    return np.array([x, y, z], dtype=np.float32)

time: 7.2 ms (started: 2025-10-15 20:18:47 +00:00)


## Patch extraction helpers

In [12]:
def grid_centers(img_size=IMG_SIZE, patch=PATCH):
    # returns list of (u,v) pixel centers for each 14x14 patch center
    coords = []
    offset = patch / 2.0
    for gy in range(GRID):
        for gx in range(GRID):
            u = int(gx*patch + offset)
            v = int(gy*patch + offset)
            # clamp inside image
            u = min(max(u, 0), img_size-1)
            v = min(max(v, 0), img_size-1)
            coords.append((u, v))
    return coords  # length GRID*GRID

time: 1.02 ms (started: 2025-10-15 20:18:47 +00:00)


## Onboarding (templates)

In [13]:
def sample_orientations(n=NUM_TEMPLATES, seed=0):
    random.seed(seed)
    oris = []
    for _ in range(n):
        rx = random.uniform(-60, 60)
        ry = random.uniform(-60, 60)
        rz = random.uniform(-180, 180)
        oris.append((rx, ry, rz))
    return oris

def build_templates(dino: DinoV2Grid):
    coords = grid_centers()
    T_desc = []      # list of (H,W,C) descriptors (before PCA)
    T_3d = []        # list of per-patch 3D points IN MODEL SPACE (H*W, 3)
    orientations = sample_orientations()

    print(f"[Onboarding] Rendering {len(orientations)} templates...")
    all_desc = []
    for (rx, ry, rz) in orientations:
        rgb, depth, cam_pose = render_rgbd(rx, ry, rz)
        img = Image.fromarray(rgb)
        grid = dino.extract_grid(img)         # (H,W,C)
        T_desc.append(grid)

        # Back-project to CAMERA frame, then transform to WORLD/MODEL frame
        # cam_pose is camera->world; so X_world = cam_pose @ X_cam_h
        xyzs = []
        for (u, v) in coords:
            p3 = backproject(u, v, depth)  # camera frame
            if p3 is None:
                xyzs.append([np.nan, np.nan, np.nan])
                continue
            X_cam_h = np.array([p3[0], p3[1], p3[2], 1.0], dtype=np.float32)
            X_w_h = cam_pose.astype(np.float32) @ X_cam_h
            xyzs.append(X_w_h[:3])  # world/model frame (cube at origin)
        T_3d.append(np.stack(xyzs, axis=0))   # (H*W, 3)

        all_desc.append(grid.reshape(-1, grid.shape[-1]))

    all_desc = np.concatenate(all_desc, axis=0)  # (N*H*W, C)
    print(f"[Onboarding] All descriptor stack: {all_desc.shape}")

    # PCA to 256D
    print("[Onboarding] PCA(256)...")
    pca = PCA(n_components=PCA_DIM, whiten=False, random_state=0)
    pca.fit(all_desc)
    T_desc_pca = [pca.transform(t.reshape(-1, t.shape[-1])).astype(np.float32) for t in T_desc]

    # k-means vocab (BoW)
    print(f"[Onboarding] MiniBatchKMeans(K={VOCAB_K})...")
    kmeans = MiniBatchKMeans(n_clusters=VOCAB_K, batch_size=8192, random_state=0, n_init=5)
    kmeans.fit(np.concatenate(T_desc_pca, axis=0))
    vocab = kmeans.cluster_centers_.astype(np.float32)

    # soft-assignment BoW
    def bow_from_desc(desc_pca):
        d2 = cosine_similarity(desc_pca, vocab)  # (M,K) cosine sim
        top3 = np.argpartition(-d2, 3, axis=1)[:, :3]
        bow = np.zeros((VOCAB_K,), dtype=np.float32)
        for i, idxs in enumerate(top3):
            sims = d2[i, idxs]
            sims = np.clip(sims, 0, None)
            if sims.sum() > 0:
                sims = sims / (sims.sum() + 1e-8)
            for j, w in zip(idxs, sims):
                bow[j] += w
        bow = bow / (bow.sum() + 1e-8)
        return bow

    print("[Onboarding] Building per-template BoWs...")
    T_bow = [bow_from_desc(d) for d in T_desc_pca]

    return {
        "desc_pca": T_desc_pca,           # list of (M, D)
        "xyz_model": T_3d,                # list of (M, 3) in model/world frame
        "bow": np.stack(T_bow, 0),        # (T, K)
        "pca": pca,
        "vocab": vocab,
        "orientations": orientations
    }

time: 10.6 ms (started: 2025-10-15 20:18:47 +00:00)


## Inference on a query

In [14]:
def render_query():
    # new pose not in the onboarding set
    rx, ry, rz = 35.0, -30.0, 45.0
    rgb, depth, cam_pose = render_rgbd(rx, ry, rz)
    return Image.fromarray(rgb), depth

def extract_query(dino: DinoV2Grid, pca, vocab):
    img_pil, depth = render_query()
    grid = dino.extract_grid(img_pil)                 # (H,W,C)
    desc = grid.reshape(-1, grid.shape[-1])
    desc_pca = pca.transform(desc).astype(np.float32) # (M, D)
    # query BoW (same soft assignment)
    d2 = cosine_similarity(desc_pca, vocab)           # (M,K)
    top3 = np.argpartition(-d2, 3, axis=1)[:, :3]
    bow = np.zeros((VOCAB_K,), dtype=np.float32)
    for i, idxs in enumerate(top3):
        sims = d2[i, idxs]
        sims = np.clip(sims, 0, None)
        if sims.sum() > 0:
            sims = sims / (sims.sum() + 1e-8)
        for j, w in zip(idxs, sims):
            bow[j] += w
    bow = bow / (bow.sum() + 1e-8)
    return img_pil, depth, desc_pca, bow

def retrieve_templates(query_bow, T_bow):
    sims = cosine_similarity(query_bow[None, :], T_bow)[0]  # (T,)
    idxs = np.argsort(-sims)[:TOP_K_RETRIEVE]
    return idxs, sims[idxs]

def correspondences(query_desc, template_desc, template_xyz, coords_uv, ratio=0.7, max_pairs=800):
    # L2-normalize
    q = query_desc / (np.linalg.norm(query_desc, axis=1, keepdims=True) + 1e-8)
    t = template_desc / (np.linalg.norm(template_desc, axis=1, keepdims=True) + 1e-8)

    # cosine sims
    sim_qt = q @ t.T                           # (Mq, Mt)
    # top-2 per query for ratio test
    top2 = np.sort(sim_qt, axis=1)[:, -2:]
    keep = top2[:,1] >= ratio * (top2[:,0] + 1e-8)

    # greedy 1-NN (no mutual check)
    nn = np.argmax(sim_qt, axis=1)             # (Mq,)

    # build pairs
    pairs = []
    for qi, tj in enumerate(nn):
        if not keep[qi]:
            continue
        p3 = template_xyz[tj]
        if np.any(np.isnan(p3)):
            continue
        sim = sim_qt[qi, tj]
        u, v = coords_uv[qi]
        pairs.append((sim, u, v, p3[0], p3[1], p3[2]))

    if not pairs:
        return np.empty((0,2), np.float32), np.empty((0,3), np.float32)

    # keep the best-N sims
    pairs.sort(key=lambda x: x[0], reverse=True)
    pairs = pairs[:max_pairs]

    uv  = np.array([[p[1], p[2]] for p in pairs], dtype=np.float32)
    xyz = np.array([[p[3], p[4], p[5]] for p in pairs], dtype=np.float32)
    return uv, xyz


def solve_pnp_ransac(uv, xyz):
    if len(uv) < 6:
        return None
    K = np.array([[FOCAL, 0, CX],
                  [0, FOCAL, CY],
                  [0, 0, 1]], dtype=np.float64)
    dist = np.zeros(5)
    ok, rvec, tvec, inliers = cv2.solvePnPRansac(
        xyz.astype(np.float64),
        uv.astype(np.float64),
        K, dist,
        iterationsCount=RANSAC_ITERS,
        reprojectionError=RANSAC_REPROJ_THRESH,
        flags=cv2.SOLVEPNP_EPNP
    )
    if not ok:
        return None
    # compute RMSE reprojection on inliers
    proj, _ = cv2.projectPoints(xyz[inliers[:,0]], rvec, tvec, K, dist)
    err = np.linalg.norm(proj.squeeze() - uv[inliers[:,0]], axis=1)
    rmse = float(np.sqrt((err**2).mean()))
    return {
        "rvec": rvec, "tvec": tvec,
        "inliers": inliers, "rmse": rmse,
        "num_inliers": len(inliers)
    }

time: 23.5 ms (started: 2025-10-15 20:18:47 +00:00)


# FoundPose (Demo)

Initilize the model

In [38]:
dino = DinoV2Grid(model_name="vit_large_patch14_dinov2.lvd142m", target_block=-6)

time: 5.29 s (started: 2025-10-15 20:39:49 +00:00)


Onboard templates (can take time; cache if needed)

In [19]:
templates = build_templates(dino)   # builds PCA, vocab, desc_pca, xyz_model, bow
len(templates["desc_pca"]), templates["bow"].shape

[Onboarding] Rendering 60 templates...
[Onboarding] All descriptor stack: (82140, 1024)
[Onboarding] PCA(256)...
[Onboarding] MiniBatchKMeans(K=512)...
[Onboarding] Building per-template BoWs...


(60, (60, 512))

time: 2min 18s (started: 2025-10-15 20:25:29 +00:00)


Render a query, extract descriptors + BoW, retrieve candidates

In [20]:
print("[Query] Rendering + extracting...")
q_img, q_depth, q_desc_pca, q_bow = extract_query(dino, templates["pca"], templates["vocab"])

top_idxs, top_sims = retrieve_templates(q_bow, templates["bow"])
print("[Retrieve] Top templates:", list(zip(top_idxs.tolist(), np.round(top_sims, 3).tolist())))

[Query] Rendering + extracting...
[Retrieve] Top templates: [(20, 0.7799999713897705), (2, 0.7630000114440918), (45, 0.753000020980835), (28, 0.671999990940094), (48, 0.6620000004768372), (23, 0.6610000133514404), (11, 0.5889999866485596), (7, 0.5569999814033508), (13, 0.5569999814033508), (14, 0.5569999814033508), (9, 0.5569999814033508), (10, 0.5569999814033508)]
time: 888 ms (started: 2025-10-15 20:28:33 +00:00)


Build query 2D coordinates

In [23]:
coords_uv = np.array(grid_centers(), dtype=np.float32)  # fixed GRID version
coords_uv.shape

(1369, 2)

time: 3.4 ms (started: 2025-10-15 20:29:12 +00:00)


Try PnP for each retrieved template

In [36]:
best = None
for tidx in top_idxs:
    uv, xyz = correspondences(
        q_desc_pca,
        templates["desc_pca"][tidx],
        templates["xyz_model"][tidx],
        coords_uv
    )
    print(f"[DBG] tidx={tidx}: matches={len(uv)}")
    res = solve_pnp_ransac(uv, xyz)
    if res is None:
        continue
    cand = (res["num_inliers"], -res["rmse"], tidx, res)
    if (best is None) or (cand > best):
        best = cand

[DBG] tidx=20: matches=1
[DBG] tidx=2: matches=2
[DBG] tidx=45: matches=1
[DBG] tidx=28: matches=3
[DBG] tidx=48: matches=2
[DBG] tidx=23: matches=1
[DBG] tidx=11: matches=1
[DBG] tidx=7: matches=0
[DBG] tidx=13: matches=0
[DBG] tidx=14: matches=0
[DBG] tidx=9: matches=0
[DBG] tidx=10: matches=0
time: 445 ms (started: 2025-10-15 20:38:13 +00:00)


Handle “no solution” early

In [None]:
if best is None:
    raise RuntimeError("No valid PnP solution — loosen matcher or RANSAC and rerun Cell 5.")

Report final pose metrics

In [None]:
num_inl, neg_rmse, tidx, res = best
print(f"[Pose] Best from template {tidx}: inliers={num_inl}, rmse={-neg_rmse:.3f}px")
res.keys()  # rvec, tvec, inliers, rmse, num_inliers

Visualize reprojection overlay

In [None]:
K = np.array([[FOCAL, 0, CX],
              [0, FOCAL, CY],
              [0, 0, 1]], dtype=np.float64)
dist = np.zeros(5)

proj, _ = cv2.projectPoints(templates["xyz_model"][tidx], res["rvec"], res["tvec"], K, dist)
reproj = proj.squeeze().astype(np.float32)

fig, ax = plt.subplots(1,1, figsize=(5,5))
ax.imshow(q_img)
ax.scatter(reproj[:,0], reproj[:,1], s=5, alpha=0.3)
ax.set_title(f"Best hypothesis (template {tidx})\nInliers={num_inl}, RMSE={-neg_rmse:.2f}px")
ax.axis('off')
plt.show()