# Debug Absolute Localization

This notebook helps visualize and debug the absolute localization evaluation.


In [None]:
import sys

sys.path.insert(0, "/home/guillemc/dev/LuPNT-private")

import numpy as np
import matplotlib.pyplot as plt
import cv2
import torch
import h5py
from pathlib import Path
import pylupnt as pnt
from pylupnt.numerics.frames import OCV_T_FLU, FLU_T_OCV
from gluefactory.geometry.wrappers import Camera, Pose
from gluefactory.eval.utils import eval_matches_epipolar
import plotly.graph_objects as go
import plotly.express as px

# Set up paths
# NOTE(review): dataset_path is a machine-specific absolute path; adjust it
# when running this notebook on another host.
dataset_path = Path("/home/shared_ws6/data/feature_matching/unreal_training/")
scene_name = "unreal"
# Scene directory is <dataset_path>/<scene_name>.
scene_dir = dataset_path / scene_name

In [None]:
def load_dataset(dataset_path, scene_name):
    """Read the pair list and the per-view metadata of one scene.

    Args:
        dataset_path: Root directory of the dataset (str or Path).
        scene_name: Name of the scene sub-directory under ``dataset_path``.

    Returns:
        A ``(pairs, views, scene_dir)`` tuple. ``pairs`` is a list of tuples
        holding the first two whitespace-separated tokens of every non-blank
        line of ``pairs.txt``. ``views`` maps the first token of every
        non-blank line of ``views.txt`` to the list of the remaining tokens
        (kept as strings). ``scene_dir`` is ``Path(dataset_path) / scene_name``.
    """
    scene_dir = Path(dataset_path) / scene_name

    pairs = []
    with open(scene_dir / "pairs.txt") as pairs_file:
        for raw_line in pairs_file:
            tokens = raw_line.strip().split()
            if tokens:
                # Anything after the two image names is ignored.
                pairs.append(tuple(tokens[:2]))

    views = {}
    with open(scene_dir / "views.txt") as views_file:
        for raw_line in views_file:
            tokens = raw_line.strip().split()
            if not tokens:
                continue
            name, *fields = tokens
            views[name] = fields

    return pairs, views, scene_dir


pairs, views, scene_dir = load_dataset(dataset_path, scene_name)
print(f"Loaded {len(pairs)} pairs")

# Only index into `pairs` once it is known to be non-empty: the first-pair
# print used to run before this guard and raised IndexError on an empty list.
if pairs:
    print(f"First pair: {pairs[0]}")
    # Frame index = image file name with the ".jpg" suffix removed.
    frame_gaps = [
        abs(int(a.replace(".jpg", "")) - int(b.replace(".jpg", "")))
        for a, b in pairs[:100]
    ]
    print(
        f"Frame gaps (first 100 pairs): min={min(frame_gaps)}, max={max(frame_gaps)}, mean={np.mean(frame_gaps):.1f}"
    )

In [None]:
def parse_view(img_name, view_data, scene_dir):
    """Load image, depth map, camera and world-to-camera pose of one view.

    Args:
        img_name: Image file name (``.jpg``); the depth file has the same
            stem with an ``.h5`` extension.
        view_data: Sequence of string tokens for this view, laid out as
            ``[0:9]`` row-major 3x3 rotation, ``[9:12]`` translation,
            ``[12]`` camera model name, ``[13]`` width, ``[14]`` height,
            ``[15:]`` camera parameters.
        scene_dir: Scene directory (Path) containing ``images/`` and ``depths/``.

    Returns:
        ``(img, depth, camera, T_w2cam_ocv)``: the RGB uint8 image, the
        float32 depth array, the ``Camera`` built from the COLMAP-style
        fields, and the world-to-camera ``Pose`` left-multiplied by
        ``OCV_T_FLU``.

    Raises:
        FileNotFoundError: If the image cannot be read by OpenCV.
    """
    R = np.array(view_data[0:9], dtype=np.float32).reshape(3, 3)
    t = np.array(view_data[9:12], dtype=np.float32)
    w, h = int(view_data[13]), int(view_data[14])
    params = np.array(view_data[15:], dtype=np.float32)

    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    img_path = scene_dir / "images" / img_name
    img_bgr = cv2.imread(str(img_path))
    if img_bgr is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    img = img.astype(np.uint8)

    with h5py.File(scene_dir / "depths" / img_name.replace(".jpg", ".h5"), "r") as f:
        depth = f["depth"][:].astype(np.float32)
    camera = Camera.from_colmap(
        {"model": view_data[12], "width": w, "height": h, "params": params}
    )
    # Poses are in FLU frame, but gluefactory expects OCV frame
    # If world_T_cam_ocv = world_T_cam_flu @ FLU_T_OCV (camera-to-world)
    # Then T_w2cam_ocv = inv(world_T_cam_ocv) = OCV_T_FLU @ T_w2cam_flu (world-to-camera)
    T_w2cam_flu = Pose.from_Rt(torch.from_numpy(R).float(), torch.from_numpy(t).float())
    T_w2cam_ocv = Pose.from_4x4mat(torch.from_numpy(OCV_T_FLU).float()) @ T_w2cam_flu
    return img, depth, camera, T_w2cam_ocv


# Work on the first pair of the list.
img0_name, img1_name = pairs[0]
view0_fields = views[img0_name]
print(
    f"View {img0_name}: len={len(view0_fields)}, first 20: {view0_fields[:20]}"
)

# Parse both views in order (image 0 first, then image 1).
(img0, depth0, camera0, T_w2cam0_gt), (img1, depth1, camera1, T_w2cam1_gt) = [
    parse_view(name, views[name], scene_dir) for name in (img0_name, img1_name)
]

print(
    "\n".join(
        f"Image {k}: {image.shape}, depth: {depth.shape}"
        for k, (image, depth) in enumerate([(img0, depth0), (img1, depth1)])
    )
)

In [None]:
# Feature extraction and matching (concise)
# Configurations are named up front so the three models are easy to compare.
_EXTRACTOR_CONF = {"class": "SuperPoint"}
_LIGHTGLUE_CONF = {"class": "LightGlue", "features": "superpoint"}
_SUPERGLUE_CONF = {"class": "SuperGlue"}

extractor = pnt.FeatureExtractor.from_config(_EXTRACTOR_CONF)
matcher_lightglue = pnt.FeatureMatcher.from_config(_LIGHTGLUE_CONF)
matcher_superglue = pnt.FeatureMatcher.from_config(_SUPERGLUE_CONF)


# Convert images to uint8 if needed
def to_uint8(im):
    """Return ``im`` as a ``uint8`` array when it holds floating-point data.

    Arrays of any floating dtype (float16/32/64, ...) are converted: when the
    maximum value is <= 1.0 the data is treated as normalised and scaled by
    255 before the cast, otherwise it is cast directly. Arrays of any other
    dtype are returned unchanged (same object).

    Args:
        im: NumPy array holding the image.

    Returns:
        A ``uint8`` array for floating-point input, otherwise ``im`` itself.
    """
    if not np.issubdtype(im.dtype, np.floating):
        return im
    # max() is undefined on an empty array; there is nothing to scale anyway.
    if im.size == 0:
        return im.astype(np.uint8)
    if im.max() <= 1.0:
        return (im * 255).astype(np.uint8)
    return im.astype(np.uint8)


img0 = to_uint8(img0)
img1 = to_uint8(img1)

# Sanity-check what is handed to the extractor.
for i, im in enumerate((img0, img1)):
    lo, hi = im.min(), im.max()
    print(f"Image {i} dtype: {im.dtype}, shape: {im.shape}, range: [{lo}, {hi}]")

feats0 = extractor.extract(img0)
feats1 = extractor.extract(img1)

# Both matchers run on the same features; SuperGlue first, then LightGlue.
matches_superglue = matcher_superglue.match(feats0, feats1)
matches_lightglue = matcher_lightglue.match(feats0, feats1)
matches = matches_lightglue  # LightGlue by default

print(f"Features 0: {len(feats0)}, Features 1: {len(feats1)}")
n_superglue, n_lightglue = len(matches_superglue), len(matches_lightglue)
print(f"Matches (SuperGlue): {n_superglue}, Matches (LightGlue): {n_lightglue}")

In [None]:
def visualize_keypoints(img0, img1, feats0, feats1, max_keypoints=1000):
    """Show the keypoints of both images side by side and print statistics.

    Image 0 keypoints are drawn in cyan, image 1 keypoints in magenta. When an
    image has more than ``max_keypoints`` keypoints, a random subset of that
    size (drawn without replacement via ``np.random.choice``) is plotted.
    Score statistics are printed when both feature objects expose ``scores``.

    Args:
        img0, img1: Images passed to ``imshow``.
        feats0, feats1: Feature objects exposing ``uv`` (and optionally ``scores``).
        max_keypoints: Maximum number of keypoints plotted per image.
    """

    def _to_numpy(values):
        # Objects exposing .cpu() (e.g. tensors) are moved to host memory first.
        return np.asarray(values.cpu() if hasattr(values, "cpu") else values)

    def _pick(n_total):
        if n_total > max_keypoints:
            return np.random.choice(
                n_total, min(max_keypoints, n_total), replace=False
            )
        return np.arange(n_total)

    fig, axes = plt.subplots(1, 2, figsize=(20, 10))
    kp0, kp1 = _to_numpy(feats0.uv), _to_numpy(feats1.uv)
    idx0 = _pick(len(kp0))
    idx1 = _pick(len(kp1))

    panels = (
        (0, img0, kp0, idx0, "#00FFFF", "cyan"),
        (1, img1, kp1, idx1, "#FF00FF", "magenta"),
    )
    for n, image, kp, idx, color, label in panels:
        ax = axes[n]
        ax.imshow(image)
        ax.scatter(kp[idx, 0], kp[idx, 1], c=color, s=10, alpha=0.6, label=label)
        ax.set_title(f"Image {n}: {len(kp)} kpts (showing {len(idx)})")
        ax.axis("off")
    plt.tight_layout()
    plt.show()

    print(f"Image 0: {len(kp0)} keypoints; Image 1: {len(kp1)} keypoints")
    if hasattr(feats0, "scores") and hasattr(feats1, "scores"):
        for tag, scores in (("Image 0", feats0.scores), ("Image 1", feats1.scores)):
            s = _to_numpy(scores)
            print(
                f"{tag} scores: min={s.min():.3f}, max={s.max():.3f}, mean={s.mean():.3f}"
            )


def visualize_matches(img0, img1, feats0, feats1, matches, max_matches=200):
    """Draw matches between two images placed side by side.

    Match lines are yellow, image 0 keypoints cyan and image 1 keypoints
    magenta. Image 1 is pasted to the right of image 0, so its keypoints are
    shifted by the width of image 0.

    Args:
        img0, img1: Three-channel images (the canvas is allocated with 3 channels).
        feats0, feats1: Feature objects exposing ``uv``.
        matches: Object exposing ``indexes`` (rows of image-0/image-1 keypoint
            indices) and supporting ``len()``.
        max_matches: Plot only the first ``max_matches`` rows; ``None`` plots all.
    """
    (h0, w0), (h1, w1) = img0.shape[:2], img1.shape[:2]
    canvas = np.zeros((max(h0, h1), w0 + w1, 3), dtype=img0.dtype)
    canvas[:h0, :w0] = img0
    canvas[:h1, w0:] = img1

    def _to_numpy(values):
        return np.asarray(values.cpu() if hasattr(values, "cpu") else values)

    kp0, kp1 = _to_numpy(feats0.uv), _to_numpy(feats1.uv)

    shown = matches.indexes
    if max_matches is not None and len(matches.indexes) > 0:
        shown = matches.indexes[:max_matches]

    fig, ax = plt.subplots(1, 1, figsize=(20, 10))
    ax.imshow(canvas)
    for pair in shown:
        start = kp0[pair[0]]
        end = kp1[pair[1]] + np.array([w0, 0])  # shift into the right half
        # match lines = yellow, keypoints = cyan (img0), magenta (img1)
        ax.plot(
            [start[0], end[0]],
            [start[1], end[1]],
            color="#FFFF00",
            alpha=0.3,
            linewidth=0.5,
        )
        ax.plot(start[0], start[1], "o", color="#00FFFF", markersize=2)
        ax.plot(end[0], end[1], "o", color="#FF00FF", markersize=2)
    ax.set_title(f"Matches (showing {len(shown)} of {len(matches)})")
    ax.axis("off")
    plt.tight_layout()
    plt.show()


print("=== All Keypoints ===")
visualize_keypoints(img0, img1, feats0, feats1)

print("\n=== Matches ===")
n_matches = len(matches)
print(f"Total matches found: {n_matches}")
# Ratio relative to the image with fewer keypoints.
match_ratio = n_matches / min(len(feats0), len(feats1)) * 100
print(f"Match ratio: {match_ratio:.2f}%")
print(f"Image pair: {img0_name} <-> {img1_name}")

# Frame index = image file name without the ".jpg" suffix.
idx0 = int(img0_name.replace(".jpg", ""))
idx1 = int(img1_name.replace(".jpg", ""))
print(f"Frame gap: {abs(idx1-idx0)} frames")

match_scores = matches.distances
score_min, score_max, score_mean = (
    match_scores.min(),
    match_scores.max(),
    match_scores.mean(),
)
print(
    f"Match scores: min={score_min:.3f}, max={score_max:.3f}, mean={score_mean:.3f}"
)
visualize_matches(img0, img1, feats0, feats1, matches, max_matches=None)

In [None]:
def get_matched_keypoints(feats0, feats1, matches):
    """Return the keypoint coordinates of every match.

    Args:
        feats0, feats1: Feature objects exposing an indexable ``uv`` array.
        matches: Object whose ``indexes`` is a 2-D array; column 0 indexes
            ``feats0.uv`` and column 1 indexes ``feats1.uv``.

    Returns:
        ``(uv0, uv1)``: rows of ``feats0.uv`` and ``feats1.uv`` in match order.
    """
    rows0 = matches.indexes[:, 0]
    rows1 = matches.indexes[:, 1]
    return feats0.uv[rows0], feats1.uv[rows1]


def sample_depth_at_keypoints(depth_map, kpts, max_depth=1000):
    """Look up the depth under each keypoint (nearest pixel).

    Keypoints are rounded to integer pixel coordinates and clipped to the
    image bounds before the lookup, so a keypoint outside the map reads the
    nearest border pixel.

    Args:
        depth_map: 2-D array indexed as ``depth_map[row, col]``.
        kpts: ``[N, 2]`` array of ``(x, y)`` keypoint coordinates; not modified.
        max_depth: Exclusive upper bound for a depth to count as valid.
            Defaults to 1000, the previously hard-coded cutoff.

    Returns:
        ``(d, valid)``: the sampled depths and a boolean mask that is True
        where ``0 < d < max_depth``.
    """
    px = np.round(kpts).astype(int)
    n_rows, n_cols = depth_map.shape[0], depth_map.shape[1]
    px[:, 0] = np.clip(px[:, 0], 0, n_cols - 1)
    px[:, 1] = np.clip(px[:, 1], 0, n_rows - 1)
    d = depth_map[px[:, 1], px[:, 0]]
    valid = (d > 0) & (d < max_depth)
    return d, valid


matched_kp0, matched_kp1 = get_matched_keypoints(feats0, feats1, matches)
# Depth is sampled in image 0 only.
depth0_vals, valid0 = sample_depth_at_keypoints(depth0, matched_kp0)

print(f"Total matches: {len(matched_kp0)}")
print(f"Valid depth matches: {valid0.sum()}")
valid_depths = depth0_vals[valid0]
print(f"Depth range: {valid_depths.min():.2f} - {valid_depths.max():.2f} m")

In [None]:
def convert_to_3d_world(matched_kp0, depth0_vals, valid0, camera0, T_w2cam0_gt):
    """Back-project valid keypoints of image 0 to 3D points in the world frame.

    Note: T_w2cam0_gt is in OCV frame (for gluefactory), but the points are
    converted to the FLU camera frame for the PnP solver (lupnt uses FLU).

    Args:
        matched_kp0: ``[N, 2]`` keypoints ``(u, v)`` in image 0.
        depth0_vals: ``[N]`` depth sampled at each keypoint.
        valid0: ``[N]`` boolean mask selecting keypoints with usable depth.
        camera0: Camera whose ``_data[2:6]`` is read as ``(fx, fy, cx, cy)``.
        T_w2cam0_gt: World-to-camera pose of image 0 (OCV convention).

    Returns:
        ``(pts_w_flu, xyz_cam_flu)``: the points in the world frame and the
        same points in the FLU camera frame.
    """
    fx, fy, cx, cy = (float(camera0._data[i]) for i in range(2, 6))

    # Convert T_w2cam0_gt from OCV back to FLU
    T_w2cam0_gt_flu = (
        Pose.from_4x4mat(torch.from_numpy(FLU_T_OCV).float()) @ T_w2cam0_gt
    )

    # Pinhole back-projection in the OCV camera frame (z = depth).
    kp_valid = matched_kp0[valid0]
    z_cam_ocv = depth0_vals[valid0]
    x_cam_ocv = (kp_valid[:, 0] - cx) * z_cam_ocv / fx
    y_cam_ocv = (kp_valid[:, 1] - cy) * z_cam_ocv / fy
    xyz_cam_ocv = np.stack([x_cam_ocv, y_cam_ocv, z_cam_ocv], axis=-1)

    # OCV camera frame -> FLU camera frame
    xyz_cam_flu = pnt.apply_transform(FLU_T_OCV, xyz_cam_ocv)

    # FLU camera frame -> world frame, using the inverse of the FLU pose
    world_T_cam0_flu = pnt.make_transform(*T_w2cam0_gt_flu.inv().numpy())
    pts_w_flu = pnt.apply_transform(world_T_cam0_flu, xyz_cam_flu)

    return pts_w_flu, xyz_cam_flu


xyz_w, xyz_cam_flu = convert_to_3d_world(
    matched_kp0, depth0_vals, valid0, camera0, T_w2cam0_gt
)

print(f"3D points in world (FLU): {xyz_w.shape}")
# Per-axis extent of the point cloud. The unbalanced ")" that used to trail
# the z range in the printed text has been removed.
axis_ranges = ", ".join(
    f"{axis}[{xyz_w[:, col].min():.2f}, {xyz_w[:, col].max():.2f}]"
    for col, axis in enumerate("xyz")
)
print(f"3D points range: {axis_ranges}")

In [None]:
# Debug evaluation metrics - convert to gluefactory format and run eval functions
from gluefactory.eval.utils import (
    eval_matches_depth,
    eval_matches_epipolar,
    eval_relative_pose_robust,
)
from pylupnt.numerics import ensure_torch


def convert_to_gluefactory_format(
    img0, img1, depth0, depth1, camera0, camera1, T_w2cam0, T_w2cam1
):
    """Pack images, depths, cameras and poses into a glue-factory data dict.

    Returns:
        A dict with ``"view0"`` and ``"view1"`` entries (each holding
        ``"image"``, ``"camera"``, ``"depth"`` and ``"T_w2cam"``) and
        ``"T_0to1"``, computed as ``T_w2cam1 @ T_w2cam0.inv()``.
    """

    def _batched(camera):
        # A 1-D parameter vector has no batch axis yet; add one.
        if camera._data.ndim == 1:
            return Camera(camera._data.unsqueeze(0))
        return camera

    def _view(image, depth, camera, T_w2cam):
        return {
            "image": ensure_torch(image, channels=3, device="cpu", batch_dim=True),
            "camera": camera,
            "depth": ensure_torch(depth, device="cpu", batch_dim=True),
            "T_w2cam": T_w2cam,
        }

    camera0 = _batched(camera0)
    camera1 = _batched(camera1)

    return {
        "view0": _view(img0, depth0, camera0, T_w2cam0),
        "view1": _view(img1, depth1, camera1, T_w2cam1),
        "T_0to1": T_w2cam1 @ T_w2cam0.inv(),
    }


def convert_features_to_gluefactory_format(feats1, feats2, matches):
    """Convert pylupnt Features and Matches to a glue-factory prediction dict.

    Args:
        feats1: Features of the first image (``uv`` NumPy array, supports ``len``).
        feats2: Features of the second image (``uv`` NumPy array).
        matches: Object with NumPy ``indexes`` (``[M, 2]``) and ``distances`` (``[M]``).

    Returns:
        Dict with ``"keypoints0"``/``"keypoints1"`` (float32 tensors),
        ``"matches0"`` (for each keypoint of the first image, the index of its
        match in the second image or -1) and ``"matching_scores0"``
        (``1 - distance / (max_distance + 1e-6)`` for matched keypoints, 0
        elsewhere).
    """
    n_first = len(feats1)
    pred = {
        "keypoints0": torch.from_numpy(feats1.uv.copy()).float(),
        "keypoints1": torch.from_numpy(feats2.uv.copy()).float(),
        "matches0": torch.full((n_first,), -1, dtype=torch.long),
        "matching_scores0": torch.zeros(n_first, dtype=torch.float32),
    }

    # Nothing to scatter when there are no matches.
    if len(matches.indexes) == 0:
        return pred

    src = torch.from_numpy(matches.indexes[:, 0].copy()).long()
    dst = torch.from_numpy(matches.indexes[:, 1].copy()).long()
    pred["matches0"][src] = dst

    # Distances are normalised by their maximum and flipped so that a
    # smaller distance gives a larger score.
    dists = matches.distances
    max_dist = dists.max() if len(dists) > 0 else 1.0
    scores = 1.0 - (dists / (max_dist + 1e-6))
    pred["matching_scores0"][src] = torch.from_numpy(scores).float()

    return pred


# Convert to gluefactory format
data = convert_to_gluefactory_format(
    img0, img1, depth0, depth1, camera0, camera1, T_w2cam0_gt, T_w2cam1_gt
)
pred = convert_features_to_gluefactory_format(feats0, feats1, matches)

# Dump the structure of both dicts as a quick sanity check.
print("Data keys:", list(data))
print("Data view0 keys:", list(data["view0"]))
print("Pred keys:", list(pred))
for key in ("keypoints0", "keypoints1"):
    tensor = pred[key]
    print(f"{key.capitalize()} shape: {tensor.shape}, dtype: {tensor.dtype}")

n_valid_matches = (pred["matches0"] >= 0).sum().item()
print(f"Matches0 shape: {pred['matches0'].shape}, valid matches: {n_valid_matches}")
print(f"T_0to1 shape: {data['T_0to1'].shape}")
for view_idx in (0, 1):
    view = data[f"view{view_idx}"]
    print(f"Camera{view_idx} shape: {view['camera']._data.shape}")

In [None]:
# Epipolar evaluation
from gluefactory.geometry.epipolar import generalized_epi_dist

epipolar_results = eval_matches_epipolar(data, pred)
print("Epipolar Results:")
for k, v in epipolar_results.items():
    print(f"  {k}: {v}")

# Compute epipolar errors for matched points
kp0 = pred["keypoints0"]
kp1 = pred["keypoints1"]
m0 = pred["matches0"]
valid_matches = m0 >= 0  # -1 marks an unmatched keypoint
pts0 = kp0[valid_matches]
pts1 = kp1[m0[valid_matches]]

# A leading batch axis is added for the call and stripped from the result.
n_epi_err = generalized_epi_dist(
    pts0[None],
    pts1[None],
    data["view0"]["camera"],
    data["view1"]["camera"],
    data["T_0to1"],
    False,
    essential=True,
)[0]

epi_mean = n_epi_err.mean().item()
epi_median = n_epi_err.median().item()
print(f"\nEpipolar errors: mean={epi_mean:.6f}, median={epi_median:.6f}")

threshold_counts = ", ".join(
    f"<{label}: {(n_epi_err < th).sum().item()}"
    for label, th in (("1e-4", 1e-4), ("5e-4", 5e-4), ("1e-3", 1e-3))
)
print(f"Matches below thresholds: {threshold_counts}")

In [None]:
# Visualize epipolar errors
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
hist_ax, cdf_ax = axes

# (value, colour, legend label) of the vertical threshold markers.
epi_threshold_lines = (
    (1e-4, "r", "1e-4 threshold"),
    (5e-4, "orange", "5e-4 threshold"),
    (1e-3, "y", "1e-3 threshold"),
)

# Histogram of epipolar errors
hist_ax.hist(n_epi_err.cpu().numpy(), bins=100, log=True, edgecolor="black", alpha=0.7)
for th, color, label in epi_threshold_lines:
    hist_ax.axvline(th, color=color, linestyle="--", label=label)
hist_ax.set_xlabel("Epipolar Error")
hist_ax.set_ylabel("Count (log scale)")
hist_ax.set_title("Distribution of Epipolar Errors")
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# CDF of epipolar errors
sorted_err = torch.sort(n_epi_err)[0].cpu().numpy()
cumulative = np.arange(1, len(sorted_err) + 1) / len(sorted_err)
cdf_ax.plot(sorted_err, cumulative, linewidth=2)
for th, color, label in epi_threshold_lines:
    cdf_ax.axvline(th, color=color, linestyle="--", label=label)
cdf_ax.set_xlabel("Epipolar Error")
cdf_ax.set_ylabel("Cumulative Fraction")
cdf_ax.set_title("CDF of Epipolar Errors")
cdf_ax.legend()
cdf_ax.grid(True, alpha=0.3)
cdf_ax.set_xscale("log")

plt.tight_layout()
plt.show()

In [None]:
# Run depth evaluation and debug
from gluefactory.geometry.depth import symmetric_reprojection_error

depth_results = eval_matches_depth(data, pred)
print("Depth/Reprojection Results:")
for k, v in depth_results.items():
    print(f"  {k}: {v}")

# Debug reprojection errors directly
reproj_error, valid = symmetric_reprojection_error(
    pts0[None],
    pts1[None],
    data["view0"]["camera"],
    data["view1"]["camera"],
    data["T_0to1"],
    data["view0"]["depth"],
    data["view1"]["depth"],
)
# Strip the batch axis added for the call.
reproj_error = reproj_error[0]
valid = valid[0]

print("\nReprojection errors:")
print(f"  Shape: {reproj_error.shape}")
print(f"  Valid matches: {valid.sum().item()} / {len(valid)}")
for stat_label, stat_name in (
    ("Min", "min"),
    ("Max", "max"),
    ("Mean", "mean"),
    ("Median", "median"),
):
    stat_value = getattr(reproj_error[valid], stat_name)().item()
    print(f"  Valid error - {stat_label}: {stat_value:.3f}px")

# Count matches below thresholds (NaN errors are mapped to +inf so they never count)
valid_errors = reproj_error[valid].nan_to_num(nan=float("inf"))
print("\nValid matches below thresholds:")
for px_th in (1, 3, 5):
    below = valid_errors < px_th
    print(
        f"  < {px_th}px: {below.sum().item()} ({below.float().mean().item()*100:.2f}%)"
    )

In [None]:
# Visualize reprojection errors
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, cdf_ax = axes

valid_errors = reproj_error[valid].nan_to_num(nan=float("inf")).cpu().numpy()

# (value, colour, legend label) of the vertical threshold markers.
px_threshold_lines = (
    (1, "r", "1px threshold"),
    (3, "orange", "3px threshold"),
    (5, "y", "5px threshold"),
)

# Histogram of reprojection errors
hist_ax.hist(valid_errors, bins=100, log=True, edgecolor="black", alpha=0.7)
for th, color, label in px_threshold_lines:
    hist_ax.axvline(th, color=color, linestyle="--", label=label)
hist_ax.set_xlabel("Reprojection Error (px)")
hist_ax.set_ylabel("Count (log scale)")
hist_ax.set_title("Distribution of Reprojection Errors (valid matches)")
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# CDF of reprojection errors
sorted_err = np.sort(valid_errors)
cumulative = np.arange(1, len(sorted_err) + 1) / len(sorted_err)
cdf_ax.plot(sorted_err, cumulative, linewidth=2)
for th, color, label in px_threshold_lines:
    cdf_ax.axvline(th, color=color, linestyle="--", label=label)
cdf_ax.set_xlabel("Reprojection Error (px)")
cdf_ax.set_ylabel("Cumulative Fraction")
cdf_ax.set_title("CDF of Reprojection Errors")
cdf_ax.legend()
cdf_ax.grid(True, alpha=0.3)
cdf_ax.set_xscale("log")

plt.tight_layout()
plt.show()

In [None]:
# Run relative pose evaluation and debug
eval_conf = {"estimator": "opencv", "ransac_th": 1.0}

pose_results = eval_relative_pose_robust(data, pred, eval_conf)
print("Relative Pose Results:")
for k, v in pose_results.items():
    print(f"  {k}: {v}")

# Check T_0to1 transform
T_0to1 = data["T_0to1"]
t_norm = torch.norm(T_0to1.t).item()
print("\nT_0to1 transform:")
print(f"  Shape: {T_0to1.shape}")
print(f"  Translation norm: {t_norm:.3f}")
print(f"  Rotation matrix:\n{T_0to1.R.numpy()}")

# Check camera calibration matrices
K0 = data["view0"]["camera"].calibration_matrix()
K1 = data["view1"]["camera"].calibration_matrix()
for cam_idx, calib in enumerate((K0, K1)):
    print(f"\nCamera {cam_idx} K:\n{calib.squeeze().numpy()}")

In [None]:
# Visualize matches colored by epipolar error
fig, ax = plt.subplots(1, 1, figsize=(20, 10))

# Side-by-side canvas: image 0 on the left, image 1 on the right.
h0, w0 = img0.shape[:2]
h1, w1 = img1.shape[:2]
img_comb = np.zeros((max(h0, h1), w0 + w1, 3), dtype=img0.dtype)
img_comb[:h0, :w0] = img0
img_comb[:h1, w0:] = img1

ax.imshow(img_comb)

# Color matches by epipolar error
epi_err_np = n_epi_err.cpu().numpy()
norm = plt.Normalize(vmin=epi_err_np.min(), vmax=epi_err_np.max())
cmap = plt.cm.viridis

pts0_np = pts0.cpu().numpy() if torch.is_tensor(pts0) else np.asarray(pts0)
pts1_np = pts1.cpu().numpy() if torch.is_tensor(pts1) else np.asarray(pts1)
# Shift image-1 points into the right half of the canvas OUT OF PLACE.
# `.cpu().numpy()` on a CPU tensor shares memory with it, so the previous
# in-place `pt1_np[0] += w0` silently shifted rows of the global `pts1`,
# which later cells print and pass to the RANSAC pose estimator.
pts1_shifted = pts1_np + np.array([w0, 0], dtype=pts1_np.dtype)

# Sample every 10th match for visualization
for pt0_np, pt1_np, err in zip(pts0_np[::10], pts1_shifted[::10], epi_err_np[::10]):
    color = cmap(norm(err))
    ax.plot(
        [pt0_np[0], pt1_np[0]],
        [pt0_np[1], pt1_np[1]],
        color=color,
        alpha=0.5,
        linewidth=0.5,
    )
    ax.plot(pt0_np[0], pt0_np[1], "o", color=color, markersize=2)
    ax.plot(pt1_np[0], pt1_np[1], "o", color=color, markersize=2)

ax.set_title(
    f"Matches colored by Epipolar Error (showing every 10th, total: {len(pts0)})"
)
ax.axis("off")

# Add colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label("Epipolar Error", rotation=270, labelpad=20)

plt.tight_layout()
plt.show()

In [None]:
# Debug coordinate frames and transforms
def _print_relative_pose(header, T_rel):
    # Translation, its norm, and the first rotation row of a relative pose.
    print(header)
    print(f"  Translation: {T_rel.t.numpy()}")
    print(f"  Translation norm: {torch.norm(T_rel.t).item():.3f}")
    print(f"  Rotation (first row): {T_rel.R[0].numpy()}")


print("=== Coordinate Frame Debug ===")
for cam_idx, T_w2cam in enumerate((T_w2cam0_gt, T_w2cam1_gt)):
    print(f"\nT_w2cam{cam_idx}_gt (world to camera {cam_idx}):")
    print(f"  Translation: {T_w2cam.t.numpy()}")
    print(f"  Rotation (first row): {T_w2cam.R[0].numpy()}")

# Compute T_0to1 manually
T_0to1_manual = T_w2cam1_gt @ T_w2cam0_gt.inv()
_print_relative_pose("\nT_0to1_manual (camera 0 to camera 1):", T_0to1_manual)

# Check what gluefactory computed
T_0to1_gluefactory = data["T_0to1"]
_print_relative_pose("\nT_0to1_gluefactory (from data dict):", T_0to1_gluefactory)

# Check if they match
same_rotation = torch.allclose(T_0to1_manual.R, T_0to1_gluefactory.R, atol=1e-5)
same_translation = torch.allclose(T_0to1_manual.t, T_0to1_gluefactory.t, atol=1e-5)
print(f"\nTransforms match: {same_rotation and same_translation}")

# Check camera coordinate frames
print("\n=== Camera Info ===")
for cam_idx, cam in enumerate((camera0, camera1)):
    print(f"Camera {cam_idx} size: {cam.size.numpy()}")
for cam_idx, cam in enumerate((camera0, camera1)):
    print(f"Camera {cam_idx} K:\n{cam.calibration_matrix().squeeze().numpy()}")

# Check a few matched points
print("\n=== Sample Matched Points ===")
for img_idx, pts in enumerate((pts0, pts1)):
    print(f"First 5 matched points in image {img_idx}:\n{pts[:5].cpu().numpy()}")
for img_idx, image in enumerate((img0, img1)):
    print(f"Image {img_idx} shape: {image.shape}")

In [None]:
# Debug the relative pose estimation - check what RANSAC is estimating
from gluefactory.robust_estimators import load_estimator
from gluefactory.geometry.epipolar import relative_pose_error

estimator_cls = load_estimator("relative_pose", "opencv")
estimator = estimator_cls({"ransac_th": 1.0, "confidence": 0.99})

# The estimator receives the matched points and the un-batched cameras.
data_est = {
    "m_kpts0": pts0,
    "m_kpts1": pts1,
    "camera0": data["view0"]["camera"][0],
    "camera1": data["view1"]["camera"][0],
}
est = estimator(data_est)

print("=== RANSAC Pose Estimation Debug ===")
print(f"Success: {est['success']}")
if not est["success"]:
    print("RANSAC failed!")
else:
    M_est = est["M_0to1"]
    T_gt = data["T_0to1"]
    for header, T_pose in (
        ("\nEstimated pose M_0to1:", M_est),
        ("\nGround truth T_0to1:", T_gt),
    ):
        print(header)
        print(f"  Translation: {T_pose.t.numpy()}")
        print(f"  Translation norm: {torch.norm(T_pose.t).item():.3f}")
        print(f"  Rotation (first row): {T_pose.R[0].numpy()}")

    # Compute errors
    t_err, r_err = relative_pose_error(T_gt, M_est.R, M_est.t)
    t_err_deg, r_err_deg = t_err.item(), r_err.item()
    print("\nPose errors:")
    print(f"  Translation error: {t_err_deg:.3f} deg")
    print(f"  Rotation error: {r_err_deg:.3f} deg")
    print(f"  Max error (rel_pose_error): {max(t_err_deg, r_err_deg):.3f} deg")

    inlier_flags = est["inliers"]
    print(
        f"\nInliers: {inlier_flags.sum().item()} / {len(inlier_flags)} ({inlier_flags.float().mean().item()*100:.1f}%)"
    )

In [None]:
# Check coordinate frame - the poses might be in OCV but gluefactory expects a different convention
print("=== Coordinate Frame Check ===")
print(f"\nT_w2cam0_gt translation: {T_w2cam0_gt.t.numpy()}")
print(f"T_w2cam1_gt translation: {T_w2cam1_gt.t.numpy()}")

# Check if poses are in OCV frame (Z forward, Y down, X right)
# vs FLU frame (X forward, Y left, Z up)
# The camera position in the world is the translation of the inverted
# world-to-camera pose.
print(f"\nCamera 0 position in world (OCV): {T_w2cam0_gt.inv().t.numpy()}")
print(f"Camera 1 position in world (OCV): {T_w2cam1_gt.inv().t.numpy()}")

# The issue: GT translation is mostly in -X, but estimated is mostly in -Z
# This suggests the poses might need coordinate frame conversion
print(f"\n=== Testing Coordinate Frame Conversion ===")
print("If poses are in OCV frame, we might need to convert to FLU or another frame")
print("OCV: Z forward, Y down, X right")
print("FLU: X forward, Y left, Z up")

# Check the relative translation direction
rel_trans = T_w2cam1_gt.inv().t - T_w2cam0_gt.inv().t
print(f"\nRelative translation in world frame: {rel_trans.numpy()}")
# Normalise with torch and print as a NumPy array, the same way the T_0to1
# direction is printed below (this used np.linalg.norm on a tensor and
# printed a tensor repr).
print(f"Direction (normalized): {(rel_trans / torch.norm(rel_trans)).numpy()}")

# Compare with T_0to1 translation
print(f"T_0to1 translation (camera 0 to 1): {T_0to1.t.numpy()}")
print(
    f"T_0to1 translation direction (normalized): {(T_0to1.t / torch.norm(T_0to1.t)).numpy()}"
)

# NOTE: the RANSAC-estimated translation is printed by the RANSAC debug cell
# (M_est.t); compare it against the directions printed here.

In [None]:
# Concise 3D visualization using Plotly
def visualize_3d_scene(xyz_w, T_w2cam0_gt, T_w2cam1_gt, T_w2cam1_est=None):
    """Plot the 3D points together with the camera positions.

    At most 1000 points, sampled at random without replacement, are drawn.
    Each pose is left-multiplied by ``FLU_T_OCV`` and the translation of its
    inverse is plotted as the camera position.

    Args:
        xyz_w: ``[N, 3]`` array of 3D points.
        T_w2cam0_gt: World-to-camera pose of camera 0 (OCV convention).
        T_w2cam1_gt: World-to-camera pose of camera 1 (OCV convention).
        T_w2cam1_est: Optional estimated pose of camera 1; skipped when None.
    """
    n_points = min(1000, len(xyz_w))
    chosen = np.random.choice(len(xyz_w), n_points, replace=False)
    cloud = xyz_w[chosen]

    fig = go.Figure()
    fig.add_trace(
        go.Scatter3d(
            x=cloud[:, 0],
            y=cloud[:, 1],
            z=cloud[:, 2],
            mode="markers",
            marker={"size": 2, "color": "blue", "opacity": 0.3},
            name="3D Points",
        )
    )

    # Convert poses to FLU for visualization
    cameras = (
        ("Camera 0 (GT)", "green", T_w2cam0_gt),
        ("Camera 1 (GT)", "red", T_w2cam1_gt),
        ("Camera 1 (Est)", "orange", T_w2cam1_est),
    )
    for name, color, T_ocv in cameras:
        if T_ocv is None:
            continue
        T_flu = Pose.from_4x4mat(torch.from_numpy(FLU_T_OCV).float()) @ T_ocv
        position = T_flu.inv().t.numpy()
        fig.add_trace(
            go.Scatter3d(
                x=[position[0]],
                y=[position[1]],
                z=[position[2]],
                mode="markers",
                marker={"size": 10, "color": color, "symbol": "diamond"},
                name=name,
            )
        )

    fig.update_layout(
        title="3D Scene with Camera Poses (FLU)",
        scene={
            "xaxis_title": "X (m)",
            "yaxis_title": "Y (m)",
            "zaxis_title": "Z (m)",
            "aspectmode": "data",
        },
        width=600,
        height=400,
    )
    fig.show()


# Ground-truth-only view: no estimated pose is passed at this point.
visualize_3d_scene(xyz_w, T_w2cam0_gt, T_w2cam1_gt)

In [None]:
def run_pnp_absolute_localization(xyz_w, matched_kp1, valid0, camera1, T_w2cam_prev):
    """Estimate the pose of camera 1 with PnP, seeded by the previous pose.

    Args:
        xyz_w: 3D points in world frame [N, 3] (FLU frame for lupnt PnP)
        matched_kp1: Matched keypoints in image 1; indexed with ``valid0``
            so that it lines up with ``xyz_w``.
        valid0: Valid depth mask applied to ``matched_kp1``.
        camera1: Camera 1 object (provides the calibration matrix).
        T_w2cam_prev: Previous camera pose (T_w2cam0) in OCV frame; converted
            to FLU and used as the initial guess.

    Returns:
        ``(T_w2cam1_est_ocv, inliers, xyz_w)`` on success, where the pose is
        converted back to OCV and ``inliers`` is the solver's inlier output
        as a NumPy array; ``(None, None, None)`` when the solver fails.
    """

    def _pose_from_matrix(mat4):
        return Pose.from_4x4mat(torch.from_numpy(mat4).float())

    K1 = camera1.calibration_matrix().numpy()

    # Convert pose from OCV to FLU for PnP solver (lupnt uses FLU)
    T_w2cam_prev_flu = _pose_from_matrix(FLU_T_OCV) @ T_w2cam_prev
    tgt_T_src_guess = pnt.make_transform(*T_w2cam_prev_flu.numpy())

    solver_conf = {"threshold": 1.0, "confidence": 0.99, "max_iterations": 10000}
    pnp_solver = pnt.PnpSolver(solver_conf)
    pnp_result = pnp_solver.solve(xyz_w, matched_kp1[valid0], K1, tgt_T_src_guess)

    if not pnp_result.success:
        return None, None, None

    # PnP result is in FLU, convert back to OCV for comparison with GT (which is in OCV)
    # Cast to float32 first to avoid a dtype mismatch.
    T_w2cam1_est_flu = pnp_result.tgt_T_src.astype(np.float32)
    T_w2cam1_est_ocv = _pose_from_matrix(OCV_T_FLU) @ _pose_from_matrix(
        T_w2cam1_est_flu
    )
    return T_w2cam1_est_ocv, np.array(pnp_result.inliers), xyz_w


# Use previous camera pose (T_w2cam0_gt) as initial guess for camera 1 pose estimation
# Note: T_w2cam0_gt is in OCV frame (for gluefactory), xyz_w is in FLU frame (for PnP)
T_w2cam1_est, inliers, points_3d_world_flu = run_pnp_absolute_localization(
    xyz_w, matched_kp1, valid0, camera1, T_w2cam0_gt
)

if T_w2cam1_est is None:
    print("PnP failed")
else:
    print(f"PnP succeeded with {len(inliers)} inliers")

    # Both poses are in OCV frame here.
    # Translation error: distance between camera positions in the world
    # frame (not between the T_w2cam translation components).
    cam_pos_est = T_w2cam1_est.inv().t.numpy()
    cam_pos_gt = T_w2cam1_gt.inv().t.numpy()
    t_error = np.linalg.norm(cam_pos_est - cam_pos_gt)

    # Rotation error: angle of the relative rotation, from its trace.
    R_est = T_w2cam1_est.R
    R_diff = R_est.T @ T_w2cam1_gt.R
    angle = np.clip((np.trace(R_diff) - 1) / 2, -1, 1)
    r_error = np.rad2deg(np.arccos(angle))

    print(
        "\n".join(
            [
                f"GT camera 1 position: {cam_pos_gt}",
                f"Est camera 1 position: {cam_pos_est}",
                f"Translation error: {t_error:.3f} m",
                f"Rotation error: {r_error:.3f} deg",
            ]
        )
    )

    # Visualize (poses will be converted to FLU inside the function)
    visualize_3d_scene(xyz_w, T_w2cam0_gt, T_w2cam1_gt, T_w2cam1_est)

In [None]:
# Visualize reprojection errors
def visualize_reprojection_errors(
    img1,
    points_3d_world_flu,
    matched_kp1_valid,
    T_w2cam1_gt,
    T_w2cam1_est,
    camera1,
    inliers,
):
    """Compare reprojections under the ground-truth and the estimated pose.

    The 3D points are projected into image 1 with each pose. Two panels show
    detected versus projected keypoints (residual lines for the first 100
    points), followed by a histogram of both reprojection-error sets.

    Args:
        img1: Image shown in both panels.
        points_3d_world_flu: ``[N, 3]`` world points.
        matched_kp1_valid: ``[N, 2]`` detected keypoints in image 1.
        T_w2cam1_gt: Ground-truth world-to-camera pose.
        T_w2cam1_est: Estimated world-to-camera pose.
        camera1: Camera providing the calibration matrix.
        inliers: Array used to index a boolean mask of length ``N``; residual
            lines of marked points are green, the others red.
    """
    # Calibration matrix as a float64 NumPy array, without any batch axis.
    K_raw = camera1.calibration_matrix()
    if K_raw.ndim > 2:
        K_raw = K_raw[0]
    if isinstance(K_raw, torch.Tensor):
        K_raw = K_raw.cpu().numpy()
    K1 = np.array(K_raw, dtype=np.float64)

    def _project(points_3d, T_w2cam, K):
        # World -> camera frame, then pinhole projection.
        in_cam = (points_3d @ T_w2cam.R.numpy().T) + T_w2cam.t.numpy()
        x_norm = in_cam[:, 0] / in_cam[:, 2]
        y_norm = in_cam[:, 1] / in_cam[:, 2]
        u = K[0, 0] * x_norm + K[0, 2]
        v = K[1, 1] * y_norm + K[1, 2]
        return np.stack([u, v], axis=1)

    proj_gt = _project(points_3d_world_flu, T_w2cam1_gt, K1)
    proj_est = _project(points_3d_world_flu, T_w2cam1_est, K1)

    # Compute reprojection errors
    errors_gt = np.linalg.norm(proj_gt - matched_kp1_valid, axis=1)
    errors_est = np.linalg.norm(proj_est - matched_kp1_valid, axis=1)

    n_lines = min(100, len(matched_kp1_valid))

    def _draw_panel(ax, proj, proj_color, proj_label, line_style_for, title):
        # Detected vs. projected keypoints, plus residual lines.
        ax.imshow(img1)
        ax.scatter(
            matched_kp1_valid[:, 0],
            matched_kp1_valid[:, 1],
            c="green",
            s=10,
            alpha=0.5,
            label="Detected",
        )
        ax.scatter(
            proj[:, 0], proj[:, 1], c=proj_color, s=10, alpha=0.5, label=proj_label
        )
        for i in range(n_lines):
            ax.plot(
                [matched_kp1_valid[i, 0], proj[i, 0]],
                [matched_kp1_valid[i, 1], proj[i, 1]],
                line_style_for(i),
                alpha=0.3,
                linewidth=0.5,
            )
        ax.set_title(title)
        ax.legend()
        ax.axis("off")

    fig, axes = plt.subplots(1, 2, figsize=(20, 10))

    # GT reprojections
    _draw_panel(
        axes[0],
        proj_gt,
        "red",
        "GT Projected",
        lambda i: "r-",
        f"GT Reprojections (mean error: {errors_gt.mean():.2f} px)",
    )

    # Estimated reprojections
    inlier_mask = np.zeros(len(matched_kp1_valid), dtype=bool)
    inlier_mask[inliers.flatten()] = True
    _draw_panel(
        axes[1],
        proj_est,
        "blue",
        "Est Projected",
        lambda i: "g-" if inlier_mask[i] else "r-",
        f"Est Reprojections (mean error: {errors_est.mean():.2f} px)",
    )

    plt.tight_layout()
    plt.show()

    # Error histogram
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    for errors, label, color in ((errors_gt, "GT", "red"), (errors_est, "Est", "blue")):
        ax.hist(errors, bins=50, alpha=0.5, label=label, color=color)
    ax.set_xlabel("Reprojection Error (px)")
    ax.set_ylabel("Frequency")
    ax.set_title("Reprojection Error Distribution")
    ax.legend()
    plt.tight_layout()
    plt.show()


# Only plot reprojections when PnP produced a pose (T_w2cam1_est is None on failure).
if T_w2cam1_est is not None:
    visualize_reprojection_errors(
        img1,
        xyz_w,
        # Same valid-depth subset that was used to build xyz_w.
        matched_kp1[valid0],
        T_w2cam1_gt,
        T_w2cam1_est,
        camera1,
        inliers,
    )