In [None]:
import argparse
from pathlib import Path
import os
import sys

import torch

# Add "<current working directory>/dust3r" to the module search path before the
# dust3r imports that follow. The path is resolved from os.getcwd(), so it depends
# on where the kernel was started -- presumably the directory that contains the
# dust3r repository checkout (TODO confirm).
sys.path.append(os.path.join(os.getcwd(), "dust3r"))
from dust3r.inference import inference
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode


def save_dust3r_outs(focals, poses, pts3d, savepath):
    """Save the DUSt3R outputs obtained after global alignment as a dictionary.

    Every tensor is detached from the autograd graph and copied to the CPU
    before it is written to disk.

    Args:
        focals (torch.Tensor): Optimized focal lengths of the N cameras [N,1].
        poses (torch.Tensor): Optimized camera poses [N,4,4].
        pts3d (list of torch.Tensor): Point clouds as seen from each camera.
        savepath (str or os.PathLike): Destination file. Missing parent
            directories are created. A bare file name (no directory part) is
            written relative to the current working directory.

    Returns:
        None. Writes a file holding a dict with the keys "focals", "poses"
        and "pts3d"; it can be loaded back with torch.load().
    """
    out_dict = {
        "focals": focals.detach().cpu(),
        "poses": poses.detach().cpu(),
        "pts3d": [pts.detach().cpu() for pts in pts3d],
    }
    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(savepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(out_dict, savepath)
    print(f"Saved Dust3r outputs to {savepath}")

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (much more slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Batch size handed to dust3r's inference() call.
batch_size = 1
# Settings handed to scene.compute_global_alignment(): learning-rate schedule,
# learning rate and number of iterations.
schedule = "cosine"
lr = 0.01
niter = 300

# Checkpoint passed to from_pretrained(); point this at your local checkpoint file.
model_name = "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth"
model = AsymmetricCroCo3DStereo.from_pretrained(model_name).to(device)

# load_images can take a list of images or a directory, e.g.
# images = load_images(["croco/assets/Chateau1.png", "croco/assets/Chateau2.png"], size=512)
imdir = Path("data") / "barrelddt1"
# Results are written to results/<input directory name>-reconstr (created if missing).
outdir = Path("results") / f"{imdir.name}-reconstr"
outdir.mkdir(exist_ok=True, parents=True)
images = load_images(str(imdir), size=512)

In [None]:
# Pair up the loaded images (complete scene graph, symmetrized) and run the
# network on every pair.
pairs = make_pairs(
    images,
    scene_graph="complete",
    prefilter=None,
    symmetrize=True,
)
output = inference(pairs, model, device, batch_size=batch_size)

# At this stage `output` holds the raw dust3r predictions.
view1 = output["view1"]
pred1 = output["pred1"]
view2 = output["view2"]
pred2 = output["pred2"]
# view1, pred1, view2 and pred2 are dicts of lists -- presumably one entry per
# image pair (the upstream two-image example yields lists of length 2).
#  -> because we symmetrize, both (im1, im2) and (im2, im1) pairs are present
# Each view contains:
#   - an integer image identifier: view1["idx"] and view2["idx"]
#   - the image: view1["img"] and view2["img"]
#   - the image shape: view1["true_shape"] and view2["true_shape"]
#   - an instance string output by the dataloader: view1["instance"] and view2["instance"]
# pred1 and pred2 contain the confidence values: pred1["conf"] and pred2["conf"]
# pred1 contains 3D points for view1["img"] in view1["img"] space: pred1["pts3d"]
# pred2 contains 3D points for view2["img"] in view1["img"] space: pred2["pts3d_in_other_view"]

# Next we'll use the global_aligner to align the predictions.
# Depending on your task, the raw output may be enough and this step can be skipped.
# With only two input images you could use GlobalAlignerMode.PairViewer instead: it
# just converts the output, and compute_global_alignment does not need to be run.
scene = global_aligner(
    output,
    device=device,
    mode=GlobalAlignerMode.PointCloudOptimizer,
)
loss = scene.compute_global_alignment(
    init="mst",
    niter=niter,
    schedule=schedule,
    lr=lr,
)

# Retrieve the useful values from the aligned scene and save them to disk.
imgs = scene.imgs
focals = scene.get_focals()
poses = scene.get_im_poses()
pts3d = scene.get_pts3d()
save_dust3r_outs(
    focals,
    poses,
    pts3d,
    savepath=outdir / "dust3r_out.pth",
)
confidence_masks = scene.get_masks()
