In [1]:
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map

import open3d as o3d
import numpy as np
import os
import glob
from matplotlib import pyplot as plt

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+).
# The compute capability is only queried when CUDA is available:
# torch.cuda.get_device_capability() raises on a machine without CUDA, which
# would otherwise make the CPU fallback above unusable.
if device == "cuda" and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16

In [3]:
# Build the VGGT network and restore its weights from a local checkpoint file.
model = VGGT()
_URL = "./model.pt"  # local checkpoint path (a file path, despite the variable name)
# map_location="cpu": the model is still on the CPU at this point, and this also
# lets a checkpoint that was saved from a GPU load on a machine without CUDA.
# weights_only=True: the file is consumed as a state dict, so restrict unpickling
# to tensors and plain containers instead of arbitrary pickled objects.
model.load_state_dict(torch.load(_URL, map_location="cpu", weights_only=True))
model.eval()  # switch to evaluation mode before running inference
model = model.to(device)

  model.load_state_dict(torch.load(_URL))


In [61]:
# Folder containing the input views (*.png) to reconstruct.
path = "./test_model/black_and_mx"

# glob returns files in arbitrary order; sort so the frame order is reproducible.
image_names = sorted(glob.glob(os.path.join(path, "*.png")))
if not image_names:
    # Fail early and name the folder instead of erroring somewhere downstream.
    raise FileNotFoundError(f"No .png images found in {path!r}")
images = load_and_preprocess_images(image_names).to(device)
images = images[None]  # add batch dimension

with torch.no_grad():
    # torch.cuda.amp.autocast(...) is deprecated; torch.amp.autocast("cuda", ...)
    # is the drop-in replacement. Only the aggregator runs under mixed precision.
    with torch.amp.autocast("cuda", dtype=dtype):
        aggregated_tokens_list, ps_idx = model.aggregator(images)

    # Predict Cameras
    pose_enc = model.camera_head(aggregated_tokens_list)[-1]
    # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

    # Predict Depth Maps
    depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

    # Predict Point Maps
    point_map, point_conf = model.point_head(aggregated_tokens_list, images, ps_idx)

    # Construct 3D Points from Depth Maps and Cameras
    # which usually leads to more accurate 3D points than point map branch
    point_map_by_unprojection = unproject_depth_map_to_point_map(
        depth_map.squeeze(0),
        extrinsic.squeeze(0),
        intrinsic.squeeze(0),
    )

  with torch.cuda.amp.autocast(dtype=dtype):


#### 下面保存一些TSDF fusion需要的数据

In [5]:
# Median over every element of depth_conf, displayed as the cell result.
depth_conf_values = depth_conf.cpu().numpy()
np.median(depth_conf_values)

2.3328097

In [6]:
# Zero out (in place) every depth value whose confidence is below the cutoff.
# 2.34 sits just above the median confidence (2.3328097) shown by the previous cell.
# NOTE(review): the cutoff is hard-coded for this particular image set; it will
# need to be re-derived if `path` points at different images.
depth_select = torch.lt(depth_conf, 2.34)
depth_map[depth_select] = 0

In [7]:
# Write the first batch element of each tensor to its own .npy file in the
# current working directory (inputs for the TSDF fusion step).
tsdf_outputs = {
    "depth_map.npy": depth_map,
    # depth_conf export is currently disabled:
    # np.save("depth_conf.npy",depth_conf[0].cpu().numpy())
    "extrinsic.npy": extrinsic,
    "intrinsic.npy": intrinsic,
    "images.npy": images,
}
for filename, tensor in tsdf_outputs.items():
    # Index 0 drops the batch dimension; move to host memory before converting.
    np.save(filename, tensor[0].cpu().numpy())

In [9]:
# Depth confidence of the first batch element, flattened to one value per pixel.
conf_all = depth_conf[0].detach().cpu().numpy().reshape(-1)
# Keep only the pixels whose confidence is strictly above the median.
threshold = np.median(conf_all)
conf_mask = conf_all > threshold

# One row per pixel: xyz from the unprojected depth maps, and the matching colour.
# permute(0, 2, 3, 1) moves axis 1 to the end so the trailing axis can be
# reshaped into rows of 3 colour values.
point_all = point_map_by_unprojection.reshape(-1, 3)[conf_mask]
color_all = images[0].detach().cpu().permute(0, 2, 3, 1).reshape(-1, 3).numpy()[conf_mask]

# Assemble the coloured point cloud.
pcd = o3d.geometry.PointCloud()
pcd.colors = o3d.utility.Vector3dVector(color_all)
pcd.points = o3d.utility.Vector3dVector(point_all)

# The return value of the write call is left as the cell result.
o3d.io.write_point_cloud("pointcloud.ply", pcd)

True