In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

# Everything in this notebook runs on the default CUDA device.
device = "cuda"

# Mixed-precision dtype used for autocast further down.
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+); older GPUs fall back to float16.
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16

# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
# The notebook only performs inference, so the module is explicitly switched to eval mode
# (harmless if from_pretrained already returns the model in eval mode).
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device).eval()

In [3]:
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
import os
import json
import numpy as np

# Input views of the scene. The order of this list defines each view's frame index
# in the `images` tensor handed to the model.
image_names = [
    "data/resized/images/back_right_3_rgb.png",
    "data/resized/images/front_left_3_rgb.png",
    "data/resized/images/front_right_3_rgb.png",
    "data/resized/images/robot_3_rgb.png",
]
images = load_and_preprocess_images(image_names).to(device)
images = images[None]  # add batch dimension

# Alternative input: the first 50 frames extracted from a video.
# img_dir = 'data/extracted_frames/robot/'
# image_names = [os.path.join(img_dir, img_name) for img_name in os.listdir(img_dir)][:50]
# images = load_and_preprocess_images(image_names).to(device)

# Query points for the tracker. Loaded before inference so that a missing
# annotation file fails fast, and through a context manager so the file handle is closed.
with open('data/resized/annotations/robot_3_rgb_points.json') as points_file:
    points = np.array(json.load(points_file))

with torch.no_grad():
    # Only the aggregator runs under autocast; the heads below run outside it.
    # torch.autocast(device_type="cuda", ...) replaces the deprecated torch.cuda.amp.autocast(...).
    with torch.autocast(device_type="cuda", dtype=dtype):
        aggregated_tokens_list, ps_idx = model.aggregator(images)

    # Predict Cameras (the last element of the camera head's output is used)
    pose_enc = model.camera_head(aggregated_tokens_list)[-1]

    # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

    # Predict Depth Maps
    depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

    # Predict Point Maps
    point_map, point_conf = model.point_head(aggregated_tokens_list, images, ps_idx)

    # Construct 3D Points from Depth Maps and Cameras
    # which usually leads to more accurate 3D points than point map branch
    point_map_by_unprojection = unproject_depth_map_to_point_map(
        depth_map.squeeze(0),
        extrinsic.squeeze(0),
        intrinsic.squeeze(0),
    )

    # Predict Tracks
    # NOTE(review): the query points come from the robot_3 annotation file, but no
    # query_img_idx is passed here, so the track head uses its default query frame.
    # Confirm that the default frame is the one these points were annotated on.
    query_points = torch.as_tensor(points, dtype=torch.float32, device=device)
    track_list, vis_score, conf_score = model.track_head(
        aggregated_tokens_list,
        images,
        ps_idx,
        query_points=query_points[None],
    )

In [31]:
import numpy as np
import json
import torch
import torch.nn.functional as F

def _load_query_points(path, row=None):
    """Read a JSON point annotation file into a NumPy array.

    Args:
        path: Path of the JSON file to read.
        row: If None, the whole array is returned. Otherwise only entry ``row``
            is kept, with a leading axis re-added so the result is still a
            batch of points rather than a bare point.

    Returns:
        np.ndarray built from the file contents (optionally restricted to one row).
    """
    # Context manager so the file handle is closed deterministically.
    with open(path) as points_file:
        loaded = np.array(json.load(points_file))
    return loaded if row is None else loaded[row][None]


# One set of query points per view, in the same order as the frames in `images`
# (entry i is queried on frame i below).
# The row indices (3, 1, 5) pick one point out of each "random points" file;
# presumably hand-picked for this experiment -- TODO confirm.
points_list = [
    _load_query_points('data/resized/annotations/back_right_3_rgb_points.json'),
    _load_query_points('data/resized/annotations/front_left_random_points.json', row=3),
    _load_query_points('data/resized/annotations/front_right_random_points.json', row=1),
    _load_query_points('data/resized/annotations/robot_3_random_points.json', row=5),
]
print(points_list)

# tracks_results[i] = (track_list, vis_score, track_feats, query_track_feat, conf_e)
# for the query points of view i.
tracks_results = []

with torch.no_grad():
    for i, view_points in enumerate(points_list):
        query_points = torch.as_tensor(view_points, dtype=torch.float32, device=device)
        # return_feat=True makes the track head return five values instead of three,
        # including the feature of the query point itself; query_img_idx selects
        # which frame the query points belong to.
        track_list, vis_score, track_feats, query_track_feat, conf_e = model.track_head(
            aggregated_tokens_list,
            images,
            ps_idx,
            query_points=query_points[None],
            return_feat=True,
            query_img_idx=i,
        )
        tracks_results.append((track_list, vis_score, track_feats, query_track_feat, conf_e))


[array([[297, 262]]), array([[412, 152]]), array([[20, 96]]), array([[300, 137]])]


In [32]:
# Collect the query-point feature (slot 3 of every result tuple) for each view,
# squeezing away its two leading axes so that each view contributes one tensor.
per_view_query_feats = []
for view_result in tracks_results:
    per_view_query_feats.append(view_result[3].squeeze(0).squeeze(0))
features = torch.stack(per_view_query_feats)

# L2-normalise along dim 1 so that the inner products below are cosine similarities.
features_normalized = F.normalize(features, p=2, dim=1)

# Pairwise similarity between the query features of all views.
similarity_matrix = torch.matmul(features_normalized, features_normalized.T)
print(similarity_matrix)

tensor([[ 1.0000,  0.0433, -0.0639,  0.7642],
        [ 0.0433,  1.0000,  0.5397,  0.0698],
        [-0.0639,  0.5397,  1.0000, -0.0658],
        [ 0.7642,  0.0698, -0.0658,  1.0000]], device='cuda:0')
