In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

# Everything in this notebook runs on the GPU.
device = "cuda"

# Choose the reduced-precision dtype used for autocast: bfloat16 needs an
# Ampere-class GPU (Compute Capability 8.0+); older cards fall back to float16.
if torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16

# Build the model from the pretrained checkpoint. The first run downloads the
# weights automatically, which may take a while.
model = VGGT.from_pretrained("facebook/VGGT-1B")
model = model.to(device)

In [3]:
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
import os
import json
import numpy as np

# Input views. Their order defines the frame indices used by the cells below.
image_names = [
    "data/resized/images/back_right_3_rgb.png",
    "data/resized/images/front_left_3_rgb.png",
    "data/resized/images/front_right_3_rgb.png",
    "data/resized/images/robot_3_rgb.png",
]
# Load the views onto the GPU and prepend the batch dimension straight away.
images = load_and_preprocess_images(image_names).to(device)[None]

with torch.no_grad():
    # Only the aggregator runs under mixed precision; the heads below do not.
    with torch.cuda.amp.autocast(dtype=dtype):
        aggregated_tokens_list, ps_idx = model.aggregator(images)

    # Camera branch: take the last pose encoding and convert it to
    # extrinsic / intrinsic matrices for the image height and width.
    pose_enc = model.camera_head(aggregated_tokens_list)[-1]
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

    # Depth branch and point-map branch, each returning a map and a confidence.
    depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)
    point_map, point_conf = model.point_head(aggregated_tokens_list, images, ps_idx)

    # Alternative 3D points: unproject the depth maps through the predicted
    # cameras, which usually leads to more accurate 3D points than the
    # point map branch.
    point_map_by_unprojection = unproject_depth_map_to_point_map(
        depth_map.squeeze(0),
        extrinsic.squeeze(0),
        intrinsic.squeeze(0),
    )

In [31]:
import numpy as np
import json
import torch
import torch.nn.functional as F

def _load_points(path):
    """Read a JSON annotation file into a NumPy array, closing the file afterwards."""
    with open(path) as fh:
        return np.array(json.load(fh))


# One query point per view, each kept as a 2-D array with a single row.
# The first file is used as-is; for the others one entry is selected and the
# leading axis is restored with [None].
points_list = [
    _load_points('data/resized/annotations/back_right_3_rgb_points.json'),
    _load_points('data/resized/annotations/front_left_random_points.json')[3][None],
    _load_points('data/resized/annotations/front_right_random_points.json')[1][None],
    _load_points('data/resized/annotations/robot_3_random_points.json')[5][None],
]
print(points_list)

# Per view: (track_list, vis_score, track_feats, query_track_feat, conf_e).
tracks_results = []

with torch.no_grad():
    # Entry i of points_list is queried against view i (query_img_idx=i), so the
    # list order must match the order of the images passed to the model.
    for i, view_points in enumerate(points_list):
        query_points = torch.as_tensor(view_points, dtype=torch.float32, device=device)
        track_list, vis_score, track_feats, query_track_feat, conf_e = model.track_head(
            aggregated_tokens_list,
            images,
            ps_idx,
            query_points=query_points[None],  # add batch dimension
            return_feat=True,
            query_img_idx=i,
        )
        tracks_results.append((track_list, vis_score, track_feats, query_track_feat, conf_e))


[array([[297, 262]]), array([[412, 152]]), array([[20, 96]]), array([[300, 137]])]


In [32]:
# Collect the query-point feature (4th tuple entry) of every view, dropping its
# two leading singleton axes, and stack them into one row per view.
features = torch.stack(
    [query_feat.squeeze(0).squeeze(0) for _, _, _, query_feat, _ in tracks_results]
)

# Unit-length rows turn the Gram matrix below into pairwise cosine similarity.
features_normalized = F.normalize(features, p=2, dim=1)
similarity_matrix = torch.matmul(features_normalized, features_normalized.t())
print(similarity_matrix)

tensor([[ 1.0000,  0.0433, -0.0639,  0.7642],
        [ 0.0433,  1.0000,  0.5397,  0.0698],
        [-0.0639,  0.5397,  1.0000, -0.0658],
        [ 0.7642,  0.0698, -0.0658,  1.0000]], device='cuda:0')


In [25]:
import json
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F

def get_similarity_matrix(points_list, images, model, autocast_dtype=None):
    """Return pairwise cosine similarities between per-frame query-point features.

    For every frame ``i`` the tracker head is queried at ``points_list[i]`` with
    ``query_img_idx=i``. The query feature it returns is reduced to one vector
    per frame, L2-normalised, and compared against the vectors of all frames.

    Args:
        points_list: Sequence with one array-like of query points per frame, in
            the same order as ``images``. Each entry must hold a single point so
            that its feature squeezes to a 1-D vector (see Raises).
        images: Preprocessed image tensor without a batch axis; the batch axis
            is added here. Query points are created on ``images.device``.
        model: Object exposing ``aggregator(images)`` and ``track_head(...)``
            as called below.
        autocast_dtype: dtype used for autocast around the aggregator. Defaults
            to the notebook-level ``dtype`` variable when omitted.

    Returns:
        Tensor of shape ``(len(points_list), len(points_list))`` holding the
        cosine similarity between every pair of frames.

    Raises:
        ValueError: if the stacked query features are not 2-D, e.g. because a
            frame supplied more than one query point.
    """
    if autocast_dtype is None:
        # Fall back to the precision chosen once at notebook level.
        autocast_dtype = dtype

    with torch.no_grad():
        batched_images = images[None]  # add batch dimension

        # torch.autocast("cuda", ...) is the non-deprecated spelling of
        # torch.cuda.amp.autocast(...); only the aggregator runs under it.
        with torch.autocast(device_type="cuda", dtype=autocast_dtype):
            aggregated_tokens_list, ps_idx = model.aggregator(batched_images)

        query_features = []
        for frame_idx, frame_points in enumerate(points_list):
            query_points = torch.as_tensor(
                frame_points, dtype=torch.float32, device=batched_images.device
            )
            # Only the query feature is needed; dropping the other outputs
            # avoids keeping them alive for the whole loop.
            _, _, _, query_track_feat, _ = model.track_head(
                aggregated_tokens_list,
                batched_images,
                ps_idx,
                query_points=query_points[None],
                return_feat=True,
                query_img_idx=frame_idx,
            )
            query_features.append(query_track_feat.squeeze(0).squeeze(0))

        features = torch.stack(query_features)
        if features.dim() != 2:
            # With more than one point per frame the normalisation below would
            # silently run along the wrong axis.
            raise ValueError(
                "expected one feature vector per frame, got stacked features of shape "
                f"{tuple(features.shape)}; pass exactly one query point per frame"
            )

        features_normalized = F.normalize(features, p=2, dim=1)
        return features_normalized @ features_normalized.T

data_dir = "data/current_frames/robot"
base_file_names = ["frame_65", "frame_75", "frame_85", "frame_95", "frame_105", "frame_115", "frame_125"]

# Only the first five frames (frame_65 .. frame_105) are compared in this cell.
frame_names = base_file_names[:5]

# Image path and annotation path are both derived from the frame name, so the
# two lists cannot drift out of sync.
image_names = [f"{data_dir}/{frame_name}.png" for frame_name in frame_names]

points = []
for frame_name in frame_names:
    # Context manager so the annotation file is closed after reading.
    with open(f"{data_dir}/{frame_name}_points.json") as points_file:
        frame_points = np.array(json.load(points_file))
    # Keep only the first annotated entry and restore the leading axis.
    points.append(frame_points[0][None])

images = load_and_preprocess_images(image_names).to(device)
similarity_matrix = get_similarity_matrix(points, images, model)

print(similarity_matrix.shape)
print(similarity_matrix)

# Mean of the off-diagonal entries, i.e. the average similarity between
# distinct frames (the diagonal is each frame compared with itself).
num_frames = similarity_matrix.shape[0]
off_diagonal_sum = similarity_matrix.sum() - similarity_matrix.diagonal().sum()
print(off_diagonal_sum / (num_frames * (num_frames - 1)))


torch.Size([5, 5])
tensor([[1.0000, 0.8986, 0.5071, 0.4620, 0.7093],
        [0.8986, 1.0000, 0.5422, 0.4628, 0.8168],
        [0.5071, 0.5422, 1.0000, 0.7966, 0.7590],
        [0.4620, 0.4628, 0.7966, 1.0000, 0.7010],
        [0.7093, 0.8168, 0.7590, 0.7010, 1.0000]], device='cuda:0')
tensor(0.6656, device='cuda:0')


In [9]:
# Display the current contents of `points` for inspection.
# NOTE(review): the saved output below lists seven arrays of five points each,
# whereas the In [25] cell builds five single-point arrays; this output looks
# stale (execution count 9 precedes 25) — re-run after a kernel restart to confirm.
points

[array([[249,  34],
        [252,  78],
        [181,  77],
        [179,  33],
        [233, 203]]),
 array([[258,  77],
        [281, 108],
        [224, 144],
        [200, 109],
        [307, 217]]),
 array([[225, 152],
        [260, 139],
        [280, 202],
        [244, 214],
        [388, 162]]),
 array([[260, 198],
        [286, 167],
        [333, 211],
        [308, 240],
        [141, 208]]),
 array([[262,  96],
        [276,  57],
        [337,  78],
        [327, 116],
        [207, 185]]),
 array([[174,  84],
        [167,  47],
        [235,  35],
        [242,  77],
        [259, 202]]),
 array([[126, 160],
        [122, 122],
        [187, 115],
        [192, 156],
        [202, 252]])]