In [1]:
import os
import numpy as np
import torch

import open3d as o3d
from plyfile import PlyData, PlyElement

import MinkowskiEngine as ME

import matplotlib.pyplot as plt
from tqdm import tqdm

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


## Load the 3D Distilled Model

In [2]:
class ModelConfig:
    """Simple config class to recreate what's done in the OpenScene code.

    Holds the two settings that are handed to ``get_model`` in this notebook.

    Attributes:
        feature_2d_extractor: name of the 2D feature space (this notebook
            passes 'openseg' or 'lseg').
        arch_3d: name of the 3D backbone architecture (this notebook passes
            'MinkUNet18A').
    """

    def __init__(self, feature_2d_extractor, arch_3d):
        self.feature_2d_extractor = feature_2d_extractor
        self.arch_3d = arch_3d

In [3]:
# Pick exactly one distilled checkpoint; the alternatives are kept for reference.
# checkpoint_path = '/home/rsl_admin/openscene/checkpoints/matterport_openseg.pth'
# checkpoint_path = '/home/rsl_admin/openscene/checkpoints/scannet_lseg.pth'
checkpoint_path = '/home/rsl_admin/openscene/checkpoints/scannet_openseg.pth'

# Every storage in the checkpoint is moved to the GPU while loading.
checkpoint = torch.load(
    checkpoint_path,
    map_location=lambda tensor_storage, _location: tensor_storage.cuda(),
)

In [4]:
# Derive the dataset and embedding-space tags from the checkpoint file name.
# Candidates are tested in the listed order; the first substring match wins.
dataset = next(
    (tag for tag in ('matterport', 'scannet') if tag in checkpoint_path),
    None,
)
if dataset is None:
    raise NotImplementedError

embedding_space = next(
    (tag for tag in ('openseg', 'lseg') if tag in checkpoint_path),
    None,
)
if embedding_space is None:
    raise NotImplementedError

In [5]:
from run.distill import get_model

# Build the 3D network from a minimal config, then restore the distilled weights.
model_cfg = ModelConfig(arch_3d='MinkUNet18A', feature_2d_extractor=embedding_space)
model = get_model(model_cfg)

# strict=True: every key in the checkpoint must match the model exactly.
model.load_state_dict(checkpoint['state_dict'], strict=True)

<All keys matched successfully>

In [6]:
# Place the network's parameters and buffers on the GPU.
model = model.to(device='cuda')

## Load CLIP text encoder model

In [7]:
import clip

# The CLIP variant has to match the embedding space the 3D model was distilled
# into, otherwise text and voxel embeddings are not comparable.
if embedding_space == 'openseg':
    clip_model = 'ViT-L/14@336px'
elif embedding_space == 'lseg':
    clip_model = 'ViT-B/32'
else:
    # Without this branch `clip_model` would be undefined (or stale from an
    # earlier run of the cell) for an unexpected embedding space.
    raise NotImplementedError(
        'no CLIP model configured for embedding space {!r}'.format(embedding_space)
    )

# Only the model is used in this notebook; the second return value is discarded.
clip_pretrained, _ = clip.load(clip_model, device='cuda', jit=False)

## Load the point cloud
Use a single scan for the test (the path below points to the Matterport3D scan `17DRP5sb8fy`).

In [8]:
scan_ply_filepath = "/home/rsl_admin/matterport/data/v1/scans/17DRP5sb8fy/house_segmentations/17DRP5sb8fy/house_segmentations/17DRP5sb8fy.ply"

# Fail early with a clear message instead of continuing with an empty mesh.
if not os.path.isfile(scan_ply_filepath):
    raise FileNotFoundError(scan_ply_filepath)

mesh_ply = o3d.io.read_triangle_mesh(scan_ply_filepath)

# A mesh without vertices means the file could not be parsed as expected.
if not mesh_ply.has_vertices():
    raise ValueError('no vertices could be read from {}'.format(scan_ply_filepath))

In [9]:
# Expose the mesh vertices and their colours as numpy arrays.
locs_np, colors_np = (
    np.asarray(attribute)
    for attribute in (mesh_ply.vertices, mesh_ply.vertex_colors)
)

# One shape per line.
print(locs_np.shape, colors_np.shape, sep='\n')

(1522546, 3)
(1522546, 3)


In [10]:
# Show the loaded mesh in an Open3D viewer as a quick sanity check of the input.
o3d.visualization.draw_geometries([mesh_ply])

### Function to run the 3D distill model on an input point cloud

In [11]:
# version of sparse_quantize in the OS code different than what's in ME
from dataset.voxelization_utils import sparse_quantize

In [24]:
def run_3d_distill_model(
    points,
    model,
    model_voxel_size=0.02,
    output_precision=np.float32,
):
    """
    Run the 3D distilled model on a point cloud and return per-voxel embeddings.

    Expects a point cloud of N points as a numpy array of shape (N,3).
    Creates a voxel representation of the point cloud with M voxels.

    Args:
        points: numpy array of shape (N,3) with the point coordinates
        model: the 3D network; it is switched to eval mode and called on a
            MinkowskiEngine SparseTensor whose inputs are placed on 'cuda'
        model_voxel_size: edge length of a voxel, in the same unit as `points`
        output_precision: numpy dtype of the returned embeddings

    Returns:
        voxel_embeddings of shape (M,E), where E is the embedding dimension
        voxel_points of shape (M,3), center points of the voxels
        inverse_map(N,), maps voxel representation back to points
            (as returned by sparse_quantize -- NOTE(review): exact semantics
            are defined by the OpenScene helper, confirm there)
    """

    # voxelize the point cloud: integer voxel index of every point
    coords_np = np.floor(points / model_voxel_size)

    unique_map, inverse_map = sparse_quantize(coords_np, return_index=True)
    unique_coords = torch.Tensor(coords_np[unique_map])

    # add batch dimension to the coords
    unique_coords_batched = ME.utils.batched_coordinates([unique_coords])

    # 3D distill model trained with no color input, uses all ones as the feature
    feats = torch.ones(unique_coords.shape[0], 3)

    # move inputs to gpu
    unique_coords_batched = unique_coords_batched.to('cuda')
    feats = feats.to('cuda')

    input_st = ME.SparseTensor(features=feats, coordinates=unique_coords_batched)

    model.eval()
    with torch.no_grad():
        out = model(input_st)

        # normalize embeddings; the epsilon avoids dividing by a zero norm
        out /= out.norm(dim=-1, keepdim=True) + 1e-6

    # honour the requested precision (this was hard-coded to float32 before,
    # which silently ignored the `output_precision` argument)
    voxel_embeddings = out.cpu().numpy().astype(output_precision)

    # A voxel with index i spans [i, i + 1) * voxel_size because the indices
    # come from floor(); its center is therefore (i + 0.5) * voxel_size.
    voxel_points = (unique_coords.cpu().numpy() + 0.5) * model_voxel_size

    # use inverse_map to map embeddings to all points
    return voxel_embeddings, voxel_points, inverse_map

In [25]:
# Embed the whole scan; half precision is requested for the stored embeddings.
voxel_embeddings, voxel_points, inverse_map = run_3d_distill_model(
    points=locs_np,
    model=model,
    output_precision=np.float16,
)

### Voxelize the point cloud

In [11]:
# OpenScene uses 2cm voxelization for ScanNet and Matterport.
# The value is the voxel edge length, in the same unit as the point coordinates
# (presumably metres, given the 2cm note above -- TODO confirm).
VOXEL_SIZE = 0.02

In [12]:
# Voxel index of every point: scale by the voxel size, then round down.
coords_np = np.floor(np.divide(locs_np, VOXEL_SIZE))

In [13]:


# Collapse points that fall into the same voxel.
# NOTE(review): `unique_map` is used below to select one row per voxel and
# `inverse_map` presumably maps every point back to its voxel -- confirm
# against the OpenScene implementation of sparse_quantize.
unique_map, inverse_map = sparse_quantize(coords_np, return_index=True)
# torch.Tensor(...) builds a default-dtype (float) tensor from the selected rows.
unique_coords = torch.Tensor(coords_np[unique_map])

In [14]:
# print(coords_np.shape)
# print(unique_map.shape)
# print(inverse_map.shape)
# print(np.all(coords_np[unique_map][inverse_map] == coords_np))

In [15]:
# MinkowskiEngine expects a leading batch-index column; the scan is a batch of one.
unique_coords_batched = ME.utils.batched_coordinates([unique_coords])

print(unique_coords_batched.size())

torch.Size([990767, 4])


In [16]:
# The 3D distill model was trained without colour input: every voxel gets the
# same constant feature vector of three ones.
feats = torch.ones((len(unique_coords), 3))

In [17]:
# Both model inputs have to live on the GPU.
unique_coords_batched, feats = (
    tensor.to(device='cuda') for tensor in (unique_coords_batched, feats)
)

In [18]:
# Bundle the voxel coordinates and features into the model's sparse input tensor.
input_st = ME.SparseTensor(coordinates=unique_coords_batched, features=feats)

print(input_st)

SparseTensor(
  coordinates=tensor([[  0, -33,   0,   0],
        [  0, -33,   0, 110],
        [  0, -33,   1,   0],
        ...,
        [  0, -33,   1,  -1],
        [  0, -33,  -1,   0],
        [  0, -33,   0,  -1]], device='cuda:0', dtype=torch.int32)
  features=tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        ...,
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')
  coordinate_map_key=coordinate map key:[1, 1, 1]
  coordinate_manager=CoordinateMapManagerGPU_c10(
	[1, 1, 1, ]:	CoordinateMapGPU:990767x4
	algorithm=MinkowskiAlgorithm.DEFAULT
  )
  spatial dimension=3)


### Forward the model

In [19]:
# Evaluation mode and no gradient tracking: this is inference only.
model.eval()
with torch.no_grad():
    out = model(input_st)

    # In-place normalisation of every embedding row; the epsilon guards
    # against a zero norm.
    out.div_(out.norm(dim=-1, keepdim=True) + 1e-6)

In [20]:
# Inspect the values, shape and dtype of the embeddings (one item per line).
print(out, out.shape, out.dtype, sep='\n')

tensor([[ 0.0213,  0.0422, -0.0042,  ...,  0.0202, -0.0261, -0.0507],
        [ 0.0095,  0.0514, -0.0341,  ...,  0.0244, -0.0036, -0.0283],
        [ 0.0244,  0.0422, -0.0029,  ...,  0.0177, -0.0229, -0.0521],
        ...,
        [ 0.0246,  0.0401, -0.0056,  ...,  0.0171, -0.0248, -0.0511],
        [ 0.0206,  0.0453, -0.0058,  ...,  0.0217, -0.0282, -0.0496],
        [ 0.0216,  0.0410, -0.0070,  ...,  0.0164, -0.0256, -0.0502]],
       device='cuda:0')
torch.Size([990767, 768])
torch.float32


In [21]:
# Copy the embeddings back to host memory as a float32 numpy array.
voxel_embeddings_np = out.to('cpu').numpy().astype(np.float32)

## Query the computed embeddings

In [14]:
# get a color map
import matplotlib.cm as cm  # kept so existing references to `cm` keep working

# `matplotlib.cm.get_cmap` is deprecated and removed in newer Matplotlib
# releases; the pyplot accessor returns the same registered colormap.
cmap = plt.get_cmap('jet')

  cmap = cm.get_cmap('jet')


In [26]:
def compute_text_embedding(query_string, encoder):
    """Encode a single text query into a unit-length embedding.

    Args:
        query_string: the text to embed
        encoder: object providing `encode_text` (the CLIP model in this notebook)

    Returns:
        float32 numpy array with the normalised text embedding, one row for
        the single query
    """
    tokens = clip.tokenize([query_string]).to('cuda')
    with torch.no_grad():
        embedding = encoder.encode_text(tokens)
        # scale to unit length along the embedding axis
        embedding = embedding / embedding.norm(dim=-1, keepdim=True)
    return embedding.cpu().numpy().astype(np.float32)

In [27]:
def compute_point_scores(query_string, voxel_embeddings):
    """Score every embedding against a text query, rescaled to [0, 1].

    The query is embedded with the notebook-level `clip_pretrained` model and
    compared to each row of `voxel_embeddings` via a dot product. The scores
    are then min-max normalised over the whole input.

    Args:
        query_string: the text query
        voxel_embeddings: numpy array with one embedding per row

    Returns:
        numpy array with one normalised score per row of `voxel_embeddings`.
        The result stays at the resolution of the input: no per-point
        expansion through an inverse map is applied here.
    """
    query_embedding = compute_text_embedding(query_string, clip_pretrained)

    # similarity of every row with the query
    similarity = voxel_embeddings @ query_embedding.T

    # nothing to normalise for an empty input (min()/max() would raise)
    if similarity.size == 0:
        return similarity

    lowest = similarity.min()
    value_range = similarity.max() - lowest

    # all scores identical: avoid 0/0 -> NaN and return a flat zero score
    if value_range == 0:
        return np.zeros_like(similarity)

    return (similarity - lowest) / value_range

In [28]:
def visualize_single_query(query_string, point_embeddings, points=None):
    """Colour a point cloud by its similarity to a text query and display it.

    Args:
        query_string: the text query
        point_embeddings: numpy array with one embedding per displayed point
        points: positions of the displayed points, one row per embedding.
            Defaults to the notebook-level `voxel_points`, which is what this
            function always used before the parameter existed.
    """
    if points is None:
        points = voxel_points

    scores = compute_point_scores(query_string, point_embeddings)

    # the colormap yields RGBA; flatten to one row per point and drop alpha
    colors = cmap(scores).reshape(-1, 4)[:, :-1]

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    pcd.colors = o3d.utility.Vector3dVector(colors)
    o3d.visualization.draw_geometries([pcd], window_name=query_string)

In [29]:
# Other queries that were tried:
#   'a comfortable place'
#   'a desk in a scene'
visualize_single_query('a place to sit', voxel_embeddings)

### Compute the scores for a list of queries for later visualization

In [31]:
# Plain object names first ...
object_queries = [
    'bed', 'pillow', 'tv', 'guitar', 'musical instrument',
    'bicycle', 'sofa', 'oven', 'desk', 'chair',
]

# ... then every name wrapped in each prompt template, template by template.
prompted_queries = [
    template.format(name)
    for template in ('a photo of a {}', 'a {} in a scene')
    for name in object_queries
]

object_queries.extend(prompted_queries)

In [32]:
# Queries that describe a function rather than naming an object.
abstract_queries = ['a place to sleep', 'a place to cook']

In [33]:
# Room names, followed by the same names wrapped in a photo prompt.
room_queries = ['kitchen', 'bathroom', 'bedroom', 'living room']

prompted_queries = list(map('a photo of a {}'.format, room_queries))

room_queries.extend(prompted_queries)

In [34]:
# Everything that gets precomputed: objects, then abstract queries, then rooms.
precompute_queries = [*object_queries, *abstract_queries, *room_queries]

# To inspect the list, iterate over `precompute_queries` and print each entry.

In [35]:
# Output location: precomputed_queries/<dataset>_<embedding space>_<scan id>.
# The scan id is the file name of the scan up to its first dot.
scan_id = scan_ply_filepath.rsplit('/', 1)[-1].partition('.')[0]
config_dir_name = '_'.join((dataset, embedding_space, scan_id))

save_dir = os.path.join('precomputed_queries', config_dir_name)
os.makedirs(save_dir, exist_ok=True)

In [36]:
# When True, queries that already have a file in `save_dir` are not recomputed.
# NOTE: files written by an earlier version of this cell are reused as-is;
# delete them if they were produced with a different point layout.
skip_exist = True

for q in tqdm(precompute_queries):
    out_path = os.path.join(save_dir, q + '.npy')
    if skip_exist and os.path.exists(out_path):
        continue
    # scores are per voxel: one row per row of `voxel_embeddings`
    scores = compute_point_scores(q, voxel_embeddings)
    # the colormap yields RGBA; flatten to one row per voxel and drop alpha
    color_scores = cmap(scores).reshape(-1,4)[:,:-1]
    np.save(out_path, color_scores)

# Save the voxel positions that belong to the colours above. Both come from the
# same run_3d_distill_model call, so they have one row per voxel. Saving the raw
# mesh vertices (`locs_np`) here would not line up with the per-voxel colours.
np.save(os.path.join(save_dir, 'locs.npy'), voxel_points)

100%|███████████████████████████████████████████████████████████████| 40/40 [00:08<00:00,  4.84it/s]


## Load and visualize a precomputed query

In [34]:
# # visualize a single pre-computed query
# precomputed_query = 'a photo of a living room'

# color_filename = precomputed_query + '.npy'
# assert color_filename in os.listdir(save_dir)

# precomputed_locs_np = np.load(os.path.join(save_dir, 'locs.npy'))
# precomputed_colors_np = np.load(os.path.join(save_dir, color_filename))

# pcd = o3d.geometry.PointCloud()
# pcd.points = o3d.utility.Vector3dVector(precomputed_locs_np)
# pcd.colors = o3d.utility.Vector3dVector(precomputed_colors_np)
# o3d.visualization.draw_geometries(
#     [pcd], 
#     window_name=precomputed_query,
#     width=800,
#     height=800
# )

In [37]:
# visualize all pre-computed queries for the current configuration

precomputed_locs_np = np.load(os.path.join(save_dir, 'locs.npy'))

# sorted() gives a reproducible viewing order
for filename in sorted(os.listdir(save_dir)):
    # only per-query colour files are of interest; skip the positions file and
    # anything that is not a .npy file (np.load would fail on it)
    if filename == 'locs.npy' or not filename.endswith('.npy'):
        continue

    # strip only the extension so that queries containing a '.' stay intact
    precomputed_query = os.path.splitext(filename)[0]
    precomputed_colors_np = np.load(os.path.join(save_dir, filename))

    # colours can only be paired with positions when there is one per point
    if len(precomputed_colors_np) != len(precomputed_locs_np):
        print('skipping {!r}: {} colors for {} points'.format(
            precomputed_query, len(precomputed_colors_np), len(precomputed_locs_np)
        ))
        continue

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(precomputed_locs_np)
    pcd.colors = o3d.utility.Vector3dVector(precomputed_colors_np)
    o3d.visualization.draw_geometries(
        [pcd],
        window_name=precomputed_query,
        width=800,
        height=800
    )