In [1]:
import os
import numpy as np
import torch

import open3d as o3d

import MinkowskiEngine as ME

import matplotlib.pyplot as plt
from tqdm import tqdm

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


## Load the 3D Distilled Model

In [2]:
class ModelConfig:
    '''Simple config class to recreate what's done in the openscene code.

    Holds the two settings this notebook passes to ``get_model``.

    Attributes are set directly from the constructor arguments:
        feature_2d_extractor: name of the 2D embedding space (the notebook
            passes 'openseg' or 'lseg').
        arch_3d: name of the 3D backbone architecture (the notebook passes
            'MinkUNet18A').
    '''

    def __init__(self, feature_2d_extractor, arch_3d):
        self.feature_2d_extractor = feature_2d_extractor
        self.arch_3d = arch_3d

In [3]:
# Path to the 3D distilled model checkpoint. The cells below look for the
# substrings 'matterport'/'scannet' and 'openseg'/'lseg' in this path, so the
# filename itself selects the configuration. Uncomment one of the alternatives
# to switch checkpoints.
# checkpoint_path = '/home/rsl_admin/openscene/checkpoints/matterport_openseg.pth'
checkpoint_path = '/home/rsl_admin/openscene/checkpoints/scannet_openseg.pth'

# checkpoint_path = '/home/rsl_admin/openscene/checkpoints/scannet_lseg.pth'

# map_location calls .cuda() on every loaded storage, so a CUDA device is required.
# NOTE(review): torch.load deserializes the file; only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage.cuda())

In [4]:
# Infer the dataset and the 2D embedding space from the checkpoint filename.
# 'matterport' is tested before 'scannet', and 'openseg' before 'lseg'; the first
# substring found in checkpoint_path wins.
if 'matterport' in checkpoint_path:
    dataset = 'matterport'
elif 'scannet' in checkpoint_path:
    dataset = 'scannet'
else:
    raise NotImplementedError(
        f"cannot infer dataset ('matterport' or 'scannet') from checkpoint path: {checkpoint_path!r}"
    )

if 'openseg' in checkpoint_path:
    embedding_space = 'openseg'
elif 'lseg' in checkpoint_path:
    embedding_space = 'lseg'
else:
    raise NotImplementedError(
        f"cannot infer embedding space ('openseg' or 'lseg') from checkpoint path: {checkpoint_path!r}"
    )

In [5]:
# get_model comes from the project's run.distill module
# (presumably the OpenScene repo is on sys.path — TODO confirm).
from run.distill import get_model

# Build the network from a minimal config: the embedding space detected from the
# checkpoint filename plus a fixed 3D architecture name.
model_cfg = ModelConfig(
    feature_2d_extractor=embedding_space, 
    arch_3d='MinkUNet18A',
)
model = get_model(model_cfg)
# strict=True makes loading fail if the checkpoint keys and the model keys differ.
# Left as the last expression so the cell displays the load result.
model.load_state_dict(checkpoint['state_dict'], strict=True)

<All keys matched successfully>

In [6]:
# Move the model to the GPU; run_3d_distill_model sends its inputs to 'cuda' as well.
model = model.to('cuda')

## Load CLIP text encoder model

In [7]:
import clip

# Pick the CLIP variant that goes with the embedding space of the checkpoint.
# There is no else branch: this relies on the earlier cell having restricted
# embedding_space to 'openseg' or 'lseg'.
if embedding_space == 'openseg':
    clip_model = 'ViT-L/14@336px'
elif embedding_space == 'lseg':
    clip_model = 'ViT-B/32'
    
# Only the first value returned by clip.load is kept; the second is discarded.
clip_pretrained, _ = clip.load(clip_model, device='cuda', jit=False)

## Load the point cloud
Use a scan from Matterport3D (house `17DRP5sb8fy`) for the test.

In [8]:
# Mesh to run the model on.
# NOTE(review): this path points into a Matterport directory while checkpoint_path
# above selects the ScanNet checkpoint — confirm the cross-dataset test is intended.
scan_ply_filepath = "/home/rsl_admin/matterport/data/v1/scans/17DRP5sb8fy/house_segmentations/17DRP5sb8fy/house_segmentations/17DRP5sb8fy.ply"

mesh_ply = o3d.io.read_triangle_mesh(scan_ply_filepath)

In [9]:
# Pull the mesh vertices and their colors out as numpy arrays.
# Only locs_np is passed to the model in the cells shown here; colors_np is
# not referenced again in the visible part of the notebook.
locs_np = np.asarray(mesh_ply.vertices)
colors_np = np.asarray(mesh_ply.vertex_colors)

# Sanity check: both arrays should have one row per vertex.
print(locs_np.shape)
print(colors_np.shape)

(1522546, 3)
(1522546, 3)


In [10]:
# Show the raw input mesh before running the model.
o3d.visualization.draw_geometries([mesh_ply])

### Function to run the 3D distill model on an input point cloud

In [11]:
# version of sparse_quantize in the OS code different than what's in ME
from dataset.voxelization_utils import sparse_quantize

In [12]:
def run_3d_distill_model(
    points, 
    model, 
    model_voxel_size=0.02,
    output_precision=np.float32,
):
    """
    Run the 3D distilled model on a point cloud and return per-voxel embeddings.

    Expects a point cloud of N points as a numpy array of shape (N,3).
    Creates a voxel representation of the point cloud with M voxels and
    predicts one embedding per occupied voxel.

    Args:
        points: numpy array of shape (N,3) with the point coordinates
        model: the 3D distill network; it is put in eval mode and called with
            an ME.SparseTensor whose data lives on 'cuda'
        model_voxel_size: voxel edge length, in the same units as `points`
        output_precision: numpy dtype the returned embeddings are cast to

    Returns:
        voxel_embeddings of shape (M,E), where E is the embedding dimension;
            rows are L2-normalized and cast to `output_precision`
        voxel_points of shape (M,3), center points of the voxels
        inverse_map(N,), maps voxel representation back to points
            (returned unchanged from sparse_quantize)
    """

    # voxelize the point cloud: integer grid index of the voxel containing each point
    coords_np = np.floor(points / model_voxel_size)

    unique_map, inverse_map = sparse_quantize(coords_np, return_index=True)
    unique_coords = torch.Tensor(coords_np[unique_map])

    # add batch dimension to the coords
    unique_coords_batched = ME.utils.batched_coordinates([unique_coords])

    # 3D distill model trained with no color input, uses all ones as the feature
    feats = torch.ones(unique_coords.shape[0], 3)

    # move inputs to gpu
    unique_coords_batched = unique_coords_batched.to('cuda')
    feats = feats.to('cuda')

    input_st = ME.SparseTensor(features=feats, coordinates=unique_coords_batched)

    model.eval()
    with torch.no_grad():
        out = model(input_st)

        # normalize embeddings (epsilon avoids division by zero for all-zero rows)
        out /= out.norm(dim=-1, keepdim=True) + 1e-6

    # honor the requested precision instead of always returning float32
    voxel_embeddings = out.cpu().numpy().astype(output_precision)

    # np.floor above yields the index of each voxel's minimum corner, so the
    # voxel center is half a voxel further along every axis
    voxel_points = (unique_coords.cpu().numpy() + 0.5) * model_voxel_size

    # use inverse_map to map embeddings to all points
    return voxel_embeddings, voxel_points, inverse_map

In [13]:
# Embed the whole scan. model_voxel_size is left at its default;
# output_precision=np.float16 is passed explicitly.
voxel_embeddings, voxel_points, inverse_map = run_3d_distill_model(
    locs_np,
    model,
    output_precision=np.float16,
)

## Query the computed embeddings

In [14]:
# get a color map
import matplotlib.cm as cm  # kept in case other cells still reference `cm`

# cm.get_cmap is deprecated in recent Matplotlib releases (it triggers the warning
# this cell used to print); pyplot.get_cmap is the supported way to look up a
# colormap by name. `plt` is imported in the first cell.
cmap = plt.get_cmap('jet')

  cmap = cm.get_cmap('jet')


In [15]:
def compute_text_embedding(query_string, encoder):
    """
    Encode a text query into a unit-length embedding.

    Args:
        query_string: the text to embed
        encoder: object exposing encode_text (the notebook passes the loaded CLIP model)

    Returns:
        float32 numpy array holding the L2-normalized text embedding
        (presumably shape (1,E), since a single string is tokenized — TODO confirm)
    """
    tokens = clip.tokenize([query_string]).to('cuda')

    with torch.no_grad():
        embedding = encoder.encode_text(tokens)
        # scale to unit length along the embedding axis
        embedding /= embedding.norm(dim=-1, keepdim=True)

    embedding_np = embedding.cpu().numpy()
    return embedding_np.astype(np.float32)

In [16]:
def compute_point_scores(query_string, voxel_embeddings):
    """
    Score every voxel embedding against a text query.

    Args:
        query_string: text query, embedded with the notebook-level `clip_pretrained` model
        voxel_embeddings: array of voxel embeddings, one row per voxel

    Returns:
        similarities between each voxel and the query, min-max rescaled to [0, 1].
        If every voxel has the same similarity, all scores are 0 rather than NaN.
    """
    query_embedding = compute_text_embedding(query_string, clip_pretrained)

    # compute the similarity first for each voxel
    similarity = voxel_embeddings @ query_embedding.T

    lowest = similarity.min()
    spread = similarity.max() - lowest

    # a constant similarity map would make the rescaling divide by zero
    if spread == 0:
        return np.zeros_like(similarity)

    return (similarity - lowest) / spread

In [18]:
# Score every voxel against a free-form text query.
query_text = "a place to sit"
scores = compute_point_scores(query_text, voxel_embeddings)

# The colormap yields RGBA rows; drop the alpha channel before handing the
# colors to Open3D.
voxel_rgb = cmap(scores).reshape(-1, 4)[:, :3]

# Show the voxel centers as a point cloud colored by query similarity.
pcd = o3d.geometry.PointCloud()
pcd.colors = o3d.utility.Vector3dVector(voxel_rgb)
pcd.points = o3d.utility.Vector3dVector(voxel_points)
o3d.visualization.draw_geometries([pcd])