## NAVI Dataset (v1)

The NAVI dataset consists of:
- 36 object scans
- 10515 precise object-to-image alignments in total.
- 324 (267 unique) multi-view scenes.
- 35 in-the-wild collections with different backgrounds.

There are two different types of image sets for each object:
- Multiview (`multiview_xx_camera-model/`): captures of the same object in the same environment and pose, taken by moving the camera.
- In-the-wild (`wild_set/`): captures of the same object under different illumination, backgrounds, and poses.

The folder organization is as follows:
```
object_id_0/
    3d_scan/
        object_id_0.obj
        object_id_0.mtl  # For textured objects.
        object_id_0.jpg  # For textured objects.
        object_id_0.glb
    multiview_00/
        annotations.json
        images/
            000.jpg
            ...
        depth/  # Pre-computed depth.
            000.png
            ...
        masks/  # Pre-computed masks.
            000.png
            ...
    multiview_01/
        ...
    ...
    wild_set/
        annotations.json
        images/
            000.jpg
            ...
        depth/
            ...
        masks/
            ...
object_id_1/
    ...
...
```

In [1]:
# Imports (Check requirements.txt)
%load_ext autoreload
%autoreload 2

import glob
import json
import os
import random
from typing import Tuple, Dict

from IPython.display import display
import ipywidgets as widgets
from PIL import Image
import cv2
import colour
from matplotlib import pyplot as plt
import numpy as np
import torch as t
import trimesh

# NAVI imports.
import data_util
import mesh_util
import transformations
import visualization
from gl import scene_renderer
from gl import camera_util

# Short alias for the NAVI image display helper.
disp = visualization.display_images


def resize_by(x, y):
  """Returns `x` shrunk by the integer factor `y` along both axes.

  `x` has to expose `.size` as (width, height) and a `.resize()` method, the
  way the PIL images used throughout this notebook do.
  """
  size = x.size
  return x.resize((size[0] // y, size[1] // y))


# Root folder of the NAVI release; adjust it to the local setup.
navi_release_root = '/path/to/navi/dataset/'

### Process the `multiview` alignments.

In [2]:
# Scene selector. The fields look like '<object_id>:<scene>:<camera model>'
# (presumably; see data_util.load_scene_data for the exact format).
query = 'duck_bath_yellow_s:multiview_00:pixel_4a'

# Fetch the annotations, the 3D scan and the images of the selected scene.
annotations, mesh, images = data_util.load_scene_data(query, navi_release_root)
(triangles, triangle_colors,
 material_ids) = visualization.prepare_mesh_rendering_info(mesh)

# A small sphere, stored as a triangle tensor, marks each camera center.
sphere = trimesh.primitives.Sphere(radius=5.)
sphere_vert = t.tensor(
    data_util.convert_to_triangles(sphere.vertices, sphere.faces))

# One sphere per annotation, moved by the inverse of the annotation's
# object-to-world matrix.
camera_spheres = [
    transformations.transform_mesh(
        sphere_vert,
        t.linalg.inv(data_util.camera_matrices_from_annotation(anno)[0]))
    for anno in annotations
]

#### Display the multiview images, alignments, masks, and depth of the multiview scene on the fly.

In [3]:
# For Depth visualization.
min_depth = 100  # in mm.
max_depth = 400  # in mm.
disp_width = 200
resize_factor = 16


@widgets.interact
def display_multiview(frame_index=(0, len(annotations)-1, 1)):
  """Shows one multiview frame next to its rendered alignment, mask and depth.

  A second view renders the whole scene with one sphere per camera center and
  draws the sphere of the selected frame in a different color.

  Reads the notebook globals `images`, `annotations`, `triangles`,
  `triangle_colors`, `material_ids`, `camera_spheres` and `sphere_vert`.

  Args:
    frame_index: Index into `images` / `annotations`. The `(min, max, step)`
      tuple default makes `widgets.interact` expose it as an integer slider.
  """
  image = images[frame_index]
  anno = annotations[frame_index]
  object_to_world, intrinsics = data_util.camera_matrices_from_annotation(anno)
  h, w = anno['image_size']

  # Render the 3D model alignment.
  triangles_aligned = transformations.transform_mesh(
      triangles, object_to_world)
  rend = scene_renderer.render_scene(
      triangles_aligned, view_projection_matrix=intrinsics,
      material_ids=material_ids, diffuse_coefficients=triangle_colors,
      output_type=t.float32, clear_color=(0,0,0,0),
      image_size=(h, w), cull_back_facing=False, return_rgb=False)
  rend_np = rend.numpy()

  # The first three channels are shown as the color rendering, normalized to
  # the brightest value. An empty rendering is left black instead of dividing
  # by zero.
  rendering = rend_np[:, :, :3]
  peak = rendering.max()
  if peak > 0:
    rendering = 255 * rendering / peak
  rendering = Image.fromarray(rendering.astype(np.uint8))

  # Every pixel with a non-zero channel average counts as foreground.
  mask = rend_np.mean(axis=2) > 0
  mask = Image.fromarray((mask * 255).astype(np.uint8))

  # The fourth channel is visualized as depth, using the range configured
  # above (`min_depth` was previously ignored in favor of a literal 100).
  depth = visualization.apply_colors_to_depth_map(
      rend_np[:, :, 3], minn=min_depth, maxx=max_depth)
  depth = Image.fromarray(depth)

  # Resize and display the images.
  image_resize = resize_by(image, resize_factor)
  rendering_resize = resize_by(rendering, resize_factor)
  mask_resize = resize_by(mask, resize_factor)
  depth_resize = resize_by(depth, resize_factor)
  disp(image_resize, disp_width, 'RGB Image',
       rendering_resize, disp_width, 'Alignment',
       mask_resize, disp_width, 'Binary Mask',
       depth_resize, disp_width, "Metric Depth")

  # Render the 3D object with the camera poses.
  camera_triangles = t.concat(camera_spheres, axis=0)
  camera_colors = t.tensor([[0.5, 0., 0.], [0., 1., 0]])

  # `camera_colors` is appended after `triangle_colors` below, so its two rows
  # sit at offsets `len(triangle_colors)` and `len(triangle_colors) + 1` of
  # `colors_all`. The ids are derived from that offset so they also hit the
  # camera colors for meshes with more than one color row.
  # NOTE(review): assumes material ids index rows of `diffuse_coefficients`.
  first_camera_color = triangle_colors.shape[0]
  camera_material_id = first_camera_color * t.ones(
      camera_triangles.shape[0], dtype=t.int32)
  # The sphere of the selected frame gets the second camera color.
  vert_index_start = frame_index * len(sphere_vert)
  vert_index_end = (frame_index + 1) * len(sphere_vert)
  camera_material_id[vert_index_start:vert_index_end] += 1

  triangles_all = t.concat((triangles, camera_triangles), axis=0)
  material_ids_all = t.concat((material_ids, camera_material_id), axis=0)
  colors_all = t.concat((triangle_colors, camera_colors), axis=0)

  view_projection_matrix = camera_util.get_default_camera_for_mesh(
      triangles_all, move_away_mul=1.3, camera_index=3)
  rend_with_cameras = scene_renderer.render_scene(
      triangles_all, diffuse_coefficients=colors_all,
      material_ids=material_ids_all,
      view_projection_matrix=view_projection_matrix,
      image_size=(1024, 1024),
      cull_back_facing=False, clear_color=(1, 1, 1))
  disp(rend_with_cameras, 512, 'Multiview scene with camera positions.')

interactive(children=(IntSlider(value=9, description='frame_index', max=19), Output()), _dom_classes=('widget-…

### Process the `wild_set` images with different backgrounds.

In [4]:
def _read_json(path):
  """Returns the parsed content of the JSON file at `path`."""
  with open(path, 'r') as json_file:
    return json.load(json_file)


# Gather the 'wild_set' annotation file of every object in the release.
annotation_paths_wild = glob.glob(
    os.path.join(navi_release_root, '*', 'wild_set', 'annotations.json'))
annotations_wild = [_read_json(path) for path in annotation_paths_wild]

# Randomize the object order in place.
random.shuffle(annotations_wild)

In [5]:
max_num_objects = 2
max_num_images_per_object = 5
disp_size = 180
resize_factor = 16

# Show the images and the alignment overlays of the first few wild sets.
for anno_object in annotations_wild[:max_num_objects]:
  if not anno_object:
    # A wild set without annotations has no object id to load.
    continue

  # Load the scene data. The object id is read from the first annotation.
  object_id = anno_object[0]['object_id']
  query = f'{object_id}:wild_set'
  # Dedicated names keep the multiview `annotations` global intact; the
  # interactive multiview widget reads it on every slider change.
  annotations_wild_set, mesh, images_wild_set = data_util.load_scene_data(
      query, navi_release_root, max_num_images=max_num_images_per_object)

  # Downscale all images.
  images_resize = [resize_by(image, resize_factor) for image in images_wild_set]

  overlays = []
  # Pair each image with the annotation returned alongside it by
  # `load_scene_data`, rather than with the raw JSON list.
  for anno, image_resize in zip(annotations_wild_set, images_resize):
    w, h = image_resize.size

    # Convert camera poses and intrinsics into matrices.
    object_to_world, intrinsics = data_util.camera_matrices_from_annotation(anno)

    # Render the mesh at the resolution of the downscaled image.
    rend = visualization.render_navi_scan(
        mesh, extrinsics=object_to_world, intrinsics=intrinsics,
        image_size=(h, w), with_texture=True)
    overlays.append(
        visualization.overlay_images(np.array(image_resize), rend))

  visualization.display_multiple_images(images_resize, disp_size, 'RGB', 'width')
  visualization.display_multiple_images(overlays, disp_size, 'Alignment', 'width')
  print('\n')









### Sample dense correspondences from the NAVI scenes.

In [6]:
def project_and_filter_sample_coordinates(
    mesh_triangles: t.Tensor, annotation, sampled_points: t.Tensor,
    image: Image.Image, depth_tolerance: float = 1.0
) -> Dict[int, Tuple[int, int]]:
  """Returns the sampled points, projected on the image, that are visible from the current view.

  The mesh is rendered from the annotated camera and a sample is kept only if
  its own depth agrees with the rendered depth at the pixel it projects to.

  Args:
    mesh_triangles: Mesh triangles in object space.
    annotation: Annotation of the view, as consumed by
      `data_util.camera_matrices_from_annotation`.
    sampled_points: Points sampled on the mesh, in object space.
    image: Image of the view; only its size is used.
    depth_tolerance: Largest accepted absolute difference between a sample's
      depth and the rendered depth (same units as the depth buffer,
      presumably mm - TODO confirm).

  Returns:
    A dict from the index of each visible sample to its `(y, x)` pixel.
  """
  object_to_world, intrinsics = data_util.camera_matrices_from_annotation(annotation)

  # Render the 3D model alignment; the fourth channel is used as depth.
  mesh_triangles_aligned = transformations.transform_mesh(
      mesh_triangles, object_to_world)
  rend = scene_renderer.render_scene(
      mesh_triangles_aligned, view_projection_matrix=intrinsics,
      output_type=t.float32, clear_color=(0,0,0,0),
      image_size=image.size[::-1], cull_back_facing=False, return_rgb=False)
  depth = rend[:, :, 3].numpy()
  height, width = depth.shape

  # Align the sampled points.
  sampled_points_world = transformations.transform_points(
      sampled_points, object_to_world)
  sampled_points_screen = transformations.transform_points(
      sampled_points_world, intrinsics)

  # Convert from OpenGL space to image space: [-1, 1] -> [0, size].
  sampled_points_screen += t.tensor([1., 1., 0])
  sampled_points_screen *= t.tensor([image.size[0]/2, image.size[1]/2, 1])
  samples = t.concat(
      (sampled_points_screen[:, :2], sampled_points_world[:, 2:3]),
      dim=1).numpy()

  # Discard points where the depth doesn't match the OpenGL depth buffer.
  coords = {}
  for i_sample, (x_screen, y_screen, z) in enumerate(samples):
    y = int(round(y_screen))
    x = int(round(x_screen))
    # Samples projecting outside of the image cannot be visible. Without this
    # check negative indices would silently wrap around and indices past the
    # border would raise an IndexError.
    if not (0 <= y < height and 0 <= x < width):
      continue
    if abs(depth[y, x] - z) < depth_tolerance:
      coords[i_sample] = (y, x)
  return coords


def _pad_bottom(image: np.ndarray, height: int) -> np.ndarray:
  """Zero-pads `image` at the bottom until it is `height` rows tall."""
  # Only the first (row) axis is padded, whatever the number of axes, so that
  # grayscale (2D) images work as well as color (3D) ones.
  pad_width = [(0, height - image.shape[0])] + [(0, 0)] * (image.ndim - 1)
  return np.pad(image, pad_width)


def show_correspondences(image_1: Image.Image, image_2: Image.Image,
                         corresp_dict_1: Dict[int, Tuple[int, int]],
                         corresp_dict_2: Dict[int, Tuple[int, int]], resize_factor=1) -> None:
  """Display the intersection of valid correspondences between two images.

  The images are drawn side by side and every sample present in both
  dictionaries is connected with a line. Line colors follow a red-to-blue
  gradient ordered by the Y-coordinate of the sample in the left image.

  Args:
    image_1: Left image.
    image_2: Right image.
    corresp_dict_1: Sample index -> `(y, x)` pixel in `image_1`.
    corresp_dict_2: Sample index -> `(y, x)` pixel in `image_2`.
    resize_factor: Integer factor by which the displayed image is shrunk.
  """
  image_1 = np.array(image_1)
  image_2 = np.array(image_2)
  h1, w1 = image_1.shape[:2]
  h2 = image_2.shape[0]

  # Handle images of different shapes (in the wild_set images).
  if h1 != h2:
    h_max = max(h1, h2)
    image_1 = _pad_bottom(image_1, h_max)
    image_2 = _pad_bottom(image_2, h_max)

  # Concatenate the two images to display the correspondences.
  img_corresp = np.concatenate((image_1, image_2), axis=1)
  img_corresp = cv2.resize(
      img_corresp,
      (img_corresp.shape[1] // resize_factor, img_corresp.shape[0] // resize_factor))

  # Sort the correspondences of the left image by Y-coordinate.
  corresp_1_as_list = sorted(
      ((k, *v) for k, v in corresp_dict_1.items()),
      key=lambda entry: entry[1])

  # Create the color gradient. `range_to` rejects a request for zero colors,
  # so an empty left dictionary gets an empty gradient (the image pair is
  # still shown, without any line).
  colors = []
  if corresp_1_as_list:
    red = colour.Color("red")
    colors = list(red.range_to(colour.Color("blue"), len(corresp_1_as_list)))

  plt.figure(figsize=(12, 17))
  plt.axis('off')
  plt.imshow(img_corresp)
  for color_idx, (corresp_idx, y1, x1) in enumerate(corresp_1_as_list):
    if corresp_idx in corresp_dict_2:
      y2, x2 = corresp_dict_2[corresp_idx]
      # The right image starts `w1` pixels to the right of the left one.
      x = [x1 / resize_factor, (x2 + w1) / resize_factor]
      y = [y1 / resize_factor, y2 / resize_factor]
      plt.plot(x, y, color=colors[color_idx].rgb, marker='o')

In [7]:
# Scene used for the correspondence demo (a wild set works as well).
query = 'duck_bath_yellow_s:multiview_00:pixel_4a'
# query = 'schleich_hereford_bull:wild_set'
annotations, mesh, images = data_util.load_scene_data(query, navi_release_root)
(triangles, triangle_colors,
 material_ids) = visualization.prepare_mesh_rendering_info(mesh)


def visualize_correspondences_interactive(num_samples=100):
  """Samples the mesh and shows the correspondences of consecutive views.

  Args:
    num_samples: Number of points sampled on the mesh.
  """
  max_num_image_pairs = 2

  # Draw the sample points on the mesh surface.
  sampled_points, _ = mesh_util.sample_points_from_mesh(
      triangles, num_sample_points=num_samples)

  # Keep, per view, only the samples visible in that view. N pairs of
  # consecutive views need the first N + 1 views.
  samples_visible = [
      project_and_filter_sample_coordinates(
          triangles, anno, sampled_points, images[i_view])
      for i_view, anno in enumerate(annotations[:max_num_image_pairs + 1])
  ]

  # Show each view next to its successor.
  consecutive = zip(samples_visible, samples_visible[1:])
  for i_img, (visible_left, visible_right) in enumerate(consecutive):
    show_correspondences(
        images[i_img], images[i_img+1], visible_left, visible_right,
        resize_factor=8)


w = widgets.IntSlider(value=100, min=100, max=2000, step=100)
_ = widgets.interact(visualize_correspondences_interactive, num_samples=w)

interactive(children=(IntSlider(value=100, description='num_samples', max=2000, min=100, step=100), Output()),…

### Load pre-computed masks and depth.
We have pre-computed the masks and depths for convenience.

In [8]:
object_id = 'duck_bath_yellow_s'
scene_name = 'wild_set'
image_idx = 0
disp_size = 256

# Images, depth maps and masks of a scene share the same zero-padded index.
scene_dir = os.path.join(navi_release_root, object_id, scene_name)

# Load the image.
# IMPORTANT: The rotation of the image is stored in the exif metadata. Use
# `data_util.read_image` to load it.
image_path = os.path.join(scene_dir, 'images', f'{image_idx:03d}.jpg')
image_np = np.array(data_util.read_image(image_path))

# Load the depth.
depth_path = os.path.join(scene_dir, 'depth', f'{image_idx:03d}.png')
depth = data_util.read_depth_from_png(depth_path)

# Load the mask.
mask_path = os.path.join(scene_dir, 'masks', f'{image_idx:03d}.png')
mask = np.array(Image.open(mask_path))

# Resize for visualization.
factor = 8
image_resized = cv2.resize(image_np, (image_np.shape[1]//factor, image_np.shape[0]//factor))
depth_resized = cv2.resize(depth, (depth.shape[1]//factor, depth.shape[0]//factor))
mask_resized = cv2.resize(mask.astype(np.float32), (mask.shape[1]//factor, mask.shape[0]//factor))

# Normalize the depth by its largest value for display; an all-zero depth map
# is shown as is instead of dividing by zero.
depth_peak = depth_resized.max()
depth_gray = depth_resized / depth_peak if depth_peak > 0 else depth_resized

# Visualize image, depth, and mask. Arguments are passed as
# (image, width, title) triplets, the order used by the other `disp` calls in
# this notebook.
disp(image_resized, disp_size, "Image",
     depth_gray, disp_size, "Depth (in gray-scale)",
     mask_resized, disp_size, "Binary mask")