In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import numpy as np
from nerfstudio.process_data.colmap_utils import colmap_to_json, parse_colmap_camera_params

import shutil
from nerfstudio.data.utils.colmap_parsing_utils import (
    qvec2rotmat,
    read_cameras_binary,
    read_images_binary,
)
import mediapy
import cv2
import json
import torch
import os
from nerfiller.utils.mesh_utils import dilate

In [None]:
# Preprocess one SPIn-NeRF scene into a nerfstudio-style folder: resized images,
# masks, and placeholder (all-zero) depth maps. Also fills `image_filename_to_idx`,
# which maps each exported source filename to its output frame index.

# Scene configuration. A name whose last "-"-separated token is "original" selects
# the light-dilation path and additionally stores the unmasked resized images.
name = "backpack-original"
input_folder = Path("../data/spinnerf-dataset/3")

output_dir = Path(f"../data/nerfstudio/{name}")
for subfolder in ("original_images", "images", "masks", "depth"):
    # parents=True also creates `output_dir` itself on first use.
    (output_dir / subfolder).mkdir(parents=True, exist_ok=True)

target_height = 512
target_width = 512

# Files before this position (in sorted order) are not exported; output frame
# indices start at 0 with the first exported file.
num_skipped_images = 40

# Mask dilation settings: the "original" variant grows the labelled region only
# slightly, every other variant grows it much more.
keep_original = name.split("-")[-1] == "original"
dilate_iters = 2 if keep_original else 10
kernel_size = 3

image_filename_to_idx = {}

# Use the GPU when present, otherwise fall back to the CPU so the cell still runs
# on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

for idx, image_filename in enumerate(sorted((input_folder / "images").iterdir())):

    if idx < num_skipped_images:
        continue

    out_idx = idx - num_skipped_images
    image_filename_to_idx[image_filename.name] = out_idx

    image_file_path = f"images/image_{out_idx:06d}.png"
    mask_file_path = f"masks/mask_{out_idx:06d}.png"
    depth_file_path = f"depth/depth_{out_idx:06d}.npy"

    image = mediapy.read_image(image_filename)
    # Size of the source image before resizing (overwritten on every iteration).
    original_height, original_width = image.shape[:2]
    image = mediapy.resize_image(image, (target_height, target_width))

    # The label for `images/<stem>.<ext>` lives at `images_4/label/<stem>.png`.
    # Building it with pathlib avoids depending on "/" separators or a ".jpg" suffix.
    label_filename = input_folder / "images_4" / "label" / image_filename.with_suffix(".png").name
    mask = mediapy.read_image(label_filename).astype("float32")
    # Nearest-neighbour resize so label values are not blended.
    mask = cv2.resize(mask, (target_width, target_height), 0, 0, interpolation = cv2.INTER_NEAREST)

    # Grow the labelled region. Assumes `dilate` takes and returns a (1, 1, H, W)
    # tensor, which is what the [0, 0] indexing below relies on -- TODO confirm.
    mask = torch.from_numpy(mask)[None, None].to(device)
    for _ in range(dilate_iters):
        mask = dilate(mask, kernel_size=kernel_size)
    # Invert: pixels that were 1 in the dilated label become 0 and are blanked
    # out of the image below (assumes labels are binary {0, 1} -- TODO confirm).
    mask = 1 - mask[0, 0].cpu().numpy()

    if keep_original:
        # Keep an unmasked copy of the resized image next to the masked one.
        mediapy.write_image(output_dir / f"original_{image_file_path}", image)
    else:
        # Re-enable everything above the first quarter of the masked region's
        # vertical extent. Skipped when nothing is masked, where min()/max()
        # would otherwise raise on an empty array.
        masked_rows = (mask == 0.0).nonzero()[0]
        if masked_rows.size > 0:
            center = int(masked_rows.min() + (masked_rows.max() - masked_rows.min()) * .25)
            mask[:center] = 1

    # Zero out the masked pixels in every colour channel.
    image = image * mask[..., None].astype("uint8")

    mediapy.write_image(output_dir / image_file_path, image)
    mediapy.write_image(output_dir / mask_file_path, mask)
    # Placeholder depth: an all-zero float32 map with the mask's shape.
    np.save(output_dir / depth_file_path, np.zeros_like(mask).astype("float32"))

In [None]:
# Build a nerfstudio-style `transforms.json` from the COLMAP sparse reconstruction,
# keeping only the images recorded in `image_filename_to_idx`.
recon_dir = input_folder / "sparse/0"
cam_id_to_camera = read_cameras_binary(recon_dir / "cameras.bin")
im_id_to_image = read_images_binary(recon_dir / "images.bin")

frames = []
for im_id, im_data in im_id_to_image.items():

    # Skip COLMAP images that were not exported by the preprocessing step.
    if im_data.name not in image_filename_to_idx:
        continue
    idx = image_filename_to_idx[im_data.name]

    image_file_path = f"images/image_{idx:06d}.png"
    mask_file_path = f"masks/mask_{idx:06d}.png"
    depth_file_path = f"depth/depth_{idx:06d}.npy"

    # NB: COLMAP uses Eigen / scalar-first quaternions
    # * https://colmap.github.io/format.html
    # * https://github.com/colmap/colmap/blob/bf3e19140f491c3042bfd85b7192ef7d249808ec/src/base/pose.cc#L75
    # TODO(1480) use the pycolmap API (`im_data.rotation_matrix()`) instead.
    rotation = qvec2rotmat(im_data.qvec)

    # Assemble the 4x4 world-to-camera matrix [R | t; 0 0 0 1] and invert it.
    translation = im_data.tvec.reshape(3, 1)
    w2c = np.concatenate([rotation, translation], 1)
    w2c = np.concatenate([w2c, np.array([[0, 0, 0, 1]])], 0)
    c2w = np.linalg.inv(w2c)
    # Convert from COLMAP's camera coordinate system (OpenCV) to ours (OpenGL)
    c2w[0:3, 1:3] *= -1
    # Swap the first two world axes and negate the third; `applied_transform`
    # below records the same world-space change.
    c2w = c2w[np.array([1, 0, 2, 3]), :]
    c2w[2, :] *= -1

    camera = cam_id_to_camera[im_data.camera_id]
    out = parse_colmap_camera_params(camera)
    # The camera model is written once at the top level instead of per frame.
    del out["camera_model"]

    frame = {
        "file_path": image_file_path,
        "mask_path": mask_file_path,
        "depth_path": depth_file_path,
        "transform_matrix": c2w.tolist(),
        "colmap_im_id": im_id,
    }
    frame.update(out)

    # Rescale the intrinsics from the resolution COLMAP calibrated this camera at
    # to the exported resolution. Reading the size from the camera record (rather
    # than from a variable left over by another cell) keeps this cell
    # self-contained and correct for each individual camera.
    frame["w"] = target_width
    frame["h"] = target_height
    frame["cx"] = frame["cx"] * target_width / camera.width
    frame["fl_x"] = frame["fl_x"] * target_width / camera.width
    frame["cy"] = frame["cy"] * target_height / camera.height
    frame["fl_y"] = frame["fl_y"] * target_height / camera.height

    frames.append(frame)

final_out = {}
final_out["frames"] = sorted(frames, key=lambda x: x["file_path"])
final_out["camera_model"] = "OPENCV"

# 3x4 record of the world-space axis swap / flip applied to every c2w above.
applied_transform = np.eye(4)[:3, :]
applied_transform = applied_transform[np.array([1, 0, 2]), :]
applied_transform[2, :] *= -1
final_out["applied_transform"] = applied_transform.tolist()

with open(output_dir / "transforms.json", "w", encoding="utf-8") as f:
    json.dump(final_out, f, indent=4)