In [1]:
from mono.model.monodepth_model import get_configured_monodepth_model
from mono.utils.running import load_ckpt
try:
    from mmcv.utils import Config
    # from mmcv.utils import Config, DictAction
except ImportError:
    # Only a failed import should trigger the fallback; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    from mmengine import Config
    # from mmengine import Config, DictAction
import cv2
import matplotlib.pyplot as plt
import open3d as o3d
import numpy as np
import torch
from PIL import Image
from mono.utils.do_test import transform_test_data_scalecano, get_prediction

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Build the ViT-large depth model from its config file and restore the checkpoint weights.
cfg_large = Config.fromfile('./mono/configs/HourglassDecoder/vit.raft5.large.py')
model_large = get_configured_monodepth_model(cfg_large)
# load_ckpt returns four values; only the first (the model) is kept.
model_large, _, _, _ = load_ckpt(
    './weight/metric_depth_vit_large_800k.pth',
    model_large,
    strict_match=False,
)
model_large.eval()  # switch the module to evaluation mode before inference
# Device the model is moved to ("cpu" is the alternative setting).
device = "cuda"
model_large.to(device)


Triton is not available, some optimizations will not be enabled.
A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


DepthModel(
  (depth_model): DensePredModel(
    (encoder): DinoVisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
        (norm): Identity()
      )
      (blocks): ModuleList(
        (0): BlockChunk(
          (0-23): 24 x Block(
            (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (attn): MemEffAttention(
              (qkv): Linear(in_features=1024, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features=1024, out_features=1024, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (ls1): LayerScale()
            (drop_path1): Identity()
            (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (act): GELU(approximate='none')
          

In [3]:
def predict_depth_normal(img, fx=1000.0, fy=1000.0, state_cache=None, model = model_large, cfg = cfg_large):
    """
    Predict a depth map and a surface-normal map from an input image and focal lengths.

    Args:
        img: Input image (anything ``np.array`` can convert, e.g. a PIL image),
            or ``None`` when no image is available.
        fx, fy: Focal lengths of the camera. The principal point is assumed to
            be the image centre.
        state_cache: Unused; kept only so existing call sites keep working.
        model: Depth model passed to ``get_prediction``. Defaults to the
            ``model_large`` object that existed when this function was defined.
        cfg: Config whose ``data_basic`` section drives the preprocessing.
            Defaults to the ``cfg_large`` object that existed at definition time.

    Returns:
        A 4-tuple ``(pred_depth, pred_depth_scale, scale, pred_normal)``:
          * ``pred_depth``: numpy array with negative values clamped to 0.
          * ``pred_depth_scale``, ``scale``: passed through unchanged from
            ``get_prediction``.
          * ``pred_normal``: numpy array with the channel axis last, resized to
            the input image height and width.
        When ``img`` is ``None`` a message is printed and four ``None`` values
        are returned, so callers can always unpack four values.
    """
    if img is None:
        # Keep the same arity as the success path; the original returned six
        # values here, which broke callers that unpack four.
        print("Please upload an image and wait for the upload to complete.")
        return None, None, None, None

    cv_image = np.array(img)
    # NOTE(review): np.array() of a PIL image is normally already RGB, so this
    # BGR->RGB call swaps the channels -- confirm which channel order
    # transform_test_data_scalecano expects before changing it.
    img = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)

    # intrinsic = [fx, fy, cx, cy]; cx and cy are assumed to be half the image
    # width and height.
    intrinsic = [fx, fy, img.shape[1]/2, img.shape[0]/2]

    # The caller-supplied `model` and `cfg` are used as given (the original
    # overwrote both with the module-level globals, ignoring the arguments).
    rgb_input, cam_models_stacks, pad, label_scale_factor = transform_test_data_scalecano(img, intrinsic, cfg.data_basic)

    with torch.no_grad():
        pred_depth, pred_depth_scale, scale, output, _ = get_prediction(
                    model = model,
                    input = rgb_input,
                    cam_model = cam_models_stacks,
                    pad_info = pad,
                    scale_info = label_scale_factor,
                    gt_depth = None,
                    normalize_scale = cfg.data_basic.depth_range[1],
                    ori_shape=[img.shape[0], img.shape[1]],
                )
    pred_depth = pred_depth.squeeze().cpu().numpy()
    pred_depth[pred_depth<0] = 0  # negative depths are clamped to zero

    # Keep the first three channels of the first normal output, crop the
    # padding added during preprocessing, then resize to the image size.
    pred_normal = output['normal_out_list'][0][:, :3, :, :]
    H, W = pred_normal.shape[2:]
    pred_normal = pred_normal[:, :, pad[0]:H-pad[1], pad[2]:W-pad[3]]
    pred_normal = torch.nn.functional.interpolate(pred_normal, [img.shape[0], img.shape[1]], mode='bilinear').squeeze()
    # Move channels last; assumes a batch of one so squeeze() left (3, H, W) -- TODO confirm.
    pred_normal = pred_normal.permute(1,2,0)
    pred_normal = pred_normal.cpu().numpy()

    return  pred_depth, pred_depth_scale, scale, pred_normal

In [13]:
def get_pcd(H, W, depth_map, cx, cy, fx, fy):
    """
    Back-project a depth map into per-pixel 3D points using camera intrinsics.

    For the pixel at column ``u`` and row ``v`` the returned point is
    ``depth * [(u - cx) / fx, (v - cy) / fy, 1]``.

    Args:
        H, W: Height and width of the pixel grid.
        depth_map: 2D array indexed as ``[row, col]``; it is broadcast against
            the ``(H, W, 3)`` ray grid.
        cx, cy: Principal point subtracted from the column/row indices.
        fx, fy: Focal lengths dividing the centred column/row indices.

    Returns:
        Array of shape ``(H, W, 3)`` holding x, y, z for every pixel.
    """
    # Column index u and row index v for every pixel, each of shape (H, W).
    cols, rows = np.meshgrid(np.arange(0, W), np.arange(0, H))

    # Ray direction components for a unit depth.
    ray_x = (cols.astype(np.float32) - cx) / fx
    ray_y = (rows.astype(np.float32) - cy) / fy
    rays = np.stack([ray_x, ray_y, np.ones_like(ray_x)], axis=2)

    # Scale every ray by its depth value.
    return depth_map[:, :, None] * rays

In [14]:
if __name__ == "__main__":
    # Forward slashes work on every platform; the original backslash path
    # relied on the invalid escape sequences "\s" and "\I".
    img = Image.open("data/sample_input/IMG20240610231531.jpg")
    # Default values of fx and fy; actual values should come from camera calibration.
    fx = 1000.0
    fy = 1000.0
    voxel_down = 0.02  # voxel size for the optional down-sampling step below (disabled)

    # Depth map and normal map prediction
    depth_map, pred_depth_scale, scale, pred_normal = predict_depth_normal(
        img, fx=fx, fy=fy, state_cache={}, model=model_large, cfg=cfg_large)

    if isinstance(depth_map, np.ndarray):
        img = np.array(img)
        # intrinsic = [fx, fy, cx, cy]; cx and cy are assumed to be half the
        # image width and height.
        intrinsic = [fx, fy, img.shape[1]/2, img.shape[0]/2]

        # Scale down the depth map, and the other per-pixel features with it,
        # to reduce the overall number of points in the point cloud.
        if depth_map.shape[0] > 1080:
            # Separate name so the `scale` returned by the model is not clobbered.
            resize_scale = 1080 / depth_map.shape[0]
            depth_map = cv2.resize(depth_map, (0, 0), fx=resize_scale, fy=resize_scale, interpolation=cv2.INTER_LINEAR)
            img = cv2.resize(img, (0, 0), fx=resize_scale, fy=resize_scale, interpolation=cv2.INTER_LINEAR)
            pred_normal = cv2.resize(pred_normal, (0, 0), fx=resize_scale, fy=resize_scale, interpolation=cv2.INTER_LINEAR)
            # The intrinsics shrink by the same factor as the image.
            intrinsic = [value * resize_scale for value in intrinsic]

        # Reduce noise in the depth map
        depth_map = cv2.medianBlur(depth_map, 5)
        H, W = depth_map.shape

        # Point cloud creation: (H, W, 3) grid of x, y, z
        pcd = get_pcd(H, W, depth_map, intrinsic[2], intrinsic[3], intrinsic[0], intrinsic[1])

        # Flatten points, colours and normals to one row per point
        pcd_filtered = pcd.reshape(-1, 3)
        img_filtered = img.reshape(-1, 3)
        pred_normal = pred_normal.reshape(-1, 3)

        # Open3D point cloud creation
        pcd2 = o3d.geometry.PointCloud()
        pcd2.points = o3d.utility.Vector3dVector(pcd_filtered)
        pcd2.colors = o3d.utility.Vector3dVector(np.array(img_filtered)/255.0)
        pcd2.normals = o3d.utility.Vector3dVector(pred_normal)

        # pcd2 = pcd2.voxel_down_sample(voxel_size=voxel_down)
        o3d.visualization.draw_geometries([pcd2])
    else:
        # No depth map was produced, so nothing below can be built; the
        # original fell through here and failed with a NameError on `pcd`.
        print("Point Cloud Not Created")


In [11]:

# Forward slashes keep the output path portable; the backslash form relied on
# the invalid escape sequences "\s" and "\I".
o3d.io.write_point_cloud("data/sample_output/IMG20240610231531.ply", pcd2)

True