In [10]:
import os
import sys
import cv2
import numpy as np
import torch
import Metric3D



# Add the Metric3D directory to the Python path
sys.path.append(os.path.join(os.getcwd(), 'Metric3D'))

from Metric3D.mono.model.monodepth_model import get_configured_monodepth_model

# Prepare data
rgb_file = 'Metric3D/data/kitti_demo/rgb/0000000050.png'
depth_file = 'Metric3D/data/kitti_demo/depth/0000000050.png'
intrinsic = [707.0493, 707.0493, 604.0814, 180.5066]
gt_depth_scale = 256.0
rgb_origin = cv2.imread(rgb_file)[:, :, ::-1]

# Adjust input size to fit pretrained model
input_size = (616, 1064)  # for vit model
# input_size = (544, 1216)  # for convnext model
h, w = rgb_origin.shape[:2]
scale = min(input_size[0] / h, input_size[1] / w)
rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
intrinsic = [intrinsic[0] * scale, intrinsic[1] * scale, intrinsic[2] * scale, intrinsic[3] * scale]
padding = [123.675, 116.28, 103.53]
h, w = rgb.shape[:2]
pad_h = input_size[0] - h
pad_w = input_size[1] - w
pad_h_half = pad_h // 2
pad_w_half = pad_w // 2 
rgb = cv2.copyMakeBorder(rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding)
pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]

# Normalize
mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
rgb = torch.div((rgb - mean), std)
rgb = rgb[None, :, :, :].cuda()

# Load the model
model = get_configured_monodepth_model(pretrain=True)
model.cuda().eval()

# Inference
with torch.no_grad():
    pred_depth, confidence, output_dict = model.inference({'input': rgb})

# Unpad
pred_depth = pred_depth.squeeze()
pred_depth = pred_depth[pad_info[0]:pred_depth.shape[0] - pad_info[1], pad_info[2]:pred_depth.shape[1] - pad_info[3]]

# Upsample to original size
pred_depth = torch.nn.functional.interpolate(pred_depth[None, None, :, :], rgb_origin.shape[:2], mode='bilinear').squeeze()

# De-canonical transform
canonical_to_real_scale = intrinsic[0] / 1000.0  # 1000.0 is the focal length of canonical camera
pred_depth = pred_depth * canonical_to_real_scale  # now the depth is metric
pred_depth = torch.clamp(pred_depth, 0, 300)

TypeError: get_configured_monodepth_model() missing 1 required positional argument: 'cfg'