# Estimate depth using the Depth Anything model

In [None]:
# Make sure you have git-lfs installed (https://git-lfs.com)
# The clone below pulls large files through Git LFS, so LFS has to be initialised first.
!git lfs install
# Clone the Depth Anything Hugging Face Space into ./Depth-Anything
!git clone https://huggingface.co/spaces/LiheYoung/Depth-Anything
# Make the checkout the working directory; later cells import `depth_anything` from it
# (presumably resolved via the current working directory — confirm).
%cd Depth-Anything
# Install the Space's Python requirements
!pip install -r requirements.txt --quiet

Git LFS initialized.
Cloning into 'Depth-Anything'...
remote: Enumerating objects: 393, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 393 (delta 1), reused 0 (delta 0), pack-reused 387 (from 1)[K
Receiving objects: 100% (393/393), 2.25 MiB | 7.51 MiB/s, done.
Resolving deltas: 100% (78/78), done.
Filtering content: 100% (84/84), 9.93 GiB | 38.78 MiB/s, done.
/content/Depth-Anything
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m305.1/305.1 kB[0m [31m24.2 MB/s[0m eta [36m0:00:0

In [None]:
# Access Google Drive data
# Mounts Google Drive at /content/drive; SCANNET_SCENE_DIR used by later cells
# points below this mount (/content/drive/MyDrive/...).
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

## 1. Relative depth estimation using the Depth Anything model

In [None]:
import cv2
import numpy as np
import os
import torch
import torch.nn.functional as F
from torchvision.transforms import Compose
from tqdm import tqdm

SCANNET_SCENE_DIR = '/content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00'

from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

# Possible values for the encoder: ['vits', 'vitb', 'vitl']

def estimate_depth(depth_anything_model, img_path, out_dir, out_dir_vis):
    """
    Runs a Depth Anything model on one or more images and saves the predictions.

    For every readable input image the prediction is resized back to the
    image's original resolution and stored as ``<image stem>.npy`` in
    ``out_dir``. When ``out_dir_vis`` is given, a min-max normalised 8-bit
    grayscale preview (``<stem>_gray.png``) and an INFERNO false-colour
    preview (``<stem>_color.png``) are written there as well.

    Args:
        depth_anything_model (torch.nn.Module): Model invoked as ``model(image)``
            on a single-image batch.
        img_path (str): One of
            - a path to a single image file,
            - a path to a file ending in ``txt`` that lists one image path per line,
            - a directory; every entry not starting with ``'.'`` is processed
              in sorted order.
        out_dir (str): Directory receiving the ``.npy`` depth maps (created if missing).
        out_dir_vis (str | None): Directory receiving the PNG previews
            (created if missing), or ``None`` to skip the previews.

    Returns:
        None
    """
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    depth_anything_model = depth_anything_model.to(DEVICE).eval()

    total_params = sum(param.numel() for param in depth_anything_model.parameters())
    print('Total parameters: {:.2f}M'.format(total_params / 1e6))

    # Pre-processing: resize so both sides are at least 518 px (aspect ratio kept,
    # sides rounded to a multiple of 14), normalise with the given mean/std, then
    # convert to the network input layout.
    transform = Compose([
        Resize(
            width=518,
            height=518,
            resize_target=False,
            keep_aspect_ratio=True,
            ensure_multiple_of=14,
            resize_method='lower_bound',
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        PrepareForNet(),
    ])

    # Resolve img_path into a list of image files.
    if os.path.isfile(img_path):
        if img_path.endswith('txt'):
            with open(img_path, 'r') as f:
                filenames = f.read().splitlines()
        else:
            filenames = [img_path]
    else:
        filenames = sorted(
            os.path.join(img_path, entry)
            for entry in os.listdir(img_path)
            if not entry.startswith('.')
        )

    # Create the output folders once instead of re-checking them for every image.
    os.makedirs(out_dir, exist_ok=True)
    if out_dir_vis is not None:
        os.makedirs(out_dir_vis, exist_ok=True)

    for filename in tqdm(filenames):
        raw_image = cv2.imread(filename)
        if raw_image is None:
            # cv2.imread signals "missing or not an image" by returning None;
            # skip the entry instead of aborting the whole batch in cvtColor.
            tqdm.write(f"Skipping {filename}: could not be read as an image")
            continue
        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0

        h, w = image.shape[:2]

        image = transform({'image': image})['image']
        image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            depth = depth_anything_model(image)

        # Resize the prediction back to the original image resolution.
        # assumes the model output is (1, H, W) so that depth[None] is (1, 1, H, W) — TODO confirm
        depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
        depth = depth.cpu().numpy()

        # Save result. splitext keeps names without an extension intact
        # (slicing at rfind('.') == -1 would drop their last character).
        stem = os.path.splitext(os.path.basename(filename))[0]
        np.save(os.path.join(out_dir, stem + '.npy'), depth)

        # Save some visualizations
        if out_dir_vis is not None:
            depth_min = depth.min()
            depth_range = depth.max() - depth_min
            if depth_range > 0:
                depth_vis = ((depth - depth_min) / depth_range * 255.0).astype(np.uint8)
            else:
                # Constant prediction: avoid dividing by zero, emit a black preview.
                depth_vis = np.zeros(depth.shape, dtype=np.uint8)

            depth_gray = np.repeat(depth_vis[..., np.newaxis], 3, axis=-1)
            cv2.imwrite(os.path.join(out_dir_vis, stem + '_gray.png'), depth_gray)

            depth_false_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO)
            cv2.imwrite(os.path.join(out_dir_vis, stem + '_color.png'), depth_false_color)
    return

In [None]:
# Load the pretrained Depth Anything model with the Vision Transformer Large (vitl) encoder.
# The weights are identified by the 'LiheYoung/depth_anything_vitl14' name passed to from_pretrained.
depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14')

In [None]:
# Relative depth for the test split: reads the images in <scene>/test/rgb and writes
# .npy depth maps to test/depth_DA and PNG previews to test/depth_DA_visualization.
img_path = os.path.join(SCANNET_SCENE_DIR, 'test/rgb')
out_dir = os.path.join(SCANNET_SCENE_DIR, 'test/depth_DA')
out_dir_vis = os.path.join(SCANNET_SCENE_DIR, 'test/depth_DA_visualization')
estimate_depth(depth_anything, img_path, out_dir, out_dir_vis)

Total parameters: 335.32M


100%|██████████| 8/8 [06:16<00:00, 47.05s/it]


In [None]:
# Relative depth for the train split: reads the images in <scene>/train/rgb and writes
# .npy depth maps to train/depth_DA and PNG previews to train/depth_DA_visualization.
# Requires the import cell and the model-loading cell to have been run in this kernel.
img_path = os.path.join(SCANNET_SCENE_DIR, 'train/rgb')
out_dir = os.path.join(SCANNET_SCENE_DIR, 'train/depth_DA')
out_dir_vis = os.path.join(SCANNET_SCENE_DIR, 'train/depth_DA_visualization')
estimate_depth(depth_anything, img_path, out_dir, out_dir_vis)

NameError: name 'os' is not defined

## 2. Metric depth estimation using the fine-tuned Depth Anything model

In [None]:
# Install Depth Anything from GitHub to have the ZoeDepth requirements.
# The GitHub checkout is kept in its own Depth_Anything_GitHub folder, separate from
# the Hugging Face Space clone made earlier.
# -p keeps the cell re-runnable when the folder already exists.
%mkdir -p Depth_Anything_GitHub
%cd Depth_Anything_GitHub
!git clone https://github.com/LiheYoung/Depth-Anything
%cd Depth-Anything
!pip install -r requirements.txt --quiet

/bin/bash: -c: line 1: unexpected EOF while looking for matching ``'
/bin/bash: -c: line 2: syntax error: unexpected end of file
[Errno 2] No such file or directory: 'Depth_Anything_GitHub'
/content/Depth-Anything
Cloning into 'Depth-Anything'...
remote: Enumerating objects: 430, done.[K
remote: Counting objects: 100% (153/153), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 430 (delta 107), reused 47 (delta 41), pack-reused 277[K
Receiving objects: 100% (430/430), 237.89 MiB | 33.37 MiB/s, done.
Resolving deltas: 100% (150/150), done.
Updating files: 100% (219/219), done.
/content/Depth-Anything/Depth-Anything
[0m

In [None]:
# Enter the metric_depth folder of the GitHub checkout; the following cells use
# paths relative to it (environment.yml, ./checkpoints).
%cd metric_depth

/content/Depth-Anything/Depth-Anything/metric_depth


In [None]:
import os
import subprocess
import sys

import yaml

# Install the pip requirements listed in environment.yml into the kernel's environment.
with open("environment.yml") as file_handle:
    environment_data = yaml.safe_load(file_handle)

for dependency in environment_data["dependencies"]:
    # Only mapping entries are considered; their 'pip' list holds the requirement
    # strings to install. Mappings without a 'pip' key are skipped.
    if not isinstance(dependency, dict):
        continue
    for lib in dependency.get('pip') or []:
        # Pass the requirement as a single argv entry (no shell), so characters such as
        # '>' or '<' in version specifiers are not interpreted as redirections, and use
        # the kernel's own interpreter so the package lands in the right environment.
        result = subprocess.run([sys.executable, "-m", "pip", "install", str(lib)])
        if result.returncode != 0:
            print(f"pip install {lib} failed with exit code {result.returncode}")

In [None]:
# Copy Depth Anything checkpoints so that zoedepth can find them
# NOTE(review): if ./checkpoints already exists, `cp -r` places the copy inside it
# (./checkpoints/checkpoints) — confirm before re-running this cell.
%cp -r /content/Depth-Anything/checkpoints ./checkpoints

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Define Depth Anything model using Visual Transformer Large (vitl) fine tuned on NYU_v2
# Code based on script by @1ssb
# https://github.com/LiheYoung/Depth-Anything/issues/36

# import argparse
from tqdm import tqdm
import os, glob
import cv2
import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms import Compose
import torchvision.transforms as transforms
from zoedepth.models.builder import build_model
from zoedepth.utils.config import get_config

import matplotlib.pyplot as plt


def infer(model, image, dataset):
    """
    Performs model inference on a single image.

    The model is switched to evaluation mode and called without gradient
    tracking, so the returned prediction carries no autograd graph.

    Args:
        model (torch.nn.Module): The depth estimation model; called as
            ``model(image, dataset=dataset)``.
        image (torch.Tensor): The input image tensor.
        dataset (str): The name of the dataset being used.

    Returns:
        The model's prediction, exactly as returned by the model call.
    """
    model.eval()
    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        pred = model(image, dataset=dataset)
    return pred

def get_depth_from_prediction(pred):
    """
    Pulls the depth map out of whatever container the model returned.

    Args:
        pred (torch.Tensor | list | tuple | dict): Model prediction.
            - tensor: returned unchanged
            - list / tuple: the last element is returned
            - dict: the 'metric_depth' entry, falling back to 'out'
              (``None`` when neither key is present)

    Returns:
        torch.Tensor: Extracted depth map.

    Raises:
        TypeError: If ``pred`` is none of the supported container types.
    """
    if isinstance(pred, dict):
        fallback = pred.get('out')
        return pred.get('metric_depth', fallback)

    if isinstance(pred, (list, tuple)):
        return pred[-1]

    if isinstance(pred, torch.Tensor):
        return pred

    raise TypeError(f"Unknown output type {type(pred)}")

def process_image(model, image_path, out_dir, out_dir_vis, dataset):
    """
    Estimates depth for a single image and saves the depth map plus optional previews.

    The prediction is resized back to the input image's resolution and stored as
    ``<image stem>.npy`` (float32) in ``out_dir``. When ``out_dir_vis`` is given, a
    min-max normalised 8-bit grayscale preview (``<stem>_gray.png``) and an INFERNO
    false-colour preview (``<stem>_color.png``) are written there as well.

    Args:
        model (torch.nn.Module): The depth estimation model.
        image_path (str): Path to the image file.
        out_dir (str): Directory to save the predicted depth (created if missing).
        out_dir_vis (str | None): Directory to save the visualization (created if
            missing), or ``None`` to skip the previews.
        dataset (str): The name of the dataset being used.

    Raises:
        OSError: If the image cannot be read by ``cv2.imread``.
    """
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): width=392 / height=518 is the reverse of what the model's own
    # Resize transform logs (width 518, height 392) — confirm the orientation is intended.
    transform = Compose([
        Resize(
            width=392,
            height=518,
            resize_target=False,
            keep_aspect_ratio=True,
            ensure_multiple_of=14,
            resize_method='lower_bound',
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        # Mean/std normalisation is left disabled in this pipeline:
        #NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        PrepareForNet(),
    ])

    raw_image = cv2.imread(image_path)
    if raw_image is None:
        # cv2.imread returns None for missing/undecodable files; fail with a clear message.
        raise OSError(f"Could not read image file: {image_path}")
    image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
    h, w = image.shape[:2]

    image = transform({'image': image})['image']
    image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pred_dict = infer(model, image, dataset)
    pred = get_depth_from_prediction(pred_dict).squeeze(0)

    # Resize the prediction back to the original image resolution and keep THAT result
    # (previously the upsampled tensor was discarded and the network-resolution map saved).
    # assumes pred is (1, H, W) after squeeze(0) so pred[None] is (1, 1, H, W) — TODO confirm
    depth = F.interpolate(pred[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
    depth = depth.detach().cpu().numpy().astype(np.float32)

    min_depth, max_depth = np.min(depth[depth > 0]), np.max(depth)
    print(f"Processed {image_path}: Min Depth: {min_depth}, Max Depth: {max_depth}")

    # Save result. splitext keeps names without an extension intact
    # (slicing at rfind('.') == -1 would drop their last character).
    stem = os.path.splitext(os.path.basename(image_path))[0]
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, stem + '.npy'), depth)

    # Save some visualizations
    if out_dir_vis is not None:
        os.makedirs(out_dir_vis, exist_ok=True)

        depth_min = depth.min()
        depth_range = depth.max() - depth_min
        if depth_range > 0:
            depth_vis = ((depth - depth_min) / depth_range * 255.0).astype(np.uint8)
        else:
            # Constant prediction: avoid dividing by zero, emit a black preview.
            depth_vis = np.zeros(depth.shape, dtype=np.uint8)

        depth_gray = np.repeat(depth_vis[..., np.newaxis], 3, axis=-1)
        cv2.imwrite(os.path.join(out_dir_vis, stem + '_gray.png'), depth_gray)

        depth_false_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO)
        cv2.imwrite(os.path.join(out_dir_vis, stem + '_color.png'), depth_false_color)
    return

def main(config, input_dir, output_dir, out_dir_vis, dataset):
    """
    Builds the model and runs depth estimation on every image in a directory.

    Images are the ``*.png`` files followed by the ``*.jpg`` files found directly
    in ``input_dir``. A failure on one image is reported and does not stop the
    remaining images from being processed.

    Args:
        config (dict): Configuration for the model.
        input_dir (str): Directory containing input images.
        output_dir (str): Directory to save the predicted depth maps.
        out_dir_vis (str): Directory to save the visualizations.
        dataset (str): The name of the dataset being used.
    """
    model = build_model(config)
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(target_device)
    model.eval()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Gather inputs pattern by pattern: all PNGs first, then all JPGs.
    image_paths = []
    for pattern in ('*.png', '*.jpg'):
        image_paths.extend(glob.glob(os.path.join(input_dir, pattern)))

    if len(image_paths) == 0:
        print("No images found in the input directory.")
        return

    for image_path in tqdm(image_paths, desc="Processing Images"):
        try:
            process_image(model, image_path, output_dir, out_dir_vis, dataset)
        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(f"Error processing {image_path}: {e}")

def test_model(model_name, pretrained_resource, input_dir, output_dir, out_dir_vis, dataset):
    """
    Runs a named model over a directory of images.

    Fetches the "eval" configuration for ``model_name`` and ``dataset``, optionally
    overrides its pretrained weights, and hands everything to ``main``.

    Args:
        model_name (str): The name of the model.
        pretrained_resource (str): Pretrained weights resource; when empty or
            ``None`` the value from the configuration is kept.
        input_dir (str): Directory containing input images.
        output_dir (str): Directory to save the predicted depth maps.
        out_dir_vis (str): Directory to save the visualizations.
        dataset (str): The name of the dataset being used.
    """
    eval_config = get_config(model_name, "eval", dataset)

    # Only override the configured weights when a resource was actually supplied.
    if pretrained_resource:
        eval_config.pretrained_resource = pretrained_resource

    main(eval_config, input_dir, output_dir, out_dir_vis, dataset)

In [None]:
# Read camera parameters for the sequence
# Not used right now but might be interesting in the future
def read_camera_list(filename):
  """
  Reads the first camera entry from a text file in COLMAP format.

  Blank lines and lines starting with '#' are skipped. The first remaining line
  is split on whitespace into ``CAMERA_ID MODEL WIDTH HEIGHT PARAMS...`` and
  returned as a dictionary; any further camera lines are ignored.

  Args:
    filename: The path to the text file.

  Returns:
    A dictionary describing the first camera, or None if the file contains no
    camera line. Keys:
      - camera_id: The ID of the camera (string).
      - model: The camera model (string).
      - width: The image width (integer).
      - height: The image height (integer).
      - focal_length: First camera parameter (float).
      - central_point_x: Second camera parameter (float).
      - central_point_y: Third camera parameter (float).

  Raises:
    ValueError: If the camera line has fewer than four fields or a numeric
      field cannot be parsed.
    IndexError: If the camera line has fewer than three parameters.
  """
  with open(filename, "r") as f:
    for line in f:
      entry = line.strip()
      # Skip blank lines and comment/header lines (also when indented).
      if not entry or entry.startswith('#'):
        continue

      # Extract data
      camera_id, model, width, height, *params = entry.split()

      # Convert data types
      width = int(width)
      height = int(height)
      params = [float(p) for p in params]

      # Only the first camera is returned.
      return {
          "camera_id": camera_id,
          "model": model,
          "width": width,
          "height": height,
          "focal_length": params[0],
          "central_point_x": params[1],
          "central_point_y": params[2],
      }

  # No camera line found.
  return None

In [None]:
# Metric depth for the test split: reads the images in <scene>/test/rgb and writes
# .npy depth maps to test/metric_depth_DA_ft_NYUv2 and PNG previews to
# test/metric_depth_DA_ft_NYUv2_visualization, using the 'zoedepth' model with the
# indoor metric-depth checkpoint and the 'nyu' dataset configuration.
img_path = os.path.join(SCANNET_SCENE_DIR, 'test/rgb')
out_dir = os.path.join(SCANNET_SCENE_DIR, 'test/metric_depth_DA_ft_NYUv2')
out_dir_vis = os.path.join(SCANNET_SCENE_DIR, 'test/metric_depth_DA_ft_NYUv2_visualization')
# estimate_depth(depth_anything_fine_tuned_NYU_v2, img_path, out_dir, out_dir_vis)
test_model('zoedepth', 'local::/content/Depth-Anything/checkpoints_metric_depth/depth_anything_metric_depth_indoor.pt', img_path, out_dir, out_dir_vis, 'nyu')

Params passed to Resize transform:
	width:  518
	height:  392
	resize_target:  True
	keep_aspect_ratio:  False
	ensure_multiple_of:  14
	resize_method:  minimal
Using pretrained resource local::/content/Depth-Anything/checkpoints_metric_depth/depth_anything_metric_depth_indoor.pt
Loaded successfully


Processing Images:  12%|█▎        | 1/8 [00:22<02:35, 22.22s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/417.jpg: Min Depth: 0.667252242565155, Max Depth: 5.105383396148682


Processing Images:  25%|██▌       | 2/8 [00:48<02:29, 24.87s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/853.jpg: Min Depth: 0.9710784554481506, Max Depth: 2.736940622329712


Processing Images:  38%|███▊      | 3/8 [01:09<01:54, 22.88s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/200.jpg: Min Depth: 0.7274956703186035, Max Depth: 2.606996536254883


Processing Images:  50%|█████     | 4/8 [01:29<01:26, 21.66s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/965.jpg: Min Depth: 1.1321760416030884, Max Depth: 4.995934963226318


Processing Images:  62%|██████▎   | 5/8 [01:48<01:02, 20.76s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/714.jpg: Min Depth: 0.6265469789505005, Max Depth: 5.582090854644775


Processing Images:  75%|███████▌  | 6/8 [02:09<00:41, 20.89s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/12.jpg: Min Depth: 0.8831966519355774, Max Depth: 3.143127679824829


Processing Images:  88%|████████▊ | 7/8 [02:28<00:20, 20.37s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/211.jpg: Min Depth: 0.7460336685180664, Max Depth: 2.4937057495117188


Processing Images: 100%|██████████| 8/8 [02:48<00:00, 21.01s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/test/rgb/258.jpg: Min Depth: 0.8508521318435669, Max Depth: 3.3603603839874268





In [None]:
# Metric depth for the train split: reads the images in <scene>/train/rgb and writes
# .npy depth maps to train/metric_depth_DA_ft_NYUv2 and PNG previews to
# train/metric_depth_DA_ft_NYUv2_visualization, with the same model and checkpoint
# arguments as the test-split cell.
img_path = os.path.join(SCANNET_SCENE_DIR, 'train/rgb')
out_dir = os.path.join(SCANNET_SCENE_DIR, 'train/metric_depth_DA_ft_NYUv2')
out_dir_vis = os.path.join(SCANNET_SCENE_DIR, 'train/metric_depth_DA_ft_NYUv2_visualization')
test_model('zoedepth', 'local::/content/Depth-Anything/checkpoints_metric_depth/depth_anything_metric_depth_indoor.pt', img_path, out_dir, out_dir_vis, 'nyu')

Params passed to Resize transform:
	width:  518
	height:  392
	resize_target:  True
	keep_aspect_ratio:  False
	ensure_multiple_of:  14
	resize_method:  minimal
Using pretrained resource local::/content/Depth-Anything/checkpoints_metric_depth/depth_anything_metric_depth_indoor.pt
Loaded successfully


Processing Images:   0%|          | 0/18 [00:00<?, ?it/s]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/333.jpg: Min Depth: 0.793071448802948, Max Depth: 6.502934455871582


Processing Images:  11%|█         | 2/18 [00:42<05:39, 21.23s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/229.jpg: Min Depth: 0.8292563557624817, Max Depth: 2.997804641723633


Processing Images:  17%|█▋        | 3/18 [01:04<05:17, 21.16s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/86.jpg: Min Depth: 0.8270928263664246, Max Depth: 3.2769672870635986


Processing Images:  22%|██▏       | 4/18 [01:23<04:47, 20.50s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/347.jpg: Min Depth: 0.752595841884613, Max Depth: 6.0402750968933105


Processing Images:  28%|██▊       | 5/18 [01:46<04:39, 21.52s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/149.jpg: Min Depth: 0.7656700015068054, Max Depth: 3.0674631595611572


Processing Images:  33%|███▎      | 6/18 [02:06<04:11, 20.97s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/639.jpg: Min Depth: 1.3340481519699097, Max Depth: 4.455844879150391


Processing Images:  39%|███▉      | 7/18 [02:26<03:46, 20.56s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/677.jpg: Min Depth: 0.6701034903526306, Max Depth: 6.152810573577881


Processing Images:  44%|████▍     | 8/18 [02:47<03:27, 20.78s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/794.jpg: Min Depth: 0.8746646046638489, Max Depth: 5.774078845977783


Processing Images:  50%|█████     | 9/18 [03:07<03:04, 20.52s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/1030.jpg: Min Depth: 0.9427445530891418, Max Depth: 4.111330032348633


Processing Images:  56%|█████▌    | 10/18 [03:29<02:46, 20.77s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/280.jpg: Min Depth: 0.7710055708885193, Max Depth: 5.116616249084473


Processing Images:  61%|██████    | 11/18 [03:49<02:24, 20.61s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/320.jpg: Min Depth: 0.7512742280960083, Max Depth: 6.040557861328125


Processing Images:  67%|██████▋   | 12/18 [04:09<02:02, 20.41s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/951.jpg: Min Depth: 1.2117100954055786, Max Depth: 5.221408367156982


Processing Images:  72%|███████▏  | 13/18 [04:31<01:45, 21.06s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/448.jpg: Min Depth: 0.7787032723426819, Max Depth: 4.8587541580200195


Processing Images:  78%|███████▊  | 14/18 [04:51<01:22, 20.67s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/930.jpg: Min Depth: 1.1113507747650146, Max Depth: 4.527143955230713


Processing Images:  83%|████████▎ | 15/18 [05:12<01:02, 20.85s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/759.jpg: Min Depth: 0.6677700877189636, Max Depth: 6.23529052734375


Processing Images:  89%|████████▉ | 16/18 [05:32<00:41, 20.57s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/610.jpg: Min Depth: 1.3050090074539185, Max Depth: 3.6760177612304688


Processing Images:  94%|█████████▍| 17/18 [05:53<00:20, 20.65s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/979.jpg: Min Depth: 0.9364773631095886, Max Depth: 4.350281715393066


Processing Images: 100%|██████████| 18/18 [06:13<00:00, 20.75s/it]

Processed /content/drive/MyDrive/3d-machine-learning/scannet/scene0708_00/train/rgb/890.jpg: Min Depth: 0.9114779233932495, Max Depth: 2.5443437099456787



