### Setup Depth-Anything-V2

In [None]:
!git clone https://github.com/DepthAnything/Depth-Anything-V2.git
%cd Depth-Anything-V2
!pip install -q -r requirements.txt

### Setup the dataset

In [None]:
# Identifier of the scene archive to train on; it is interpolated below into
# the archive name ("<SCENE>-nerf.zip") and into the extraction folder.
# NOTE(review): colab_train.py hard-codes "0097" in its checkpoint path, so
# changing SCENE here does not change where checkpoints are written.
SCENE = "0097"

# Extract the archive from Google Drive into /content/<SCENE>.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount
# cell is visible in this part of the notebook; confirm.
!unzip -q /content/drive/MyDrive/thesis/Datasets/{SCENE}-nerf.zip -d /content/{SCENE}

In [None]:
import os
import shutil
import glob
import random

In [None]:
# Where the archive was extracted and where the flattened training data goes.
SRC_DIR = f"/content/{SCENE}"
OUT_DIR = "/content/data"

# Start from an empty output directory so files from a previous run cannot
# leak into the new split (pure-Python replacement for `rm -rf`).
shutil.rmtree(OUT_DIR, ignore_errors=True)

for subdir in ("images", "depths"):
    os.makedirs(os.path.join(OUT_DIR, subdir), exist_ok=True)

In [None]:
image_paths = sorted(glob.glob(f"{SRC_DIR}/images/*.png"))
depth_paths = sorted(glob.glob(f"{SRC_DIR}/depths/*.npy"))

print(f"Found {len(image_paths)} images and {len(depth_paths)} depths.")

# Index every file by the base name that is written to the split lists:
# the file name with its "color.png" / "depth.npy" tail removed.
images_by_base = {os.path.basename(p).replace("color.png", ""): p for p in image_paths}
depths_by_base = {os.path.basename(p).replace("depth.npy", ""): p for p in depth_paths}

# Keep only samples that have BOTH an image and a depth map, so train.txt and
# val.txt never list a sample whose counterpart file is missing.
used = sorted(images_by_base.keys() & depths_by_base.keys())
n_unpaired = len(images_by_base.keys() ^ depths_by_base.keys())
if n_unpaired:
    print(f"Skipping {n_unpaired} files without a matching image/depth counterpart.")

# Copy each pair into the flat OUT_DIR/images and OUT_DIR/depths folders,
# keeping the original file names.
for base in used:
    for subdir, src in (("images", images_by_base[base]), ("depths", depths_by_base[base])):
        shutil.copy(src, os.path.join(OUT_DIR, subdir, os.path.basename(src)))

# Deterministic 90/10 train/val split (fixed seed, sorted input).
random.seed(42)
random.shuffle(used)
split = int(0.9 * len(used))

for list_name, names in (("train.txt", used[:split]), ("val.txt", used[split:])):
    with open(os.path.join(OUT_DIR, list_name), "w") as f:
        f.writelines(name + "\n" for name in names)

print(f"Copied data to {OUT_DIR} with {split} train and {len(used)-split} val samples.")

* Create the `CustomDepthDataset` class file (`custom.py`) in the `metric_depth/dataset` dir

In [None]:
%%bash
cat << 'EOF' > /content/Depth-Anything-V2/metric_depth/dataset/custom.py
import os
import cv2
import torch
import numpy as np
from torch.utils.data import Dataset
from torchvision.transforms import Compose
from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop
import logging

class CustomDepthDataset(Dataset):
    """Paired image / depth dataset read from ``<root>/images`` and ``<root>/depths``.

    Each non-empty line of ``names_file`` is a sample name (a trailing '.' is
    stripped). The sample's files are ``images/<name>.color.png`` and
    ``depths/<name>.depth.npy``; names with a missing file are skipped with a
    warning.

    Args:
        names_file: text file listing one sample name per line.
        root_dir: directory containing the ``images`` and ``depths`` folders.
        mode: ``'train'`` also resizes the depth target and applies a crop;
            any other value leaves the target size untouched.
        size: ``(width, height)`` handed to the resize transform.
        near, far: the stored depth ``d`` is mapped to ``d * (far - near) + near``.
            NOTE(review): this presumes the .npy files hold depth normalised
            to [0, 1] - confirm against the data export.
    """

    def __init__(self, names_file, root_dir, mode='train', size=(518,518),
                 near=0.1, far=20.0):
        self.mode = mode
        self.size = size
        self.root = root_dir.rstrip('/')
        self.near = near
        self.far = far

        # Read the sample names, dropping blank lines and a trailing '.'.
        with open(names_file, 'r') as handle:
            stems = [line.strip().rstrip('.').strip() for line in handle if line.strip()]

        img_suffix = '.color.png'
        depth_suffix = '.depth.npy'

        # Build valid sample list: keep a name only when both files exist.
        self.names = []
        for stem in stems:
            img_path = os.path.join(self.root, 'images', stem + img_suffix)
            depth_path = os.path.join(self.root, 'depths', stem + depth_suffix)
            if not os.path.isfile(img_path):
                logging.warning(f"Image not found: {img_path}, skipping.")
            elif not os.path.isfile(depth_path):
                logging.warning(f"Depth not found: {depth_path}, skipping.")
            else:
                self.names.append((stem, img_path, depth_path))

        net_w, net_h = size
        is_train = (mode == 'train')

        def _identity(sample):
            # Used in place of the crop outside of training.
            return sample

        pipeline = [
            Resize(
                width=net_w, height=net_h,
                resize_target=is_train,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,
                resize_method='lower_bound',
                image_interpolation_method=cv2.INTER_CUBIC
            ),
            NormalizeImage(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
            PrepareForNet(),
        ]
        pipeline.append(Crop(net_w) if is_train else _identity)
        self.transform = Compose(pipeline)

    def __len__(self):
        """Number of samples whose image and depth files were both found."""
        return len(self.names)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return the transformed dict.

        The dict holds ``'image'`` and ``'depth'`` tensors plus a boolean
        ``'valid_mask'`` marking finite depth values; non-finite depth entries
        are overwritten with 0.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        _, img_path, depth_path = self.names[idx]

        bgr = cv2.imread(img_path, cv2.IMREAD_COLOR)
        if bgr is None:
            raise FileNotFoundError(f"Failed to load image '{img_path}'")
        # Reverse the channel axis (BGR -> RGB) and scale to [0, 1].
        rgb = bgr[..., ::-1].astype(np.float32) / 255.0

        stored_depth = np.load(depth_path).astype(np.float32)
        scaled_depth = stored_depth * (self.far - self.near) + self.near

        sample = self.transform({'image': rgb, 'depth': scaled_depth})
        for key in ('image', 'depth'):
            sample[key] = torch.from_numpy(sample[key])

        finite = torch.isfinite(sample['depth'])
        sample['valid_mask'] = finite
        sample['depth'][~finite] = 0

        return sample
EOF

### Training Script

The script is written to `metric_depth/colab_train.py` so that it is easier to run in Colab,
because the `Depth-Anything-V2` repository cannot be installed as a package and its modules therefore cannot be imported from outside the repository.

In [None]:
%%bash
cat << 'EOF' > /content/Depth-Anything-V2/metric_depth/colab_train.py
import os, sys, random, logging, glob, re, json
import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F

from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter

from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop
from dataset.custom import CustomDepthDataset

from depth_anything_v2.dpt import DepthAnythingV2
from util.loss import SiLogLoss
from util.metric import eval_depth
from util.utils import init_log

# Configuration
# All tunable hyper-parameters of the run live in this one mapping.
cfg = dict(
    # Network input size and the depth range used for clamping / masking;
    # min_depth / max_depth are also passed to the dataset as near / far.
    img_size=518,
    min_depth=0.1,
    max_depth=10.0,

    # Optimisation schedule
    epochs=210,
    bs=2,
    lr=1e-6,  # encoder LR; decoder LR will *10

    # Backbone variant (key into the model table; also part of checkpoint names)
    encoder="vitl",

    # Keep the encoder frozen for the first `freeze_epochs` epochs
    freeze_encoder=True,
    freeze_epochs=10,

    # Warmup length in epochs, scheduler type, AdamW weight decay
    lr_warmup=20,
    lr_scheduler="cosine",
    weight_decay=0.00001,
)

# Paths
# Local data / log locations inside the Colab VM.
root_dir = "/content/data"
train_list, val_list = (
    os.path.join(root_dir, list_file) for list_file in ("train.txt", "val.txt")
)
save_path = "/content/runs"

# Checkpoints, pretrained weights and per-epoch histories live on Google Drive.
ckpt_path       = f"/content/drive/MyDrive/thesis/Depth_Anything_V2/Checkpoints_Metrics/0097_{cfg['encoder']}"
pretrained_path = f"/content/drive/MyDrive/thesis/Depth_Anything_V2/Checkpoints_Metrics/depth_anything_v2_{cfg['encoder']}.pth"
metrics_json, loss_json_path = (
    os.path.join(ckpt_path, f"metrics_{cfg['encoder']}.json"),
    os.path.join(ckpt_path, f"losses_{cfg['encoder']}.json"),
)

# Make sure both output directories exist before anything is written.
for _required_dir in (ckpt_path, save_path):
    os.makedirs(_required_dir, exist_ok=True)

# Setup
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
cudnn.benchmark = True


# Load or initialize metrics history
def _load_history(json_path):
    """Return the JSON content of ``json_path``, or an empty dict if the file does not exist."""
    if not os.path.exists(json_path):
        return {}
    with open(json_path, 'r') as handle:
        return json.load(handle)


# Per-epoch validation metrics and per-batch training losses from earlier runs.
metrics_history = _load_history(metrics_json)
loss_history = _load_history(loss_json_path)

# Resume or load pretrained
def _epoch_from_path(p):
    """Epoch number encoded in a checkpoint file name, or -1 if it has none."""
    m = re.search(r"epoch_(\d+)\.pth$", os.path.basename(p))
    return int(m.group(1)) if m else -1


# Periodic checkpoints of this encoder, ordered by the epoch in their name.
pattern = os.path.join(ckpt_path, f"checkpoint_{cfg['encoder']}_epoch_*.pth")
ckpt_files = sorted(glob.glob(pattern), key=_epoch_from_path)

if not ckpt_files:
    # Fresh run: start from the pretrained weights. The file may be a bare
    # state dict or a dict wrapping one under 'state_dict'.
    ckpt = torch.load(pretrained_path, map_location=device)
    start_epoch = 0
    best_d1     = ckpt.get('best_d1', 0.0)
    model_state = ckpt.get('state_dict', ckpt)
    optim_state = None
    scheduler_state = None
    print(f"Loaded pretrained weights from {pretrained_path}")
else:
    # Continue from the checkpoint with the highest epoch number.
    latest_ckpt = ckpt_files[-1]
    ckpt = torch.load(latest_ckpt, map_location=device)
    start_epoch = ckpt['epoch'] + 1
    best_d1      = ckpt.get('best_d1', 0.0)
    model_state  = ckpt['state_dict']
    optim_state  = ckpt['optimizer']
    scheduler_state = ckpt['scheduler']
    print(f"Resumed from {latest_ckpt} → starting epoch {start_epoch}, best_d1={best_d1:.3f}")

# Logger and TensorBoard
logger = init_log("global", logging.INFO)
logger.propagate = 0
writer = SummaryWriter(save_path)

# Data loaders
# Both splits share the input size and the near/far depth range from cfg.
_dataset_kwargs = dict(
    size=(cfg["img_size"], cfg["img_size"]),
    near=cfg["min_depth"],
    far=cfg["max_depth"],
)
train_ds = CustomDepthDataset(train_list, root_dir, mode="train", **_dataset_kwargs)
val_ds = CustomDepthDataset(val_list, root_dir, mode="val", **_dataset_kwargs)

# Training batches are shuffled; validation runs one sample at a time, in order.
_loader_kwargs = dict(num_workers=2, pin_memory=True, drop_last=True)
train_loader = DataLoader(train_ds, batch_size=cfg["bs"], shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, **_loader_kwargs)

# Model
# Architecture hyper-parameters for each supported encoder size.
model_cfgs = {
    "vits": {"encoder": "vits", "features": 64,  "out_channels": [48, 96, 192, 384]},
    "vitb": {"encoder": "vitb", "features": 128, "out_channels": [96, 192, 384, 768]},
    "vitl": {"encoder": "vitl", "features": 256, "out_channels": [256, 512, 1024, 1024]},
    "vitg": {"encoder": "vitg", "features": 384, "out_channels": [1536, 1536, 1536, 1536]},
}

# Build the network for the configured encoder and load the weights selected
# earlier (resumed checkpoint or pretrained file).
_arch_kwargs = dict(model_cfgs[cfg["encoder"]], max_depth=cfg["max_depth"])
model = DepthAnythingV2(**_arch_kwargs).to(device)
model.load_state_dict(model_state)

criterion = SiLogLoss().to(device)

# Optimizer and scheduler setup
# Two parameter groups: encoder weights (names containing "pretrained") use the
# configured LR, every other parameter uses 10x that LR.
optimizer = AdamW([
    {"params":[p for n,p in model.named_parameters() if "pretrained" in n],     "lr":cfg["lr"]},
    {"params":[p for n,p in model.named_parameters() if "pretrained" not in n], "lr":cfg["lr"]*10}
], lr=cfg["lr"], betas=(0.9,0.999), weight_decay=cfg.get("weight_decay",0.01))

# Record the configured (peak) learning rates BEFORE restoring a checkpoint.
# load_state_dict() overwrites each group's 'lr' with the value it had when the
# checkpoint was written (partially warmed up or already decayed); using that
# as the warmup target would scale the whole remaining schedule down on resume.
base_lrs = [g["lr"] for g in optimizer.param_groups]

if optim_state is not None:
    optimizer.load_state_dict(optim_state)

# Warmup and cosine
# Both lengths are measured in optimizer steps (batches), not epochs.
total_steps = cfg["epochs"] * len(train_loader)
warmup_steps = cfg.get("lr_warmup", 0) * len(train_loader)
scheduler = None
if cfg.get("lr_scheduler") == "cosine":
    # The cosine decay covers the steps that remain after the warmup phase.
    scheduler = CosineAnnealingLR(optimizer, T_max=total_steps - warmup_steps)
    if scheduler_state is not None:
        scheduler.load_state_dict(scheduler_state)

# Training loop
def _checkpoint_state(epoch_idx):
    """Collect the state needed to resume training after 0-based epoch ``epoch_idx``."""
    return {
        'epoch': epoch_idx,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'best_d1': best_d1,
        # A scheduler only exists for lr_scheduler == "cosine"; store None otherwise.
        'scheduler': scheduler.state_dict() if scheduler is not None else None,
    }


for epoch in range(start_epoch, cfg["epochs"]):
    # Keep the encoder (parameters named "pretrained") frozen during the first
    # `freeze_epochs` epochs and make it trainable afterwards.
    if cfg.get("freeze_encoder"):
        encoder_trainable = epoch >= cfg.get("freeze_epochs", 0)
        for name, param in model.named_parameters():
            if "pretrained" in name:
                param.requires_grad = encoder_trainable

    # Train
    model.train()
    train_bar = tqdm(train_loader, desc=f"[{epoch+1}/{cfg['epochs']}] train", ncols=80, miniters=100, unit="it")
    batch_losses = []

    for i, sample in enumerate(train_bar):
        step = epoch * len(train_loader) + i

        # LR warmup: set the linearly ramped LR BEFORE this step's update, so
        # that the very first update does not run at the full learning rate.
        if step < warmup_steps:
            for j, group in enumerate(optimizer.param_groups):
                group['lr'] = base_lrs[j] * float(step) / float(warmup_steps)

        img, depth, vm = sample["image"].to(device), sample["depth"].to(device), sample["valid_mask"].to(device)
        # Random horizontal flip, applied consistently to image, depth and mask.
        if random.random() < 0.5:
            img, depth, vm = img.flip(-1), depth.flip(-1), vm.flip(-1)
        pred = model(img)
        pred = torch.clamp(pred, cfg["min_depth"], cfg["max_depth"])
        # Supervise only valid pixels whose depth lies inside the configured range.
        mask = (vm==1) & (depth>=cfg["min_depth"]) & (depth<=cfg["max_depth"])
        loss = criterion(pred, depth, mask)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_value = loss.item()
        batch_losses.append(loss_value)

        # After warmup the (optional) scheduler takes over, one step per batch.
        if step >= warmup_steps and scheduler is not None:
            scheduler.step()

        lr_now = optimizer.param_groups[0]['lr']
        writer.add_scalar("train/loss", loss_value, step)
        if i % 100 == 0:
            train_bar.set_postfix(loss=f"{loss_value:.3f}", lr=f"{lr_now:.2e}")

    loss_history[f"epoch_{epoch+1}"] = batch_losses

    # Validation
    model.eval()
    names   = ["d1","d2","d3","abs_rel","sq_rel","rmse","rmse_log","log10","silog"]
    metrics = torch.zeros(len(names), device=device)
    val_loss_total = 0.0
    count   = 0
    val_bar = tqdm(val_loader, desc=f"[{epoch+1}/{cfg['epochs']}] val", ncols=80, miniters=20, unit="it")
    with torch.no_grad():
        for sample in val_bar:
            img   = sample["image"].to(device).float()
            depth = sample["depth"][0].to(device)
            vm    = sample["valid_mask"][0].to(device)
            p     = model(img)
            p     = torch.clamp(p, cfg["min_depth"], cfg["max_depth"])
            # Bring the prediction to the resolution of the depth target.
            p     = F.interpolate(p[:,None], depth.shape[-2:], mode="bilinear", align_corners=True)[0,0]
            m     = (vm==1) & (depth>=cfg["min_depth"]) & (depth<=cfg["max_depth"])
            # Skip samples with too few valid pixels; the bar advances by itself
            # because it is being iterated, so no manual update() is needed.
            if m.sum() < 10:
                continue
            res   = eval_depth(p[m], depth[m])
            loss_eval = criterion(p[None], depth[None], m[None])
            val_loss_total += loss_eval.item()
            metrics += torch.tensor([res[n] for n in names], device=device)
            count   += 1
            val_bar.set_postfix(d1=f"{res['d1']:.3f}")

    d1_now = None
    if count > 0:
        avg = metrics / count
        val_loss_avg = val_loss_total / count
        logger.info("Validation ▶ " + ", ".join(f"{n}:{avg[i].item():.3f}" for i,n in enumerate(names)) + f" | loss={val_loss_avg:.3f}")
        for i,n in enumerate(names):
            writer.add_scalar(f"val/{n}", avg[i].item(), epoch)

        key = f"{cfg['encoder']}_epoch_{epoch+1}"
        metrics_history[key] = {name: avg[i].item() for i,name in enumerate(names)}
        metrics_history[key]["avg_loss"] = val_loss_avg
        d1_now = avg[0].item()
    else:
        # Every validation sample was skipped: averaging would divide by zero,
        # so no metrics are recorded for this epoch.
        logger.warning("Validation produced no usable samples; metrics for this epoch are skipped.")

    # Checkpointing
    # Periodic checkpoint every 5 epochs, plus one after the final epoch
    # (`epoch` is 0-based, so the last epoch is epoch + 1 == cfg["epochs"]).
    if (epoch + 1) % 5 == 0 or epoch + 1 == cfg["epochs"]:
        ckpt_name = f"checkpoint_{cfg['encoder']}_epoch_{epoch+1:03d}.pth"
        torch.save(_checkpoint_state(epoch), os.path.join(ckpt_path, ckpt_name))

    # Keep a single "best" checkpoint, chosen by the d1 validation metric.
    # float() accepts both a plain number and a 0-dim tensor restored from an
    # older checkpoint; from now on best_d1 is stored as a plain float.
    if d1_now is not None and d1_now > float(best_d1):
        for fpath in glob.glob(os.path.join(ckpt_path, f"best_{cfg['encoder']}_epoch_*.pth")):
            os.remove(fpath)
        best_d1 = d1_now
        best_ckpt = f"best_{cfg['encoder']}_epoch_{epoch+1:03d}.pth"
        torch.save(_checkpoint_state(epoch), os.path.join(ckpt_path, best_ckpt))

    # Persist both histories after every epoch so an interrupted run loses nothing.
    with open(metrics_json, 'w') as f:
        json.dump(metrics_history, f, indent=2)

    with open(loss_json_path, 'w') as f:
        json.dump(loss_history, f, indent=2)

# Flush pending TensorBoard events and release the event file.
writer.close()
EOF

In [None]:
# Remove the TensorBoard log directory left by a previous run
# (colab_train.py points its SummaryWriter at /content/runs).
!rm -rf /content/runs

In [None]:
# Launch training. The script resumes from the newest periodic checkpoint in
# its checkpoint folder when one exists; otherwise it starts from the
# pretrained weights.
!python /content/Depth-Anything-V2/metric_depth/colab_train.py