In [1]:
import os
import glob
import cv2
import numpy as np

import util.io

import torch
import torch.optim as optim
from torchvision.transforms import Compose

from dpt.models import DPTDepthModel
from dpt.midas_net import MidasNet_large
from dpt.transforms import Resize, NormalizeImage, PrepareForNet

In [2]:
torch.manual_seed(0)

# k8s paths
k8s = False
k8s_repo = r'opt/repo/dynamic-inference'
k8s_pvc = r'../../christh9-pvc'

# path settings
input_path = 'input'
output_path = 'output_monodepth'
model_path = 'weights/dpt_hybrid_nyu-2ce69ec7.pt'

if k8s:
    input_path = os.path.join(k8s_repo, input_path)
    output_path = os.path.join(k8s_repo, output_path)
    model_path = os.path.join(k8s_pvc, 'dpt-hybrid-nyu.pt')
#     script_output = os.path.join(k8s_pvc, 'dpt-timings', f'runtimes-{device_name}.csv')

In [3]:
# select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: %s" % device)

model_type = 'dpt_hybrid_nyu'

# load network
if model_type == "dpt_large":  # DPT-Large
    net_w = net_h = 384
    model = DPTDepthModel(
        path=model_path,
        backbone="vitl16_384",
        non_negative=True,
        enable_attention_hooks=False,
    )
    normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
elif model_type == "dpt_hybrid":  # DPT-Hybrid
    net_w = net_h = 384
    model = DPTDepthModel(
        path=model_path,
        backbone="vitb_rn50_384",
        non_negative=True,
        enable_attention_hooks=False,
    )
    normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
elif model_type == "dpt_hybrid_kitti":
    net_w = 1216
    net_h = 352

    model = DPTDepthModel(
        path=model_path,
        scale=0.00006016,
        shift=0.00579,
        invert=True,
        backbone="vitb_rn50_384",
        non_negative=True,
        enable_attention_hooks=False,
    )

    normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
elif model_type == "dpt_hybrid_nyu":
    net_w = 640
    net_h = 480

    model = DPTDepthModel(
        path=model_path,
        scale=0.000305,
        shift=0.1378,
        invert=True,
        backbone="vitb_rn50_384",
        non_negative=True,
        enable_attention_hooks=False,
    )

    normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
elif model_type == "midas_v21":  # Convolutional model
    net_w = net_h = 384

    model = MidasNet_large(model_path, non_negative=True)
    normalization = NormalizeImage(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )
else:
    assert (
        False
    ), f"model_type '{model_type}' not implemented, use: --model_type [dpt_large|dpt_hybrid|dpt_hybrid_kitti|dpt_hybrid_nyu|midas_v21]"
    
transform = Compose(
    [
        Resize(
            net_w,
            net_h,
            resize_target=None,
            keep_aspect_ratio=True,
            ensure_multiple_of=32,
            resize_method="minimal",
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        normalization,
        PrepareForNet(),
    ]
)

model.to(device)

device: cpu


DPTDepthModel(
  (pretrained): Module(
    (model): VisionTransformer(
      (patch_embed): HybridEmbed(
        (backbone): ResNetV2(
          (stem): Sequential(
            (conv): StdConv2dSame(3, 64, kernel_size=(7, 7), stride=(2, 2), bias=False)
            (norm): GroupNormAct(
              32, 64, eps=1e-05, affine=True
              (act): ReLU(inplace=True)
            )
            (pool): MaxPool2dSame(kernel_size=(3, 3), stride=(2, 2), padding=(0, 0), dilation=(1, 1), ceil_mode=True)
          )
          (stages): Sequential(
            (0): ResNetStage(
              (blocks): Sequential(
                (0): Bottleneck(
                  (downsample): DownsampleConv(
                    (conv): StdConv2dSame(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                    (norm): GroupNormAct(
                      32, 256, eps=1e-05, affine=True
                      (act): Identity()
                    )
                  )
                  (conv1): St

In [12]:
import pytorch_lightning as pl

class InteriorNetDPT(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = DPTDepthModel(
                        path=model_path,
#                         scale=s,
#                         shift=t,
                        scale=0.000305,
                        shift=0.1378,
                        invert=True,
                        backbone="vitb_rn50_384",
                        non_negative=True,
                        enable_attention_hooks=False,
                     )
    
    def forward(self):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch['image'], batch['depth']
        yhat = self.model(x)
        loss = SILog(yhat, 1/y)
        self.log('train_loss', loss, on_epoch=True)
        
        metrics = get_metrics(yhat.detach(), 1/y.detach())
        self.log('absrel', metrics[0], on_epoch=True)
        self.log('delta_acc', metrics[1], on_epoch=True)
        self.log('mae', metrics[2], on_epoch=True)
        return loss
    
    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-5)


model = InteriorNetDPT()
model.model.pretrained.model.patch_embed.requires_grad = False

In [4]:
# figure out how to implement this in training, don't use this for now

# if optimize == True and device == torch.device("cuda"):
#     model = model.to(memory_format=torch.channels_last)
#     model = model.half()

In [10]:
import re
import itertools
from torch.utils.data import Dataset, DataLoader

from video_inference_common.video_inference.datasets import interiornet

def getlines(files: [str], subsample):
    ''' helper function to get stripped lines from multiple files'''
    names = []
    # todo: delete after verification
    if not subsample:
        for f in files:
            names.append(map(lambda x: x.strip(), open(f).readlines()))
        return list(itertools.chain.from_iterable(names))

    else:
        for f in files:
            names.append(open(f).readline().strip())
            break
            
        return names

class InteriorNetDataset(Dataset):
    def __init__(self, dataset_path: str, train=True, transform=None, subsample=False):
        '''
        dataset_path: path to the folder containing the txts that specify dataset
                      (relative to ./dynamic-inference)
        train: specify to use the training or test split
        transform: optional transform to be applied per sample
        subsample: take a subsample of all the training data
        '''
        subsets = re.compile(f'.*?({"train" if train else "test"}).*?')
        video_names = map(lambda p: os.path.join(dataset_path, p), 
                          filter(subsets.match, os.listdir(dataset_path)))
        self.videos = np.array(getlines(video_names, subsample))
        self.transform = transform
        self.path = dataset_path
        
    def __len__(self):
        return 1000 * len(self.videos) # each video is 1000 frames
    
    def __getitem__(self, idx):
        
        # idx will come as video_index * frame_index
        img_name = self.videos[idx // 1000]
        frame_idx = idx % 1000
        
        im = interiornet.read_rgb(img_name, frame_idx)
        depth = interiornet.read_depth(img_name, frame_idx)
        
        if self.transform:
            im = self.transform({'image': im})['image']
        
        return {'image': im, 'depth': depth}

In [5]:
def SILogLoss(yhat, y, L = 1):
    '''
    yhat: prediction
    y: ground truth
    L: λ in the paper, [0,1]. L=0 gives elementwise L2 loss, 
       L=1 gives scale-invariant loss.
    https://arxiv.org/pdf/1406.2283.pdf
    '''
    idx = ~torch.isnan(y)
    di = torch.log(yhat[idx]) - torch.log(y[idx])
    
    return (di**2).mean() - L * di.mean() ** 2

In [6]:
def get_metrics(yhat, y, metrics=['absrel', 'delta', 'mae']):
    
    yhat.detach()
    y.detach()
    
    values = []
    idx = ~torch.isnan(y)
    
    if 'absrel' in metrics:
        values.append((torch.abs(y[idx] - yhat[idx]) / yhat[idx]).mean())
    if 'delta' in metrics:
        values.append(0)
    if 'mae' in metrics:
        values.append((torch.abs(y[idx] - yhat[idx])).mean())
        
    return np.array(values)

In [7]:
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [8]:
dataset_path = 'video_inference_common/resources'
if k8s:
    dataset_path = os.path.join(k8s_repo, dataset_path)
    
batch_size = 1

interiornet_dataset = InteriorNetDataset(dataset_path, transform=transform, subsample=True)
dataloader = DataLoader(interiornet_dataset, 
                        batch_size=batch_size, 
                        shuffle=True, 
                        num_workers=0)

In [13]:
# get shifted statistics
full_dataset = InteriorNetDataset(dataset_path, transform=transform)
d = []
p = 0.1
for i in np.random.choice(len(full_dataset), size=round(p*len(full_dataset)), replace=False):
    d.append(full_dataset[i]['depth'].flatten())

d = np.array(d)
t = np.median(d)
s = (d - t).mean()

s: 0.07850567
t: 2.8652542


In [14]:
num_steps = 1
losses = []
metrics = []
lr = 1e-5

optimizer = optim.Adam(model.parameters(), lr)

for step in range(num_steps):
    running_loss = 0.0
    for sample in dataloader:
        x, y = sample['image'], sample['depth']
        x.to(device)
        y.to(device)
        yhat = model(x)
        loss = SILogLoss((yhat - t) / s, y)
        loss.backward()
        running_loss += loss.item() * x.size(0)
        
        metrics.append(get_metrics(yhat, y))
        
        optimizer.step()
        optimizer.zero_grad()
    
    losses.append(running_loss / len(interiornet_dataset))



In [None]:
# data saving

import pandas as pd

metrics = np.array(metrics)

logs_dir = os.path.join(k8s_pvc, 'train-logs')

if not os.path.exists(logs_dir):
    os.mkdir(logs_dir)
    
df = pd.DataFrame({
                   'loss': losses, 
                   'absrel': metrics[:,0],
                   'mae': metrics[:,1],
                   'delta': metrics[:,2]
                  })

df.to_csv(os.path.join(logs_dir, 'testrun.csv'))

torch.save(model.state_dict(), os.path.join(logs_dir, 'finetune.pt'))

In [15]:
pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
You should consider upgrading via the '/Users/cho/.pyenv/versions/3.8.5/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from torchsummary import summary

In [23]:
summary(model,(3,480,640))



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
     StdConv2dSame-1         [-1, 64, 240, 320]           9,408
              ReLU-2         [-1, 64, 240, 320]               0
      GroupNormAct-3         [-1, 64, 240, 320]             128
     MaxPool2dSame-4         [-1, 64, 120, 160]               0
     StdConv2dSame-5        [-1, 256, 120, 160]          16,384
          Identity-6        [-1, 256, 120, 160]               0
      GroupNormAct-7        [-1, 256, 120, 160]             512
    DownsampleConv-8        [-1, 256, 120, 160]               0
     StdConv2dSame-9         [-1, 64, 120, 160]           4,096
             ReLU-10         [-1, 64, 120, 160]               0
     GroupNormAct-11         [-1, 64, 120, 160]             128
    StdConv2dSame-12         [-1, 64, 120, 160]          36,864
             ReLU-13         [-1, 64, 120, 160]               0
     GroupNormAct-14         [-1, 64, 1

In [36]:
(48000 - 460.64) / 8880.35

5.353320533537529

In [37]:
3FO4IW2QC9U7_original_1_1

nan