In [1]:
import os
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

from torch.utils.data import Dataset
from PIL import Image
import glob

from torchvision import models
import tqdm

import time
from torch.autograd import Variable
import torch.nn.functional as F
from torchvision.transforms import Resize, Compose, ToPILImage, ToTensor
import pickle
import math

#from efficientnet_pytorch import EfficientNet

#from kornia.filters import SpatialGradient

import random
from torchvision.transforms import RandomCrop

In [2]:
patch_size = 512

In [3]:
class MonocularDepthDataset(Dataset):
    def __init__(self, df, transform=None, crop_size=patch_size):
        self.df = df
        self.transform = transform
        self.crop_size = crop_size

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = self.df.iloc[idx]['image']
        depth_path = self.df.iloc[idx]['depth']

        image = Image.open(image_path) ##no rgb, takes grayscale
        depth = Image.open(depth_path)

        # randomly crop image and depth
        i, j, h, w = RandomCrop.get_params(image, output_size=(self.crop_size, self.crop_size))
        image = F.crop(image, i, j, h, w)
        depth = F.crop(depth, i, j, h, w)

        if self.transform:
            image = self.transform(image)
            depth = self.transform(depth)

        return image, depth

In [4]:


def gradient_loss_fn(gen_frames, gt_frames, alpha=1):
    def gradient(x):
        # idea from tf.image.image_gradients(image)
        # https://github.com/tensorflow/tensorflow/blob/r2.1/tensorflow/python/ops/image_ops_impl.py#L3441-L3512
        # x: (b,c,h,w), float32 or float64
        # dx, dy: (b,c,h,w)

        h_x = x.size()[-2]
        w_x = x.size()[-1]
        # gradient step=1
        left = x
        right = F.pad(x, [0, 1, 0, 0])[:, :, :, 1:]
        top = x
        bottom = F.pad(x, [0, 0, 0, 1])[:, :, 1:, :]

        # dx, dy = torch.abs(right - left), torch.abs(bottom - top)
        dx, dy = right - left, bottom - top 
        # dx will always have zeros in the last column, right-left
        # dy will always have zeros in the last row,    bottom-top
        dx[:, :, :, -1] = 0
        dy[:, :, -1, :] = 0

        return dx, dy

    # gradient
    gen_dx, gen_dy = gradient(gen_frames)
    gt_dx, gt_dy = gradient(gt_frames)
    #
    grad_diff_x = torch.abs(gt_dx - gen_dx)
    grad_diff_y = torch.abs(gt_dy - gen_dy)

    # condense into one tensor and avg
    return torch.mean(grad_diff_x ** alpha + grad_diff_y ** alpha)

class DepthEstimationLoss(nn.Module):
    def __init__(self, alpha=0.5):
        super(DepthEstimationLoss, self).__init__()
        self.alpha = alpha


    def forward(self, pred_depth, true_depth):
        pred_depth = torch.clamp(pred_depth, min=1e-8)
        true_depth = torch.clamp(true_depth, min=1e-8)

        # Scale-invariant MSE loss
        diff = torch.log(pred_depth) - torch.log(true_depth)
        mse_loss = torch.mean(diff**2)
        scale_invariant_mse_loss = mse_loss - (self.alpha * (torch.sum(diff)**2)) / (true_depth.numel()**2)

    

        gradient_loss = gradient_loss_fn(pred_depth,true_depth,alpha=self.alpha)

        total_loss = (scale_invariant_mse_loss + gradient_loss)/2

        return total_loss

In [5]:
def conv_relu_block(in_channel,out_channel,kernel,padding):
    return nn.Sequential(
            nn.Conv2d(in_channel,out_channel, kernel_size = kernel, padding=padding),
            nn.ReLU()) #nn.ReLU(inplace=True) #nn.Ge

In [6]:
class vanilla_unet(nn.Module):
    def __init__(self, n_class):
        super().__init__()
        self.input_1 = conv_relu_block(3,3,3,1) ##grayscale inputs
        #self.input_2 = conv_relu_block(64, 64, 3, 1) #no extra channels

        self.base_model = models.resnet18(pretrained=True)
        self.base_layers = list(self.base_model.children())

        self.l0 = nn.Sequential(*self.base_layers[:3])
        self.U0_conv = conv_relu_block(64, 64, 1, 0)
        self.conv_up0 = conv_relu_block(64 + 256, 128, 3, 1)

        self.l1 = nn.Sequential(*self.base_layers[3:5])
        self.U1_conv = conv_relu_block(64, 64, 1, 0)
        self.conv_up1 = conv_relu_block(64 + 256, 256, 3, 1)

        self.l2 = self.base_layers[5]
        self.U2_conv = conv_relu_block(128, 128, 1, 0)
        self.conv_up2 = conv_relu_block(128 + 512, 256, 3, 1)

        self.l3 = self.base_layers[6]
        self.U3_conv = conv_relu_block(256, 256, 1, 0)
        self.conv_up3 = conv_relu_block(256 + 512, 512, 3, 1)

        self.l4 = self.base_layers[7]
        self.U4_conv = conv_relu_block(512, 512, 1, 0)

        self.conv_up4 = conv_relu_block(64 + 128, 64, 3, 1)

        self.out4 = nn.Conv2d(128, n_class, 1)

        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

    def forward(self, x):
        x = torch.cat([x,x,x], axis = 1)
        x = self.input_1(x)
        
        #print(x.shape,'x')
         #concat on channel
        #x_one = self.input_2(x_one)
        block0 = self.l0(x)
        block1 = self.l1(block0)
        block2 = self.l2(block1)
        block3 = self.l3(block2)
        block4 = self.l4(block3)

        block4 = self.U4_conv(block4)
        x = self.upsample(block4)

        #print(block0.shape, block1.shape, block2.shape,block3.shape,block4.shape)
        block3 = self.U3_conv(block3)
        #print(x.shape, block3.shape)
        x = torch.cat([x, block3], axis=1)
        x = self.conv_up3(x)

        x = self.upsample(x)
        block2 = self.U2_conv(block2)
        x = torch.cat([x, block2], axis=1)
        x = self.conv_up2(x)

        x = self.upsample(x)
        block1 = self.U1_conv(block1)
        x = torch.cat([x, block1], axis=1)
        x = self.conv_up1(x)

        x = self.upsample(x)
        block0 = self.U0_conv(block0)
        x = torch.cat([x, block0], axis=1)
        x = self.conv_up0(x)
        out4 = self.out4(x)

        out4_upsampled = F.interpolate(out4, scale_factor=2, mode='bilinear', align_corners=True)
        
        relu = nn.ReLU()
        out = relu(out4_upsampled)
        
        
        return out



In [7]:
v = vanilla_unet(5)



In [130]:
x = torch.ones((1,1,patch_size,patch_size))

v.forward(x)

tensor([[[[0.0757, 0.0544, 0.0331,  ..., 0.0000, 0.0217, 0.0548],
          [0.0869, 0.0730, 0.0590,  ..., 0.0000, 0.0265, 0.0581],
          [0.0982, 0.0916, 0.0849,  ..., 0.0013, 0.0313, 0.0613],
          ...,
          [0.0581, 0.0790, 0.0999,  ..., 0.0000, 0.0202, 0.0585],
          [0.0515, 0.0640, 0.0765,  ..., 0.0000, 0.0000, 0.0243],
          [0.0449, 0.0490, 0.0532,  ..., 0.0000, 0.0000, 0.0000]],

         [[0.0000, 0.0000, 0.0000,  ..., 0.0300, 0.0226, 0.0152],
          [0.0000, 0.0000, 0.0000,  ..., 0.0288, 0.0335, 0.0382],
          [0.0429, 0.0254, 0.0079,  ..., 0.0277, 0.0444, 0.0612],
          ...,
          [0.0718, 0.0815, 0.0912,  ..., 0.0386, 0.0313, 0.0239],
          [0.0563, 0.0675, 0.0787,  ..., 0.0235, 0.0278, 0.0320],
          [0.0409, 0.0535, 0.0662,  ..., 0.0083, 0.0242, 0.0402]],

         [[0.0734, 0.0804, 0.0873,  ..., 0.1074, 0.0844, 0.0615],
          [0.0518, 0.0430, 0.0341,  ..., 0.0391, 0.0494, 0.0597],
          [0.0302, 0.0056, 0.0000,  ..., 0

In [9]:
#model = depth_model(num_classes=1).to('cuda')
#model = resunet(n_class=1).to('cuda')
model = vanilla_unet(n_class=1).to('cuda')

#model = effunet(n_class=1).to('cuda')

In [10]:
# Set hyperparameters, dataset paths, and other configurations
batch_size = 8
learning_rate = 0.001
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transform = transforms.Compose([
    transforms.Resize((patch_size, patch_size)),
    transforms.ToTensor()
])

df = pd.read_csv('train.csv')
train_dataset = MonocularDepthDataset(df, transform = transform)
#val_dataset = MonocularDepthDataset(val_image_paths, val_depth_paths, transform)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


criterion = DepthEstimationLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in tqdm.tqdm_notebook(range(num_epochs)):
    #train_loss = train(model, train_dataloader, optimizer, criterion, device)
    
    model.train()
    running_loss = 0.0
    
    for images, depths in tqdm.tqdm_notebook(train_dataloader):
        images = images.to(device)
        depths = depths.to(device)
        depths /= 10587

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs[-1].float(), depths.float())
        loss.backward()

        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(train_dataloader)
    #val_loss = validate(model, val_dataloader, criterion, device)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm.tqdm_notebook(range(num_epochs)):


  0%|          | 0/10 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for images, depths in tqdm.tqdm_notebook(train_dataloader):


  0%|          | 0/224 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'inputs/834a953a2d7446a89ea3bde1c2084b81-1617744150700003958.png'