In [133]:
import os
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

from torch.utils.data import Dataset
from PIL import Image
import glob

from torchvision import models
import tqdm

import time
from torch.autograd import Variable
import torch.nn.functional as F
from torchvision.transforms import Resize, Compose, ToPILImage, ToTensor
import pickle
import math

from efficientnet_pytorch import EfficientNet

from kornia.filters import SpatialGradient

In [57]:
class MonocularDepthDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        self.image_paths = df.iloc[idx]['image']
        self.depth_paths = df.iloc[idx]['depth']

        image = Image.open(self.image_paths).convert('RGB')
        depth = Image.open(self.depth_paths)

        if self.transform:
            image = self.transform(image)
            depth = self.transform(depth)

        return image, depth


In [126]:


def gradient_loss_fn(gen_frames, gt_frames, alpha=1):
    def gradient(x):
        # idea from tf.image.image_gradients(image)
        # https://github.com/tensorflow/tensorflow/blob/r2.1/tensorflow/python/ops/image_ops_impl.py#L3441-L3512
        # x: (b,c,h,w), float32 or float64
        # dx, dy: (b,c,h,w)

        h_x = x.size()[-2]
        w_x = x.size()[-1]
        # gradient step=1
        left = x
        right = F.pad(x, [0, 1, 0, 0])[:, :, :, 1:]
        top = x
        bottom = F.pad(x, [0, 0, 0, 1])[:, :, 1:, :]

        # dx, dy = torch.abs(right - left), torch.abs(bottom - top)
        dx, dy = right - left, bottom - top 
        # dx will always have zeros in the last column, right-left
        # dy will always have zeros in the last row,    bottom-top
        dx[:, :, :, -1] = 0
        dy[:, :, -1, :] = 0

        return dx, dy

    # gradient
    gen_dx, gen_dy = gradient(gen_frames)
    gt_dx, gt_dy = gradient(gt_frames)
    #
    grad_diff_x = torch.abs(gt_dx - gen_dx)
    grad_diff_y = torch.abs(gt_dy - gen_dy)

    # condense into one tensor and avg
    return torch.mean(grad_diff_x ** alpha + grad_diff_y ** alpha)

class DepthEstimationLoss(nn.Module):
    def __init__(self, alpha=0.5):
        super(DepthEstimationLoss, self).__init__()
        self.alpha = alpha


    def forward(self, pred_depth, true_depth):
        pred_depth = torch.clamp(pred_depth, min=1e-8)
        true_depth = torch.clamp(true_depth, min=1e-8)

        # Scale-invariant MSE loss
        diff = torch.log(pred_depth) - torch.log(true_depth)
        mse_loss = torch.mean(diff**2)
        scale_invariant_mse_loss = mse_loss - (self.alpha * (torch.sum(diff)**2)) / (true_depth.numel()**2)

    

        gradient_loss = gradient_loss_fn(pred_depth,true_depth,alpha=self.alpha)

        total_loss = (scale_invariant_mse_loss + gradient_loss)/2

        return total_loss

In [127]:
class depth_model(nn.Module):
    def __init__(self, num_classes, pretrained=True):
        super(depth_model, self).__init__()
        self.base_model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=num_classes)

        self.num_classes = num_classes
        
        # Encoder layers (downsampling)
        self.encoder1 = self.base_model._conv_stem
        self.encoder2 = self.base_model._blocks[:2]
        self.encoder3 = self.base_model._blocks[2:5]
        self.encoder4 = self.base_model._blocks[5:12]
        self.encoder5 = self.base_model._blocks[12:]
        
        # Decoder layers (upsampling)
        self.decoder1 = self.decoder_block(320, 256)
        self.decoder2 = self.decoder_block(256, 128)
        self.decoder3 = self.decoder_block(128, 64)
        self.decoder4 = self.decoder_block(64, 32)
        
        # Output layers for each decoder stage
        self.output1 = nn.Conv2d(256, num_classes, kernel_size=1)
        self.output2 = nn.Conv2d(128, num_classes, kernel_size=1)
        self.output3 = nn.Conv2d(64, num_classes, kernel_size=1)
        self.output4 = nn.Conv2d(32, num_classes, kernel_size=1)

    def decoder_block(self, in_channels, out_channels):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(out_channels, out_channels, kernel_size=2, stride=2)
        )

    def forward(self, x):
        # Encoder
        e1 = self.encoder1(x)
        e2 = self._encode_block(e1, self.encoder2)
        e3 = self._encode_block(e2, self.encoder3)
        e4 = self._encode_block(e3, self.encoder4)
        e5 = self._encode_block(e4, self.encoder5)
        
        # Decoder
        print(self.decoder1(e5).shape,e4.shape)
        d1 = self.decoder1(e5)
        d1 = self.resize_and_add(d1, e4)
        d2 = self.decoder2(d1)
        d2 = self.resize_and_add(d2, e3)
        d3 = self.decoder3(d2)
        d3 = self.resize_and_add(d3, e2)
        d4 = self.decoder4(d3)
        d4 = self.resize_and_add(d4, e1)
        
        # Output for each decoder stage
        out1 = self.output1(d1)
        out2 = self.output2(d2)
        out3 = self.output3(d3)
        out4 = self.output4(d4)

        # Resize output to the same shape
        H, W = x.size(2), x.size(3)
        out1 = F.interpolate(out1, size=(H, W), mode='bilinear', align_corners=False)
        out2 = F.interpolate(out2, size=(H, W), mode='bilinear', align_corners=False)
        out3 = F.interpolate(out3, size=(H, W), mode='bilinear', align_corners=False)
        out4 = F.interpolate(out4, size=(H, W), mode='bilinear', align_corners=False)
        
        # Output: average of all the outputs
        out_avg = (out1 + out2 + out3 + out4) / 4.0

        return out1, out2, out3, out4, out_avg


    def _encode_block(self, x, block):
        for layer in block:
            x = layer(x)
        return x
    
    def resize_and_add(self, x1, x2):
        x1_size = (x2.size(2), x2.size(3))
        x1_resized = F.interpolate(x1, size=x1_size, mode='bilinear', align_corners=False)
        return x1_resized + x2


In [128]:
def conv_relu_block(in_channel,out_channel,kernel,padding):
    return nn.Sequential(
            nn.Conv2d(in_channel,out_channel, kernel_size = kernel, padding=padding),
            nn.ReLU()) #nn.ReLU(inplace=True) #nn.Ge

In [189]:
class resmnet(nn.Module):
    def __init__(self, n_class):
        super().__init__()
        
        import torch.nn as nn

        self.pool = nn.AvgPool2d(kernel_size=2)


        self.input_1 = conv_relu_block(3, 64, 3, 1)
        self.input_2 = conv_relu_block(64, 64, 3, 1)

        self.base_model = models.resnet18(pretrained=True)
        self.base_layers = list(self.base_model.children())
        
        self.block0_1_conv = conv_relu_block(67,64,1,0)
        self.block2_conv = conv_relu_block(131,128,1,0)

        
        self.l0 = nn.Sequential(*self.base_layers[:3])
        self.U0_conv = conv_relu_block(64, 64, 1, 0)
        self.conv_up0 = conv_relu_block(64 + 256, 128, 3, 1)

        self.l1 = nn.Sequential(*self.base_layers[3:5])
        self.U1_conv = conv_relu_block(64, 64, 1, 0)
        self.conv_up1 = conv_relu_block(64 + 256, 256, 3, 1)

        self.l2 = self.base_layers[5]
        self.U2_conv = conv_relu_block(128, 128, 1, 0)
        self.conv_up2 = conv_relu_block(128 + 512, 256, 3, 1)

        self.l3 = self.base_layers[6]
        self.U3_conv = conv_relu_block(256, 256, 1, 0)
        self.conv_up3 = conv_relu_block(256 + 512, 512, 3, 1)

        self.l4 = self.base_layers[7]
        self.U4_conv = conv_relu_block(512, 512, 1, 0)

        self.conv_up4 = conv_relu_block(64 + 128, 64, 3, 1)

        self.out1 = nn.Conv2d(512, n_class, 1)
        self.out2 = nn.Conv2d(256, n_class, 1)
        self.out3 = nn.Conv2d(256, n_class, 1)
        self.out4 = nn.Conv2d(128, n_class, 1)

        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

    def forward(self, x):
        x_one = self.input_1(x)
        x_one = self.input_2(x_one)
        
        
        scale_img_2 = self.pool(x)
        scale_img_3 = self.pool(scale_img_2)
        scale_img_4 = self.pool(scale_img_3)

        block0 = self.l0(x)
        block0 = torch.concatenate([block0,scale_img_2],axis=1)
        block0 = self.block0_1_conv(block0)

        block1 = self.l1(block0)
        block1 = torch.concatenate([block1,scale_img_3],axis=1)
        block1 = self.block0_1_conv(block1)

        block2 = self.l2(block1)
        block2 = torch.concatenate([block2,scale_img_4],axis=1)
        block2 = self.block2_conv(block2)

        print(block2.shape)

        
        block3 = self.l3(block2)
        block4 = self.l4(block3)

        block4 = self.U4_conv(block4)
        x = self.upsample(block4)
        block3 = self.U3_conv(block3)
        x = torch.cat([x, block3], axis=1)
        x = self.conv_up3(x)
        out1 = self.out1(x)

        x = self.upsample(x)
        block2 = self.U2_conv(block2)
        x = torch.cat([x, block2], axis=1)
        x = self.conv_up2(x)
        out2 = self.out2(x)

        x = self.upsample(x)
        block1 = self.U1_conv(block1)
        x = torch.cat([x, block1], axis=1)
        x = self.conv_up1(x)
        out3 = self.out3(x)

        x = self.upsample(x)
        block0 = self.U0_conv(block0)
        x = torch.cat([x, block0], axis=1)
        x = self.conv_up0(x)
        out4 = self.out4(x)


        out1_upsampled = F.interpolate(out1, scale_factor=16, mode='bilinear', align_corners=True)
        out2_upsampled = F.interpolate(out2, scale_factor=8, mode='bilinear', align_corners=True)
        out3_upsampled = F.interpolate(out3, scale_factor=4, mode='bilinear', align_corners=True)
        out4_upsampled = F.interpolate(out4, scale_factor=2, mode='bilinear', align_corners=True)



        avg_out = (out1_upsampled + out2_upsampled + out3_upsampled + out4_upsampled) / 4

        return out1, out2, out3, out4, avg_out

In [190]:
class resunet(nn.Module):
    def __init__(self, n_class):
        super().__init__()
        self.input_1 = conv_relu_block(3, 64, 3, 1)
        self.input_2 = conv_relu_block(64, 64, 3, 1)

        self.base_model = models.resnet18(pretrained=True)
        self.base_layers = list(self.base_model.children())

        self.l0 = nn.Sequential(*self.base_layers[:3])
        self.U0_conv = conv_relu_block(64, 64, 1, 0)
        self.conv_up0 = conv_relu_block(64 + 256, 128, 3, 1)

        self.l1 = nn.Sequential(*self.base_layers[3:5])
        self.U1_conv = conv_relu_block(64, 64, 1, 0)
        self.conv_up1 = conv_relu_block(64 + 256, 256, 3, 1)

        self.l2 = self.base_layers[5]
        self.U2_conv = conv_relu_block(128, 128, 1, 0)
        self.conv_up2 = conv_relu_block(128 + 512, 256, 3, 1)

        self.l3 = self.base_layers[6]
        self.U3_conv = conv_relu_block(256, 256, 1, 0)
        self.conv_up3 = conv_relu_block(256 + 512, 512, 3, 1)

        self.l4 = self.base_layers[7]
        self.U4_conv = conv_relu_block(512, 512, 1, 0)

        self.conv_up4 = conv_relu_block(64 + 128, 64, 3, 1)

        self.out1 = nn.Conv2d(512, n_class, 1)
        self.out2 = nn.Conv2d(256, n_class, 1)
        self.out3 = nn.Conv2d(256, n_class, 1)
        self.out4 = nn.Conv2d(128, n_class, 1)

        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

    def forward(self, x):
        x_one = self.input_1(x)
        x_one = self.input_2(x_one)

        block0 = self.l0(x)
        block1 = self.l1(block0)
        block2 = self.l2(block1)
        block3 = self.l3(block2)
        block4 = self.l4(block3)

        block4 = self.U4_conv(block4)
        x = self.upsample(block4)
        block3 = self.U3_conv(block3)
        x = torch.cat([x, block3], axis=1)
        x = self.conv_up3(x)
        out1 = self.out1(x)

        x = self.upsample(x)
        block2 = self.U2_conv(block2)
        x = torch.cat([x, block2], axis=1)
        x = self.conv_up2(x)
        out2 = self.out2(x)

        x = self.upsample(x)
        block1 = self.U1_conv(block1)
        x = torch.cat([x, block1], axis=1)
        x = self.conv_up1(x)
        out3 = self.out3(x)

        x = self.upsample(x)
        block0 = self.U0_conv(block0)
        x = torch.cat([x, block0], axis=1)
        x = self.conv_up0(x)
        out4 = self.out4(x)


        out1_upsampled = F.interpolate(out1, scale_factor=16, mode='bilinear', align_corners=True)
        out2_upsampled = F.interpolate(out2, scale_factor=8, mode='bilinear', align_corners=True)
        out3_upsampled = F.interpolate(out3, scale_factor=4, mode='bilinear', align_corners=True)
        out4_upsampled = F.interpolate(out4, scale_factor=2, mode='bilinear', align_corners=True)



        avg_out = (out1_upsampled + out2_upsampled + out3_upsampled + out4_upsampled) / 4

        return out1, out2, out3, out4, avg_out



In [191]:
#model = depth_model(num_classes=1).to('cuda')
#model = resunet(n_class=1).to('cuda')
model = resmnet(n_class=1).to('cuda')

#model = effunet(n_class=1).to('cuda')



In [None]:
# Set hyperparameters, dataset paths, and other configurations
batch_size = 8
learning_rate = 0.001
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor()
])

df = pd.read_csv('train.csv')
train_dataset = MonocularDepthDataset(df, transform = transform)
#val_dataset = MonocularDepthDataset(val_image_paths, val_depth_paths, transform)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


criterion = DepthEstimationLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in tqdm.tqdm_notebook(range(num_epochs)):
    #train_loss = train(model, train_dataloader, optimizer, criterion, device)
    
    model.train()
    running_loss = 0.0
    
    for images, depths in tqdm.tqdm_notebook(train_dataloader):
        images = images.to(device)
        depths = depths.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs[-1].float(), depths.float())
        loss.backward()

        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(dataloader)
    #val_loss = validate(model, val_dataloader, criterion, device)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm.tqdm_notebook(range(num_epochs)):


  0%|          | 0/10 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for images, depths in tqdm.tqdm_notebook(train_dataloader):


  0%|          | 0/224 [00:00<?, ?it/s]

torch.Size([8, 128, 32, 32])
torch.Size([8, 128, 32, 32])
