<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

## Image segmentation with a U-Net-like architecture

In [1]:
import matplotlib.pyplot as plt
import os,math, random
import cv2
import numpy as np
from IPython.display import Image, display
from tensorflow.keras.preprocessing.image import load_img
import torch.nn as nn
from torchsummary import summary
from torch.utils.data import DataLoader,TensorDataset,Dataset
import torch

In [2]:
%matplotlib inline
np.random.seed(1000)
torch.manual_seed(1000)

<torch._C.Generator at 0x284971d0690>

In [3]:
#!curl -O https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
#!curl -O https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz
#!tar -xf images.tar.gz
#!tar -xf annotations.tar.gz

In [4]:
input_dir = "../data/images/"
target_dir = "../data/annotations/trimaps/"
input_img_paths=sorted(
                    [ os.path.join(input_dir,img_name)for img_name in os.listdir(input_dir) if img_name.endswith('.jpg') ]
                    )

target_img_paths=sorted(
                  [ os.path.join(target_dir,tar_img_name) for tar_img_name in os.listdir(target_dir)
                   if tar_img_name.endswith('.png') and not tar_img_name.startswith('.')
                  ])
len(input_img_paths)

7390

In [5]:

img_size = (160,160)
num_classes = 3
batch_size = 50

In [6]:
class OxfordPetsDateset(Dataset):
    def __init__(self,img_size,input_img_paths,target_img_paths):
        self.img_size=img_size
        self.input_img_paths=input_img_paths
        self.target_img_paths=target_img_paths   
        
    def __len__(self):
        return len(self.target_img_paths)
    
    def __getitem__(self,idx):
        img_=self.input_img_paths[idx]
        label_=self.target_img_paths[idx]
        
        img=load_img(img_, target_size=self.img_size)
        img/=np.max(img)
        img=np.reshape(img,(3,160,160))
                    
        label=load_img(label_, target_size=self.img_size, color_mode="grayscale")
        label=np.expand_dims(label,axis=0)
        # Ground truth labels are 1, 2, 3. Subtract one to make them 0, 1, 2:
        label-=1
        
        return torch.tensor(img),torch.tensor(label)

In [7]:

# Split our img paths into a training and a validation set
val_samples = 1000
train_input_img_paths = input_img_paths[:-val_samples]
train_target_img_paths = target_img_paths[:-val_samples]
val_input_img_paths = input_img_paths[-val_samples:]
val_target_img_paths = target_img_paths[-val_samples:]
# Instantiate data Sequences for each split
train_data = OxfordPetsDateset( img_size, train_input_img_paths, train_target_img_paths)
val_data = OxfordPetsDateset(img_size, val_input_img_paths, val_target_img_paths)

In [8]:
for x,y in train_data:
    print(y.shape)
    print(x.shape)
    break

torch.Size([1, 160, 160])
torch.Size([3, 160, 160])


In [9]:
len(train_data),len(val_data)

(6390, 1000)

In [10]:
train_data_iter=DataLoader(train_data,batch_size=batch_size,shuffle=True)
val_data_iter=DataLoader(val_data,batch_size=batch_size,shuffle=True)

In [11]:
for x,y in train_data_iter:
    print(y.shape)
    print(x.shape)
    break

torch.Size([50, 1, 160, 160])
torch.Size([50, 3, 160, 160])


# MODEL
We will implement a U-Net-like architecture but with fewer layers

In [12]:
def block(in_channel,out_channel):
    return nn.Sequential(
        nn.Conv2d(in_channels=in_channel,out_channels=out_channel,kernel_size=3,padding=1,bias=False),
        nn.BatchNorm2d(out_channel),
        nn.ReLU(inplace=True),
        nn.Conv2d(in_channels=out_channel,out_channels=out_channel,kernel_size=3,padding=1,bias=False),
        nn.BatchNorm2d(out_channel),
        nn.ReLU(inplace=True)
       )   

In [13]:
class encoder(nn.Module):
    def __init__(self,in_channel,out_channel):
        super().__init__() 
        self.in_ch=in_channel
        self.out_ch=out_channel
        self.block=block(self.in_ch,self.out_ch)
        self.m_pool=nn.MaxPool2d(kernel_size=2,stride=2)
        
    def forward(self,x):
        en_block=self.block(x)
        m_pool=self.m_pool(en_block)
        return en_block,m_pool


In [14]:
class decoder(nn.Module):
    def __init__(self,in_ch,out_ch,shrink=False):
        super(decoder,self).__init__() 
        self.in_ch=in_ch
        self.shrink=shrink
        self.out_ch=out_ch
        self.up=nn.UpsamplingBilinear2d(scale_factor=2)
        self.conv=nn.Conv2d(self.in_ch,self.out_ch,3,2,padding=1,bias=False)
        self.conv1=nn.Conv2d(self.in_ch,self.out_ch,3,4,bias=False)
        self.block=block(self.out_ch,self.out_ch)
        self.relu=nn.ReLU()
    def forward(self,x,skips):
        x=self.up(x)
        if self.shrink==True:
            x=self.relu(self.conv1(x))
        else:
            x=self.relu(self.conv(x))
        merge=torch.cat([x,skips])
        output=self.block(merge)
        return output

In [15]:
class Reshape(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x.view(-1,3,160,160)
class Unet(nn.Module):
    def __init__(self):
        super(Unet,self).__init__() 

        self.encoder1=encoder(3,20)
        self.encoder2=encoder(20,160)
        self.encoder3=encoder(160,100)
        self.block1=block(100,10)
        
        self.decoder1=decoder(10,100,shrink=True)
        self.decoder2=decoder(100,160)
        self.decoder3=decoder(160,20)
        self.up=nn.UpsamplingBilinear2d(scale_factor=2)
        self.conv1=nn.Conv2d(20,3,3,padding=1,bias=False)
        self.softmax=nn.Softmax()

    def forward(self,x):
        #x=Reshape(x)
        x=x.reshape(-1,3,160,160)
        # encoder
        enc1,skip1=self.encoder1(x)
        enc2,skip2=self.encoder2(enc1)
        enc3,skip3=self.encoder3(enc2)
        # bottlenect
        bk1=self.block1(enc3)
        #decoder
        dec1=self.decoder1(bk1,skip3)
        dec2=self.decoder2(dec1,skip2)
        dec3=self.decoder3(dec2,skip1)
        up=self.up(dec3)
        output=self.softmax(self.conv1(up))
        return output
      
unet=Unet()

In [20]:
#X = torch.randn(size=(1, 3, 160, 160), dtype=torch.float32)
print(summary(unet,(1, 3, 160, 160)))

Layer (type:depth-idx)                   Output Shape              Param #
├─encoder: 1-1                           [-1, 20, 160, 160]        --
|    └─Sequential: 2-1                   [-1, 20, 160, 160]        --
|    |    └─Conv2d: 3-1                  [-1, 20, 160, 160]        540
|    |    └─BatchNorm2d: 3-2             [-1, 20, 160, 160]        40
|    |    └─ReLU: 3-3                    [-1, 20, 160, 160]        --
|    |    └─Conv2d: 3-4                  [-1, 20, 160, 160]        3,600
|    |    └─BatchNorm2d: 3-5             [-1, 20, 160, 160]        40
|    |    └─ReLU: 3-6                    [-1, 20, 160, 160]        --
|    └─MaxPool2d: 2-2                    [-1, 20, 80, 80]          --
├─encoder: 1-2                           [-1, 160, 160, 160]       --
|    └─Sequential: 2-3                   [-1, 160, 160, 160]       --
|    |    └─Conv2d: 3-7                  [-1, 160, 160, 160]       28,800
|    |    └─BatchNorm2d: 3-8             [-1, 160, 160, 160]       320
|    |

  output=self.softmax(self.conv1(up))


In [31]:
loss_fc =nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(unet.parameters(),lr=1e-4)
def evaluate_accuracy(output,label):
    pred = output.argmax(dim=1,keepdim=True)
    return (pred==label.float()).float().sum()

In [None]:
train_loss=0.
val_loss=0.
train_acc=0.
val_acc = 0.    
epochs=10
for epoch in range(epochs):
    unet.train()
    for inputs,targets in train_data:
            optimizer.zero_grad()
            output=unet(inputs)
            loss=loss_fc(output,targets)
            loss.backward()
            optimizer.step()
            train_acc+=evaluate_accuracy(output,targets)
            train_loss+=loss
            
            
    unet.eval()
    for inputs,label in val_data:
        output=unet(inputs)
        loss=loss_fc(output,torch.max(targets,1)[1])
        val_loss+=loss
        val_acc+=evaluate_accuracy(output,label)
    # taking averages
    train_acc/=len(train_data)
    train_loss/=len(train_data)
    val_acc/=len(val_data)
    val_loss/=len(val_data)
    print("Epoch %d: train loss %.3f, train acc %.3f, val loss %.3f, val acc %.3f" % (
    epoch+1, train_loss.detach().numpy(), train_acc.detach().numpy(),
    val_loss.detach().numpy(), val_acc.detach().numpy()))
            

# References

[U-Net: Convolutional Networks for Biomedical Image Segmentation, 2015](https://arxiv.org/pdf/1505.04597.pdf)

[Image segmentation with a U-Net-like architecture](https://keras.io/examples/vision/oxford_pets_image_segmentation/)

[A guide to convolution arithmetic for deep learning,2018](https://arxiv.org/abs/1603.07285)

[Is the deconvolution layer the same as a convolutional layer?,2016](https://arxiv.org/ftp/arxiv/papers/1609/1609.07009.pdf)

[13.11. Fully Convolutional Networks](https://d2l.ai/chapter_computer-vision/fcn.html)


