In [9]:
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import PIL
import torch

import torch.nn as nn
import torch.optim as optim
from torchvision import models
from torchvision.models import vgg16
from torchvision import datasets, transforms


print('pytorch version: {}'.format(torch.__version__))
print('GPU 사용 가능 여부: {}'.format(torch.cuda.is_available()))
device = "cuda" if torch.cuda.is_available() else "cpu"   # GPU 사용 가능 여부에 따라 device 정보 저장

pytorch version: 1.4.0
GPU 사용 가능 여부: True


### reflected padding 부분

> input (image, label)를 각각 reflected padding 진행 후 `front-end`의 input으로 들어감

In [10]:
# input image는 전처리 과정에서 input reflected padding 진행

input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
print("\n------input------")
print(input.shape)
print(input)

print("\n------after ReflectionPad2d------")
m = nn.ReflectionPad2d(2)
print(m(input).shape)
m(input)


------input------
torch.Size([1, 1, 3, 3])
tensor([[[[0., 1., 2.],
          [3., 4., 5.],
          [6., 7., 8.]]]])

------after ReflectionPad2d------
torch.Size([1, 1, 7, 7])


tensor([[[[8., 7., 6., 7., 8., 7., 6.],
          [5., 4., 3., 4., 5., 4., 3.],
          [2., 1., 0., 1., 2., 1., 0.],
          [5., 4., 3., 4., 5., 4., 3.],
          [8., 7., 6., 7., 8., 7., 6.],
          [5., 4., 3., 4., 5., 4., 3.],
          [2., 1., 0., 1., 2., 1., 0.]]]])

In [11]:
def make_rfpad_image_label_data(imgae, mask):
    rf_pad= nn.ReplicationPad2d(2)
    
    imgae = rf_pad(imgae)
    mask = rf_pad(mask)
    
    return image, mask

### Identity matrix weights intialization

In [12]:
# Kernel 가중치 초기화 (일반적)
l = nn.Conv2d(12, 20, kernel_size = 3)
print(l.weight.size())
# l.weight

torch.Size([20, 12, 3, 3])


In [13]:
# Kernel 가중치 초기화 (identity matrix weights)
l = nn.Conv2d(12, 20, kernel_size = 3)
print(l.weight.data.copy_(torch.eye(3)).size())

#l.weight.data.copy_(torch.eye(3))

torch.Size([20, 12, 3, 3])


### 네트워크 설계 I (Pretrained 된 모델 사용 X)

### Front-end Module

![image.png](https://github.com/choco9966/Semantic-Segmentation-Review/blob/main/Multi-Scale%20Context%20Aggregation%20by%20Dilated%20Convolutions%20(DilatedNet)%20Review/png/front-end%20module.png?raw=true)

![image.png](https://github.com/choco9966/Semantic-Segmentation-Review/blob/main/Multi-Scale%20Context%20Aggregation%20by%20Dilated%20Convolutions%20(DilatedNet)%20Review/png/front-end%20module%20with%20context_module.png?raw=true)

### Context Module

A context module is constructed based on the dilated convolution as below:

![image.png](https://miro.medium.com/max/1576/1*aj0ymQMfAOCXbvhnSlTY_w.png)

In [7]:
def make_context_layers(in_channels, batch_norm=False, version = "basic"):
    '''
    args:
     in_channels : input의 channel 수 (e.g. 64)
     version : The Context module의 버전 (layer 구성을 위함)
    '''
    layers = []
    
    # 1 ~ 7 Layer에 해당되는 dilation 및 channel list 
    dilation_list = [1, 1, 2, 4, 8, 16, 1]
    basic_channels = [1, 1, 1, 1, 1, 1, 1]
    large_channels = [2, 2, 4, 8, 16, 32, 32]

    
    if version == "basic":
        for i in range(len(basic_channels)):
            conv2d = nn.Conv2d(in_channels, in_channels, kernel_size = 3, padding = 1, dilation = dilation_list[i])
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
        
        layers += [nn.Conv2d(in_channels, in_channels, kernel_size = 1, padding = 1, dilation = 1)]
        
    else :
        temp_in_channels = in_channels
        for i in range(len(large_channels)):
            temp_out_channels = in_channels*large_channels[i]          
            conv2d = nn.Conv2d(temp_in_channels, temp_out_channels, kernel_size = 3, padding = 1, dilation = dilation_list[i])
            temp_in_channels = temp_out_channels
            
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
        
        # 8 Layer 해당 
        layers += [nn.Conv2d(temp_in_channels, in_channels, kernel_size = 1, padding = 1, dilation = 1)]        
    
    return nn.Sequential(*layers)

In [8]:
# The Context module (basic) 
make_context_layers(21, batch_norm=False, version = "basic")

Sequential(
  (0): Conv2d(21, 21, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(21, 21, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): Conv2d(21, 21, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(2, 2))
  (5): ReLU(inplace=True)
  (6): Conv2d(21, 21, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(4, 4))
  (7): ReLU(inplace=True)
  (8): Conv2d(21, 21, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(8, 8))
  (9): ReLU(inplace=True)
  (10): Conv2d(21, 21, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(16, 16))
  (11): ReLU(inplace=True)
  (12): Conv2d(21, 21, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(21, 21, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
)

In [159]:
# The Context module (large)
make_context_layers(21, batch_norm=False, version = "large")

Sequential(
  (0): Conv2d(21, 42, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(42, 42, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): Conv2d(42, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(2, 2))
  (5): ReLU(inplace=True)
  (6): Conv2d(84, 168, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(4, 4))
  (7): ReLU(inplace=True)
  (8): Conv2d(168, 336, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(8, 8))
  (9): ReLU(inplace=True)
  (10): Conv2d(336, 672, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), dilation=(16, 16))
  (11): ReLU(inplace=True)
  (12): Conv2d(672, 672, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(672, 21, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
)

In [86]:
pretrained_model = vgg16(pretrained = False)

In [9]:
class MultiScaleContextAggregationByDilatedConvolutions(nn.Module):
    def __init__(self, num_classes = 21, version = "basic"):
        super(MultiScaleContextAggregationByDilatedConvolutions, self).__init__()
        self.vgg16_model = vgg16(pretrained = False)
        features = list(self.vgg16_model.features.children())[0:23]
        
        '''
        100 padding for 2 reasons:
            1) support very small input size
            2) allow cropping in order to match size of different layers' feature maps
        Note that the cropped part corresponds to a part of the 100 padding
        Spatial information of different layers' feature maps cannot be align exactly because of cropping, which is bad
        '''
        features[0].padding = (100, 100)
        
        
        # conv1 (block) ~ conv4 (block)
        self.features_map = nn.Sequential(*features)
        # conv5 (block) with dilation = 2
        self.conv5_block_with_dilation2 = nn.Sequential(nn.Conv2d(512, 512, kernel_size = 3, padding = 1, dilation = 2),
                                                        nn.ReLU(inplace=True),
                                                        nn.Conv2d(512, 512, kernel_size = 3, padding = 1, dilation = 2),
                                                        nn.ReLU(inplace=True),
                                                        nn.Conv2d(512, 512, kernel_size = 3, padding = 1, dilation = 2),
                                                        nn.ReLU(inplace=True),
                                                        )
        
        # conv6 (block) with dilation = 4
        self.conv6_block_with_dilation4 = nn.Sequential(nn.Conv2d(512, 4096, kernel_size = 7, padding = 1, dilation = 4),
                                                        nn.ReLU(inplace=True),
                                                        nn.Dropout2d(0.5)
                                                        )
        
        # conv7 
        self.conv7_blcok = nn.Sequential(nn.Conv2d(4096, 4096,  kernel_size = 1),
                                         nn.ReLU(inplace=True),
                                         nn.Dropout2d(0.5)                                         
                                        )
        # conv_final
        self.front_end_final_layer= nn.Conv2d(4096, num_classes,  kernel_size = 1)
        
        
        # context module

        self.context_module = make_context_layers(num_classes, batch_norm=False, version=version)
    
         
        # Deconvolution (Up) : 불확실
        self.deconv = nn.ConvTranspose2d(num_classes, num_classes, kernel_size = 16, stride = 8, padding = 4)
        
        # Sigmoid 
        self.Sigmoid = nn.Sigmoid()
        
        
    def forward(self, x):
        
        # front-end module
        x = self.features_map(x)
        print('----------------------------------------------')        
        print("size of after conv4 (block) : {}".format(x.shape))
        x = self.conv5_block_with_dilation2(x)
        print("size of after conv5_block_with_dilation2 : {}".format(x.shape))        
        x = self.conv6_block_with_dilation4(x)
        print("size of after conv6_block_with_dilation4 : {}".format(x.shape))
        x = self.conv7_blcok(x)
        print("size of after conv7_blcok : {}".format(x.shape))        
        x = self.front_end_final_layer(x)
        print("size of after front_end_final_layer : {}".format(x.shape))          

        # Context module
        print('----------------------------------------------')
        x = self.context_module(x)
        print("size of after Context module : {}".format(x.shape))
        
        # Deconvolution (Up)
        print('----------------------------------------------')
        x = self.deconv(x)
        print("size of after Deconvolution (Up) : {}".format(x.shape))
        print('----------------------------------------------')        
        
        return self.Sigmoid(x)

In [10]:
# 구현된 model에 임의의 input을 넣어 output이 잘 나오는지 test

model = MultiScaleContextAggregationByDilatedConvolutions(num_classes=22)

In [11]:
x = torch.randn([2, 3, 512, 512])
print("input shape : ", x.shape)
out = model(x)
print("output shape : ", out.size())

input shape :  torch.Size([2, 3, 512, 512])
----------------------------------------------
size of after conv4 (block) : torch.Size([2, 512, 88, 88])
size of after conv5_block_with_dilation2 : torch.Size([2, 512, 86, 86])
size of after conv6_block_with_dilation4 : torch.Size([2, 4096, 64, 64])
size of after conv7_blcok : torch.Size([2, 4096, 64, 64])
size of after front_end_final_layer : torch.Size([2, 22, 64, 64])
----------------------------------------------
size of after Context module : torch.Size([2, 22, 14, 14])
----------------------------------------------
size of after Deconvolution (Up) : torch.Size([2, 22, 112, 112])
----------------------------------------------
output shape :  torch.Size([2, 22, 112, 112])


### to do list

- `input` 및 `output`을 맞추는 작업 필요
- `weigth initialization` 

#### $$S_{out} = \text{stride} \ \times (S_{input}-1) + S_{filter size} - 2 \ \times \ \text{pad} $$ 

In [13]:
stride = 8
S_input = 14
S_filter = 16
pad = 4

In [26]:
stride * (S_input - 1) + S_filter - 2*pad

112

## CRF

In [20]:
import torch
import torch.nn as nn
from crfseg import CRF

model = nn.Sequential(
    nn.Identity(),  # your NN
    CRF(n_spatial_dims=2)
)

batch_size, n_channels, spatial = 10, 3,(100, 100)
x = torch.zeros(batch_size, n_channels, *spatial)
log_proba = model(x)

In [22]:
log_proba.shape

torch.Size([10, 3, 100, 100])

### Reference
---

- [Dilated Convolution for Semantic Image Segmentation using caffe](https://github.com/fyu/dilation/blob/master/network.py)