다음 4개의 모듈 구현

1. Feature

2. Pyramid Pooling

3. Decoder

4. AuxLoss

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## PSPNet 네트워크 구조

In [None]:
# Assemble the full network from the Feature, Pyramid Pooling, Decoder and AuxLoss modules.
class PSPNet(nn.Module):
    """PSPNet semantic-segmentation network.

    Chains the Feature module (stem convolution followed by four residual
    stages), the Pyramid Pooling module and the Decoder. An auxiliary
    classification head branches off the output of the third residual stage.

    The spatial sizes are hard-coded in ``__init__``: both heads resize their
    output to 475x475 and the pyramid branches are resized to 60x60.

    Args:
        n_classes: Number of output channels produced by both heads.
    """

    def __init__(self, n_classes):
        super().__init__()

        # Hard-coded configuration.
        blocks_1, blocks_2, blocks_3, blocks_4 = [3, 4, 6, 3]  # resnet50
        input_size = 475
        feature_size = 60

        # 1. Feature module: stem + two plain and two dilated residual stages.
        self.feature_conv = FeatureMap_convolution()

        self.feature_res_1 = ResidualBlockPSP(
            n_blocks=blocks_1,
            in_channels=128, mid_channels=64, out_channels=256,
            stride=1, dilation=1,
        )
        self.feature_res_2 = ResidualBlockPSP(
            n_blocks=blocks_2,
            in_channels=256, mid_channels=128, out_channels=512,
            stride=2, dilation=1,
        )

        self.feature_dilated_res_1 = ResidualBlockPSP(
            n_blocks=blocks_3,
            in_channels=512, mid_channels=256, out_channels=1024,
            stride=1, dilation=2,
        )
        self.feature_dilated_res_2 = ResidualBlockPSP(
            n_blocks=blocks_4,
            in_channels=1024, mid_channels=512, out_channels=2048,
            stride=1, dilation=4,
        )

        # 2. Pyramid Pooling: four pooled branches plus the untouched input.
        self.pyramid_pooling = PyramidPooling(
            in_channels=2048,
            pool_sizes=[6, 3, 2, 1],
            height=feature_size,
            width=feature_size,
        )

        # 3. Decoder (up-samples to the input resolution).
        self.decode_feature = DecodePSPFeature(
            height=input_size, width=input_size, n_classes=n_classes,
        )

        # 4. AuxLoss head, fed with the 1024-channel intermediate feature map.
        self.aux = AuxiliaryPSPlayers(
            in_channels=1024,
            height=input_size, width=input_size, n_classes=n_classes,
        )

    def forward(self, x):
        """Run the network and return ``(output, output_aux)``.

        ``output`` comes from the decoder on top of pyramid pooling;
        ``output_aux`` comes from the auxiliary head applied to the output of
        ``feature_dilated_res_1``.
        """
        # Intermediate Feature-module output; it feeds both the auxiliary head
        # and the last residual stage.
        aux_input = self.feature_dilated_res_1(
            self.feature_res_2(self.feature_res_1(self.feature_conv(x)))
        )
        output_aux = self.aux(aux_input)

        # Final Feature-module output -> pyramid pooling -> per-pixel class scores.
        pooled = self.pyramid_pooling(self.feature_dilated_res_2(aux_input))
        output = self.decode_feature(pooled)

        return (output, output_aux)

## 1. Feature 모듈

Encoder 모듈 - 입력 이미지의 특징 파악

(3, 475, 475) -> (2048, 60, 60) feature map

총 5개의 sub-network로 구성

 - 1개의 FeatureMap_convolution
 - 2개의 ResidualBlockPSP - 각각 1개의 bottleNeckPSP와 다수의 bottleNeckIdentifyPSP로 구성 (block_config 참고)
 - 2개의 dilated ResidualBlockPSP (별도 클래스 없이 ResidualBlockPSP에 dilation=2, 4를 지정하여 구현)

### 1.1 FeatureMap_convolution

합성곱, 배치 정규화, ReLU를 세트로 하는 conv2DBatchNormRelu 3개 + max pooling

(3, 475, 475) -> (128, 119, 119)

In [3]:
class conv2DBatchNormRelu(nn.Module):
    """Apply ``Conv2d`` -> ``BatchNorm2d`` -> ``ReLU`` in sequence.

    Args:
        in_channels: Input channels of the convolution.
        out_channels: Output channels of the convolution (and the number of
            features normalised by the batch norm).
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
        dilation: Convolution dilation.
        bias: Whether the convolution has a bias term.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, bias):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.batchnorm = nn.BatchNorm2d(num_features=out_channels)
        # inplace=True makes the ReLU overwrite its input tensor instead of
        # allocating a new one (memory-saving option).
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(batchnorm(conv(x)))``."""
        return self.relu(self.batchnorm(self.conv(x)))

In [4]:
# Stem of the Feature module: three conv2DBatchNormRelu layers followed by a max-pooling layer.
class FeatureMap_convolution(nn.Module):
    """Stem that turns a 3-channel image into a 128-channel feature map.

    The shape comments below assume a (3, 475, 475) input, for which the
    result is (128, 119, 119).
    """

    def __init__(self):
        super().__init__()

        # Conv1: the only strided convolution -> (64, 238, 238)
        self.cbnr_1 = conv2DBatchNormRelu(
            in_channels=3, out_channels=64, kernel_size=3,
            stride=2, padding=1, dilation=1, bias=False,
        )

        # Conv2: stride 1 -> (64, 238, 238)
        self.cbnr_2 = conv2DBatchNormRelu(
            in_channels=64, out_channels=64, kernel_size=3,
            stride=1, padding=1, dilation=1, bias=False,
        )

        # Conv3: stride 1, doubles the channels -> (128, 238, 238)
        self.cbnr_3 = conv2DBatchNormRelu(
            in_channels=64, out_channels=128, kernel_size=3,
            stride=1, padding=1, dilation=1, bias=False,
        )

        # MaxPooling -> (128, 119, 119)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        """Apply the three conv blocks in order, then max pooling."""
        features = self.cbnr_3(self.cbnr_2(self.cbnr_1(x)))
        return self.maxpool(features)  # (128, 119, 119)

### 1.2 ResidualBlockPSP

bottleNeckPSP를 한 번 통과한 후 bottleNeckIdentifyPSP를 반복 적용 (ResNet50에서 사용하는 block_config [3, 4, 6, 3]에 지정된 블록 수에서 bottleNeckPSP 1개를 제외한 횟수, 즉 n_blocks - 1번 반복)

In [6]:
# Conv2d + BatchNorm2d without a trailing ReLU.
class conv2DBatchNorm(nn.Module):
    """Apply ``Conv2d`` followed by ``BatchNorm2d`` (no activation).

    Args:
        in_channels: Input channels of the convolution.
        out_channels: Output channels of the convolution (and the number of
            features normalised by the batch norm).
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
        dilation: Convolution dilation.
        bias: Whether the convolution has a bias term.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, bias):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.batchnorm = nn.BatchNorm2d(num_features=out_channels)

    def forward(self, x):
        """Return ``batchnorm(conv(x))``."""
        return self.batchnorm(self.conv(x))

#### 1.2.1 bottleNeckPSP

skip-connection 쪽 입력 x에도 1x1 conv(stride 적용)를 통과시켜 F(x)와 채널 수·크기를 맞춘 뒤 더함

In [5]:
class bottleNeckPSP(nn.Module):
    """Bottleneck residual block with a projection (1x1 conv) shortcut.

    The main path F(x) is 1x1 conv -> 3x3 conv -> 1x1 conv. The shortcut
    applies a 1x1 conv + batch norm to ``x`` so that it has ``out_channels``
    channels and the same stride as the main path before the two are summed.

    Args:
        in_channels: Channels of the input tensor.
        mid_channels: Channels used inside the bottleneck.
        out_channels: Channels of the output tensor.
        stride: Stride of the 3x3 convolution and of the shortcut projection.
        dilation: Dilation (and padding) of the 3x3 convolution.
    """

    def __init__(self, in_channels, mid_channels, out_channels, stride, dilation):
        super().__init__()

        # F(x)
        self.cbr_1 = conv2DBatchNormRelu(
            in_channels, mid_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)
        # padding == dilation keeps a 3x3 kernel "same"-padded when stride is 1
        self.cbr_2 = conv2DBatchNormRelu(
            mid_channels, mid_channels, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False)
        # No ReLU here: the activation is applied after the residual sum.
        self.cb_3 = conv2DBatchNorm(
            mid_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)

        # Shortcut: 1x1 conv projecting x to out_channels with the same stride as F(x)
        self.cb_residual = conv2DBatchNorm(
            in_channels, out_channels, kernel_size=1, stride=stride, padding=0, dilation=1, bias=False)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(F(x) + projection(x))``."""
        # The last layer of the main path is registered as ``cb_3``; the
        # previous code called the non-existent ``cbr_3`` and raised
        # AttributeError on every forward pass.
        conv = self.cb_3(self.cbr_2(self.cbr_1(x)))
        residual = self.cb_residual(x)

        return self.relu(conv + residual)  # F(x) + x (skip connection)

#### 1.2.2 bottleNeckIdentifyPSP

down-sampling 없이 skip-conn

In [7]:
class bottleNeckIdentifyPSP(nn.Module):
    """Bottleneck residual block with an identity shortcut.

    The main path F(x) is 1x1 conv -> 3x3 conv -> 1x1 conv and maps
    ``in_channels`` back to ``in_channels``; the input is added unchanged.

    Args:
        in_channels: Channels of both the input and the output tensor.
        mid_channels: Channels used inside the bottleneck.
        stride: Accepted for signature parity with ``bottleNeckPSP`` but not
            used; every convolution in this block has stride 1.
        dilation: Dilation (and padding) of the 3x3 convolution.
    """

    def __init__(self, in_channels, mid_channels, stride, dilation):
        super().__init__()

        # F(x): squeeze -> 3x3 (possibly dilated) -> expand
        self.cbr_1 = conv2DBatchNormRelu(
            in_channels, mid_channels,
            kernel_size=1, stride=1, padding=0, dilation=1, bias=False,
        )
        self.cbr_2 = conv2DBatchNormRelu(
            mid_channels, mid_channels,
            kernel_size=3, stride=1, padding=dilation, dilation=dilation, bias=False,
        )
        self.cb_3 = conv2DBatchNorm(
            mid_channels, in_channels,
            kernel_size=1, stride=1, padding=0, dilation=1, bias=False,
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(F(x) + x)``."""
        main_path = self.cb_3(self.cbr_2(self.cbr_1(x)))
        # Identity shortcut: x is added as-is, with no projection or down-sampling.
        return self.relu(main_path + x)

In [10]:
class ResidualBlockPSP(nn.Sequential):
    """One residual stage: a projection bottleneck, then identity bottlenecks.

    Registers ``block1`` (a ``bottleNeckPSP``) followed by ``n_blocks - 1``
    ``bottleNeckIdentifyPSP`` modules named ``block2``, ``block3``, ...

    Args:
        n_blocks: Total number of bottleneck blocks in the stage.
        in_channels: Channels entering the stage.
        mid_channels: Channels used inside every bottleneck.
        out_channels: Channels leaving the stage.
        stride: Forwarded to every block.
        dilation: Forwarded to every block.
    """

    def __init__(self, n_blocks, in_channels, mid_channels, out_channels, stride, dilation):
        super().__init__()

        # First block: bottleNeckPSP, which changes the channel count.
        self.add_module(
            'block1',
            bottleNeckPSP(in_channels, mid_channels, out_channels, stride, dilation),
        )

        # Remaining blocks: bottleNeckIdentifyPSP, numbered from 2 up to n_blocks.
        for block_number in range(2, n_blocks + 1):
            self.add_module(
                f'block{block_number}',
                bottleNeckIdentifyPSP(out_channels, mid_channels, stride, dilation),
            )

## 2. Pyramid Pooling

Feature 모듈의 최종 출력 (2048, 60, 60) tensor를 입력으로 받음

해당 입력은 5개로 분기되고, 그 중 4개는 각각 AdaptiveAvgPool2d를 통과(multi-scale 처리). 마지막 분기는 (2048, 60, 60) 별다른 처리없이 그대로 4개 분기의 출력과 최종적으로 결합

 - P6 -> (2048, 6, 6)
 - P3 -> (2048, 3, 3)
 - P2 -> (2048, 2, 2)
 - P1 -> (2048, 1, 1)

4개의 분기는 pooling 이후 각각 conv2DBatchNormRelu를 통과하여 크기는 유지한 채 동일한 512 채널을 갖도록 함

 - P6 -> conv2DBatchNormRelu -> (512, 6, 6)
 - P3 -> conv2DBatchNormRelu -> (512, 3, 3)
 - P2 -> conv2DBatchNormRelu -> (512, 2, 2)
 - P1 -> conv2DBatchNormRelu -> (512, 1, 1)

Up-sampling을 통해 크기를 60으로 맞춰줌

 - P6 -> conv2DBatchNormRelu -> Up-sample -> (512, 60, 60)
 - P3 -> conv2DBatchNormRelu -> Up-sample -> (512, 60, 60)
 - P2 -> conv2DBatchNormRelu -> Up-sample -> (512, 60, 60)
 - P1 -> conv2DBatchNormRelu -> Up-sample -> (512, 60, 60)

5개 분기의 출력들을 결합 (512x4 + 2048 = 4096 채널)


multi scale 정보를 가짐. 다양한 scale의 feature map을 사용하기 때문에 높은 정밀도를 가진 semantic 분할이 가능.



In [15]:
class PyramidPooling(nn.Module):
    """Pyramid pooling: four pooled branches concatenated with the input.

    Each branch adaptively average-pools the input to one of the first four
    ``pool_sizes``, reduces it to ``int(in_channels / len(pool_sizes))``
    channels with a 1x1 ``conv2DBatchNormRelu``, and is bilinearly resized to
    ``(height, width)`` before concatenation along the channel axis.

    Args:
        in_channels: Channels of the input feature map.
        pool_sizes: Output sizes of the adaptive average pools; entries 0-3
            are used.
        height: Height the branches are resized to (should match the input).
        width: Width the branches are resized to (should match the input).
    """

    def __init__(self, in_channels, pool_sizes, height, width):
        super().__init__()

        self.height = height
        self.width = width
        branch_channels = int(in_channels / len(pool_sizes))

        def reduce_channels():
            # 1x1 conv block bringing a pooled branch down to branch_channels.
            return conv2DBatchNormRelu(
                in_channels, branch_channels,
                kernel_size=1, stride=1, padding=0, dilation=1, bias=False,
            )

        # One pooling layer + 1x1 conv per branch (pool_sizes: [6, 3, 2, 1] in PSPNet).
        self.avpool_1 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[0])
        self.cbr_1 = reduce_channels()  # (512, 6, 6)

        self.avpool_2 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[1])
        self.cbr_2 = reduce_channels()  # (512, 3, 3)

        self.avpool_3 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[2])
        self.cbr_3 = reduce_channels()  # (512, 2, 2)

        self.avpool_4 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[3])
        self.cbr_4 = reduce_channels()  # (512, 1, 1)

    def forward(self, x):
        """Return ``x`` concatenated with the four up-sampled branches."""
        branches = (
            (self.avpool_1, self.cbr_1),
            (self.avpool_2, self.cbr_2),
            (self.avpool_3, self.cbr_3),
            (self.avpool_4, self.cbr_4),
        )
        target_size = (self.height, self.width)

        # Pool -> 1x1 conv -> bilinear up-sampling back to target_size, e.g. (512, 60, 60).
        upsampled = [
            F.interpolate(reduce(pool(x)), size=target_size,
                          mode='bilinear', align_corners=True)
            for pool, reduce in branches
        ]

        # Channel-wise concat of the input and the four branches, e.g. (4096, 60, 60).
        return torch.cat([x, *upsampled], dim=1)

## 3. Decoder & AuxLoss

Pyramid pooling, Feature 모듈의 출력 tensor를 decode한 후 원래 크기 (475, 475)로 up-sampling

둘 다 최종 출력은 (21, 475, 475)

In [11]:
class DecodePSPFeature(nn.Module):
    """Decoder head: 3x3 conv block -> dropout -> 1x1 classifier -> up-sampling.

    Expects a 4096-channel input (hard-coded) and returns ``n_classes``
    channels bilinearly resized to ``(height, width)``.

    Args:
        height: Output height.
        width: Output width.
        n_classes: Number of output channels.
    """

    def __init__(self, height, width, n_classes):
        super().__init__()

        self.height = height
        self.width = width

        self.cbr = conv2DBatchNormRelu(
            in_channels=4096,
            out_channels=512,
            kernel_size=3,
            stride=1,
            padding=1,
            dilation=1,
            bias=False,
        )
        self.dropout = nn.Dropout2d(p=0.1)
        # 1x1 convolution producing one score map per class.
        self.classification = nn.Conv2d(
            in_channels=512,
            out_channels=n_classes,
            kernel_size=1,
            stride=1,
            padding=0,
        )

    def forward(self, x):
        """Return class scores resized to ``(height, width)``."""
        scores = self.classification(self.dropout(self.cbr(x)))
        return F.interpolate(
            scores,
            size=(self.height, self.width),
            mode="bilinear",
            align_corners=True,
        )

In [12]:
class AuxiliaryPSPlayers(nn.Module):
    """Auxiliary head: 3x3 conv block -> dropout -> 1x1 classifier -> up-sampling.

    Returns ``n_classes`` channels bilinearly resized to ``(height, width)``.

    Args:
        in_channels: Channels of the incoming feature map.
        height: Output height.
        width: Output width.
        n_classes: Number of output channels.
    """

    def __init__(self, in_channels, height, width, n_classes):
        super().__init__()

        self.height = height
        self.width = width

        self.cbr = conv2DBatchNormRelu(
            in_channels=in_channels,
            out_channels=256,
            kernel_size=3,
            stride=1,
            padding=1,
            dilation=1,
            bias=False,
        )
        self.dropout = nn.Dropout2d(p=0.1)
        # 1x1 convolution producing one score map per class.
        self.classification = nn.Conv2d(
            in_channels=256,
            out_channels=n_classes,
            kernel_size=1,
            stride=1,
            padding=0,
        )

    def forward(self, x):
        """Return auxiliary class scores resized to ``(height, width)``."""
        scores = self.classification(self.dropout(self.cbr(x)))
        return F.interpolate(
            scores,
            size=(self.height, self.width),
            mode="bilinear",
            align_corners=True,
        )

## 확인

In [13]:
# Assemble the full network from the Feature, Pyramid Pooling, Decoder and AuxLoss modules.
class PSPNet(nn.Module):
    """PSPNet semantic-segmentation network.

    Chains the Feature module (stem convolution followed by four residual
    stages), the Pyramid Pooling module and the Decoder. An auxiliary
    classification head branches off the output of the third residual stage.

    The spatial sizes are hard-coded in ``__init__``: both heads resize their
    output to 475x475 and the pyramid branches are resized to 60x60.

    Args:
        n_classes: Number of output channels produced by both heads.
    """

    def __init__(self, n_classes):
        super().__init__()

        # Hard-coded configuration.
        blocks_1, blocks_2, blocks_3, blocks_4 = [3, 4, 6, 3]  # resnet50
        input_size = 475
        feature_size = 60

        # 1. Feature module: stem + two plain and two dilated residual stages.
        self.feature_conv = FeatureMap_convolution()

        self.feature_res_1 = ResidualBlockPSP(
            n_blocks=blocks_1,
            in_channels=128, mid_channels=64, out_channels=256,
            stride=1, dilation=1,
        )
        self.feature_res_2 = ResidualBlockPSP(
            n_blocks=blocks_2,
            in_channels=256, mid_channels=128, out_channels=512,
            stride=2, dilation=1,
        )

        self.feature_dilated_res_1 = ResidualBlockPSP(
            n_blocks=blocks_3,
            in_channels=512, mid_channels=256, out_channels=1024,
            stride=1, dilation=2,
        )
        self.feature_dilated_res_2 = ResidualBlockPSP(
            n_blocks=blocks_4,
            in_channels=1024, mid_channels=512, out_channels=2048,
            stride=1, dilation=4,
        )

        # 2. Pyramid Pooling: four pooled branches plus the untouched input.
        self.pyramid_pooling = PyramidPooling(
            in_channels=2048,
            pool_sizes=[6, 3, 2, 1],
            height=feature_size,
            width=feature_size,
        )

        # 3. Decoder (up-samples to the input resolution).
        self.decode_feature = DecodePSPFeature(
            height=input_size, width=input_size, n_classes=n_classes,
        )

        # 4. AuxLoss head, fed with the 1024-channel intermediate feature map.
        self.aux = AuxiliaryPSPlayers(
            in_channels=1024,
            height=input_size, width=input_size, n_classes=n_classes,
        )

    def forward(self, x):
        """Run the network and return ``(output, output_aux)``.

        ``output`` comes from the decoder on top of pyramid pooling;
        ``output_aux`` comes from the auxiliary head applied to the output of
        ``feature_dilated_res_1``.
        """
        # Intermediate Feature-module output; it feeds both the auxiliary head
        # and the last residual stage.
        aux_input = self.feature_dilated_res_1(
            self.feature_res_2(self.feature_res_1(self.feature_conv(x)))
        )
        output_aux = self.aux(aux_input)

        # Final Feature-module output -> pyramid pooling -> per-pixel class scores.
        pooled = self.pyramid_pooling(self.feature_dilated_res_2(aux_input))
        output = self.decode_feature(pooled)

        return (output, output_aux)

In [16]:
# Build the network with 21 output classes; the bare `net` expression on the
# last line makes the notebook display the module structure.
net = PSPNet(n_classes=21)
net

PSPNet(
  (feature_conv): FeatureMap_convolution(
    (cbnr_1): conv2DBatchNormRelu(
      (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (cbnr_2): conv2DBatchNormRelu(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (cbnr_3): conv2DBatchNormRelu(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (feature_res_1): ResidualBlockPSP(
    (block1): bottleNec

In [None]:
# Smoke test: push a random batch through the network.
batch_size = 2
dummy_img = torch.rand(batch_size, 3, 475, 475)  # random batch of 3-channel 475x475 inputs

# The forward pass returns the tuple (output, output_aux).
outputs = net(dummy_img)
print(outputs)