In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Softmax


def INF(B, H, W, device=None, dtype=None):
    """Build the additive self-mask used for the column (height) attention branch.

    Args:
        B: batch size.
        H: feature-map height; each mask matrix is (H, H).
        W: feature-map width; one mask matrix is produced per (batch, column) pair.
        device: optional device for the returned tensor. ``None`` keeps the
            PyTorch default device.
        dtype: optional floating-point dtype for the returned tensor. ``None``
            keeps the PyTorch default dtype.

    Returns:
        Tensor of shape (B*W, H, H) holding ``-inf`` on each main diagonal and
        zero everywhere else.
    """
    # Negate after building the diagonal, as the original expression did, so
    # the off-diagonal entries stay bit-identical for three-argument calls.
    diagonal = torch.full((H,), float("inf"), device=device, dtype=dtype)
    return -torch.diag(diagonal, 0).unsqueeze(0).repeat(B * W, 1, 1)


class CrissCrossAttention(nn.Module):
    """Criss-cross attention over a 2-D feature map.

    For every spatial position, attention weights are computed jointly over
    the positions in the same column and in the same row (the position itself
    is masked out of the column branch so it is counted only once). The value
    features are aggregated with those weights, scaled by the learnable scalar
    ``gamma`` (initialised to zero) and added back to the input.

    Args:
        in_dim: number of input channels C. Queries and keys are projected to
            ``in_dim // 8`` channels; values keep ``in_dim`` channels.
    """

    def __init__(self, in_dim):
        super(CrissCrossAttention, self).__init__()
        # 1x1 convolutions; query/key use a reduced channel count (C // 8).
        self.query_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim//8, kernel_size=1)  # dim reduction
        self.key_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim//8, kernel_size=1)  # dim reduction
        self.value_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)

        # Softmax runs over the last axis of the concatenated (N, H, W, H+W) energies.
        self.softmax = Softmax(dim=3)
        self.INF = INF
        # Residual scale; starts at zero so the module is initially an identity map.
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        '''Apply criss-cross attention.

        Input: local feature map (N, C, H, W). Shape comments use c = C // 8.
        Returns a tensor of shape (N, C, H, W): gamma * (out_H + out_W) + x.
        '''
        m_batchsize, _, height, width = x.size()

        proj_query = self.query_conv(x)  # (N, c, H, W)
        # (N, c, H, W) -> (N, W, c, H) -> (NW, c, H) -> (NW, H, c)
        proj_query_H = proj_query.permute(0, 3, 1, 2).contiguous().view(m_batchsize*width, -1, height).permute(0, 2, 1)
        # (N, c, H, W) -> (N, H, c, W) -> (NH, c, W) -> (NH, W, c)
        proj_query_W = proj_query.permute(0, 2, 1, 3).contiguous().view(m_batchsize*height, -1, width).permute(0, 2, 1)

        proj_key = self.key_conv(x)  # (N, c, H, W)
        # (N, c, H, W) -> (N, W, c, H) -> (NW, c, H)
        proj_key_H = proj_key.permute(0, 3, 1, 2).contiguous().view(m_batchsize*width, -1, height)
        # (N, c, H, W) -> (N, H, c, W) -> (NH, c, W)
        proj_key_W = proj_key.permute(0, 2, 1, 3).contiguous().view(m_batchsize*height, -1, width)

        proj_value = self.value_conv(x)  # (N, C, H, W)
        # (N, C, H, W) -> (N, W, C, H) -> (NW, C, H)
        proj_value_H = proj_value.permute(0, 3, 1, 2).contiguous().view(m_batchsize*width, -1, height)
        # (N, C, H, W) -> (N, H, C, W) -> (NH, C, W)
        proj_value_W = proj_value.permute(0, 2, 1, 3).contiguous().view(m_batchsize*height, -1, width)

        # (NW, H, c) x (NW, c, H) -> (NW, H, H)
        raw_energy_H = torch.bmm(proj_query_H, proj_key_H)
        # self.INF is called without any device/dtype information, so align its
        # result with the energies before adding. Without this the addition
        # fails when the module lives on an accelerator, and a reduced-precision
        # model would be silently promoted to the mask's dtype.
        self_mask = self.INF(m_batchsize, height, width).to(device=raw_energy_H.device, dtype=raw_energy_H.dtype)
        # (NW, H, H) -> (N, W, H, H) -> (N, H, W, H)
        energy_H = (raw_energy_H + self_mask).view(m_batchsize, width, height, height).permute(0, 2, 1, 3)
        # (NH, W, c) x (NH, c, W) -> (NH, W, W) -> (N, H, W, W)
        energy_W = torch.bmm(proj_query_W, proj_key_W).view(m_batchsize, height, width, width)

        # Joint normalisation over the column and row candidates.
        concate = self.softmax(torch.cat([energy_H, energy_W], 3))  # (N, H, W, H+W)

        # (N, H, W, H) -> (N, W, H, H) -> (NW, H, H)
        att_H = concate[:, :, :, 0:height].permute(0, 2, 1, 3).contiguous().view(m_batchsize*width, height, height)
        # (N, H, W, W) -> (NH, W, W)
        att_W = concate[:, :, :, height:height+width].contiguous().view(m_batchsize*height, width, width)

        # (NW, C, H) x (NW, H, H) -> (NW, C, H) -> (N, W, C, H) -> (N, C, H, W)
        out_H = torch.bmm(proj_value_H, att_H.permute(0, 2, 1)).view(m_batchsize, width, -1, height).permute(0, 2, 3, 1)
        # (NH, C, W) x (NH, W, W) -> (NH, C, W) -> (N, H, C, W) -> (N, C, H, W)
        out_W = torch.bmm(proj_value_W, att_W.permute(0, 2, 1)).view(m_batchsize, height, -1, width).permute(0, 2, 1, 3)

        return self.gamma*(out_H + out_W) + x

In [4]:
if __name__ == '__main__':
    # Smoke test: push a random (N, C, H, W) = (2, 64, 32, 16) feature map
    # through the module and print the shape of the result.
    model = CrissCrossAttention(in_dim=64)
    x = torch.randn((2, 64, 32, 16))
    out = model(x)
    print(out.size())

torch.Size([2, 64, 32, 16])
