In [3]:
import torch
from torch import nn

# Build one Transformer encoder layer to inspect: model width 12, 3 attention
# heads, batch-first inputs. The feed-forward width is not passed, so the
# library default applies.
tel = nn.TransformerEncoderLayer(
    d_model=12,
    nhead=3,
    batch_first=True,
)

In [16]:
# Walk the parameters registered on the layer's self-attention sub-module and
# print each parameter's name next to its tensor shape.
attn_named_params = tel.self_attn.named_parameters()
for name, param in attn_named_params:
    print(f'{name}: {param.shape}')

in_proj_weight: torch.Size([36, 12])
in_proj_bias: torch.Size([36])
out_proj.weight: torch.Size([12, 12])
out_proj.bias: torch.Size([12])


In [28]:
# Build a throwaway MultiheadAttention (embed_dim=12, 3 heads) only to display
# its output-projection sub-module.
nn.MultiheadAttention(embed_dim=12,num_heads=3).out_proj

NonDynamicallyQuantizableLinear(in_features=12, out_features=12, bias=True)

In [None]:
# NOTE(review): same expression as the executed cell above; this copy has no
# recorded execution count or output, so it is likely a leftover that can go.
nn.MultiheadAttention(embed_dim=12,num_heads=3).out_proj

In [None]:
# NOTE(review): another unexecuted copy of the `out_proj` display expression
# used earlier in the notebook; likely a leftover that can be deleted.
nn.MultiheadAttention(embed_dim=12,num_heads=3).out_proj

In [10]:
# Display the encoder layer's repr, i.e. the tree of its registered sub-modules.
tel

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=12, out_features=12, bias=True)
  )
  (linear1): Linear(in_features=12, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=12, bias=True)
  (norm1): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

In [9]:
# Per-layer output shapes and parameter counts for an input of shape (5, 3, 12).
# NOTE(review): in the recorded output the MultiheadAttention row shows "--" as
# its parameter count, so the reported total may not include the self-attention
# weights; cross-check against named_parameters() before quoting it.
from torchinfo import summary as sm

sm(tel, input_size=(5, 3, 12))

Layer (type:depth-idx)                   Output Shape              Param #
TransformerEncoderLayer                  --                        --
├─MultiheadAttention: 1-1                [5, 3, 12]                --
├─Dropout: 1-2                           [5, 3, 12]                --
├─LayerNorm: 1-3                         [5, 3, 12]                24
├─Linear: 1-4                            [5, 3, 2048]              26,624
├─Dropout: 1-5                           [5, 3, 2048]              --
├─Linear: 1-6                            [5, 3, 12]                24,588
├─Dropout: 1-7                           [5, 3, 12]                --
├─LayerNorm: 1-8                         [5, 3, 12]                24
Total params: 51,260
Trainable params: 51,260
Non-trainable params: 0
Total mult-adds (M): 0.26
Input size (MB): 0.00
Forward/backward pass size (MB): 0.25
Params size (MB): 0.21
Estimated Total Size (MB): 0.46

In [6]:
# Smoke test: push a random (5, 3, 12) tensor through the layer and show the
# output shape. The layer was built with batch_first=True and d_model=12, so
# the last dimension must be 12 and dim 0 is the batch dimension.
tel(torch.rand(5,3,12)).shape

torch.Size([5, 3, 12])

### External attention modules

In [32]:
import os
# Directory holding the external attention implementations, given relative to
# the kernel's current working directory and resolved to an absolute path.
EXT_ATTNS_DIR = './ext_attns/'
ext_path = os.path.abspath(EXT_ATTNS_DIR)

In [34]:
import sys
# Put the external-attention directory on the import path (presumably so the
# `model.attention.*` imports in later cells can resolve — confirm the layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if ext_path not in sys.path:
    sys.path.append(ext_path)

In [35]:
import torch
# Random sample tensor of shape (5, 5, 21), values drawn uniformly from [0, 1).
x = torch.rand((5, 5, 21))

'/home/yp/workspace/nlp-sm/ext_attns'

In [None]:
# 1. External attention: "Beyond Self-attention: External Attention using Two Linear Layers for Visual Tasks"
import torch
from model.attention.ExternalAttention import ExternalAttention

# Shape smoke test: feed a random (50, 49, 512) tensor and print the output shape.
# NOTE(review): `input` rebinds the Python builtin of the same name in this kernel.
input = torch.randn(50, 49, 512)
ea = ExternalAttention(
    d_model=512,
    S=8,
)
output = ea(input)
print(output.shape)

In [None]:
# 2. Self-attention (scaled dot-product): "Attention Is All You Need"
import torch
from model.attention.SelfAttention import ScaledDotProductAttention

# Shape smoke test on a random (50, 49, 512) tensor.
# NOTE(review): `input` rebinds the Python builtin of the same name in this kernel.
input = torch.randn(50, 49, 512)
sa = ScaledDotProductAttention(
    d_model=512,
    d_k=512,
    d_v=512,
    h=8,
)
# The same tensor is passed three times (presumably as queries, keys and
# values — TODO confirm against the module's forward signature).
output = sa(input, input, input)
print(output.shape)

In [None]:
# 3. Simplified self-attention (reference: None)
import torch
from model.attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention

# Shape smoke test on a random (50, 49, 512) tensor.
# NOTE(review): `input` rebinds the Python builtin of the same name in this kernel.
input = torch.randn(50, 49, 512)
ssa = SimplifiedScaledDotProductAttention(
    d_model=512,
    h=8,
)
# The same tensor is passed three times (presumably as queries, keys and
# values — TODO confirm against the module's forward signature).
output = ssa(input, input, input)
print(output.shape)

In [None]:
# 10. Efficient Multi-Head Self-Attention: "ResT: An Efficient Transformer for Visual Recognition"
import torch
from torch import nn
from torch.nn import functional as F

from model.attention.EMSA import EMSA

# Shape smoke test on a random (50, 64, 512) tensor. The 64 positions presumably
# correspond to the H*W = 8*8 grid passed below — TODO confirm.
# NOTE(review): `input` rebinds the Python builtin of the same name in this kernel.
input = torch.randn(50, 64, 512)
emsa = EMSA(
    d_model=512,
    d_k=512,
    d_v=512,
    h=8,
    H=8,
    W=8,
    ratio=2,
    apply_transform=True,
)
output = emsa(input, input, input)
print(output.shape)

In [None]:
# 13. MUSE attention: "MUSE: Parallel Multi-Scale Attention for Sequence to Sequence Learning"
import torch
from torch import nn
from torch.nn import functional as F

from model.attention.MUSEAttention import MUSEAttention

# Shape smoke test on a random (50, 49, 512) tensor.
# NOTE(review): `input` rebinds the Python builtin of the same name in this kernel.
input = torch.randn(50, 49, 512)
# NOTE(review): `sa` is the same name the scaled-dot-product self-attention
# cell binds, so running this cell replaces that object in the kernel.
sa = MUSEAttention(
    d_model=512,
    d_k=512,
    d_v=512,
    h=8,
)
output = sa(input, input, input)
print(output.shape)

In [None]:
# 16. AFT: "An Attention Free Transformer"
import torch
from torch import nn
from torch.nn import functional as F

from model.attention.AFT import AFT_FULL

# Shape smoke test on a random (50, 49, 512) tensor; `n=49` presumably has to
# match the tensor's second dimension — TODO confirm.
# NOTE(review): `input` rebinds the Python builtin of the same name in this kernel.
input = torch.randn(50, 49, 512)
aft_full = AFT_FULL(
    d_model=512,
    n=49,
)
output = aft_full(input)
print(output.shape)

In [None]:
# 30. UFO attention: "UFO-ViT: High Performance Linear Vision Transformer without Softmax" (ArXiv 2021.09.29)
import torch
from torch import nn
from torch.nn import functional as F

# Import only the class this cell uses instead of `import *`, so it is clear
# where the name comes from and no unrelated names from the module leak into
# (or silently overwrite names in) the notebook namespace.
from model.attention.UFOAttention import UFOAttention

# The `__main__` guard is kept from the original; inside a notebook kernel
# `__name__` is '__main__', so the body runs like any other cell.
if __name__ == '__main__':
    # Shape smoke test on a random (50, 49, 512) tensor.
    # NOTE(review): `input` rebinds the Python builtin of the same name in this kernel.
    input = torch.randn(50, 49, 512)
    ufo = UFOAttention(d_model=512, d_k=512, d_v=512, h=8)
    output = ufo(input, input, input)
    print(output.shape)  # [50, 49, 512]