In [53]:
import torch
import torch.nn as nn

In [54]:
# Encoder block: one self-attention sub-layer followed by a feed-forward network.
encoder_layer = nn.TransformerEncoderLayer(
    d_model=512,           # number of input features (E_q = E_k = E_v)
    nhead=8,               # number of attention heads
    dim_feedforward=2048,  # dimension of the feed-forward network model (default: 2048)
    dropout=0.1,           # dropout probability (default: 0.1)
    activation='relu',     # activation of the feed-forward network (default: 'relu')
    layer_norm_eps=1e-05,  # eps value in the layer-normalization components (default: 1e-05)
    # If True, layer norm is done prior to the attention and feed-forward
    # operations, respectively; otherwise it is done after (default: False).
    norm_first=False,
    device=None,
)
encoder_layer

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

In [55]:
# Input layout is (seq_len, batch, feature) = (10, 32, 512): encoder_layer is
# built without batch_first=True, so the sequence dimension comes first, not
# the batch dimension.
src = torch.rand(10, 32, 512)
encoder_layer(
    # the sequence to the encoder layer (required)
    src=src,
    # the mask for the src sequence (optional); this is the attn_mask
    # argument of the multi-head attention
    src_mask=None,
)

tensor([[[-1.8692, -0.1515,  1.3918,  ..., -1.7462, -0.8727, -0.8541],
         [-0.9987, -0.4768,  0.4260,  ...,  1.2531, -0.9542,  0.3313],
         [-2.1213, -0.7085,  0.4212,  ..., -0.1166, -0.3093,  1.9479],
         ...,
         [-0.3635, -1.0434,  1.0164,  ..., -0.6747, -1.4213,  1.6979],
         [-1.8652,  0.2515,  1.6178,  ..., -0.7412,  0.5465,  1.4093],
         [-0.3876,  0.7023,  1.2365,  ...,  1.1873, -0.3462,  0.9643]],

        [[-0.0787, -1.0650, -1.4129,  ...,  1.6642, -0.3840, -0.4622],
         [ 1.0155, -0.1173, -0.9439,  ...,  0.5986, -0.1919,  0.9009],
         [-1.5084, -1.6509, -0.2305,  ...,  0.6838, -1.4779,  0.2462],
         ...,
         [-0.5524, -0.5299, -1.1652,  ...,  0.6802, -0.2298,  2.2992],
         [-0.5457, -0.5121,  1.6386,  ..., -0.0680, -1.5193,  1.8945],
         [-0.1030,  0.3906,  0.6759,  ..., -0.0294, -0.8602,  1.3786]],

        [[-0.8895, -1.5635, -1.2232,  ...,  0.7131,  0.2084,  1.7777],
         [-0.4362, -0.9548, -0.7247,  ..., -0

In [56]:
# Encoder: a stack of num_layers encoder blocks built from encoder_layer.
transformer_encoder = nn.TransformerEncoder(
    encoder_layer=encoder_layer,
    num_layers=6,
)
# Run the whole stack on src without an attention mask and display the result.
out_src_en = transformer_encoder(src=src, mask=None)
out_src_en

tensor([[[-1.0774,  0.8736,  0.5131,  ..., -1.3639, -1.7990, -0.7536],
         [-0.6161,  0.2664,  0.8244,  ..., -0.5872, -1.8289, -0.5937],
         [-0.7910,  0.8121,  1.0874,  ..., -0.7002, -0.6048, -1.0531],
         ...,
         [-1.5661,  0.4009,  1.5873,  ..., -2.1164, -1.6451,  0.5340],
         [-2.4691, -0.1829,  0.6436,  ..., -1.9597, -1.5631,  1.0037],
         [-0.7987,  0.4233,  0.4577,  ..., -1.4027, -0.9080, -0.2784]],

        [[-0.2829,  0.3466,  0.0111,  ..., -0.2984, -1.8073, -0.3881],
         [-0.6042,  0.2372,  0.5459,  ..., -0.0340, -1.2929, -0.1769],
         [-0.5447, -0.1324, -0.3678,  ...,  0.7825, -1.5298, -0.6594],
         ...,
         [-0.8469, -0.0421, -0.2172,  ..., -1.7386, -1.6365, -0.1385],
         [-1.0369, -0.2661,  1.2981,  ..., -0.3540, -2.0920,  0.5069],
         [-0.6639, -0.2423,  0.3724,  ..., -1.4955, -1.0934,  0.1709]],

        [[-0.4786,  0.5861, -0.9579,  ..., -0.7987, -2.2739,  0.5027],
         [-0.4958, -0.1908,  0.5486,  ..., -1

In [57]:
# Decoder block, configured with the same hyperparameters as the encoder block.
decoder_layer = nn.TransformerDecoderLayer(
    d_model=512,           # number of input features
    nhead=8,               # number of attention heads
    dim_feedforward=2048,  # dimension of the feed-forward network model (default: 2048)
    dropout=0.1,           # dropout probability (default: 0.1)
    activation='relu',     # activation of the feed-forward network (default: 'relu')
    layer_norm_eps=1e-05,  # eps value in the layer-normalization components (default: 1e-05)
    norm_first=False,      # apply layer norm after each sub-layer (default: False)
    device=None,
)
decoder_layer

TransformerDecoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
  (dropout3): Dropout(p=0.1, inplace=False)
)

In [58]:
# Random stand-ins for the encoder output (memory) and the decoder input (tgt).
# decoder_layer is built without batch_first=True, so both are laid out as
# (seq_len, batch, feature); they share batch=32 and feature=512 and differ
# only in sequence length.
memory = torch.rand(10, 32, 512)  # source sequence length 10
tgt = torch.rand(20, 32, 512)     # target sequence length 20
decoder_layer(
    tgt=tgt,           # the sequence to the decoder layer (required)
    memory=memory,     # the sequence from the last layer of the encoder (required)
    tgt_mask=None,     # the mask for the tgt sequence (optional)
    memory_mask=None,  # the mask for the memory sequence (optional)
)

tensor([[[ 0.7794,  0.3020,  0.8144,  ...,  0.2260,  0.7677, -1.4454],
         [ 0.5657,  1.9052,  0.5356,  ...,  0.7191, -0.4774,  0.1977],
         [ 0.7530, -0.5442, -0.6075,  ..., -1.3486,  0.1988, -2.2017],
         ...,
         [-1.1420,  0.1529, -1.4197,  ..., -1.2877,  1.3177, -1.4238],
         [ 0.0899,  1.4249, -0.4623,  ...,  0.5833,  2.5259, -1.2417],
         [-0.4235,  0.4794,  0.9448,  ..., -0.0151,  0.5411, -1.6328]],

        [[-1.3327,  1.3460,  0.4856,  ...,  0.1713,  2.2537,  0.0601],
         [-0.8601,  1.9882, -0.0268,  ..., -0.7024, -0.3792, -1.5450],
         [-0.9184, -0.6669, -0.2834,  ..., -0.9357, -0.4182, -0.1712],
         ...,
         [ 0.7501,  1.5452, -0.7583,  ..., -0.2121, -0.3210, -0.7857],
         [ 1.2439,  0.4849, -1.2655,  ..., -1.3022, -0.3788,  0.3491],
         [ 0.2848,  1.2148,  1.1248,  ..., -0.4752,  1.3973, -0.3600]],

        [[ 0.6340, -1.0626, -0.7325,  ...,  1.0751, -1.1455, -0.7567],
         [ 1.4131,  0.0188,  1.3580,  ...,  0

In [59]:
# Decoder (not the encoder): a stack of num_layers decoder blocks built from
# decoder_layer.
transformer_decoder = nn.TransformerDecoder(
    decoder_layer=decoder_layer,
    num_layers=6,
)
# tgt_mask / memory_mask are the attn_mask arguments of the multi-head
# attention; None means no mask is supplied.
out_src_de = transformer_decoder(
    tgt=tgt,
    memory=memory,
    tgt_mask=None,
    memory_mask=None,
)
out_src_de

tensor([[[-0.2073,  0.4202,  0.8138,  ...,  0.0120,  1.7593,  1.1015],
         [ 0.0341,  1.7794,  1.1330,  ...,  0.6530,  1.2724,  1.2094],
         [ 0.5981, -0.3376,  0.2681,  ..., -0.9261,  1.7796,  0.2171],
         ...,
         [-0.5149,  0.8838,  0.4348,  ..., -0.5097,  0.9922,  0.8278],
         [-0.6107,  0.7843,  0.3483,  ..., -0.1543,  1.8903,  0.2653],
         [-0.1463,  1.5205,  1.4868,  ..., -0.3387,  2.2893,  0.6776]],

        [[-0.8949,  0.9534,  0.7246,  ...,  0.3101,  2.2774,  1.5423],
         [ 0.0929,  0.8505,  0.4263,  ...,  0.0185,  0.5956,  0.6751],
         [-0.1499, -0.3266, -0.3626,  ..., -0.5831,  0.5289,  1.0730],
         ...,
         [-0.4351,  1.1652,  0.7969,  ..., -0.5113,  1.3693,  0.6590],
         [ 0.2015,  1.1427,  0.3377,  ..., -0.0245,  0.6900,  0.3974],
         [ 0.1684,  0.7656,  0.6523,  ..., -0.2814,  1.8408,  0.7199]],

        [[-0.1547, -0.1658,  0.7835,  ...,  0.3473,  0.6477,  1.1922],
         [ 0.5024,  0.3546,  1.0316,  ...,  0

In [60]:
# nn.Transformer with every argument spelled out at its default value.
transformer_model_default = nn.Transformer(
    d_model=512, nhead=8,
    num_encoder_layers=6, num_decoder_layers=6,
    dim_feedforward=2048, dropout=0.1, activation='relu',
    custom_encoder=None, custom_decoder=None,  # None: no custom stacks supplied
    layer_norm_eps=1e-05, norm_first=False,
    device=None,
)
transformer_model_default

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, in

In [61]:
# Same arguments as the default model, except that the encoder and decoder
# stacks built in the earlier cells are plugged in through custom_encoder /
# custom_decoder.
# NOTE(review): when custom stacks are supplied, nn.Transformer presumably does
# not build its own, so the layer hyperparameters below (num_*_layers,
# dim_feedforward, dropout, activation, layer_norm_eps, norm_first) would not
# affect them -- confirm against the installed PyTorch version.
transformer_model_obj = nn.Transformer(
    d_model=512, nhead=8,
    num_encoder_layers=6, num_decoder_layers=6,
    dim_feedforward=2048, dropout=0.1, activation='relu',
    custom_encoder=transformer_encoder,  # user-supplied encoder stack
    custom_decoder=transformer_decoder,  # user-supplied decoder stack
    layer_norm_eps=1e-05, norm_first=False,
    device=None,
)
transformer_model_obj

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, in

In [None]:
# Full encoder-decoder forward pass; no target mask or memory mask is supplied.
transformer_model_default(
    src=src,
    tgt=tgt,
    tgt_mask=None,
    memory_mask=None,
)