In [2]:
import torch
import torch.nn as nn

In [3]:
# Encoder block: a single self-attention + feed-forward layer.
#
# Seed the global RNG before the first module is built. Weight initialisation,
# the random inputs created by later cells and dropout all draw from this
# generator, so seeding once here lets Restart Kernel -> Run All replay the
# same numbers (on the same device and PyTorch build).
SEED = 0
torch.manual_seed(SEED)
encoder_layer = nn.TransformerEncoderLayer(
    # Number of expected features in the input (required); E_q = E_k = E_v.
    d_model=512,
    # Number of attention heads.
    nhead=8,
    # Dimension of the feedforward network model (default: 2048).
    dim_feedforward=2048,
    dropout=0.1,  # default: 0.1
    activation='relu',  # default: 'relu'
    # The eps value in the layer normalization components (default: 1e-05).
    layer_norm_eps=1e-05,
    # If True, layer norm is done prior to the attention and feedforward
    # operations, respectively; otherwise it is done after (default: False).
    norm_first=False,
    device=None)
# Bare last expression: the notebook renders the module structure.
encoder_layer

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

In [4]:
# Random source tensor for the encoder layer.
# The cell that builds encoder_layer does not set batch_first, so at the
# library default (batch_first=False) this is read as (seq, batch, feature):
# 10 positions x 32 sequences x 512 features, not (batch, seq, feature).
src = torch.rand(10, 32, 512)
# src:      the sequence to the encoder layer (required).
# src_mask: the mask for the src sequence (optional); this is the attn_mask
#           argument of the multi-head attention.
encoder_layer(src=src, src_mask=None)

tensor([[[-1.7469e+00,  4.2051e-02,  1.9362e+00,  ..., -5.9903e-02,
           1.4711e-01, -7.2252e-01],
         [-2.2480e+00,  1.3206e+00, -7.8952e-01,  ...,  1.6642e+00,
           7.3171e-01, -1.2751e+00],
         [-1.6488e+00,  4.6468e-01,  1.5653e+00,  ...,  4.3992e-01,
          -2.5424e-01,  1.1500e+00],
         ...,
         [-7.3992e-02, -8.0366e-01,  8.1588e-01,  ...,  4.3400e-01,
           1.6023e+00, -7.6043e-01],
         [-1.3871e+00, -7.2627e-01,  1.7174e-02,  ...,  1.1562e-01,
           1.1124e+00,  8.4239e-01],
         [-8.9786e-01,  1.4180e+00,  1.1189e+00,  ..., -6.4031e-01,
           1.2422e+00, -4.6449e-01]],

        [[-2.4380e+00, -3.9573e-01,  1.4947e+00,  ...,  1.8465e-01,
          -7.9765e-02,  9.4587e-01],
         [-1.1278e+00,  1.5586e+00,  1.5435e+00,  ..., -3.9909e-02,
           4.9932e-01, -1.2963e-01],
         [ 1.4141e-01,  2.2784e-01,  2.2294e-01,  ...,  6.3093e-01,
           8.1622e-01, -5.5462e-01],
         ...,
         [-1.1984e+00,  1

In [5]:
# Encoder: a 6-layer stack built from the encoder_layer defined earlier.
transformer_encoder = nn.TransformerEncoder(
    encoder_layer=encoder_layer,
    num_layers=6,
)
# Run the source tensor through the stack with no attention mask.
out_src_en = transformer_encoder(src, mask=None)
# Bare last expression: the notebook renders the encoder output tensor.
out_src_en

tensor([[[-1.7223e+00, -2.5422e-01,  5.2391e-01,  ..., -3.6043e-01,
          -8.4727e-01, -1.3284e+00],
         [-1.6210e+00,  7.2879e-01, -8.3492e-01,  ...,  7.5971e-01,
          -5.4969e-01, -1.7766e+00],
         [-1.3749e+00, -4.2977e-01,  1.2220e+00,  ..., -1.3658e+00,
           1.3468e-01, -1.0810e+00],
         ...,
         [-9.4071e-01, -1.3170e+00, -1.9088e-01,  ...,  3.6259e-01,
           3.9407e-02, -1.5182e+00],
         [-8.6168e-01, -6.0159e-01,  1.0581e-02,  ..., -6.0681e-01,
          -1.4985e-01,  1.0571e-01],
         [-6.1878e-01,  6.9865e-01,  2.1431e-03,  ...,  1.0071e-01,
           4.0756e-01, -1.5029e+00]],

        [[-8.3908e-01, -8.5193e-01,  1.0251e-01,  ..., -1.0720e+00,
          -9.3323e-02, -5.5439e-01],
         [-9.5127e-01,  2.8391e-01,  4.8230e-01,  ...,  3.4062e-01,
           6.7189e-02, -1.3187e+00],
         [-1.7839e+00, -7.0469e-01,  3.7584e-01,  ..., -1.1501e+00,
           2.8152e-01, -1.0479e+00],
         ...,
         [-1.2833e+00, -5

In [6]:
# Decoder block: a single decoder layer (self-attention, attention over the
# encoder memory, then feed-forward).
decoder_layer = nn.TransformerDecoderLayer(
    d_model=512,
    nhead=8,
    # The values below are the ones the original notes mark as defaults.
    dim_feedforward=2048,  # default: 2048
    dropout=0.1,  # default: 0.1
    activation='relu',  # default: 'relu'
    layer_norm_eps=1e-05,  # default: 1e-05
    norm_first=False,  # default: False
    device=None,
)
# Bare last expression: the notebook renders the module structure.
decoder_layer

TransformerDecoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
  (dropout3): Dropout(p=0.1, inplace=False)
)

In [7]:
# Random stand-ins for the encoder output (memory) and the target sequence.
# NOTE(review): decoder_layer is built without batch_first, so at the library
# default these read as (seq, batch, feature): memory has 10 positions, tgt has
# 20, and both share a batch of 32 with 512 features.
memory = torch.rand(10, 32, 512)
tgt = torch.rand(20, 32, 512)
# tgt:         the sequence to the decoder layer (required).
# memory:      the sequence from the last layer of the encoder (required).
# tgt_mask:    the mask for the tgt sequence (optional).
# memory_mask: the mask for the memory sequence (optional).
decoder_layer(tgt=tgt, memory=memory, tgt_mask=None, memory_mask=None)

tensor([[[-2.4136e-01, -7.0166e-01, -1.8993e+00,  ...,  1.2587e+00,
           2.0225e-01,  1.7014e+00],
         [ 7.7665e-01,  9.8012e-01, -8.4711e-01,  ...,  1.6112e+00,
           2.2498e-01, -1.2817e-01],
         [ 7.2620e-01, -4.3400e-01, -1.2945e+00,  ..., -4.5379e-01,
           1.7600e-01,  3.5164e-01],
         ...,
         [-1.0200e+00,  5.1918e-02, -1.8676e+00,  ...,  1.3874e+00,
           3.5616e-01, -1.9928e-01],
         [ 3.5279e-01,  1.8399e+00, -5.3174e-01,  ..., -8.7508e-01,
          -9.7843e-01,  1.2661e+00],
         [-1.2432e-01, -2.9244e-01,  7.4572e-01,  ...,  1.9856e-01,
           1.9546e+00,  8.1695e-01]],

        [[ 8.1090e-01,  1.2328e+00,  5.7557e-01,  ..., -5.4578e-01,
           1.2031e+00, -2.5842e-01],
         [-5.2259e-01, -1.1667e+00, -1.1536e+00,  ..., -6.1386e-02,
           6.3571e-01,  1.4593e+00],
         [ 6.1783e-01,  1.3427e+00, -1.2045e+00,  ...,  3.3734e-02,
           1.7217e+00,  1.5385e+00],
         ...,
         [ 4.5740e-01,  9

In [8]:
# Decoder: a 6-layer stack built from the decoder_layer defined earlier.
transformer_decoder = nn.TransformerDecoder(
    decoder_layer=decoder_layer,
    num_layers=6,
)
# tgt_mask / memory_mask are the attn_mask arguments of the multi-head
# attention; both are left unset here.
out_src_de = transformer_decoder(
    tgt=tgt,
    memory=memory,
    tgt_mask=None,
    memory_mask=None,
)
# Bare last expression: the notebook renders the decoder output tensor.
out_src_de

tensor([[[-1.4741,  1.3153,  0.7335,  ...,  1.7569, -0.5978,  1.0754],
         [-0.5324,  1.2446,  0.8768,  ...,  1.4507, -0.5487, -0.2084],
         [-1.0658,  1.2898,  0.3674,  ...,  1.0603, -0.1856,  0.9397],
         ...,
         [-1.7659,  1.6570,  0.4399,  ...,  1.8238, -0.5380, -0.0103],
         [-0.7136,  2.4370,  0.9044,  ...,  1.7245, -1.3035,  0.7737],
         [-1.4178,  0.9352,  1.2538,  ...,  0.4309,  0.4570,  1.4094]],

        [[-0.7885,  1.8838,  0.8437,  ...,  1.5711,  0.1666,  1.0607],
         [-1.7347,  0.9000,  1.0103,  ...,  1.0445, -0.0289,  0.5028],
         [-2.0434,  1.9087,  0.6508,  ...,  1.4442, -0.5380,  0.7605],
         ...,
         [-0.9334,  1.2041,  0.4968,  ...,  1.7545, -0.8955,  0.0577],
         [-1.3306,  0.7884,  0.7297,  ...,  1.8367,  0.1691,  0.5927],
         [-1.4414,  0.7141,  1.3668,  ...,  0.8071, -0.3070,  0.8092]],

        [[-0.7734,  1.8164,  1.3775,  ...,  1.8063, -0.5843,  1.4923],
         [-1.7300,  0.8537,  1.2114,  ...,  1

In [9]:
# Full nn.Transformer with every argument written out explicitly; per the
# original note, these are the default parameter values.
transformer_model_default = nn.Transformer(
    # Model width and number of attention heads.
    d_model=512, nhead=8,
    # Depth of the encoder and decoder stacks.
    num_encoder_layers=6, num_decoder_layers=6,
    # Feed-forward sub-layer settings.
    dim_feedforward=2048, dropout=0.1, activation='relu',
    # No user-supplied encoder or decoder modules.
    custom_encoder=None, custom_decoder=None,
    # Layer normalization settings.
    layer_norm_eps=1e-05, norm_first=False,
    device=None,
)
# Bare last expression: the notebook renders the module structure.
transformer_model_default

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, in

In [10]:
# nn.Transformer assembled around the encoder and decoder stacks built in
# earlier cells, passed in as the custom encoder / decoder.
# NOTE(review): the remaining arguments repeat the values used for those
# stacks; presumably they are not used to build new layers when custom modules
# are supplied - confirm against the nn.Transformer documentation.
transformer_model_obj = nn.Transformer(
    d_model=512, nhead=8,
    num_encoder_layers=6, num_decoder_layers=6,
    dim_feedforward=2048, dropout=0.1, activation='relu',
    custom_encoder=transformer_encoder,  # user-defined encoder
    custom_decoder=transformer_decoder,  # user-defined decoder
    layer_norm_eps=1e-05, norm_first=False,
    device=None,
)
# Bare last expression: the notebook renders the module structure.
transformer_model_obj

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, in

In [11]:
# Full encoder-decoder forward pass through the default model, reusing the src
# and tgt tensors from earlier cells; no target or memory mask is applied.
transformer_model_default(
    src=src,
    tgt=tgt,
    tgt_mask=None,
    memory_mask=None,
)

tensor([[[-0.6244, -2.0542,  0.4478,  ...,  1.9199,  1.0156, -1.2590],
         [-0.4608, -1.4375,  1.2078,  ...,  1.5706, -0.0928, -1.7533],
         [ 0.0137, -0.9193,  2.2957,  ...,  1.2349, -0.0572, -1.6597],
         ...,
         [ 0.5563, -0.7968,  1.5184,  ...,  0.8981, -0.6524, -1.3620],
         [ 0.4151, -0.3644,  2.3248,  ...,  1.2957,  0.2861, -1.4721],
         [ 1.1312, -1.4982,  1.1165,  ...,  0.5287,  0.3034, -0.9692]],

        [[ 0.3215, -2.2888,  2.7028,  ...,  1.6844,  0.8484, -1.4221],
         [-0.2087, -2.3712,  0.8356,  ...,  1.8050,  0.9232, -1.4095],
         [ 0.3716, -1.3159,  2.3826,  ...,  1.9608, -0.0205, -0.4565],
         ...,
         [ 0.1239, -0.6204,  1.2942,  ...,  0.9238, -0.7001, -1.2035],
         [-0.1830, -1.4763,  1.5067,  ...,  1.8243,  0.0708, -1.7185],
         [ 0.5813, -2.1217,  1.4111,  ...,  1.0835,  0.0330, -1.0250]],

        [[ 0.7165, -1.1891,  1.9710,  ...,  2.2829,  0.7236, -2.8863],
         [-0.0607, -1.6457,  0.7646,  ...,  2