In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

### Multi Head Attention

In [12]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, p_dropout=0.1):
        super().__init__()
        assert hidden_size % num_attention_heads == 0

        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        
        self.w_q = nn.Linear(self.hidden_size, self.hidden_size)
        self.w_k = nn.Linear(self.hidden_size, self.hidden_size)
        self.w_v = nn.Linear(self.hidden_size, self.hidden_size)
        
        self.dropout = nn.Dropout(p=p_dropout)
        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size)
    
    def forward(self, query, key, value, mask=None):
        '''
        Args:
            query:(n, b, h * d)
            key: (m, b, h * d)
            value: (m, b, h * d)
            mask: (n, m)
        '''
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
            
        nbatches = query.size(0)
        
        query, key, value = [
            lin(x).view(nbatches, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
            for lin, x in zip((self.w_q, self.w_k, self.w_v), (query, key, value))
        ]
        
        attention_scores = query @ key.transpose(-1, -2)
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # print(attention_scores.size())
        if mask is not None:
        #* masked_fill_(mask, value) Fills elements of self tensor with value where mask is True
        #* 使用-1e9而不是0用于平滑概率
            attention_scores = attention_scores.masked_fill(mask == False, -1e9)
        
        attention_probs = F.softmax(attention_scores, dim=-1)
        # print(attention_probs)
        attention_probs = self.dropout(attention_probs)
        # mask
        
        print(attention_probs.size())
        print(value.size())
        attn_output = attention_probs @ value
        print(attn_output.size())
        attn_output = attn_output.transpose(1, 2).contiguous().view(nbatches, -1, self.attention_head_size * self.num_attention_heads)
        attn_output = self.out_proj(attn_output)
        
        del query, key, value
        
        return attn_output
    

In [87]:
nbatch, seq_length, hidden_size = 3, 5, 14
mask = torch.triu(torch.ones(3, 5, 5), diagonal=1) == 0
# print(mask)
mha = MultiHeadAttention(hidden_size, 2)

q = k = v = torch.randn(nbatch, seq_length, hidden_size)
# print(mha(q, k, v, mask))
# print(mha(q, k, v))
mha(q, k, v, mask)
mha(q, k, v)

torch.Size([3, 2, 5, 5])
torch.Size([3, 2, 5, 7])
torch.Size([3, 2, 5, 7])
torch.Size([3, 2, 5, 5])
torch.Size([3, 2, 5, 7])
torch.Size([3, 2, 5, 7])


tensor([[[ 0.1199,  0.1243, -0.1686, -0.1893, -0.1900,  0.0054,  0.1330,
          -0.0005, -0.1163, -0.1168,  0.1957, -0.2002, -0.1375, -0.0690],
         [ 0.1470,  0.1595, -0.1004, -0.1963, -0.1259, -0.0901,  0.2416,
          -0.0398, -0.0937,  0.0846,  0.2394, -0.2158, -0.0494, -0.1611],
         [ 0.0830,  0.1671, -0.2460, -0.1067, -0.2407,  0.0781,  0.1123,
           0.0213, -0.1483, -0.1111,  0.1597, -0.1293, -0.2468, -0.1078],
         [ 0.1192,  0.1229, -0.1105, -0.1635, -0.2022, -0.0157,  0.1115,
           0.0268, -0.0923, -0.1301,  0.1943, -0.2377, -0.1230, -0.0905],
         [ 0.1682,  0.1337, -0.2225, -0.1858, -0.2076,  0.1221,  0.0628,
          -0.0404, -0.0194, -0.1219,  0.2253, -0.2234, -0.2180, -0.0507]],

        [[ 0.3645,  0.3415, -0.0063,  0.4457, -0.2646,  0.0780,  0.0881,
          -0.0082,  0.2552, -0.0368,  0.3430, -0.3450,  0.1320, -0.1877],
         [ 0.4069,  0.2339, -0.0800,  0.0998, -0.3201,  0.0527,  0.1317,
          -0.0243,  0.1409,  0.1563,  0.406

In [72]:
# 输入数据，假设seq_length为10
src = torch.rand((10, 32, 512))  # (seq_length, batch_size, d_model)

# 创建一个padding mask，其中填充位置为True
padding_mask = (src == 0).all(dim=-1)  # 假设填充的值为0
padding_mask.size()

torch.Size([10, 32])

In [74]:
import torch
import torch.nn as nn

# 定义一个模块
transformer_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)

# 输入数据，假设seq_length为10
src = torch.rand((32, 10, 512))  # (seq_length, batch_size, d_model)

# 创建一个padding mask，其中填充位置为True
padding_mask = (src == 0).all(dim=-1)  # 假设填充的值为0

# 将padding mask的形状扩展到匹配src的形状
# padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)  # (seq_length, 1, 1, batch_size)

# 调用forward函数时传入padding_mask参数
output = transformer_layer(src, src_key_padding_mask=padding_mask.transpose(0, 1))


### LayerNorm

In [86]:
class MyLayerNorm(nn.Module):
    def __init__(self, features, eps=1e-5, elementwise_affine=True):
        super(MyLayerNorm, self).__init__()
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            self.a_2 = nn.Parameter(torch.ones(features))
            self.b_2 = nn.Parameter(torch.zeros(features))
        else:
            self.a_2 = torch.ones(features)
            self.b_2 = torch.zeros(features)
        self.reset_parameters()
        
    def reset_parameters(self) -> None:
        if self.elementwise_affine:
            nn.init.ones_(self.a_2)
            nn.init.zeros_(self.b_2)
    
    
    
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1,keepdim=True)
        return  self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [85]:
# test
N, S, H = 2, 2, 10
input = torch.randn(N, S, H)

layer_norm_op = nn.LayerNorm([N, S, H], elementwise_affine=True) 
ln_y = layer_norm_op(input)

my_layer_norm = MyLayerNorm(input.size(), elementwise_affine=True)
verify_ln_y = my_layer_norm(input)

print(ln_y)
print(verify_ln_y)

tensor([[[-0.6084,  1.2828,  0.3061, -0.8851,  0.8505,  0.1779, -0.2187,
          -0.3460, -1.0200,  0.4999],
         [-0.3927,  1.5157,  0.2072,  2.3811,  0.1161, -0.0482,  0.5108,
          -0.6616, -1.0974,  1.1873]],

        [[-0.6979,  0.9624,  0.4424,  1.6685, -1.3118, -1.9541,  0.2597,
           1.1270, -2.2716, -0.7226],
         [ 0.1440,  0.1903, -1.6607,  0.2088, -0.9958,  1.2468, -0.2510,
           0.5317,  0.1493, -0.8226]]], grad_fn=<NativeLayerNormBackward0>)
tensor([[[-0.8130,  1.6980,  0.4012, -1.1804,  1.1241,  0.2310, -0.2955,
          -0.4645, -1.3594,  0.6585],
         [-0.7213,  1.0793, -0.1553,  1.8957, -0.2413, -0.3963,  0.1311,
          -0.9750, -1.3863,  0.7694]],

        [[-0.3327,  0.9000,  0.5139,  1.4242, -0.7884, -1.2653,  0.3783,
           1.0222, -1.5010, -0.3510],
         [ 0.3228,  0.3783, -1.8357,  0.4003, -1.0404,  1.6419, -0.1496,
           0.7866,  0.3292, -0.8333]]], grad_fn=<AddBackward0>)


In [87]:
roman_map = [(1000, 'M'), 
             (900, 'CM'), (500, 'D'), (400, 'CD'), (100, 'C'),
             (90, 'XC'), (50, 'L'), (40, 'XL'), (10, 'X'),
             (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I')]

roman_map = [(v, k) for k, v in roman_map] 
print(roman_map)

[('M', 1000), ('CM', 900), ('D', 500), ('CD', 400), ('C', 100), ('XC', 90), ('L', 50), ('XL', 40), ('X', 10), ('IX', 9), ('V', 5), ('IV', 4), ('I', 1)]
