In [1]:
import torch
from torch import Tensor

In [2]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build an ``sz x sz`` additive mask that hides later positions.

    Entries strictly above the main diagonal are ``float('-inf')``; entries on
    and below the diagonal are ``0.0``. The returned tensor is float32.
    """
    neg_inf = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
    # diagonal=1 keeps only the strictly-upper triangle; everything else becomes 0.
    return torch.triu(neg_inf, diagonal=1)

In [4]:
def attention_with_mask(q, k, v, mask):
    """Scaled dot-product attention with an additive mask.

    Computes ``softmax(q @ k^T / sqrt(d) + mask) @ v`` where ``d`` is the size
    of the last dimension of ``q``.

    Args:
        q: query tensor; its last dimension must match the last dimension of ``k``.
        k: key tensor.
        v: value tensor.
        mask: tensor added to the scaled scores before the softmax
            (``-inf`` entries receive zero attention weight).

    Returns:
        The attention-weighted combination of ``v``.
    """
    # Scaling factor sqrt(d), kept as a 0-dim float tensor.
    scale = torch.sqrt(torch.tensor(q.size(-1)).float())

    # Scaled similarity between every query and every key, with the mask applied.
    masked_scores = (q @ k.transpose(-2, -1)) / scale + mask

    # Normalise over the key axis to obtain the attention weights.
    weights = masked_scores.softmax(dim=-1)

    # Weighted sum of the value vectors.
    return weights @ v

In [5]:
# Example: combine the triangular (causal) mask with the attention mechanism.
seq_length = 5
# Build the mask from seq_length (not a separate literal) so that it always
# matches the sequence dimension of q/k/v below.
tgt_mask = generate_square_subsequent_mask(seq_length)
print(tgt_mask)

# Random queries, keys and values for a single sequence.
q = torch.rand((1, seq_length, 8))  # queries
k = torch.rand((1, seq_length, 8))  # keys
v = torch.rand((1, seq_length, 16))  # values
print(q)

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])
tensor([[[0.6827, 0.8715, 0.6594, 0.3215, 0.4095, 0.0578, 0.1286, 0.2718],
         [0.9719, 0.5628, 0.2867, 0.7069, 0.1371, 0.0905, 0.1335, 0.9883],
         [0.7962, 0.2180, 0.1426, 0.8030, 0.1417, 0.1458, 0.3048, 0.5056],
         [0.4402, 0.9670, 0.0383, 0.9630, 0.5657, 0.5988, 0.2132, 0.4764],
         [0.6218, 0.1403, 0.9986, 0.6016, 0.1755, 0.3002, 0.9370, 0.3618]]])


# 理解Decoder过程的mask在处理序列交叉中的“不可见”逻辑
### 将注意力权重应用到值向量上：attn_weights 的每一行与 v 的每一列相乘，等价于用各注意力权重对相应的值向量加权求和。由于被 mask 的位置权重为 0，第 i 个位置的输出只依赖于第 i 个位置及其之前的值向量；因此输出中各个位置 token 的向量就可以直接接最后的分类 layer。

In [8]:
# Step 1: scaled dot-product scores between every query and every key.
attn_scores = (q @ k.transpose(-2, -1)) / torch.sqrt(torch.tensor(q.size(-1)).float())
print("一、", attn_scores.shape)

# Step 2: add the mask to the scores (masked positions become -inf).
attn_scores = attn_scores + tgt_mask
print(attn_scores)

# Step 3: softmax over the key axis gives the attention weights.
attn_weights = attn_scores.softmax(dim=-1)
print(attn_weights)

print("v", v.shape)
print("v", v)

# Step 4: apply the weights to the values. Each row of attn_weights times the
# columns of v is the weighted sum of the value vectors, so every output
# position holds one combined vector.
attended_values = attn_weights @ v  # matrix multiplication
print(attended_values.shape)
print(attended_values)

一、 torch.Size([1, 5, 5])
tensor([[[0.7243,   -inf,   -inf,   -inf,   -inf],
         [0.8397, 0.7308,   -inf,   -inf,   -inf],
         [0.6991, 0.5937, 0.3858,   -inf,   -inf],
         [0.9776, 0.5494, 0.6603, 0.8436,   -inf],
         [0.8244, 0.8459, 0.5046, 1.0104, 0.7843]]])
tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5272, 0.4728, 0.0000, 0.0000, 0.0000],
         [0.3801, 0.3421, 0.2779, 0.0000, 0.0000],
         [0.3073, 0.2003, 0.2237, 0.2687, 0.0000],
         [0.2035, 0.2080, 0.1478, 0.2451, 0.1955]]])
v torch.Size([1, 5, 16])
v tensor([[[0.1223, 0.4286, 0.0514, 0.9821, 0.8721, 0.0588, 0.0726, 0.7067,
          0.1589, 0.7693, 0.2967, 0.4090, 0.6279, 0.1535, 0.1420, 0.5737],
         [0.2929, 0.9397, 0.2973, 0.6994, 0.3859, 0.4781, 0.5819, 0.1947,
          0.6096, 0.4671, 0.1786, 0.4330, 0.6424, 0.8087, 0.0260, 0.9049],
         [0.5693, 0.6645, 0.4188, 0.5822, 0.7503, 0.0173, 0.8950, 0.7226,
          0.1043, 0.6663, 0.1539, 0.0776, 0.0703, 0.9588, 0.96

In [9]:
# Small integer example of matrix multiplication: (2 x 2) @ (2 x 3) -> (2 x 3).
A = torch.tensor([
    [1, 2],
    [3, 4],
])
print(A)

B = torch.tensor([
    [5, 6, 9],
    [7, 8, 10],
])
print(B)

# result[i, j] is the dot product of row i of A with column j of B.
result = A @ B
print(result)

tensor([[1, 2],
        [3, 4]])
tensor([[ 5,  6,  9],
        [ 7,  8, 10]])
tensor([[19, 22, 29],
        [43, 50, 67]])
