In [1]:
import torch
import time
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
torch.cuda.is_available()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("当前设备名称：", torch.cuda.get_device_name(device) if device.type == 'cuda' else 'CPU')

当前设备名称： NVIDIA GeForce RTX 3080


> 生成词索引构成的序列

In [2]:
batch_size = 2
max_num_src_words = 8
max_num_tgt_words = 8
model_dim = 8
max_src_seq_len = 5
max_tgt_seq_len = 5
max_position_len = 5
# src_len = torch.randint(2,5,(batch_size,))
# tgt_len = torch.randint(2,5,(batch_size,))
src_len = torch.Tensor([2, 4]).to(torch.int32)
tgt_len = torch.Tensor([4, 3]).to(torch.int32)
# 词索引构成的序列
src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_src_words, (L,)), (0, max_src_seq_len-L)), 0) \
                     for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max_tgt_seq_len-L)), 0) \
                     for L in tgt_len])
print(src_seq)
print(tgt_seq)

tensor([[7, 4, 0, 0, 0],
        [7, 2, 5, 4, 0]])
tensor([[2, 3, 6, 5, 0],
        [4, 5, 5, 0, 0]])


> 采用Embedding的forward方法

In [3]:
# 实例化embedding # requires_grad 
# weight是随机的，训练时更新
src_embedding_table = nn.Embedding(max_num_src_words+1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1, model_dim)
# print(src_embedding_table.weight, tgt_embedding_table.weight)
# Embedding存在 --call-- forward方法隐式
src_embedding = src_embedding_table(src_seq)
tgt_embedding_table = tgt_embedding_table(tgt_seq)
print(src_embedding)
# 由于为0 底下3排一样

tensor([[[ 0.3390,  0.2224,  1.7262, -2.0615, -1.6018,  0.4065,  0.5479,
           0.2578],
         [ 0.7082,  0.0977,  0.6959,  1.0974, -0.7529,  0.6294,  0.7810,
          -0.6112],
         [-0.8659,  0.3368, -0.1845, -1.0268, -0.1665, -0.6293, -0.5064,
           1.3912],
         [-0.8659,  0.3368, -0.1845, -1.0268, -0.1665, -0.6293, -0.5064,
           1.3912],
         [-0.8659,  0.3368, -0.1845, -1.0268, -0.1665, -0.6293, -0.5064,
           1.3912]],

        [[ 0.3390,  0.2224,  1.7262, -2.0615, -1.6018,  0.4065,  0.5479,
           0.2578],
         [ 2.6824,  0.9109, -0.0814,  0.9007, -0.3474,  2.5647, -0.4796,
          -1.1515],
         [-1.3354,  1.2109, -1.0373, -0.4644, -0.4364, -0.0551, -1.2579,
           0.2740],
         [ 0.7082,  0.0977,  0.6959,  1.0974, -0.7529,  0.6294,  0.7810,
          -0.6112],
         [-0.8659,  0.3368, -0.1845, -1.0268, -0.1665, -0.6293, -0.5064,
           1.3912]]], grad_fn=<EmbeddingBackward0>)


> 生成pos—embedding

In [4]:
# pos embedding    method1 
start_time = time.time()
pos_embedding = torch.zeros(max_position_len, model_dim)
for i in range(0, model_dim):
    for j in range(0, max_position_len):
        if i%2 == 0:
            # print(i,j)
            pos_embedding[j, i] = np.sin(j/np.power(10000, (i/model_dim)))
        else:
            pos_embedding[j, i] = np.cos(j/np.power(10000, (i/model_dim)))
# print(pos_embedding)
end_time = time.time()
# 计算执行时间并转换为微秒
execution_time_microseconds = (end_time - start_time) * 1e6
# 打印执行时间
print("np Time (microseconds):", execution_time_microseconds)
#///////////////////////////////////////////////////////////////////////////////////////////
# pos embedding    method2 
start_time = time.time()
pos_mat = torch.arange(max_position_len).reshape(-1, 1) 
i_mat = torch.pow(10000, torch.arange(0, 8, 2).reshape((1, -1))/model_dim)
i1_mat = torch.pow(10000, torch.arange(1, 8, 2).reshape((1, -1))/model_dim)
# print(pos_mat, i_mat, i1_mat)
pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat/i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat/i1_mat)
print(pe_embedding_table)
end_time = time.time()
# 计算执行时间并转换为微秒
execution_time_microseconds = (end_time - start_time) * 1e6
# 打印执行时间
print("torch Time (microseconds):", execution_time_microseconds)
#///////////////////////////////////////////////////////////////////////////////////////////
# print(torch.sub(pos_embedding, pe_embedding_table))
pe_embedding = nn.Embedding(max_position_len, model_dim)
# 位置编码训练中不发生变化requires_grad=False
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)
# 注意 这里给embedding的是位置 不是词的编号
# src_p=torch.zeros_like(src_seq)
# for i in range(src_p.size(0)):
#     for j in range(src_p.size(1)):
#         src_p[i][j] = j 
# print(src_p)
# src_pe_embedding =pe_embedding(src_p)
src_pe_embedding =pe_embedding(torch.cat([pos_mat,pos_mat], 1).transpose(-1,-2))
print(torch.cat([pos_mat,pos_mat], 1).transpose(-1,-2))
print(src_pe_embedding)
tgt_pe_embedding = src_pe_embedding

np Time (microseconds): 1003.9806365966797
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  9.5042e-01,  9.9833e-02,  9.9950e-01,  9.9998e-03,
          9.9999e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01,  8.0658e-01,  1.9867e-01,  9.9800e-01,  1.9999e-02,
          9.9998e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01,  5.8275e-01,  2.9552e-01,  9.9550e-01,  2.9995e-02,
          9.9995e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01,  3.0114e-01,  3.8942e-01,  9.9201e-01,  3.9989e-02,
          9.9992e-01,  4.0000e-03,  1.0000e+00]])
torch Time (microseconds): 4999.399185180664
tensor([[0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4]])
tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  9.5042e-01,  9.9833e-02,  9.9950e-01,  9.9998e-03,
           9.9999e-01,  1.0000e-03,  1.0000e+

> 生成enc_self_attn_mask 无因果关系   对词向量相似度进行masked

In [5]:
# 这里不是三角矩阵# 这里不是三角矩阵 没有因果关系 多用矩阵相乘 转置 升维
vaild_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0, max(src_len)-L)),0) for L in src_len]), 2)
# print(vaild_encoder_pos.shape, "\n", vaild_encoder_pos)
# vaild_encoder_pos = torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0, max(src_len)-L)),0) for L in src_len])
print(vaild_encoder_pos.shape, "\n", vaild_encoder_pos)
# 这里不是三角矩阵
# print(torch.tril(torch.ones(4, 4)))
# vaild_mask = vaild_encoder_pos.transpose(1,2)*torch.ones(4, 4)*vaild_encoder_pos
# print(vaild_mask.shape, "\n", vaild_mask)
# print(torch.bmm(torch.unsqueeze(torch.ones(4, 4),0), torch.unsqueeze(torch.ones(4, 4),0)))
# print(torch.ones(4, 4)*torch.ones(4, 4))
valid_encoder_pos_matrix = torch.bmm(vaild_encoder_pos, vaild_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1-valid_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)
print(mask_encoder_self_attention)
#假设是vk注意力权重
score = torch.randn(batch_size, max(src_len), max(src_len))
masked_score = score.masked_fill(mask_encoder_self_attention, -np.inf)
prob = F.softmax(masked_score, -1)
print(prob)

torch.Size([2, 4, 1]) 
 tensor([[[1.],
         [1.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.]]])
tensor([[[False, False,  True,  True],
         [False, False,  True,  True],
         [ True,  True,  True,  True],
         [ True,  True,  True,  True]],

        [[False, False, False, False],
         [False, False, False, False],
         [False, False, False, False],
         [False, False, False, False]]])
tensor([[[0.0295, 0.9705, 0.0000, 0.0000],
         [0.3952, 0.6048, 0.0000, 0.0000],
         [   nan,    nan,    nan,    nan],
         [   nan,    nan,    nan,    nan]],

        [[0.1989, 0.2732, 0.3632, 0.1647],
         [0.6721, 0.0277, 0.2126, 0.0876],
         [0.4523, 0.1101, 0.1889, 0.2487],
         [0.1467, 0.1221, 0.4143, 0.3169]]])
