In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math 

# Attention 模块的输入和输出是什么，关键的参数有哪些？
## 输入维度 d_model
输入向量的维度。
## 输出的维度，也就是 q、k、v 的大小
q、k、v 的维度，即要把输入维度 d_model 投影到多少维；如果是多头注意力，则为 head 数 × 每个 head 的维度（比如 128 × 56）。
## 参数，Wq，Wk，Wv
参数在模型结构中，只需要关注大小和初始化
torch.nn.Parameter 和 torch.randn 有本质区别：torch.randn 只返回一个普通张量，不会被注册到模块中；用 torch.nn.Parameter 包装之后，张量才会出现在 model.parameters() 里并被优化器更新。因此模型参数必须使用 torch.nn.Parameter。


In [21]:


class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values with three learned
    matrices, and each position's output is the softmax-weighted sum of the
    values:

        softmax(Q @ K^T / sqrt(d_out)) @ V

    Args:
        d_model: size of the last dimension of the input.
        d_out: size of the projected queries, keys and values, which is also
            the size of the last dimension of the output.
    """

    def __init__(self, d_model, d_out):
        super().__init__()
        self.d_model = d_model
        self.d_out = d_out
        # Wrapping in nn.Parameter registers the tensors with the module, so
        # they show up in .parameters() / .state_dict() and receive gradients.
        # A bias-free nn.Linear(d_model, d_out) would express the same
        # projection, but its .weight is stored as (d_out, d_model), so it
        # cannot be swapped in for these matrices without a transpose.
        self.W_q = nn.Parameter(torch.randn(d_model, d_out))
        self.W_k = nn.Parameter(torch.randn(d_model, d_out))
        self.W_v = nn.Parameter(torch.randn(d_model, d_out))

    def forward(self, input):
        """Apply self-attention over the second-to-last dimension of `input`.

        Args:
            input: tensor of shape (..., seq_len, d_model); any leading
                dimensions are treated as batch dimensions.

        Returns:
            Tensor of shape (..., seq_len, d_out).
        """
        q = torch.matmul(input, self.W_q)
        k = torch.matmul(input, self.W_k)
        v = torch.matmul(input, self.W_v)
        # Swap only the last two dims: `.T` reverses every dimension and so
        # breaks for batched input.
        # Scale by sqrt of the query/key size (d_out), not the input size, so
        # the logits keep a comparable variance when d_out != d_model.
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_out ** 0.5)
        attn_weights = F.softmax(attn_scores, dim=-1)
        return torch.matmul(attn_weights, v)



In [22]:
# Build an attention module whose input and projection sizes are both 4.
atten = SelfAttention(d_model=4, d_out=4)
# Random 3x4 test input; its last dimension matches d_model.
# NOTE(review): no seed is set in the visible cells, so these values change on
# every run. The name `input` also shadows the Python built-in of the same name.
input = torch.randn(3,4)
# Bare last expression so the notebook displays the tensor.
input

tensor([[-0.8005, -1.4133, -0.5584, -0.0384],
        [-0.2368,  0.1479, -1.8579,  0.1989],
        [ 0.3247, -0.1453, -1.5111, -0.1312]])

In [23]:

# Run the attention module on the test input (calling the module runs forward()).
output = atten(input)
# Bare last expression so the notebook displays the result.
output

tensor([[-0.5675,  0.5820,  0.3099,  0.0855],
        [-0.3227,  0.5517, -0.5577,  0.1724],
        [-0.2842,  0.5550, -0.6964,  0.2047]], grad_fn=<MmBackward0>)