In [1]:
import torch
from torch import nn

In [2]:
class MaskAttention(nn.Module):
    """Single-head causal (masked) scaled dot-product self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the query/key/value projections and of the output.
        context_length: Side length of the pre-built square causal mask.
        dropout: Dropout probability applied to the attention weights.
        qkv_bais: Whether the query/key/value projections use a bias term.
            The (misspelled) name is kept so existing keyword callers still work.
    """

    def __init__(self, d_in, d_out, context_length, dropout,
                 qkv_bais=False):
        super().__init__()
        self.d_out = d_out
        self.wq = nn.Linear(d_in, d_out, bias=qkv_bais)
        self.wk = nn.Linear(d_in, d_out, bias=qkv_bais)
        self.wv = nn.Linear(d_in, d_out, bias=qkv_bais)
        self.dropout = nn.Dropout(dropout)
        # diagonal=1 keeps the main diagonal unmasked so every token can attend
        # to itself. Masking the diagonal too would leave the first token with
        # an all -inf row, which softmax turns into NaN.
        # Registered as a buffer so it follows the module across devices;
        # persistent=False keeps it out of state_dict (same keys as before).
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
            persistent=False,
        )

    def forward(self, x):
        """Compute causal self-attention context vectors.

        Args:
            x: Tensor of shape (batch, num_tokens, d_in).

        Returns:
            Tensor of shape (batch, num_tokens, d_out).
        """
        _, num_tokens, _ = x.shape
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        attn_scores = q @ k.transpose(1, 2)
        # Crop the mask to the actual sequence length and hide future positions.
        causal_mask = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores = attn_scores.masked_fill(causal_mask, -torch.inf)

        # Scale by sqrt(d_k); -inf entries stay -inf and become zero weights.
        attn_weights = torch.softmax(attn_scores / k.shape[-1] ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = attn_weights @ v
        return context_vec

In [17]:
# Toy input: six token embeddings, one row per token, three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your    (x^1)
    [0.55, 0.87, 0.66],  # journey (x^2)
    [0.57, 0.85, 0.64],  # starts  (x^3)
    [0.22, 0.58, 0.33],  # with    (x^4)
    [0.77, 0.25, 0.10],  # one     (x^5)
    [0.05, 0.80, 0.55],  # step    (x^6)
])
d_in = inputs.size(1)  # embedding width, read from the data
d_out = 2  # width of the attention output

In [18]:
# Duplicate the toy sequence along a new leading axis to form a batch of two.
batch = torch.stack([inputs, inputs])

### 多头注意力

#### 多头注意力就是多个注意力头叠加：有多组 Wq、Wk、Wv，分别计算得到上下文向量后再合并

#### 实现多头注意力封装类

In [21]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a final output projection.

    The query/key/value projections are computed once at width ``d_out`` and
    then split into ``num_heads`` heads of width ``d_out // num_heads``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width across all heads.
        context_length: Side length of the pre-built square causal mask.
        num_heads: Number of attention heads; must divide ``d_out``.
        dropout: Dropout probability applied to the attention weights.
        qkv_bias: Whether the query/key/value projections use a bias term.

    Raises:
        AssertionError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length,
                 num_heads, dropout, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.wq = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.wk = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.wv = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer so the mask follows the module across
        # .to(device)/.cuda() calls; a plain attribute would stay on the CPU.
        # persistent=False keeps it out of state_dict (same keys as before).
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
            persistent=False,
        )

    def forward(self, x):
        """Compute multi-head causal self-attention.

        Args:
            x: Tensor of shape (batch_size, num_tokens, d_in).

        Returns:
            Tensor of shape (batch_size, num_tokens, d_out).
        """
        batch_size, num_tokens, _ = x.shape
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)  # (batch_size, num_tokens, d_out)

        # Split the projection width into heads:
        # (batch_size, num_tokens, num_heads, head_dim)
        q = q.view(batch_size, num_tokens, self.num_heads, self.head_dim)
        k = k.view(batch_size, num_tokens, self.num_heads, self.head_dim)
        v = v.view(batch_size, num_tokens, self.num_heads, self.head_dim)

        # Move the head axis forward: (batch_size, num_heads, num_tokens, head_dim)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Attention scores per head; the 2-D mask broadcasts over batch and heads.
        attn_scores = q @ k.transpose(2, 3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores = attn_scores.masked_fill(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim); masked (-inf) entries become zero weights.
        attn_weights = torch.softmax(attn_scores / k.shape[-1] ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Back to (batch_size, num_tokens, num_heads, head_dim), then merge the
        # heads into one d_out-wide vector per token.
        context_vec = (attn_weights @ v).transpose(1, 2)
        context_vec = context_vec.contiguous().view(batch_size, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)
        return context_vec

In [22]:
# Fix the RNG seed so the randomly initialised layer weights are reproducible.
torch.manual_seed(123)

# Read the sizes off the demo batch and use them to configure the module.
batch_size, context_length, d_in = batch.size()
d_out = 2
mha = MultiHeadAttention(
    d_in=d_in,
    d_out=d_out,
    context_length=context_length,
    num_heads=2,
    dropout=0.0,
)

# Run the batch through the module and show the result and its shape.
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]],

        [[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])
