In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
# Prefer the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with learned linear maps, split into
    ``num_heads`` slices of width ``d_k = d_model // num_heads``, attended
    independently per head, merged back together and passed through a final
    output projection.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # The model width has to divide evenly across the heads.
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads  # width of a single head
        # Query / key / value projections, then the output projection.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V``.

        When ``mask`` is given, every score at a position where
        ``mask == 0`` is replaced by ``-1e9`` before the softmax.
        """
        scale = math.sqrt(self.d_k)
        # Swap the last two axes of K so the product contracts over d_k.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        # Normalise over the last axis of the score tensor.
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, d_k)``."""
        batch_size, seq_length, _ = x.size()
        # First split the feature axis into (num_heads, d_k), then move the
        # head axis in front of the sequence axis.
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        # .contiguous() is needed because .view() cannot be applied to the
        # transposed (non-contiguous) tensor.
        seq_major = x.transpose(1, 2).contiguous()
        return seq_major.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project Q/K/V, attend per head, merge heads and project the result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        merged = self.combine_heads(context)
        return self.W_o(merged)
    
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    The first layer maps ``d_model`` features to ``d_ff`` features and the
    second maps them back to ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply expand -> ReLU -> project to ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)
    
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to the input.

    A table of shape ``(1, max_seq_length, d_model)`` is precomputed once:
    even feature columns hold ``sin(pos * freq)`` and odd feature columns hold
    ``cos(pos * freq)``. Both even and odd values of ``d_model`` are supported.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        # Encoding table, one row per position up to the maximum length.
        pe = torch.zeros(max_seq_length, d_model)
        # Column vector of positions, shape (max_seq_length, 1), so that it
        # broadcasts against the row of frequencies below.
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so only the first d_model // 2 frequencies are used for the cosine
        # part (the unsliced assignment raised a shape mismatch).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer: saved/loaded with state_dict() and moved by
        # .to(), but not a trainable parameter. The extra leading axis makes
        # the table (1, max_seq_length, d_model).
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        # The size-1 leading axis of the table broadcasts over x's first axis.
        return x + self.pe[:, :x.size(1)]

In [8]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # LayerNorm normalises over the last (feature) axis.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is passed to self-attention."""
        # Sub-layer 1: self-attention with residual connection.
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)
        # Sub-layer 2: feed-forward network with residual connection.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual addition
    and its own layer normalisation.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run ``x`` through the block.

        ``tgt_mask`` is passed to the self-attention over ``x``; ``src_mask``
        is passed to the cross-attention, whose keys and values are
        ``enc_output``.
        """
        # Sub-layer 1: self-attention over the decoder input.
        self_context = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.norm1(x + self_context)
        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        cross_context = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + cross_context)
        # Sub-layer 3: feed-forward network.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm3(x + transformed)

In [9]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer that maps source/target token ids to logits.

    Token id 0 is treated as padding when the attention masks are built.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        # Embedding tables turn token ids into d_model-dimensional vectors.
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Stacks of num_layers layers (the "N" of the original paper).
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        # Final projection from model features to target-vocabulary logits.
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the boolean attention masks for ``src`` and ``tgt``.

        Returns ``(src_mask, tgt_mask)``. For 2-D id tensors of shape
        ``(batch, src_len)`` and ``(batch, tgt_len)``:

        * ``src_mask`` has shape ``(batch, 1, 1, src_len)`` and is ``True``
          where the source id is non-zero.
        * ``tgt_mask`` has shape ``(batch, 1, tgt_len, tgt_len)``; entry
          ``[b, 0, i, j]`` is ``True`` when ``tgt[b, i]`` is non-zero and
          ``j <= i`` (no look-ahead).

        The masks are created on the same device as the inputs, so the method
        does not rely on any notebook-level ``device`` variable.
        """
        # Padding mask for the source, broadcastable over heads and queries.
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        # Padding mask for the target, laid out along the query axis.
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # 1 - strict-upper-triangle = lower triangle including the diagonal:
        # position i may attend to positions 0..i only.
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)).bool()
        # Element-wise AND combines the padding mask with the causal mask.
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape ``(..., tgt_vocab_size)`` for each target position."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

### 升维操作 `(src != 0).unsqueeze(1).unsqueeze(2)` 和 `(tgt != 0).unsqueeze(1).unsqueeze(3)`:

- **目的**: 生成源序列和目标序列的掩码，标识出哪些位置是有效的（非零元素）。
- **解释**:
  - `(src != 0)` 和 `(tgt != 0)` 的结果是一个布尔张量，表示非零位置为 `True`，零位置为 `False`。
  - `unsqueeze(1)` 操作将张量在维度 1 上插入一个维度，形状变为 `(batch_size, 1, seq_length)`。这是为了对齐注意力机制中的维度。
  - `unsqueeze(2)` 和 `unsqueeze(3)` 操作类似，将张量在维度 2 或 3 上插入一个维度，形状变为 `(batch_size, 1, 1, seq_length)` 或 `(batch_size, 1, seq_length, 1)`。这是为了对齐后续的掩码操作。

### 特殊掩码 `(1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()`:

- **目的**: 生成一个下三角（含主对角线）的因果掩码，防止目标序列中当前位置之后的信息被用于预测当前位置。
- **解释**:
  - `torch.ones(1, seq_length, seq_length)` 创建一个全为 1 的矩阵。
  - `torch.triu(..., diagonal=1)` 生成一个上三角矩阵，主对角线及其以下的元素设为零。
  - `1 - ...` 对矩阵取反：主对角线以上（严格上三角）的元素变为 0，主对角线及其以下的元素变为 1。
  - `.bool()` 将矩阵元素转换为布尔类型。

- 对于位置 i，`nopeak_mask` 第 i 行中第 0 列到第 i 列（含第 i 列）的元素是 `True`，表示位置 i 可以依赖于自身及之前的所有位置。
- 对于位置 i，`nopeak_mask` 第 i 行中第 i 列右侧（列号大于 i）的元素是 `False`，表示位置 i 不可以依赖于未来的位置。



In [10]:
# Hyperparameters of the toy experiment.
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 1200
max_seq_length = 30
dropout = 0.1

# Seed the RNG so weight initialisation, the random data and dropout are
# repeatable from one "Restart & Run All" to the next.
torch.manual_seed(0)

# Build the model and move it to the device chosen in the first cell instead
# of a hard-coded 'cuda' string.
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
transformer = transformer.to(device)

# Generate random sample data directly on the target device. Ids start at 1
# because 0 is the padding id (see ignore_index below).
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length), device=device)  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length), device=device)  # (batch_size, seq_length)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

# Each "epoch" here is a single optimisation step on the same fixed batch.
for epoch in range(200):
    optimizer.zero_grad()
    # Teacher forcing: feed tgt[:, :-1] and predict the next token tgt[:, 1:].
    output = transformer(src_data, tgt_data[:, :-1])
    logits = output.reshape(-1, tgt_vocab_size)
    targets = tgt_data[:, 1:].reshape(-1)
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 8.68315315246582
Epoch: 2, Loss: 8.472329139709473
Epoch: 3, Loss: 8.33831787109375
Epoch: 4, Loss: 8.227945327758789
Epoch: 5, Loss: 8.12308406829834
Epoch: 6, Loss: 7.986129283905029
Epoch: 7, Loss: 7.8567728996276855
Epoch: 8, Loss: 7.706655502319336
Epoch: 9, Loss: 7.568012714385986
Epoch: 10, Loss: 7.426093101501465
Epoch: 11, Loss: 7.30232572555542
Epoch: 12, Loss: 7.160962104797363
Epoch: 13, Loss: 7.02187967300415
Epoch: 14, Loss: 6.886958122253418
Epoch: 15, Loss: 6.762116432189941
Epoch: 16, Loss: 6.637843132019043
Epoch: 17, Loss: 6.5150980949401855
Epoch: 18, Loss: 6.3844685554504395
Epoch: 19, Loss: 6.269604206085205
Epoch: 20, Loss: 6.155383586883545
Epoch: 21, Loss: 6.040956020355225
Epoch: 22, Loss: 5.936882972717285
Epoch: 23, Loss: 5.821682453155518
Epoch: 24, Loss: 5.707643508911133
Epoch: 25, Loss: 5.618677139282227
Epoch: 26, Loss: 5.497015953063965
Epoch: 27, Loss: 5.398855209350586
Epoch: 28, Loss: 5.308420658111572
Epoch: 29, Loss: 5.193527698516