### 1.Preliminary

In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

def softmax(z, dim=-1):
    """Return the softmax of ``z`` along ``dim``.

    The exponentials are normalised along a single axis (the last one by
    default) so that every slice along ``dim`` sums to 1. Summing over the
    whole tensor would only be correct for 1-D input; for a 1-D tensor the
    result is the same as before.

    Args:
        z: Tensor of scores (logits).
        dim: Axis along which the probabilities are normalised.

    Returns:
        A tensor with the same shape as ``z``.
    """
    # Subtracting the per-slice maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing to inf (and then NaN).
    shifted = z - z.max(dim=dim, keepdim=True).values
    exp_z = torch.exp(shifted)
    return exp_z / exp_z.sum(dim=dim, keepdim=True)

def dot_product_matrix(A, B):
    """Return the matrix product of ``A`` with the transpose of ``B``.

    For 2-D inputs, entry ``[i, j]`` of the result is the dot product of
    row ``i`` of ``A`` with row ``j`` of ``B``.
    """
    b_transposed = B.T
    return A @ b_transposed

### 2. transformer 

In [None]:
class Transformer(nn.Module):
    """Skeleton of the encoder-decoder model; the forward pass is a placeholder."""

    def __init__(self, config: ModelConfig):
        """Create the encoder, the decoder and the output projection from ``config``."""
        super().__init__()

        self.encoder = Encoder(config)
        self.decoder = Decoder(config)
        # Linear projection from config.d_model features to config.tgt_vocab_size outputs.
        self.output_layer = nn.Linear(config.d_model, config.tgt_vocab_size)
        ...
    
    def forward(self, original, target):
        """Placeholder: not implemented in this skeleton (returns None)."""
        ...

#### 2.1 Word Embedding Layer

In [None]:
class navie_embedding(nn.Module):
    """Hand-written embedding layer backed by a single learnable table."""

    def __init__(self, v, d):
        """Create the ``(v, d)`` embedding table, initialised from ``torch.randn``."""
        super().__init__()
        table = torch.randn(v, d)
        self.embedding = nn.Parameter(table)

    def forward(self, x):
        """Return the table rows selected by the index tensor ``x``.

        ``x`` is documented by the author as ``(batch_size, seq_len)``; the
        result then has shape ``(batch_size, seq_len, d)``.
        """
        # Equivalent formulations kept for reference (not executed):
        #
        # One-hot encoding followed by a matrix product:
        #   x_one_hot = F.one_hot(x, num_classes=self.embedding.size(0)).float()  # (batch_size, seq_len, v)
        #   return torch.matmul(x_one_hot, self.embedding)                        # (batch_size, seq_len, d)
        #
        # torch.gather over a batch-expanded copy of the table:
        #   batch_size, seq_len = x.size()
        #   x = x.unsqueeze(-1).expand(-1, -1, self.embedding.size(1))            # (batch_size, seq_len, d)
        #   return torch.gather(self.embedding.unsqueeze(0).expand(batch_size, -1, -1), 1, x)

        # Method used here: index the table directly with the ids.
        vectors = self.embedding[x]
        return vectors

In [None]:
class WordEmbedding(nn.Module):
    """Token embedding layer: a thin wrapper around ``nn.Embedding``."""

    def __init__(self, vocab_size, d_model):
        """Create an embedding with ``vocab_size`` entries of width ``d_model``."""
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the index tensor ``x``."""
        token_vectors = self.embedding(x)
        return token_vectors

#### 2.2 Position Embedding Layer

In [None]:
# Worked example: sinusoidal positional encoding of one position, computed
# element by element.
pos = 1
d_model = 6
pe = torch.zeros(d_model)
for i in range(d_model // 2):
    # Dimensions 2i and 2i+1 form a sin/cos pair that shares one frequency,
    # 1 / 10000^(2i / d_model).
    angle = pos / (10000 ** (2 * i / d_model))
    # math.sin / math.cos are used because torch.sin / torch.cos only accept
    # tensors and raise TypeError when given a plain Python float.
    pe[2 * i] = math.sin(angle)
    pe[2 * i + 1] = math.cos(angle)
print(pe)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The table is computed once in the constructor and stored as the buffer
    ``pe`` of shape ``(1, max_len, d_model)``, so it is not a trainable
    parameter.
    """

    def __init__(self, d_model, max_len=5000):
        """Precompute the encoding table.

        Args:
            d_model: Width of the encoding. Both even and odd values are
                supported.
            max_len: Number of positions to precompute.
        """
        super().__init__()

        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        # exp(-2i * ln(10000) / d_model) == 1 / 10000^(2i / d_model)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )  # (ceil(d_model / 2),)
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        The slice of ``pe`` is taken along dimension 1 and broadcast over
        dimension 0 of ``x``.
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

#### 2.3 Attention Layer

Attention机制是Transformer的核心组件，它允许模型在处理序列时动态地关注输入序列中的不同部分。Attention机制的基本思想是通过计算查询（Query）、键（Key）和值（Value）之间的相似性Equation 3 来决定如何加权输入信息。具体来说，Attention的计算过程如下:

\[ \boxed{\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{Q K^\top}{\sqrt{d_k}}\right) V} \tag{6} \]

2.3.1 Self-Attention Layer
2.3.2 Causal Self-Attention Layer
\[ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{Q K^\top}{\sqrt{d_k}} \textcolor{red}{+ M}\right) V \tag{7} \]

2.3.3 Cross Attention Layer
\[ \text{Attention}(Q_{dec}, \textcolor{red}{K_{enc}}, \textcolor{red}{V_{enc}}) = \text{softmax}\left(\frac{Q_{dec} \textcolor{red}{K_{enc}^\top}}{\sqrt{d_k}}\right) \textcolor{red}{V_{enc}} \tag{8}\]

2.3.4 Time Complexity of Attention
\[ \begin{array}{|l|l|} \hline \textbf{Step} & \textbf{Time Complexity} \\ \hline QK^\top & \mathcal{O}(n^2 d) \\ \text{softmax}(QK^\top) & \mathcal{O}(n^2) \\ \text{attention} \times V & \mathcal{O}(n^2 d) \\ \hline \textbf{Total} & \mathcal{O}(n^2 d) \\ \hline \end{array} \]

#### 2.4 Normalization Layer 

In [None]:
class LayerNormalization(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift."""

    def __init__(self, d_model, eps=1e-6):
        """Create the scale ``gamma`` (ones) and shift ``beta`` (zeros), each of size ``d_model``."""
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply ``gamma`` and ``beta``."""
        mu = x.mean(dim=-1, keepdim=True)
        # Population (biased) variance, i.e. divided by N rather than N - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        # eps keeps the denominator away from zero for constant inputs.
        denom = torch.sqrt(sigma_sq + self.eps)
        normalised = (x - mu) / denom
        return self.gamma * normalised + self.beta

#### 2.5 Feed Forward Layer

In [None]:
class FeedForwardNetwork(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Linear."""

    def __init__(self, d_model, d_ff):
        """Create the expansion (``d_model -> d_ff``) and projection (``d_ff -> d_model``) layers."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply ``fc1``, the ReLU activation and ``fc2`` in that order."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

#### 2.6 Residual Connection

#### 2.7 Output Layer

In [None]:
class OutputLayer(nn.Module):
    """Linear projection to vocabulary size followed by a softmax."""

    def __init__(self, d_model, vocab_size):
        """Create the ``d_model -> vocab_size`` linear projection."""
        super().__init__()
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Return probabilities over the last dimension (softmax of the projected ``x``)."""
        logits = self.linear(x)
        probabilities = F.softmax(logits, dim=-1)
        return probabilities

#### 2.8 Encoder & Decoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer output is added to its input and the sum is layer-normalised.
    """

    def __init__(self, config: ModelConfig):
        """Build the attention, feed-forward and normalisation sub-modules from ``config``."""
        super().__init__()
        self.self_attention = MultiHeadSelfAttention(config)
        self.ffn = FeedForwardNetwork(config.d_model, config.d_ff)
        self.norm1 = LayerNormalization(config.d_model)
        self.norm2 = LayerNormalization(config.d_model)

    def forward(self, x):
        """Run ``x`` through the attention and feed-forward sub-layers."""
        # Self-attention: x is passed as all three attention arguments.
        attended = self.self_attention(x, x, x)
        x = self.norm1(x + attended)

        transformed = self.ffn(x)
        return self.norm2(x + transformed)
    

class DecoderLayer(nn.Module):
    """One decoder block: causal self-attention, cross-attention, feed-forward.

    Each sub-layer output is added to its input and the sum is layer-normalised.
    """

    def __init__(self, config: ModelConfig):
        """Build the two attention modules, the feed-forward network and three norms."""
        super().__init__()
        self.causal_self_attention = CausalMultiHeadSelfAttention(config)
        self.cross_attention = CrossAttention(config)
        self.ffn = FeedForwardNetwork(config.d_model, config.d_ff)
        self.norm1 = LayerNormalization(config.d_model)
        self.norm2 = LayerNormalization(config.d_model)
        self.norm3 = LayerNormalization(config.d_model)

    def forward(self, y, x_enc):
        """Run the decoder input ``y`` through the block, using the encoder output ``x_enc``."""
        # Causal self-attention over the decoder input itself.
        self_attended = self.causal_self_attention(y, y, y)
        y = self.norm1(y + self_attended)

        # Cross-attention: y is the first argument, the encoder output the other two.
        cross_attended = self.cross_attention(y, x_enc, x_enc)
        y = self.norm2(y + cross_attended)

        transformed = self.ffn(y)
        return self.norm3(y + transformed)
    


class Encoder(nn.Module):
    """Stack of ``config.num_encoder_layers`` encoder layers applied in sequence."""

    def __init__(self, config: ModelConfig):
        """Instantiate the encoder layers and register them in a ``ModuleList``."""
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(config.num_encoder_layers):
            self.layers.append(EncoderLayer(config))

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)
        return hidden
class Decoder(nn.Module):
    """Stack of ``config.num_decoder_layers`` decoder layers applied in sequence."""

    def __init__(self, config: ModelConfig):
        """Instantiate the decoder layers and register them in a ``ModuleList``."""
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(config.num_decoder_layers):
            self.layers.append(DecoderLayer(config))

    def forward(self, y, x_enc):
        """Feed ``y`` through every layer in order; each layer also receives ``x_enc``."""
        hidden = y
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, x_enc)
        return hidden

class Transformer(nn.Module):
    """Encoder-decoder model with a linear projection to the target vocabulary."""

    def __init__(self, config: ModelConfig):
        """Create the encoder, the decoder and the output projection from ``config``."""
        super().__init__()

        self.encoder = Encoder(config)
        self.decoder = Decoder(config)
        # Maps config.d_model features to config.tgt_vocab_size outputs.
        self.output_layer = nn.Linear(
            config.d_model,
            config.tgt_vocab_size,
        )
        ...

    def forward(self, original, target):
        """Encode ``original``, decode ``target`` against it, and project the result.

        The projection output is returned directly; no softmax is applied here.
        """
        memory = self.encoder(original)
        decoded = self.decoder(target, memory)
        return self.output_layer(decoded)

#### 2.11 Weight Initialization

In [None]:
class AdamOptimizer:
    """Minimal hand-written Adam optimiser.

    Weight decay, when non-zero, is added to the gradient before the moment
    updates (``g + weight_decay * p``).
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.0, eps=1e-8):
        """Store the hyper-parameters and allocate zeroed moment buffers.

        Args:
            params: Iterable of tensors to optimise; it is materialised into a list.
            lr: Step size.
            betas: Decay rates for the first and second moment estimates.
            weight_decay: Coefficient of the parameter term added to the gradient.
            eps: Constant added to the denominator of the update.
        """
        self.params = list(params)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.weight_decay = weight_decay
        # One first-moment and one second-moment buffer per parameter.
        self.m = []
        self.v = []
        for param in self.params:
            self.m.append(torch.zeros_like(param))
            self.v.append(torch.zeros_like(param))
        self.t = 0  # number of step() calls so far

    def set_lr(self, lr):
        """Replace the learning rate used by subsequent steps."""
        self.lr = lr

    @torch.no_grad()
    def step(self):
        """Apply one Adam update to every parameter that has a gradient."""
        self.t += 1
        beta1, beta2 = self.betas
        # Bias corrections depend only on the step count, so compute them once.
        bias_correction1 = 1.0 - (beta1 ** self.t)
        bias_correction2 = 1.0 - (beta2 ** self.t)

        for param, exp_avg, exp_avg_sq in zip(self.params, self.m, self.v):
            grad = param.grad
            if grad is None:
                continue

            if self.weight_decay != 0.0:
                # Out-of-place add, so param.grad itself is left untouched.
                grad = grad.add(param.data, alpha=self.weight_decay)

            # Exponential moving averages of the gradient and its square.
            exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1.0 - beta2))

            m_hat = exp_avg / bias_correction1
            v_hat = exp_avg_sq / bias_correction2

            denom = v_hat.sqrt().add_(self.eps)
            param.data.addcdiv_(m_hat, denom, value=-self.lr)

    @torch.no_grad()
    def zero_grad(self, set_to_none: bool = False):
        """Clear gradients, either zeroing them in place or dropping them."""
        for param in self.params:
            if param.grad is None:
                continue
            if set_to_none:
                param.grad = None
            else:
                param.grad.zero_()

$$\mathbf{y} = \text{LayerNorm}(\mathbf{x} + \mathrm{Sublayer}(\mathbf{x}))$$

$e^{i\pi} + 1 = 0$