# transformer

    - 位置编码：使得输入序列并行训练
    - 多头自注意力机制：
    - 多头注意力机制：
    - add & Nom : 残差连接、层归一化
    - 

# 位置编码

## 正弦、余弦函数编码

In [30]:
# 正弦、余弦函数编码
import numpy as np
import torch
import torch.nn as nn

class PositionalEncodeing(nn.Module):
    def __init__(self,d_model,max_length = 1000):
        '''初始化方法
        
        parameter
        --------------
        d_model ：int
            嵌入向量维度
        max_length ：int
            最大序列长度    
        '''
        super().__init__()
        # 创建位置编码矩阵
        pe = torch.zeros(max_length,d_model)
        # 创建一个一维张量，其元素为从0到max_length-1，便是序列中的各个位置
        # 将形状转（max_length,)换为（max_length,1)，便于后续计算
        position = torch.arange(0,max_length,dtype=torch.float).unsqueeze(1)
        # exp(log(a)*b) = a^b

        div_trem = torch.exp(torch.arange(0,d_model,2) * (-np.log(10000.0)/d_model))
        # d_model必须为偶数，保证奇数长度与偶数长度相同
        # Position*div_trem.shape = (max_length,d_model/2)
        pe[:,0::2] = torch.sin(position * div_trem)
        pe[:,1::2] = torch.cos(position * div_trem)
        # 将pe注册为模型的缓冲区
        # 缓冲区时pytorch中的一种特殊属性，其不会被计算图追踪，不会更新梯度
        # 但是，成为缓冲区后，会成为state_dict的一部分，会随着模型一起保存和加载
        # 当注册缓冲区后，变量就会绑定当前对象，成为当前对象属性
        # 注册属性与绑定属性的区别:
            # 1、缓冲区会随着模型一起保存和加载，但是绑定属性无此功能
            # 2、缓冲区与模型参数一样，会随着模型一起迁移，但绑定属性无此功能
        self.register_buffer('pe',pe)
    
    def forward(self,x):
        # x.shape = (batch_size,seq_length,d_model)
        # 将词嵌入向量与位置张量相加
        x + self.pe[:x.size(1)]
        return x
    
def test():
    d_model = 2
    max_length = 5
    batch_szie = 2
    pos = PositionalEncodeing(d_model,max_length)
    print('位置编码张量形状',pos.pe.shape)

    x = torch.zeros(batch_szie,max_length,d_model)
    x_with_pos = pos(x)
    print(x_with_pos)

test()

位置编码张量形状 torch.Size([5, 2])
tensor([[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]])


## 可训练位置编码  
优势：  
    - 灵活性，更具任务自适应调整  
    - 模型性能，可训练编码可以带给模型更好的性能

缺点：  
    - 外推能力，由于编码可训练，无法更好的泛化到训练时为见过的序列长度  
    - 计算成本

In [40]:
class PositionalEncodeing2(nn.Module):
    '''可训练位置编码
    通过可训练的嵌入层，实现位置编码
    '''

    def __init__(self,d_model,max_length):
        super().__init__()
        self.d_model = d_model
        self.max_length = max_length
        # 创建可学习的位置嵌入
        self.position_embedding = nn.Embedding(max_length,d_model)

    def forward(self,x):
        seq_len = x.size(1)
        # 生成位置索引
        pos_indices = torch.arange(seq_len,device =x.device,dtype=torch.long).unsqueeze(0)
        # 从嵌入层中获取位置编码
        pos_embedding = self.position_embedding(pos_indices)
        return x+pos_embedding
    
def test():
    d_model = 2
    max_length = 5
    batch_szie = 2
    pos = PositionalEncodeing2(d_model,max_length)
    print('位置编码张量形状',pos.position_embedding.weight.shape)

    x = torch.zeros(batch_szie,max_length,d_model)
    x_with_pos = pos(x)
    print(x_with_pos)

test()

位置编码张量形状 torch.Size([5, 2])
tensor([[[ 0.6606, -1.6223],
         [ 0.4468,  0.9839],
         [ 0.3566, -0.3273],
         [ 1.0180, -0.0404],
         [ 0.8518,  0.1466]],

        [[ 0.6606, -1.6223],
         [ 0.4468,  0.9839],
         [ 0.3566, -0.3273],
         [ 1.0180, -0.0404],
         [ 0.8518,  0.1466]]], grad_fn=<AddBackward0>)


# 注意力机制
q,k,v

## 自注意力机制
当q,k,v都是同一个张量，就是自注意力机制    
为什么需要自注意力，因为输入的词，经过词嵌入、位置编码，但依然没有彼此之间的关系，自注意力机制，就是，词与词之间的关系  
self_attention.shape = (seq_length,seq_length)  
例：  
seq = [x1,x3,x3,x4].shape = (n)  
embedding_seq.shape = (n,d_model)  
self_attention_seq = [[w1*x1+w2*x2+w3*x3+w4*x4],
                      [w1*x1+w2*x2+w3*x3+w4*x4],
                      [w1*x1+w2*x2+w3*x3+w4*x4].
                      [w1*x1+w2*x2+w3*x3+w4*x4]]  
self_attention_seq.shape = (n,d_model)                         
序列x.shape = (n,d_model)  
q = x*wq wq.shape = (d_model,dq)  
k = x*wk wk.shape = (d_model,dk)  
v = x*wv wv.shape = (d_model,dv)

scores = q*k.T/sqrt(dk)  
weigths = softmax(scores)  
context = weigths*v


## 多头注意力机制
将注意力矩阵中的QKV，切分成多份  
q_h = x*wq wq.shape = (d_model/h,dq)  
k_h = x*wk wk.shape = (d_model/h,dk)  
v_h = x*wv wv.shape = (d_model/h,dv)

In [44]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_model,num_head,p = 0.1):
        super().__init__()
        if d_model % num_head != 0:
            raise ValueError(f'd_model({d_model})需要能被num_head({num_head})整除')
        self.d_model = d_model
        self.num_head =num_head
        self.head_dim = d_model//num_head
        self.q_proj = nn.Linear(d_model,d_model)
        self.k_proj = nn.Linear(d_model,d_model)
        self.v_proj = nn.Linear(d_model,d_model)
        self.out_proj = nn.Linear(d_model,d_model)
        self.dropout = nn.Dropout(p)

    def forward(self,query,keys,values,atten_mask = None,key_padding_mask = None):
        '''parameters
        -----------------
        query:torh.tensor shape = (batch_size,traget_seq_len,d_model)
            查询张量，在编码器中，target就是srcoe
        keys:torh.tensor shape = (batch_size,src_seq_len,d_model)
        values:torh.tensor shape = (batch_size,src_seq_len,d_model)

        return
        ---------------------
        output : torch.tensor shape = (batch_szie,tgt_seq_length,d_model)
        atten_weigths : torch.tensor shape = (batch_szie,num_head,tgt_seq_length,src_seq_length)
        '''
        batch_szie = query.size(0)
        # 线性变换
        q = self.q_proj(query)
        k = self.k_proj(keys)
        v = self.v_proj(values)

        # 将qkv拆分为多个头
        # q.shape = (batch_szie,num_head,tgt_seq_length,head_dim)
        q = q.view(batch_szie,-1,self.num_head,self.head_dim).transpose(1,2)
        k = k.view(batch_szie,-1,self.num_head,self.head_dim).transpose(1,2)
        v = v.view(batch_szie,-1,self.num_head,self.head_dim).transpose(1,2)

        # 计算注意力权重
        scores = torch.matmul(q,k.transpose(-2,-1))/self.head_dim**0.5
        atten_weigths = torch.softmax(scores,dim=-1)
        atten_weigths = self.dropout(atten_weigths)

        # 计算注意力输出
        atten_output = torch.matmul(atten_weigths,v)

        # 合并多头
        atten_output = atten_output.transpose(1,2).contiguous().view(batch_szie,-1,self.d_model)
        # 线性变换输出
        output = self.out_proj(atten_output)
        return output,atten_weigths

In [46]:
def test():
    d_model = 10
    batch_szie = 2
    src_seq_length = 4
    tgt_seq_length = 3
    num_head = 2


    q = torch.randn(batch_szie,tgt_seq_length,d_model)
    k = torch.randn(batch_szie,src_seq_length,d_model)
    v = torch.randn(batch_szie,src_seq_length,d_model)

    atten = MultiHeadAttention(d_model,num_head)
    output,atten_weigths = atten(q,k,v)
    print('输出形状',output.shape)
    print('注意力权重形状',atten_weigths.shape)
test()
    

输出形状 torch.Size([2, 3, 10])
注意力权重形状 torch.Size([2, 2, 3, 4])
