In [48]:
import copy
import numpy as np
import torch.nn as nn
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import math
from torch.autograd import Variable
import torch.nn.functional as F


In [51]:
class Embedding(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embedding, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)
                                       
d_model = 512
vocab = 1000

x = Variable(torch.LongTensor([[100,2,421,455], [452,334,2,11]]))
print("x" , x.shape)
emb = Embedding(d_model, vocab)
embr = emb(x)
print("embr" , embr)
print("embr" , embr.shape)



x torch.Size([2, 4])
embr tensor([[[  2.7861,  -5.1170,   7.3297,  ...,   8.7727,  19.0942,  20.0988],
         [-33.2384,  10.0533, -14.4355,  ...,  -0.7453,  16.3175,   3.3592],
         [-25.0414, -48.5396, -19.7695,  ...,  -9.7313,  26.3012, -12.9687],
         [-15.6585,  -0.8350,   1.5748,  ...,  37.0881,  -3.9556,   4.2145]],

        [[-42.2173,   8.2238,  23.8643,  ...,  -1.8616,  12.3401, -33.0340],
         [-32.5913,  30.7748,   0.3495,  ...,  -8.6518,  29.2767, -32.4875],
         [-33.2384,  10.0533, -14.4355,  ...,  -0.7453,  16.3175,   3.3592],
         [-23.8018,  30.5947,  -7.3784,  ..., -13.5331,  41.4704,  -5.1295]]],
       grad_fn=<MulBackward0>)
embr torch.Size([2, 4, 512])


In [52]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len = 5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p = dropout)
        #初始化一个位置编码矩阵,大小是max_len*d_model
        pe = torch.zeros(max_len, d_model)
        
        #初始化一个绝对位置编码矩阵,大小是max_len*1
        position = torch.arange(0, max_len).unsqueeze(1)
        
        #定义一个变化矩阵，跳跃式的初始化
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        
        #奇数偶数分别赋值
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        #二维变三维
        pe = pe.unsqueeze(0)
        
        #将位置编码矩阵注册成模型的buffer，不跟随优化器同步更新
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad = False)
        return self.dropout(x)
    
    
def subsequent_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(1 - subsequent_mask)
    
x = embr
dropout = 0.2
max_len = 60
pe = PositionalEncoding(d_model, dropout, max_len)
pe_result = pe(x)
print(pe_result)
print(pe_result.shape)

size = 5
sm = subsequent_mask(size)
print(sm)
print(sm.shape)


tensor([[[  0.0000,  -5.1462,   9.1621,  ...,  12.2159,   0.0000,  26.3734],
         [-40.4962,   0.0000, -17.0170,  ...,   0.3184,  20.3970,   5.4490],
         [-30.1651,  -0.0000, -23.5414,  ..., -10.9141,  32.8768, -14.9609],
         [-19.3967,  -2.2812,   2.2749,  ...,  47.6101,  -4.9441,   0.0000]],

        [[-52.7716,  11.5298,  29.8303,  ...,  -1.0770,   0.0000, -40.0424],
         [-39.6873,  39.1438,   1.4642,  ...,  -9.5648,   0.0000,  -0.0000],
         [ -0.0000,  12.0464, -16.8738,  ...,   0.0000,  20.3972,   5.4490],
         [ -0.0000,  37.0059,  -8.9166,  ..., -15.6663,  51.8384,  -5.1618]]],
       grad_fn=<MulBackward0>)
torch.Size([2, 4, 512])
tensor([[[1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0],
         [1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1]]], dtype=torch.uint8)
torch.Size([1, 5, 5])


In [57]:
def attention(query, key, value, mask = None, dropout=None):
    #首先将query的最后一个维度提取，代表词嵌入的维度
    d_k = query.size(-1)
    print(d_k)
    
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    print("scores", scores.shape)
    
    if mask is not None:
        scores = scores.masked_fill(mask == 0 , -1e9)
        
    p_attn = F.softmax(scores, dim=-1)
    
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

query = key = value = pe_result
attn, p_attn = attention(query, key, value)

print(attn)
print(attn.shape)
print(p_attn)
print(p_attn.shape)

512
scores torch.Size([2, 4, 4])
tensor([[[  0.0000,  -5.1462,   9.1621,  ...,  12.2159,   0.0000,  26.3734],
         [-40.4962,   0.0000, -17.0170,  ...,   0.3184,  20.3970,   5.4490],
         [-30.1651,   0.0000, -23.5414,  ..., -10.9141,  32.8768, -14.9609],
         [-19.3967,  -2.2812,   2.2749,  ...,  47.6101,  -4.9441,   0.0000]],

        [[-52.7716,  11.5298,  29.8303,  ...,  -1.0770,   0.0000, -40.0424],
         [-39.6873,  39.1438,   1.4642,  ...,  -9.5648,   0.0000,   0.0000],
         [  0.0000,  12.0464, -16.8738,  ...,   0.0000,  20.3972,   5.4490],
         [  0.0000,  37.0059,  -8.9166,  ..., -15.6663,  51.8384,  -5.1618]]],
       grad_fn=<UnsafeViewBackward0>)
torch.Size([2, 4, 512])
tensor([[[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]],

        [[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]], grad_fn=<SoftmaxBackward0>)
torch.Size([2, 4, 4])


In [34]:
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)] )

class MultiHeadedAttention(nn.Module):
    def __init__(self, head, embedding_dim, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert embedding_dim % head == 0
        self.d_k = embedding_dim // head
        self.head = head
        self.embedding_dim = embedding_dim
        
        self.linears = clones(nn.Linear(embedding_dim, embedding_dim), 4)
        #初始化注意力张量
        self.attn = None
        
        self.dropout = nn.Dropout(p = dropout)
        
    def forward(self,query, key, value, mask = None):
        if mask is not None:
            mask = mask.unsqueeze(1)
            
        batch_size = query.size(0)
        
        query, key, value = \
            [model(x).view(batch_size, -1, self.head, self.d_k).transpose(1,2)
             for model , x in zip(self.linears, (query, key, value))]
        
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        
        x = x.transpose(1,2).contiguous().view(batch_size, -1, self.head * self.d_k)
        
        return self.linears[-1](x)
    


In [59]:
head = 8
embedding_dim = 512
dropout = 0.2

query = key = value = pe_result

mask = Variable(torch.zeros(2,4,4))

mha = MultiHeadedAttention(head, embedding_dim, dropout)


print("query shape:", query.shape)
print("key shape:", key.shape)
print("value shape:", value.shape)
print("mask shape:", mask.shape)
mha_result = mha(query, key, value, mask)
print(mha_result)
print(mha_result.shape)


query shape: torch.Size([2, 4, 512])
key shape: torch.Size([2, 4, 512])
value shape: torch.Size([2, 4, 512])
mask shape: torch.Size([2, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
tensor([[[ 5.5644,  1.5772, -2.9466,  ..., -4.7217,  5.2768,  7.6987],
         [ 2.1447,  0.6680, -5.1555,  ...,  1.5196,  4.4446,  8.2820],
         [-0.0343, -1.2638, -6.0603,  ...,  1.9803,  1.4331,  9.4950],
         [ 2.1467, -4.9469, -3.4508,  ..., -0.5114,  3.7474, 10.1934]],

        [[ 1.0332,  0.6051,  0.1534,  ..., -3.4456,  0.9186,  4.6134],
         [ 0.1549, -0.2857, -0.8081,  ..., -7.1809,  3.1867,  5.4611],
         [-0.9462, -2.5637, -1.4282,  ..., -2.1174,  0.5409,  6.4205],
         [-0.4082,  1.2229, -1.4550,  ..., -0.4331,  0.4411,  8.0422]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


In [60]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w1 = nn.Linear(d_model,d_ff)
        self.w2 = nn.Linear(d_ff,d_model)
        self.dropout = nn.Dropout(p=dropout)
    def forward(self, x):
        return self.w2(self.dropout(F.relu(self.w1(x))))
    
d_ff = 64
d_model = 512
dropout = 0.2

x = mha_result
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
ff_result = ff(x)
print(ff_result)
print(ff_result.shape)

tensor([[[ 0.0574,  0.5072, -2.4992,  ..., -0.3614,  1.7082, -0.5154],
         [-0.1217,  1.2613, -1.7827,  ..., -1.4855,  0.1240,  1.0478],
         [ 0.0790,  2.4579, -3.1313,  ..., -0.1956,  1.3494,  1.2364],
         [ 0.9925,  1.3909, -2.2709,  ..., -0.5404,  0.6839,  1.1610]],

        [[-1.3643,  2.4278, -2.5269,  ...,  0.0700, -0.9485, -1.7216],
         [ 0.8627,  0.9195, -1.8312,  ...,  0.5692,  0.0035,  0.0191],
         [ 0.1311,  1.8305, -0.9359,  ...,  1.3486,  0.2745, -1.1639],
         [-1.7927,  2.4690, -0.4020,  ...,  1.5674, -0.3094, -2.2710]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


前馈全连接层：再transform中前馈全连接层就是具有两层线性层的全连接网络

考虑注意力机制可能对复杂过程拟合程度不够，通过增加两层网络来增强模型的能力

relu激活函数

In [61]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout = 0.1):
        #d_model:代表词嵌入的维度，同时也是线性层的输入维度和输出维度
        #d_ff： 代表第一个线性层的输出维度，和第二个线性层的输入维度
        
        super(PositionwiseFeedForward, self).__init__()
        
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        return self.w2(self.dropout(F.relu(self.w1(x))))
    

d_model = 512
d_ff = 64
dropout=0.2

x = mha_result
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
ff_result = ff(x)


print(ff_result)
print(ff_result.shape)

tensor([[[ 8.5999e-02, -1.8583e+00,  6.2025e-03,  ..., -1.4832e+00,
           3.1085e-01,  2.5882e-01],
         [-1.8909e+00, -1.3611e+00,  1.1367e+00,  ..., -7.8875e-01,
           3.7964e-01,  2.7609e-01],
         [-2.0618e+00, -2.7013e+00,  1.0881e+00,  ..., -9.9657e-01,
           1.5406e+00, -2.7545e+00],
         [-2.6390e+00, -2.1137e+00,  1.7532e+00,  ..., -5.3152e-01,
           2.1448e+00, -3.7907e-01]],

        [[-2.3052e+00,  3.2439e-01, -2.0018e+00,  ..., -1.1056e+00,
           7.3149e-01,  3.5918e-01],
         [-2.4998e+00, -3.9661e-01, -4.2757e-01,  ..., -3.5952e-01,
           4.1522e-01, -4.5117e-01],
         [-1.4586e+00, -8.8199e-01,  5.2307e-02,  ..., -3.3271e-01,
          -6.1155e-01,  1.0115e-01],
         [-2.2082e+00, -9.2416e-02, -3.9354e-01,  ..., -2.2430e+00,
          -2.0458e-03, -1.4116e+00]]], grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


规范化层：通过多层计算后参数可能出现过大或过小，导致学习出现异常，模型收敛变慢，让特征值在合理范围内


In [62]:
class LayerNorm(nn.Module):
    def __init__(self, features, eps = 1e-6):
        #featrues代表词嵌入的维度
        super(LayerNorm, self).__init__()
        #nn.Parameter进行封装，代表他们也是模型中的参数
        self.a2 = nn.Parameter(torch.ones(features))
        self.b2 = nn.Parameter(torch.zeros(features))
        self.eps = eps
        
    def forward(self,x):
        #对x进行最后一个维度上求均值，维度保持一致
        #对x进行最后一个维度上求标准差，维度保持一致
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        # *是点乘
        return self.a2 * (x - mean)/(std + self.eps) + self.b2
    
    
featrues = d_model = 512
eps = 1e-6
x = ff_result
ln = LayerNorm(featrues, eps)
ln_result = ln(x)

print(ln_result)
print(ln_result.shape)

tensor([[[ 0.1034, -1.4535,  0.0395,  ..., -1.1531,  0.2835,  0.2418],
         [-1.2842, -0.9211,  0.7911,  ..., -0.5287,  0.2721,  0.2012],
         [-1.2299, -1.6172,  0.6782,  ..., -0.5846,  0.9523, -1.6495],
         [-1.5835, -1.2694,  1.0433,  ..., -0.3231,  1.2775, -0.2319]],

        [[-1.4821,  0.1790, -1.2904,  ..., -0.7243,  0.4362,  0.2010],
         [-2.0144, -0.2976, -0.3229,  ..., -0.2674,  0.3650, -0.3422],
         [-1.4055, -0.8767, -0.0200,  ..., -0.3731, -0.6288,  0.0248],
         [-1.4973, -0.0760, -0.2783,  ..., -1.5207, -0.0152, -0.9622]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


子层连接结构

残差连接（跳跃连接）

In [70]:
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout=0.1):
        super(SublayerConnection, self).__init__()
        #实例化规范化层的对象
        self.size = size
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)

    
    def forward(self,x, sublayer):
        #sublayer：该子层连接中子层函数
        #首先将x进行规范化，然后送入子层函数中处理，处理结果进入dropout层，最后进行残差链接  
        return x + self.dropout(sublayer(self.norm(x)))


size = d_model = 512
head = 8
dropout = 0.2

x = pe_result
mask = Variable(torch.zeros(2,4,4))
self_attn = MultiHeadedAttention(head, d_model)

sublayer = lambda x : self_attn(x, x, x, mask)
sc = SublayerConnection(size, dropout)
sc_result = sc(x, sublayer)

print(sc_result)
print(sc_result.shape)

64
scores torch.Size([2, 8, 4, 4])
tensor([[[ 1.4768e-01, -5.1178e+00,  8.7390e+00,  ...,  1.2379e+01,
           0.0000e+00,  2.6193e+01],
         [-4.0109e+01,  1.7407e-01, -1.7307e+01,  ...,  5.2211e-01,
           2.0397e+01,  5.4667e+00],
         [-2.9882e+01,  1.2204e-01, -2.3698e+01,  ..., -1.0835e+01,
           3.3145e+01, -1.4990e+01],
         [-1.9086e+01, -2.3130e+00,  1.9263e+00,  ...,  4.7760e+01,
          -4.6085e+00, -8.1126e-02]],

        [[-5.2772e+01,  1.2074e+01,  2.9862e+01,  ..., -9.5817e-01,
           3.9325e-03, -3.9711e+01],
         [-3.9526e+01,  3.9144e+01,  1.4076e+00,  ..., -9.2055e+00,
           1.1838e-01,  3.5946e-01],
         [ 4.4848e-01,  1.2614e+01, -1.6872e+01,  ...,  1.7651e-01,
           2.0259e+01,  5.7723e+00],
         [ 4.0136e-01,  3.7363e+01, -8.9716e+00,  ..., -1.5375e+01,
           5.1927e+01, -4.8413e+00]]], grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


编码器层作用：对输入的特征提取

In [73]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        #size:代表词嵌入维度
        #self_attn：多头自注意力子层的实例化对象
        #feed_forward：前馈全连接层实例化对象
        #dropout：质零比例
        super(EncoderLayer,self).__init__() 
        
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.size = size
        
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
    def forward(self, x, mask):
        # x：代表上一层的传入张量
        # mask：代表掩码张量
        #首先让x经过第一个子层连接结构，内部包含多头自注意力机制子层
        #再让x经过第二个子层连接结构，内部包含前馈全连接网络
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
    
    
size = d_model = 512
head = 8
d_ff = 64
x = pe_result
dropout = 0.2

self_attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
mask = Variable(torch.zeros(2,4,4))

el = EncoderLayer(size, self_attn, ff, dropout)
el_result = el(x,mask)
print(el_result)
print(el_result.shape)

64
scores torch.Size([2, 8, 4, 4])
tensor([[[  0.1339,  -4.8128,   9.4395,  ...,  12.0258,   0.3797,  26.4566],
         [-39.9285,   0.1643, -16.6818,  ...,   0.2159,  20.0679,   5.3704],
         [-30.0505,   0.1002, -23.3042,  ..., -10.6114,  32.6926, -14.9643],
         [-19.0091,  -2.2298,   2.0419,  ...,  47.8538,  -4.8858,  -0.1000]],

        [[-52.5995,  12.1280,  30.3564,  ...,  -1.2975,   0.3369, -40.3583],
         [-39.6446,  39.4865,   1.5508,  ...,  -9.5648,   0.2210,   0.0864],
         [ -0.1130,  12.0464, -16.6524,  ...,  -0.5318,  20.1564,   5.0981],
         [  0.3294,  36.7571,  -8.8828,  ..., -15.9101,  51.9819,  -5.4290]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


编码器：由N个编码器层堆叠而成

In [74]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        #layer:代表编码器层
        #N：代表编码器中有几个layer
        
        #首先使用clones函数克隆N个编码器层放置在self.layers中
        self.layers = clones(layer, N)
        #规范化层
        self.norm = LayerNorm(layer.size)
    def forward(self, x, mask):
        
        #让x依次经历N个编码器层的处理，最后再经过规范化层就可以输出了
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
    
size = d_model = 512
head = 8
d_ff = 64
c = copy.deepcopy
x = pe_result
dropout = 0.2

attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
layer = EncoderLayer(size, c(attn), c(ff), dropout)

N = 8

mask = Variable(torch.zeros(2,4,4))

en = Encoder(layer, N)
en_result = en(x,mask)
print(en_result)
print(en_result.shape)

64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
tensor([[[ 0.0485, -0.3770,  0.3840,  ...,  0.4731,  0.1529,  0.8842],
         [-1.5488,  0.0735, -0.5730,  ...,  0.0372,  0.8259,  0.2583],
         [-1.1485, -0.1273, -1.0775,  ..., -0.4996,  1.3942, -0.7420],
         [-0.7062, -0.2043,  0.2021,  ...,  1.7537, -0.1897, -0.0718]],

        [[-1.9288,  0.4005,  1.3065,  ..., -0.1753,  0.0040, -1.5410],
         [-1.5767,  1.4986,  0.0275,  ..., -0.5608,  0.0090,  0.0639],
         [ 0.0993,  0.5305, -0.5738,  ..., -0.0777,  0.7131,  0.2292],
         [ 0.1862,  1.4317, -0.2575,  ..., -0.5594,  2.0354, -0.1278]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


解码器：每个解码器层根据给定的输入向目标方向进行特征提取

In [76]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        #size：词嵌入维度
        #self_attn：多头自注意力机制实例化
        #src_attn：常规注意力机制实例化对象
        #feed_forward：前馈全连接层
        super(DecoderLayer, self).__init__()
        #将参数传入类中
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.dropout = dropout
        
        #根据解码器层的结构，需要clones函数克隆3个子层连接对象
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
    def forward(self, x, memory, source_mask, target_mask):
        #x：上一层输出
        #memory：编码器的语义存储张量
        #source_mask：源数据的掩码张量
        #target_mask：目标数据的掩码张量
        m = memory
        #第一步让x经历第一个子层，多头自注意力机制的子层
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, target_mask))
        
        #第二步让x经历第二个子层，常规的注意力机制的子层，Q!=K=V
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, target_mask))
        
        #第三步让x经历第三个子层，前馈全连接层
        x = self.sublayer[2](x, self.feed_forward)
        return x
        
size = d_model = 512
head = 8
d_ff = 64
c = copy.deepcopy
x = pe_result
dropout = 0.2
memory = en_result



self_attn = src_attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
layer = EncoderLayer(size, c(attn), c(ff), dropout)

N = 8

mask = Variable(torch.zeros(2,4,4))
source_mask = target_mask = mask

dl = DecoderLayer(size, self_attn, src_attn, ff, dropout)
dl_result = dl(x, memory, source_mask, target_mask)
print(dl_result)
print(dl_result.shape)

64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
tensor([[[-4.5474e-01, -4.9912e+00,  9.1455e+00,  ...,  1.2497e+01,
          -2.2109e-02,  2.5969e+01],
         [-4.0691e+01, -1.0692e-01, -1.7052e+01,  ...,  4.6210e-01,
           2.0132e+01,  5.5156e+00],
         [-3.0662e+01, -1.1548e-02, -2.3714e+01,  ..., -1.0663e+01,
           3.3049e+01, -1.5583e+01],
         [-1.9982e+01, -2.0805e+00,  1.9832e+00,  ...,  4.7732e+01,
          -4.9690e+00, -1.3161e-01]],

        [[-5.2741e+01,  1.1251e+01,  2.9056e+01,  ..., -2.4342e-01,
           3.5467e-01, -4.0090e+01],
         [-3.9707e+01,  3.9034e+01,  1.0170e+00,  ..., -8.7006e+00,
          -4.6611e-01,  6.5476e-02],
         [-3.0150e-01,  1.2530e+01, -1.7372e+01,  ...,  8.1796e-01,
           2.0792e+01,  6.1759e+00],
         [ 2.4632e-02,  3.6860e+01, -9.1811e+00,  ..., -1.5065e+01,
           5.1701e+01, -5.0168e+00]]], grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


编码器：根据编码器的结果以及上一次预测的结果，对下一次可能出现的值进行特征表示

In [79]:
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
    
    def forward(self, x, memory, source_mask, target_mask):
        for layer in self.layers:
            x = layer(x, memory, source_mask, target_mask)
        return self.norm(x)
    
size = d_model = 512
head = 8
d_ff = 64
c = copy.deepcopy
x = pe_result
dropout = 0.2
memory = en_result



attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
layer = DecoderLayer(size, c(attn), c(attn), c(ff), dropout)

N = 8

mask = Variable(torch.zeros(2,4,4))
source_mask = target_mask = mask

de = Decoder(layer, N)
de_result = de(x, memory, source_mask, target_mask)
print(de_result)
print(de_result.shape)

64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
tensor([[[-0.0124, -0.2119,  0.3868,  ...,  0.5012,  0.0651,  0.8903],
         [-1.7254,  0.1375, -0.6774,  ...,  0.1407,  0.9314,  0.0275],
         [-1.2846, -0.0292, -1.0038,  ..., -0.2769,  1.3334, -0.9056],
         [-0.7423, -0.2329,  0.0095,  ...,  1.9385, -0.2565, -0.1007]],

        [[-1.8028,  0.4237,  1.1409,  ..., -0.0633,  0.2591, -1.5145],
         [-1.3532,  1.2917,  0.0211,  ..., -0.3772,  0.1087,  0.0908],
         [ 0

线性层和softmax

线性层：通过对上一步的线性变化，得到指定的维度输出

In [80]:
#functional 装载了网络层中那些只进行计算，而没有参数的层
import torch.nn.functional as F


class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Generator, self).__init__()
        self.project = nn.Linear(d_model, vocab_size)
    def forward(self, x):
        return F.log_softmax(self.project(x), dim = -1)
    
d_model = 512
vocab_size = 1000
x = pe_result

gen = Generator(d_model, vocab_size)
gen_result = de(x, memory, source_mask, target_mask)
print(gen_result)
print(gen_result.shape)

64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
tensor([[[ 0.0039, -0.2045,  0.3771,  ...,  0.4905,  0.0487,  0.8767],
         [-1.6801,  0.1297, -0.6928,  ...,  0.1438,  0.9744,  0.0816],
         [-1.2891, -0.0220, -1.0026,  ..., -0.3274,  1.3494, -0.8548],
         [-0.7581, -0.1939,  0.0179,  ...,  1.9780, -0.2731, -0.1232]],

        [[-1.8163,  0.3787,  1.1436,  ..., -0.0278,  0.2496, -1.5390],
         [-1.3462,  1.3552,  0.0358,  ..., -0.3515,  0.1283,  0.0743],
         [ 0

In [85]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = source_embed
        self.tgt_embed = target_embed
        self.generator = generator
    def forward(self, source, target, source_mask, target_mask):
        return self.decode(self.encode(source, source_mask), source_mask, target, target_mask)
    
    def encode(self, source, source_mask):
        return self.encoder(self.src_embed(source), source_mask)
    
    def decode(self, memory, source_mask, target, target_mask):
        return self.decoder(self.tgt_embed(target), memory, source_mask, target_mask)
    
vocab_size = 1000
d_model = 512
encoder = en
decoder = de
source_embed = nn.Embedding(vocab_size, d_model)
target_embed = nn.Embedding(vocab_size, d_model)
generator = gen

source = target = Variable(torch.LongTensor([[10,3,124,244],[33,222,5,11]]))

source_mask = target_mask = Variable(torch.zeros(2,4,4))

ed = EncoderDecoder(encoder, decoder, source_embed, target_embed, generator)
ed_result = ed(source, target, source_mask, target_mask)
print(ed_result)
print(ed_result.shape)

64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
64
scores torch.Size([2, 8, 4, 4])
tensor([[[-1.7757e-01,  4.4075e-01, -4.3779e-02,  ...,  9.9333e-01,
          -6.8268e-01, -4.7330e-01],
         [-1.7906e-01, -6.4976e-01,  8.7825e-01,  ..., 

make_model函数

In [89]:
def make_model(source_vocab, target_vocab, N=6, d_model=512, d_ff=2048, head=8, droupout=0.2):
    #source_vocab:代表源数据的词汇总数
    #target_vocab：代表目标数据的词汇总数
    #N：代表编码器和解码器堆叠的层数
    #d_model：词嵌入的维度
    #d_ff：前馈全连接层中变换矩阵的维度
    #head：头数
    c = copy.deepcopy
    
    attn = MultiHeadedAttention(head, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    
    #实例化模型model，利用EncoderDecoder类
    #编码器结构两个子层，attention和前馈全连接
    #解码器结构三个子层，两个attention和前馈全连接
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embedding(d_model, source_vocab), c(position)),
        nn.Sequential(Embedding(d_model, target_vocab), c(position)),
        Generator(d_model, target_vocab)
    )
    
    #初始化模型参数，维度大于1，将矩阵初始化成一个服从均匀分布的矩阵
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
        
    return model
    
source_vocab = 11
target_vacab = 11
N=6

if __name__=='__main__':
    res = make_model(source_vocab, target_vacab, N)
    print(res)

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w1): Linear(in_features=512, out_features=2048, bias=True)
          (w2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (sublayer): ModuleList(
          (0): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (1): SublayerConnection(
            (norm): LayerNorm()
 

COPY任务：帮助我们断定模型多有过程是否正常，是否具有基本学习能力

第一步：构建数据集生成器
第二步：模型及其优化器
第三步：训练评估
第四步：使用模型进行贪婪解码

In [90]:
#from pyitcast.transformer_utils import Batch

def data_generator(V, batch, num_batch):
    for i in range(num_batch):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        data[:, 0] = 1
        
        source= Variable(data, requires_grad = False)
        target= Variable(data, requires_grad = False)
        
        yield Batch(source, target)

SyntaxError: unexpected EOF while parsing (2147841568.py, line 3)

模型
第一步：导包
第二步：导入数据集
第三步：构建模型输入的批次数据
第四步：构建训练和评估函数
第五步：训练评估



In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torchtext.data.utils import get_tokenizer

from pyitcast.transformer import TransformerModel
