In [375]:
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data


In [376]:
sentences = [
    ['ich mochte ein bier P','S i want a beer .','i want a beer . E'],
    ['ich mochte ein cola P','S i want a coke . ','i want a coke . E']
]
src_vocab = {'P':0,'ich':1,'mochte':2,'ein':3,'bier':4,'cola':5}
src_vocab_size = len(src_vocab)
tgt_vocab = {'P':0,'i':1,'want':2,'a':3,'beer':4,'coke':5,'S':6,'E':7,'.':8}
idx2word = {i:w for i,w in enumerate(tgt_vocab)}
tgt_vocab_size = len(tgt_vocab)
print(idx2word)
print(tgt_vocab_size)

{0: 'P', 1: 'i', 2: 'want', 3: 'a', 4: 'beer', 5: 'coke', 6: 'S', 7: 'E', 8: '.'}
9


In [377]:
src_len = 5 #enc_input max sequence length
tgt_len = 6 #dec_input(=dec_output) max sequence length
print(sentences)

[['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'], ['ich mochte ein cola P', 'S i want a coke . ', 'i want a coke . E']]


In [378]:
def make_data(sentences):
    enc_inputs,dec_inputs,dec_outputs = [],[],[]
    enc_input,dec_input,dec_output = [],[],[]
    for i in range(len(sentences)):
        enc_input = [[src_vocab[n] for n in sentences[i][0].split()]] # [[1, 2, 3, 4, 0], [1, 2, 3, 5, 0]]
        dec_input = [[tgt_vocab[n] for n in sentences[i][1].split()]] # [[6, 1, 2, 3, 4, 8], [6, 1, 2, 3, 5, 8]]
        dec_output = [[tgt_vocab[n] for n in sentences[i][2].split()]] # [[1, 2, 3, 4, 8, 7], [1, 2, 3, 5, 8, 7]]
      
        enc_inputs.extend(enc_input)
        dec_inputs.extend(dec_input)
        dec_outputs.extend(dec_output)

    return torch.LongTensor(enc_inputs),torch.LongTensor(dec_inputs),torch.LongTensor(dec_outputs)

In [379]:
make_data(sentences)

(tensor([[1, 2, 3, 4, 0],
         [1, 2, 3, 5, 0]]),
 tensor([[6, 1, 2, 3, 4, 8],
         [6, 1, 2, 3, 5, 8]]),
 tensor([[1, 2, 3, 4, 8, 7],
         [1, 2, 3, 5, 8, 7]]))

In [380]:
enc_inputs,dec_inputs,dec_outputs = make_data(sentences)

In [381]:
class MyDataSet(Data.Dataset):
    def __init__(self,enc_inputs,dec_inputs,dec_outputs):
        super(MyDataSet,self).__init__()
        self.enc_inputs = enc_inputs
        self.dec_inputs = dec_inputs
        self.dec_outputs = dec_outputs
    def  __len__(self):
        return self.enc_inputs.shape[0]
    def __getitem__(self,idx):
        return self.enc_inputs[idx],self.dec_inputs[idx],self.dec_outputs[idx]
    

In [382]:
loader = Data.DataLoader(MyDataSet(enc_inputs,dec_inputs,dec_outputs),2,True)
for batch_data in loader:
    print(batch_data)

[tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 5, 0]]), tensor([[6, 1, 2, 3, 4, 8],
        [6, 1, 2, 3, 5, 8]]), tensor([[1, 2, 3, 4, 8, 7],
        [1, 2, 3, 5, 8, 7]])]


In [383]:
d_model = 512
d_ff = 2048
d_k =d_v =64
n_layers = 6
n_heads = 8

In [384]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model,dropout=0.1,max_len=5000):
        super(PositionalEncoding,self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len,d_model)
        
        
        position = torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
       
        
        div_term = torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000.0)/d_model))
       
        pe[:,0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)
        
        pe = pe.unsqueeze(0).transpose(0,1)
        
        self.register_buffer('pe',pe)
    def forward(self,x):
        x = x + self.pe[:x.size(0),:]
        return self.dropout(x)
    

In [385]:
        pe = torch.zeros(10,512)
        print(pe.size())
        position = torch.arange(0,10,dtype=torch.float).unsqueeze(1)
        print(position.size())
        div_term = torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000.0)/512))
        print(div_term.size())
        pe[:,0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)
        print(pe.size())#广播机制
        pe = pe.unsqueeze(0).transpose(0,1)
        print(pe.size())

torch.Size([10, 512])
torch.Size([10, 1])
torch.Size([256])
torch.Size([10, 512])
torch.Size([10, 1, 512])


In [386]:
def get_attn_pad_mask(seq_q,seq_k):
    """
    seq_q:[batch_size,seq_len]
    seq_k:[batch_size,seq_len]
    seq_len:could be src_len or tgt_len
    """
    batch_size,len_q = seq_q.size(0),seq_q.size(1)
    batch_size,len_k = seq_k.size(0),seq_k.size(1)
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)
    return pad_attn_mask.expand(batch_size,len_q,len_k)

In [387]:
data = torch.randn(2,3)
y = data.data.eq(0).unsqueeze(1)
print(y)

tensor([[[False, False, False]],

        [[False, False, False]]])


In [388]:
def get_attn_subsequence_mask(seq):
    """
    seq:[batch_size,tgt_len]
    """
    attn_shape = [seq.size(0),seq.size(1),seq.size(1)]
    subsequence_mask = np.triu(np.ones(attn_shape),k=1)
    subsequence_mask = torch.from_numpy(subsequence_mask).byte()#?bool()
    return subsequence_mask

In [389]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention,self).__init__()
    def forward(self,Q,K,V,attn_mask):
        """
        Q:[batch_size,n_heads,len_q,d_k]
        K:[batch_size,n_heads,len_k,d_k]
        V:[batch_size,n_heads,len_v(=len_k),d_v]
        attn_mask:[batch_size,n_heads,seq_len,seq_len]
        """
        scores = torch.matmul(Q,K.transpose(-1,-2))/np.sqrt(d_k)
        #scores:[batch_size,n_heads,len_q,len_k]
        scores.masked_fill_(attn_mask,-1e9)
        #Fill elements of scores with 1e-9 where mask is true.
        attn = torch.softmax(scores,dim=-1)
        context = torch.matmul(attn,V)
        #context:[batch_size,n_heads,len_q,d_v]
        return context,attn



In [390]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention,self).__init__()
        self.W_Q = nn.Linear(d_model,d_k*n_heads,bias=False)
        self.W_K = nn.Linear(d_model,d_k*n_heads,bias=False)
        self.W_V = nn.Linear(d_model,d_v*n_heads,bias=False)
        self.fc = nn.Linear(n_heads*d_v,d_model,bias=False)
    def forward(self,input_Q,input_K,input_V,attn_mask):
        """
        input_Q:[batch_size,len_q,d_model]
        input_K:[batch_size,len_k,d_model]
        input_V:[batch_size,len_v(=len_k),d_model]
        attn_mask:[batch_size,n_heads,seq_len,seq_len]
        """
        residual,batch_size = input_Q,input_Q.size(0)
        
        Q = self.W_Q(input_Q).view(batch_size,-1,n_heads,d_k).transpose(1,2)
        K = self.W_K(input_K).view(batch_size,-1,n_heads,d_k).transpose(1,2)
        V = self.W_V(input_V).view(batch_size,-1,n_heads,d_v).transpose(1,2)
        #d_k(=d_q)?
        attn_mask = attn_mask.unsqueeze(1).repeat(1,n_heads,1,1)
        #attn_mask:[batch_size,n_heads,len_q,len_k] 第一个维度复制n_heads次
        context,attn = ScaledDotProductAttention()(Q,K,V,attn_mask)
        context = context.transpose(1,2).reshape(batch_size,-1,n_heads*d_v)
        output = self.fc(context)
        
        return nn.LayerNorm(d_model)(output+residual),attn

In [391]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet,self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model,d_ff,bias=False),
            nn.ReLU(),
            nn.Linear(d_ff,d_model,bias=False)
        )
    def forward(self,inputs):
        """
        inputs:[batch_size,seq_len,d_model]
        """
        residual = inputs
        output = self.fc(inputs)
        return nn.LayerNorm(d_model)(output+residual)
        #[batch_size,seq_len,d_model]

In [392]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer,self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()
        
    def forward(self,enc_inputs,enc_self_attn_mask):
        """
        enc_inputs:[batch_size,src_len,d_model]
        enc_self_attn_mask:[batch_size,src_len,src_len]
        """
        enc_outputs,attn = self.enc_self_attn(enc_inputs,enc_inputs,enc_inputs,enc_self_attn_mask)
        #enc_outputs:Z[batch_size,src_len,d_model]
        #attn:softmax之后的[batch_size,src_len,src_len]
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs,attn

In [393]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder,self).__init__()
        self.src_emb = nn.Embedding(src_vocab_size,d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
    
    def forward(self,enc_inputs):
        """
        enc_inputs:[batch_size,src_Len]
        """
        enc_outputs = self.src_emb(enc_inputs)
        enc_outputs = self.pos_emb(enc_outputs.transpose(0,1)).transpose(0,1)
        #enc_outputs:[batch_size,src_len,d_model]
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs,enc_inputs)
        #[batch_size,src_len,src_len]
        enc_self_attns = []
        for layer in self.layers:
            enc_outputs,enc_self_attn = layer(enc_outputs,enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs,enc_self_attns
            

In [394]:
#test
x = torch.tensor([[1, 3, 4], [0, 2, 1]])  # 例子中的整数索引
print(x)
y = nn.Embedding(10, 5)
output = y(x)
print(output)


tensor([[1, 3, 4],
        [0, 2, 1]])
tensor([[[-0.2891,  2.4665,  0.1586,  0.5661,  0.6591],
         [ 0.3609, -0.1599,  1.4397, -0.5303,  0.0638],
         [ 0.3249, -0.2855, -0.8015,  0.2443,  0.4326]],

        [[ 0.4750, -0.2368,  0.6460,  0.8659, -1.3432],
         [-0.6435,  0.5105, -0.6344, -1.0465,  1.4007],
         [-0.2891,  2.4665,  0.1586,  0.5661,  0.6591]]],
       grad_fn=<EmbeddingBackward0>)


In [395]:
class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer,self).__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()
    
    def forward(self,dec_inputs,enc_outputs,dec_self_attn_mask,dec_enc_attn_mask):
        """
        dec_inputs:[batch_size,tgt_len,d_model]
        enc_outputs:[batch_size,src_len,d_model]
        dec_self_attn_mask:[batch_size,tgt_len(Q),tgt_len(K)]
        dec_enc_attn_mask:[batch_size,tgt_len(Q),src_len(K)]
        """
        dec_outputs,dec_self_attn = self.dec_self_attn(dec_inputs,dec_inputs,dec_inputs,dec_self_attn_mask)
        dec_outputs,dec_enc_attn = self.dec_enc_attn(dec_outputs,enc_outputs,enc_outputs,dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        
        return dec_outputs,dec_self_attn,dec_enc_attn

In [396]:
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder,self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size,d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
    
    def forward(self,dec_inputs,enc_inputs,enc_outputs):
        """
        dec_inputs:[batch_size,tgt_len]
        enc_inputs:[batch_size,src_len]
        enc_outputs:[batch_size,src_len,d_model]
        """
        dec_outputs = self.tgt_emb(dec_inputs)#[batch_size,tgt_len,d_model]
        dec_outputs = self.pos_emb(dec_outputs.transpose(0,1)).\
                      transpose(0,1) #[batch_size,tgt_len,d_model]
        
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs,dec_inputs)#[batch_size,tgt_len,tgt_len]
        dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs)#[batch_size,tgt_len,tgt_len]
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask+dec_self_attn_subsequence_mask),0)
        
        dec_enc_attn_mask = get_attn_pad_mask(dec_outputs,enc_inputs)#(Q,K)[batch_size,tgt_len,src_len]
        
        dec_self_attns,dec_enc_attns = [],[]
        for layer in self.layers:
            dec_outputs,dec_self_attn,dec_enc_attn = layer(dec_outputs,enc_outputs,dec_self_attn_mask,dec_enc_attn_mask)
            #dec_outputs:[batch_size,tgt_len,d_model]
            #dec_self_attn:[batch_size,n_heads,tgt_len,tgt_len]
            #dec_enc_attn:[batch_size,tgt_len,src_len]
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
            #[torch.Size([batch_size,tgt_len,src_len]),torch.Size([batch_size,tgt_len,src_len]),...]
        return dec_outputs,dec_self_attns,dec_enc_attns   

In [397]:
#test
dec_inputs = torch.tensor([[0,1,2],[2,4,0]])
dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs,dec_inputs)
print(dec_self_attn_pad_mask)
dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs)
print(dec_self_attn_subsequence_mask)
dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask+dec_self_attn_subsequence_mask),0)
print(dec_self_attn_mask)

tensor([[[ True, False, False],
         [ True, False, False],
         [ True, False, False]],

        [[False, False,  True],
         [False, False,  True],
         [False, False,  True]]])
tensor([[[0, 1, 1],
         [0, 0, 1],
         [0, 0, 0]],

        [[0, 1, 1],
         [0, 0, 1],
         [0, 0, 0]]], dtype=torch.uint8)
tensor([[[ True,  True,  True],
         [ True, False,  True],
         [ True, False, False]],

        [[False,  True,  True],
         [False, False,  True],
         [False, False,  True]]])


In [398]:
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer,self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.projection = nn.Linear(d_model,tgt_vocab_size,bias=False)
        
    def forward(self,enc_inputs,dec_inputs):
        """
        enc_inputs:[batch_size,src_len]
        dec_inputs:[batch_size,tgt_len]
        """
        
        enc_outputs,enc_self_attns = self.encoder(enc_inputs)
        
        dec_outputs,dec_self_attns,dec_enc_attns = self.decoder(dec_inputs,enc_inputs,enc_outputs)
        
        dec_logits = self.projection(dec_outputs)#[batch_size,tgt_len,tgt_vocab_size]
        
        return dec_logits.view(-1,dec_logits.size(-1)),enc_self_attns,dec_self_attns,dec_enc_attns
        #dec_logits:[batch_size*tgt_len,tgt_vocab_size]


In [399]:
model = Transformer()
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(model.parameters(),lr=1e-3,momentum=0.99)

In [400]:
#test 参数形式  w1 b1 w2 b2
# 定义一个简单的神经网络模型  
class MyModel(nn.Module):  
    def __init__(self):  
        super(MyModel, self).__init__()  
        self.fc1 = nn.Linear(2, 4)  
        self.fc2 = nn.Linear(4, 1)  
  
    def forward(self, x):  
        x = self.fc1(x)  
        x = self.fc2(x)  
        return x  
mode = MyModel()  
# 获取模型的所有参数  
pa = mode.parameters()  
for p in pa:  
    print(p)

Parameter containing:
tensor([[-0.2488, -0.0254],
        [-0.5280,  0.5660],
        [-0.5183, -0.3582],
        [ 0.5053,  0.6608]], requires_grad=True)
Parameter containing:
tensor([ 0.1402, -0.2419, -0.1679,  0.2542], requires_grad=True)
Parameter containing:
tensor([[0.4977, 0.3949, 0.2427, 0.1168]], requires_grad=True)
Parameter containing:
tensor([-0.1020], requires_grad=True)


In [401]:
for epoch in range(30):
    for enc_inputs,dec_inputs,dec_outputs in loader:
        """
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, tgt_len]
        dec_outputs: [batch_size, tgt_len]
        """
        print(enc_inputs.size())
        print(dec_inputs.size())
        print(dec_outputs.size())
        outputs,enc_self_attns,dec_self_attns,dec_enc_attns = model(enc_inputs,dec_inputs)
        #outputs:[batch_size*tgt_len,tgt_vocab_size]
        print(outputs.size())
        loss = criterion(outputs,dec_outputs.view(-1))
        print('Epoch:','%04d'%(epoch+1),'loss = ','{:.6f}'.format(loss))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0001 loss =  2.221862
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0002 loss =  2.120991
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0003 loss =  1.910098
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0004 loss =  1.636849
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0005 loss =  1.429715
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0006 loss =  1.165649
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0007 loss =  0.979901
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0008 loss =  0.812619
torch.Size([2, 5])
torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([12, 9])
Epoch: 0009 loss =  0.678215
torch.Size([2, 5])
torch.Size([2, 6])
torch.Si

In [405]:
def greedy_decoder(model, enc_input, start_symbol):
    enc_outputs, enc_self_attns = model.encoder(enc_input)
    dec_input = torch.zeros(1, 0).type_as(enc_input.data)
    terminal = False
    next_symbol = start_symbol
    while not terminal:         
        dec_input = torch.cat([dec_input.detach(),torch.tensor([[next_symbol]],dtype=enc_input.dtype)],-1)
        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
        projected = model.projection(dec_outputs)
        prob = projected.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[-1]
        next_symbol = next_word
        if next_symbol == tgt_vocab["."]:
            terminal = True
        print(next_word)            
    return dec_input

# Test
enc_inputs, _, _ = next(iter(loader))

for i in range(len(enc_inputs)):
    greedy_dec_input = greedy_decoder(model, enc_inputs[i].view(1, -1), start_symbol=tgt_vocab["S"])
    predict, _, _, _ = model(enc_inputs[i].view(1, -1), greedy_dec_input)
    predict = predict.data.max(1, keepdim=True)[1]
    print(enc_inputs[i], '->', [idx2word[n.item()] for n in predict.squeeze()])

tensor(1)
tensor(2)
tensor(3)
tensor(5)
tensor(8)
tensor([1, 2, 3, 5, 0]) -> ['i', 'want', 'a', 'coke', '.']
tensor(1)
tensor(2)
tensor(3)
tensor(4)
tensor(8)
tensor([1, 2, 3, 4, 0]) -> ['i', 'want', 'a', 'beer', '.']


In [406]:
import os
print(os.path.abspath('.'))

C:\Users\cxy\z_Demo
