In [19]:
import collections
import math
import torch
from torch import nn,optim

In [20]:
class Encoder(nn.Module):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
    def forward(self,X,*args):
        raise NotImplementedError

In [21]:
class Decoder(nn.Module):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
    
    def init_state(self,enc_outputs,*args):
        raise NotImplementedError
    
    def forward(self,X,state):
        raise NotImplementedError
            

In [22]:
class Seq2SeqEncoder(Encoder):
    def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,dropout=0,**kwargs):
        super().__init__(**kwargs)
        self.embedding=nn.Embedding(vocab_size,embed_size)
        self.rnn=nn.GRU(embed_size,num_hiddens,num_layers,dropout=dropout)
    
    def forward(self,X,*args):
#         (batch,step,embed)
        X=self.embedding(X)
#         (step,batch,embed)    
        X=X.permute(1,0,2)
#         output:(step,batch,hidden)
#         state :(layers,batch,hidden)
        output,state=self.rnn(X)
        return output,state        

In [23]:
encoder=Seq2SeqEncoder(vocab_size=10,embed_size=8,num_hiddens=16,num_layers=2)
encoder.eval()

Seq2SeqEncoder(
  (embedding): Embedding(10, 8)
  (rnn): GRU(8, 16, num_layers=2)
)

In [24]:
X=torch.zeros((4,7,),dtype=torch.long)
output,state=encoder(X)
output.shape,state.shape

(torch.Size([7, 4, 16]), torch.Size([2, 4, 16]))

In [26]:
class Seq2SeqDecoder(Decoder):
    def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,dropout=0,**kwargs):
        super().__init__(**kwargs)
        self.embedding=nn.Embedding(vocab_size,embed_size)
        self.rnn=nn.GRU(embed_size+num_hiddens,num_hiddens,num_layers,dropout=dropout)
        self.dense=nn.Linear(num_hiddens,vocab_size)
    
    def init_state(self,enc_outputs,*args):
        return enc_outputs[1]
    
    def forward(self,X,state):
        X=self.embedding(X).permute(1,0,2)
#         上下文和上一步状态相关
        context=state[-1].repeat(X.shape[0],1,1)
#         输入需要X,上下文
        X_and_context=torch.cat((X,context),2)
        output,statee=self.rnn(X_and_context,state)
#         output经过两次permute,shape和X最开始一样了
        output=self.dense(output).permute(1,0,2)
        return output,state
        

In [27]:
decoder=Seq2SeqDecoder(vocab_size=10,embed_size=8,num_hiddens=16,num_layers=2)
decoder.eval()

Seq2SeqDecoder(
  (embedding): Embedding(10, 8)
  (rnn): GRU(24, 16, num_layers=2)
  (dense): Linear(in_features=16, out_features=10, bias=True)
)

In [28]:
state=decoder.init_state(encoder(X))

In [29]:
output,state=decoder(X,state)
output.shape,state.shape

(torch.Size([4, 7, 10]), torch.Size([2, 4, 16]))

In [30]:
def sequence_mask(X,valid_len,value=0):
    maxlen=X.size(1)
    mask=torch.arange((maxlen),dtype=torch.float32,device=X.device)[None,:]<valid_len[:,None]
    X[~mask]=value
    return X

In [31]:
X=torch.tensor([[1,2,3],[4,5,6]])

In [32]:
X[None,:].shape

torch.Size([1, 2, 3])

In [33]:
sequence_mask(X,torch.tensor([1,2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [34]:
a=torch.arange(3)
b=torch.tensor([1,2,3])

In [35]:
a,b

(tensor([0, 1, 2]), tensor([1, 2, 3]))

In [36]:
a[None,:]

tensor([[0, 1, 2]])

In [37]:
a.unsqueeze(1)

tensor([[0],
        [1],
        [2]])

In [38]:
b[:,None]

tensor([[1],
        [2],
        [3]])

In [39]:
a[None,:]<b[:,None]

tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])

In [21]:
a.unsqueeze(0)<b.unsqueeze(1)

tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])

In [40]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    def forward(self,pred,label,valid_len):
        weights=torch.ones_like(label)
        weights=sequence_mask(weights,valid_len)
        self.reduction='none'
        unweighted_loss=super().forward(pred.permute(0,2,1),label)
        weighted_loss=(unweighted_loss*weights).mean(dim=1)
        return weighted_loss

In [41]:
loss=MaskedSoftmaxCELoss()
loss(torch.ones(3,4,10),torch.ones((3,4),dtype=torch.long),torch.tensor([4,2,0]))

tensor([2.3026, 1.1513, 0.0000])

In [42]:
def grad_clipping(net,theta):
    if isinstance(net,nn.Module):
        params=[p for p in net.parameters() if p.requires_grad]
    else:
        params=net.params
    norm=torch.sqrt(sum(torch.sum((p.grad**2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta/norm
    

In [43]:
def train_seq2seq(net,data_iter,lr,num_epochs,tgt_vocab,device):
    def xavier_init_weights(m):
        if type(m) ==nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m)==nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])
    
    net.apply(xavier_init_weights)
    net.to(device)
    optimizer=torch.optim.Adam(net.parameters(),lr=lr)
    loss=MaskedSoftmaxCELoss()
    net.train()
    train_l,train_num=0,0
    for epoch in range(num_epochs):
        for batch in data_iter:
            optimizer.zero_grad()
            X,X_valid_len,Y,Y_valid_len=[x.to(device) for x in batch]
            bos=torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],device=device).reshape(-1,1)
            dec_input=torch.cat([bos,Y[:,:-1]],1)
            Y_hat,_=net(X,dec_input,X_valid_len)
            l=loss(Y_hat,Y,Y_valid_len)
            l.sum().backward()
            grad_clipping(net,1)
            num_tokens=Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                train_l+=l.sum()
                train_num+=num_tokens
        if(epoch+1)%10==0:
            print("epoch : ",epoch+1," train loss : ",(train_l.item()/train_num.item()))
        

In [44]:
class Vocab:
    def __init__(self,tokens=None,min_freq=0,reserved_tokens=None):
        if tokens is None:
            tokens=[]
        if reserved_tokens is None:
            reserved_tokens=[]
        counter=count_corpus(tokens)
        self._token_freqs=sorted(counter.items(),key=lambda x:x[1],reverse=True)
        self.idx_to_token=['<unk>']+reserved_tokens
        self.token_to_idx={token:idx for idx,token in enumerate(self.idx_to_token)}
        self.idx_to_token,self.token_to_idx=[],dict()
        for token,freq in self._token_freqs:
            if freq<min_freq:
                break;
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token]=len(self.idx_to_token)-1
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self,indices):
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
    
    @property
    def unk(self):
        return 0;
    
    @property
    def token_freqs(self):
        return self._token_freqs;
    

In [62]:
def count_corpus(tokens):
    if len(tokens)==0 or isinstance(tokens[0],list):
        tokens=[token for line in tokens for token in line]
    return collections.Counter(tokens)

In [63]:
def preprocess_nmt(text):
    def no_space(char,prev_char):
        return char in set(',.!?') and prev_char !=' '
    
    text=text.replace('\u202f', ' ').replace('\xa0',' ').lower()
    out=[' '+char if i>0 and no_space(char,text[i-1]) else char for i,char in enumerate(text) ]
    return ''.join(out)

In [64]:
a="abd,aaa,bbb"
b=a.split(',')
b

['abd', 'aaa', 'bbb']

In [65]:
def tokenize_nmt(text,num_examples=None):
    source,target=[],[]
    for i ,line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts=line.split('\t')
        if len(parts)==2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source,target 
    

        

In [66]:
def build_array_nmt(lines, vocab, num_steps):
#     """将机器翻译的⽂本序列转换成⼩批量"""
    lines = [vocab[l] for l in lines]
#     print("lines : ",lines)
    lines = [l + [vocab['<eos>']] for l in lines]
    array = torch.tensor([truncate_pad(l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
    return array, valid_len

In [67]:
a=torch.tensor([1,2,3])
b=torch.tensor([0])
(a!=b)

tensor([True, True, True])

In [68]:
(a!=b).type(torch.int32)

tensor([1, 1, 1], dtype=torch.int32)

In [69]:
(a!=b).type(torch.int32).sum()

tensor(3)

In [70]:
(a!=b).type(torch.int32).sum(1)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [71]:
(a!=b).type(torch.int32).sum(0)

tensor(3)

In [72]:
def load_data_nmt(batch_size, num_steps, num_examples=600):
#     """返回翻译数据集的迭代器和词表"""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
#     print("source : ",source)
#     print("target : ",target)
    src_vocab = Vocab(source, min_freq=2,reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = Vocab(target, min_freq=2,reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab

In [73]:
import os

In [74]:
def read_data_nmt():
    data_dir=r'F:\study\ml\DataSet\fra-eng'
    with open(os.path.join(data_dir,'fra.txt'),'r',encoding='utf-8') as f:
        return f.read()

In [75]:
def tokenize_nmt(text,num_examples=None):
    source,target=[],[]
    for i ,line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts=line.split('\t')
        if len(parts)==2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source,target 
    

        

In [76]:
def truncate_pad(line,num_steps,padding_token):
    if len(line)>num_steps:
        return line[:num_steps]
    return line + [padding_token]*(num_steps-len(line))


In [77]:
def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator.

    Defined in :numref:`sec_utils`"""
    dataset = torch.utils.data.TensorDataset(*data_arrays)
    return torch.utils.data.DataLoader(dataset, batch_size, shuffle=is_train)

In [78]:
class EncoderDecoder(nn.Module):
    """编码器-解码器架构的基类"""
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)

In [79]:
embed_size,num_hiddens,num_layers,dropout=32,32,2,0.1
batch_size,num_steps=64,10
lr,num_epochs,device=0.005,300,'cpu'


In [80]:
train_iter, src_vocab, tgt_vocab =load_data_nmt(batch_size, num_steps)

In [81]:
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers,dropout)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers,dropout)

In [82]:
net = EncoderDecoder(encoder, decoder)

In [83]:
train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

epoch :  10  train loss :  0.3717254590913537
epoch :  20  train loss :  0.3053531175114926
epoch :  30  train loss :  0.2551770802767997
epoch :  40  train loss :  0.21818605043891823
epoch :  50  train loss :  0.19044174552523474
epoch :  60  train loss :  0.169087441314554
epoch :  70  train loss :  0.15245553180192264
epoch :  80  train loss :  0.13915062368569053
epoch :  90  train loss :  0.12829703190206052
epoch :  100  train loss :  0.11937162253032081
epoch :  110  train loss :  0.1118645674949317
epoch :  120  train loss :  0.10549232351574726
epoch :  130  train loss :  0.10000449075328037
epoch :  140  train loss :  0.09522963518960988
epoch :  150  train loss :  0.09106233902256129
epoch :  160  train loss :  0.08736981077000196
epoch :  170  train loss :  0.08406418648738838
epoch :  180  train loss :  0.08111838226504087
epoch :  190  train loss :  0.0784633421876287
epoch :  200  train loss :  0.07607555598347027
epoch :  210  train loss :  0.07389829427956629
epoch : 

In [None]:
def predict_seq2seq(net,src_sentence,src_sentence,tgt_vocab,num_steps,device,
                    save_attention_weights=False):
    net.eval()
    src_tokens=src_vocab[src_sentence.lower().split(' ')]+[src_vocab['<eos>']]
    enc_valid_len
    
    