In [2]:
import numpy as np
import jieba
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
from collections import Counter

In [5]:
def read_data(file_path):
    '''Load a tab-separated parallel corpus and tokenize it on single spaces.

    Every usable line holds ``source<TAB>target``. Text is stripped and
    lower-cased before being split. Columns after the second one are ignored,
    and lines with fewer than two columns (blank lines included) are skipped.

    parameter
    -----------------
    file_path : str
        Path to a UTF-8 encoded text file.

    return
    -----------------
    src : list of list of str
        Tokens of each source sentence.
    tgt : list of list of str
        Tokens of each target sentence, aligned with ``src`` by position.
    '''
    src = []
    tgt = []
    with open(file_path, mode='r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().lower().split('\t')
            # A blank or malformed line yields fewer than two columns; skip it
            # instead of failing on tuple unpacking.
            if len(fields) < 2:
                continue
            # Only the first two columns are sentence text.
            src.append(fields[0].split(' '))
            tgt.append(fields[1].split(' '))
    return src, tgt
# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, so this relative path does not depend on the platform.
path = './data/eng-fra.txt'
src_data, tgt_data = read_data(path)

In [6]:
# Show ten aligned sentence pairs: source tokens on the first line, target
# tokens on the second.
print(src_data[10:20], tgt_data[10:20], sep='\n')

[['wait!'], ['wait!'], ['i', 'see.'], ['i', 'try.'], ['i', 'won!'], ['i', 'won!'], ['oh', 'no!'], ['attack!'], ['attack!'], ['cheers!']]
[['attends', '!'], ['attendez', '!'], ['je', 'comprends.'], ["j'essaye."], ["j'ai", 'gagné', '!'], ['je', "l'ai", 'emporté', '!'], ['oh', 'non', '!'], ['attaque', '!'], ['attaquez', '!'], ['santé', '!']]


In [74]:
# Vocabulary construction
class Vocab:
    '''Token <-> index vocabulary built from tokenized text.

    Indices 0-3 are reserved for the special tokens declared below. The
    remaining slots are filled with the most frequent corpus tokens, most
    frequent first.
    '''
    # Special tokens: padding, unknown word, start of sequence, end of sequence
    PAD = '<pad>'
    UNK = '<unk>'
    SOS = '<SOS>'
    EOS = '<EOS>'

    # Fixed indices of the special tokens
    pad_idx = 0
    unk_idx = 1
    sos_idx = 2
    eos_idx = 3

    def __init__(self,text,max_vocab = 5000):
        '''Count the tokens in ``text`` and build both lookup tables.

        parameter
        -----------------
        text : iterable of list of str
            Tokenized sentences.
        max_vocab : int
            Maximum vocabulary size, special tokens included.

        Attributes set: ``word_index`` (token -> index), ``index_word``
        (index -> token) and ``vocab_size`` (number of entries).
        '''
        counts = Counter()
        for text_line in text:
            counts.update(text_line)

        cls = self.__class__
        specials = {
            cls.PAD: cls.pad_idx,
            cls.UNK: cls.unk_idx,
            cls.SOS: cls.sos_idx,
            cls.EOS: cls.eos_idx,
        }
        # A corpus token spelled exactly like a special token must not claim a
        # regular slot: that would overwrite the reserved index in word_index
        # and leave index_word without an entry for it.
        for token in specials:
            counts.pop(token, None)

        self.word_index = dict(specials)
        # Never request a negative number of tokens when max_vocab is smaller
        # than the number of special tokens.
        n_regular = max(max_vocab - len(specials), 0)
        for idx, (word, _count) in enumerate(counts.most_common(n_regular), start=len(specials)):
            self.word_index[word] = idx

        self.index_word = {index: word for word, index in self.word_index.items()}
        self.vocab_size = len(self.word_index)

# One vocabulary per language, then the token stored at index 10 of each
# (source first, target second).
src_vocab = Vocab(src_data)
tgt_vocab = Vocab(tgt_data)
print(src_vocab.index_word[10], tgt_vocab.index_word[10], sep='\n')

he
le


In [75]:
# Custom dataset
class ParallelDataset(Dataset):
    '''Dataset of aligned (source, target) index tensors of fixed length.

    Source sentences are mapped to indices and padded or truncated to
    ``max_src_length``. Target sentences are wrapped as
    ``<SOS> tokens... <EOS>`` and padded to ``max_tgt_length``. All special
    indices are read from the vocabulary objects passed in.
    '''
    # Every target sequence carries one <SOS> and one <EOS> marker.
    _NUM_TGT_MARKERS = 2

    def __init__(self,src_data,tgt_data,src_vocab,tgt_vocab,
                 max_src_length=None,max_tgt_length=None):
        '''Convert every sentence pair to a pair of LongTensors.

        parameter
        -----------------
        src_data, tgt_data : list of list of str
            Tokenized source and target sentences, aligned by position.
        src_vocab, tgt_vocab : vocabulary objects
            Must expose ``word_index`` plus ``pad_idx``, ``unk_idx``,
            ``sos_idx`` and ``eos_idx``.
        max_src_length : int or None
            Fixed source length; defaults to the longest source sentence.
        max_tgt_length : int or None
            Fixed target length, markers included; defaults to the longest
            target sentence plus the two markers.
        '''
        if max_src_length is None:
            max_src_length = self.__get_max_seq_len__(src_data)
        if max_tgt_length is None:
            # Leave room for <SOS>/<EOS>; otherwise the longest sentences
            # would be truncated and lose their <EOS>.
            max_tgt_length = self.__get_max_seq_len__(tgt_data) + self._NUM_TGT_MARKERS

        # Number of real target tokens that fit next to the two markers.
        max_tgt_tokens = max(max_tgt_length - self._NUM_TGT_MARKERS, 0)

        self.data = []
        for src,tgt in zip(src_data,tgt_data):
            src_idx = [src_vocab.word_index.get(token,src_vocab.unk_idx) for token in src]
            # Trim the sentence before wrapping it so <EOS> survives truncation.
            tgt_idx = [tgt_vocab.word_index.get(token,tgt_vocab.unk_idx)
                       for token in tgt[:max_tgt_tokens]]
            tgt_idx = [tgt_vocab.sos_idx] + tgt_idx + [tgt_vocab.eos_idx]

            # Pad or truncate to the fixed lengths
            src_idx = self.__pad_or_truncatr__(src_idx,max_src_length,src_vocab.pad_idx)
            tgt_idx = self.__pad_or_truncatr__(tgt_idx,max_tgt_length,tgt_vocab.pad_idx)

            # Store the sequences as tensors
            self.data.append((torch.LongTensor(src_idx),torch.LongTensor(tgt_idx)))

    def __len__(self):
        '''Number of sentence pairs.'''
        return len(self.data)

    def __getitem__(self,index):
        '''Return the ``(src_tensor, tgt_tensor)`` pair at ``index``.'''
        return self.data[index]

    def __pad_or_truncatr__(self,seq,max_len,pad_idx=None):
        '''Return ``seq`` cut or right-padded to exactly ``max_len`` items.

        ``pad_idx`` falls back to ``Vocab.pad_idx`` when omitted, which keeps
        the original two-argument call working.
        '''
        if pad_idx is None:
            pad_idx = Vocab.pad_idx
        seq_len = len(seq)
        if seq_len > max_len:
            return seq[:max_len]
        return seq + [pad_idx]*(max_len - seq_len)

    def __get_max_seq_len__(self,text_data):
        '''Length of the longest sequence in ``text_data`` (0 when empty).'''
        return max((len(d) for d in text_data),default=0)


In [76]:
# Positional encoding

class PositionalEncoding(nn.Module):
    '''Adds fixed sinusoidal position information to a batch of embeddings.'''
    def __init__(self,d_model,max_length = 1000):
        '''Precompute the encoding table.

        parameter
        --------------
        d_model : int
            Embedding dimension (even or odd).
        max_length : int
            Longest sequence length the table covers.
        '''
        super().__init__()
        # Encoding table, one row per position
        pe = torch.zeros(max_length,d_model)
        # Column vector of positions 0 .. max_length-1, shape (max_length, 1),
        # so it broadcasts against the frequency row below.
        position = torch.arange(0,max_length,dtype=torch.float).unsqueeze(1)
        # exp(log(a) * b) == a ** b, so this equals 10000 ** (-2i / d_model)
        div_term = torch.exp(
            torch.arange(0,d_model,2,dtype=torch.float) * (-np.log(10000.0)/d_model)
        )
        # angles.shape == (max_length, ceil(d_model / 2))
        angles = position * div_term
        pe[:,0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns;
        # dropping the surplus cosine column keeps the assignment valid.
        pe[:,1::2] = torch.cos(angles)[:,:d_model // 2]
        # A buffer is not a parameter (the optimizer does not update it), but it
        # is saved in state_dict and moves with the module on .to(device),
        # unlike a plain attribute.
        self.register_buffer('pe',pe)

    def forward(self,x):
        '''Return ``x`` plus the positional encoding.

        ``x`` is expected to be ``(batch_size, seq_length, d_model)``. The
        input tensor itself is left unmodified.
        '''
        # Out-of-place add: an in-place `+=` would overwrite the caller's
        # tensor and is rejected by autograd for leaf tensors requiring grad.
        return x + self.pe[:x.size(1)].to(x.device)

In [77]:
class TransformerModel(nn.Module):
    '''Sequence-to-sequence Transformer.

    ``nn.Transformer`` provides only the encoder/decoder stack, so this class
    adds token embeddings, positional encoding and the output projection.
    '''
    def __init__(self,scr_vocab_size,tgt_vocab_size,d_model = 512,
                 num_heads = 8,num_layers = 6,ff_dim = 2048,dropout = 0.1):
        '''
        parameter
        ---------------
        scr_vocab_size : int
            Size of the source vocabulary.
        tgt_vocab_size : int
            Size of the target vocabulary; also the output dimension.
        d_model : int
            Embedding / model dimension.
        num_heads : int
            Number of attention heads.
        num_layers : int
            Number of encoder layers and of decoder layers.
        ff_dim : int
            Hidden size of the feed-forward sublayers.
        dropout : float
            Dropout rate inside the Transformer layers.
        '''
        super().__init__()
        self.src_embed = nn.Embedding(scr_vocab_size,d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab_size,d_model)
        # One shared positional encoder serves both source and target.
        self.pos_encoder = PositionalEncoding(d_model)

        # Transformer stack
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        # Output projection to target-vocabulary logits
        self.fc_out = nn.Linear(d_model,tgt_vocab_size)

    def forward(self,src_ids,tgt_ids):
        '''
        parameter
        ---------------
        src_ids : torch.Tensor shape = (batch_size,src_seq_len)
            Source token indices.
        tgt_ids : torch.Tensor shape = (batch_size,tgt_seq_len)
            Target token indices fed to the decoder.

        return
        --------------
        out : torch.Tensor shape = (batch_size,tgt_seq_len,tgt_vocab_size)
            Unnormalized scores over the target vocabulary.
        '''
        src_mask,tgt_mask,src_pad_mask,tgt_pad_mask = self.generate_mask(src_ids,tgt_ids)
        src_embed = self.pos_encoder(self.src_embed(src_ids))
        tgt_embed = self.pos_encoder(self.tgt_embed(tgt_ids))
        out = self.transformer(
            src_embed,
            tgt_embed,
            src_mask=src_mask,                       # encoder self-attention mask
            tgt_mask=tgt_mask,                       # decoder (causal) self-attention mask
            memory_mask=None,                        # no extra mask on decoder -> encoder attention
            src_key_padding_mask=src_pad_mask,       # hide padded source positions from the encoder
            tgt_key_padding_mask=tgt_pad_mask,       # hide padded target positions from the decoder
            memory_key_padding_mask=src_pad_mask     # hide padded source positions from cross-attention
        )

        return self.fc_out(out)


    def generate_mask(self,src,tgt):
        '''
        Build the attention masks and padding masks for encoder and decoder.

        All four masks are boolean (True means "may not be attended to") and
        are created directly on the device of the corresponding input. Using
        one dtype for attention and padding masks avoids the mismatched-mask
        deprecation in recent PyTorch releases.

        parameter
        -----------------
        src : torch.Tensor shape = (batch_size,src_seq_len)
            Source indices.
        tgt : torch.Tensor shape = (batch_size,tgt_seq_len)
            Target indices.

        return
        ------------------
        src_mask : shape = (src_seq_len,src_seq_len)
            Encoder attention mask; all False, i.e. nothing is blocked.
        tgt_mask : shape = (tgt_seq_len,tgt_seq_len)
            Causal decoder mask; True above the diagonal.
        src_pad_mask : shape = (batch_size,src_seq_len)
            True where the source holds the padding index.
        tgt_pad_mask : shape = (batch_size,tgt_seq_len)
            True where the target holds the padding index.
        '''
        src_len = src.size(1)
        tgt_len = tgt.size(1)
        src_mask = torch.zeros(src_len,src_len,dtype=torch.bool,device=src.device)
        tgt_mask = self.generate_causal_mask(tgt_len,device=tgt.device)
        src_pad_mask = (src == Vocab.pad_idx)
        tgt_pad_mask = (tgt == Vocab.pad_idx)
        return src_mask,tgt_mask,src_pad_mask,tgt_pad_mask


    def generate_causal_mask(self,seq_len,device=None):
        '''Boolean ``(seq_len, seq_len)`` mask that is True strictly above the
        diagonal, so position i cannot attend to any later position.

        ``device`` selects where the mask is allocated (default device when
        omitted).
        '''
        mask = torch.triu(torch.ones(seq_len,seq_len,device=device),diagonal=1)
        return mask.bool()

In [43]:
batch_size = 32
learning_rate = 0.001
epochs = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = ParallelDataset(src_data,tgt_data,src_vocab,tgt_vocab)
train_dataloader = DataLoader(dataset,batch_size = batch_size,shuffle = True)

model = TransformerModel(src_vocab.vocab_size,tgt_vocab.vocab_size,d_model = 128,num_layers=3).to(device)
# Pass the configured rate explicitly; otherwise editing `learning_rate`
# above would have no effect on training.
optimizer = optim.Adam(model.parameters(),lr = learning_rate)
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = Vocab.pad_idx)

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for src_idx,tgt_idx in train_dataloader:
        src_idx,tgt_idx = src_idx.to(device),tgt_idx.to(device)
        # Teacher forcing: the decoder input is the target without its last
        # position, and the label is the target shifted left by one.
        tgt_inputs_idx = tgt_idx[:,:-1]
        tgt_target_idx = tgt_idx[:,1:]
        # outputs.shape = (batch_size,tgt_seq_len,tgt_vocab_size)
        outputs = model(src_idx,tgt_inputs_idx)
        # Flatten batch and time for the loss. reshape() returns a view when
        # the memory layout allows it and copies otherwise, so it also handles
        # the sliced (non-contiguous) target tensor.
        loss = criterion(outputs.reshape(-1,outputs.size(-1)),tgt_target_idx.reshape(-1))

        # Clear old gradients
        optimizer.zero_grad()
        # Backpropagate
        loss.backward()
        # Update the parameters
        optimizer.step()
        train_loss += loss.item()
    # Mean loss per batch for this epoch
    train_loss/=len(train_dataloader)

    # No validation metric is computed: a sentence can have several valid
    # translations, so comparing against a single reference gives no exact
    # score. Only the training loss is reported.
    print(f'Eopch{epoch+1} / {epochs},loss = {train_loss:.4f}')




Eopch1 / 10,loss = 3.2012
Eopch2 / 10,loss = 2.0728
Eopch3 / 10,loss = 1.6986
Eopch4 / 10,loss = 1.5136
Eopch5 / 10,loss = 1.4014
Eopch6 / 10,loss = 1.3213
Eopch7 / 10,loss = 1.2608
Eopch8 / 10,loss = 1.2138
Eopch9 / 10,loss = 1.1757
Eopch10 / 10,loss = 1.1411


In [81]:
# Translation (greedy decoding)
@torch.no_grad()
def translate(model,src_sentence,max_length = 10,device = device,verbose = True):
    '''Greedily translate one source sentence.

    parameter
    -----------------
    model : nn.Module
        Trained model called as ``model(src_idx, tgt_idx)``; it is switched
        to eval mode.
    src_sentence : str
        Source sentence with tokens separated by single spaces.
    max_length : int
        Maximum number of tokens to generate.
    device : torch.device
        Device the input tensors are moved to; should match the model's.
    verbose : bool
        When True, print the source tokens, the source indices and the
        generated index sequence.

    return
    -----------------
    str
        Generated target tokens joined by spaces, without <SOS>/<EOS>.
    '''
    model.eval()
    # Lower-case like read_data does for the training corpus, so capitalized
    # input is not mapped to <unk>.
    src_tokens = src_sentence.strip().lower().split(' ')
    src_idx = [src_vocab.word_index.get(token,Vocab.unk_idx) for token in src_tokens]
    src_idx = torch.LongTensor(src_idx).unsqueeze(0)

    tgt_idx = torch.LongTensor([[Vocab.sos_idx]])
    if verbose:
        print(src_tokens)
        print(src_idx)

    # Move the inputs once; tensors created in the loop stay on this device.
    src_idx,tgt_idx = src_idx.to(device),tgt_idx.to(device)
    for _ in range(max_length):
        outputs = model(src_idx,tgt_idx)
        # Only the prediction at the last position selects the next token.
        next_token = outputs[0,-1].argmax().reshape(1,1)
        tgt_idx = torch.cat([tgt_idx,next_token],dim=1)
        if next_token.item() == Vocab.eos_idx:
            break
    if verbose:
        print(tgt_idx)

    # Drop the leading <SOS>. Drop the last token only if it really is <EOS>:
    # when decoding stops at max_length the last token is a real word.
    generated = tgt_idx.squeeze(0).tolist()[1:]
    if generated and generated[-1] == Vocab.eos_idx:
        generated = generated[:-1]
    return ' '.join(tgt_vocab.index_word[idx] for idx in generated)

# Quick check on a short sentence; the cell displays the returned string.
translate(model, src_sentence='stop !')

['stop', '!']
tensor([[192,   1]])
tensor([[  2, 791,   5,   1,  31,   3]], device='cuda:0')


'arrêtez de <unk> !'

In [None]:
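# Problems encountered:
# 1. Mask device mismatch: tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
#    When the masks are built inside the model, move them to the same device as the inputs.
# 2. When translating, move the input tensors to the model's device (CUDA here) before calling the model.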