In [11]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

## 0x01 文本数据预处理
最终的任务是按照：句子 --- 句子 的形式（sentence to sentence）
数据预处理的目的是将句子向量化。需要以下步骤：
1. 将句子转换成为单元 tokenized
2. 根据单元构建词典 vocab
3. 根据vocab来对句子向量化
最终得到[1,3,4,5,...]和[5,4,2,...]之间的转换


In [12]:
def tokenized(sentence):
    sentence = sentence.replace(',',' ,').replace('.',' .')
    return sentence.split()
print('--> tokenized() defined.')
print('Example for tokenized():')
print('------------------------------------')
temp = 'Hello world, I am a student.'
print('Input sentence: ')
print(temp)
print('Output tokens: ')
print(tokenized(temp))
print('------------------------------------')

# torchtext.vocab can help us to build vocabulary
from torchtext.vocab import vocab
from collections import Counter
def build_vocab_my(tokenizer,filepath,min_freq=2,specials=None):
    ''' 
    tokenizer: function to tokenize sentence
    filepath: path to the file
    min_freq: the minimum frequency of the word
    specials: the special tokens
    '''
    counter = Counter()
    if specials is None:
        specials = ['<unk>', '<pad>', '<bos>', '<eos>']

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            words = tokenizer(line.strip())
            counter.update(words)
    print('The size of vocabulary before filtering is: ',len(counter))
    return vocab(counter,min_freq=min_freq,specials=specials)

path = '../input/translate/train.en'
vocab = build_vocab(tokenized,path,min_freq=2)
print('--> build_vocab() defined.')
print('Example for build_vocab():')
print('------------------------------------')
print('The size of vocabulary is: ',len(vocab))
print('The index of word "the" is: ',vocab['<eos>'])
print('------------------------------------')


--> tokenized() defined.
Example for tokenized():
------------------------------------
Input sentence: 
Hello world, I am a student.
Output tokens: 
['Hello', 'world', ',', 'I', 'am', 'a', 'student', '.']
------------------------------------
The size of vocabulary before filtering is:  11581
--> build_vocab() defined.
Example for build_vocab():
------------------------------------
The size of vocabulary is:  6247
The index of word "the" is:  3
------------------------------------


## 0x02 定义数据读取类

In [16]:
# torchtext.data.utils.get_tokenizer can help us to tokenize sentence
from torch.utils.data import DataLoader
class LoadEnglishData:
    def __init__(self,tokenizer,path,batchsize=8,shuffle=True):
        '''
        tokenizer: function to tokenize sentence 
        '''
        self.tokenizer = tokenizer
        self.path = path
        # self.en_tokenizer = get_tokenizer('spacy', language='../input/translate/en_core_web_sm-3.0.0/en_core_web_sm/en_core_web_sm-3.0.0/')
        # self.de_tokenizer = get_tokenizer('spacy', language='../input/translate/de_core_news_sm-3.0.0/de_core_news_sm/de_core_news_sm-3.0.0/')
        def build_vocab_my(tokenizer,filepath,min_freq=2,specials=None):
            ''' 
            tokenizer: function to tokenize sentence
            filepath: path to the file
            min_freq: the minimum frequency of the word
            specials: the special tokens
            '''
            counter = Counter()
            if specials is None:
                specials = ['<unk>', '<pad>', '<bos>', '<eos>']

            with open(filepath, encoding='utf-8') as f:
                for line in f:
                    words = tokenizer(line.strip())
                    counter.update(words)
            print('The size of vocabulary before filtering is: ',len(counter))
            return vocab(counter,min_freq=min_freq,specials=specials)
        self.en_vocab = build_vocab_my(tokenized,path[0])
        self.de_vocab = build_vocab_my(tokenized,path[1])
        self.batchsize = batchsize
        self.PAD_IDX = self.en_vocab['<pad>']
        self.BOS_IDX = self.en_vocab['<bos>']
        self.EOS_IDX = self.en_vocab['<eos>']
        self.specials = ['<unk>', '<pad>', '<bos>', '<eos>']
    
    def data_process(self,path):
        '''
        vectorize the data to dataset
        
        :param path: path to the file
        '''
        raw_de_iter = iter(open(path[0], encoding="utf8"))
        raw_en_iter = iter(open(path[1], encoding="utf8"))
        data = []
        for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
            de_tensor_ = torch.tensor([int(self.de_vocab[token]) for token in self.tokenizer(raw_de)], dtype=torch.long)
            en_tensor_ = torch.tensor([int(self.en_vocab[token]) for token in self.tokenizer(raw_en)], dtype=torch.long)
            data.append((de_tensor_, en_tensor_))
        return data

    def load_dataloader(self,train_path,test_path,valid_path):
        '''
        load data from the file
        '''
        train_data = self.data_process(train_path)
        val_data = self.data_process(valid_path)
        test_data = self.data_process(test_path)

        train_dataloader = DataLoader(train_data, batch_size=self.batchsize, shuffle=True, collate_fn=self.collate_fnX)
        val_dataloader = DataLoader(val_data, batch_size=self.batchsize, shuffle=True, collate_fn=self.collate_fnX)
        test_dataloader = DataLoader(test_data, batch_size=self.batchsize, shuffle=True, collate_fn=self.collate_fnX)
        return train_dataloader,val_dataloader,test_dataloader
    
    def collate_fnX(self,data_batch):
        '''
        Attention : which we need to collate the function
        1. 为decoder增加bos和eos
        2. 在同一个batch中将数据padding到同一长度
        不同batch的长度不一样
        transform data with padding
        '''
        de_batch, en_batch = [], []
        for (de_item, en_item) in data_batch:
            de_batch.append(de_item)
            en_batch.append(torch.cat([torch.tensor([self.en_vocab['<bos>']]), en_item, torch.tensor([self.en_vocab['<eos>']])], dim=0))
        de_batch = torch.nn.utils.rnn.pad_sequence(de_batch, padding_value=self.PAD_IDX)
        en_batch = torch.nn.utils.rnn.pad_sequence(en_batch, padding_value=self.PAD_IDX)
        return de_batch, en_batch
        

    def create_mask(self,x,y,device='cpu'):
        '''
        create mask
        '''
        x_seq_len = x.size(0)
        y_seq_len = y.size(0)
        # encoder mask (for masking the padding tokens)
        x_mask = torch.zeros((x_seq_len,x_seq_len),device=device).type(torch.bool)

        # decoder mask (for masking the future tokens)
        tgt_mask = torch.triu(torch.ones((y_seq_len,y_seq_len),device=device)).type(torch.bool)
        y_mask = tgt_mask.float().masked_fill(tgt_mask==0,float('-inf')).masked_fill(tgt_mask==1,float(0.0))
        y_mask = y_mask.transpose(0,1)

        # x_padding mask (for masking out the padding tokens    )
        x_padding_mask = (x == self.PAD_IDX).transpose(0,1)
        # y_padding_mask (for masking out the padding tokens)
        y_padding_mask = (y == self.PAD_IDX).transpose(0,1)
        return x_mask,y_mask,x_padding_mask,y_padding_mask


train_path = ['../input/translate/train.en','../input/translate/train.de']  
valid_path = ['../input/translate/val.en','../input/translate/val.de']
test_path = ['../input/translate/test_2016_flickr.en','../input/translate/test_2016_flickr.de']
batchsize = 8
shuffle = True
english_data = LoadEnglishData(tokenized,train_path,batchsize,shuffle)

train_dataloader,val_dataloader,test_dataloader = english_data.load_dataloader(train_path,test_path,valid_path)
for x,y in train_dataloader:
    print('The shape of x is: ',x.shape)
    print('The shape of y is: ',y.shape)
    print(english_data.create_mask(x,y))
    print(y)
    break

The size of vocabulary before filtering is:  11581


TypeError: forward() got an unexpected keyword argument 'min_freq'

## 0x03 定义模型

In [48]:
import torch.nn as nn 
import torch 
from torch.nn import Transformer 
import math
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size,embedding_size) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.embedding_size = embedding_size
    
    def forward(self,x):
        return self.embedding(x.long())*math.sqrt(self.embedding_size)

class PositionalEncoding(nn.Module):
    def __init__(self,dimen,dropout=0.1,max_len=10000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        PE = torch.zeros(max_len,dimen)
        position = torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,dimen,2).float() * (-math.log(10000.0) / dimen))
        PE[:,0::2] = torch.sin(position * div_term)
        PE[:,1::2] = torch.cos(position * div_term)
        PE = PE.unsqueeze(0).transpose(0,1)
        self.register_buffer('PE',PE)
    
    def forward(self,x):
        x = x + self.PE[:x.size(0),:]
        return self.dropout(x)
    
class TranslationModel(nn.Module):
    def __init__(self,
                 x_vocab_size,
                 y_vocab_size,
                 d_model,
                 nhead,
                 num_encoder_layers,
                 num_decoder_layers,
                 dim_feedforward,
                 dropout,
                 device):
        super(TranslationModel,self).__init__()
        
        self.x_embedding = TokenEmbedding(x_vocab_size,d_model)
        self.y_embedding = TokenEmbedding(y_vocab_size,d_model)
        self.positional_embedding = PositionalEncoding(d_model,dropout)
        self.transformer = Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )

        self.device = device

        self.generator = nn.Linear(d_model,y_vocab_size)
        self._reset_parameters()
    
    def forward(self,
                x,
                y,
                x_mask,
                y_mask,
                x_padding_mask,
                y_padding_mask,
                memory_key_padding_mask):
        x_embedding = self.positional_embedding(self.x_embedding(x))
        y_embedding = self.positional_embedding(self.y_embedding(y))
        print('The shape of x_embedding is: ',x_embedding.shape)
        print('The shape of y_embedding is: ',y_embedding.shape)
        # output = self.transformer(x_embedding,
        #                           y_embedding,
        #                           src_mask=x_mask,
        #                           tgt_mask=y_mask,
        #                           src_key_padding_mask=x_padding_mask,
        #                           tgt_key_padding_mask=y_padding_mask,
        #                           memory_key_padding_mask=memory_key_padding_mask)
        # output = self.generator(output)
        # return output
    
    def inference(self,x,y,max_len=100):
        x_embedding = self.positional_embedding(self.x_embedding(x))
        memory = self.transformer.encoder(x_embedding)

        y_embedding = self.positional_embedding(self.y_embedding(y))
        outputs =  self.transformer.decoder(y_embedding,memory)
        return outputs
    
    def _reset_parameters(self):
        nn.init.xavier_uniform_(self.generator.weight)
        self.generator.bias.data.zero_()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x_vocab_size = len(vocab)
y_vocab_size = len(vocab)
d_model = 512
nhead = 8
num_encoder_layers = 6
num_decoder_layers = 6
dim_feedforward = 2048
dropout = 0.1

model = TranslationModel(x_vocab_size,
                            y_vocab_size,
                            d_model,
                            nhead,
                            num_encoder_layers,
                            num_decoder_layers,
                            dim_feedforward,
                            dropout,
                            device)

for x,y in train_dataloader:
    x_mask,y_mask,x_padding_mask,y_padding_mask = english_data.create_mask(x,y,device)
    print('The shape of x is: ',x.shape)
    print('The shape of y is: ',y.shape)
    print('The shape of x_mask is: ',x_mask.shape)
    print('The shape of y_mask is: ',y_mask.shape)
    print('The shape of x_padding_mask is: ',x_padding_mask.shape)
    print('The shape of y_padding_mask is: ',y_padding_mask.shape)
    output = model(x,y,x_mask,y_mask,x_padding_mask,y_padding_mask,x_padding_mask)
    print('The shape of output is: ',output.shape)
    break

The shape of x is:  torch.Size([19, 8])
The shape of y is:  torch.Size([20, 8])
The shape of x_mask is:  torch.Size([19, 19])
The shape of y_mask is:  torch.Size([20, 20])
The shape of x_padding_mask is:  torch.Size([8, 19])
The shape of y_padding_mask is:  torch.Size([8, 20])


IndexError: index out of range in self