In [1]:
import torch
from d2l import torch as d2l

In [4]:
## Downloading and preprocessing the dataset

class MTFraEng(d2l.DataModule):
    """English-French sentence-pair dataset for machine translation.

    Pipeline: download the raw tab-separated text, normalise it, split it into
    token lists, then pad/trim the token lists and index them into tensors.

    The methods read ``self.root``, ``self.num_steps``, ``self.num_train``,
    ``self.num_val``, ``self.arrays``, ``self.src_vocab`` and
    ``self.tgt_vocab``. None of these is assigned inside this class body, so
    they must be set on the instance before the methods that need them run.
    """

    def _download(self):
        """Download and extract the ``fra-eng`` archive; return the corpus text.

        Returns:
            The full content of ``fra-eng/fra.txt`` (under ``self.root``)
            decoded as UTF-8.
        """
        # The hex string is handed to d2l.download as its third argument
        # (presumably an integrity hash of the archive -- confirm against d2l).
        d2l.extract(d2l.download(
            d2l.DATA_URL + 'fra-eng.zip', self.root,
            '94646ad1522d915e7b0f9296181140edcf86a4f5'))
        with open(self.root + '/fra-eng/fra.txt', encoding='utf-8') as f:
            return f.read()

    def _preprocess(self, text):
        """Lower-case ``text`` and insert a space before ``,.!?`` where missing.

        Args:
            text: Raw corpus text.

        Returns:
            The normalised text as a single string.
        """
        # Replace non-breaking spaces with ordinary spaces.
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        # Lower-case *before* indexing. str.lower() can change the length of
        # the string (e.g. 'İ' lower-cases to two code points), so looking up
        # the previous character in the un-lowered string could be misaligned
        # or run past the end of the string.
        text = text.lower()
        out = []
        for i, char in enumerate(text):
            # A punctuation mark that directly follows a non-space character
            # gets a separating space so it becomes its own token later.
            needs_space = i > 0 and char in ',.!?' and text[i - 1] != ' '
            out.append(' ' + char if needs_space else char)
        return ''.join(out)

    def _tokenize(self, text, max_examples=None):
        """Split preprocessed text into source and target token lists.

        Args:
            text: Newline-separated lines, each ``source<TAB>target``.
            max_examples: Maximum number of lines to read. ``None`` (or any
                other falsy value such as ``0``) means no limit.

        Returns:
            ``(src, tgt)``: two parallel lists of token lists. Every token
            list ends with ``'<eos>'``. Lines that do not contain exactly one
            tab are skipped.
        """
        src, tgt = [], []
        for i, line in enumerate(text.split('\n')):
            # ``i`` is zero-based, so stop once ``max_examples`` lines have
            # been consumed (``>`` here would let one extra line through).
            if max_examples and i >= max_examples:
                break
            parts = line.split('\t')
            if len(parts) == 2:
                # Filtering on ``if t`` drops the empty strings produced by
                # consecutive spaces.
                src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
                tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
        return src, tgt

    def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
        """Turn raw text into index tensors plus the vocabularies used.

        Args:
            raw_text: Unprocessed ``source<TAB>target`` lines.
            src_vocab: Existing source vocabulary, or ``None`` to build one.
            tgt_vocab: Existing target vocabulary, or ``None`` to build one.

        Returns:
            ``((src_array, decoder_input, src_valid_len, label), src_vocab,
            tgt_vocab)`` where ``decoder_input`` is the target array without
            its last column and ``label`` is the target array without its
            first column.
        """
        def _build_array(sentences, vocab, is_tgt=False):
            # Truncate to ``t`` tokens, or right-pad with '<pad>' up to ``t``.
            pad_or_trim = lambda seq, t: (
                seq[:t] if len(seq) > t else seq + ['<pad>'] * (t - len(seq)))
            sentences = [pad_or_trim(s, self.num_steps) for s in sentences]
            if is_tgt:
                # '<bos>' is prepended after padding, so target rows hold
                # num_steps + 1 tokens; the caller slices one column off
                # each end to get decoder input and label.
                sentences = [['<bos>'] + s for s in sentences]
            if vocab is None:
                vocab = d2l.Vocab(sentences, min_freq=2)
            array = torch.tensor([vocab[s] for s in sentences])
            # Number of non-'<pad>' positions in each row.
            valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
            return array, vocab, valid_len

        src, tgt = self._tokenize(self._preprocess(raw_text),
                                  self.num_train + self.num_val)
        src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
        # The target's valid length is computed but not part of the result.
        tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
        return ((src_array, tgt_array[:, :-1], src_valid_len,
                 tgt_array[:, 1:]),
                src_vocab, tgt_vocab)

    def get_dataloader(self, train):
        """Return a loader over the training rows or the remaining rows.

        Args:
            train: If true, use rows ``[0, num_train)``; otherwise use every
                row from ``num_train`` onwards.
        """
        idx = slice(0, self.num_train) if train else slice(self.num_train, None)
        # get_tensorloader is not defined in this class (presumably inherited
        # from d2l.DataModule -- confirm).
        return self.get_tensorloader(self.arrays, train, idx)

    def build(self, src_sentences, tgt_sentences):
        """Encode new sentence pairs with the vocabularies already on ``self``.

        Args:
            src_sentences: Iterable of source-language strings.
            tgt_sentences: Iterable of target-language strings, paired with
                ``src_sentences`` by position.

        Returns:
            The 4-tuple of tensors produced by ``_build_arrays``.
        """
        # Re-create the 'source<TAB>target' line format that _tokenize expects.
        raw_text = '\n'.join(
            [src + '\t' + tgt for src, tgt in zip(src_sentences, tgt_sentences)])
        # Passing the stored vocabularies means no new vocabulary is built.
        arrays, _, _ = self._build_arrays(raw_text, self.src_vocab, self.tgt_vocab)
        return arrays

In [5]:
# Instantiate the dataset without arguments (the class body defines no
# __init__ of its own) and read the raw corpus text.
data = MTFraEng()
raw_text = data._download()

# Show the first 75 characters of the raw file.
print(raw_text[:75])

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [6]:
# Normalise the raw text and show the first 75 characters of the result.
text = data._preprocess(raw_text)
print(text[:75])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça al


In [7]:
# Tokenize the whole preprocessed text (no max_examples limit is passed).
src, tgt = data._tokenize(text)
# Cell output: the first six source and target token lists.
src[:6], tgt[:6]

([['go', '.', '<eos>'],
  ['hi', '.', '<eos>'],
  ['run', '!', '<eos>'],
  ['run', '!', '<eos>'],
  ['who', '?', '<eos>'],
  ['wow', '!', '<eos>']],
 [['va', '!', '<eos>'],
  ['salut', '!', '<eos>'],
  ['cours', '!', '<eos>'],
  ['courez', '!', '<eos>'],
  ['qui', '?', '<eos>'],
  ['ça', 'alors', '!', '<eos>']])

In [8]:
@d2l.add_to_class(MTFraEng)
def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128):
    """Download the corpus and build the index tensors and vocabularies.

    Sets ``self.arrays``, ``self.src_vocab`` and ``self.tgt_vocab`` from the
    result of ``_build_arrays``.
    """
    # This function is defined at module level and attached to the class by
    # the decorator, so the explicit two-argument super() form is used here.
    super(MTFraEng, self).__init__()
    # Presumably stores the constructor arguments as attributes of the same
    # name (the class methods read self.num_steps, self.num_train and
    # self.num_val) -- confirm against d2l before adding locals above it.
    self.save_hyperparameters()
    self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
        self._download())

In [9]:
# Rebuild the dataset with mini-batches of three sentence pairs.
data = MTFraEng(batch_size=3)

# Pull the first mini-batch from the training loader; it unpacks into the
# four tensors returned by _build_arrays.
src, tgt, src_valid_len, label = next(iter(data.train_dataloader()))

# Cast to int32 for display only.
print('source:', src.int())
print('decoder input:', tgt.int())
print('source len excluding pad:', src_valid_len.int())
print('label:', label.int())

source: tensor([[ 91, 191,   2,   3,   4,   4,   4,   4,   4],
        [ 84, 127,   2,   3,   4,   4,   4,   4,   4],
        [144, 174,   0,   3,   4,   4,   4,   4,   4]], dtype=torch.int32)
decoder input: tensor([[  3, 211,   6,   2,   4,   5,   5,   5,   5],
        [  3, 105,   6,   2,   4,   5,   5,   5,   5],
        [  3,  87,   0,   4,   5,   5,   5,   5,   5]], dtype=torch.int32)
source len excluding pad: tensor([4, 4, 4], dtype=torch.int32)
label: tensor([[211,   6,   2,   4,   5,   5,   5,   5,   5],
        [105,   6,   2,   4,   5,   5,   5,   5,   5],
        [ 87,   0,   4,   5,   5,   5,   5,   5,   5]], dtype=torch.int32)


In [10]:
# Encode one new sentence pair with the vocabularies stored on `data`.
# Only the first two of the four returned tensors are needed; slicing the
# tuple (instead of unpacking the rest into `_`) avoids rebinding `_`, which
# IPython uses for the previous cell output.
src, tgt = data.build(['hi .'], ['salut .'])[:2]

# Map the indices of the single encoded row back to tokens.
print('source:', data.src_vocab.to_tokens(src[0].type(torch.int32)))
print('target:', data.tgt_vocab.to_tokens(tgt[0].type(torch.int32)))

source: ['hi', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
target: ['<bos>', 'salut', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
