In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from torchtext import data
import spacy
from time import time
import pandas as pd
import copy

# Data preparation

In [2]:
en_text = open('./data/EN_DE_translation/en.txt', encoding="utf8").read().split('\n')
de_text = open('./data/EN_DE_translation/de.txt', encoding="utf8").read().split('\n')
print(en_text[:5])
print(de_text[:5])

['iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges ( corners ) of the steel ingot mould .', 'iron cement protects the ingot against the hot , abrasive steel casting process .', 'a fire restant repair cement for fire places , ovens , open fireplaces etc .', 'Construction and repair of highways and ...', 'An announcement must be commercial character .']
['iron cement ist eine gebrauchs ##AT##-##AT## fertige Paste , die mit einem Spachtel oder den Fingern als Hohlkehle in die Formecken ( Winkel ) der Stahlguss -Kokille aufgetragen wird .', 'Nach der Aushärtung schützt iron cement die Kokille gegen den heissen , abrasiven Stahlguss .', 'feuerfester Reparaturkitt für Feuerungsanlagen , Öfen , offene Feuerstellen etc.', 'Der Bau und die Reparatur der Autostraßen ...', 'die Mitteilungen sollen den geschäftlichen kommerziellen Charakter tragen .']


In [3]:
def create_tokenize_fn(tokenizer):
    return lambda sentence: [word.text for word in tokenizer(sentence.lower())]

disable = ["tagger","parser","ner","textcat"] # disable pipelines other then tokenization
tokenize_en = create_tokenize_fn(tokenizer=spacy.load('en_core_web_sm', disable=disable))
tokenize_de = create_tokenize_fn(tokenizer=spacy.load('de_core_news_sm', disable=disable))

print(tokenize_en("I am happy!"), tokenize_en("I'm happy..."))
print(de_text[0])

['i', 'am', 'happy', '!'] ['i', "'m", 'happy', '...']
iron cement ist eine gebrauchs ##AT##-##AT## fertige Paste , die mit einem Spachtel oder den Fingern als Hohlkehle in die Formecken ( Winkel ) der Stahlguss -Kokille aufgetragen wird .


In [4]:
en_field = data.Field(lower=True, tokenize=tokenize_en)
de_field = data.Field(lower=True, tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>')

In [5]:
raw_data = {'en': [sen for sen in en_text], 'de': [sen for sen in de_text]}
df = pd.DataFrame(raw_data)
temp_csv = './data/EN_DE_translation/temp.csv'
df.to_csv(temp_csv, index=False)
t = time()
dataset = data.TabularDataset(temp_csv, format='csv', fields=[('x',en_field),('y',de_field)])
print(time()-t)
print(dataset.examples[0].x, dataset.examples[0].y)

68.3418972492218
['en'] ['de']


In [6]:
en_field.build_vocab(dataset)
de_field.build_vocab(dataset)

In [7]:
en_pad_nr = en_field.vocab.stoi['<pad>'] # dict word(str) -> unique index(nr)
de_pad_nr = de_field.vocab.stoi['<pad>']
print(en_pad_nr, de_pad_nr)

1 1


In [8]:
class BatchSizeFnCreator():
    def __init__(self):
        self.y_max, self.x_max = 0, 0 # max x/y seq len
        
    def batch_size_fn(self, new, count, sofar):
        self.x_max = max(self.x_max,  len(new.x))
        self.y_max = max(self.y_max,  len(new.y) + 2)
        x_elements = count * self.x_max
        y_elements = count * self.y_max
        return max(x_elements, y_elements)

In [9]:
class MyIterator(data.Iterator):    
    def pool(self):
        for p in data.batch(self.data(), self.batch_size * 100):
            p_batch = data.batch(sorted(p, key=self.sort_key), self.batch_size, self.batch_size_fn)
            for b in self.random_shuffler(list(p_batch)):
                yield b
    
    def create_batches(self):
        if self.train:
            self.batches = self.pool()
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size, self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

In [10]:
B = BatchSizeFnCreator()
iterator = MyIterator(  dataset, batch_size=1500, device=torch.device('cuda'), repeat=False, 
                        sort_key=lambda e: [len(e.x), len(e.y)],
                        batch_size_fn=B.batch_size_fn, train=True, shuffle=True )

In [11]:
len([i for i,b in enumerate(iterator)])

3929

In [12]:
a = en_field.tokenize('I want to go')
b = en_field.numericalize([a])
c = en_field.process([['i','want','to','go']])
print(b,'\n', c, '\n both are the same')

tensor([[ 49],
        [271],
        [  8],
        [308]]) 
 tensor([[ 49],
        [271],
        [  8],
        [308]]) 
 both are the same


In [13]:
en_field.vocab.stoi # str -> int
en_field.vocab.itos[:10] # int -> str

['<unk>', '<pad>', 'the', ',', '.', '#', 'and', 'of', 'to', 'a']

In [14]:
s = 4
~torch.triu(torch.ones(1,s,s), diagonal=1).byte()

tensor([[[1, 0, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 1]]], dtype=torch.uint8)

In [15]:
def create_enc_dec_masks(x, y, x_pad=en_pad_nr, y_pad=de_pad_nr):
    # encoder input mask and decoder input mask
    e_x_mask = (x != x_pad).unsqueeze(-2) # [b,1,s(en)]
    d_x_mask = (y != y_pad).unsqueeze(-2) # [b,1,s(de)]
    s = d_x_mask.size(2)
    nopeak_mask = ~torch.triu(torch.ones(1,s,s), diagonal=1).byte().cuda()
    # [b,s,s] = [b,1,s] & [b,s,s]
    d_x_mask = d_x_mask & nopeak_mask
    # return [b,1,s], [b,s,s]
    return e_x_mask, d_x_mask

In [16]:
for i, batch in enumerate(iterator):
    x = batch.x.transpose(0,1)
    y = batch.y.transpose(0,1)
    e_x_mask, d_x_mask = create_enc_dec_masks(x,y)
    # [b,e_s], [b,d_s], [b,1,e_s], [b,d_s,d_s], where e_s = num of words in encoder sentence, b = batch size
    print(x.shape, y.shape, e_x_mask.shape, d_x_mask.shape)
    if i > 5: break

torch.Size([10, 28]) torch.Size([10, 28]) torch.Size([10, 1, 28]) torch.Size([10, 28, 28])
torch.Size([10, 14]) torch.Size([10, 14]) torch.Size([10, 1, 14]) torch.Size([10, 14, 14])
torch.Size([10, 42]) torch.Size([10, 39]) torch.Size([10, 1, 42]) torch.Size([10, 39, 39])
torch.Size([10, 33]) torch.Size([10, 35]) torch.Size([10, 1, 33]) torch.Size([10, 35, 35])
torch.Size([10, 8]) torch.Size([10, 10]) torch.Size([10, 1, 8]) torch.Size([10, 10, 10])
torch.Size([10, 79]) torch.Size([10, 32]) torch.Size([10, 1, 79]) torch.Size([10, 32, 32])
torch.Size([10, 48]) torch.Size([10, 23]) torch.Size([10, 1, 48]) torch.Size([10, 23, 23])


# Model

In [17]:
'''
    w_en = num of encoder vocabulary words
    w_de = num of decoder vocabulary words
    d / d_model = size of embeddings and num neurons
    d_output = output size of encoder/decoder/other module
    N = num of transformer blocks (multi-head att + res-block + ff + res-block)
    num_heads = num heads in multi-head att layer
    drop = num of neurons to drop (dropout)
    s = sentence length (num of words in sentence) (is padded to match other items in same batch)
    b = batch size
'''

'\n    w_en = num of encoder vocabulary words\n    w_de = num of decoder vocabulary words\n    d / d_model = size of embeddings and num neurons\n    d_output = output size of encoder/decoder/other module\n    N = num of transformer blocks (multi-head att + res-block + ff + res-block)\n    num_heads = num heads in multi-head att layer\n    drop = num of neurons to drop (dropout)\n    s = sentence length (num of words in sentence) (is padded to match other items in same batch)\n    b = batch size\n'

### Embeddings

In [18]:
class Embedding(nn.Module):
    def __init__(self, num_total_words, d_model):
        super().__init__()
        self.e = nn.Embedding(num_total_words, embedding_dim=d_model)
        self.d = d_model
        
    def forward(self, x):
        ''' [b,s] -> [b,s,d], b=batch size, s=sentence len, d=embedding dim'''
        return self.e(x) + np.sqrt(self.d)

In [19]:
def get_positional_embedding_matrix(max_seq_len, d):
    # returns pe matrix w/ shape = [max_seq_len, d] (d=embed size),
    # which encodes (position of word and embedding_number) -> new number
    # in practice you do for each embedding pe[:seq_len,:]
    pe = torch.zeros([max_seq_len, d]) # positional embedding matrix
    for pos in range(max_seq_len):
        for i in range(0, d, 2):
            pe[pos, i  ] = np.sin( pos / 10000**(2*i/d) )
            pe[pos, i+1] = np.cos( pos / 10000**(2*i/d) )
    return pe

In [20]:
from torch.autograd import Variable
class PositionalEmbedding(nn.Module):
    ''' embeds embeddings and their position to new embdedding '''
    def __init__(self, d_model):
        super().__init__()
        self.d = d_model
        self.pe = get_positional_embedding_matrix(d=d_model, max_seq_len=1000) # [max_s,d]  
    
    def forward(self, x):
        ''' [b,s,d] -> [1,s,d] '''
        return Variable(self.pe[:x.size(1),:x.size(2)], requires_grad=False).unsqueeze(0).cuda()

In [21]:
e = Embedding(num_total_words=100000, d_model=512).cuda()
p = PositionalEmbedding(d_model=512)
for i, batch in enumerate(iterator):
    x = batch.x.transpose(0,1)
    z = e(x)
    y = p(z)
    print(x.shape, z.shape, y.shape)
    if i > 3: break

torch.Size([10, 36]) torch.Size([10, 36, 512]) torch.Size([1, 36, 512])
torch.Size([10, 14]) torch.Size([10, 14, 512]) torch.Size([1, 14, 512])
torch.Size([10, 19]) torch.Size([10, 19, 512]) torch.Size([1, 19, 512])
torch.Size([10, 34]) torch.Size([10, 34, 512]) torch.Size([1, 34, 512])
torch.Size([10, 66]) torch.Size([10, 66, 512]) torch.Size([1, 66, 512])


### Main architecture

In [22]:
softmax = torch.nn.functional.log_softmax
class Generator(nn.Module):
    def __init__(self, d_model, num_total_words):
        super().__init__()
        self.W = nn.Linear(d_model, de_total_words) # output/last linear layer
    def forward(self, x):
        return softmax( self.W(x), dim=-1 )

class Transformer(nn.Module):
    def __init__(self, en_total_words, de_total_words, d_model, N, num_heads, drop):
        super().__init__()
        self.encoder = Encoder(en_total_words, d_model, N, num_heads, drop)
        self.decoder = Decoder(de_total_words, d_model, N, num_heads, drop)
        self.generator = Generator(d_model, de_total_words) # output/last linear layer
    
    def forward(self, e_x, d_x, e_x_mask, d_x_mask):
        # e_x  , d_x   = [b, s_e]   , [b, s_d], where s_e = num of words in encoder sentence
        # e_x_m, d_x_m = [b, 1, s_e], [b, s_d, s_d]
        e_y = self.encoder(e_x, e_x_mask)
        d_y = self.decoder(e_y, d_x, e_x_mask, d_x_mask)
        return self.generator(d_y)

### Encoder + Decoder

In [23]:
def repeat(module, n):
    ''' make n copies of module and store it in list '''
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])

In [24]:
class Encoder(nn.Module):
    def __init__(self, num_total_words, d_model, N, num_heads, drop):
        super().__init__()
        self.e = Embedding(num_total_words, d_model)
        self.pe = PositionalEmbedding(d_model)
        self.dropout = nn.Dropout(drop)
        self.multi_head_attention = repeat( MultiHeadAttention(d_model, num_heads, drop), N )
        self.add_and_norm_1 = repeat( AddAndNorm(d_model), N )
        self.add_and_norm_2 = repeat( AddAndNorm(d_model), N )
        self.ff = repeat( FeedForward(d_model), N )
        self.N = N
        
    def forward(self, e_x, e_x_mask):
        ''' all x's are w/ shape [b,s,d] '''
        e = self.e(e_x) # [b,s,d]
        pe = self.pe(e) # [1,s,d]
        x = e + pe # broadcast pe for all batches
        x = self.dropout(x)
        for i in range(self.N):
            x_orig = x
            x = self.multi_head_attention[i](Q=x, K=x, V=x, mask=e_x_mask)
            x = self.add_and_norm_1[i](x, x_orig)
            x_orig = x
            x = self.ff[i](x)
            x = self.add_and_norm_2[i](x, x_orig)
        return x

In [25]:
class Decoder(nn.Module):
    def __init__(self, num_total_words, d_model, N, num_heads, drop):
        super().__init__()
        self.e = Embedding(num_total_words, d_model)
        self.pe = PositionalEmbedding(d_model)
        self.dropout = nn.Dropout(drop)
        self.multi_head_attention_1 = repeat( MultiHeadAttention(d_model, num_heads, drop), N )
        self.multi_head_attention_2 = repeat( MultiHeadAttention(d_model, num_heads, drop), N )
        self.add_and_norm_1 = repeat( AddAndNorm(d_model), N )
        self.add_and_norm_2 = repeat( AddAndNorm(d_model), N )
        self.add_and_norm_3 = repeat( AddAndNorm(d_model), N )
        self.ff = repeat( FeedForward(d_model), N )
        self.N = N
        
    def forward(self, e_y, d_x, e_x_mask, d_x_mask): # encoder_x, decoder_x
        ''' [b,s], [b,s], [b,1,s], [b,s,s] -> [b,s,d] '''
        d_e = self.e(d_x) # [b,s,d] # decoder embedding
        d_pe = self.pe(d_e) # [1,s,d] # decoder positional embed
        d_x = d_e + d_pe # broadcast pe for all batches
        d_x = self.dropout(d_x)
        for i in range(self.N):
            x_orig = d_x
            d_x = self.multi_head_attention_1[i](Q=d_x, K=d_x, V=d_x, mask=d_x_mask)
            d_x = self.add_and_norm_1[i](d_x, x_orig)
            x_orig = d_x
            d_x = self.multi_head_attention_2[i](Q=d_x, K=e_y, V=e_y, mask=e_x_mask)
            d_x = self.add_and_norm_2[i](d_x, x_orig)
            x_orig = d_x
            d_x = self.ff[i](d_x)
            d_x = self.add_and_norm_3[i](d_x, x_orig)
        return d_x

### Layers

In [26]:
class AddAndNorm(nn.Module):
    def __init__(self, d_model, epsilon=1e-6):
        super().__init__()
        self.a = nn.Parameter(torch.ones(d_model))
        self.b = nn.Parameter(torch.zeros(d_model))
        self.epsilon = epsilon
        
    def forward(self, x, x_orig):
        ''' [b,s,d] -> [b,s,d] '''
        mean = x.mean(-1, keepdim=True) # [b,s,1]
        std = x.std(-1, keepdim=True) + self.epsilon # [b,s,1]
        norm = (x - mean)/std # [b,s,d]
        x = self.a * norm + self.b
        return x + x_orig

In [27]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, drop=0.1):
        super().__init__()
        self.W1 = nn.Linear(d_model, d_ff)
        self.drop = nn.Dropout(drop)
        self.W2 = nn.Linear(d_ff, d_model)
        self.relu = nn.functional.relu
        
    def forward(self, x):
        W1, W2, drop, relu = self.W1, self.W2, self.drop, self.relu
        return W2( drop( relu(W1(x)) ) )

In [28]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d, num_heads, drop=0):
        super().__init__()
        self.num_heads = num_heads
        self.drop1 = nn.Dropout(drop)
        self.drop2 = nn.Dropout(drop)
        self.saved_attention = None # saved attention outputs [b,h,s,s], h=num heads
        self.ff = nn.Linear(d,d)
        # linear layers for Q, K, D
        self.WQ = nn.Linear(d, d)
        self.WK = nn.Linear(d, d)
        self.WV = nn.Linear(d, d)
        
    def forward(self, Q, K, V, mask=None):
        ''' Q = K = V = [b,s,d] shape, returns same, mask = [b,s,s] '''
        softmax = nn.functional.softmax
        (b,s,d), h = Q.shape, self.num_heads
        # split each var to h heads. [b,s,d] -> [b,h,s,d/h], where h = num heads
        # all further computation now is split into h subsets automatically
        Q = self.WQ(Q).view(b,-1,h,d//h).transpose(1,2)
        K = self.WK(K).view(b,-1,h,d//h).transpose(1,2)
        V = self.WV(V).view(b,-1,h,d//h).transpose(1,2)
        # [b,h,s,s] = [b,h,s,d/h] @ [b,h,d/h,s] / const
        a = (Q @ K.transpose(-2,-1)) / np.sqrt(d//h)
        # [b,h,s,s] = [b,h,s,s].masked_fill([b,1,s,s])
        a = a if mask is None else a.masked_fill(mask.unsqueeze(1) == 0, -1e9)
        self.saved_attention = softmax(a, dim=-1) # TRY: dim=-2
        a = self.drop1(self.saved_attention)
        # [b,h,s,d/h] = [b,h,s,s] @ [b,h,s,d/h]
        a = a @ V
        # concat all heads and apply linear layer
        a = a.transpose(1,2).contiguous().view(b,s,d)
        return self.drop2( self.ff(a) )

In [29]:
en_total_words, de_total_words = len(en_field.vocab.itos), len(de_field.vocab.itos)
print(en_total_words, de_total_words)
model = Transformer(en_total_words, de_total_words, d_model=512, N=6, num_heads=8, drop=0.1)
model = model.cuda()

for i, batch in enumerate(iterator):
    x = batch.x.transpose(0,1)
    y = batch.y.transpose(0,1)
    e_x = x
    d_x, d_y = y[:,:-1], y[:,1:].contiguous().view(-1)
    e_x_mask, d_x_mask = create_enc_dec_masks(e_x,d_x)
    z = model(e_x, d_x, e_x_mask, d_x_mask)
    # [b,e_s], [b,d_s], [b,1,e_s], [b,d_s,d_s], [b,d_s,total_words], 
    # where e_s = num of words in encoder sentence, b = batch size
    print(e_x.shape, d_x.shape, e_x_mask.shape, d_x_mask.shape, z.shape)
    if i > 1: break

49739 88914
torch.Size([10, 22]) torch.Size([10, 21]) torch.Size([10, 1, 22]) torch.Size([10, 21, 21]) torch.Size([10, 21, 88914])
torch.Size([10, 16]) torch.Size([10, 17]) torch.Size([10, 1, 16]) torch.Size([10, 17, 17]) torch.Size([10, 17, 88914])
torch.Size([10, 33]) torch.Size([10, 39]) torch.Size([10, 1, 33]) torch.Size([10, 39, 39]) torch.Size([10, 39, 88914])


# Training

### Init model

In [30]:
model = Transformer(en_total_words, de_total_words, d_model=512, N=6, num_heads=8, drop=0.1)
model = model.cuda()

In [31]:
for p in model.parameters():
    if p.dim() > 1: # filter to only ff layers
        nn.init.xavier_uniform_(p)

In [32]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

### Train loop

In [33]:
EPOCHS = 1
cross_entropy = torch.nn.functional.cross_entropy # input:[b,c], target:[b], output:[]
model.train() # tell params who needs it to update during training
for epoch in range(EPOCHS):
    total_loss = 0
    for i, b in enumerate(iterator):
        # convert x's y's to correct format for decoder/encoder (e_x, d_x)
        x = batch.x.transpose(0,1)
        y = batch.y.transpose(0,1)
        e_x = x
        d_x, d_y = y[:,:-1], y[:,1:].contiguous().view(-1) # [b,d_s], [b x d_s]
        e_x_mask, d_x_mask = create_enc_dec_masks(e_x,d_x)
        d_y_pred = model(e_x, d_x, e_x_mask, d_x_mask) # [b,d_s,c]
        d_y_pred = d_y_pred.view(-1, d_y_pred.size(-1))
        optimizer.zero_grad()
        loss = cross_entropy(d_y_pred, d_y, ignore_index=de_pad_nr) # need to ignore padding
        # if padding is not ignred, then the loss function will be way lower and will decrease not so fast
        # loss2 = cross_entropy(d_y_pred, d_y)
        loss.backward()
        optimizer.step()
        if i % 100 == 0: print(i, loss.detach().cpu().numpy())

0 15.100875


RuntimeError: CUDA out of memory. Tried to allocate 132.38 MiB (GPU 0; 6.00 GiB total capacity; 4.45 GiB already allocated; 69.39 MiB free; 212.02 MiB cached)

### Testing

In [None]:
en_sentence = ['i','love','dogs']
max_pred_len = 100
e_x = en_field.process([en_sentence]).view(1,len(en_sentence)) # [b,s]
for w in e_x[0]:
    print(en_field.vocab.itos[w], end=' ')

In [None]:
def subsequent_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

In [None]:
# for i, batch in enumerate(iterator):
#     x = batch.x.transpose(0,1)
#     y = batch.y.transpose(0,1)
#     e_x = x[0].unsqueeze(0)
#     break
# en_sentence = [0] * e_x.size(1)
# for d in y[0][1:]: # [b,s], b=1 [1:] to skip <sos> token
#     print(de_field.vocab.itos[d])

In [None]:
model.eval() # turn off dropout
e_x_mask = Variable( torch.ones(1,1,len(en_sentence)) ) # [b,1,s]
e_x, e_x_mask = e_x.cuda(), e_x_mask.cuda()
e_y = model.encoder(e_x, e_x_mask) # [b,s,d] or [1,3,512]
d_x = torch.ones(1,1).fill_(de_field.vocab.stoi['<sos>']).type_as(e_x.data) # [b,s]
for i in range(max_pred_len):
    d_x_mask = Variable(subsequent_mask(d_x.size(1))).type_as(e_x.data) # [b,s,s]
    d_y = model.decoder(e_y, d_x, e_x_mask, d_x_mask)
    prob = model.generator(d_y) # [b,s,w]
#     print(prob.shape)
    j, next_word = torch.max(prob, dim=2) # [b,s]
    word = torch.full([1, 1], next_word[0,-1]).type_as(e_x.data)
    d_x = torch.cat([d_x, word], dim=1) # [b,s] append new word to decoder input
    
for d in d_x[0][1:]: # [b,s], b=1 [1:] to skip <sos> token
    print(de_field.vocab.itos[d])


In [None]:
11.546175 11.546175
10.702625 10.702625
10.049401 10.049401
9.308085 9.308085
8.485468 8.485468
7.564091 7.564091
6.7078533 6.7078533
6.092827 6.092827
5.68647 5.68647
5.336512 5.336512
5.088187 5.088187
4.981215 4.981215
4.9514318 4.9514318
4.95398 4.95398
4.928978 4.928978
4.9025507 4.9025507
4.8787074 4.8787074