In [1]:
# if german spacy not present uncomment this:
#!python3 -m spacy download de

In [2]:
import torch
import numpy  as np
from torch import nn, optim
import time
import torch.nn.functional as F

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
spacy_english = spacy.load('en')
spacy_german = spacy.load('de')

In [3]:
# tokenize languages
def tokenize_german(text):
    return [token.text for token in spacy_german.tokenizer(text)]

# reversing order has been shown to imporve model!!! 
def tokenize_english(text):
    return [token.text for token in spacy_english.tokenizer(text)][::-1] 



In [4]:
SOURCE = Field(tokenize = tokenize_english, 
               init_token = "<sos>",
               eos_token = "<eso>",
               lower = True
              )

TARGET = Field(tokenize = tokenize_german, 
               init_token = "<sos>",
               eos_token = "<eso>",
               lower = True
              )

train_data, valid_data, test_data = Multi30k.splits(exts = ('.en', '.de'),
                                                    fields= (SOURCE, TARGET))




In [5]:
print(train_data.examples[0].src)
print(train_data.examples[0].trg)

['.', 'bushes', 'many', 'near', 'outside', 'are', 'males', 'white', ',', 'young', 'two']
['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']


In [6]:
len(train_data.examples), len(test_data.examples), len(test_data.examples)

(29000, 1000, 1000)

In [7]:
SOURCE.build_vocab(train_data, min_freq = 2)
TARGET.build_vocab(train_data, min_freq = 2)
len(SOURCE.vocab), len(TARGET.vocab)

(5893, 7854)

In [8]:
device =  torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [9]:
batch_size = 32
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data),
                                                                      batch_size = batch_size, 
                                                                      device = device)



In [10]:
# building the encoder

In [31]:
class Encoder(nn.Module):
    def __init__(self, input_dims, emd_dims, hidden_dims, n_layers, dropout):
        super().__init__()
        self.hid_dims = hidden_dims
        self.emd_dims = emd_dims
        self.n_layers = n_layers
        self.dropout = dropout
        self.input_dims = input_dims
        
        self.embeddings = nn.Embedding(input_dims, emd_dims)
        self.rnn = nn.LSTM(emd_dims, hidden_dims, n_layers, dropout = dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embeddings(src))
        outputs, (h, cell) = self.rnn(embedded)
        return h, cell

        
class Decoder(nn.Module):
    def __init__(self, output_dims, emd_dims, hidden_dims, n_layers, dropout):
        super().__init__()
        self.hid_dims = hidden_dims
        self.emd_dims = emd_dims
        self.n_layers = n_layers
        self.dropout = dropout
        self.output_dims = output_dims
        
        self.embeddings = nn.Embedding(output_dims, emd_dims)
        self.rnn = nn.LSTM(emd_dims, hidden_dims, n_layers, dropout = dropout)
        self.fc_out = nn.Linear(hidden_dims, output_dims)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input_data, h, cell):
        input_data = input_data.unsqueeze(0)
        embedded = self.dropout(self.embeddings(input_data))
        outputs, (h, cell) = self.rnn(embedded)
        pred = self.fc_out(outputs.squeeze(0))
        return pred, h, cell


In [38]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_rate=0.5):
        batch_size = trg.shape[1]
        target_length = trg.shape[0]
        target_vocab_size = self.decoder.output_dims
        outputs = torch.zeros(target_length, batch_size, target_vocab_size).to(self.device)
        
        h, cell = self.encoder(src)
        input_tok = trg[0,:]
        for t  in range(1, target_length):
            output, h, cell = self.decoder(input_tok, h, cell)
            outputs[t] = output 
            top = output.argmax(1)
            input_tok = trg[t] if (np.random.random() < teacher_forcing_rate)  else top
        return outputs
        
        

In [39]:
# make an instance ready to be trained:

input_dimension = len(SOURCE.vocab)
output_dimension = len(TARGET.vocab)
encoder_embedding_dimension = 256
decoder_embedding_dimension = 256
hidden_layer_dimension = 512
number_of_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

encod = Encoder(input_dimension, 
                encoder_embedding_dimension, 
                hidden_layer_dimension, 
                number_of_layers, 
                encoder_dropout)

decod = Decoder(output_dimension, 
                decoder_embedding_dimension, 
                hidden_layer_dimension, 
                number_of_layers, 
                decoder_dropout)

model = Seq2Seq(encod, decod, device).to(device)


In [40]:
# training loop
def initialize_weights(m):
    for name, param, in m.named_parameters():
        nn.init.uniform_(param.data, -0.1, 0.1)

model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (embeddings): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embeddings): Embedding(7854, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=7854, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [41]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = TARGET.vocab.stoi[TARGET.pad_token])


In [46]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        optimizer.zero_grad()
        output = model(src, trg)
    
        output_dims = output.shape[-1]
        output = output[1:].view(-1, output_dims)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss/len(iterator)

def evaluate(model, iterator, optimizer, criterion):
    
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg
            # turn off teacher forcing
            output = model(src, trg, 0)

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)
        
    
        

In [None]:
epochs = 10
grad_clip = 1
lowest_validation_loss = float('inf')

for epoch in range(epochs):
    start_time = time.time()
    
    training_loss = train(model, train_iterator, optimizer, criterion, grad_clip)
    validation_loss = evaluare(model, valid_iterator, optimizer, criterion)
    end_time = time.time()
    
    if validation_loss <  lowest_validation_loss:
        lowest_validation_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq.pt')
    
    print(f"Epoch {epoch}{epochs}, time : {np.round(end_time-start_time)}s")
    print(f"Train loss: {train_loss.item()}")
    print(f"VAlid loss: {valid_loss.item()}")
    
