In [1]:
import torchtext

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import numpy as np
import spacy
import random

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

###### load tokenizers for english and german

In [41]:
%%capture
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm
!python -m spacy download de

In [5]:
import en_core_web_sm
import de_core_news_sm

spacy_ger = de_core_news_sm.load()
spacy_eng = en_core_web_sm.load()

In [6]:
def tokenizer_ger(german_text):
    return [tok.text for tok in spacy_ger.tokenizer(german_text)]

def tokenizer_eng(english_text):
    return [tok.text for tok in spacy_eng.tokenizer(english_text)]

In [7]:
# German and English Preprocessing
german = Field(tokenize=tokenizer_ger, lower=True,
              init_token='<START>', eos_token='<END>')

english = Field(tokenize=tokenizer_eng, lower=True,
              init_token='<START>', eos_token='<END>')

In [8]:
train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                        fields=(german, english))

In [9]:
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

In [10]:
# LSTM
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop_num):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(drop_num)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=drop_num)
    
    def forward(self, x):
        embedding = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.rnn(embedding)
        
        return hidden, cell

In [11]:
# LSTM
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, drop_num):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(drop_num)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=drop_num)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0) # predict one word at a time
        
        embedding = self.dropout(self.embedding(x))
        
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        
        predictions = self.fc(outputs)
        
        predictions = predictions.squeeze(0)
        
        return predictions, hidden, cell
        

In [12]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)
        
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        
        hidden, cell = self.encoder(source)
        
        # take the start token
        x = target[0]
        
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            
            outputs[t] = output
            
            best_guess = output.argmax(1)
            
            x = target[t] if random.random() < teacher_force_ratio else best_guess
            
        return outputs

In [13]:
batch_size = 64
lr = 0.001

In [14]:
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)

output_size = len(english.vocab)

num_layers=2
encoder_embedding_size = 300
decoder_embedding_size = 300

hidden_size = 1024

drop_rate = 0.5

In [15]:
train_iter, valid_iter, test_iter = BucketIterator.splits((train_data, validation_data, test_data), 
                                                           batch_size = batch_size,
                                                           sort_within_batch = True,
                                                           sort_key = lambda x : len(x.src),
                                                           device=device)

In [16]:
encoder_model = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, drop_rate).to(device)

In [17]:
decoder_model = Decoder(input_size_decoder, decoder_embedding_size, 
                        hidden_size, output_size, num_layers, drop_rate).to(device)

In [18]:
model = Seq2Seq(encoder_model, decoder_model).to(device)

In [19]:
pad_idx = english.vocab.stoi['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx) # did not know this was allowed

### Training Phase

In [20]:
def train(model, num_epochs, optimizer):
    lowest = 0
    model.train()
    for epoch in range(num_epochs):
        cumulative_loss = 0
        for batch_idx, batch in enumerate(train_iter):
            input_data = batch.src.to(device)
            target = batch.trg.to(device)
            
            output = model(input_data, target)
            
            output = output[1:].reshape(-1, output.shape[2]) # do not consider start token
            target = target[1:].reshape(-1)
            
            optimizer.zero_grad()
            
            loss = criterion(output, target)
            
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()
            
            cumulative_loss += loss.item()
        # print(cumulative_loss)
        if(epoch == 0):
          lowest = cumulative_loss
        elif (cumulative_loss < lowest):
          lowest = cumulative_loss
          torch.save(model.state_dict(), 'seqtoseq.pt')
            
            

In [21]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, 20, optimizer)

In [34]:
def translate(model, sentence, german, english, device, max_length=50):
  spacy_ger = de_core_news_sm.load()
  spacy_eng = en_core_web_sm.load()

  if type(sentence) == str:
    tokens = [token.text.lower() for token in spacy_ger(sentence)]
  else:
    tokens = [token.lower() for token in sentence]

  tokens.insert(0, german.init_token)
  tokens.append(german.eos_token)

  text_to_indices = [german.vocab.stoi[token] for token in tokens]

  sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

  with torch.no_grad():
    hidden, cell = model.encoder(sentence_tensor)

  outputs = [english.vocab['<sos>']]

  for _ in range(max_length):
    previous_word = torch.LongTensor([outputs[-1]]).to(device)

    with torch.no_grad():
      output, hidden, cell = model.decoder(previous_word, hidden, cell)
      best_guess = output.argmax(1).item()

    outputs.append(best_guess)

    if output.argmax(1).item() == english.vocab.stoi['<eos>']:
      break


  translated_sentence = [english.vocab.itos[idx] for idx in outputs]

  return translated_sentence[1:]

In [35]:
model.load_state_dict(torch.load('seqtoseq.pt'))

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7855, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=5893, bias=True)
  )
)

In [42]:
model.eval()
sentence = 'ein Boot mit anderen Männern.'
print(translate(model, sentence, german, english, device, max_length=50))

['men', 'with', 'two', 'men', 'men', '.', '<END>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
