In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time


In [2]:
# Seed every random number generator the notebook relies on so that a fresh
# "Restart & Run All" starts from the same state each time.
SEED = 1234
random.seed(SEED)           # Python RNG (used for the teacher-forcing coin flip)
np.random.seed(SEED)        # NumPy RNG
torch.manual_seed(SEED)     # PyTorch RNG
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

In [3]:
# Load the German and English spaCy pipelines; only their tokenizers are used
# by tokenize_de / tokenize_en. Both models must already be installed.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [4]:

def tokenize_de(text):
    """Tokenize German text with spaCy and return the token strings reversed.

    Args:
        text: the raw German sentence.

    Returns:
        A new list of token strings, last token first.
    """
    tokens = []
    for tok in spacy_de.tokenizer(text):
        tokens.append(tok.text)
    # The source sentence is deliberately fed to the model back to front.
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """Tokenize English text with spaCy and return the token strings in order.

    Args:
        text: the raw English sentence.

    Returns:
        A list of token strings in their original order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [5]:

# Field objects describe how raw text becomes token sequences: which tokenizer
# to use, the start/end markers wrapped around every sentence, and lowercasing.
# SRC is German (tokens reversed by tokenize_de); TRG is English.
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

In [6]:
# Load the Multi30k train/validation/test splits. The '.de' side is processed
# with SRC and the '.en' side with TRG (order of `exts` matches `fields`).
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields= (SRC, TRG))

In [7]:
# Sanity check on the first training example: the German 'src' tokens should
# appear in reverse order, the English 'trg' tokens in natural order.
vars(train_data[0])

{'src': ['.',
  'büsche',
  'vieler',
  'nähe',
  'der',
  'in',
  'freien',
  'im',
  'sind',
  'männer',
  'weiße',
  'junge',
  'zwei'],
 'trg': ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.']}

In [8]:
# Vocabularies are built from the training split only, so no information from
# the validation/test data leaks into the model.
# NOTE(review): min_freq=2 presumably maps tokens seen fewer than twice to
# <unk> - confirm against the torchtext Vocab documentation.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [9]:
# Vocabulary sizes: source (German) first, then target (English).
print(len(SRC.vocab))
print(len(TRG.vocab))

7853
5893


In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:

# Number of sentence pairs per batch, shared by all three splits.
BATCH_SIZE = 128

# One iterator per split; batches are created directly on `device`.
# NOTE(review): per the torchtext docs, BucketIterator groups examples of
# similar length to reduce padding - confirm for the installed version.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device )

In [12]:

class Encoder(nn.Module):
    """Stacked-LSTM encoder that summarises a source batch in its final states.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: width of each token embedding.
        hid_dim: width of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to
            the LSTM for use between its layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Stored as plain attributes so the configuration can be inspected later.
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source token ids.

        Args:
            src: token ids laid out as [src_len, batch_size]; the LSTM is built
                without batch_first, so the sequence axis comes first.

        Returns:
            (hidden, cell): the final LSTM states, each
            [n_layers, batch_size, hid_dim].
        """
        # [src_len, batch_size] -> [src_len, batch_size, emb_dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)

        # The per-step outputs of the top layer are discarded: without
        # attention the decoder only consumes the final states.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state

        return hidden, cell
        
        

In [13]:
class Decoder(nn.Module):
    """Stacked-LSTM decoder that predicts one target token per call.

    Args:
        output_dim: target vocabulary size (embedding rows and logits width).
        emb_dim: width of each token embedding.
        hid_dim: width of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to
            the LSTM for use between its layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: token ids for the current step, shape [batch_size].
            hidden: previous hidden state, [n_layers, batch_size, hid_dim].
            cell: previous cell state, [n_layers, batch_size, hid_dim].

        Returns:
            (prediction, hidden, cell): unnormalised scores over the target
            vocabulary, [batch_size, output_dim], followed by the updated
            LSTM states (same shapes as the inputs).
        """
        # Add a sequence axis of length 1: [batch_size] -> [1, batch_size].
        step_tokens = input.unsqueeze(0)

        # [1, batch_size] -> [1, batch_size, emb_dim]
        step_embedded = self.dropout(self.embedding(step_tokens))

        # seq_len is 1 because exactly one token is decoded per call, and the
        # LSTM is unidirectional (bidirectional is never enabled), so:
        #   rnn_out = [1, batch_size, hid_dim]
        #   states  = [n_layers, batch_size, hid_dim]
        rnn_out, (next_hidden, next_cell) = self.rnn(step_embedded, (hidden, cell))

        # Drop the length-1 sequence axis before projecting to the vocabulary.
        logits = self.fc_out(rnn_out.squeeze(0))

        return logits, next_hidden, next_cell


In [14]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module whose forward(src) returns (hidden, cell).
        decoder: module exposing `output_dim` and whose
            forward(input, hidden, cell) returns (prediction, hidden, cell).
        device: device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce per-step vocabulary scores for every target position.

        Args:
            src: source token ids, [src_len, batch_size].
            trg: target token ids, [trg_len, batch_size]; row 0 is fed to the
                decoder as the first input.
            teacher_forcing_ratio: probability, drawn independently per step,
                of feeding the ground-truth token instead of the model's own
                prediction as the next decoder input.

        Returns:
            Tensor [trg_len, batch_size, output_dim]. Row 0 is never written
            and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer for the decoder scores at every step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # The encoder's final states seed the decoder.
        hidden, cell = self.encoder(src)

        step_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores

            # Flip the coin once per step: ground truth or own best guess.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)

        return outputs


In [15]:

# Model hyperparameters. The vocabulary sizes are taken from the fields so the
# embedding tables and the output layer match the data.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# HID_DIM and N_LAYERS are shared: the encoder's final (hidden, cell) states
# are passed straight into the decoder LSTM, so their shapes must agree.
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Build the two halves, then move the combined model onto `device`.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

In [16]:
def init_weights(m):
    """Overwrite every parameter of `m` (submodules included) with U(-0.08, 0.08).

    Args:
        m: the module whose parameters are re-drawn in place.

    NOTE(review): when passed to nn.Module.apply, which itself visits every
    submodule, each tensor is re-drawn once per enclosing module. The result
    is still uniform in the same range, so this is redundant rather than wrong.
    """
    low, high = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)
        
# Re-initialise the whole model; apply() returns the module, so the cell
# output below is the model's architecture summary.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [17]:
def count_parameters(model):
    """Return the total number of trainable scalar values in `model`.

    Parameters with requires_grad set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [18]:
# Number of trainable parameters in the full encoder-decoder model.
count_parameters(model)

13898501

In [19]:
# Adam with its default settings.
optimizer = optim.Adam(model.parameters())
# Index of the padding token in the target vocabulary; positions holding it
# are excluded from the loss via ignore_index.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [20]:

def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: callable as model(src, trg), returning scores shaped
            [trg_len, batch_size, output_dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (scores [N, output_dim], targets [N]).
        clip: maximum gradient norm enforced before each optimizer step.

    Returns:
        The sum of per-batch losses divided by len(iterator).
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        source = batch.src
        target = batch.trg

        optimizer.zero_grad()

        # scores = [trg_len, batch_size, output_dim]
        scores = model(source, target)
        vocab_size = scores.shape[-1]

        # Drop the first time step from both tensors, then flatten the time
        # and batch axes so the criterion sees one row per predicted token.
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        loss = criterion(flat_scores, flat_target)
        loss.backward()

        # Rescale gradients whose overall norm exceeds `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)
        
   

In [21]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over `iterator` without updating the model.

    Args:
        model: callable as model(src, trg, teacher_forcing_ratio), returning
            scores shaped [trg_len, batch_size, output_dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        criterion: loss taking (scores [N, output_dim], targets [N]).

    Returns:
        The sum of per-batch losses divided by len(iterator).
    """
    model.eval()
    running_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            source = batch.src
            target = batch.trg

            # A ratio of 0 turns teacher forcing off: the model always feeds
            # back its own predictions.
            scores = model(source, target, 0)
            vocab_size = scores.shape[-1]

            # Drop the first time step, then flatten to
            # scores [(trg_len - 1) * batch_size, output_dim] and
            # targets [(trg_len - 1) * batch_size].
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [22]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: starting timestamp in seconds.
        end_time: ending timestamp in seconds.

    Returns:
        (minutes, seconds), both truncated to integers.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [23]:
# Number of passes over the training data and the gradient-norm ceiling
# handed to train().
N_EPOCHS = 10
CLIP = 1

# Lowest validation loss seen so far; starts at infinity so the first epoch
# always produces a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    # The reported time covers both the training and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best epoch (judged on validation loss).
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 27s
	Train Loss: 5.054 | Train PPL: 156.617
	 Val. Loss: 5.030 |  Val. PPL: 152.948
Epoch: 02 | Time: 0m 27s
	Train Loss: 4.473 | Train PPL:  87.641
	 Val. Loss: 4.781 |  Val. PPL: 119.239
Epoch: 03 | Time: 0m 27s
	Train Loss: 4.182 | Train PPL:  65.494
	 Val. Loss: 4.583 |  Val. PPL:  97.799
Epoch: 04 | Time: 0m 27s
	Train Loss: 3.962 | Train PPL:  52.542
	 Val. Loss: 4.432 |  Val. PPL:  84.135
Epoch: 05 | Time: 0m 27s
	Train Loss: 3.782 | Train PPL:  43.915
	 Val. Loss: 4.354 |  Val. PPL:  77.764
Epoch: 06 | Time: 0m 28s
	Train Loss: 3.613 | Train PPL:  37.079
	 Val. Loss: 4.278 |  Val. PPL:  72.064
Epoch: 07 | Time: 0m 27s
	Train Loss: 3.471 | Train PPL:  32.182
	 Val. Loss: 4.103 |  Val. PPL:  60.516
Epoch: 08 | Time: 0m 27s
	Train Loss: 3.334 | Train PPL:  28.051
	 Val. Loss: 4.016 |  Val. PPL:  55.494
Epoch: 09 | Time: 0m 28s
	Train Loss: 3.208 | Train PPL:  24.722
	 Val. Loss: 3.952 |  Val. PPL:  52.064
Epoch: 10 | Time: 0m 28s
	Train Loss: 3.117 | Train PPL

In [None]:
# Restore the checkpoint with the lowest validation loss before scoring the
# test split. map_location remaps the stored tensors onto the current device,
# so a checkpoint written during a GPU run still loads on a CPU-only machine
# (without it, torch.load tries to restore tensors to the device they were
# saved from).
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

# Teacher forcing is disabled inside evaluate().
test_loss = evaluate(model, test_iterator, criterion)

# Perplexity is reported as exp(loss).
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')