In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import spacy
import numpy as np

import random
import math
import time

In [2]:
# Fix every random number generator the notebook relies on so that a
# Restart & Run All reproduces the same numbers.
SEED = 1234
random.seed(SEED)        # Python's RNG (drives the teacher-forcing coin flip in Seq2Seq.forward)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed every visible GPU rather than only the current one; this call is
# silently ignored when CUDA is not available.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its autotuner, which
# may otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# spaCy-backed tokenizers, one per language of the translation pair.
# The two calls are independent of each other.
tokenizer_de = get_tokenizer(tokenizer='spacy', language='de_core_news_sm')
tokenizer_en = get_tokenizer(tokenizer='spacy', language='en_core_web_sm')

In [4]:

def yield_tokens_en(data_iter):
    """Lazily tokenize the English side of every sample in `data_iter`.

    Each sample is indexed positionally: element 1 is taken as the English
    sentence. Trailing newlines are stripped before the sentence is handed
    to `tokenizer_en`. One token sequence is yielded per sample.
    """
    yield from (tokenizer_en(sample[1].rstrip('\n')) for sample in data_iter)
        
def yield_tokens_de(data_iter):
    """Lazily tokenize the German side of every sample in `data_iter`.

    Each sample is indexed positionally: element 0 is taken as the German
    sentence. Trailing newlines are stripped before the sentence is handed
    to `tokenizer_de`. One token sequence is yielded per sample.
    """
    yield from (tokenizer_de(sample[0].rstrip('\n')) for sample in data_iter)
        
    

In [1]:
# Build both vocabularies from the training split only.
#
# NOTE: the object returned by Multi30k may be a single-pass iterator (the
# note further down in this notebook reports that it can only be walked
# once). A fresh one is therefore created for each pass; re-using an
# exhausted iterator would leave the second vocabulary holding nothing but
# the special tokens.
train = Multi30k('./.data', split='train', language_pair=('de','en'))
en_vocab = build_vocab_from_iterator(yield_tokens_en(train), min_freq=2, specials=['<unk>', '<pad>', '<sos>', '<eos>'])

train = Multi30k('./.data', split='train', language_pair=('de','en'))
de_vocab = build_vocab_from_iterator(yield_tokens_de(train), min_freq=2, specials=['<unk>', '<pad>', '<sos>', '<eos>'])

NameError: name 'Multi30k' is not defined

In [6]:
# Out-of-vocabulary tokens fall back to the index of '<unk>' in each vocabulary.
for _vocab in (en_vocab, de_vocab):
    _vocab.set_default_index(_vocab['<unk>'])

In [7]:
# Vocabulary sizes: English on the first line, German on the second.
print(len(en_vocab), len(de_vocab), sep='\n')

6191
8014


In [8]:
# Indices of the special tokens, read from the English vocabulary.
PAD_IDX = en_vocab['<pad>']
SOS_IDX = en_vocab['<sos>']
EOS_IDX = en_vocab['<eos>']
UNK_IDX = en_vocab['<unk>']

# collate_batch applies these same indices to the German tensors as well
# (start/end markers and padding), so both vocabularies must agree on every
# special token. Fail here rather than train on silently mislabelled data.
assert all(
    de_vocab[tok] == en_vocab[tok] for tok in ('<unk>', '<pad>', '<sos>', '<eos>')
), "special-token indices differ between en_vocab and de_vocab"

print(UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX)

0 1 2 3


In [9]:
def en_text_pipeline(x):
    """Tokenize the English string `x` and pass the tokens to `en_vocab`."""
    return en_vocab(tokenizer_en(x))


def de_text_pipeline(x):
    """Tokenize the German string `x` and pass the tokens to `de_vocab`."""
    return de_vocab(tokenizer_de(x))

In [10]:
def collate_batch(batch):
    """Turn a list of (German, English) text pairs into two padded index tensors.

    For every pair, element 0 goes through `de_text_pipeline` and element 1
    through `en_text_pipeline`. Each resulting index sequence is wrapped in
    SOS_IDX ... EOS_IDX, and the sequences of one language are then padded
    with PAD_IDX by `pad_sequence` (called without `batch_first`).

    Returns:
        (de_tensor, en_tensor): the padded German and English batches.
    """
    sos_marker = torch.tensor([SOS_IDX])
    eos_marker = torch.tensor([EOS_IDX])

    def _wrap(token_ids):
        # int64 index tensor framed by the start / end markers
        ids = torch.tensor(token_ids, dtype=torch.int64)
        return torch.cat([sos_marker, ids, eos_marker], dim=0)

    src_seqs, trg_seqs = [], []
    for pair in batch:
        src_seqs.append(_wrap(de_text_pipeline(pair[0])))
        trg_seqs.append(_wrap(en_text_pipeline(pair[1])))

    src_padded = pad_sequence(src_seqs, padding_value=PAD_IDX)
    trg_padded = pad_sequence(trg_seqs, padding_value=PAD_IDX)
    return src_padded, trg_padded


In [11]:
BATCH_SIZE = 64

# All three splits of the German -> English dataset.
train, valid, test = Multi30k('./.data', split=('train', 'valid', 'test'), language_pair=('de','en'))

# Both loaders share the same settings; samples are batched in dataset order
# and turned into padded tensors by collate_batch.
_loader_options = dict(batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
train_loader = DataLoader(train, **_loader_options)
valid_loader = DataLoader(valid, **_loader_options)
#test_loader = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

(참고) DataLoader 를 한 번만 순회(iterate)할 수 있는 현상이 있는 것 같음. 각 batch 의 sequence length 가 서로 달라서 한 번 순회하고 나면 batch 가 유지되지 않고 사라지는 것일 수도 있고, torchtext 0.11 의 bug 일 수도 있음. 일단 이대로 두고, PyTorch version 이 바뀌면 다시 테스트해 볼 것. 다른 seq2seq code 들도 training loop 안에서 DataLoader 를 새로 생성하고 있음.

In [12]:
# Disabled experiment, kept as a bare string literal so that none of it runs:
# it would walk `train_loader` once per epoch and print the shape of every
# batch (presumably to check whether the loader can be iterated repeatedly).
# Being the last expression of the cell, the string itself is echoed as output.
'''
import time 

EPOCH = 10

for i in range(1, EPOCH+1):
    print("starting epoch",i)
    for (src, dst) in train_loader:
        print(src.shape, dst.shape)
    time.sleep(0.5)
'''


'\nimport time \n\nEPOCH = 10\n\nfor i in range(1, EPOCH+1):\n    print("starting epoch",i)\n    for (src, dst) in train_loader:\n        print(src.shape, dst.shape)\n    time.sleep(0.5)\n'

In [13]:

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [14]:

class Encoder(nn.Module):
    """Multi-layer LSTM encoder that condenses a source batch into its final state.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: width of each embedding vector.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: probability used both on the embeddings and between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        # batch_first is left at its default, so the LSTM is fed [src_len, batch, emb_dim].
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode `src` (token indices laid out as [src_len, batch]).

        Returns:
            (hidden, cell): the final LSTM states, each shaped
            [n_layers, batch, hid_dim]. The per-step outputs of the top layer
            are discarded because nothing downstream uses attention.
        """
        token_vectors = self.dropout(self.embedding(src))   # [src_len, batch, emb_dim]
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return hidden, cell
        
        

In [15]:
class Decoder(nn.Module):
    """Multi-layer LSTM decoder that predicts one target token per call.

    Args:
        output_dim: target vocabulary size (embedding rows and width of the
            prediction vector).
        emb_dim: width of each embedding vector.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: probability used both on the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by a single time step.

        Args:
            input: token indices for the current step, shaped [batch].
            hidden, cell: LSTM states, each [n_layers, batch, hid_dim].

        Returns:
            (prediction, hidden, cell) where prediction is [batch, output_dim]
            and the states are the updated LSTM states.
        """
        # The LSTM expects a leading time axis; one step means length 1.
        step_tokens = input.unsqueeze(dim=0)                        # [1, batch]
        step_vectors = self.dropout(self.embedding(step_tokens))   # [1, batch, emb_dim]

        rnn_out, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))

        # Drop the length-1 time axis again before projecting onto the vocabulary.
        prediction = self.fc_out(rnn_out.squeeze(dim=0))            # [batch, output_dim]
        return prediction, hidden, cell


In [16]:
class Seq2Seq(nn.Module):
    """Couples an encoder and a decoder into one translation model.

    Args:
        encoder: module called as ``encoder(src)`` returning ``(hidden, cell)``.
        decoder: module called as ``decoder(input, hidden, cell)`` returning
            ``(prediction, hidden, cell)``; must expose ``output_dim``.
        device: device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg` step by step, conditioned on the encoding of `src`.

        Args:
            src: source batch, [src_len, batch].
            trg: target batch, [trg_len, batch]; row 0 is fed as the first
                decoder input.
            teacher_forcing_ratio: at each step the ground-truth token is fed
                next when ``random.random()`` falls below this value,
                otherwise the decoder's own highest-scoring token is used.

        Returns:
            Tensor [trg_len, batch, decoder.output_dim]. Row 0 is never
            written and stays all zeros.
        """
        n_steps = trg.shape[0]
        n_samples = trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(n_steps, n_samples, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        step_input = trg[0, :]

        for t in range(1, n_steps):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = scores

            # One coin flip per time step, shared by the whole batch.
            use_ground_truth = random.random() < teacher_forcing_ratio
            best_guess = scores.argmax(1)

            if use_ground_truth:
                step_input = trg[t]
            else:
                step_input = best_guess

        return outputs


In [17]:

# The model reads German and writes English, so the vocabulary sizes fix
# the encoder input width and the decoder output width.
INPUT_DIM = len(de_vocab)
OUTPUT_DIM = len(en_vocab)

# Encoder and decoder use the same embedding width and dropout rate, and
# share the hidden size and depth so the encoder state can seed the decoder.
ENC_EMB_DIM = DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = DEC_DROPOUT = 0.5

enc = Encoder(input_dim=INPUT_DIM, emb_dim=ENC_EMB_DIM, hid_dim=HID_DIM,
              n_layers=N_LAYERS, dropout=ENC_DROPOUT)
dec = Decoder(output_dim=OUTPUT_DIM, emb_dim=DEC_EMB_DIM, hid_dim=HID_DIM,
              n_layers=N_LAYERS, dropout=DEC_DROPOUT)
model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

In [18]:
def init_weights(m):
    """Overwrite every parameter reachable from `m` with values drawn from U(-0.08, 0.08).

    The walk is recursive (parameters of sub-modules are included), so when
    this function is handed to ``Module.apply`` a nested parameter is redrawn
    once for each enclosing module; the resulting distribution is unchanged.
    """
    for weight in m.parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
# Module.apply runs init_weights on every sub-module and finally on the model
# itself; it returns the model, whose repr is echoed as the cell output.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(8014, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6191, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=6191, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [19]:
def count_parameters(model):
    """Return the total number of elements held by the trainable parameters of `model`.

    Parameters with ``requires_grad`` set to False are left out of the count.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Echo the number of trainable parameters of the assembled model.
count_parameters(model)

14168879

In [20]:
# Positions whose target is the padding index are excluded from the loss.
PAD_IDX = en_vocab['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Adam over all model parameters, with the library's default settings.
optimizer = optim.Adam(params=model.parameters())

In [21]:

def train( model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: called as ``model(src, trg)``; its result is indexed as
            [trg_len, batch, output_dim].
        iterator: iterable of batches; element 0 of each batch is the source
            tensor and element 1 the target tensor ([trg_len, batch]).
        optimizer: optimizer stepping the parameters of `model`.
        criterion: loss called as ``criterion(output, trg)`` on the flattened
            predictions and targets.
        clip: maximum gradient norm handed to ``clip_grad_norm_``.

    Returns:
        The sum of the per-batch losses divided by the number of batches
        that were actually processed.

    Raises:
        ValueError: if `iterator` yields no batch at all (for example an
            already exhausted single-pass loader). Dividing by
            ``len(iterator)`` would report a misleading loss of 0 there.

    Batches are moved to the notebook-level `device`.
    """
    model.train()
    epoch_loss = 0.0
    n_batches = 0

    for batch in iterator:
        src = batch[0].to(device)
        trg = batch[1].to(device)

        optimizer.zero_grad()

        output = model(src, trg)
        output_dim = output.shape[-1]

        # Step 0 only carries the start marker and is never predicted, so it
        # is dropped from both sides before flattening. reshape (rather than
        # view) also copes with non-contiguous inputs.
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    # Count the batches instead of trusting len(iterator): for iterable-style
    # datasets that length is only an estimate, and plain generators have none.
    if n_batches == 0:
        raise ValueError("train() received an iterator that produced no batches")
    return epoch_loss / n_batches
        
   

In [22]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss of `model` over `iterator` without updating it.

    Args:
        model: called as ``model(src, trg, 0)``, i.e. with teacher forcing
            switched off; its result is indexed as [trg_len, batch, output_dim].
        iterator: iterable of batches; element 0 of each batch is the source
            tensor and element 1 the target tensor ([trg_len, batch]).
        criterion: loss called as ``criterion(output, trg)`` on the flattened
            predictions and targets.

    Returns:
        The sum of the per-batch losses divided by the number of batches
        that were actually processed.

    Raises:
        ValueError: if `iterator` yields no batch at all (for example an
            already exhausted single-pass loader). Dividing by
            ``len(iterator)`` would report a misleading loss of 0 there.

    Batches are moved to the notebook-level `device`.
    """
    model.eval()
    epoch_loss = 0.0
    n_batches = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch[0].to(device)
            trg = batch[1].to(device)

            output = model(src, trg, 0) #turn off teacher forcing
            output_dim = output.shape[-1]

            # Step 0 only carries the start marker and is never predicted, so
            # it is dropped from both sides before flattening:
            #   output -> [(trg_len - 1) * batch, output_dim]
            #   trg    -> [(trg_len - 1) * batch]
            output = output[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)

            epoch_loss += criterion(output, trg).item()
            n_batches += 1

    # Count the batches instead of trusting len(iterator): for iterable-style
    # datasets that length is only an estimate, and plain generators have none.
    if n_batches == 0:
        raise ValueError("evaluate() received an iterator that produced no batches")
    return epoch_loss / n_batches

In [23]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        (minutes, seconds) as integers; both parts are truncated, so
        fractions of a second are dropped.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [24]:
N_EPOCHS = 20
CLIP = 1   # maximum gradient norm handed to train()

# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # Datasets and loaders are rebuilt at the start of every epoch, so each
    # epoch walks a fresh iterator over the data.
    train_iter = Multi30k('./.data', split='train', language_pair=('de','en'))
    valid_iter = Multi30k('./.data', split='valid', language_pair=('de','en'))
    train_loader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
    valid_loader = DataLoader(valid_iter, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the best epoch on the validation split.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 41s
	Train Loss: 4.648 | Train PPL: 104.410
	 Val. Loss: 4.620 |  Val. PPL: 101.533
Epoch: 02 | Time: 0m 41s
	Train Loss: 3.956 | Train PPL:  52.250
	 Val. Loss: 4.431 |  Val. PPL:  84.029
Epoch: 03 | Time: 0m 42s
	Train Loss: 3.694 | Train PPL:  40.188
	 Val. Loss: 4.201 |  Val. PPL:  66.747
Epoch: 04 | Time: 0m 41s
	Train Loss: 3.510 | Train PPL:  33.437
	 Val. Loss: 4.134 |  Val. PPL:  62.438
Epoch: 05 | Time: 0m 41s
	Train Loss: 3.317 | Train PPL:  27.585
	 Val. Loss: 4.061 |  Val. PPL:  58.039
Epoch: 06 | Time: 0m 41s
	Train Loss: 3.161 | Train PPL:  23.590
	 Val. Loss: 3.959 |  Val. PPL:  52.431
Epoch: 07 | Time: 0m 41s
	Train Loss: 2.996 | Train PPL:  20.009
	 Val. Loss: 3.871 |  Val. PPL:  47.992
Epoch: 08 | Time: 0m 42s
	Train Loss: 2.862 | Train PPL:  17.491
	 Val. Loss: 3.831 |  Val. PPL:  46.111
Epoch: 09 | Time: 0m 42s
	Train Loss: 2.729 | Train PPL:  15.321
	 Val. Loss: 3.756 |  Val. PPL:  42.767
Epoch: 10 | Time: 0m 42s
	Train Loss: 2.628 | Train PPL

In [25]:
# Restore the weights of the best epoch (lowest validation loss) saved by
# the training loop. map_location remaps the stored tensors onto the device
# in use now, so a checkpoint written on a GPU also loads on a CPU-only
# machine instead of failing while deserialising CUDA tensors.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

# A fresh test iterator and loader, built the same way as in the training loop.
test_iter = Multi30k('./.data', split='test', language_pair=('de','en'))
test_loader = DataLoader(test_iter, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

test_loss = evaluate(model, test_loader, criterion)

# Perplexity is reported as exp(loss).
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.737 | Test PPL:  41.986 |
