In [1]:
import os
import sys
sys.path.append('..')

In [2]:
import import_ipynb
from utils.dataset_loader import CreateDataset
from utils.training import Learning

importing Jupyter notebook from ..\utils\dataset_loader.ipynb
importing Jupyter notebook from ..\utils\training.ipynb


In [40]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

import random
import numpy as np

from tqdm import tqdm

In [4]:
### cpu, gpu 선택
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### 불용어 사용 여부
use_stopword = True

### batch_size
batch_size = 32

In [5]:
### 미리 만들어둔 데이터셋을 가져옴
dataset = CreateDataset(device=device, use_stopword=use_stopword)

### 데이터셋에서 iterator만 뽑아냄
train_iterator, valid_iterator, test_iterator = dataset.get_iterator(batch_size=batch_size)

downloading training.tar.gz


.data\multi30k\training.tar.gz: 100%|██████████████████████████████████████████████| 1.21M/1.21M [00:02<00:00, 469kB/s]


downloading validation.tar.gz


.data\multi30k\validation.tar.gz: 100%|███████████████████████████████████████████| 46.3k/46.3k [00:00<00:00, 77.2kB/s]


downloading mmt_task1_test2016.tar.gz


.data\multi30k\mmt_task1_test2016.tar.gz: 100%|████████████████████████████████████| 66.2k/66.2k [00:00<00:00, 116kB/s]


In [106]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, num_layers=n_layers, dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hid_dim*2, dec_hid_dim)
        
    def forward(self, x):
        # x = [src_len, batch_size]
        
        embedded = self.dropout(self.embedding(x))
        # embedded = [src_len, batch_size, emb_dim]
        
        outputs, hidden = self.rnn(embedded)
        # outputs = [src_len, batch_size, hid_dim*2]
        # hidden = [num_layers*directional, batch_size, hid_dim]
        
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        hidden = self.fc(hidden)
        return outputs, hidden

In [107]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.energy = nn.Linear(enc_hid_dim*2 + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)
        
    def forward(self, encoder_outputs, hidden):
        # encoder_outputs = [src_len, batch_size, enc_hid_dim*2]
        # hidden = [batch_size, dec_hid_dim]
        
        src_len = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # hidden = [batch_size, src_len, dec_hid_dim]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch_size, src_len, enc_hid_dim*2]
        
        temp = torch.cat((encoder_outputs, hidden), dim=2)
        
        energy = torch.tanh(self.energy(temp))
        # energy = [batch_size, src_len, dec_hid_dim]
        
        v = self.v(energy).squeeze(2)
        # v = [batch_size, src_len, 1]
        
        return F.softmax(v, dim=1)

In [108]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, dec_hid_dim, n_layers, dropout, atte):
        super().__init__()
        self.output_dim = output_dim
        self.atte = atte
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim+enc_hid_dim*2, dec_hid_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(dec_hid_dim+enc_hid_dim*2+emb_dim, output_dim)
        
    def forward(self, trg, encoder_outputs, hidden):
        # trg = [batch_size]
        # encoder_outputs = [batch_size, src_len, enc_hid_dim*2]
        # hidden = [batch_size, dec_hid_dim]
        
        trg = trg.unsqueeze(0)
        # trg = [1, batch_size]
        
        embedded = self.dropout(self.embedding(trg))
        # embedded = [1, batch_size, emb_dim]
        
        a = self.atte(encoder_outputs, hidden)
        # a = [batch_size, src_len]
        
        a = a.unsqueeze(1)
        # a = [batch_size, 1, src_len]
        
        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch_size, 1, enc_hid_dim*2]
        
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch_size, enc_hid_dim*2]
        
        rnn_output, hidden = self.rnn(torch.cat((embedded, weighted), axis=2), hidden.unsqueeze(0))
        # output = [1, batch_size, dec_hid_dim]
        # hidden = [1, batch_size, dec_hid_dim]
        
        output = self.fc_out(torch.cat(embedded.squeeze(0), weighted.squeeze(0), rnn_output.squeeze(0)), dim=1)
        
        return output, hidden.squeeze(0)

In [109]:
class Seq2Seq(nn.Module):
    def __init__(self, enc, dec, device):
        super().__init__()
        self.enc = enc
        self.dec = dec
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        encoder_outputs, hidden = self.enc(src)
        
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        output_dim = self.dec.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, output_dim).to(self.device)
        
        dec_input = trg[0]
        
        for t in range(1, trg_len):
            output, hidden = self.dec(dec_input, encoder_outputs, hidden)
            
            outputs[t] = output
            
            top1 = torch.argmax(output, dim=1)
            
            dec_input = top1 if random.random() > teacher_forcing_ratio else trg[t]
            
        return outputs

In [110]:
input_dim = len(dataset.SRC.vocab)
output_dim = len(dataset.TRG.vocab)
emb_dim = 256
enc_hid_dim = 512
dec_hid_dim = 512
n_layers = 1
dropout = 0.1
clip = 1

In [111]:
enc = Encoder(input_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout).to(device)
att = Attention(enc_hid_dim, dec_hid_dim)
dec = Decoder(output_dim, emb_dim, dec_hid_dim, n_layers, dropout, att).to(device)

model = Seq2Seq(enc, dec, device).to(device)
epochs = 10

In [112]:
pad_index = dataset.TRG.vocab.stoi[dataset.TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [113]:
model

Seq2Seq(
  (enc): Encoder(
    (embedding): Embedding(7854, 256)
    (rnn): GRU(256, 512, dropout=0.1, bidirectional=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (dec): Decoder(
    (atte): Attention(
      (energy): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5893, 256)
    (rnn): GRU(1280, 512, dropout=0.1)
    (dropout): Dropout(p=0.1, inplace=False)
    (fc_out): Linear(in_features=1792, out_features=5893, bias=True)
  )
)

In [114]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,518,661 trainable parameters


In [115]:
learn = Learning()

for epoch in range(epochs):
    model, train_loss = learn.train(model, criterion, optimizer, train_iterator, clip)
    eval_loss = learn.evaluation(model, criterion, valid_iterator)
    print(train_loss, eval_loss)

  0%|                                                                                          | 0/907 [00:00<?, ?it/s]


RuntimeError: invalid argument 7: equal number of batches expected at C:/w/b/windows/pytorch/aten/src\THC/generic/THCTensorMathBlas.cu:75