In [1]:
import os
import sys
sys.path.append('..')

In [2]:
import import_ipynb
from utils.dataset_loader import CreateCNNDataset
from utils.training import CNNLearning

importing Jupyter notebook from ..\utils\dataset_loader.ipynb
importing Jupyter notebook from ..\utils\training.ipynb


In [3]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

import random
import numpy as np

from tqdm import tqdm

In [4]:
### cpu, gpu 선택
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### 불용어 사용 여부
use_stopword = True

### batch_size
batch_size = 32

In [5]:
### 미리 만들어둔 데이터셋을 가져옴
dataset = CreateCNNDataset(device=device, use_stopword=use_stopword)

### 데이터셋에서 iterator만 뽑아냄
train_iterator, valid_iterator, test_iterator = dataset.get_iterator(batch_size=batch_size)

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()
        
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(input_dim, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        # src = [batch_size, src_len]
        # src_mask = [batch_size, 1, 1, src_len]
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # pos = [batch_size, src_len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        for layer in self.layers:
            src = layer(src, src_mask)
        # src = [batch_size, src_len, hid_dim]
        
        return src       

In [7]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        # src = [batch_size, src_len, hid_dim]
        # src_mask = [batch_size, 1, 1, src_len]
        
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        _src = self.positionwise_feedforward(src)
        
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        # src = [batch_size, src_len, hid_dim]
        
        return src

In [8]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]
        
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        # Q = K = V = [batch_size, src_len, hid_dim]
        
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        # Q = [batch_size, n_heads, query_len, head_dim]
        # K = [batch_size, n_heads, key_len, head_dim]
        # V = [batch_size, n_heads, value_len, head_dim]
        
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
            
        attention = torch.softmax(energy, dim=-1)
        # attention = [batch_size, n_heads, query_len, key_len]
        
        x = torch.matmul(self.dropout(attention), V)
        # x = [batch_size, n_heads, query_len, head_dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        # x = [batch_size, query_len, n_heads, head_dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        # x  = [batch_size, query_len, hid_dim]
        
        x = self.fc_o(x)
        # x = [batch_size, query_len, hid_dim]
        
        return x, attention

In [9]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x = [batch_size, seq_len, hid_dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        # x = [batch_size, seq_len, pf_dim]
        
        x = self.fc_2(x)
        # x = [batch_size, seq_len, hid_dim]
        
        return x

In [10]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg = [batch_size, trg_len]
        # enc_src = [batch_size, src_len, hid_dim]
        # trg_mask = [batch_size, 1, trg_len, trg_len]
        # src_mask = [batch_size, 1, 1 src_len]
        
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
            
        output = self.fc_out(trg)
        
        return output, attention

In [11]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg = [batch_size, trg_len, hid_dim]
        # enc_src = [batch_size, src_len, hid_dim]
        # trg_mask = [batch_size, 1, trg_len, trg_len]
        # src_mask = [batch_size, 1, 1, src_len]
        
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        # trg = [batch_size, trg_len, hid_dim]
        
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        # trg = [batch_size, trg_len, hid_dim]
        
        _trg = self.positionwise_feedforward(trg)
        
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        return trg, attention

In [12]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        # src = [batch_size, src_len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # src_mask = [batch_size, 1, 1, src_len]
        
        return src_mask
    
    def make_trg_mask(self, trg):
        # trg = [batch_size, trg_len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        # trg_pad_mask = [batch_size, 1, 1, trg_len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        # trg_sub_mask = [trg_len, trg_len]
        
        trg_mask = trg_pad_mask & trg_sub_mask
        # trg_mask = [batch_size, 1, trg_len, trg_len]
        
        return trg_mask
        
    def forward(self, src, trg):
        # src = [batch_size, src_len]
        # trg = [batch_size, trg_len]
        
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        # src_mask = [batch_size, 1, 1, src_len]
        # trg_mask = [batch_size, 1, trg_len, trg_len]
        
        enc_src = self.encoder(src, src_mask)
        
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        return output, attention

In [13]:
input_dim = len(dataset.SRC.vocab)
output_dim = len(dataset.TRG.vocab)
hid_dim = 256
enc_layers = 3
dec_layers = 3
enc_heads = 8
dec_heads = 8
enc_pf_dim = 512
dec_pf_dim = 512
enc_dropout = 0.1
dec_dropout = 0.1

enc = Encoder(input_dim, hid_dim, enc_layers, enc_heads, enc_pf_dim, enc_dropout, device)
dec = Decoder(output_dim, hid_dim, dec_layers, dec_heads, dec_pf_dim, dec_dropout, device)

In [14]:
SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]
TRG_PAD_IDX = dataset.TRG.vocab.stoi[dataset.TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 11,023,621 trainable parameters


In [16]:
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
epochs = 10
clip = 1

In [17]:
learn = CNNLearning()

for epoch in range(epochs):
    model, train_loss = learn.train(model, criterion, optimizer, train_iterator, clip)
    eval_loss = learn.evaluation(model, criterion, valid_iterator)
    print(train_loss, eval_loss)

100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:34<00:00, 26.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 126.99it/s]
  0%|▎                                                                                 | 3/907 [00:00<00:40, 22.06it/s]

3.5647449698442926 2.690774030983448


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:34<00:00, 26.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 132.78it/s]
  0%|▏                                                                                 | 2/907 [00:00<00:45, 19.80it/s]

2.630589989343578 2.2979978881776333


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:36<00:00, 24.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 97.86it/s]
  0%|▏                                                                                 | 2/907 [00:00<01:01, 14.71it/s]

2.2400191822546187 2.0736191235482693


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:38<00:00, 23.41it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 123.08it/s]
  0%|▏                                                                                 | 2/907 [00:00<00:49, 18.35it/s]

1.974623788152272 1.9660740196704865


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:37<00:00, 24.03it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 109.22it/s]
  0%|▏                                                                                 | 2/907 [00:00<00:59, 15.15it/s]

1.7752125950606157 1.8883415833115578


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:38<00:00, 23.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 108.84it/s]
  0%|▏                                                                                 | 2/907 [00:00<00:58, 15.50it/s]

1.6171576631108613 1.853509545326233


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:38<00:00, 23.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 131.01it/s]
  0%|▏                                                                                 | 2/907 [00:00<00:47, 19.12it/s]

1.485650150357954 1.837219811975956


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:39<00:00, 23.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 100.31it/s]
  0%|▏                                                                                 | 2/907 [00:00<00:57, 15.75it/s]

1.375405909602408 1.8294683694839478


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:36<00:00, 24.80it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 128.51it/s]
  0%|▏                                                                                 | 2/907 [00:00<00:51, 17.54it/s]

1.2850292157443381 1.822184443473816


100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:36<00:00, 24.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 115.52it/s]

1.2058460908267306 1.838268917053938



