In [1]:
import os
import sys
sys.path.append('..')

In [2]:
import import_ipynb
from utils.dataset_loader import CreateCNNDataset
from utils.training import CNNLearning

importing Jupyter notebook from ..\utils\dataset_loader.ipynb
importing Jupyter notebook from ..\utils\training.ipynb


In [3]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

import random
import numpy as np

from tqdm import tqdm

In [4]:
### cpu, gpu 선택
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### 불용어 사용 여부
use_stopword = True

### batch_size
batch_size = 32

In [5]:
### 미리 만들어둔 데이터셋을 가져옴
dataset = CreateCNNDataset(device=device, use_stopword=use_stopword)

### 데이터셋에서 iterator만 뽑아냄
train_iterator, valid_iterator, test_iterator = dataset.get_iterator(batch_size=batch_size)

In [57]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, kernel_size, n_layers, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.conv = nn.ModuleList([nn.Conv1d(in_channels=hid_dim, out_channels=2*hid_dim,
                                            kernel_size=kernel_size, padding=(kernel_size-1)//2) for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src):
        # src = [batch_size, src_len]
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)
        
        embedded = self.dropout(tok_embedded + pos_embedded)
        # embedded = [batch_size, src_len, emb_dim]
        
        conv_input = self.emb2hid(embedded)
        # conv_input = [batch_size, src_len, hid_dim]
        
        conv_input = conv_input.permute(0, 2, 1)
        # conv_input = [batch_size, hid_dim, src_len]
        
        for i, conv in enumerate(self.conv):
            conved = conv(self.dropout(conv_input))
            # conved = [batch_size, 2*hid_dim, src_len]
            
            conved = F.glu(conved, dim=1)
            # conved = [batch_size, hid_dim, src_len]
            
            conved = (conved + conv_input) * self.scale
            
            conv_input = conved
            
        # conved = [batch_size, hid_dim, src)len]
        
        conved = self.hid2emb(conved.permute(0, 2, 1))
        # conved = [batch_size, src_len, emb_dim]
        
        combined = (conved + embedded) * self.scale
        
        return conved, combined

In [58]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, trg_pad_idx, device, max_length=100):
        super().__init__()
        
        self.kernel_size = kernel_size
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
        
        self.fc_out = nn.Linear(emb_dim, output_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim, out_channels=hid_dim*2,
                                             kernel_size=kernel_size) for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        
    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        # embedded = [batch_size, trg_len, emb_dim]
        # conved = [batch_size, hid_dim, trg_len]
        # encoder_conved = [batch_size, src_len, emb_dim]
        # encoder_combined = [batch_size, src_len, emb_dim]
        
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        # conved_emb = [batch_size, trg_len, emb_dim]
        
        combined = (conved_emb + embedded) * self.scale
        # combined = [batch_size, trg_len, emb_dim]
        
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
        # energy = [batch_size, trg_len, src_len]
        
        attention = F.softmax(energy, dim=2)
        # attention = [batch_size, trg_len, src_len]
        
        attended_encoding = torch.matmul(attention, encoder_combined)
        # attended_encoding = [batch_size, trg_len, emb_dim]
        
        attended_encoding = self.attn_emb2hid(attended_encoding)
        # attended_encoding = [batch_size, trg_len, hid_dim]
        
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
        # attended_encoding = [batch_size, hid_dim, trg_len]
        
        return attention, attended_combined
        
        
    def forward(self, trg, encoder_conved, encoder_combined):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        tok_embedding = self.tok_embedding(trg)
        pos_embedding = self.pos_embedding(pos)
        
        embedded = self.dropout(tok_embedding + pos_embedding)
        # embedded = [batch_size, trg_len, emb_dim]
        
        conv_input = self.emb2hid(embedded)
        # conv_input = [batch_size, trg_len, hid_dim]
        
        conv_input = conv_input.permute(0, 2, 1)
        # conv_input = [batch_size, hid_dim, trg_len]
        
        for i, conv in enumerate(self.conv):
            conv_input = self.dropout(conv_input)
            
            padding = torch.zeros(batch_size, hid_dim, self.kernel_size-1).fill_(self.trg_pad_idx).to(self.device)
            # padding = [batch_size, hid_dim, kernel_size-1]
            
            padded_conv_input = torch.cat((padding, conv_input), dim=2)
            # padded_conv_input = [batch_size, hid_dim, trg_len + kernel_size - 1]
            
            conved = conv(padded_conv_input)
            # conved = [batch_size, hid_dim*2, trg_len]
            
            conved = F.glu(conved, dim=1)
            # conved = [batch_size, hid_dim, trg_len]
            
            attention, conved = self.calculate_attention(embedded, conved, encoder_conved, encoder_combined)
            
            conved = (conved + conv_input) * self.scale
            
            conv_input = conved
            
        conved = self.hid2emb(conved.permute(0, 2, 1))
        
        output = self.fc_out(self.dropout(conved))
        
        return output, attention

In [59]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, src, trg):
        encoder_conved, encoder_combined = self.encoder(src)
        
        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
        
        return output, attention

In [60]:
input_dim = len(dataset.SRC.vocab)
output_dim = len(dataset.TRG.vocab)
emb_dim = 256
hid_dim = 512
kernel_size=3
n_layers = 10
dropout = 0.1
trg_pad_token = dataset.TRG.vocab.stoi[dataset.TRG.pad_token]

enc = Encoder(input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device)
dec = Decoder(output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, trg_pad_token, device)

model = Seq2Seq(enc, dec).to(device)

padding =  4


In [55]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 37,344,261 trainable parameters


In [56]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_token)

In [49]:
learn = CNNLearning()
epochs = 10
clip = 1

In [50]:
for epoch in range(epochs):
    model, train_loss = learn.train(model, criterion, optimizer, train_iterator, clip)
    eval_loss = learn.evaluation(model, criterion, valid_iterator)
    print(train_loss, eval_loss)

  0%|                                                                                          | 0/907 [00:00<?, ?it/s]

torch.Size([32, 512, 28])
torch.Size([32, 1024, 27])
torch.Size([32, 512, 27]) torch.Size([32, 512, 28])





RuntimeError: The size of tensor a (27) must match the size of tensor b (28) at non-singleton dimension 2