In [43]:
import os
import re
from os.path import join as pjoin
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

import spacy
import random
import math
import time

from konlpy.tag import Okt
# Shared Okt instance from konlpy; its morphs() method is what the
# preprocessing loop calls to split each Korean sentence into tokens.
okt = Okt()

# Seed every RNG the notebook relies on so that data splits, weight
# initialisation and teacher-forcing draws are repeatable across runs.
SEED = 111
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# Device selection. Previously the CUDA/CPU choice was immediately overwritten
# with an unconditional CPU device, which made the CUDA check dead code.
# Flip FORCE_CPU to True to deliberately run on CPU (e.g. for debugging).
FORCE_CPU = False
device = torch.device('cuda' if torch.cuda.is_available() and not FORCE_CPU else 'cpu')

In [38]:
def listDir(mypath):
    """Return every entry of ``mypath`` as a joined path, sorted by name.

    ``os.listdir`` yields entries in arbitrary, platform-dependent order.
    Callers index into the result (e.g. ``file_list[0]``), so the names are
    sorted to make that choice reproducible between runs and machines.

    :param mypath: directory to list
    :return: list of ``mypath/<entry>`` strings in ascending name order
    :raises OSError: propagated from ``os.listdir`` if the directory is unreadable
    """
    return [pjoin(mypath, name) for name in sorted(os.listdir(mypath))]
# Paths of everything inside the local "korean_data" directory.
file_list = listDir("korean_data")

In [39]:
# Read the first listed file as an Excel sheet, using its "SID" column as the
# row index. The bare expression on the last line displays the frame.
df_kor_en = pd.read_excel(file_list[0],
                          index_col="SID")
df_kor_en

Unnamed: 0_level_0,원문,번역문
SID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 ...,Bible Coloring' is a coloring application that...
2,씨티은행에서 일하세요?,Do you work at a City bank?
3,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
4,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
5,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...
...,...,...
199996,나는 먼저 청소기로 바닥을 밀었어요.,"First of all, I vacuumed the floor."
199997,나는 먼저 팀 과제를 하고 놀러 갔어요.,I did the team assignment first and went out t...
199998,나는 비 같은 멋진 연예인을 좋아해요.,I like cool entertainer like Rain.
199999,나는 멋진 자연 경치를 보고 눈물을 흘렸어.,I cried seeing the amazing scenery.


In [9]:
# Token lists for every sentence pair, kept index-aligned:
# ko_sequences[i] and en_sequences[i] come from the same row.
ko_sequences = []
en_sequences = []

# Runs of '.', '?', '!' and ',' are stripped before tokenising. The pattern is
# a raw string (the previous non-raw literal relied on invalid escape
# sequences) and is compiled once instead of being rebuilt for every row.
punct_re = re.compile(r'[\.\?\!\,]+')

# Iterating the two columns directly avoids the per-row Series that iterrows()
# builds; the progress total follows the frame instead of a hard-coded count.
sentence_pairs = zip(df_kor_en["원문"], df_kor_en["번역문"])
for kor, en in tqdm(sentence_pairs, total=len(df_kor_en)):
    # Korean is split into morphemes by Okt; English is split on whitespace.
    ko_sequences.append(okt.morphs(punct_re.sub('', kor)))
    en_sequences.append(punct_re.sub('', en).split())

100%|██████████| 200000/200000 [03:07<00:00, 1068.10it/s]


In [41]:
class TranslationData(Dataset):
    """Parallel corpus exposed as a ``Dataset`` of padded token-id tensors.

    Each side (source / target) gets its own vocabulary built by
    :meth:`tokenize`. Id layout, per side:

    * ``0`` -- ``"<NONE>"``: every word seen only once in the corpus
    * ``1`` -- ``"<sos>"``
    * ``2`` -- ``"<eos>"`` (also used as the padding value)
    * ``3+`` -- words seen at least twice, in order of first appearance

    NOTE(review): padding reuses the ``<eos>`` id, so a consumer that masks
    padding by this id (e.g. a loss with ``ignore_index=end_token_pivot``)
    also masks the genuine end-of-sentence target. A dedicated pad id would
    fix this but has to be changed together with whatever consumes
    ``end_token_pivot``.
    """

    def __init__(self, from_sentences, to_sentences):
        """
        :param from_sentences: list of source sentences, each a list of word strings
        :param to_sentences: list of target sentences, each a list of word strings
        """
        self.init_token = "<sos>"
        self.end_token = "<eos>"
        # Id of <eos>; tokenize() also pads shorter sequences with it.
        self.end_token_pivot = 2
        self.from_sequences, self.from_word_dict = self.tokenize(from_sentences)
        self.to_sequences, self.to_word_dict = self.tokenize(to_sentences)

        # Vocabulary sizes (including the special ids) for the embedding layers.
        self.source_dim = len(self.from_word_dict)
        self.target_dim = len(self.to_word_dict)

    def _dict_reverse(self, dictionary, value):
        """Return the first key of ``dictionary`` mapped to ``value``.

        Linear scan; kept for backward compatibility, no longer used by
        :meth:`tokenize`.

        :raises KeyError: if no key maps to ``value``
        """
        for k, v in dictionary.items():
            if v == value:
                return k
        raise KeyError

    def tokenize(self, sentences):
        """Build a vocabulary for ``sentences`` and encode them as one padded tensor.

        :param sentences: non-empty list of sentences, each a list of word strings
        :return: ``(sequences, word_dict)`` where ``sequences`` is an integer
                 tensor of shape ``[len(sentences), max_len]`` (each row is
                 ``<sos> ... <eos>`` followed by padding) and ``word_dict``
                 maps id -> word
        :raises ValueError: if ``sentences`` is empty
        """
        if len(sentences) == 0:
            raise ValueError("sentences must contain at least one sentence")

        # Single pass over the corpus. Dicts keep insertion order, so iterating
        # `occurrences` later visits words in order of first appearance.
        occurrences = {}
        for words in sentences:
            for word in words:
                occurrences[word] = occurrences.get(word, 0) + 1

        # The special tokens are registered up front and never pruned, even
        # when the corpus is a single sentence.
        word_to_id = {
            self.init_token: 1,
            self.end_token: self.end_token_pivot,
        }
        next_id = 3
        for word, count in occurrences.items():
            if word in word_to_id:
                continue
            # Words seen only once get no id of their own and encode as 0.
            if count > 1:
                word_to_id[word] = next_id
                next_id += 1

        unknown_id = 0
        sos_id = word_to_id[self.init_token]
        eos_id = word_to_id[self.end_token]
        sequences = []
        for words in sentences:
            ids = [sos_id]
            ids.extend(word_to_id.get(w, unknown_id) for w in words)
            ids.append(eos_id)
            sequences.append(ids)

        max_len = max(map(len, sequences))

        word_dict = {idx: word for word, idx in word_to_id.items()}
        word_dict[unknown_id] = "<NONE>"
        print("MAX LEN : {}".format(max_len))
        print("TOTAL SEQ: {}".format(len(sequences)))
        print("WORD DICT LEN: {}".format(len(word_dict)))

        # Right-pad every sequence to the common length so they stack into one tensor.
        padded = [
            seq + [self.end_token_pivot] * (max_len - len(seq))
            for seq in sequences
        ]
        return torch.tensor(padded), word_dict

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.from_sequences)

    def __getitem__(self, index):
        """Return the ``(source_ids, target_ids)`` tensors for pair ``index``."""
        return self.from_sequences[index], self.to_sequences[index]
    

In [18]:
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder.

    Produces the per-position encoder states and a single vector, derived from
    the final forward and backward states, that seeds the decoder.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        """
        :param input_dim: source vocabulary size (number of embedding rows)
        :param emb_dim: width of the token embeddings
        :param enc_hid_dim: hidden size of each GRU direction
        :param dec_hid_dim: hidden size expected by the decoder
        :param dropout: dropout probability applied to the embeddings
        """
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=enc_hid_dim, bidirectional=True)
        # Projects the concatenated forward+backward state down to decoder size.
        self.fc = nn.Linear(in_features=2 * enc_hid_dim, out_features=dec_hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """
        :param src: token ids; the Seq2Seq caller passes [src len, batch size]
        :return: ``(outputs, hidden)`` -- the GRU outputs for every position,
                 and the tanh-projected concatenation of the last two final
                 states with shape [batch size, dec hid dim]
        """
        rnn_input = self.dropout(self.embedding(src))
        outputs, final_states = self.rnn(rnn_input)

        # The last two entries of the final-state stack are the two directions.
        second_last_state = final_states[-2, :, :]
        last_state = final_states[-1, :, :]
        both_directions = torch.cat((second_last_state, last_state), dim=1)

        hidden = torch.tanh(self.fc(both_directions))
        return outputs, hidden

In [19]:
class Attention(nn.Module):
    """Additive attention: scores each encoder position against the decoder state."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        """
        :param enc_hid_dim: hidden size of one encoder direction
        :param dec_hid_dim: hidden size of the decoder
        """
        super().__init__()
        # Input is [decoder state ; bidirectional encoder output].
        self.attn = nn.Linear(2 * enc_hid_dim + dec_hid_dim, dec_hid_dim)
        # Collapses each energy vector to one unnormalised score.
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """
        :param hidden: [batch size, dec hid dim]
        :param encoder_outputs: [src len, batch size, enc hid dim*2]
        :return: attention weights [batch size, src len]; each row sums to 1
        """
        src_len = encoder_outputs.shape[0]

        # Copy the decoder state once per source position: [batch, src len, dec hid].
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # Put batch first so both tensors line up: [batch, src len, enc hid * 2].
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        paired = torch.cat((tiled_hidden, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn(paired))
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)

In [20]:
class Decoder(nn.Module):
    """One-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim,
                 dropout, attention):
        """
        :param output_dim: target vocabulary size
        :param emb_dim: width of the target token embeddings
        :param enc_hid_dim: hidden size of one encoder direction
        :param dec_hid_dim: hidden size of the decoder GRU
        :param dropout: dropout probability applied to the embeddings
        :param attention: module called as ``attention(hidden, encoder_outputs)``
                          returning weights of shape [batch size, src len]
        """
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # GRU input is [token embedding ; attention-weighted encoder context].
        self.rnn = nn.GRU(2 * enc_hid_dim + emb_dim, dec_hid_dim)
        # Output layer sees the GRU output, the context and the embedding.
        self.fc_out = nn.Linear(dec_hid_dim + 2 * enc_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, ipt, hidden, encoder_outputs):
        """
        :param ipt: current input token ids, one per batch element
        :param hidden: previous decoder state [batch size, dec hid dim]
        :param encoder_outputs: [src len, batch size, enc hid dim*2]
        :return: ``(prediction, hidden)`` -- vocabulary scores
                 [batch size, output dim] and the new decoder state
        """
        # Add a length-1 sequence axis so the GRU sees a single time step.
        step_input = ipt.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))

        # [batch, 1, src len] weights x [batch, src len, enc hid*2] outputs.
        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        outputs_batch_first = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(weights, outputs_batch_first).permute(1, 0, 2)

        gru_input = torch.cat((embedded, context), dim=2)
        output, new_hidden = self.rnn(gru_input, hidden.unsqueeze(0))
        # Single layer, single step: the output and the final state must agree.
        assert (output == new_hidden).all()

        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )
        prediction = self.fc_out(features)

        return prediction, new_hidden.squeeze(0)

In [21]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder into a full translation model."""

    def __init__(self, encoder, decoder, device):
        """
        :param encoder: module returning ``(encoder_outputs, hidden)`` for a source batch
        :param decoder: module with an ``output_dim`` attribute, called once per target step
        :param device: device on which the output buffer is allocated
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """
        :param src: shape (src len, batch size)
        :param trg: shape (trg len, batch size)
        :param teacher_forcing_ratio: probability to use teacher forcing
        :return: decoder scores of shape (trg len, batch size, output dim);
                 the slice at position 0 is never written and stays zero
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # Decoding starts from the first target row (the start-of-sentence ids).
        ipt = trg[0, :]
        for t in range(1, trg_len):
            step_scores, hidden = self.decoder(ipt, hidden, encoder_outputs)
            outputs[t] = step_scores

            # One random draw per step: feed the gold token, or the model's own guess.
            use_gold = random.random() < teacher_forcing_ratio
            ipt = trg[t] if use_gold else step_scores.argmax(1)

        return outputs

In [53]:
class Train:
    """Builds the dataset, model and loss, and runs training / evaluation.

    The corpus is split 80/10/10 into train / validation / test loaders. The
    checkpoint with the lowest validation loss is written to ``model_path``
    during :meth:`run` and reloaded by :meth:`test`.
    """

    def __init__(self, from_seq, to_seq,
                 enc_emb_dim=128, dec_emb_dim=128,
                 enc_hid_dim=256, dec_hid_dim=256,
                 enc_dropout=0.3, dec_dropout=0.3,
                 epochs=15, batch_size=256, model_path=None):
        """
        :param from_seq: list of tokenised source sentences
        :param to_seq: list of tokenised target sentences
        :param enc_emb_dim: encoder embedding width
        :param dec_emb_dim: decoder embedding width
        :param enc_hid_dim: hidden size of one encoder direction
        :param dec_hid_dim: decoder hidden size
        :param enc_dropout: encoder dropout probability
        :param dec_dropout: decoder dropout probability
        :param epochs: number of training epochs
        :param batch_size: batch size for all three loaders
        :param model_path: checkpoint file; defaults to ``model/attention.pt``
        """
        # Capture the notebook-level device once so every method agrees on it.
        self.device = device
        self.model_path = model_path if model_path is not None else pjoin('model', 'attention.pt')

        self.data = TranslationData(from_seq, to_seq)
        data_len = len(self.data)
        train_num = int(data_len * 0.8)
        valid_num = int(data_len * 0.1)
        # The remainder goes to the test split so the three sizes always sum up.
        test_num = data_len - train_num - valid_num
        train, valid, test = random_split(self.data, [train_num, valid_num, test_num])
        self.train_iter = DataLoader(train, batch_size=batch_size, shuffle=True)
        self.valid_iter = DataLoader(valid, batch_size=batch_size, shuffle=False)
        self.test_iter = DataLoader(test, batch_size=batch_size, shuffle=False)
        self.input_dim = self.data.source_dim
        self.output_dim = self.data.target_dim

        self.enc_emb_dim = enc_emb_dim
        self.dec_emb_dim = dec_emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.enc_dropout = enc_dropout
        self.dec_dropout = dec_dropout

        self.encoder = Encoder(self.input_dim,
                               self.enc_emb_dim,
                               self.enc_hid_dim,
                               self.dec_hid_dim,
                               self.enc_dropout)
        self.attention = Attention(self.enc_hid_dim, self.dec_hid_dim)
        self.decoder = Decoder(self.output_dim,
                               self.dec_emb_dim,
                               self.enc_hid_dim,
                               self.dec_hid_dim,
                               self.dec_dropout,
                               self.attention)
        self.model = Seq2Seq(self.encoder, self.decoder, self.device).to(self.device)

        self.epochs = epochs
        # NOTE(review): the ignored id is the <eos> id, which TranslationData
        # also uses as the padding value. Masking padding this way therefore
        # drops the genuine <eos> targets from the loss too, so the model gets
        # no training signal for ending a sentence. Fixing it needs a separate
        # padding id in the dataset.
        self.criterion = nn.CrossEntropyLoss(ignore_index = self.data.end_token_pivot)

    @staticmethod
    def init_weights(m):
        """Initialise ``m``: weights ~ N(0, 0.01**2), every other parameter 0."""
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param.data, mean=0, std=0.01)
            else:
                nn.init.constant_(param.data, 0)

    def count_parameters(self, model):
        """Return the number of trainable scalar parameters in ``model``."""
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    def _flatten_for_loss(self, output, trg):
        """Drop position 0 (never predicted) and flatten time x batch for the loss."""
        output_dim = output.shape[-1]
        return output[1:].reshape(-1, output_dim), trg[1:].reshape(-1)

    def train(self, epoch, iterator, optimizer, criterion, clip):
        """Run one training epoch.

        :param epoch: epoch index, shown in the progress-bar label
        :param iterator: loader yielding ``(src, trg)`` batches shaped [batch, len]
        :param optimizer: optimizer stepping the model parameters
        :param criterion: loss taking ``(scores, targets)``
        :param clip: maximum gradient norm
        :return: mean batch loss over the epoch
        """
        self.model.train()
        epoch_loss = 0
        pbar = tqdm(iterator, total=len(iterator),
                    desc="({0:^3})".format(epoch))
        for batch in pbar:
            # The model expects [len, batch]. Non-in-place transpose leaves the
            # caller's batch tensors untouched.
            src = batch[0].transpose(0, 1).to(self.device)
            trg = batch[1].transpose(0, 1).to(self.device)

            optimizer.zero_grad()
            output = self.model(src, trg)
            # output = [trg len, batch size, output dim] -> [(trg len - 1) * batch size, output dim]
            # trg    = [trg len, batch size]             -> [(trg len - 1) * batch size]
            output, trg = self._flatten_for_loss(output, trg)

            loss = criterion(output, trg)
            loss.backward()

            # clip_grad_norm_ is the in-place, non-deprecated spelling.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / len(iterator)

    def evaluate(self, iterator, criterion):
        """Compute the mean batch loss over ``iterator`` without teacher forcing.

        :param iterator: loader yielding ``(src, trg)`` batches shaped [batch, len]
        :param criterion: loss taking ``(scores, targets)``
        :return: mean batch loss
        """
        self.model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for batch in iterator:
                src = batch[0].transpose(0, 1).to(self.device)
                trg = batch[1].transpose(0, 1).to(self.device)
                # Teacher forcing ratio 0.0: the decoder feeds on its own predictions.
                output = self.model(src, trg, 0.0)

                output, trg = self._flatten_for_loss(output, trg)
                loss = criterion(output, trg)
                epoch_loss += loss.item()
        return epoch_loss / len(iterator)

    def _epoch_time(self, start_time, end_time):
        """Split the elapsed seconds into whole ``(minutes, seconds)``."""
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def test(self):
        """Reload the saved checkpoint and print loss / perplexity on the test split."""
        # map_location lets a checkpoint saved on one device load on another.
        state = torch.load(self.model_path, map_location=self.device)
        self.model.load_state_dict(state)
        test_loss = self.evaluate(self.test_iter, self.criterion)
        print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

    def run(self):
        """Initialise weights, train for ``self.epochs`` epochs, checkpoint the best model."""
        self.model.apply(self.init_weights)
        print(self.model)
        print("Model trainable parametes: {}".format(self.count_parameters(self.model)))

        optimizer = optim.Adam(self.model.parameters())

        # torch.save does not create missing directories.
        checkpoint_dir = os.path.dirname(self.model_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

        CLIP = 1
        best_valid_loss = float('inf')
        for epoch in range(self.epochs):
            start_time = time.time()
            train_loss = self.train(epoch, self.train_iter, optimizer, self.criterion, CLIP)
            valid_loss = self.evaluate(self.valid_iter, self.criterion)
            end_time = time.time()

            epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
            # Keep only the checkpoint with the best validation loss so far.
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(self.model.state_dict(), self.model_path)
            print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


In [None]:
# End-to-end run: build the dataset and model from the tokenised sentence
# lists, train (saving a checkpoint whenever validation loss improves), then
# reload the saved checkpoint and report loss / perplexity on the test split.
train = Train(ko_sequences, en_sequences)
train.run()
train.test()

MAX LEN : 46
TOTAL SEQ: 200000
WORD DICT LEN: 38122
MAX LEN : 49
TOTAL SEQ: 200000
WORD DICT LEN: 25440



( 0 ):   0%|          | 0/625 [00:00<?, ?it/s][A

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(38122, 128)
    (rnn): GRU(128, 256, bidirectional=True)
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
      (v): Linear(in_features=256, out_features=1, bias=False)
    )
    (embedding): Embedding(25440, 128)
    (rnn): GRU(640, 256)
    (fc_out): Linear(in_features=896, out_features=25440, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)
Model trainable parametes: 32566624



( 0 ):   0%|          | 1/625 [00:18<3:07:57, 18.07s/it][A
( 0 ):   0%|          | 2/625 [00:37<3:13:16, 18.61s/it][A
( 0 ):   0%|          | 3/625 [00:55<3:08:40, 18.20s/it][A
( 0 ):   1%|          | 4/625 [01:14<3:11:59, 18.55s/it][A
( 0 ):   2%|▏         | 12/625 [03:36<3:09:34, 18.56s/it][A
( 0 ):   2%|▏         | 13/625 [03:54<3:07:57, 18.43s/it][A
( 0 ):   2%|▏         | 14/625 [04:12<3:04:50, 18.15s/it][A
( 0 ):   2%|▏         | 15/625 [04:29<3:01:15, 17.83s/it][A
( 0 ):   3%|▎         | 16/625 [04:46<2:59:02, 17.64s/it][A
( 0 ):   3%|▎         | 17/625 [05:06<3:04:02, 18.16s/it][A
( 0 ):   3%|▎         | 18/625 [05:23<3:00:36, 17.85s/it][A
( 0 ):   3%|▎         | 19/625 [05:40<2:57:50, 17.61s/it][A
( 0 ):   3%|▎         | 20/625 [05:57<2:57:24, 17.59s/it][A
( 0 ):   3%|▎         | 21/625 [06:14<2:55:40, 17.45s/it][A
( 0 ):   4%|▎         | 22/625 [06:32<2:56:26, 17.56s/it][A
( 0 ):   4%|▎         | 23/625 [06:49<2:55:02, 17.45s/it][A
( 0 ):   4%|▍         | 24/