In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

%config InlineBackend.figure_format = 'retina'
 
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager.findfont(font)

'/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'

In [2]:
import numpy as np
import tensorflow as tf
import sentencepiece as spm
import matplotlib.pyplot as plt

import re
import os
import io
import time
import random

import seaborn

print(tf.__version__)

2.4.1


***

# 데이터 불러오기

In [3]:
data_dir = os.getenv('HOME')+'/aiffel/nlp10/transformer/data'
kor_path = data_dir+"/korean-english-park.train.ko"
eng_path = data_dir+"/korean-english-park.train.en"

# zip과 set을 활용하여 순서쌍을 유지하면서 중복 데이터 제거

In [4]:
def clean_corpus(kor_path, eng_path):
    with open(kor_path, "r") as f: kor = f.read().splitlines()
    with open(eng_path, "r") as f: eng = f.read().splitlines()
    assert len(kor) == len(eng)

    cleaned_corpus = list(set(zip(kor,eng)))

    return cleaned_corpus

In [5]:
cleaned_corpus = clean_corpus(kor_path, eng_path)

In [6]:
len(cleaned_corpus)

78968

# 전처리 및 센텐스피스를 사용한 토큰화

In [7]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r"[^a-zㄱ-ㅎ가-힣?.!,]+", " ", sentence)
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = sentence.strip()
    
    return sentence

In [8]:
def generate_tokenizer(corpus, vocab_size, lang="ko", pad_id=0, bos_id=1, eos_id=2, unk_id=3):
    
    temp_file = os.getenv('HOME') + f'/aiffel/nlp10/transformer/corpus_{lang}.txt'
    
    with open(temp_file, 'w') as f:
        for row in corpus:
            f.write(str(row) + '\n')
    
    spm.SentencePieceTrainer.Train(
        f'--input={temp_file} --pad_id={pad_id} --bos_id={bos_id} --eos_id={eos_id} \
        --unk_id={unk_id} --model_prefix=spm_{lang} --vocab_size={vocab_size}'
    )
    
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f'spm_{lang}.model')

    return tokenizer

In [9]:
SRC_VOCAB_SIZE = TGT_VOCAB_SIZE = 20000

eng_corpus = []
kor_corpus = []

for pair in cleaned_corpus:
    k, e = pair[0], pair[1]

    kor_corpus.append(preprocess_sentence(k))
    eng_corpus.append(preprocess_sentence(e))

ko_tokenizer = generate_tokenizer(kor_corpus, SRC_VOCAB_SIZE, "ko")
en_tokenizer = generate_tokenizer(eng_corpus, TGT_VOCAB_SIZE, "en")
en_tokenizer.set_encode_extra_options("bos:eos")

True

In [10]:
from tqdm.notebook import tqdm

src_corpus = []
tgt_corpus = []

assert len(kor_corpus) == len(eng_corpus)

for idx in tqdm(range(len(kor_corpus))):
    src = ko_tokenizer.EncodeAsIds(kor_corpus[idx])
    tgt = en_tokenizer.EncodeAsIds(eng_corpus[idx])
    
    if len(src) <= 50 and len(tgt) <= 50:
        src_corpus.append(src)
        tgt_corpus.append(tgt)

enc_train = tf.keras.preprocessing.sequence.pad_sequences(src_corpus, padding='post')
dec_train = tf.keras.preprocessing.sequence.pad_sequences(tgt_corpus, padding='post')

  0%|          | 0/78968 [00:00<?, ?it/s]

***

# Transformer 구현

In [11]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    return sinusoid_table

In [12]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // self.num_heads
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        self.linear = tf.keras.layers.Dense(d_model)

        
    def scaled_dot_product_attention(self, Q, K, V, mask):
        
        d_k = tf.cast(K.shape[-1], tf.float32)       
        QK = tf.matmul(Q,K,transpose_b=True)
        scaled_qk = QK/tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9) 
        
        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        
        batch_size = x.shape[0]
        split_x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        
        batch_size = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (batch_size, -1, self.d_model))

        return combined_x
    

    def call(self, Q, K, V, mask):
        
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(WQ_splits, WK_splits, WV_splits, mask)
        
        out = self.combine_heads(out)
        out = self.linear(out)

        return out, attention_weights

In [13]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        
        self.w_1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.w_2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.w_1(x)
        out = self.w_2(out)
            
        return out

In [14]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):

        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.dropout(out)
        out += residual

        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual
        
        return out, enc_attn

In [15]:
class Encoder(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [16]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, causality_mask, padding_mask):

        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.dropout(out)
        out += residual

        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.enc_dec_attn(out, enc_out, enc_out, causality_mask)
        out = self.dropout(out)
        out += residual
        
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [17]:
class Decoder(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 n_heads,
                 d_ff,
                 dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
                            
    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, causality_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [18]:
class Transformer(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, src_vocab_size, tgt_vocab_size, pos_len, dropout=0.2, shared=True):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)
        self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
        self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.fc = tf.keras.layers.Dense(tgt_vocab_size)
        self.shared = shared

        if shared: self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        
        seq_len = x.shape[1]
        out = emb(x)

        if self.shared: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.dropout(out)

        return out

        
    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)
        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, causality_mask, dec_mask)
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [19]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

***

# 러닝레이트 및 옵티마이저 설정

In [20]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

In [21]:
learning_rate = LearningRateScheduler(512)

In [22]:
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# loss 설정

In [23]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

# train step 설정

In [24]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    gold = tgt[:, 1:]
        
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = model(src, tgt, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions[:, :-1])

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss, enc_attns, dec_attns, dec_enc_attns

***

# 학습 및 번역

In [25]:
transformer = Transformer(n_layers=2, d_model=512, n_heads=8, d_ff = 2048, src_vocab_size=SRC_VOCAB_SIZE, tgt_vocab_size=TGT_VOCAB_SIZE, pos_len=200, dropout=0.1, shared=True)

In [26]:
def visualize_attention(src, tgt, enc_attns, dec_attns, dec_enc_attns):
    def draw(data, ax, x="auto", y="auto"):
        import seaborn
        seaborn.heatmap(data, 
                        square=True,
                        vmin=0.0, vmax=1.0, 
                        cbar=False, ax=ax,
                        xticklabels=x,
                        yticklabels=y)
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Encoder Layer", layer + 1)
        for h in range(4):
            draw(enc_attns[layer][0, h, :len(src), :len(src)], axs[h], src, src)
        plt.show()
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Decoder Self Layer", layer+1)
        for h in range(4):
            draw(dec_attns[layer][0, h, :len(tgt), :len(tgt)], axs[h], tgt, tgt)
        plt.show()

        print("Decoder Src Layer", layer+1)
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        for h in range(4):
            draw(dec_enc_attns[layer][0, h, :len(tgt), :len(src)], axs[h], src, tgt)
        plt.show()

In [27]:
def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
    sentence = preprocess_sentence(sentence)
    pieces = src_tokenizer.encode_as_pieces(sentence)
    tokens = src_tokenizer.encode_as_ids(sentence)

    _input = tf.keras.preprocessing.sequence.pad_sequences([tokens], maxlen=enc_train.shape[-1], padding='post')
    
    print(len(_input))
    print(enc_train.shape[-1])

    ids = []
    output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)
    for i in range(dec_train.shape[-1]):
        enc_padding_mask, combined_mask, dec_padding_mask = generate_masks(_input, output)
        
        predictions, enc_attns, dec_attns, dec_enc_attns = model(_input, output, enc_padding_mask, combined_mask, dec_padding_mask)
        
        predicted_id = tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()
        if tgt_tokenizer.eos_id() == predicted_id:
            result = tgt_tokenizer.decode_ids(ids)
            return pieces, result, enc_attns, dec_attns, dec_enc_attns

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)
    result = tgt_tokenizer.decode_ids(ids)
    return pieces, result, enc_attns, dec_attns, dec_enc_attns

In [28]:
def translate(sentence, model, src_tokenizer, tgt_tokenizer, plot_attention=False):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = \
    evaluate(sentence, model, src_tokenizer, tgt_tokenizer)
    
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    if plot_attention:
        visualize_attention(pieces, result.split(), enc_attns, dec_attns, dec_enc_attns)

In [29]:
examples = [
    "오바마는 대통령이다.",
    "시민들은 도시 속에 산다.",
    "커피는 필요 없다.",
    "일곱 명의 사망자가 발생했다."
]

In [30]:
from tqdm import tqdm_notebook 

BATCH_SIZE = 64
EPOCHS = 20


for epoch in range(EPOCHS):
    total_loss = 0
    
    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm_notebook(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_train[idx:idx+BATCH_SIZE],
                    dec_train[idx:idx+BATCH_SIZE],
                    transformer,
                    optimizer)

        total_loss += batch_loss
        
        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

    for example in examples:
        translate(example, transformer, ko_tokenizer, en_tokenizer)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama s obama is a campaign .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: they are being in the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: it is no longer .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the attacks of the death toll in the city of the city .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is a president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: the city is around the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: coffee don t think you don t think you don t think you don t think you don t think you don t think .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: thousands of dead were dead .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is the president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: the city s city is a city .
1
50
Input: 커피는 필요 없다.
Predicted translation: coffee needs to be a coffee .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people dead and a dead .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is the president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: the city s city mountain in the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: coffee is no need for coffee .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: death toll from the death toll monday .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is a presumptive president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: they re the city in town of the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: there need no need to be no need to keep any of anyone needs to be needed .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: two of the dead were killed .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: president obama is live with .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city san cha commemorates san city .
1
50
Input: 커피는 필요 없다.
Predicted translation: no otherwise .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven death toll from the ministry of seven consecutive days monday .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: he is the president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: the city s mann in the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: there s no need to win anywhere .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven deaths were confirmed .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is the president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens thought the city down sanion in the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: no need to keep casualties .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were dead thursday .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is then .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city authorities are outside the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: there needs to be no rollless .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed , officials said monday .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is taking the presidential election .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city streets take the city down city .
1
50
Input: 커피는 필요 없다.
Predicted translation: no need for even .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven other people were dead , .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is taking the president s presidential election .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: the city is only san .
1
50
Input: 커피는 필요 없다.
Predicted translation: no need to need to coffee .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven others were killed , thursday s dead .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is also live just becoming the shortest .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens are going to control over city .
1
50
Input: 커피는 필요 없다.
Predicted translation: there need no work need to be coffee .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed when the deaths occurred .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: that is obama .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city take the city on saturday .
1
50
Input: 커피는 필요 없다.
Predicted translation: the need for coffee .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed , officials said .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: president obama is live at the president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city took place on the city streets in san city .
1
50
Input: 커피는 필요 없다.
Predicted translation: no need to need for start .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: three people died and seven other dead .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is taking the presidential nominee .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city took control of the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: no need to need to need coffee .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: another seven were wounded .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is then .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city took some of the cityy san city .
1
50
Input: 커피는 필요 없다.
Predicted translation: for no need have to been needed .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people died on thursday , .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is the president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens took some the city to control the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: at no need to need to need to keep it .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were dead were seven .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: that is president obama .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city took eight the city in the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: at no need is needed .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: another people died because , another seven people were killed .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: he is president a short story that obama is a short african country .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city took some in city to the city streets in the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: for no need to buy it .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were dead in the dead .


  0%|          | 0/1064 [00:00<?, ?it/s]

1
50
Input: 오바마는 대통령이다.
Predicted translation: obama is a president .
1
50
Input: 시민들은 도시 속에 산다.
Predicted translation: city took some of sanish streets in the city .
1
50
Input: 커피는 필요 없다.
Predicted translation: at no coffee , he said .
1
50
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven elderly people have died in that thursday s death .


# 고찰

학습과정에서 loss는 계속 낮아졌지만 학습 과정에서 번역 성능은 들쑥날쑥하는 것을 볼 수 있었습니다.  
transformer를 적용하긴 했지만 아직 성능 개선의 여지가 많이 남아 있는 모습입니다. 
transformer의 성능 자체가 아쉬움이 많은 것인지, 작업 환경의 제약 때문에 논문에 나와 있는 수준으로 하이퍼파라미터를 튜닝할 수 없어서 생긴 성능의 아쉬움인지는 잘 모르겠습니다.  

***

In [31]:
translate("러시아의 우크라이나 침공이 가시화되고 있습니다.", transformer, ko_tokenizer, en_tokenizer)

1
50
Input: 러시아의 우크라이나 침공이 가시화되고 있습니다.
Predicted translation: russia s closest smoothly climbed into situations .


In [32]:
translate("미국의 핵잠수함 제공에 대해 프랑스는 불편한 내색을 감추지 않고 있습니다.", transformer, ko_tokenizer, en_tokenizer)

1
50
Input: 미국의 핵잠수함 제공에 대해 프랑스는 불편한 내색을 감추지 않고 있습니다.
Predicted translation: you know about your enemy ice , maybe i ask you to study your favorite bills .


In [33]:
translate("유럽의 천연가스 가격이 급등하고 있습니다.", transformer, ko_tokenizer, en_tokenizer)

1
50
Input: 유럽의 천연가스 가격이 급등하고 있습니다.
Predicted translation: european gas prices are being priceed on rise .


In [34]:
translate("전세계적으로 공급망 이슈는 지속될 것으로 보입니다.", transformer, ko_tokenizer, en_tokenizer)

1
50
Input: 전세계적으로 공급망 이슈는 지속될 것으로 보입니다.
Predicted translation: another issue that just will be issued .


## 이전 seq2seq 과제에 비해서 좀 더 그럴싸한 번역을 보여줍니다.
## 다만 데이터에 없어보일 법한 문장에 대해서는 성능이 떨어지는 듯해보입니다.