In [73]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

import torch
import torch.nn as nn 
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [74]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

# Data Preparing

## ENGLISH -> KOREAN

In [75]:
lang1 = 'eng'
lang2 = 'kor'
input_lang = Lang(lang1)
output_lang = Lang(lang2)

## 파일(전체) 읽어오기

In [76]:
lines = open('data/%s-%s.txt' %(lang1, lang2), encoding = 'utf-8').read().strip().split('\n')
print(lines[:10])

['Go.\t가.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8363271 (Eunhee)', 'Hi.\t안녕.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #8355888 (Eunhee)', 'Run!\t뛰어!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #8355891 (Eunhee)', 'Run.\t뛰어.\tCC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #8363273 (Eunhee)', 'Who?\t누구?\tCC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #6820074 (yesjustryan)', 'Wow!\t우와!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #8355878 (Eunhee)', 'Fire!\t쏴!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #8355899 (Eunhee)', 'Help!\t도와줘!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #8355885 (Eunhee)', 'Jump!\t점프!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1102981 (jamessilver) & #8355893 (Eunhee)', 'Jump.\t점프해.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #8355890 (Eunhee)']


In [77]:
print(lines[0].split('\t')[:2], lines[1].split('\t')[:2])

['Go.', '가.'] ['Hi.', '안녕.']


## 파일 각 줄 별로 읽어서 대문자는 소문자로 만들기

In [78]:
pairs = []
for l in lines :
    eng, kor = l.split('\t')[:2]
    kor = kor[:-1]+' .'
    pairs.append([normalizeString(eng), kor])

print(pairs[:2])

# 읽어온 문장의 개수
print("Read %s sentence pairs" % len(pairs))

[['go .', '가 .'], ['hi .', '안녕 .']]
Read 3621 sentence pairs


### 읽어오는 영/한 문장에 대한 trimming 
- 각 문장이 10자가 넘으면 사용하지 않음 
- encoder-decoder의 아키텍쳐 사이즈 때문

In [79]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH 


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

## trimming 결과보기

In [80]:
print("Read %s sentence pairs" % len(pairs))
pairs = filterPairs(pairs)
print("Trimmed to %s sentence pairs" % len(pairs))
print("Counting words...")
for pair in pairs:
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])
print("Counted words:")
print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)
print(random.choice(pairs))

Read 3621 sentence pairs
Trimmed to 3144 sentence pairs
Counting words...
Counted words:
eng 2195
kor 4545
['stop crying .', '그만 울어 .']


# Seq2Seq Model

### Encoder - Decoder network

<img src="./image/seq2seq.png" width="600" height="200">

### LSTM and GRU(Gated Recurrent Unit)
<img src="./image/rnnunit.png" width="800" height="300">

- LSTM : hidden state + memory cell

- GRU : memory cell이 hidden state에 포함된 경량화된 모델

$$h^{(t)} = u_t\odot h^{(t-1)}+(1-u_t)\odot\sigma(b+W(r_t\odot h^{(t-1)})+Ux^{(t)})$$
$$u_t = \sigma(b^u+W^uh^{(t-1)}+U^ux^{(t)})$$
$$r_t = \sigma(b^r+W^rh^{(t-1)}+U^rx^{(t)})$$

In [81]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        # input size : 입력 문장의 길이
        # hidden size : context vector의 차원
        
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        # 처음 들어가는 hidden state
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

## Attention Model

<img src="./image/attention.jpg" width="500" height="200">

In [82]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        ## input embedding과 이전 hidden state를 이용해서 attention weight 구함
        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        
        ## encoder output에 위에서 구한 attention weight를 곱해서 weighted combination을 구함
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        ## input embedding과 context vector를 이용해서 예측하기
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Preparing Training Data

## indexes From Sentence : 
- lang.word2index : laguage(한/영) 별로 word 별로 index가 저장되어있는 dictionary
- 문장에서 각 단어의 index 를 순서대로 저장

## tensor From Sentence :
- 위 함수로 얻은 index list에 EOS 토큰을 마지막에 추가
- tensor로 변환

## tensors From Pair :
- 한/영 문장 pair 에 대해 위 2개의 함수를 각각 적용

In [83]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# Training Model

## train : 매 iteration 반복되는 학습과정


In [84]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]  # Teacher forcing

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [85]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))



## trainIters  
- SGD 이용 
- n epochs
- train_pairs : n번의 epoch에서 사용할 각각 샘플 1개씩

In [86]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    return plot_losses

In [87]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]
    
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [88]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

losses = trainIters(encoder = encoder1, decoder = attn_decoder1, n_iters = 50000, print_every=5000)


3m 18s (- 29m 45s) (5000 10%) 4.6007
6m 39s (- 26m 37s) (10000 20%) 3.6873
10m 2s (- 23m 24s) (15000 30%) 2.7951
13m 24s (- 20m 6s) (20000 40%) 2.0665
16m 36s (- 16m 36s) (25000 50%) 1.4143
19m 49s (- 13m 12s) (30000 60%) 0.9878
23m 7s (- 9m 54s) (35000 70%) 0.6573
26m 33s (- 6m 38s) (40000 80%) 0.4366
30m 5s (- 3m 20s) (45000 90%) 0.2920
33m 41s (- 0m 0s) (50000 100%) 0.2136


In [89]:
evaluateRandomly(encoder1, attn_decoder1)

> do you really think we can help ?
= 너 진짜 우리가 도울 수 있을 거라고 생각해 .
< 너 진짜 우리가 도울 수 있을 거라고 생각해 . <EOS>

> i ll kill him .
= 나는 그를 죽일 것이다 .
< 나는 그를 죽일 것이다 . <EOS>

> sorry but you can t go in there .
= 미안한데, 거기 들어오면 안 돼 .
< 미안한데, 거기 들어오면 안 돼 . <EOS>

> tom is a family oriented person .
= 톰은 가정 중심적인 사람이야 .
< 톰은 가정 중심적인 사람이야 . <EOS>

> does anybody here know how this thing works ?
= 이걸 어떻게 하는지 아는 사람 .
< 이걸 어떻게 하는지 아는 사람 . <EOS>

> i m feeling really confident now .
= 난 지금 자신감에 넘치고 있어 .
< 난 지금 자신감에 넘치고 있어 . <EOS>

> we talked .
= 우린 서로 얘기했어 .
< 우린 서로 얘기했어 . <EOS>

> nothing could sway his conviction .
= 그 어떤 것도 그의 신념을 꺾을 수 없었다 .
< 그 어떤 것도 그의 신념을 꺾을 수 없었다 . <EOS>

> i shook tom awake .
= 난 톰을 흔들어 깨웠어 .
< 난 톰을 흔들어 깨웠어 . <EOS>

> i caught a cold .
= 감기에 걸렸어 .
< 감기에 걸렸어 . <EOS>



In [91]:
for _ in range(10) : 
    pair = random.choice(pairs)
    eng = pair[0]
    kor = pair[1] 
    output_words, attentions = evaluate(encoder1, attn_decoder1, eng)
    output_sentence = ' '.join(output_words)
    print("\n")
    print("%d th test-------------------------------\n" %(_+1))
    print("input sentence : " + eng)
    print("target sentence : " + kor)
    print("output sentence : " + output_sentence)
    eng = eng.split(' ')
    print("\n")
    print("------------ Attention -------------")
    for i in range(len(attentions)) :
        attention_score = attentions[i]
        _ , idx = attention_score.topk(1)
        if idx[0].item() < len(eng):
            print("%s <- %s" %(output_words[i], eng[idx]))



1 th test-------------------------------

input sentence : it was a rhetorical question .
target sentence : 이건 수사적인 질문이었어 .
output sentence : 이건 수사적인 질문이었어 . <EOS>


------------ Attention -------------
이건 <- a
수사적인 <- rhetorical
질문이었어 <- question
. <- .
<EOS> <- it


2 th test-------------------------------

input sentence : everything stopped .
target sentence : 모든 것이 멈췄어 .
output sentence : 모든 것이 멈췄어 . <EOS>


------------ Attention -------------
모든 <- .
<EOS> <- .


3 th test-------------------------------

input sentence : i don t sleep a lot .
target sentence : 나는 많이 자지 않아요 .
output sentence : 나는 잠이 많지 않아요 . <EOS>


------------ Attention -------------
나는 <- t
잠이 <- sleep
많지 <- a
않아요 <- a
. <- .
<EOS> <- t


4 th test-------------------------------

input sentence : it works .
target sentence : 작동하네 .
output sentence : 되네 . <EOS>


------------ Attention -------------
되네 <- .
<EOS> <- it


5 th test-------------------------------

input sentence : tom is a very dedicated actor 

In [106]:
pair = random.choice(pairs)
eng = 'tom is very dedicated man .'
output_words, attentions = evaluate(encoder1, attn_decoder1, eng)
output_sentence = ' '.join(output_words)
print("\n")
print("%d th test-------------------------------\n" %(_+1))
print("input sentence : " + eng)
print("output sentence : " + output_sentence)



1 th test-------------------------------

input sentence : tom is very dedicated man .
output sentence : 톰은 아마 나한테 하모니카를 프랑스어를 가르쳤어 . <EOS>


In [93]:
print(pair)

['tom currently lives alone in a small apartment .', '탐은 현재 작은 아파트에서 혼자 살고 있다 .']
