In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random


device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(0)
if device == 'cuda':
    torch.cuda.manual_seed(0)


In [2]:
raw = ["I feel hungry.	나는 배가 고프다.",
       "Pytorch is very easy.	파이토치는 매우 쉽다.",
       "Pytorch is a framework for deep learning.	파이토치는 딥러닝을 위한 프레임워크이다.",
       "Pytorch is very clear to use.	파이토치는 사용하기 매우 직관적이다."]

In [3]:
# fix token for "start of sentence" and "end of sentence"
SOS_token = 0
EOS_token = 1

In [4]:
class Vocab:
    def __init__(self):
        self.vocab2index = {"<SOS>": SOS_token, "<EOS>": EOS_token}
        self.index2vocab = {SOS_token: "<SOS>", EOS_token: "<EOS>"}
        self.vocab_count = {} # 특정 단어의 개수
        self.n_vocab = len(self.vocab2index) # vocab vector의 길이
    def add_vocab(self, sentence):
        for word in sentence.split(" "):
            if word not in self.vocab2index: # 기존에 없던 것 추가됐을 때
                self.vocab2index[word] = self.n_vocab
                self.vocab_count[word] = 1
                self.index2vocab[self.n_vocab] = word
                self.n_vocab += 1
            else:
                self.vocab_count[word] += 1

In [5]:
# filter out the long sentence from source and target data
def filter_pair(pair, source_max_length, target_max_length):
    # 논리값 출력
    # source와 target 모두 max_length이내이면 통과시킨다.
    return len(pair[0].split(" ")) < source_max_length and len(pair[1].split(" ")) < target_max_length

In [6]:
# read & process
def preprocess(corpus, source_max_length, target_max_length):
    pairs = []
    for line in corpus: # 한 문장씩 뽑아와서 빈 칸 없애고, 소문자로하고, 공백기준으로 원문과 번역으로 나눈다.
        pairs.append([s for s in line.strip().lower().split("\t")])
    print("read {} sentences pairs".format(len(pairs)))
    
    pairs = [pair for pair in pairs if filter_pair(pair, source_max_length, target_max_length)]
    print("trimmed to {} sentence pairs".format(len(pairs)))
    
    source_vocab = Vocab()
    target_vocab = Vocab()
    
    for pair in pairs:
        source_vocab.add_vocab(pair[0]) # 원문
        target_vocab.add_vocab(pair[1]) # 번역문
    print("source vocab size = ", source_vocab.n_vocab)
    print("target vocab size = ", target_vocab.n_vocab)
    return pairs, source_vocab, target_vocab

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
    def forward(self, x, hidden):
        x = self.embedding(x).view(1,1,-1)
        x, hidden = self.gru(x, hidden)
        return x, hidden
        

In [8]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size) # softmax skipped
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, x, hidden):
        x = self.embedding(x).view(1,1,-1)
        x, hidden = self.gru(x, hidden)
        x = self.out(x[0])
        x = self.softmax(x)
        return x, hidden

In [9]:
def tensorize(vocab, sentence):
    indexes = [vocab.vocab2index[word] for word in sentence.split(" ")]
    indexes.append(vocab.vocab2index["<EOS>"])
    return torch.Tensor(indexes).long().to(device).view(-1,1)

In [10]:
# training
def train(pairs, source_vocab, target_vocab, encoder, decoder, n_iter, print_every=1000, learning_rate=0.01, loss_total=0):
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    training_batch = [random.choice(pairs) for _ in range(n_iter)] # batch 생성
    # vocab to tensor
    training_source = [tensorize(source_vocab, pair[0]) for pair in training_batch]
    training_target = [tensorize(target_vocab, pair[1]) for pair in training_batch]
    
    # criterion = nn.CrossEntropyLoss() # [1,16]이 들어가야한다. 하지만 입력이 [1]이 되므로 안됨
    criterion = nn.NLLLoss()
    
    for i in range(1, n_iter + 1):
        source_tensor = training_source[i - 1]
        target_tensor = training_target[i - 1]
        
        encoder_hidden = torch.zeros([1,1,encoder.hidden_size]).to(device) # 초기 hidden은 0
        
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        source_length = source_tensor.size(0)
        target_length = target_tensor.size(0)
        
        loss = 0
        for enc_input in range(source_length):
            _, encoder_hidden = encoder(source_tensor[enc_input], encoder_hidden) # 입력만 주구장창
            
        decoder_input = torch.Tensor([[SOS_token]]).long().to(device) # decoder의 시작은 SOS
        decoder_hidden = encoder_hidden # encoder hidden은 decoder hidden으로 전이된다.
        
        for di in range(target_length): # decoder 학습
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
        
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        loss_iter = loss.item() / target_length
        loss_total += loss_iter
    
        if i % print_every == 0:
            loss_avg = loss_total / print_every
            loss_total = 0
            print("{} - {} loss = {:05.4f}".format(i, i / n_iter * 100, loss_avg))

In [11]:
# evaluate the result
def evaluate(pairs, source_vocab, target_vocab, encoder, decoder, target_max_length):
    for pair in pairs:
        print(">", pair[0])
        print("=", pair[0])
        source_tensor = tensorize(source_vocab, pair[0])
        source_length = source_tensor.size()[0]
        encoder_hidden = torch.zeros([1,1,encoder.hidden_size]).to(device) # encoder의 초기 hidden은 0
        
        for ei in range(source_length):
            _, encoder_hidden = encoder(source_tensor[ei], encoder_hidden)
        
        decoder_input = torch.Tensor([[SOS_token]], device=device).long()
        decoder_hidden = encoder_hidden
        decoded_words = []
        
        # topk : key/value에서 value를 기준으로 K번째로 높은 값을 구하는 것이다.
        # values, indexs = torch.topk(predict, k=k, dim=-1)
        # https://pytorch.org/docs/stable/generated/torch.topk.html#torch.topk
        for di in range(target_max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            _, top_index = decoder_output.data.topk(1)
            if top_index.item() == EOS_token: # EOS인지 확인
                decoded_words.append("<EOS>")
                break
            else:
                decoded_words.append(target_vocab.index2vocab[top_index.item()])
            decoder_input = top_index.squeeze().detach()
            
        predict_words = decoded_words
        predict_sentence = " ".join(predict_words)
        print("<", predict_sentence)
        print("")

In [12]:
SOURCE_MAX_LENGTH = 10
TARGET_MAX_LENGTH = 12

In [13]:
load_pairs, load_source_vocab, load_target_vocab = preprocess(raw, SOURCE_MAX_LENGTH, TARGET_MAX_LENGTH)
print(random.choice(load_pairs)) # list에서 아무거나 하나 뽑아준다.

read 4 sentences pairs
trimmed to 4 sentence pairs
source vocab size =  17
target vocab size =  13
['pytorch is very easy.', '파이토치는 매우 쉽다.']


In [14]:
enc_hidden_size = 16
dec_hidden_size = 16
enc = Encoder(load_source_vocab.n_vocab, enc_hidden_size).to(device)
dec = Decoder(dec_hidden_size, load_target_vocab.n_vocab).to(device)

In [15]:
train(load_pairs, load_source_vocab, load_target_vocab, enc, dec, 5000, print_every=1000)

1000 - 20.0 loss = 0.7360
2000 - 40.0 loss = 0.1106
3000 - 60.0 loss = 0.0358
4000 - 80.0 loss = 0.0187
5000 - 100.0 loss = 0.0127


In [16]:
evaluate(load_pairs, load_source_vocab, load_target_vocab, enc, dec, TARGET_MAX_LENGTH)

> i feel hungry.
= i feel hungry.
< 나는 배가 고프다. <EOS>

> pytorch is very easy.
= pytorch is very easy.
< 파이토치는 매우 쉽다. <EOS>

> pytorch is a framework for deep learning.
= pytorch is a framework for deep learning.
< 파이토치는 딥러닝을 위한 프레임워크이다. <EOS>

> pytorch is very clear to use.
= pytorch is very clear to use.
< 파이토치는 사용하기 매우 직관적이다. <EOS>

