In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Seed every RNG this notebook draws from so that Restart & Run All is
# reproducible: torch's generator and the stdlib `random` module, which is
# what `random.choice` uses when sampling sentence pairs.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
# Use the GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Toy parallel corpus: each entry is "<English source>\t<Korean target>".
# The separator is written as an explicit "\t" escape so it stays visible in
# the source and survives editors that convert literal tabs to spaces.
raw = [
    "I feel hungry.\t나는 배가 고프다.",
    "Pytorch is very easy.\t파이토치는 매우 쉽다.",
    "Pytorch is a framework for deep learning.\t파이토치는 딥러닝을 위한 프레임워크이다.",
    "Pytorch is very clear to use.\t파이토치는 사용하기 매우 직관적이다.",
]

In [4]:
# Reserved vocabulary indices for the two sentence-boundary tokens.
SOS_TOKEN = 0  # start of sentence
EOS_TOKEN = 1  # end of sentence

In [5]:

class Vocab:
    """Word <-> index vocabulary grown incrementally from space-split sentences.

    Attributes:
        vocab2index: Maps each token to its integer index. Starts with the
            special tokens "<SOS>" and "<EOS>" at SOS_TOKEN and EOS_TOKEN.
        index2vocab: Inverse of ``vocab2index`` (index -> token).
        index2voab: Legacy, misspelled alias of ``index2vocab``. It is the
            same dict object, kept so existing code using the old name works.
        vocab_count: Number of times each token has been passed through
            ``add_vocab``.
        n_vocab: Total number of entries, special tokens included; also the
            index that the next unseen word will receive.
    """

    def __init__(self):
        self.vocab2index = {"<SOS>": SOS_TOKEN, "<EOS>": EOS_TOKEN}
        self.index2vocab = {SOS_TOKEN: "<SOS>", EOS_TOKEN: "<EOS>"}
        # Backward-compatible alias for the original (misspelled) attribute.
        self.index2voab = self.index2vocab
        self.vocab_count = {}
        self.n_vocab = len(self.vocab2index)

    def add_vocab(self, sentence):
        """Register every space-separated word of ``sentence``.

        Unseen words receive the next free index; every word, seen or not,
        has its occurrence count incremented.
        """
        for word in sentence.split(" "):
            if word not in self.vocab2index:
                self.vocab2index[word] = self.n_vocab
                self.index2vocab[self.n_vocab] = word
                self.n_vocab += 1
            # The special tokens are pre-registered in vocab2index but have no
            # vocab_count entry, so a plain `+= 1` would raise KeyError when a
            # sentence literally contains "<SOS>" or "<EOS>".
            self.vocab_count[word] = self.vocab_count.get(word, 0) + 1

In [27]:
# pair: [source sentence, target sentence].
# For translation the source and target sides get separate maximum lengths.
def filter_pair(pair, source_max_length, target_max_length):
    """Tell whether both sentences of ``pair`` are short enough to keep.

    A sentence's length is the number of pieces produced by splitting it on
    single spaces. The pair passes only if the source length is strictly
    below ``source_max_length`` and the target length is strictly below
    ``target_max_length``.
    """
    source_words = pair[0].split(" ")
    if len(source_words) >= source_max_length:
        # Source side is already too long; the target side is not inspected.
        return False
    target_words = pair[1].split(" ")
    return len(target_words) < target_max_length

In [25]:
# Turn the raw corpus into parallel source/target pairs plus one vocabulary per side.
def preprocess(corpus, source_max_length, target_max_length):
    """Split, filter and index a tab-separated parallel corpus.

    Every line of ``corpus`` is stripped, lower-cased and split on tab
    characters. Pairs rejected by ``filter_pair`` are dropped, and the
    remaining ones feed a source ``Vocab`` (field 0) and a target ``Vocab``
    (field 1). Both vocabulary sizes are printed.

    Returns:
        A tuple ``(pairs, source_vocab, target_vocab)``.
    """
    candidates = [line.strip().lower().split("\t") for line in corpus]

    pairs = []
    for candidate in candidates:
        if filter_pair(candidate, source_max_length, target_max_length):
            pairs.append(candidate)

    source_vocab, target_vocab = Vocab(), Vocab()
    for kept in pairs:
        source_vocab.add_vocab(kept[0])
        target_vocab.add_vocab(kept[1])

    print('source vocab size =', source_vocab.n_vocab)
    print('target vocab size =', target_vocab.n_vocab)

    return pairs, source_vocab, target_vocab


In [11]:
# Try preprocess() on the toy corpus with both length caps set to 10.
# NOTE(review): the output saved under this cell looks like it came from the
# now commented-out debug print inside preprocess — re-run to refresh it.
preprocess(raw, 10, 10)

[[['i feel hungry.', '나는 배가 고프다.']]]
[[['i feel hungry.', '나는 배가 고프다.']], [['pytorch is very easy.', '파이토치는 매우 쉽다.']]]
[[['i feel hungry.', '나는 배가 고프다.']], [['pytorch is very easy.', '파이토치는 매우 쉽다.']], [['pytorch is a framework for deep learning.', '파이토치는 딥러닝을 위한 프레임워크이다.']]]
[[['i feel hungry.', '나는 배가 고프다.']], [['pytorch is very easy.', '파이토치는 매우 쉽다.']], [['pytorch is a framework for deep learning.', '파이토치는 딥러닝을 위한 프레임워크이다.']], [['pytorch is very clear to use.', '파이토치는 사용하기 매우 직관적이다.']]]


In [28]:
# Run preprocess() on the toy corpus; the cell displays the returned
# (pairs, source_vocab, target_vocab) tuple.
preprocess(raw, 10, 10)

source vocab size = 17
target vocab size = 13


([['i feel hungry.', '나는 배가 고프다.'],
  ['pytorch is very easy.', '파이토치는 매우 쉽다.'],
  ['pytorch is a framework for deep learning.', '파이토치는 딥러닝을 위한 프레임워크이다.'],
  ['pytorch is very clear to use.', '파이토치는 사용하기 매우 직관적이다.']],
 <__main__.Vocab at 0x7f9ea16c3210>,
 <__main__.Vocab at 0x7f9f053adf50>)

In [38]:
class Encoder(nn.Module):
    """Single-token GRU encoder: embeds one input index and advances the GRU one step.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width shared by the embedding vectors and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, hidden):
        """Embed ``x``, reshape it to (1, 1, -1) and run one GRU step.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU.
        """
        step_input = self.embedding(x).view(1, 1, -1)
        output, next_hidden = self.gru(step_input, hidden)
        return output, next_hidden

In [19]:
class Decoder(nn.Module):
    """Single-token GRU decoder that emits log-probabilities over the output vocabulary.

    Args:
        hidden_size: Width shared by the embedding vectors and the GRU state.
        output_size: Number of embedding rows and of output classes.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Embed ``x``, run one GRU step and project the result to log-probabilities.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` is the log-softmax
            (over dim 1) of the linear projection of the GRU output's first
            time step, and ``hidden`` is the updated GRU state.
        """
        step_input = self.embedding(x).view(1, 1, -1)
        gru_output, next_hidden = self.gru(step_input, hidden)
        logits = self.out(gru_output[0])
        log_probs = self.softmax(logits)
        return log_probs, next_hidden
    

In [50]:
def tensorize(vocab, sentence):
    """Convert a sentence into a column tensor of vocabulary indices.

    The sentence is split on single spaces, each word is looked up in
    ``vocab.vocab2index``, and the index of "<EOS>" is appended.

    Args:
        vocab: Object exposing a ``vocab2index`` mapping (word -> int).
        sentence: Space-separated sentence whose words are all in the vocab.

    Returns:
        A ``torch.long`` tensor of shape (number of words + 1, 1) on ``device``.

    Raises:
        KeyError: If a word (or "<EOS>") is missing from ``vocab.vocab2index``.
    """
    indices = [vocab.vocab2index[word] for word in sentence.split(" ")]
    indices.append(vocab.vocab2index['<EOS>'])

    # Build the integer tensor directly on the target device. Going through
    # torch.Tensor(...) first creates a float32 tensor, which cannot represent
    # every integer above 2**24 and would silently corrupt large indices.
    return torch.tensor(indices, dtype=torch.long, device=device).view(-1, 1)

In [51]:
# Length caps handed to preprocess(): a pair is kept only when the word count
# of each side is strictly below the corresponding cap.
SOURCE_MAX_LENGTH = 10
TARGET_MAX_LENGTH = 12

In [52]:
# Build the filtered sentence pairs and both vocabularies from the toy corpus,
# then print one randomly chosen pair as a sanity check.
load_pairs, load_source_vocab, load_target_vocab = preprocess(raw, SOURCE_MAX_LENGTH, TARGET_MAX_LENGTH)
print(random.choice(load_pairs))

source vocab size = 17
target vocab size = 13
['pytorch is a framework for deep learning.', '파이토치는 딥러닝을 위한 프레임워크이다.']


In [53]:
# Encoder and decoder share one hidden size; each model's embedding table is
# sized by the vocabulary it reads from (source for enc, target for dec).
enc_hidden_size = 16
dec_hidden_size = enc_hidden_size
enc = Encoder(load_source_vocab.n_vocab, enc_hidden_size).to(device)
dec = Decoder(dec_hidden_size, load_target_vocab.n_vocab).to(device)

In [54]:
def train(pairs, source_vocab, target_vocab, encoder, decoder, n_iter, print_every=100, learning_rate=0.01):
    """Train ``encoder`` and ``decoder`` in place with SGD on randomly sampled pairs.

    Each iteration takes one sampled pair, feeds the source tensor to the
    encoder one token at a time, hands the final encoder hidden state to the
    decoder, and decodes the target with teacher forcing (the ground-truth
    token is always the next decoder input). The summed NLL loss over the
    target tokens is back-propagated and both optimizers take one step.

    Args:
        pairs: Sequence of ``[source_sentence, target_sentence]`` items.
        source_vocab: Vocabulary used to tensorize the source sentences.
        target_vocab: Vocabulary used to tensorize the target sentences; its
            ``vocab2index["<SOS>"]`` entry is the first decoder input.
        encoder: Module called as ``encoder(token, hidden) -> (output, hidden)``
            and exposing ``hidden_size``.
        decoder: Module called as ``decoder(token, hidden) -> (log_probs, hidden)``.
        n_iter: Number of training iterations (one sampled pair each).
        print_every: Print the running average loss every this many
            iterations; a falsy value disables printing.
        learning_rate: Learning rate for both SGD optimizers.

    Returns:
        None. The models' parameters are updated in place.
    """
    loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Sample with replacement and tensorize everything up front.
    training_batch = [random.choice(pairs) for _ in range(n_iter)]

    training_source = [tensorize(source_vocab, pair[0]) for pair in training_batch]
    training_target = [tensorize(target_vocab, pair[1]) for pair in training_batch]

    # The decoder produces log-probabilities, so NLLLoss is the matching criterion.
    criterion = nn.NLLLoss()
    sos_index = target_vocab.vocab2index["<SOS>"]

    for i in range(1, n_iter + 1):
        source_tensor = training_source[i - 1]
        target_tensor = training_target[i - 1]
        target_length = target_tensor.size(0)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encode: start from a zero state on the same device as the data and
        # consume the source sentence one token at a time.
        hidden = torch.zeros(1, 1, encoder.hidden_size, device=source_tensor.device)
        for source_token in source_tensor:
            _, hidden = encoder(source_token, hidden)

        # Decode with teacher forcing, starting from <SOS> and the final
        # encoder state.
        decoder_input = torch.tensor([[sos_index]], dtype=torch.long, device=target_tensor.device)
        loss = 0
        for target_token in target_tensor:
            log_probs, hidden = decoder(decoder_input, hidden)
            loss = loss + criterion(log_probs, target_token)
            decoder_input = target_token

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        # Track the per-token loss so sentences of different length are comparable.
        loss_total += loss.item() / target_length

        if print_every and i % print_every == 0:
            loss_avg = loss_total / print_every
            loss_total = 0
            print("[{} - {:.1f}%] loss = {:05.4f}".format(i, i / n_iter * 100, loss_avg))
    

In [None]:
# Scratch check of the sampling and tensorizing steps that train() performs:
# draw 10 random pairs and convert each side to a column tensor of indices.
training_batch = [random.choice(load_pairs) for _ in range(10)]
print(training_batch)
training_source = [tensorize(load_source_vocab, pair[0]) for pair in training_batch]
training_target = [tensorize(load_target_vocab, pair[1]) for pair in training_batch]

# Inspect the resulting index tensors.
print(training_source)
print(training_target)

In [56]:
# Zero initial hidden state for the encoder, shaped (1, 1, hidden_size).
# It is moved to `device` so it lives on the same device as `enc`; a CPU
# tensor would not be usable with a model that was moved to the GPU.
encoder_hidden = torch.zeros([1, 1, enc.hidden_size]).to(device)

In [57]:
# Shape of the first sampled source tensor: (number of words + 1 for <EOS>, 1).
training_source[0].shape

torch.Size([5, 1])

In [58]:
# Zero initial hidden state for the encoder, moved to the active device; the
# trailing expression displays its shape.
encoder_hidden = torch.zeros([1,1,enc.hidden_size]).to(device)
encoder_hidden.shape

torch.Size([1, 1, 16])

In [60]:
# output, encoder_hidden = enc(training_source[0][0], encoder_hidden)  # second argument must be the hidden-state tensor, not the integer enc_hidden_size