In [1]:
%matplotlib inline

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from konlpy.tag import Komoran; tokenizer = Komoran()

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Corpus location: the corpus directory is data/<corpus_name>, and
# textfilename is the name of the text file inside it.
corpus_name = "normal chatbot data"
corpus = os.path.join("data", corpus_name)
textfilename = "speech_data.txt"

def printLines(file, n=10):
    """Print the first ``n`` lines of a UTF-8 text file.

    The file is streamed rather than loaded completely, so previewing a
    large corpus only reads the lines that are actually printed.

    Args:
        file: Path of the text file to preview.
        n: Number of lines to print. ``None`` prints every line; a negative
            value keeps list-slice semantics (all but the last ``-n`` lines).
    """
    with open(file, 'r', encoding='utf-8') as datafile:
        if n is not None and n < 0:
            # Negative counts need the total length, so fall back to a slice.
            preview = datafile.readlines()[:n]
        else:
            preview = itertools.islice(datafile, n)
        # Each line keeps its trailing newline, so print() leaves a blank
        # line between entries.
        for line in preview:
            print(line)

# Build the corpus file path once, then preview it; datafile is reused by the loading step.
datafile = os.path.join(corpus, textfilename)
printLines(datafile)

곧 만기 은퇴를 앞두고 있어. 노후 준비를 미리 꼼꼼하게 해두어서 기뻐.	노후 준비를 미리 꼼꼼하게 해두어서 기쁘시겠군요.

이렇게 노후 준비를 할 수 있는 건 옆에서 도와준 친구 덕분이야. 이 고마움을 뭐로 갚지?	노후 준비를 도운 친구에게 고마움을 느끼시고 있군요. 상대방에게 고마움을 표현할 수 있는 좋은 방법이 있을까요?

그 친구가 술을 좋아하니까 조만간 술을 들고 친구 집에 찾아가야겠어.	선물로 친구에게 고마움을 표현할 수 있으면 좋겠어요.

친구가 노후 자금 마련을 도와줘서 노후 걱정이 없어. 마음이 아주 든든해.	노후자금 마련을 도와준 친구 덕분에 노후 걱정이 없어 기쁘시겠군요.

다 그 친구가 주식 정보를 물어다 준 덕분이지. 이 고마움을 뭐로 표현하지?	주식 정보를 알려준 친구에게 고마움을 느끼시고 있군요. 이 고마움을 무엇으로 표현할 수 있을까요?

친구에게 내가 잘 된 건 다 너 덕분이라고 고맙다는 문자 한 통 넣어야겠어.	문자로 친구에게 고마움을 표현할 수 있으면 좋겠어요.

회사에 다시 복직하게 돼서 기쁘네. 부당 해고 당할까 봐 어찌나 맘 졸였는지.	부당하게 해고당할 뻔 한 회사에 다시 복직하게 돼서 기쁘시겠군요.

이건 다 처음 해고당했을 때 같이 목숨 걸고 싸워준 우리 노조 위원장 덕분이야.	처음 해고당했을 때 함께 싸워준 분께 고마워하시고 있군요. 어떻게 이 고마운 마음을 표현할 수 있을까요?

조만간 제대로 인사차 노조 사무실에 들러야겠어. 복직하게 도와줘서 고맙다고 해야지.	직접 방문해 인사를 드려서 고마움이 표현되면 좋겠어요.

회사에서 정년퇴직 후에 대리점 사장을 하면 어떻겠냐고 물어 보더군. 기분이 참 좋아.	정년퇴직 후에 대리점 사장을 제안해서 기분이 좋으시겠어요.



In [4]:
# Default word tokens
PAD_token = 0  # Zero token used to pad short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    """Vocabulary mapping morphemes to integer indexes and back.

    The PAD, SOS and EOS tokens are pre-registered in ``index2word``; every
    other word receives the next free index the first time it is added.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # counts SOS, EOS and PAD

    def addSentence(self, sentence):
        """Split ``sentence`` with the module-level tokenizer and add each morpheme."""
        for word in tokenizer.morphs(sentence):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word`` under a new index, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        """Drop every word that occurred fewer than ``min_count`` times.

        Runs at most once per instance. Surviving words are re-indexed from
        scratch and their counts restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total_words = len(self.word2index)
        # Guard the ratio so trimming an empty vocabulary does not divide by zero.
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, kept_ratio
        ))

        # Reset the dictionaries before re-adding the surviving words
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # counts the default tokens

        for word in keep_words:
            self.addWord(word)

In [5]:
MAX_LENGTH = 15  # Maximum sentence length to consider

# Strip combining accent marks from a Unicode string.
# See https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` with combining marks (Unicode category ``Mn``) removed.

    The text is decomposed (NFD) so accents become separate code points, the
    marks are dropped, and the remainder is recomposed (NFC). Without the
    final recomposition, Hangul syllables would be returned split into
    conjoining jamo instead of their original precomposed form.
    """
    stripped = ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
    return unicodedata.normalize('NFC', stripped)

# Delete every character that is not in the allowed set
def normalizeString(s):
    """Strip unsupported characters from ``s``.

    Kept: spaces, Hangul jamo and syllables, ASCII letters and digits, and
    the literal characters ``. ! ? ; ^`` and the star symbol that appears in
    the pattern. Any other run of characters is removed.
    """
    return re.sub('[^ ㄱ-ㅣ가-힣 ^☆; ^a-zA-Z.!?;0-9]+', '', s)

# Read the query/response pairs and return them with a new Voc object
def readVocs(datafile, corpus_name):
    """Load tab-separated sentence pairs from ``datafile``.

    Args:
        datafile: Path of a UTF-8 text file with one tab-separated pair per line.
        corpus_name: Name given to the returned vocabulary.

    Returns:
        ``(voc, pairs)``: an empty ``Voc`` and a list holding, for every
        line, the list of its normalized tab-separated fields.
    """
    print("Reading lines...")
    # Read the file inside a context manager so the handle is always closed
    with open(datafile, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    # Split each line on tabs and normalize every field
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Tell whether both sentences of pair 'p' are shorter than MAX_LENGTH
def filterPair(p):
    """Return True when both sentences tokenize to fewer than MAX_LENGTH morphemes."""
    # The strict comparison leaves room for the EOS token appended later
    if len(tokenizer.morphs(p[0])) >= MAX_LENGTH:
        return False
    return len(tokenizer.morphs(p[1])) < MAX_LENGTH

# Keep only the pairs accepted by filterPair
def filterPairs(pairs):
    """Return a new list with the pairs for which ``filterPair`` is true."""
    return list(filter(filterPair, pairs))

# Build the voc object and the filtered pair list with the helpers above
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    """Read the corpus, drop over-long pairs and count the remaining words.

    ``corpus`` and ``save_dir`` are accepted but not used by this function.

    Returns:
        ``(voc, pairs)``: the populated vocabulary and the filtered pairs.
    """
    print("Start preparing training data ...")
    voc, all_pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(all_pairs)))
    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to {!s} sentence pairs".format(len(kept_pairs)))
    print("Counting words...")
    for pair in kept_pairs:
        # Register the input sentence first, then the response
        for position in (0, 1):
            voc.addSentence(pair[position])
    print("Counted words:", voc.num_words)
    return voc, kept_pairs


# Load the corpus and build voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print a few pairs as a sanity check
print("\npairs:")
for pair in pairs[:10]:

    print(pair)

Start preparing training data ...
Reading lines...
Read 27513 sentence pairs
Trimmed to 2366 sentence pairs
Counting words...
Counted words: 2653

pairs:
['감사하더라고. 좋지.', '감사한 마음을 어떻게 전하면 좋을까요?']
['고맙다고 전화라도 많이 해야겠어.', '전화로 고마운 마음이 전달되면 좋겠어요.']
['남편에게 편지를 적어봐야겠어.', '남편 분에게 마음이 잘 전달되면 좋겠어요.']
['손은 떨리지만 아직 일을 더 하고 싶어.', '그렇게 하려면 어떻게 해야 할까요?']
['아빠와 이야기를 좀 나눠봐야겠어.', '아버지와의 대화가 도움이 되길 바랄게요.']
['박 씨를 위해서 병문안을 가려고 해.', '병문안으로 기쁜 감정이 계속 유지되길 바랄게요.']
['지난번 기말고사보다 성적이 더 떨어져서 걱정이야.', '성적이 더 떨어지셨군요. 많이 속상하시겠어요.']
['이놈 저놈 아주 내가 은행인줄 아는구먼!', '왜 화가 나셨나요? 무슨 일이 있었나요?']
['이렇게 계속 약속을 안 지켜서 너무 속상해.', '아버지가 어떻게 하셨으면 좋으시겠어요?']
['아니. 형은 계속 이유 없이 나를 괴롭혀.', '어떻게 하면 형이 그만 괴롭힐까요?']


In [6]:
MIN_COUNT = 2    # Occurrence-count threshold used when trimming rare words

def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from ``voc`` and drop every pair that uses one.

    Args:
        voc: Vocabulary; its ``trim`` method is called with ``MIN_COUNT``.
        pairs: List of sentence pairs (input sentence, output sentence).
        MIN_COUNT: Words used fewer times than this are removed from ``voc``.

    Returns:
        The pairs whose two sentences contain only words still in ``voc``.
    """
    # Remove the words used fewer than MIN_COUNT times from voc
    voc.trim(MIN_COUNT)

    def _all_known(sentence):
        # True when every morpheme of the sentence survived the trim
        return all(word in voc.word2index for word in tokenizer.morphs(sentence))

    # Keep a pair only when both its input and its output are fully known
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    # Guard the ratio so an empty pair list does not divide by zero
    kept_ratio = len(keep_pairs) / len(pairs) if len(pairs) else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))
    return keep_pairs


# Trim voc and pairs; note that this rebinds the module-level `pairs`
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 1611 / 2650 = 0.6079
Trimmed from 2366 pairs to 1610, 0.6805 of total


In [7]:
def indexesFromSentence(voc, sentence):
    """Convert ``sentence`` to vocabulary indexes followed by the EOS token.

    Raises:
        KeyError: if a morpheme is missing from ``voc.word2index``.
    """
    token_ids = [voc.word2index[morph] for morph in tokenizer.morphs(sentence)]
    token_ids.append(EOS_token)
    return token_ids


def zeroPadding(l, fillvalue=PAD_token):
    """Transpose the sequences in ``l``, filling short ones with ``fillvalue``.

    Returns a list of tuples: entry ``t`` holds the ``t``-th element of every
    sequence, so the result has one entry per position of the longest sequence.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(columns)

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same layout as the nested sequence ``l``.

    Args:
        l: Iterable of token sequences.
        value: Token treated as padding. It was previously ignored in favour
            of the hard-coded PAD_token; the default keeps that behaviour.

    Returns:
        A list of lists holding 0 where the token equals ``value`` and 1
        everywhere else.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

# Return the padded input tensor together with the sequence lengths
def inputVar(l, voc):
    """Index and pad a batch of input sentences.

    Returns:
        ``(padVar, lengths)``: a LongTensor built from the zero-padded index
        lists, and a tensor with the index-list length of each sentence.
    """
    token_rows = []
    row_lengths = []
    for sentence in l:
        row = indexesFromSentence(voc, sentence)
        token_rows.append(row)
        row_lengths.append(len(row))
    lengths = torch.tensor(row_lengths)
    padVar = torch.LongTensor(zeroPadding(token_rows))
    return padVar, lengths

# Return the padded target tensor, the padding mask and the maximum target length
def outputVar(l, voc):
    """Index and pad a batch of target sentences.

    Returns:
        ``(padVar, mask, max_target_len)``: a LongTensor built from the
        zero-padded index lists, a boolean mask that is False at padding
        positions, and the length of the longest index list.
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(len(indexes) for indexes in indexes_batch)
    padList = zeroPadding(indexes_batch)
    # Boolean masks are what masked_select expects; uint8 masks are deprecated
    mask = torch.BoolTensor(binaryMatrix(padList))
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Return every item needed to train on one batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a list of sentence pairs into padded training tensors.

    The pairs are ordered by the whitespace word count of the input sentence
    (longest first). A sorted copy is used, so the caller's list is left in
    its original order.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        ``inputVar`` and ``outputVar``.
    """
    ordered = sorted(pair_batch, key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

In [8]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded input sequences."""

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Both input_size and hidden_size of the GRU are set to 'hidden_size'
        # because the input is a word embedding with hidden_size features.
        # Dropout is disabled for a single layer.
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: Word-index tensor fed to the embedding layer.
            input_lengths: Length of each sequence (tensor or list of ints);
                it may live on any device.
            hidden: Optional initial GRU hidden state.

        Returns:
            ``(outputs, hidden)``: the summed forward and backward GRU
            outputs, and the final GRU hidden state.
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack_padded_sequence requires the lengths on the CPU, even when the
        # batch itself is on the GPU
        cpu_lengths = torch.as_tensor(input_lengths).cpu()
        # Pack the padded batch for the RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, cpu_lengths, enforce_sorted=False)
        # Forward pass through the GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack the padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the outputs of the two GRU directions
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return the outputs and the final hidden state
        return outputs, hidden

In [9]:
# Luong attention layer
class Attn(nn.Module):
    """Attention scores between a decoder state and the encoder outputs.

    Supports the 'dot', 'general' and 'concat' scoring methods.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError("{!r} is not an appropriate attention method.".format(self.method))
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Initialise v explicitly: a bare FloatTensor(hidden_size) exposes
            # uninitialised memory, which may hold arbitrary (even NaN) values.
            self.v = nn.Parameter(torch.empty(hidden_size))
            bound = 1.0 / max(hidden_size, 1) ** 0.5
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Dot product of ``hidden`` and ``encoder_output`` over the last dimension."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Dot product of ``hidden`` with a linear projection of ``encoder_output``."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from a tanh layer over the concatenated states, weighted by ``v``."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights with an extra middle dimension."""
        # Compute the attention energies with the configured method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Swap the max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the normalised softmax scores (with an added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [10]:
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Keep the configuration for later reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers (GRU dropout is disabled for a single layer)
        gru_dropout = 0 if n_layers == 1 else dropout
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one time step.

        Returns:
            ``(output, hidden)``: softmax probabilities over the output
            vocabulary and the updated GRU hidden state.
        """
        # Embed the current input word and apply dropout
        embedded = self.embedding_dropout(self.embedding(input_step))
        # Forward pass through the unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Attention weights computed from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of the encoder outputs: the context vector
        context = torch.bmm(attn_weights, encoder_outputs.transpose(0, 1)).squeeze(1)
        # Combine the GRU output and the context vector (Luong eq. 5)
        combined = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = self.concat(combined).tanh()
        # Predict the next word (Luong eq. 6)
        probabilities = F.softmax(self.out(concat_output), dim=1)
        return probabilities, hidden

In [11]:
def maskNLLLoss(inp, target, mask):
    """Average negative log-likelihood over the unmasked targets.

    Args:
        inp: 2-D tensor of probabilities; the entry of each row selected by
            ``target`` is the probability of the correct token.
        target: Tensor with one target index per row of ``inp``.
        mask: Mask with one entry per row; rows with a zero/False entry are
            excluded. Both bool and uint8 masks are accepted.

    Returns:
        ``(loss, nTotal)``: the mean loss tensor (moved to ``device``) and
        the number of unmasked rows as a Python int.
    """
    # masked_select expects a boolean mask; uint8 masks are deprecated
    mask = mask.bool()
    nTotal = mask.sum()
    picked = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -torch.log(picked)
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [12]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch.

    Args:
        input_variable: Padded input index tensor passed to the encoder.
        lengths: Length of each input sequence; kept on the CPU for packing.
        target_variable: Padded target index tensor, indexed by time step.
        mask: Padding mask aligned with ``target_variable``.
        max_target_len: Number of decoding steps to run.
        encoder, decoder: Models being trained.
        embedding: Not used here; kept for interface compatibility.
        encoder_optimizer, decoder_optimizer: Optimizers stepped after backprop.
        batch_size: Number of sequences in the batch.
        clip: Maximum gradient norm applied to each model.
        max_length: Not used here; kept for interface compatibility.

    Returns:
        The token-weighted sum of the per-step losses divided by the total
        number of unmasked target tokens.
    """
    # Zero the gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Move the batch to the active device. The lengths stay on the CPU
    # because pack_padded_sequence requires them there.
    input_variable = input_variable.to(device)
    lengths = lengths.to("cpu")
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialise the accumulators
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through the encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Initial decoder input: one SOS token per sentence
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # The decoder starts from the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide whether this iteration uses teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Feed the batch through the decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: the next input is the current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: the next input is the decoder's own top prediction
            _, topi = decoder_output.topk(1)
            decoder_input = topi.detach().view(1, -1)
        # Compute and accumulate the loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Backpropagate
    loss.backward()

    # Gradient clipping: modifies the gradients in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Update the model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [13]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename, checkpoint=None):
    """Train for ``n_iteration`` batches, printing progress and saving checkpoints.

    Args:
        model_name, corpus_name, save_dir: Used to build the checkpoint directory.
        voc, pairs: Vocabulary and sentence pairs that batches are sampled from.
        encoder, decoder, embedding: Models whose state is trained and saved.
        encoder_optimizer, decoder_optimizer: Optimizers for the two models.
        encoder_n_layers, decoder_n_layers: Used in the checkpoint directory name.
        n_iteration: Number of the last iteration to run.
        batch_size: Number of pairs sampled (with replacement) per batch.
        print_every, save_every: Reporting and checkpointing intervals.
        clip: Gradient-norm limit forwarded to ``train``.
        loadFilename: Path of the checkpoint being resumed, or a falsy value.
        checkpoint: Optional already-loaded checkpoint dict. When resuming
            without it, the file at ``loadFilename`` is read to find the
            iteration to continue from.
    """
    # Sample the batch for every iteration up front
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initialisation
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        # Do not rely on a module-level `checkpoint` variable being defined
        if checkpoint is None:
            checkpoint = torch.load(loadFilename, map_location='cpu')
        start_iteration = checkpoint['iteration'] + 1

    # Take the size from the model instead of a module-level global
    hidden_size = encoder.hidden_size
    directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Unpack the fields of the batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run one training step on the batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Report progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save a checkpoint
        if (iteration % save_every == 0):
            # exist_ok avoids the check-then-create race of a separate exists() test
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [14]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: always feed the most probable token back into the decoder."""

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for a single input sequence.

        Returns:
            ``(all_tokens, all_scores)``: the chosen token indexes and the
            decoder score of each chosen token. Decoding does not stop early
            at EOS.
        """
        # Forward the input through the encoder
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # The encoder's final hidden state seeds the decoder. Use the wrapped
        # decoder rather than a module-level `decoder` variable.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Allocate on the same device as the input batch
        run_device = input_seq.device
        # The first decoder input is SOS_token
        decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_token
        # Tensors that the decoded tokens and scores are appended to
        all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
        all_scores = torch.zeros([0], device=run_device)
        # Decode one word token per step
        for _ in range(max_length):
            # Forward pass through the decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Most likely token and its score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record the token and the score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Add a dimension so the token can be the next decoder input
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return the collected tokens and scores
        return all_tokens, all_scores

In [15]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a reply for ``sentence`` and return it as a list of words.

    ``encoder`` and ``decoder`` are accepted but not used here; decoding is
    delegated to ``searcher``.

    Raises:
        KeyError: if ``sentence`` contains a word missing from ``voc``.
    """
    ### Format the input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Lengths tensor; it stays on the CPU because pack_padded_sequence requires that
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose the batch into the layout the model uses
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Move the batch to the active device
    input_batch = input_batch.to(device)
    # Decode with the searcher; no gradients are needed for inference
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive loop: read a sentence, print the bot's reply, repeat.

    Entering 'q' or 'quit' ends the loop. A KeyError raised while handling a
    sentence is reported as an unknown word and the loop continues.
    """
    while True:
        try:
            # Read the next sentence from the user
            input_sentence = input('> ')
            # Stop on the quit commands
            if input_sentence in ('q', 'quit'):
                break
            # Normalize, then decode a reply
            input_sentence = normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Drop the EOS and PAD markers before printing
            reply_words = [word for word in output_words if word not in ('EOS', 'PAD')]
            print('Bot:', ' '.join(reply_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [16]:
# Model configuration
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Checkpoint to load; leave as None to start from scratch.
loadFilename = None
checkpoint_iter = 5000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load the saved state when loadFilename is provided
if loadFilename:
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint trained on a GPU can also be restored on a CPU-only machine.
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialise the word embedding
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialise the encoder and decoder; both receive the same embedding module
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Move the models to the active device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [None]:
# Training and optimisation settings
clip = 50.0                    # gradient-norm limit passed to train()
teacher_forcing_ratio = 1.0    # train() uses teacher forcing when random.random() is below this
learning_rate = 0.0001
decoder_learning_ratio = 5.0   # the decoder learning rate is learning_rate times this factor
n_iteration = 5000
print_every = 1
save_every = 500

# Put the models (and their dropout layers) in training mode
encoder.train()
decoder.train()

# Initialise the optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any restored optimizer state to the active device. Calling .cuda()
# unconditionally would fail on a CPU-only machine when resuming.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run the training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...


  loss = crossEntropy.masked_select(mask).mean()


Iteration: 1; Percent complete: 0.0%; Average loss: 7.3833
Iteration: 2; Percent complete: 0.0%; Average loss: 7.2881
Iteration: 3; Percent complete: 0.1%; Average loss: 7.1608
Iteration: 4; Percent complete: 0.1%; Average loss: 6.9880
Iteration: 5; Percent complete: 0.1%; Average loss: 6.6673
Iteration: 6; Percent complete: 0.1%; Average loss: 6.3233
Iteration: 7; Percent complete: 0.1%; Average loss: 6.0517
Iteration: 8; Percent complete: 0.2%; Average loss: 6.0374
Iteration: 9; Percent complete: 0.2%; Average loss: 5.7972
Iteration: 10; Percent complete: 0.2%; Average loss: 5.4699
Iteration: 11; Percent complete: 0.2%; Average loss: 5.1481
Iteration: 12; Percent complete: 0.2%; Average loss: 4.9681
Iteration: 13; Percent complete: 0.3%; Average loss: 5.0448
Iteration: 14; Percent complete: 0.3%; Average loss: 4.8957
Iteration: 15; Percent complete: 0.3%; Average loss: 4.8109
Iteration: 16; Percent complete: 0.3%; Average loss: 4.7969
Iteration: 17; Percent complete: 0.3%; Average lo

Iteration: 138; Percent complete: 2.8%; Average loss: 2.2016
Iteration: 139; Percent complete: 2.8%; Average loss: 2.4575
Iteration: 140; Percent complete: 2.8%; Average loss: 2.2470
Iteration: 141; Percent complete: 2.8%; Average loss: 2.4293
Iteration: 142; Percent complete: 2.8%; Average loss: 2.4415
Iteration: 143; Percent complete: 2.9%; Average loss: 2.3851
Iteration: 144; Percent complete: 2.9%; Average loss: 2.1438
Iteration: 145; Percent complete: 2.9%; Average loss: 2.5613
Iteration: 146; Percent complete: 2.9%; Average loss: 2.2465
Iteration: 147; Percent complete: 2.9%; Average loss: 2.2448
Iteration: 148; Percent complete: 3.0%; Average loss: 2.2451
Iteration: 149; Percent complete: 3.0%; Average loss: 2.0616
Iteration: 150; Percent complete: 3.0%; Average loss: 2.2444
Iteration: 151; Percent complete: 3.0%; Average loss: 2.5409
Iteration: 152; Percent complete: 3.0%; Average loss: 2.4006
Iteration: 153; Percent complete: 3.1%; Average loss: 2.2571
Iteration: 154; Percent 

Iteration: 273; Percent complete: 5.5%; Average loss: 1.4222
Iteration: 274; Percent complete: 5.5%; Average loss: 1.5241
Iteration: 275; Percent complete: 5.5%; Average loss: 1.4713
Iteration: 276; Percent complete: 5.5%; Average loss: 1.5140
Iteration: 277; Percent complete: 5.5%; Average loss: 1.4615
Iteration: 278; Percent complete: 5.6%; Average loss: 1.5956
Iteration: 279; Percent complete: 5.6%; Average loss: 1.4507
Iteration: 280; Percent complete: 5.6%; Average loss: 1.6327
Iteration: 281; Percent complete: 5.6%; Average loss: 1.4503
Iteration: 282; Percent complete: 5.6%; Average loss: 1.5831
Iteration: 283; Percent complete: 5.7%; Average loss: 1.4301
Iteration: 284; Percent complete: 5.7%; Average loss: 1.5889
Iteration: 285; Percent complete: 5.7%; Average loss: 1.6139
Iteration: 286; Percent complete: 5.7%; Average loss: 1.5044
Iteration: 287; Percent complete: 5.7%; Average loss: 1.4672
Iteration: 288; Percent complete: 5.8%; Average loss: 1.5353
Iteration: 289; Percent 

Iteration: 408; Percent complete: 8.2%; Average loss: 1.1053
Iteration: 409; Percent complete: 8.2%; Average loss: 0.9829
Iteration: 410; Percent complete: 8.2%; Average loss: 1.1575
Iteration: 411; Percent complete: 8.2%; Average loss: 1.0065
Iteration: 412; Percent complete: 8.2%; Average loss: 1.0233
Iteration: 413; Percent complete: 8.3%; Average loss: 1.0744
Iteration: 414; Percent complete: 8.3%; Average loss: 1.0807
Iteration: 415; Percent complete: 8.3%; Average loss: 0.9507
Iteration: 416; Percent complete: 8.3%; Average loss: 1.2049
Iteration: 417; Percent complete: 8.3%; Average loss: 1.1128
Iteration: 418; Percent complete: 8.4%; Average loss: 1.0553
Iteration: 419; Percent complete: 8.4%; Average loss: 0.9668
Iteration: 420; Percent complete: 8.4%; Average loss: 0.9120
Iteration: 421; Percent complete: 8.4%; Average loss: 1.0146
Iteration: 422; Percent complete: 8.4%; Average loss: 1.0455
Iteration: 423; Percent complete: 8.5%; Average loss: 1.0190
Iteration: 424; Percent 

Iteration: 542; Percent complete: 10.8%; Average loss: 0.6189
Iteration: 543; Percent complete: 10.9%; Average loss: 0.6675
Iteration: 544; Percent complete: 10.9%; Average loss: 0.6668
Iteration: 545; Percent complete: 10.9%; Average loss: 0.6140
Iteration: 546; Percent complete: 10.9%; Average loss: 0.6580
Iteration: 547; Percent complete: 10.9%; Average loss: 0.6079
Iteration: 548; Percent complete: 11.0%; Average loss: 0.5154
Iteration: 549; Percent complete: 11.0%; Average loss: 0.6335
Iteration: 550; Percent complete: 11.0%; Average loss: 0.6014
Iteration: 551; Percent complete: 11.0%; Average loss: 0.6386
Iteration: 552; Percent complete: 11.0%; Average loss: 0.6194
Iteration: 553; Percent complete: 11.1%; Average loss: 0.5961
Iteration: 554; Percent complete: 11.1%; Average loss: 0.6110
Iteration: 555; Percent complete: 11.1%; Average loss: 0.5617
Iteration: 556; Percent complete: 11.1%; Average loss: 0.5605
Iteration: 557; Percent complete: 11.1%; Average loss: 0.6680
Iteratio

Iteration: 675; Percent complete: 13.5%; Average loss: 0.3550
Iteration: 676; Percent complete: 13.5%; Average loss: 0.4073
Iteration: 677; Percent complete: 13.5%; Average loss: 0.3874
Iteration: 678; Percent complete: 13.6%; Average loss: 0.3290
Iteration: 679; Percent complete: 13.6%; Average loss: 0.3941
Iteration: 680; Percent complete: 13.6%; Average loss: 0.3368
Iteration: 681; Percent complete: 13.6%; Average loss: 0.3416
Iteration: 682; Percent complete: 13.6%; Average loss: 0.3603
Iteration: 683; Percent complete: 13.7%; Average loss: 0.3420
Iteration: 684; Percent complete: 13.7%; Average loss: 0.3526
Iteration: 685; Percent complete: 13.7%; Average loss: 0.3211
Iteration: 686; Percent complete: 13.7%; Average loss: 0.3082
Iteration: 687; Percent complete: 13.7%; Average loss: 0.3706
Iteration: 688; Percent complete: 13.8%; Average loss: 0.3771
Iteration: 689; Percent complete: 13.8%; Average loss: 0.3551
Iteration: 690; Percent complete: 13.8%; Average loss: 0.3642
Iteratio

Iteration: 808; Percent complete: 16.2%; Average loss: 0.1791
Iteration: 809; Percent complete: 16.2%; Average loss: 0.1955
Iteration: 810; Percent complete: 16.2%; Average loss: 0.2024
Iteration: 811; Percent complete: 16.2%; Average loss: 0.1701
Iteration: 812; Percent complete: 16.2%; Average loss: 0.1996
Iteration: 813; Percent complete: 16.3%; Average loss: 0.1918
Iteration: 814; Percent complete: 16.3%; Average loss: 0.1828
Iteration: 815; Percent complete: 16.3%; Average loss: 0.2110
Iteration: 816; Percent complete: 16.3%; Average loss: 0.2051
Iteration: 817; Percent complete: 16.3%; Average loss: 0.1997
Iteration: 818; Percent complete: 16.4%; Average loss: 0.1749
Iteration: 819; Percent complete: 16.4%; Average loss: 0.1855
Iteration: 820; Percent complete: 16.4%; Average loss: 0.1838
Iteration: 821; Percent complete: 16.4%; Average loss: 0.1752
Iteration: 822; Percent complete: 16.4%; Average loss: 0.1855
Iteration: 823; Percent complete: 16.5%; Average loss: 0.1801
Iteratio

Iteration: 941; Percent complete: 18.8%; Average loss: 0.0981
Iteration: 942; Percent complete: 18.8%; Average loss: 0.1032
Iteration: 943; Percent complete: 18.9%; Average loss: 0.1111
Iteration: 944; Percent complete: 18.9%; Average loss: 0.1244
Iteration: 945; Percent complete: 18.9%; Average loss: 0.1160
Iteration: 946; Percent complete: 18.9%; Average loss: 0.1297
Iteration: 947; Percent complete: 18.9%; Average loss: 0.1085
Iteration: 948; Percent complete: 19.0%; Average loss: 0.1094
Iteration: 949; Percent complete: 19.0%; Average loss: 0.1138
Iteration: 950; Percent complete: 19.0%; Average loss: 0.1046
Iteration: 951; Percent complete: 19.0%; Average loss: 0.0968
Iteration: 952; Percent complete: 19.0%; Average loss: 0.1185
Iteration: 953; Percent complete: 19.1%; Average loss: 0.1019
Iteration: 954; Percent complete: 19.1%; Average loss: 0.1176
Iteration: 955; Percent complete: 19.1%; Average loss: 0.0947
Iteration: 956; Percent complete: 19.1%; Average loss: 0.1077
Iteratio

Iteration: 1072; Percent complete: 21.4%; Average loss: 0.0666
Iteration: 1073; Percent complete: 21.5%; Average loss: 0.0525
Iteration: 1074; Percent complete: 21.5%; Average loss: 0.0622
Iteration: 1075; Percent complete: 21.5%; Average loss: 0.0685
Iteration: 1076; Percent complete: 21.5%; Average loss: 0.0590
Iteration: 1077; Percent complete: 21.5%; Average loss: 0.0761
Iteration: 1078; Percent complete: 21.6%; Average loss: 0.0741
Iteration: 1079; Percent complete: 21.6%; Average loss: 0.0727
Iteration: 1080; Percent complete: 21.6%; Average loss: 0.0678
Iteration: 1081; Percent complete: 21.6%; Average loss: 0.0556
Iteration: 1082; Percent complete: 21.6%; Average loss: 0.0667
Iteration: 1083; Percent complete: 21.7%; Average loss: 0.0652
Iteration: 1084; Percent complete: 21.7%; Average loss: 0.0666
Iteration: 1085; Percent complete: 21.7%; Average loss: 0.0635
Iteration: 1086; Percent complete: 21.7%; Average loss: 0.0706
Iteration: 1087; Percent complete: 21.7%; Average loss:

Iteration: 1203; Percent complete: 24.1%; Average loss: 0.0455
Iteration: 1204; Percent complete: 24.1%; Average loss: 0.0445
Iteration: 1205; Percent complete: 24.1%; Average loss: 0.0437
Iteration: 1206; Percent complete: 24.1%; Average loss: 0.0409
Iteration: 1207; Percent complete: 24.1%; Average loss: 0.0433
Iteration: 1208; Percent complete: 24.2%; Average loss: 0.0405
Iteration: 1209; Percent complete: 24.2%; Average loss: 0.0440
Iteration: 1210; Percent complete: 24.2%; Average loss: 0.0412
Iteration: 1211; Percent complete: 24.2%; Average loss: 0.0341
Iteration: 1212; Percent complete: 24.2%; Average loss: 0.0414
Iteration: 1213; Percent complete: 24.3%; Average loss: 0.0437
Iteration: 1214; Percent complete: 24.3%; Average loss: 0.0493
Iteration: 1215; Percent complete: 24.3%; Average loss: 0.0412
Iteration: 1216; Percent complete: 24.3%; Average loss: 0.0406
Iteration: 1217; Percent complete: 24.3%; Average loss: 0.0458
Iteration: 1218; Percent complete: 24.4%; Average loss:

Iteration: 1334; Percent complete: 26.7%; Average loss: 0.0351
Iteration: 1335; Percent complete: 26.7%; Average loss: 0.0364
Iteration: 1336; Percent complete: 26.7%; Average loss: 0.0369
Iteration: 1337; Percent complete: 26.7%; Average loss: 0.0442
Iteration: 1338; Percent complete: 26.8%; Average loss: 0.0289
Iteration: 1339; Percent complete: 26.8%; Average loss: 0.0319
Iteration: 1340; Percent complete: 26.8%; Average loss: 0.0347
Iteration: 1341; Percent complete: 26.8%; Average loss: 0.0384
Iteration: 1342; Percent complete: 26.8%; Average loss: 0.0362
Iteration: 1343; Percent complete: 26.9%; Average loss: 0.0335
Iteration: 1344; Percent complete: 26.9%; Average loss: 0.0319
Iteration: 1345; Percent complete: 26.9%; Average loss: 0.0337
Iteration: 1346; Percent complete: 26.9%; Average loss: 0.0420
Iteration: 1347; Percent complete: 26.9%; Average loss: 0.0290
Iteration: 1348; Percent complete: 27.0%; Average loss: 0.0372
Iteration: 1349; Percent complete: 27.0%; Average loss:

Iteration: 1465; Percent complete: 29.3%; Average loss: 0.0241
Iteration: 1466; Percent complete: 29.3%; Average loss: 0.0210
Iteration: 1467; Percent complete: 29.3%; Average loss: 0.0229
Iteration: 1468; Percent complete: 29.4%; Average loss: 0.0260
Iteration: 1469; Percent complete: 29.4%; Average loss: 0.0305
Iteration: 1470; Percent complete: 29.4%; Average loss: 0.0286
Iteration: 1471; Percent complete: 29.4%; Average loss: 0.0279
Iteration: 1472; Percent complete: 29.4%; Average loss: 0.0238
Iteration: 1473; Percent complete: 29.5%; Average loss: 0.0223
Iteration: 1474; Percent complete: 29.5%; Average loss: 0.0256
Iteration: 1475; Percent complete: 29.5%; Average loss: 0.0292
Iteration: 1476; Percent complete: 29.5%; Average loss: 0.0216
Iteration: 1477; Percent complete: 29.5%; Average loss: 0.0231
Iteration: 1478; Percent complete: 29.6%; Average loss: 0.0246
Iteration: 1479; Percent complete: 29.6%; Average loss: 0.0353
Iteration: 1480; Percent complete: 29.6%; Average loss:

Iteration: 1596; Percent complete: 31.9%; Average loss: 0.0203
Iteration: 1597; Percent complete: 31.9%; Average loss: 0.0201
Iteration: 1598; Percent complete: 32.0%; Average loss: 0.0172
Iteration: 1599; Percent complete: 32.0%; Average loss: 0.0300
Iteration: 1600; Percent complete: 32.0%; Average loss: 0.0234
Iteration: 1601; Percent complete: 32.0%; Average loss: 0.0241
Iteration: 1602; Percent complete: 32.0%; Average loss: 0.0233
Iteration: 1603; Percent complete: 32.1%; Average loss: 0.0230
Iteration: 1604; Percent complete: 32.1%; Average loss: 0.0209
Iteration: 1605; Percent complete: 32.1%; Average loss: 0.0276
Iteration: 1606; Percent complete: 32.1%; Average loss: 0.0209
Iteration: 1607; Percent complete: 32.1%; Average loss: 0.0339
Iteration: 1608; Percent complete: 32.2%; Average loss: 0.0272
Iteration: 1609; Percent complete: 32.2%; Average loss: 0.0202
Iteration: 1610; Percent complete: 32.2%; Average loss: 0.0206
Iteration: 1611; Percent complete: 32.2%; Average loss:

Iteration: 1727; Percent complete: 34.5%; Average loss: 0.0275
Iteration: 1728; Percent complete: 34.6%; Average loss: 0.0200
Iteration: 1729; Percent complete: 34.6%; Average loss: 0.0175
Iteration: 1730; Percent complete: 34.6%; Average loss: 0.0192
Iteration: 1731; Percent complete: 34.6%; Average loss: 0.0165
Iteration: 1732; Percent complete: 34.6%; Average loss: 0.0206
Iteration: 1733; Percent complete: 34.7%; Average loss: 0.0202
Iteration: 1734; Percent complete: 34.7%; Average loss: 0.0180
Iteration: 1735; Percent complete: 34.7%; Average loss: 0.0179
Iteration: 1736; Percent complete: 34.7%; Average loss: 0.0155
Iteration: 1737; Percent complete: 34.7%; Average loss: 0.0212
Iteration: 1738; Percent complete: 34.8%; Average loss: 0.0171
Iteration: 1739; Percent complete: 34.8%; Average loss: 0.0134
Iteration: 1740; Percent complete: 34.8%; Average loss: 0.0217
Iteration: 1741; Percent complete: 34.8%; Average loss: 0.0150
Iteration: 1742; Percent complete: 34.8%; Average loss:

In [None]:
# Switch the encoder and decoder to evaluation mode so that dropout layers
# stop randomly dropping activations while generating responses.
# NOTE(review): assumes both objects are torch.nn.Module instances created in
# an earlier cell — confirm they exist before running this cell on a fresh kernel.
encoder.eval()
decoder.eval()

# Initialize the search module from the encoder/decoder pair.
searcher = GreedySearchDecoder(encoder, decoder)

# Start chatting. The call below is active (not commented out), so running
# this cell starts the chat right away; comment it out to skip the chat
# session, e.g. for an unattended "Restart & Run All".
# NOTE(review): evaluateInput is defined outside this section; presumably it
# loops on user input until an exit command is typed — verify.
evaluateInput(encoder, decoder, searcher, voc)