In [7]:
# Word-level tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

# Single-sentence toy corpus (Korean) used to demonstrate the tokenizer.
paper = ['많은 것을 바꾸고 싶다면 많은 것을 받아들여라']

# Fit the tokenizer on the corpus so that it builds its vocabulary
# (word_index and word_counts become available after fitting).
tknz = Tokenizer()
tknz.fit_on_texts(paper)

In [8]:
# word_index maps each word to an integer index; word_counts maps each word
# to the number of times it occurred in the fitted texts.
print(tknz.word_index)
print(tknz.word_counts)

{'많은': 1, '것을': 2, '바꾸고': 3, '싶다면': 4, '받아들여라': 5}
OrderedDict([('많은', 2), ('것을', 2), ('바꾸고', 1), ('싶다면', 1), ('받아들여라', 1)])


In [9]:
# One-hot encoding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

# Re-create and re-fit the tokenizer so this cell is self-contained.
paper = ['많은 것을 바꾸고 싶다면 많은 것을 받아들여라']
tknz = Tokenizer()
tknz.fit_on_texts(paper)

# Replace every word of the sentence with its integer index.
idx_paper = tknz.texts_to_sequences(paper)
# The tokenizer's word indices start at 1, so one extra class is needed
# to cover index 0 when building one-hot vectors.
n = len(tknz.word_index)+1
idx_onehot = to_categorical(idx_paper, num_classes=n)

In [10]:
# Show the index sequence, the number of one-hot classes, and the
# resulting one-hot tensor.
print(idx_paper)
print(n)
print(idx_onehot)

[[1, 2, 3, 4, 1, 2, 5]]
6
[[[0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 1. 0.]
  [0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1.]]]


In [11]:
# Word embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

# A model made of a single Embedding layer that maps each of the n word
# indices to a 3-dimensional vector.
model = Sequential()
model.add(Embedding(input_dim=n, output_dim=3))
model.compile(optimizer='rmsprop', loss='mse')
# fit() is never called in this cell, so predict() only looks up vectors
# from the layer's current (untrained) weights.
embedding = model.predict(idx_paper)

In [12]:
# One 3-dimensional vector per word position of the input sentence.
print(embedding)

[[[ 0.01347996  0.04358232 -0.01255677]
  [-0.00283106 -0.02775335 -0.01162326]
  [ 0.01668438 -0.01551704  0.03676926]
  [ 0.03386027 -0.0054024  -0.00484884]
  [ 0.01347996  0.04358232 -0.01255677]
  [-0.00283106 -0.02775335 -0.01162326]
  [ 0.01882217 -0.02422711  0.01118854]]]


## Seq2seq

In [13]:
import numpy as np
import tensorflow as tf
# Fix the NumPy and TensorFlow global random seeds so that random draws
# are repeatable between runs.
np.random.seed(0)
tf.random.set_seed(0)

In [14]:
# Configuration for the character-level seq2seq model.
n_batch = 64            # batch size passed to model.fit
epochs = 100            # number of training epochs passed to model.fit
latent_dim = 256        # number of units in the encoder and decoder LSTMs
n_max_sample = 10000    # upper bound on the number of sentence pairs used
data_path = '../rawdata/eng-fra/fra.txt'  # tab-separated English/French sentence pairs

In [15]:
# Read the whole corpus file as UTF-8 text, then break it into lines on '\n'.
with open(data_path, mode='r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()
lines = raw_text.split('\n')

In [16]:
# Peek at the first ten raw lines of the corpus.
lines[:10]

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)',
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)',
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)',
 'Who?\tQui ?\tCC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #4366796 (gillux)',
 'Wow!\tÇa alors\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #374631 (zmoo)',
 'Fire!\tAu feu !\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #4627939 (sacredceltic)',
 "Help!\tÀ l'aide\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #128430 (sysko)",
 'Jump.\tSaute.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishi

In [17]:
# Collect the input/target texts and the set of characters used on each side
x_txts = []
y_txts = []
x_chars_uni = set()
y_chars_uni = set()
# The "- 1" presumably drops the empty string that split('\n') leaves after
# a trailing newline; the sample count is additionally capped at n_max_sample.
n_sample = min(n_max_sample, len(lines) - 1)

for line in lines[:n_sample]:
    # Each line is "source<TAB>target[<TAB>attribution]". Only the first two
    # fields are used, so files with or without the attribution column both
    # parse; lines with fewer than two fields (e.g. blank lines) are skipped
    # instead of raising ValueError on unpacking.
    fields = line.split('\t')
    if len(fields) < 2:
        continue
    x_txt, y_txt = fields[:2]
    # Wrap the target in '\t' (start-of-sequence) and '\n' (end-of-sequence);
    # decoding starts from '\t' and stops at '\n'.
    y_txt = '\t' + y_txt + '\n'
    x_txts.append(x_txt)
    y_txts.append(y_txt)

    # set.update adds every character of the string and ignores duplicates.
    x_chars_uni.update(x_txt)
    y_chars_uni.update(y_txt)

In [18]:
# First five input (source) sentences.
x_txts[:5]

['Go.', 'Hi.', 'Hi.', 'Run!', 'Run!']

In [19]:
# First five target sentences, including the '\t' / '\n' boundary markers.
y_txts[:5]

['\tVa !\n',
 '\tSalut !\n',
 '\tSalut.\n',
 '\tCours\u202f!\n',
 '\tCourez\u202f!\n']

In [20]:
# Distinct characters that occur in the input sentences.
x_chars_uni

{' ',
 '!',
 '$',
 '%',
 '&',
 "'",
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'é'}

In [21]:
# Distinct characters that occur in the target sentences.
y_chars_uni

{'\t',
 '\n',
 ' ',
 '!',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '5',
 '8',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'x',
 'y',
 'z',
 '\xa0',
 '«',
 '»',
 'À',
 'Ç',
 'É',
 'Ê',
 'à',
 'â',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'î',
 'ï',
 'ô',
 'ù',
 'û',
 'œ',
 '\u2009',
 '’',
 '\u202f'}

In [22]:
# 토큰 단위 정리
x_chars_uni = sorted(list(x_chars_uni))
y_chars_uni = sorted(list(y_chars_uni))
n_encoder_tokens = len(x_chars_uni)
n_decoder_tokens = len(y_chars_uni)

In [23]:
# Report the number of distinct characters on the encoder and decoder sides.
print("유니크 인코더 토큰 글자 수: ", n_encoder_tokens)
print("유니크 디코더 토큰 글자 수: ", n_decoder_tokens)

유니크 인코더 토큰 글자 수:  71
유니크 디코더 토큰 글자 수:  93


In [24]:
max_encoder_seq_len = 0
for txt in x_txts:
    txt_len = len(txt)
    max_encoder_seq_len = max(txt_len, 
                              max_encoder_seq_len)
print("인코더 문장내 최대 문자 수: ", max_encoder_seq_len)

인코더 문장내 최대 문자 수:  15


In [25]:
max_decoder_seq_len = 0
for txt in y_txts:
    txt_len = len(txt)
    max_decoder_seq_len = max(txt_len, 
                              max_decoder_seq_len)
print("디코더 문장내 최대 문자 수: ", max_decoder_seq_len)

디코더 문장내 최대 문자 수:  59


In [26]:
# 토큰 별 인덱스
x_token_idx = {}
for idx, char in enumerate(x_chars_uni):
    x_token_idx[char] = idx
    
y_token_idx ={}
for idx, char in enumerate(y_chars_uni):
    y_token_idx[char] = idx

In [27]:
# Inspect the input-side character -> index mapping.
x_token_idx

{' ': 0,
 '!': 1,
 '$': 2,
 '%': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '0': 9,
 '1': 10,
 '2': 11,
 '3': 12,
 '5': 13,
 '6': 14,
 '7': 15,
 '8': 16,
 '9': 17,
 ':': 18,
 '?': 19,
 'A': 20,
 'B': 21,
 'C': 22,
 'D': 23,
 'E': 24,
 'F': 25,
 'G': 26,
 'H': 27,
 'I': 28,
 'J': 29,
 'K': 30,
 'L': 31,
 'M': 32,
 'N': 33,
 'O': 34,
 'P': 35,
 'Q': 36,
 'R': 37,
 'S': 38,
 'T': 39,
 'U': 40,
 'V': 41,
 'W': 42,
 'Y': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69,
 'é': 70}

In [28]:
# 영행렬 만들기
encoder_x_data = np.zeros(
                            (len(x_txts), 
                             max_encoder_seq_len, 
                             n_encoder_tokens),
                        dtype='float32')
decoder_x_data = np.zeros(
                            (len(x_txts), 
                             max_decoder_seq_len, 
                             n_decoder_tokens),
                        dtype='float32')
decoder_y_data = np.zeros(
                            (len(x_txts), 
                             max_decoder_seq_len, 
                             n_decoder_tokens),
                        dtype='float32')

In [29]:
# 인풋 데이터 행렬
for i, x_txt in enumerate(x_txts):
    
    for t, char in enumerate(x_txt):
        encoder_x_data[i, t, x_token_idx[char]] = 1.
    encoder_x_data[i, t + 1:, x_token_idx[' ']] = 1.

In [30]:
# 타깃 데이터 행렬
for i, y_txt in enumerate(y_txts):
       
    for t, char in enumerate(y_txt):
        decoder_x_data[i, t, y_token_idx[char]] = 1.
        if t > 0:
            decoder_y_data[i, t - 1, y_token_idx[char]] = 1.
            
    decoder_x_data[i, t + 1:, y_token_idx[' ']] = 1.
    decoder_y_data[i, t:, y_token_idx[' ']] = 1.

In [31]:
# Encoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense, TimeDistributed

# Variable-length sequence of one-hot character vectors.
encoder_inputs = Input(shape=(None, n_encoder_tokens))
# return_state=True makes the LSTM return its final hidden and cell states
# in addition to its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outs, state_h, state_c = encoder(encoder_inputs)
# Only the two final states are kept; they are used as the decoder's
# initial state.
encoder_states = [state_h, state_c]

In [32]:
# Symbolic shapes of the encoder input, its output, and its two final states.
print(encoder_inputs.shape)
print(encoder_outs.shape)
print(state_h.shape)
print(state_c.shape)

(None, None, 71)
(None, 256)
(None, 256)
(None, 256)


In [33]:
# 디코더
decoder_inputs = Input(shape=(None, n_decoder_tokens))
decoder = LSTM(latent_dim, 
                return_sequences=True, 
                return_state=True)
decoder_outs, _, _ = decoder(decoder_inputs,
                             initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(n_decoder_tokens, 
                                      activation='softmax'))
decoder_outputs = decoder_dense(decoder_outs)

In [34]:
# Symbolic shapes of the decoder input and of the per-step softmax output.
print(decoder_inputs.shape)
print(decoder_outputs.shape)

(None, None, 93)
(None, None, 93)


In [35]:
# 인코더 - 디코더
model = Model([encoder_inputs, decoder_inputs], 
              decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 93)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 335872      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  358400      input_2[0][0]                    
                                                                 lstm[0][1]                   

In [36]:
# 모형 컴파일
model.compile(optimizer='rmsprop', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [37]:
# Train the model: the decoder receives the target sequence as input and is
# trained to predict the same sequence shifted one step ahead.
# validation_split=0.2 holds out 20% of the samples for validation.
model.fit([encoder_x_data, decoder_x_data], decoder_y_data,
          batch_size=n_batch,
          epochs=epochs,
          validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100

In [None]:
# 추론 모형 생성
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, 
                         decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
# 리버스 인덱스
reverse_x_char_idx = {}
for char, idx in x_token_idx.items():
    reverse_x_char_idx[idx] = char
    
reverse_y_char_idx ={}
for char, idx in y_token_idx.items():
    reverse_y_char_idx[idx] = char

In [None]:
# Inspect the input-side index -> character mapping.
reverse_x_char_idx

In [None]:
# 결과값 디코딩
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    y_seq = np.zeros((1, 1, n_decoder_tokens))
    y_seq[0, 0, y_token_idx['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [y_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_y_char_idx[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_len):
            stop_condition = True

        y_seq = np.zeros((1, 1, n_decoder_tokens))
        y_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence

In [None]:
# 결과확인
for seq_idx in range(100):
    x_seq = encoder_x_data[seq_idx: seq_idx + 1]
    decoded_sentence = decode_sequence(x_seq)
    print('-')
    print('Input sentence:', x_txts[seq_idx])
    print('Decoded sentence:', decoded_sentence)