In [8]:
import pandas as pd
import numpy as np
import warnings

In [9]:
# Install a global "ignore" filter so that no warnings are shown in this session.
warnings.filterwarnings('ignore')

In [10]:
# Load the parallel corpus from HDF; the `original` and `translation` columns
# of this frame are consumed further down.
df = pd.read_hdf(path_or_buf='../data//tokenized_10thousand.hdf')

In [11]:
# Load the vocabulary dictionaries that were saved as JSON.
import json

with open('./id_dict/input_id.json', mode='r') as fp:
    input_id = json.loads(fp.read())

with open('./id_dict/target_id.json', mode='r') as fp:
    target_id = json.loads(fp.read())


In [12]:
# Vectorize the dataset: replace every token by its integer id.
# Tokens that are missing from a vocabulary fall back to the OOV id (1).
# `dict.get` is used for the lookup because
#   * `input_id[x].setdefault(...)` indexed the vocabulary first, which fails
#     for unknown tokens (KeyError) and for known ones (ints have no
#     `setdefault`), and
#   * `setdefault` inserts every unseen token into the vocabulary as a side
#     effect, whereas `get` leaves the dictionaries untouched.
original = df.original.apply(
    lambda tokens: np.array([input_id.get(tok, 1) for tok in tokens])
).to_numpy()
# Korean in particular may contain words outside the fixed-size vocabulary;
# they must be replaced by the OOV id 1.
translation = df.translation.apply(
    lambda tokens: np.array([target_id.get(tok, 1) for tok in tokens])
).to_numpy()

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# Pad (or cut) every id sequence at its end to a fixed length of 200 so the
# source and target sequences become rectangular arrays.
encoder_input_data, decoder_target_data = (
    pad_sequences(token_ids, maxlen=200, padding='post', truncating='post')
    for token_ids in (original, translation)
)

In [15]:
# Show the padded encoder input array as the cell output.
encoder_input_data

array([[ 11, 516, 202, ..., 713, 604, 426],
       [ 11,  14, 117, ..., 178,  24,   3],
       [ 11, 345, 613, ...,   0,   0,   0],
       ...,
       [ 11, 391,  44, ...,   0,   0,   0],
       [ 11, 510,  43, ...,   0,   0,   0],
       [ 11, 413, 590, ...,   0,   0,   0]])

In [16]:
# The decoder input must begin with the <start> token (id == 2) and is the
# target sequence shifted by one time step.
# The sequences have different lengths, so they are kept as a plain list of
# lists: wrapping a ragged list in `np.array` raises ValueError on recent
# NumPy versions, and `pad_sequences` accepts the list directly.
decoder_input_seqs = [[2, *token_ids] for token_ids in translation]

decoder_input_data = pad_sequences(decoder_input_seqs, maxlen=200, padding='post', truncating='post')

In [17]:
# Show the padded decoder input array; each row was built to start with the
# <start> id 2.
decoder_input_data

array([[   2,  913,   22, ...,   88,   26,   58],
       [   2,   65,    7, ...,   15,    5, 1647],
       [   2,  421,   22, ...,    0,    0,    0],
       ...,
       [   2,  717,    1, ...,    0,    0,    0],
       [   2, 2362,  416, ...,    0,    0,    0],
       [   2,  139,    4, ...,    0,    0,    0]])

In [18]:
# Sanity check: display the shapes of the three padded arrays side by side.
tuple(
    padded.shape
    for padded in (encoder_input_data, decoder_target_data, decoder_input_data)
)

((99996, 200), (99996, 200), (99996, 200))

In [19]:
# Training settings: batch size and number of epochs to train for.
batch_size, epochs = 1, 1

# Model dimensions: encoder (latent) dimension and word-embedding length.
latent_dim = 256
embedding_dim = 256

# Vocabulary sizes: number of unique Chinese (Hanja) characters on the input
# side and number of unique Korean tokens on the target side.
num_encoder_tokens, num_decoder_tokens = 7049, 10003

In [20]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Softmax
from tensorflow.keras.models import Model

In [21]:
from tensorflow.keras.models import load_model

In [16]:
# Restore the previously saved seq2seq model from its HDF5 file.
model = load_model(filepath='./model/s2s_10.h5')

In [20]:
# Rebuild separate encoder and decoder inference models from the layers of the
# loaded training model.
# NOTE(review): the layer indices used below (2/3 called on the raw inputs,
# 4/5 treated as LSTMs, 6 as the output dense layer) are assumed to match the
# saved model's layer order -- confirm with model.summary().

# --- Encoder: maps the first model input to two state tensors. ---
encoder_inputs = model.input[0]   # input_1
# NOTE(review): this result is not used anywhere below in this cell.
encoder_embedding = model.layers[2](encoder_inputs)
# Take the existing output of layer 4 and keep only its two state tensors.
encoder_outputs, state_h_enc, state_c_enc = model.layers[4].output   # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

# --- Decoder: one call consumes decoder tokens plus the previous (h, c)
# states and returns the dense-layer output plus the updated states. ---
decoder_inputs = model.input[1]   # input_2
decoder_embedding = model.layers[3](decoder_inputs)
# Fresh inputs through which the states are fed in at inference time; their
# size is latent_dim.
decoder_state_input_h = Input(shape=(latent_dim,), name='input_3')
decoder_state_input_c = Input(shape=(latent_dim,), name='input_4')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[5]
# Re-apply the trained layer, this time starting from the externally supplied
# states.
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[6]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [28]:
# Build id -> token tables (the inverse of the vocabularies) so that predicted
# ids can be turned back into readable text.
reverse_input_id = dict(zip(input_id.values(), input_id.keys()))
reverse_target_id = dict(zip(target_id.values(), target_id.keys()))

# Id 1 is rendered as an underscore in both tables.
for id_to_token in (reverse_input_id, reverse_target_id):
    id_to_token[1] = '_'

In [27]:
# Inspect which token the reverse input table holds for id 1.
reverse_input_id[1]

'OOV'

In [26]:
# with open('./id_dict/reversed_input_id.json','w') as fp:
#     json.dump(reverse_input_id, fp)
# with open('./id_dict/reversed_target_id.json','w') as fp:
#     json.dump(reverse_target_id, fp)

In [119]:
def decode_sequence(input_seq, max_len=200):
    """Greedily decode one input sequence into a target-language string.

    The encoder model turns ``input_seq`` into its state vectors. The decoder
    model is then run one step at a time, starting from the start token
    (id 2), always feeding back the most probable token together with the
    updated states.

    Decoding stops as soon as the decoder emits the token whose text is
    ``'PAD'`` or after ``max_len`` tokens have been produced. The limit is
    counted in tokens (not in characters of the joined string), and the
    ``'PAD'`` marker itself is not included in the result.

    Args:
        input_seq: Batch containing a single encoded input sequence, as
            accepted by ``encoder_model.predict``.
        max_len: Maximum number of tokens to generate.

    Returns:
        The decoded tokens concatenated into one string.

    Raises:
        KeyError: If the decoder predicts an id missing from
            ``reverse_target_id``.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1 (batch of size 1), seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = 2

    decoded_tokens = []
    for _ in range(max_len):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy sampling: pick the most probable token of the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = reverse_target_id[sampled_token_index]

        # Stop token reached: finish without adding the marker to the output.
        if sampled_char == 'PAD':
            break
        decoded_tokens.append(sampled_char)

        # Feed the sampled token and the new states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(decoded_tokens)


# Decode rows 1000..1099 of the encoder input (part of the training set) and
# append each decoded sentence as one line of the result file.
# The file is opened once for the whole run instead of once per sentence, and
# with an explicit encoding so that writing the non-ASCII output does not
# depend on the platform's default encoding.
with open('./result10.txt', 'a', encoding='utf-8') as fp:
    for seq_index in range(1000, 1100):
        # Slice (rather than index) to keep a one-row batch dimension.
        input_seq = encoder_input_data[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(input_seq)
        fp.write(decoded_sentence + '\n')

In [25]:
# Show the padded decoder target array as the cell output.
decoder_target_data

array([[ 913,   22, 5822, ...,   26,   58,   39],
       [  65,    7, 1972, ...,    5, 1647,   11],
       [ 421,   22,  120, ...,    0,    0,    0],
       ...,
       [ 717,    1,  943, ...,    0,    0,    0],
       [2362,  416,    7, ...,    0,    0,    0],
       [ 139,    4,   21, ...,    0,    0,    0]])