# https://keras.io/examples/nlp/lstm_seq2seq/
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [None]:
import numpy as np
import pandas as pd

import re
import unicodedata

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
data_path = "../data/"

# Each line of fra.txt is parsed as three tab-separated fields named
# src, tar and lic. Only the first 33,000 rows and only the two sentence
# columns are needed, so the license column and the remaining rows are
# skipped at read time rather than loaded and thrown away afterwards.
df = pd.read_csv(
    data_path + "fra.txt",
    sep="\t",
    names=["src", "tar", "lic"],
    usecols=["src", "tar"],
    nrows=33000,
)

print(len(df))
df.tail(10)

# Data Preprocessing

In [None]:
def to_ascii(s):
    """Remove accent marks from a string.

    The text is decomposed with Unicode NFD normalization, which splits an
    accented letter into its base letter followed by combining marks, and
    every character whose category is ``Mn`` (Nonspacing_Mark) is dropped.
    Characters without such a decomposition are returned unchanged, so the
    result is not guaranteed to be pure ASCII.

    Example: 'déjà diné' -> 'deja dine'
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def preprocess_sentence(sent):
    """Normalize a raw sentence into space-separated lowercase tokens.

    Args:
        sent: Raw sentence string.

    Returns:
        The cleaned sentence: lowercase, accents removed, tokens separated by
        single spaces and no leading or trailing whitespace.
    """
    # Lowercase, then drop accent marks.
    sent = to_ascii(sent.lower())

    # Put a space in front of punctuation so it becomes its own token.
    # ex) "I am a student." => "I am a student ."
    sent = re.sub(r"([?.!,¿])", r" \1", sent)

    # Replace every run of characters other than a-z, A-Z, "!", "." and "?"
    # with a single space. Note that "," and "¿" are not in this keep-set, so
    # they are removed here even though the previous step spaced them out.
    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

    # Collapse runs of whitespace into one space.
    sent = re.sub(r"\s+", " ", sent)

    # Trim the ends: a sentence that starts or ends with a removed character
    # would otherwise keep a stray space, which turns into an empty-string
    # token when the caller splits on " ".
    return sent.strip()

In [None]:
# Quick sanity check of the preprocessing on one English and one French sentence.
en_sent = "Have you had dinner?"
fr_sent = "Avez-vous déjà diné?"

en_clean = preprocess_sentence(en_sent)
fr_clean = preprocess_sentence(fr_sent)

print('전처리 전 영어 문장 :', en_sent)
print('전처리 후 영어 문장 :', en_clean)
print('전처리 전 프랑스어 문장 :', fr_sent)
print('전처리 후 프랑스어 문장 :', fr_clean)

In [14]:
# Build the three token-list corpora:
#   en_in   - encoder input  (source tokens)
#   fr_in   - decoder input  (target tokens prefixed with <sos>)
#   fra_out - decoder target (target tokens suffixed with <eos>)
src_clean = df.src.apply(preprocess_sentence)
# Preprocess each target sentence once and reuse it for both decoder corpora.
tar_clean = df.tar.apply(preprocess_sentence)

# str.split() without an argument splits on any whitespace run and never
# yields empty-string tokens, unlike split(" ") on text that has leading,
# trailing or repeated spaces.
en_in = [sent.split() for sent in src_clean]
fr_in = [["<sos>"] + sent.split() for sent in tar_clean]
fra_out = [sent.split() + ["<eos>"] for sent in tar_clean]

In [27]:
# Show the last few aligned samples (encoder input, decoder input, decoder
# target). Negative slicing is used instead of a hard-coded start index so
# the preview still works if the number of loaded samples changes.
n_preview = 5

for src_tokens, dec_in_tokens, dec_out_tokens in zip(
        en_in[-n_preview:], fr_in[-n_preview:], fra_out[-n_preview:]):
    print(src_tokens, dec_in_tokens, dec_out_tokens)
    print()

print(en_in[-n_preview:])
print(fr_in[-n_preview:])
print(fra_out[-n_preview:])

['here', 's', 'your', 'change', '.'] ['<sos>', 'voici', 'ton', 'changement', '!'] ['voici', 'ton', 'changement', '!', '<eos>']

['here', 's', 'your', 'change', '.'] ['<sos>', 'voici', 'votre', 'changement', '!'] ['voici', 'votre', 'changement', '!', '<eos>']

['here', 's', 'your', 'change', '.'] ['<sos>', 'voici', 'ta', 'monnaie', '!'] ['voici', 'ta', 'monnaie', '!', '<eos>']

['here', 's', 'your', 'change', '.'] ['<sos>', 'voici', 'votre', 'monnaie', '!'] ['voici', 'votre', 'monnaie', '!', '<eos>']

['here', 's', 'your', 'dinner', '.'] ['<sos>', 'voici', 'votre', 'diner', '.'] ['voici', 'votre', 'diner', '.', '<eos>']

[['here', 's', 'your', 'change', '.'], ['here', 's', 'your', 'change', '.'], ['here', 's', 'your', 'change', '.'], ['here', 's', 'your', 'change', '.'], ['here', 's', 'your', 'dinner', '.']]
[['<sos>', 'voici', 'ton', 'changement', '!'], ['<sos>', 'voici', 'votre', 'changement', '!'], ['<sos>', 'voici', 'ta', 'monnaie', '!'], ['<sos>', 'voici', 'votre', 'monnaie', '!'],

# Tokenizing
# tokenizing for words

In [41]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [38]:
# One word-level vocabulary per language.
tokenizer_en = Tokenizer()
tokenizer_fr = Tokenizer()

tokenizer_en.fit_on_texts(en_in)  # English word -> index
# The French vocabulary is fitted on both decoder corpora so that it contains
# '<sos>' (only present in fr_in) as well as '<eos>' (only present in fra_out).
for fr_corpus in (fr_in, fra_out):
    tokenizer_fr.fit_on_texts(fr_corpus)

# Token lists -> integer sequences.
encoder_input = tokenizer_en.texts_to_sequences(en_in)
decoder_input = tokenizer_fr.texts_to_sequences(fr_in)
decoder_target = tokenizer_fr.texts_to_sequences(fra_out)

# Pad at the end of each sequence so every row of an array has equal length.
encoder_input_pad, decoder_input_pad, decoder_target_pad = (
    pad_sequences(seqs, padding='post')
    for seqs in (encoder_input, decoder_input, decoder_target)
)

# One extra slot on top of the number of known words.
src_vocab_size = 1 + len(tokenizer_en.word_index)
tar_vocab_size = 1 + len(tokenizer_fr.word_index)

print(encoder_input_pad.shape, decoder_input_pad.shape, decoder_target_pad.shape)
print(src_vocab_size, tar_vocab_size)

(33000, 7) (33000, 16) (33000, 16)
4488 7884


In [46]:
# Dump the French tokenizer's word -> integer index mapping for inspection.
print(tokenizer_fr.word_index)

{'.': 1, '<sos>': 2, '<eos>': 3, 'je': 4, 'est': 5, '?': 6, 'tom': 7, 'a': 8, 'vous': 9, '!': 10, 'pas': 11, 'j': 12, 'il': 13, 'nous': 14, 'le': 15, 'ai': 16, 'tu': 17, 'de': 18, 'c': 19, 'ne': 20, 'suis': 21, 'la': 22, 'l': 23, 'un': 24, 'en': 25, 'ce': 26, 'n': 27, 'etes': 28, 'ca': 29, 'me': 30, 'que': 31, 'une': 32, 'les': 33, 'moi': 34, 'es': 35, 'd': 36, 's': 37, 'sommes': 38, 't': 39, 'y': 40, 'sont': 41, 'elle': 42, 'etait': 43, 'm': 44, 'ils': 45, 'fait': 46, 'qui': 47, 'tout': 48, 'as': 49, 'aime': 50, 'des': 51, 'elles': 52, 'mon': 53, 'te': 54, 'bien': 55, 'toi': 56, 'ici': 57, 'avez': 58, 'du': 59, 'tres': 60, 'ete': 61, 'besoin': 62, 'peux': 63, 'avons': 64, 'faire': 65, 'va': 66, 'qu': 67, 'se': 68, 'on': 69, 'cela': 70, 'air': 71, 'faut': 72, 'votre': 73, 'personne': 74, 'veux': 75, 'fais': 76, 'etais': 77, 'ont': 78, 'ton': 79, 'aller': 80, 'monde': 81, 'tous': 82, 'trop': 83, 'ou': 84, 'au': 85, 'maintenant': 86, 'ma': 87, 'adore': 88, 'train': 89, 'comment': 90, 'so

In [40]:
# Draw one random permutation and apply it to all three arrays so that each
# encoder input stays aligned with its decoder input and decoder target.
n_samples = encoder_input_pad.shape[0]
indices = np.arange(n_samples)
np.random.shuffle(indices)

encoder_input_pad = encoder_input_pad[indices]
decoder_input_pad = decoder_input_pad[indices]
decoder_target_pad = decoder_target_pad[indices]

# Hold out the last 10% of the shuffled samples as the test split. The size is
# derived from the data instead of being hard-coded, and the split uses a
# positive boundary index because arr[:-0] would give an empty training set.
n_val = n_samples // 10
n_train = n_samples - n_val

encoder_input_train = encoder_input_pad[:n_train]
decoder_input_train = decoder_input_pad[:n_train]
decoder_target_train = decoder_target_pad[:n_train]

encoder_input_test = encoder_input_pad[n_train:]
decoder_input_test = decoder_input_pad[n_train:]
decoder_target_test = decoder_target_pad[n_train:]

print(encoder_input_train.shape, decoder_input_train.shape, decoder_target_train.shape)
print(encoder_input_test.shape, decoder_input_test.shape, decoder_target_test.shape)
print(encoder_input_pad[0], decoder_input_pad[0], decoder_target_pad[0])

(29700, 7) (29700, 16) (29700, 16)
(3300, 7) (3300, 16) (3300, 16)
[   2   20 2681    1    0    0    0] [   2   12   16   61 5692    1    0    0    0    0    0    0    0    0
    0    0] [  12   16   61 5692    1    3    0    0    0    0    0    0    0    0
    0    0]


# Modeling

## Training

In [42]:
from keras.layers import Input, LSTM, Embedding, Dense, Masking
from keras.models import Model

In [43]:
# hyper-parameters
embedding_dim = 64
hidden_units = 64

# NOTE: padding is masked with Embedding(mask_zero=True). A Masking layer
# placed after an Embedding compares the embedding *vectors* with 0.0, and the
# learned vector for index 0 is not all zeros, so it would not mask the padded
# time steps.

# encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=src_vocab_size, output_dim=embedding_dim,
                              mask_zero=True)(encoder_inputs)  # index 0 (padding) is skipped downstream
# return_sequences stays False: only the final states are needed.
encoder_lstm = LSTM(units=hidden_units, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# decoder
# NOTE: this rebinds `decoder_input`, which the tokenizing cell used for the
# list of integer sequences; from here on it is the Keras input tensor.
decoder_input = Input(shape=(None,))

decoder_embedding_layer = Embedding(input_dim=tar_vocab_size, output_dim=embedding_dim,
                                    mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_input)

# return_sequences=True: a prediction is made at every decoder time step.
decoder_lstm_layer = LSTM(units=hidden_units, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final hidden and cell states.
decoder_outputs, _, _ = decoder_lstm_layer(decoder_embedding, initial_state=encoder_states)

decoder_softmax_layer = Dense(units=tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

model = Model(inputs=[encoder_inputs, decoder_input], outputs=decoder_outputs)
# sparse_categorical_crossentropy takes the integer-encoded targets directly,
# so no one-hot encoding is needed.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 64)             287232    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 64)             504576    ['input_2[0][0]']             
                                                                                              

In [44]:
# The model takes [encoder input, decoder input] and is trained to output the
# decoder target; the held-out split is used as validation data.
train_inputs = [encoder_input_train, decoder_input_train]
held_out_inputs = [encoder_input_test, decoder_input_test]

history = model.fit(
    x=train_inputs,
    y=decoder_target_train,
    validation_data=(held_out_inputs, decoder_target_test),
    batch_size=128,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Inferencing
 - Inference does not call the training model directly; instead, separate encoder and decoder models are built from the layers that were already trained.
 - encoder_inputs와 encoder_states는 훈련 과정에서 이미 정의한 것들을 재사용합니다. 이렇게 되면 훈련 단계에 encoder_inputs와 encoder_states 사이에 있는 모든 층까지 전부 불러오게 되므로 결과적으로 훈련 단계에서 사용한 인코더를 그대로 재사용.

In [45]:
# Inference-time encoder: maps a source sequence to the encoder's final
# [hidden state, cell state], reusing the tensors defined for training.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference-time decoder: its recurrent state is supplied from outside through
# two extra inputs instead of being wired to the encoder.
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the embedding layer object that was trained with the full model.
decoder_embedding_2 = decoder_embedding_layer(decoder_input)

# Reuse the trained LSTM layer, starting from the externally supplied states.
# The updated states are kept so they can be returned by the model.
decoder_outputs_2, state_h_2, state_c_2 = decoder_lstm_layer(decoder_embedding_2, initial_state=decoder_states_inputs)
decoder_states_2 = [state_h_2, state_c_2]

# Reuse the trained softmax Dense layer to get a distribution over the target vocabulary.
decoder_outputs_2 = decoder_softmax_layer(decoder_outputs_2)

# Inputs: [token ids, h, c]; outputs: [word probabilities, new h, new c].
decoder_model = Model(inputs=[decoder_input] + decoder_states_inputs, outputs=[decoder_outputs_2] + decoder_states_2)

In [47]:
def decode_sequence(input_seq):
    """Greedily decode a translation for one encoded source sentence.

    The encoder states for ``input_seq`` seed the decoder, which is then run
    one token at a time: the arg-max word of each step is fed back as the next
    decoder input together with the updated states. For simplicity this
    function assumes a batch size of 1.

    Args:
        input_seq: Encoder input passed to ``encoder_model.predict``.

    Returns:
        The decoded words joined into one string, each word preceded by a
        single space (e.g. ``" il est . <eos>"``). The string ends with
        ``"<eos>"`` only when the model actually produced that token.
    """
    # Final encoder states (hidden state, cell state) for the input.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Start decoding from the integer index of <sos>.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_fr.word_index['<sos>']

    decoded_words = []
    decoded_length = 0  # length of the output string built so far

    while True:
        # Unpacking works whether predict returned the states as a list or a tuple.
        output_tokens, h, c = decoder_model.predict([target_seq, *states_value], verbose=0)

        # Most probable word at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer_fr.index_word.get(sampled_token_index)
        if sampled_word is None:
            # The index has no vocabulary entry (e.g. the padding index 0);
            # stop instead of raising KeyError.
            break

        decoded_words.append(sampled_word)
        decoded_length += len(sampled_word) + 1  # +1 for the separating space

        # Stop at <eos> or once the output exceeds 50 characters.
        if sampled_word == '<eos>' or decoded_length > 50:
            break

        # Feed the prediction and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)

In [48]:
# Convert an integer sequence of the source language back into text.
def seq_to_src(input_seq):
    """Return the source words of ``input_seq``, each followed by a space.

    Entries equal to 0 are skipped; every other entry is looked up in
    ``tokenizer_en.index_word``.
    """
    return ''.join(
        tokenizer_en.index_word[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

# Convert an integer sequence of the target language back into text.
def seq_to_tar(input_seq):
    """Return the target words of ``input_seq``, each followed by a space.

    Entries equal to 0 and the indices of the ``<sos>`` / ``<eos>`` markers
    are skipped; every other entry is looked up in ``tokenizer_fr.index_word``.
    """
    # Resolve the marker indices once instead of on every token.
    skipped = {
        0,
        tokenizer_fr.word_index['<sos>'],
        tokenizer_fr.word_index['<eos>'],
    }
    return ''.join(
        tokenizer_fr.index_word[token_id] + ' '
        for token_id in input_seq
        if token_id not in skipped
    )

In [49]:
# Translate a few training samples and print source, reference and prediction.
for seq_index in [3, 50, 100, 300, 1001]:
    # Slice (not index) so the encoder receives a batch of size 1.
    input_seq = encoder_input_train[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    # decode_sequence prefixes every word with a space, so drop the first
    # character. The trailing "<eos>" is removed only when it is really there:
    # decoding can also stop on the length limit, and a blind [:-5] would then
    # cut off part of a real word.
    translation = decoded_sentence[1:]
    if translation.endswith('<eos>'):
        translation = translation[:-len('<eos>')]

    print("입력문장 :",seq_to_src(encoder_input_train[seq_index]))
    print("정답문장 :",seq_to_tar(decoder_input_train[seq_index]))
    print("번역문장 :",translation)
    print("-"*50)

입력문장 : you made an error . 
정답문장 : tu as fait une erreur . 
번역문장 : vous ne ai pas . 
--------------------------------------------------
입력문장 : good riddance . 
정답문장 : bon debarras ! 
번역문장 : 
--------------------------------------------------
입력문장 : we love our dog . 
정답문장 : nous adorons notre chienne . 
번역문장 : nous sommes a un . 
--------------------------------------------------
입력문장 : this is offensive . 
정답문장 : c est grossier . 
번역문장 : il est . 
--------------------------------------------------
입력문장 : keep a diary . 
정답문장 : tiens un journal ! 
번역문장 : ils le ! 
--------------------------------------------------
