# https://keras.io/examples/nlp/lstm_seq2seq/
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
data_path = "../data/"

# fra.txt is read as tab-separated text with three columns: source sentence,
# target sentence and a license/attribution column.
df = pd.read_csv(data_path + "fra.txt", sep="\t", names=["src", "tar", "lic"])
del df['lic'] # remove license column which is not needed
# Use only the first 60,000 sentence pairs. `.copy()` makes `df` an independent
# frame, so assigning to its columns later does not write into a slice of the
# full frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[0:60000].copy()

print(len(df))
df.tail(10)

60000


Unnamed: 0,src,tar
59990,That's to be expected.,Il faut s'y attendre.
59991,That's totally normal.,C'est totalement normal.
59992,That's totally normal.,C'est parfaitement normal.
59993,That's totally normal.,C'est tout à fait normal.
59994,That's totally untrue.,C'est complètement faux.
59995,"That's true, isn't it?","C'est vrai, n'est-ce pas ?"
59996,That's useful to know.,C'est bon à savoir.
59997,That's very dangerous.,C'est très dangereux.
59998,That's very dishonest.,C'est très malhonnête.
59999,That's very good news.,Ce sont d'excellentes nouvelles.


# Data Preprocessing

In [3]:
# "\t" is the start-of-sequence marker (<sos>) and "\n" is the end-of-sequence marker (<eos>).
def _add_sos_eos(sentence):
    # Sentences that already carry both markers are returned unchanged, so
    # re-running this cell does not wrap the targets a second time.
    if sentence.startswith("\t ") and sentence.endswith(" \n"):
        return sentence
    return "\t " + sentence + " \n"

# Bracket assignment replaces the existing column explicitly.
df["tar"] = df["tar"].apply(_add_sos_eos)
df.sample(10)

Unnamed: 0,src,tar
57811,I was right all along.,\t J'avais raison depuis le début. \n
18960,You're immature.,\t Vous êtes immature. \n
6977,You're early.,\t Tu es matinal. \n
38983,Are you enjoying it?,\t Cela te plaît-il ? \n
55988,He seems to be honest.,\t Il semble honnête. \n
59811,That car is very fast.,\t Cette voiture est très rapide. \n
29210,The boat capsized.,\t Le bateau chavira. \n
15201,I called Tom up.,\t J'ai passé un coup de fil à Tom. \n
23688,Tom was a farmer.,\t Tom était fermier. \n
28706,Prices are rising.,\t Les prix sont en hausse. \n


# Tokenizing
## tokenizing for each char

In [4]:
# Distinct characters that occur anywhere in each column.
src_char = {char for line in df.src for char in line}  # source-side characters
tar_char = {char for line in df.tar for char in line}  # target-side characters

# Sorted copies give every character a stable position.
src_char_sorted = list(src_char)
src_char_sorted.sort()
tar_char_sorted = list(tar_char)
tar_char_sorted.sort()

In [5]:
# Vocabulary sizes: number of distinct characters plus one extra slot.
src_max = 1 + len(src_char_sorted)
tar_max = 1 + len(tar_char_sorted)

print(src_max, tar_max)
# Peek at the same slice of both sorted vocabularies.
for vocabulary in (src_char_sorted, tar_char_sorted):
    print(vocabulary[45:75])

80 104
['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x']


In [6]:
# Map each character to its 1-based position in the sorted vocabulary
# (no character is assigned index 0).
src_index = {char: position for position, char in enumerate(src_char_sorted, start=1)}
tar_index = {char: position for position, char in enumerate(tar_char_sorted, start=1)}

print(src_index)
print(tar_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, 'é': 76, 'ï': 77, '’': 78, '€': 79}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 3

In [7]:
# Encode every sentence as a list of character indices for the encoder and decoder.
encoder_input = [[src_index[char] for char in line] for line in df.src]
decoder_input = [[tar_index[char] for char in line] for line in df.tar]

# The decoder target is the decoder input with its first element (the <sos>
# token) dropped, so slice the already-encoded sequences instead of encoding
# the target column a second time.
decoder_target = [encoded[1:] for encoded in decoder_input]

print(encoder_input[:5])
print(decoder_input[:5]) # each sequence starts with <sos> ("\t", index 1 in tar_index) and ends with <eos> ("\n", index 2)
print(decoder_target[:5]) # same sequences with the leading <sos> token removed

[[30, 64, 10], [30, 64, 10], [30, 64, 10], [30, 64, 10], [31, 58, 10]]
[[1, 3, 48, 52, 3, 4, 3, 2], [1, 3, 39, 52, 69, 54, 59, 56, 14, 3, 2], [1, 3, 31, 65, 3, 69, 66, 72, 71, 56, 3, 4, 3, 2], [1, 3, 28, 66, 72, 58, 56, 3, 4, 3, 2], [1, 3, 45, 52, 63, 72, 71, 3, 4, 3, 2]]
[[3, 48, 52, 3, 4, 3, 2], [3, 39, 52, 69, 54, 59, 56, 14, 3, 2], [3, 31, 65, 3, 69, 66, 72, 71, 56, 3, 4, 3, 2], [3, 28, 66, 72, 58, 56, 3, 4, 3, 2], [3, 45, 52, 63, 72, 71, 3, 4, 3, 2]]


In [8]:
# Longest encoded sequence on each side; every sequence is padded up to it.
max_src_len = max(len(line) for line in encoder_input)
max_tar_len = max(len(line) for line in decoder_input)

print(max_src_len)
print(max_tar_len)


def _pad_post(sequences, length):
    # Pad at the end of each sequence up to `length`.
    return pad_sequences(sequences, maxlen=length, padding='post')


encoder_input_pad = _pad_post(encoder_input, max_src_len)
decoder_input_pad = _pad_post(decoder_input, max_tar_len)
# The target uses the decoder-input length so both decoder arrays have the same number of time steps.
decoder_target_pad = _pad_post(decoder_target, max_tar_len)

22
76


In [9]:
# one-hot encoding
# `num_classes` is passed explicitly so the last axis always equals the
# `src_max` / `tar_max` sizes (the same sizes used for the model's Input shapes
# and output layer), instead of being inferred from the largest index that
# happens to be present in each array.
encoder_input_onehot = to_categorical(encoder_input_pad, num_classes=src_max)
decoder_input_onehot = to_categorical(decoder_input_pad, num_classes=tar_max)
decoder_target_onehot = to_categorical(decoder_target_pad, num_classes=tar_max)

# (number of sentences, max length of sentence, number of char indices)
print(encoder_input_onehot.shape, decoder_input_onehot.shape, decoder_target_onehot.shape)

(60000, 22, 80) (60000, 76, 104) (60000, 76, 104)


# Modeling

## Teacher Forcing
### https://en.wikipedia.org/wiki/Teacher_forcing
 - A method for quickly and efficiently training recurrent neural network models that use the ground truth from a prior time step as input.
 - Feeding observed sequence values (i.e. ground-truth samples) back into the RNN after each step, thus forcing the RNN to stay close to the ground-truth sequence.
 - It is trained to turn the target sequences into the same sequences but offset by one timestep in the future. -> decoder learns to generate targets[t+1...] with given targets[...t].
 - 'decoder_target' is 'decoder_input' offset by one timestep. Since <sos> token is removed from 'decoder_target', the length of 'decoder_target' is one less than 'decoder_input'.
 - For example, "[1, 3, 48, 52, 3, 4, 3, 2] -> [3, 48, 52, 3, 4, 3, 2]" : (context vector + 1 -> 3), (context vector + 1 + 3 -> 48), (context vector + 1 + 3 + 48 -> 52) ... and so on.

In [10]:
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

In [11]:
# Training-time encoder-decoder graph. The layer objects created here
# (e_input, d_input, decoder_lstm, decoder_softmax) and encoder_states are
# reused by the inference cell, so they are kept as notebook-level names.

# hyper-parameters
latent_dim = 256 # number of LSTM units (size of the hidden and cell states)

# encoder
# Input: one-hot source sequences; the time dimension is left unspecified (None).
e_input = Input(shape=(None, src_max))
encoder_lstm = LSTM(units=latent_dim, return_state=True)
# With return_state=True the layer returns its output followed by the hidden state and the cell state.
encoder_output, state_h, state_c = encoder_lstm(e_input)
encoder_states = [state_h, state_c] # hidden state + cell state; used as the context vector

# decoder
d_input = Input(shape=(None, tar_max)) # teacher-forcing input: one-hot target sequences
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states; its own returned states are not needed for training.
decoder_outputs, _, _ = decoder_lstm(d_input, initial_state=encoder_states)
# Softmax over the tar_max target classes at every time step.
decoder_softmax = Dense(tar_max, activation='softmax')
decoder_outputs = decoder_softmax(decoder_outputs)

# model: (encoder input, decoder input) -> per-step distribution over target classes
model = Model(inputs=[e_input, d_input], outputs=decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None, 80)]           0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None, 104)]          0         []                            
                                                                                                  
 lstm (LSTM)                 [(None, 256),                345088    ['input_1[0][0]']             
                              (None, 256),                                                        
                              (None, 256)]                                                        
                                                                                              

### LSTM 
 - return_state : configuring a RNN layer to return a list where the first entry is the outputs and the next entries are the internal RNN states. This is used to recover the states of the encoder.
 - initial_state : initial state(s) of a RNN. This is used to pass the encoder states to the decoder as initial states.
 - return_sequences : configuring a RNN to return its full sequence of outputs (instead of just the last output, which is the default behavior). This is used in the decoder.

### Context Vector
 - The context vector is the final hidden state of the encoder. (컨텍스트 벡터는 사실 인코더에서의 마지막 RNN 셀의 은닉 상태값을 말하는 것이며, 이는 입력 문장의 모든 단어 토큰들의 정보를 요약해서 담고 있다.)
 - The encoder hidden state is used as the initial hidden state of the decoder.

### Decoder
 - The decoder is trained to predict the next token in the sequence (a single character in this notebook, since the model is character-level) given the previous token(s) and the context vector.
 - 디코더의 첫번째 RNN 셀은 이 첫번째 은닉 상태의 값(Context Vector) + 현재 t에서의 입력값인 <sos>로부터 다음에 등장할 단어를 예측.
 - 예측된 단어는 다음 시점인 t+1 RNN에서의 입력값이 되고, 이 t+1에서의 RNN 또한 이 입력값 + t에서의 은닉 상태(hidden state)로 다음에 등장할 단어를 예측.

In [12]:
# Fit with teacher forcing: the decoder is fed the target sequences as input
# and is scored against `decoder_target_onehot`; 20% of the samples are held
# out for validation.
history = model.fit(
    x=[encoder_input_onehot, decoder_input_onehot],
    y=decoder_target_onehot,
    batch_size=64,
    epochs=1,
    validation_split=0.2,
)



# Inferencing
- not using model from training phase but using trained layers.

In [30]:
# Inference-time models, assembled from the layers defined for training.

# encoder: maps an input sequence to the encoder states [state_h, state_c]
inf_encoder = Model(inputs=e_input, outputs=encoder_states)

# decoder
# Input tensors that receive the decoder states from the previous time step.
inf_decoder_state_input_h = Input(shape=(latent_dim,))
inf_decoder_state_input_c = Input(shape=(latent_dim,))
inf_decoder_state_inputs = [inf_decoder_state_input_h, inf_decoder_state_input_c]
# Run the decoder LSTM from the supplied states (see 'decode_sequence').
# `decoder_lstm` is the same layer object used in the training model, not a
# newly created LSTM, so it is the layer that was fitted.
inf_decoder_output, inf_state_h, inf_state_c = decoder_lstm(d_input, initial_state=inf_decoder_state_inputs)
# Updated states, returned so the caller can feed them into the next step.
inf_decoder_states = [inf_state_h, inf_state_c]
# softmax layer: `decoder_softmax` is likewise the layer object from the training model.
inf_decoder_output = decoder_softmax(inf_decoder_output)

# Inputs: [decoder input, state_h, state_c]; outputs: [softmax output, new state_h, new state_c].
inf_decoder = Model(inputs=[d_input] + inf_decoder_state_inputs, outputs=[inf_decoder_output] + inf_decoder_states)

inf_decoder.summary()

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, None, 104)]          0         []                            
                                                                                                  
 input_13 (InputLayer)       [(None, 256)]                0         []                            
                                                                                                  
 input_14 (InputLayer)       [(None, 256)]                0         []                            
                                                                                                  
 lstm_1 (LSTM)               [(None, None, 256),          369664    ['input_2[0][0]',             
                              (None, 256),                           'input_13[0][0]',     

### Functional API
 - Functional API is a way to create models that is more flexible than the tf.keras.Sequential API.
### Why use existing model layer from training? e.g., 'decoder_lstm', 'decoder_softmax'
 - The reason is that the inference model must use the same weights as the trained model. If we defined new layers (e.g. decoder_lstm_2, decoder_softmax_2), they would start with fresh, untrained weights, so the weights learned during training would not be used and the training and inference phases would run with different weights.
### Encoder
 - encoder_inputs와 encoder_states는 훈련 과정에서 이미 정의한 것들을 재사용합니다. 이렇게 되면 훈련 단계에 encoder_inputs와 encoder_states 사이에 있는 모든 층까지 전부 불러오게 되므로 결과적으로 훈련 단계에서 사용한 인코더를 그대로 재사용.

In [31]:
# Reverse lookups (index -> character), used to turn predicted indices back into text.
idx_src = {position: char for char, position in src_index.items()}
idx_tar = {position: char for char, position in tar_index.items()}

print(idx_src)
print(idx_tar)

{1: ' ', 2: '!', 3: '"', 4: '$', 5: '%', 6: '&', 7: "'", 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: '?', 24: 'A', 25: 'B', 26: 'C', 27: 'D', 28: 'E', 29: 'F', 30: 'G', 31: 'H', 32: 'I', 33: 'J', 34: 'K', 35: 'L', 36: 'M', 37: 'N', 38: 'O', 39: 'P', 40: 'Q', 41: 'R', 42: 'S', 43: 'T', 44: 'U', 45: 'V', 46: 'W', 47: 'X', 48: 'Y', 49: 'Z', 50: 'a', 51: 'b', 52: 'c', 53: 'd', 54: 'e', 55: 'f', 56: 'g', 57: 'h', 58: 'i', 59: 'j', 60: 'k', 61: 'l', 62: 'm', 63: 'n', 64: 'o', 65: 'p', 66: 'q', 67: 'r', 68: 's', 69: 't', 70: 'u', 71: 'v', 72: 'w', 73: 'x', 74: 'y', 75: 'z', 76: 'é', 77: 'ï', 78: '’', 79: '€'}
{1: '\t', 2: '\n', 3: ' ', 4: '!', 5: '"', 6: '$', 7: '%', 8: '&', 9: "'", 10: '(', 11: ')', 12: ',', 13: '-', 14: '.', 15: '0', 16: '1', 17: '2', 18: '3', 19: '4', 20: '5', 21: '6', 22: '7', 23: '8', 24: '9', 25: ':', 26: '?', 27: 'A', 28: 'B', 29: 'C', 30: 'D', 31: 'E', 32: 'F', 33: 'G', 34: 'H

In [32]:
def decode_sequence(input_seq):
    """Greedily decode one encoded source sentence into target characters.

    Uses the notebook-level objects `inf_encoder`, `inf_decoder`, `tar_max`,
    `tar_index`, `idx_tar` and `max_tar_len`.

    Args:
        input_seq: Encoder input, passed unchanged to `inf_encoder.predict`.

    Returns:
        The decoded string. Decoding stops after the end-of-sequence character
        (a newline) has been appended, or once the string is longer than
        `max_tar_len` characters. If the model predicts an index that has no
        character in `idx_tar` (index 0, the padding slot), this is treated as
        end of sequence: a newline is appended and decoding stops, instead of
        raising KeyError.
    """
    # Get the encoder states for the input; they seed the decoder.
    # list(...) guarantees that `[target_seq] + states_value` below is list concatenation.
    states_value = list(inf_encoder.predict(input_seq, verbose=0))

    # Build the one-hot vector for <sos>.
    target_seq = np.zeros((1, 1, tar_max))
    target_seq[0, 0, tar_index['\t']] = 1.

    decoded_sentence = ""

    while True:
        # Use the states from the previous step as the initial states of this step.
        output_tokens, h, c = inf_decoder.predict([target_seq] + states_value, verbose=0)

        # Pick the most probable class at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = idx_tar.get(sampled_token_index)

        if sampled_char is None:
            # No character is mapped to this index (padding slot): end the sentence here.
            decoded_sentence += '\n'
            break

        # Append the predicted character to the output sentence.
        decoded_sentence += sampled_char

        # Stop at <eos> or once the maximum length has been exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > max_tar_len:
            break

        # Feed the predicted character back in as the next decoder input.
        target_seq = np.zeros((1, 1, tar_max))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the current states over to the next step.
        states_value = [h, c]

    return decoded_sentence

In [33]:
sample_indices = (3, 50, 100, 300, 1001)  # row indices of the sentences to translate
for seq_index in sample_indices:
    # A one-element slice keeps the leading (batch) axis of the array.
    input_seq = encoder_input_onehot[seq_index:seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    reference = df.tar[seq_index]
    print("-" * 35)
    print('입력 문장:', df.src[seq_index])
    print('정답 문장:', reference[2:-1])  # printed without the leading '\t ' and the trailing '\n'
    print('번역 문장:', decoded_sentence[1:-1])  # printed without its first and last character (the last is normally '\n')

-----------------------------------
입력 문장: Go.
정답 문장: Bouge ! 
번역 문장: Tom est en coune de mont ? 
-----------------------------------
입력 문장: Hello!
정답 문장: Bonjour ! 
번역 문장: Tom est an de mon de le mont ? 
-----------------------------------
입력 문장: Got it!
정답 문장: J'ai pigé ! 
번역 문장: La sous en parde ! 
-----------------------------------
입력 문장: Go home.
정답 문장: Rentre à la maison. 
번역 문장: Tom est an de mon de le mont ? 
-----------------------------------
입력 문장: Get going.
정답 문장: En avant. 
번역 문장: La sous de monte de paster. 
