## Data Preprocessing

In [1]:
import pandas as pd
# Read the tab-separated sentence-pair file; its three columns are named
# 'src' (source sentence), 'tar' (target sentence) and 'cc'.
lines = pd.read_csv(
    'fra.txt',
    sep='\t',
    names=['src', 'tar', 'cc'],
)
len(lines)

175623

In [2]:
# Discard the unused 'cc' column and keep only the first 60,000 sentence pairs.
lines = lines.drop(columns=['cc']).iloc[:60000]
lines.sample(10)

Unnamed: 0,src,tar
5694,I was curious.,J'étais curieuse.
39275,You're conscientious.,Vous êtes consciencieux.
33368,Are you with someone?,Êtes-vous avec quelqu'un ?
19360,I seem to be lost.,Il semble que je sois perdu.
36546,It's really horrible.,C'est vraiment horrible.
22395,You're the oldest.,C'est toi le plus vieux.
15796,No one asked you.,Personne ne te l'a demandé.
31556,They can't stop you.,Ils ne peuvent pas vous arrêter.
47720,I had to get some help.,J'ai dû obtenir de l'aide.
30287,Is that all of them?,Est-ce là la totalité d'entre elles ?


In [3]:
# Surround every target sentence with a tab (start symbol) and a newline
# (end symbol), each separated from the text by a single space.
lines['tar'] = lines['tar'].map(lambda sentence: '\t ' + sentence + ' \n')
lines.sample(10)

Unnamed: 0,src,tar
10971,I like your tie.,\t J'aime votre cravate. \n
23896,I hate goat cheese.,\t Je déteste le fromage de chèvre. \n
3063,Will you go?,\t Vous y rendrez-vous ? \n
28676,He took off his hat.,\t Il retira son chapeau. \n
42593,Is Mary a real blonde?,\t Est-ce que Marie est une vraie blonde? \n
22767,Did you find a job?,\t As-tu trouvé du travail ? \n
35119,I hear Tom hates you.,\t On m'a dit que Tom te détestait. \n
57835,We depend on each other.,\t Nous dépendons les uns des autres. \n
50453,They hid in the cellar.,\t Ils se sont cachés dans la cave. \n
36347,Is this all for real?,\t Tout ceci est-il réel ? \n


In [4]:
# Collect every distinct character that occurs in the source sentences ...
src_vocab = {char for line in lines.src for char in line}

# ... and in the (marker-wrapped) target sentences.
tar_vocab = {char for line in lines.tar for char in line}

In [5]:
# One extra slot on top of the number of distinct characters: character
# indices are assigned starting from 1, which leaves index 0 free.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)
print(src_vocab_size, tar_vocab_size, sep='\n')

79
106


In [6]:
# Turn each character set into a sorted list so that the order of the
# characters is fixed, then peek at a slice of each list.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)
for vocab in (src_vocab, tar_vocab):
    print(vocab[45:75])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']


In [7]:
# Map every character to an integer index, numbering from 1 in sorted order.
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}
print(src_to_index)
print(tar_to_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, 'é': 76, '’': 77, '€': 78}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 3

In [8]:
# Integer-encode each source sentence character by character.
encoder_input = [
    [src_to_index[char] for char in sentence]
    for sentence in lines.src
]
print(encoder_input[:5])

[[30, 64, 10], [31, 58, 10], [31, 58, 10], [41, 70, 63, 2], [41, 70, 63, 2]]


In [9]:
# Integer-encode each target sentence (start and end symbols included)
# character by character; this is what the decoder receives as input.
decoder_input = [
    [tar_to_index[char] for char in sentence]
    for sentence in lines.tar
]
print(decoder_input[:5])

[[1, 3, 48, 53, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 14, 3, 2], [1, 3, 29, 67, 73, 70, 71, 105, 4, 3, 2], [1, 3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]


In [10]:
# The decoder's expected output is the target sentence without its first
# character (the tab start symbol), i.e. the input shifted left by one step.
decoder_target = [
    [tar_to_index[char] for char in sentence[1:]]
    for sentence in lines.tar
]
print(decoder_target[:5])

[[3, 48, 53, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 14, 3, 2], [3, 29, 67, 73, 70, 71, 105, 4, 3, 2], [3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]


In [11]:
# Longest source / target sentence, measured in characters; used as the
# padding length for the encoded sequences.
max_src_len = max(map(len, lines.src))
max_tar_len = max(map(len, lines.tar))
print(max_src_len, max_tar_len, sep='\n')

25
76


In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Right-pad every encoded sentence to the longest sentence on its side:
# source sequences to max_src_len, both target-side arrays to max_tar_len.
encoder_input = pad_sequences(encoder_input, padding='post', maxlen=max_src_len)
decoder_input, decoder_target = (
    pad_sequences(sequences, padding='post', maxlen=max_tar_len)
    for sequences in (decoder_input, decoder_target)
)

In [13]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the padded index arrays.
# The number of classes is passed explicitly so the last dimension always
# equals the vocabulary size the model's Input layers are declared with,
# rather than being inferred from the largest index present in the data.
encoder_input = to_categorical(encoder_input, num_classes=src_vocab_size)
decoder_input = to_categorical(decoder_input, num_classes=tar_vocab_size)
decoder_target = to_categorical(decoder_target, num_classes=tar_vocab_size)

## Teacher Forcing

교사 강요(Teacher Forcing)는 훈련 시 디코더의 각 시점에 이전 시점의 예측 결과를 입력으로 넣는 대신, 이전 시점의 실제 정답 값을 입력으로 넣는 방법이다. 앞 시점의 잘못된 예측이 이후 시점으로 연쇄적으로 전파되는 것을 막아 학습을 더 빠르고 안정적으로 만들어 준다.

## seq2seq 기계 번역기 훈련시키기

In [14]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

# Encoder: an LSTM over one-hot source characters. Only its final hidden and
# cell states are kept; they are handed to the decoder as its initial state.
encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, *encoder_states = encoder_lstm(encoder_inputs)
state_h, state_c = encoder_states

W0531 00:20:25.295560 16484 deprecation.py:506] From c:\users\yunja_kuj61s9\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [15]:
# Decoder: an LSTM over one-hot target characters that starts from the
# encoder's final states and emits an output at every time step.
decoder_inputs = Input(shape=(None, tar_vocab_size))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
# Only the per-step outputs are needed for training; the returned states are ignored.
decoder_hidden_seq = decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]

# Softmax over the target vocabulary at every time step.
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_hidden_seq)

# Training model: (source sequence, target input sequence) -> per-step character distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [17]:
# Train with teacher forcing: the decoder is fed the true target sequence and
# must predict the same sequence shifted by one step. 20% of the samples are
# held out for validation.
model.fit(
    x=[encoder_input, decoder_input],
    y=decoder_target,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
)

W0531 00:20:36.895695 16484 deprecation.py:323] From c:\users\yunja_kuj61s9\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 48000 samples, validate on 12000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x2ec0f549d68>

In [18]:
# Inference-time encoder: maps a source sequence to the LSTM's final [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)

In [19]:
# Tensors that hold the decoder's states from the previous time step.
# Their width is read from the decoder LSTM itself instead of repeating the
# literal 256, so it cannot drift apart from the layer definition.
state_size = decoder_lstm.units
decoder_state_input_h = Input(shape=(state_size,))
decoder_state_input_c = Input(shape=(state_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Re-use the trained decoder LSTM, but start it from the externally supplied
# states and keep the states it returns so they can be fed into the next step.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,
                                                 initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

In [20]:
# Inference-time decoder: takes the current input plus the previous [h, c]
# states and returns the character distribution plus the updated states.
decoder_outputs = decoder_softmax_layer(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

In [21]:
# Inverse lookups: integer index -> character.
index_to_src = {idx: char for char, idx in src_to_index.items()}
index_to_tar = {idx: char for char, idx in tar_to_index.items()}

In [22]:
def decode_sequence(input_seq):
    """Greedily decode one encoded source sequence into a target string.

    The encoder's states seed the decoder, which is then run one step at a
    time starting from the tab start symbol. At every step the most probable
    character is appended to the result and fed back as the next input.

    Args:
        input_seq: Encoder input, passed unchanged to ``encoder_model.predict``.

    Returns:
        The decoded characters as a string, including the last sampled
        character (a newline when decoding stopped on the end symbol).
        Decoding also stops once the result is longer than ``max_tar_len``.
    """
    def one_hot_step(token_index):
        # A single-step, single-sample one-hot decoder input.
        step = np.zeros((1, 1, tar_vocab_size))
        step[0, 0, token_index] = 1.
        return step

    states_value = encoder_model.predict(input_seq)
    target_seq = one_hot_step(tar_to_index['\t'])
    decoded_sentence = ""

    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the highest-probability character of the last step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = index_to_tar[sampled_token_index]
        decoded_sentence += sampled_char

        if sampled_char == '\n' or len(decoded_sentence) > max_tar_len:
            break

        # Feed the chosen character and the updated states into the next step.
        target_seq = one_hot_step(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

In [23]:
import numpy as np
# Translate a handful of training sentences and print each one next to its
# reference translation.
separator = "-" * 35
for seq_index in (3, 50, 100, 300, 1001):
    input_seq = encoder_input[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    reference = lines.tar[seq_index]
    print(separator)
    print('입력 문장:', lines.src[seq_index])
    # Strip the leading tab and trailing newline symbols from the reference.
    print('정답 문장:', reference[1:-1])
    # Drop the last decoded character (the newline end symbol).
    print('번역기가 번역한 문장:', decoded_sentence[:-1])

-----------------------------------
입력 문장: Run!
정답 문장:  Cours ! 
번역기가 번역한 문장:  Courez ! 
-----------------------------------
입력 문장: I lied.
정답 문장:  J'ai menti. 
번역기가 번역한 문장:  J'ai disposé. 
-----------------------------------
입력 문장: Come in.
정답 문장:  Entre. 
번역기가 번역한 문장:  Entrez ! 
-----------------------------------
입력 문장: I did OK.
정답 문장:  Je m'en suis bien sortie. 
번역기가 번역한 문장:  Je m'en suis bien sortie. 
-----------------------------------
입력 문장: We're sad.
정답 문장:  Nous sommes tristes. 
번역기가 번역한 문장:  Nous sommes en train de se passer. 


In [24]:
# Translate a sentence that is not taken from the training data.
sent = ['I want a cat.']

# Integer-encode the sentence character by character with the source mapping.
input_sent = [src_to_index[char] for line in sent for char in line]

# Pad and one-hot encode exactly like the training data. The class count comes
# from src_vocab_size instead of a hard-coded 79, so the encoding keeps
# matching the encoder's input width if the vocabulary ever changes.
input_sent = pad_sequences([input_sent], maxlen=max_src_len, padding='post')
inputSent = to_categorical(input_sent, num_classes=src_vocab_size)

decoded_sent = decode_sequence(inputSent)
print('입력 문장: '+sent[0])
# Drop the last decoded character (the newline end symbol) before printing.
print('번역기가 번역한 문장:', decoded_sent[:len(decoded_sent)-1])

입력 문장: I want a cat.
번역기가 번역한 문장:  Je veux du lait. 
