## Data Preprocessing

In [1]:
import pandas as pd
lines = pd.read_csv('fra.txt', names = ['src','tar','cc'], sep='\t')
len(lines)

175623

In [2]:
lines = lines.drop(['cc'], axis = 1)
lines = lines[0:60000]
lines.sample(10)

Unnamed: 0,src,tar
48583,I'll never deceive you.,Je ne te décevrai jamais.
35461,I saw you snickering.,Je vous ai vues pouffer.
54344,I had no one to talk to.,Je n'avais personne à qui parler.
1911,Be prepared.,Sois préparé !
13826,Everybody stayed.,Tout le monde est resté.
2216,I hate dogs.,Je hais les chiens.
21101,This is my mother.,C'est ma maman.
7907,I hate spinach.,Je déteste les épinards.
14255,How do I do that?,Comment je fais ça ?
22268,You're a disgrace.,Vous êtes une honte.


In [3]:
lines.tar = lines.tar.apply(lambda x : '\t ' + x + ' \n')
lines.sample(10)

Unnamed: 0,src,tar
51012,Tom thinks it's stupid.,\t Tom trouve que c'est stupide. \n
54842,I saw tears in his eyes.,\t J'ai vu des larmes dans ses yeux. \n
43480,She wore a pretty hat.,\t Elle portait un joli chapeau. \n
41402,I hated history class.,\t J'ai détesté les cours d'histoire. \n
55414,"I'm busy, so I can't go.","\t Je suis occupé, je ne peux donc y aller. \n"
26893,What color is this?,\t De quelle couleur est-ce ? \n
13182,You look gloomy.,\t Vous avez l'air triste. \n
13240,You're arrogant.,\t Vous êtes arrogantes. \n
4744,You're early.,\t Tu es matinal. \n
35362,I need another drink.,\t J'ai besoin d'un autre verre. \n


In [4]:
src_vocab = set()
for line in lines.src:
    for char in line:
        src_vocab.add(char)

tar_vocab = set()
for line in lines.tar:
    for char in line:
        tar_vocab.add(char)

In [5]:
src_vocab_size = len(src_vocab)+1
tar_vocab_size = len(tar_vocab)+1
print(src_vocab_size)
print(tar_vocab_size)

79
106


In [6]:
src_vocab = sorted(list(src_vocab))
tar_vocab = sorted(list(tar_vocab))
print(src_vocab[45:75])
print(tar_vocab[45:75])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']


In [7]:
src_to_index = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_index = dict([(word, i+1) for i, word in enumerate(tar_vocab)])
print(src_to_index)
print(tar_to_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, 'é': 76, '’': 77, '€': 78}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 3

In [8]:
encoder_input = []
for line in lines.src:
    temp_X = []
    for w in line:
        temp_X.append(src_to_index[w])
    encoder_input.append(temp_X)
print(encoder_input[:5])

[[30, 64, 10], [31, 58, 10], [31, 58, 10], [41, 70, 63, 2], [41, 70, 63, 2]]


In [9]:
decoder_input = []
for line in lines.tar:
    temp_X = []
    for w in line:
        temp_X.append(tar_to_index[w])
    decoder_input.append(temp_X)
print(decoder_input[:5])

[[1, 3, 48, 53, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 14, 3, 2], [1, 3, 29, 67, 73, 70, 71, 105, 4, 3, 2], [1, 3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]


In [10]:
decoder_target = []
for line in lines.tar:
    t = 0
    temp_X = []
    for w in line:
        if t > 0 :
            temp_X.append(tar_to_index[w])
        t = t + 1
    decoder_target.append(temp_X)
print(decoder_target[:5])

[[3, 48, 53, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 14, 3, 2], [3, 29, 67, 73, 70, 71, 105, 4, 3, 2], [3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]


In [11]:
max_src_len = max([len(line) for line in lines.src])
max_tar_len = max([len(line) for line in lines.tar])
print(max_src_len)
print(max_tar_len)

25
76


In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, padding='post')
decoder_target= pad_sequences(decoder_target, maxlen=max_tar_len, padding='post')

In [13]:
from tensorflow.keras.utils import to_categorical
encoder_input = to_categorical(encoder_input)
decoder_input = to_categorical(decoder_input)
decoder_target = to_categorical(decoder_target)

## Teacher Forcing

디코더에서 이전 셀의 예측 결과를 다음 단계에 인풋으로 넣는 것이 아니라 이전 셀의 실제 값을 다음 단계에 인풋으로 넣어 학습의 정확도를 높여준다.

## seq2seq 기계 번역기 훈련시키기

In [14]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(units=256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

In [15]:
decoder_inputs = Input(shape=(None, tar_vocab_size))
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
decoder_outputs, _, _=decoder_lstm(decoder_inputs, initial_state=encoder_states)

decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

In [16]:
model.fit(x=[encoder_input, decoder_input], y=decoder_target, batch_size=64, epochs=50, validation_split=0.2)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run GatherV2: Dst tensor is not initialized. [Op:GatherV2]

In [None]:
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states)

In [None]:
#이전 시점의 상태들을 저장하는 텐서
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, 
                                                 initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

In [None]:
decoder_outputs = decoder_softmax_layer(decoder_outputs)
decoder_model = Model(inputs = [decoder_inputs] + decoder_states_inputs, 
                      outputs = [decoder_outputs] + decoder_states)

In [None]:
index_to_src = dict((i,char) for char, i in src_to_index.items())
index_to_tar = dict((i,char) for char, i in tar_to_index.items())

In [None]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1,tar_vocab_size))
    target_seq[0,0,tar_to_index['\t']] = 1.
    
    stop_condition = False
    decoded_sentence = ""
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = index_to_tar[sampled_token_index]
        
        decoded_sentence += sampled_char
        
        if(sampled_char == '\n' or len(decoded_sentence) > max_tar_len):
            stop_condition = True
            
        target_seq = np.zeros((1,1,tar_vocab_size))
        target_seq[0,0,sampled_token_index] = 1.
        
        states_value = [h, c]
        
    return decoded_sentence

In [None]:
import numpy as np
for seq_index in [3, 50, 100, 300, 1001]:
    input_seq = encoder_input[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(35 * "-")
    print('입력 문장:', lines.src[seq_index])
    print('정답 문장:', lines.tar[seq_index][1:len(lines.tar[seq_index])-1])
    print('번역기가 번역한 문장:', decoded_sentence[:len(decoded_sentence)-1])

In [None]:
sent = ['I want a cat.']
input_sent = []
for line in sent:
    for w in line:
        input_sent.append(src_to_index[w])
input_sent = pad_sequences([input_sent], maxlen=max_src_len, padding='post')
inputSent = to_categorical(input_sent, 79)
decoded_sent = decode_sequence(inputSent)
print('입력 문장: '+sent[0])
print('번역기가 번역한 문장:', decoded_sent[:len(decoded_sent)-1])