In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re

In [32]:
english = []
korean = []
count = 50
with open('korean-english-park.train.en', 'r', encoding='utf8') as f:
    for i, line in enumerate(f):
        english.append(line)
        if i-1 == count:
            break

with open('korean-english-park.train.ko', 'r', encoding='utf8') as f:
    for i, line in enumerate(f):
        korean.append(line)
        if i-1 == count:
            break

In [33]:
for i in range(len(english)):
    english[i] = re.sub('\n', '', english[i])
for i in range(len(korean)):
    korean[i] = re.sub('\n', '', korean[i])

In [34]:
for i in range(len(english)):
    english[i] = english[i].split()
for i in range(len(korean)):
    korean[i] = korean[i].split()

In [35]:
seq_data = np.stack((english, korean), 1)

In [36]:
word_arr = []
for seq in english:
    word_arr += seq
word_arr += ['<P>']
en_word2num = {c:i for i, c in enumerate(set(word_arr))}
en_num2word = {i:c for i, c in enumerate(en_word2num.keys())}

In [37]:
word_arr = []
for seq in korean:
    word_arr += seq
word_arr += ['<S>', '</S>', '<P>']
ko_word2num = {c:i for i, c in enumerate(set(word_arr))}
ko_num2word = {i:c for i, c in enumerate(ko_word2num.keys())}

In [38]:
def get_max_length(seq_data):
    max_len = 0
    for seq in seq_data:
        if max_len < len(seq):
            max_len = len(seq)
    return max_len

In [39]:
def make_batch(seq_data, enc_max_len, dec_max_len):
    input_batch = []
    output_batch = []
    target_batch = []
    for i, seq in enumerate(seq_data):
        input = []
        output = []
        target = []
        for token in seq[0]:
            input.append(en_word2num[token])
        for _ in range(len(seq[0]), enc_max_len):
            input.append(en_word2num['<P>'])
        input_batch.append(input)
        output.append(ko_word2num['<S>'])
        for token in seq[1]:
            output.append(ko_word2num[token])
            target.append(ko_word2num[token])
        target.append(ko_word2num['</S>'])
        for _ in range(len(seq[1]), dec_max_len):
            output.append(ko_word2num['</S>'])
            target.append(ko_word2num['</S>'])
                
        output_batch.append(output)
        target_batch.append(target)

    return input_batch, output_batch, target_batch

In [40]:
def get_seq_length(seq_data):
    seq_len = []
    for i, seq in enumerate(seq_data):
        seq_len.append(len(seq))
    return seq_len

In [51]:
learning_rate = 1e-3
n_hidden = 128
max_enc_step = get_max_length(english)
max_dec_step = get_max_length(korean)
n_embedding = 100
total_epoch = 500
batch_size = count
en_dic_len = len(en_word2num)
ko_dic_len = len(ko_word2num)

In [52]:
tf.reset_default_graph()
enc_input = tf.placeholder(tf.int32, [None, max_enc_step])
dec_input = tf.placeholder(tf.int32, [None, max_dec_step+1])
targets = tf.placeholder(tf.int64, [None, max_dec_step+1])
W = tf.get_variable(name='encode_embedding', shape=[en_dic_len, n_embedding], trainable=True)
W_ = tf.get_variable(name='decode_embedding', shape=[ko_dic_len, n_embedding], trainable=True)
enc_seq_len = tf.placeholder(dtype=tf.int32, shape=[None])
dec_seq_len = tf.placeholder(dtype=tf.int32, shape=[None])
enc_inputs = tf.nn.embedding_lookup(W, enc_input)
dec_inputs = tf.nn.embedding_lookup(W_, dec_input)

In [53]:
with tf.variable_scope('encode'):
    enc_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
    #enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=0.5)
    outputs, enc_states = tf.nn.dynamic_rnn(enc_cell, enc_inputs, sequence_length=enc_seq_len, dtype=tf.float32)

In [54]:
with tf.variable_scope('decode'):
    dec_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
    #dec_cell = tf.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=0.5)
    outputs, dec_states = tf.nn.dynamic_rnn(dec_cell, dec_inputs, initial_state = enc_states, dtype=tf.float32)

In [55]:
logits = tf.layers.dense(outputs, ko_dic_len, activation=None)

In [56]:
cost = tf.reduce_mean(tf.contrib.seq2seq.sequence_loss(logits=logits, targets=targets, weights=tf.sequence_mask(dec_seq_len+1, max_dec_step+1, dtype=tf.float32)))
predict = tf.argmax(logits, 2)

In [57]:
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [58]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
total_batch = int(len(seq_data)/batch_size)
for epoch in range(total_epoch):
    loss_sum = 0
    #print('< epoch:', epoch+1, '>')
    for i in range(total_batch):
        if i == (total_batch-1):
            input_batch, output_batch, target_batch = make_batch(seq_data[i*batch_size:len(seq_data)], max_enc_step, max_dec_step)
            enc_seq_data = get_seq_length(english[i*batch_size:len(seq_data)])
            dec_seq_data = get_seq_length(korean[i*batch_size:len(seq_data)])
        else:
            input_batch, output_batch, target_batch = make_batch(seq_data[i*batch_size:(i+1)*batch_size], max_enc_step, max_dec_step)
            enc_seq_data = get_seq_length(english[i*batch_size:(i+1)*batch_size])
            dec_seq_data = get_seq_length(korean[i*batch_size:(i+1)*batch_size])
        
        _, loss = sess.run([optimizer, cost], feed_dict={enc_input: input_batch, dec_input: output_batch, targets: target_batch, enc_seq_len: enc_seq_data, dec_seq_len: dec_seq_data})
        loss_sum += loss
        #if i % 30 == 29:
        #    print('batch:', '%03d' % (i+1), 'cost =', '{:.6f}'.format(loss_sum/30))
        #    loss_sum = 0
    if epoch % 50 == 49:
        print('epoch:', '%03d' % (epoch+1), 'cost =', '{:.6f}'.format(loss_sum/50))
        
print('optimization finished!')

epoch: 050 cost = 0.104676
epoch: 100 cost = 0.062498
epoch: 150 cost = 0.031110
epoch: 200 cost = 0.017271
epoch: 250 cost = 0.009547
epoch: 300 cost = 0.005225
epoch: 350 cost = 0.002992
epoch: 400 cost = 0.001867
epoch: 450 cost = 0.001283
epoch: 500 cost = 0.000942
optimization finished!


In [59]:
def translate(english):
    english = [english]
    korean = [['<P>']*max_dec_step]
    seq_data = [english + korean]
    input_batch, output_batch, target_batch = make_batch(seq_data, max_enc_step, max_dec_step)
    enc_seq_data = get_seq_length(english)
    dec_seq_data = get_seq_length(korean)
    result = sess.run(predict, feed_dict={enc_input: input_batch, dec_input: output_batch, targets: target_batch, enc_seq_len: enc_seq_data, dec_seq_len: dec_seq_data})
    decoded = [ko_num2word[j] for j in np.squeeze(result)]
    end = len(decoded)-1
    if '</S>' in decoded:
        end = decoded.index('</S>')
    translated = ' '.join(decoded[:end])
    return translated

In [60]:
for i, seq in enumerate(english):
    temp = ''
    for token in seq:
        temp += token + ' '
    #print(temp)
    print(temp, '\n->', translate(seq), '\n')
    if i == 10:
        break

Much of personal computing is about "can you top this?"  
-> 개인용 컴퓨터 사용의 상당 상당 부분은 실바가 실바가 있느냐?" 

so a mention a few weeks ago about a rechargeable wireless optical mouse brought in another rechargeable, wireless mouse.  
-> 모든 광마우스와 마찬가지 로 광마우스도 책상 위에 위에 마우스 패드를 패드를 패드를 120대의 판매될 판매될 

Like all optical mice, But it also doesn't need a desk.  
-> 그러나 이것은 또한 책상도 필요로 필요로 않는다. 않는다. 않는다. 

uses gyroscopic sensors to control the cursor movement as you move your wrist, arm, whatever through the air.  
-> 79.95달러하는 79.95달러하는 최첨단 무선 광마우스는 허공에서 팔, 그외에 경우, 전체 전체 공학 시장의 250억달러에 달할 달할 라틴 

-> 정보 관리들은 관리들은 아시아에서의 선박들에 선박들에 선박들에 (테러) (테러) 돌아갔음을 돌아갔음을 밝혔으며, 밝혔으며, 해상 교역량의 거의 거의 거의 좁은 좁은 테러 테러 테러 테러 세계 

-> 이 지역에 있는 있는 선박과 선박과 선박들에 선박들에 알카에다의 알카에다의 핵심적인 중 중 실패했다는 실패했다는 실패했다는 크리스 크리스 민주당 마키는 마키는 후에, 태국에서의 대통령에게 대통령에게 연료용 연료용 1994 1994 1994 1994 

Caffeine can help increase reaction time and improve performance for military servicemen who must perform complex tasks or who need help staying alert for lon