# Sequence to Sequence Model

## Data loading & Word dictionary

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import sm_tool as tool

In [3]:
# Data loading: read the raw CSV through the project helper
# `tool.loading_data`, which returns three values unpacked below.
# They are later indexed with the same sample index (docNo[i], res[i], ...).
path = 'data/'
fileName = 'rawData_10000.csv'

docNo, res, code = tool.loading_data(path+fileName)

In [4]:
# Spacing correction: pass `res` and `code` through the project helper
# `tool.spellchecker`; the corrected versions get the `_s` suffix.
res_s=tool.spellchecker(res)
code_s=tool.spellchecker(code)

In [5]:
# make word dictionary - normalized
# Builds the two lookup tables (word -> index, index -> word) from both
# corrected inputs. The keyword options are forwarded to the sm_tool helper;
# NOTE(review): minlength/maxlength look like token-length bounds and
# jamo_delete like a filter for stray Korean jamo - confirm in sm_tool.
word_to_ix, ix_to_word = tool.make_dict_all_cut(res_s, code_s, minlength=0, maxlength=3, jamo_delete=True)

Data 갯수 : 5486, 단어 갯수 : 2819


In [None]:
# Display the word -> index table for inspection.
word_to_ix

In [None]:
# Display the index -> word table for inspection.
ix_to_word

## Transform the raw data to input form

In [5]:
# Length reported by the helper for the corrected `res_s` texts (45 in the
# recorded run); presumably the maximum number of words - confirm in sm_tool.
tool.check_doclength(res_s, sep=True) # only word (sep = True)

45

In [6]:
# Same check for the corrected `code_s` texts (7 in the recorded run); this
# value is reused as the decoder size when the inputs are built.
tool.check_doclength(code_s, sep=True) # only word (sep = True)

7

In [6]:
# make data inputs
# NOTE(review): encoder_size (10) is smaller than the length reported for
# res_s (45 in the recorded run) - presumably make_inputs truncates or pads
# to the requested sizes; confirm in sm_tool.
encoder_size = 10
decoder_size = tool.check_doclength(code_s, sep=True)

# Build the training inputs from the first 5000 samples only.
encoderinputs, decoderinputs, targets, targetweights= \
    tool.make_inputs(res_s[:5000], code_s[:5000],
                     word_to_ix, 
                     encoder_size=encoder_size, 
                     decoder_size=decoder_size,
                     shuffle=False)

## Model

In [7]:
# Hyper-parameters for the seq2seq model.
# `multi` must be a plain bool: the previous `multi= True,` (trailing comma)
# silently created the tuple `(True,)`, which only worked because a non-empty
# tuple is truthy.
multi = True                 # stack `num_layers` cells instead of a single cell
hidden_size = 100            # units per cell (also used as embedding size)
num_layers = 3               # number of stacked cells when `multi` is True
forward_only = False         # False: build the training graph
learning_rate = 0.01
batch_size = 5
voca_size = len(ix_to_word)  # vocabulary shared by encoder and decoder
use_LSTM = True              # True: LSTM cell, False: GRU cell

In [8]:
class seq2seq(object):
    """Embedding + attention sequence-to-sequence model (TensorFlow 1.x graph).

    The constructor builds the whole graph: one placeholder per encoder and
    decoder time step, the recurrent cell, the legacy attention seq2seq
    network, the projected logits and - in training mode only - the loss and
    the Adam training op.

    Args:
        multi: truthy to stack ``num_layers`` cells in a ``MultiRNNCell``,
            falsy to use a single cell.
        hidden_size: number of units per cell; also used as embedding size.
        num_layers: number of stacked cells when ``multi`` is truthy.
        forward_only: truthy builds an inference graph (the decoder is fed its
            own previous prediction, no loss / train op is created); falsy
            builds the training graph.
        learning_rate: learning rate, stored in a non-trainable variable and
            handed to the Adam optimizer.
        batch_size: fixed batch dimension of every placeholder.
        voca_size: vocabulary size used for both encoder and decoder symbols.
        encoder_size: number of encoder time steps (placeholders).
        decoder_size: number of decoder time steps (placeholders).
        use_LSTM: truthy for LSTM cells, falsy for GRU cells.
    """

    def __init__(self, multi, hidden_size, num_layers, forward_only,
                 learning_rate, batch_size,
                 voca_size, encoder_size, decoder_size, use_LSTM):

        # variables
        self.source_vocab_size = voca_size
        self.target_vocab_size = voca_size
        self.batch_size = batch_size
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)

        # Output projection (hidden state -> vocabulary logits). Because it is
        # passed to the seq2seq function, the network returns un-projected
        # outputs and the logits are computed explicitly below.
        W = tf.Variable(tf.random_normal([hidden_size, voca_size]))
        b = tf.Variable(tf.random_normal([voca_size]))
        output_projection = (W, b)

        # Inputs are word indices only (no one-hot encoding is applied);
        # one [batch_size] placeholder per time step.
        self.encoder_inputs = [tf.placeholder(tf.int32, [batch_size]) for _ in range(encoder_size)]
        self.decoder_inputs = [tf.placeholder(tf.int32, [batch_size]) for _ in range(decoder_size)]
        self.targets = [tf.placeholder(tf.int32, [batch_size]) for _ in range(decoder_size)]
        self.target_weights = [tf.placeholder(tf.float32, [batch_size]) for _ in range(decoder_size)]

        # Cell type: LSTM or GRU.
        cell_class = tf.contrib.rnn.LSTMCell if use_LSTM else tf.contrib.rnn.GRUCell

        def single_cell():
            return cell_class(num_units=hidden_size)

        # Every layer needs its own cell object: `[single_cell()] * num_layers`
        # would put the *same* instance in every layer, which (depending on
        # the TF 1.x release) either shares one set of weights or raises.
        if multi:
            cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])
        else:
            cell = single_cell()

        # Training and inference differ only in `feed_previous`, so the
        # network is built once. `tf.nn.seq2seq` no longer exists in TF >= 1.0;
        # the legacy implementation lives in `tf.contrib.legacy_seq2seq`.
        self.outputs, self.states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs=self.encoder_inputs,
            decoder_inputs=self.decoder_inputs,
            cell=cell,
            num_encoder_symbols=self.source_vocab_size,
            num_decoder_symbols=self.target_vocab_size,
            embedding_size=hidden_size,
            output_projection=output_projection,
            feed_previous=bool(forward_only))

        self.logits = [tf.matmul(output, W) + b for output in self.outputs]

        if not forward_only:
            # Weighted cross entropy per time step; each entry has shape
            # [batch_size], so the summed cost is a per-example loss vector.
            self.loss = [
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=logit) * target_weight
                for logit, target, target_weight in zip(self.logits, self.targets, self.target_weights)
            ]
            self.cost = tf.add_n(self.loss)
            # Use the stored variable so the rate the optimizer sees is the
            # one exposed on the model.
            self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

    def step(self, session, encoderinputs, decoderinputs, targets, targetweights, forward_only):
        """Run one training or inference step.

        Args:
            session: session object whose ``run(fetches, feed_dict)`` is called.
            encoderinputs: time-major encoder batch; ``encoderinputs[t]`` is
                fed to the t-th encoder placeholder.
            decoderinputs: time-major decoder batch, one entry per decoder step.
            targets: time-major target indices, one entry per decoder step.
            targetweights: time-major loss weights, one entry per decoder step.
            forward_only: falsy runs the training op and returns the cost;
                truthy only evaluates the logits.

        Returns:
            The evaluated ``self.cost`` when training, otherwise the list of
            evaluated logits (one per decoder step).

        Raises:
            IndexError: if a batch has fewer time steps than the model.
            AttributeError: if a training step is requested on a model that
                was built with ``forward_only`` set.
        """
        # Iterate over the model's own placeholders; the lengths used to be
        # taken from global variables that only existed in the training loop.
        input_feed = {}
        for t, placeholder in enumerate(self.encoder_inputs):
            input_feed[placeholder.name] = encoderinputs[t]
        for t, placeholder in enumerate(self.decoder_inputs):
            input_feed[placeholder.name] = decoderinputs[t]
            input_feed[self.targets[t].name] = targets[t]
            input_feed[self.target_weights[t].name] = targetweights[t]

        if forward_only:
            # One logits tensor exists per decoder step.
            return session.run(list(self.logits), input_feed)

        _, cost = session.run([self.train_op, self.cost], input_feed)
        return cost

In [9]:
# Build the model graph from the hyper-parameters and input sizes defined in
# the earlier cells (every constructor argument is passed by keyword).
model = seq2seq(multi=multi, hidden_size=hidden_size, num_layers=num_layers, forward_only=forward_only, learning_rate=learning_rate, 
                batch_size=batch_size, voca_size=voca_size, 
                encoder_size=encoder_size, decoder_size=decoder_size, use_LSTM=use_LSTM)

In [None]:
# Open a TensorFlow session and initialise every variable created while the
# model graph was built. The session stays open for the training cell below.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
# Training loop.
#
# One epoch slides a window of `batch_size` consecutive samples over the
# training inputs with stride 1. For every window the loop
#   1. runs a forward pass to record a prediction for the first sample,
#   2. runs one optimisation step on the same batch,
#   3. appends a row to `result_list`.
#
# NOTE(review): `result_list` grows by one row per step for the whole run
# (num_epochs * steps_per_epoch rows) - confirm this is affordable in memory.
num_epochs = 10000
steps_per_checkpoint = 10000
steps_per_epoch = len(encoderinputs) - batch_size + 1

# With fewer samples than one batch the inner loop would never run and the
# epoch counter used to stay put forever; fail loudly instead.
if steps_per_epoch < 1:
    raise ValueError('need at least batch_size (%d) training samples, got %d'
                     % (batch_size, len(encoderinputs)))

loss = 0.0
current_step = 1
result_list = []
epoch = 0
pad_ix = word_to_ix['<PAD>']

while epoch < num_epochs:

    # 1 epoch
    for start in range(steps_per_epoch):

        end = start + batch_size

        # Get a batch and make a step
        # make_batch : transposed array
        encoder_inputs, decoder_inputs, targets_, target_weights = tool.make_batch(
            encoderinputs[start:end],
            decoderinputs[start:end],
            targets[start:end],
            targetweights[start:end])

        # Overwrite decoder inputs 1 .. decoder_size-2 with <PAD>; the first
        # and the last decoder input are left as produced by make_batch.
        # NOTE(review): the padded inputs are also used for the training step
        # below, not only for the prediction - confirm this is intended.
        for i in range(1, decoder_size - 1):
            decoder_inputs[i] = np.array([pad_ix] * batch_size)

        # Forward pass only: logits for every decoder step.
        output_logits = model.step(sess, encoder_inputs, decoder_inputs, targets_, target_weights, True)

        # Decode the first sample of the batch (it corresponds to index `start`).
        predict = [np.argmax(logit, axis=1)[0] for logit in output_logits]
        predict = ' '.join([ix_to_word[ix] for ix in predict])

        real = [word[0] for word in targets_]
        real = ' '.join([ix_to_word[ix] for ix in real])

        # Optimisation step on the same batch; `loss` holds this step's mean
        # loss only, because it is reset at the end of every step.
        step_loss = model.step(sess, encoder_inputs, decoder_inputs, targets_, target_weights, False)
        loss += np.mean(step_loss)

        result_list.append([docNo[start], res[start], code_s[start], real, predict, loss])

        # Report progress every `steps_per_checkpoint` steps.
        if (current_step % steps_per_checkpoint) == 0:
            print('\n----step : %d----\n LOSS : %s \n docNo : %d \n 예측 : %s \n 손질한 정답 : %s \n 정답 : %s \n---------------\n'
                  % (current_step, loss, docNo[start], predict, real, code_s[start]))
        loss = 0.0
        current_step += 1

    # End of epoch: the inner loop always finishes a full pass, so the counter
    # advances unconditionally (the old `end == len(...)` test was always true).
    epoch += 1

print('Finished')


----step : 10000----
 LOSS : 1.37741792202 
 docNo : 14 
 예측 : 맛이 좋다 <E> 상큼하 <E> 상큼하 <E> 
 손질한 정답 : 맛이 좋다 <E> <PAD> <PAD> <PAD> <PAD> 
 정답 : 맛이 좋다 
---------------


----step : 20000----
 LOSS : 1.84513354301 
 docNo : 23 
 예측 : 맛이 좋다 <E> <E> <E> <E> <E> 
 손질한 정답 : 맛이 좋다 <E> <PAD> <PAD> <PAD> <PAD> 
 정답 : 맛이 좋다 
---------------


----step : 30000----
 LOSS : 7.27754354477 
 docNo : 37 
 예측 : 목 넘김이 부드럽 <E> <E> <E> <E> 
 손질한 정답 : 목 넘김이 부드럽 <E> <PAD> <PAD> <PAD> 
 정답 : 목 넘김이 부드럽다 
---------------


----step : 40000----
 LOSS : 8.06501960754 
 docNo : 50 
 예측 : 모름 <E> <E> <E> <E> <E> <E> 
 손질한 정답 : 모름 <E> <PAD> <PAD> <PAD> <PAD> <PAD> 
 정답 : 모름 
---------------


----step : 50000----
 LOSS : 1.93862628937 
 docNo : 73 
 예측 : 맛이 좋다 <E> 있다 <E> <E> <E> 
 손질한 정답 : 맛이 순하다 <E> <PAD> <PAD> <PAD> <PAD> 
 정답 : 맛이 순하다 
---------------


----step : 60000----
 LOSS : 6.36497783661 
 docNo : 88 
 예측 : 맛이 좋다 <E> 상큼하 <E> 상큼하 <E> 
 손질한 정답 : 풍미가 좋다 <E> <PAD> <PAD> <PAD> <PAD> 
 정답 : 풍미가 좋다 
--------------

In [None]:
# Collect the per-step rows recorded during training into a DataFrame; the
# column order matches the order in which each row was appended.
result=pd.DataFrame(result_list, columns=['no', 'res', 'code', 'real', 'predict', 'loss'])

In [None]:
# Display the collected results.
result

In [None]:
# Persist the results as a UTF-8 encoded CSV in the working directory.
# NOTE(review): the ':' in the file name is not a legal character on Windows
# file systems - confirm the target platform before reusing this name.
result.to_csv('result_seq2seqI_171115_LSTM(epoch:10000).csv', encoding='utf-8')