# seq2seq Model


In [0]:
!pip install --upgrade tensorflow-gpu==1.13.0rc1

In [0]:
import tensorflow as tf

# Show the version of the TensorFlow build that was actually imported
# (displayed as the cell output).
tf.__version__

In [0]:
# Authenticate the Colab user, then mount Google Drive under /content/gdrive
# so that the Drive folders used by later cells are reachable.
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/gdrive')

In [0]:
# Working directories, relative to the notebook's current directory.
# DATA_IN_PATH receives the input CSV; DATA_OUT_PATH holds generated output.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

In [0]:
!rm -rf data_in;rm -rf data_out;ls;

In [0]:
import os

# Create the working directories. exist_ok=True keeps the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test;
# it also matches how the checkpoint directory is created later on.
os.makedirs(DATA_IN_PATH, exist_ok=True)
os.makedirs(DATA_OUT_PATH, exist_ok=True)

### 데이터 존재 확인 (ChatBotData.csv)

In [0]:
!ls -al '/content/gdrive/My Drive/Colab Notebooks/Data/'

In [0]:
!ls -al '/content/gdrive/My Drive/Colab Notebooks/Seq2seq/'

### 데이터 받기

In [0]:
# -N lets wget overwrite an existing copy; without it a re-run saves the
# download as ChatBotData.csv.1 and the stale ChatBotData.csv keeps being used.
!wget -N -P '/content/gdrive/My Drive/Colab Notebooks/Data/' https://raw.githubusercontent.com/changwookjun/learningspoons/master/Data/ChatBotData.csv

### Seq2seq 폴더 만들고 소스 받기

In [0]:
# -N lets wget overwrite an existing copy; without it a re-run saves the
# download as configs.py.1 and the stale configs.py keeps being imported.
!wget -N -P '/content/gdrive/My Drive/Colab Notebooks/Seq2seq/' https://raw.githubusercontent.com/changwookjun/learningspoons/master/Seq2seq/configs.py

In [0]:
# -N lets wget overwrite an existing copy; without it a re-run saves the
# download as data.py.1 and the stale data.py keeps being imported.
!wget -N -P '/content/gdrive/My Drive/Colab Notebooks/Seq2seq/' https://raw.githubusercontent.com/changwookjun/learningspoons/master/Seq2seq/data.py

In [0]:
# -N lets wget overwrite an existing copy; without it a re-run saves the
# download as model.py.1 and the stale model.py keeps being imported.
!wget -N -P '/content/gdrive/My Drive/Colab Notebooks/Seq2seq/' https://raw.githubusercontent.com/changwookjun/learningspoons/master/Seq2seq/model.py

In [0]:
!ls -al '/content/gdrive/My Drive/Colab Notebooks/Seq2seq/'

In [0]:
!ls -al ./data_in/

In [0]:
!cp '/content/gdrive/My Drive/Colab Notebooks/Data/ChatBotData.csv' ./data_in/ChatBotData.csv;ls -al ./data_in/

### 필요한 패키지 Install

In [0]:
!pip install konlpy

In [0]:
!pip install tqdm

In [0]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from konlpy.tag import Okt
import re
import os
import sys

### Google Drive의 Seq2seq 폴더를 모듈 검색 경로(sys.path)에 추가한다.

In [0]:
# Put the Drive Seq2seq folder at the front of the module search path so the
# project modules imported below are resolved from that folder first.
sys.path.insert(0, '/content/gdrive/My Drive/Colab Notebooks/Seq2seq/')

import data
import configs as conf
import model as ml

### 사전 만들기

In [0]:
# Build/load the vocabulary. The three returned values are unpacked as two
# lookup tables and the vocabulary size (presumably token->index and
# index->token mappings -- confirm against data.load_vocabulary).
char2idx,  idx2char, vocabulary_length = data.load_vocabulary()

In [0]:
# Print the vocabulary size returned by data.load_vocabulary().
print(vocabulary_length)

### 학습용 / 평가용 데이터 만들기

In [0]:
# Load the dataset; data.load_data() returns four values, unpacked here as
# the training inputs/labels and the evaluation inputs/labels.
train_input, train_label, eval_input, eval_label = data.load_data()

In [0]:
# Convert the training split into model inputs using the vocabulary:
# encoder input, decoder input and decoder target. The encoder/decoder-input
# helpers each return a second value, stored here as a length
# (presumably per-sequence lengths -- confirm in data.py).
train_input_enc, train_input_enc_length = data.enc_processing(train_input, char2idx)
train_output_dec, train_output_dec_length = data.dec_input_processing(train_label, char2idx)
train_target_dec = data.dec_target_processing(train_label, char2idx)

In [0]:
# Apply the same three preprocessing helpers to the evaluation split so it
# has the same structure as the training inputs.
eval_input_enc, eval_input_enc_length = data.enc_processing(eval_input,char2idx)
eval_output_dec, eval_output_dec_length = data.dec_input_processing(eval_label, char2idx)
eval_target_dec = data.dec_target_processing(eval_label, char2idx)

In [0]:
# Resolve the configured checkpoint directory against the current working
# directory and make sure it exists (no error if it is already there).
check_point_path = os.path.join(os.getcwd(), conf.DEFINES.check_point_path)
os.makedirs(check_point_path, exist_ok=True)

In [0]:
# Build the Estimator around the project's model function.
classifier = tf.estimator.Estimator(
    model_fn=ml.model,  # model function to register
    model_dir=conf.DEFINES.check_point_path,  # where checkpoints are stored
    params=dict(  # hyperparameters handed over to the model function
        hidden_size=conf.DEFINES.hidden_size,  # hidden (weight) size
        layer_size=conf.DEFINES.layer_size,  # number of stacked layers
        learning_rate=conf.DEFINES.learning_rate,  # learning rate
        vocabulary_length=vocabulary_length,  # vocabulary (dictionary) size
        embedding_size=conf.DEFINES.embedding_size,  # embedding size
    ),
)

In [0]:
!rm -rf ./data_out/check_point

### 학습 시작

In [0]:
# Train for the configured number of steps. The input function is wrapped in
# a lambda because the Estimator calls input_fn without arguments.
classifier.train(
    input_fn=lambda: data.train_input_fn(
        train_input_enc,
        train_output_dec,
        train_target_dec,
        conf.DEFINES.batch_size,
    ),
    steps=conf.DEFINES.train_steps,
)

In [0]:
# Evaluate on the held-out split and report the 'accuracy' entry of the
# returned metrics dict.
eval_result = classifier.evaluate(
    input_fn=lambda: data.eval_input_fn(
        eval_input_enc,
        eval_output_dec,
        eval_target_dec,
        conf.DEFINES.batch_size,
    ),
)
print('\nEVAL set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

In [0]:
# Build the inputs for a single test sentence, starting with the encoder input.
predic_input_enc, predic_input_enc_length = data.enc_processing(
    ["남자친구가 너무 운동만 해"],
    char2idx,
)
# This is not training, so there is no real decoder input; an empty string
# is passed only to keep the input structure the same.
predic_output_dec, predic_output_decLength = data.dec_input_processing(
    [""],
    char2idx,
)
# Likewise there is no real decoder target; an empty placeholder is passed
# only to keep the input structure the same.
predic_target_dec = data.dec_target_processing([""], char2idx)

# Run prediction.
predictions = classifier.predict(
    input_fn=lambda: data.eval_input_fn(
        predic_input_enc,
        predic_output_dec,
        predic_target_dec,
        conf.DEFINES.batch_size,
    ),
)

# Convert the predicted values into readable text.
data.pred2string(predictions, idx2char)