In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.4 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 49.0 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0


In [None]:
# Print the working directory of the runtime.
!pwd

/content


In [None]:
# Copy requirements.txt from the Drive folder into the working directory.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; no mount cell is visible here.
!cp '/content/drive/MyDrive/requirements.txt' .

In [None]:
# List the working directory to confirm requirements.txt was copied.
!ls

drive  requirements.txt  sample_data


In [None]:
!pip install -r requirements.txt

Collecting konlpy==0.6.0
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 3.9 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 66.2 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0


In [None]:
# Show the installed konlpy version.
!pip freeze | grep konlpy

konlpy==0.6.0


In [None]:
import tensorflow as tf
import numpy as np
import os
import json

from tensorflow.keras.layers import Layer, Embedding, GRU, Dense

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

import sys
# Make modules stored in the Drive folder importable (preprocess.py is expected there).
sys.path.append("/content/drive/MyDrive/")

# NOTE(review): the star import hides which names come from `preprocess`; none of the
# cells visible here appear to rely on a name that only it could provide — consider
# importing the needed names explicitly.
from preprocess import *

In [None]:
def plot_graphs(history, string):
    """Plot one training metric together with its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (for example the value returned by
            ``model.fit``).
        string: Name of the metric to plot; ``'val_' + string`` must also be
            present in ``history.history``.
    """
    metrics_by_name = history.history
    plt.plot(metrics_by_name[string])

    val_name = 'val_' + string
    # The empty second argument is kept from the original call
    # (presumably a no-op format string — TODO confirm).
    plt.plot(metrics_by_name[val_name], '')

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_name])
    plt.show()

In [None]:
# Directories on the Drive folder; both end with '/' because file names are
# appended by plain string concatenation when the data is loaded.
DATA_IN_PATH = '/content/drive/MyDrive/data_in/'
# NOTE(review): DATA_OUT_PATH is not referenced by any cell visible here.
DATA_OUT_PATH = '/content/drive/MyDrive/data_out/'
# File names inside DATA_IN_PATH.
TRAIN_INPUTS = 'train_inputs.npy'
TRAIN_OUTPUTS = 'train_outputs.npy'
TRAIN_TARGETS = 'train_targets.npy'
DATA_CONFIGS = 'data_configs.json'

In [None]:
# Set TensorFlow's global random seed.
# NOTE(review): only TensorFlow is seeded here; NumPy and Python's `random` are not
# seeded in the visible cells.
SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)

In [None]:
# Load the preprocessed training arrays and the preprocessing config.
# np.load receives the path itself and the JSON file is opened in a `with`
# block, so no file handle is left open after this cell.
index_inputs = np.load(DATA_IN_PATH + TRAIN_INPUTS)
index_outputs = np.load(DATA_IN_PATH + TRAIN_OUTPUTS)
index_targets = np.load(DATA_IN_PATH + TRAIN_TARGETS)
# Explicit encoding so decoding does not depend on the locale default.
with open(DATA_IN_PATH + DATA_CONFIGS, 'r', encoding='utf-8') as config_file:
    prepro_configs = json.load(config_file)

In [None]:
# Sanity check: the three arrays should contain the same number of samples.
print(len(index_inputs),  len(index_outputs), len(index_targets))

20 20 20


In [None]:
# Training hyperparameters.
# NOTE(review): MODEL_NAME and MAX_SEQUENCE are not referenced by any cell visible here.
MODEL_NAME = 'seq2seq_kor'
# Passed to model.fit as batch_size and to the model constructor as batch_sz.
BATCH_SIZE = 2
MAX_SEQUENCE = 25
# Number of epochs passed to model.fit.
EPOCH = 30
# Used as both the encoder and the decoder GRU width when the model is built.
UNITS = 1024
EMBEDDING_DIM = 256
# Fraction of the training data that model.fit holds out for validation.
VALIDATION_SPLIT = 0.1 

# Vocabulary metadata read from the config JSON.
char2idx = prepro_configs['char2idx']
idx2char = prepro_configs['idx2char']
# NOTE(review): despite the *_index names these look like symbol strings rather than
# integer ids — end_index is used as a key into char2idx when the model is built.
# Confirm against the preprocessing code.
std_index = prepro_configs['std_symbol']
end_index = prepro_configs['end_symbol']
vocab_size = prepro_configs['vocab_size']

In [None]:
class Encoder(Layer):
    """Embedding followed by a single GRU.

    The GRU is configured with ``return_sequences=True`` and
    ``return_state=True``, so calling the layer yields both the per-step
    outputs and the final state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding table and the GRU.

        Args:
            vocab_size: Input dimension of the embedding layer.
            embedding_dim: Output dimension of the embedding layer.
            enc_units: Number of GRU units.
            batch_sz: Stored on the instance; no method of this class reads it.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.enc_units = enc_units
        self.batch_sz = batch_sz

        self.embedding = Embedding(self.vocab_size, self.embedding_dim)
        self.gru = GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed ``x`` and run the GRU starting from ``hidden``.

        Args:
            x: Token ids (assumed shape (batch, time) — TODO confirm).
            hidden: Initial GRU state.

        Returns:
            Tuple ``(output, state)`` as produced by the GRU.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self, inp):
        """Return an all-zero state with one row per row of ``inp``."""
        n_rows = tf.shape(inp)[0]
        return tf.zeros((n_rows, self.enc_units))

In [None]:
# Stand-alone encoder instance, used only for the shape check that follows.
encoder = Encoder(vocab_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

In [None]:
# Shape of the encoder inputs (presumably (num_samples, sequence_length)).
index_inputs.shape

(20, 25)

In [None]:
# Zero initial state with one row per input sample and UNITS columns.
enc_hidden = encoder.initialize_hidden_state(index_inputs)
enc_hidden.shape

TensorShape([20, 1024])

Bahdanau Attention

In [None]:
class BahdanauAttention(Layer):
    """Additive attention built from three Dense layers.

    score   = V(tanh(W1(values) + W2(query)))
    weights = softmax(score) over axis 1
    context = sum over axis 1 of weights * values
    """

    def __init__(self, units):
        """Create the projection layers.

        Args:
            units: Output width of the two projections ``W1`` and ``W2``.
        """
        super().__init__()

        self.W1 = Dense(units)  # projects `values`
        self.W2 = Dense(units)  # projects the query
        self.V = Dense(1)       # reduces each position to a single score

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: Tensor that gets an extra axis inserted at position 1
                (assumed shape (batch, units) — TODO confirm).
            values: Tensor attended over along axis 1
                (assumed shape (batch, time, units) — TODO confirm).

        Returns:
            Tuple ``(context_vector, attention_weights)``.
        """
        # Insert an axis at position 1 so the query broadcasts against `values`.
        query_with_time_axis = tf.expand_dims(query, 1)

        # Sum the two projections and pass them through tanh before scoring.
        projected_values = self.W1(values)
        projected_query = self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected_values + projected_query))

        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weight the values and sum over axis 1 to get the context vector.
        weighted_values = attention_weights * values
        context_vector = tf.reduce_sum(weighted_values, axis=1)

        return context_vector, attention_weights


디코더(seq2seq 교안 p.15)

In [None]:
class Decoder(Layer):
    """Attention-based GRU decoder.

    Each call attends over the encoder output, concatenates the resulting
    context vector with the embedded decoder input, runs the GRU and projects
    the GRU output to ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the embedding, GRU, output projection and attention layers.

        Args:
            vocab_size: Embedding input dimension and width of the output layer.
            embedding_dim: Output dimension of the embedding layer.
            dec_units: Number of GRU units; also the attention projection width.
            batch_sz: Stored on the instance; no method of this class reads it.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dec_units = dec_units
        self.batch_sz = batch_sz

        self.embedding = Embedding(self.vocab_size, self.embedding_dim)
        # The GRU returns both its per-step outputs and its final state.
        self.gru = GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

        # Projects the GRU output to one score per vocabulary entry; the
        # softmax / cross-entropy is applied outside this layer.
        self.fc = Dense(self.vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run the decoder once.

        Args:
            x: Decoder input fed to the embedding layer (the caller in this
                file passes one time step at a time).
            hidden: Query for the attention layer.
            enc_output: Encoder outputs that the attention layer attends over.

        Returns:
            Tuple ``(scores, state, attention_weights)``: the projected GRU
            output, the GRU state and the attention weights.
        """
        context, attention_weights = self.attention(hidden, enc_output)

        embedded = self.embedding(x)

        # Attention is applied before the GRU: the context vector gets an axis
        # inserted at position 1 and is concatenated with the embedded input on
        # the last axis. (The course material applies attention after the GRU.)
        gru_input = tf.concat([tf.expand_dims(context, 1), embedded], axis=-1)

        # The GRU is called without an explicit initial state, so `hidden`
        # enters this layer only through the attention above.
        gru_output, state = self.gru(gru_input)
        # Collapse the leading axes so every row is one GRU output vector.
        flat_output = tf.reshape(gru_output, (-1, gru_output.shape[2]))

        scores = self.fc(flat_output)

        return scores, state, attention_weights

In [None]:
optimizer = tf.keras.optimizers.Adam()

# CategoricalCrossentropy: multi-class classification with one-hot labels.
# SparseCategoricalCrossentropy: multi-class classification with integer labels;
# with from_logits=True the softmax is applied inside the loss.
# reduction='none' keeps the unreduced loss values so that loss() can zero out
# padded positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# Accuracy metric object for integer labels.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# real: target, pred: model 결과
def loss(real, pred):
    """Cross-entropy with padded positions zeroed out.

    Args:
        real: Target token ids; id 0 is treated as padding.
        pred: Model output passed to ``loss_object`` together with ``real``.

    Returns:
        Mean of the masked loss values. The mean runs over every position,
        padded ones included (they contribute 0).
    """
    per_position = loss_object(real, pred)
    # 1 where the target is a real token, 0 where it is padding (id 0).
    keep = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), dtype=per_position.dtype)
    masked = per_position * keep
    return tf.reduce_mean(masked)

def accuracy(real, pred):
    """Token-level accuracy over non-padding positions.

    Args:
        real: Target token ids; id 0 is treated as padding and excluded.
        pred: Model scores with the vocabulary on the last axis.

    Returns:
        Scalar tensor: correctly predicted non-padding tokens divided by the
        number of non-padding tokens in this call (0 when there are none).
    """
    # Computed statelessly on purpose. Zeroing the scores of padded positions
    # and feeding them to a shared metric object makes every padded position
    # count as correct (the argmax of an all-zero row is 0, the padding id) and
    # lets the running total carry over between batches, epochs and validation.
    not_padding = tf.math.not_equal(real, 0)
    predicted_ids = tf.argmax(pred, axis=-1)
    is_correct = tf.equal(tf.cast(real, predicted_ids.dtype), predicted_ids)

    hits = tf.cast(tf.logical_and(is_correct, not_padding), tf.float32)
    n_tokens = tf.cast(not_padding, tf.float32)

    # divide_no_nan returns 0 instead of NaN when there are no real tokens.
    return tf.math.divide_no_nan(tf.reduce_sum(hits), tf.reduce_sum(n_tokens))

In [None]:
# seq2seq model: encoder, decoder 두개 갖고 있는 모델
class seq2seq(tf.keras.Model):
    """Sequence-to-sequence model made of one Encoder and one Decoder."""

    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, batch_sz, end_token_idx=2):
        """Build the encoder and the decoder.

        Args:
            vocab_size: Vocabulary size shared by encoder and decoder (a single
                language is used; give them separate sizes if that changes).
            embedding_dim: Embedding width for both sub-layers.
            enc_units: Number of encoder GRU units.
            dec_units: Number of decoder GRU units.
            batch_sz: Forwarded to both sub-layers.
            end_token_idx: Stored on the instance; not read by ``call``.
        """
        super().__init__()
        self.end_token_idx = end_token_idx
        self.encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_sz)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_sz)

    def call(self, x):
        """Encode the source and decode with teacher forcing.

        Args:
            x: Pair ``(enc_inputs, dec_inputs)``.

        Returns:
            Decoder scores for every step, stacked along axis 1.
        """
        enc_inputs, dec_inputs = x

        # Run the encoder from an all-zero state; it returns its per-step
        # outputs and its final state.
        zero_state = self.encoder.initialize_hidden_state(enc_inputs)
        enc_output, enc_state = self.encoder(enc_inputs, zero_state)

        # The encoder's final state seeds the decoder's hidden state.
        dec_hidden = enc_state

        step_scores = []
        n_steps = dec_inputs.shape[1]
        # One decoder call per position of the decoder input.
        for step in range(n_steps):
            # Teacher forcing: feed the ground-truth token of this step, kept
            # as a column so the decoder sees a length-1 sequence.
            token_column = tf.expand_dims(dec_inputs[:, step], 1)
            dec_input = tf.dtypes.cast(token_column, tf.float32)
            # The decoder returns three values; the attention weights are
            # unused here. dec_hidden is replaced by the state it returns.
            scores, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            step_scores.append(tf.dtypes.cast(scores, tf.float32))

        return tf.stack(step_scores, axis=1)

In [None]:
# Build the model; the same UNITS value is used for the encoder and the decoder,
# and the end-of-sequence id is looked up in char2idx.
model = seq2seq(vocab_size, EMBEDDING_DIM, UNITS, UNITS, BATCH_SIZE, char2idx[end_index])
# `metrics` is given in its documented list form.
model.compile(loss=loss, optimizer=optimizer, metrics=[accuracy])

In [None]:
# Train the model. The encoder and decoder inputs are passed together as the
# model input, with index_targets as the labels.
history = model.fit(
    x=[index_inputs, index_outputs],
    y=index_targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCH,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
