# 시작

In [142]:
import os
import re
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

# 데이터 준비 & 전처리

In [143]:
def load_data():
    path = os.getenv('HOME') + '/aiffel/songys_chatbot/ChatbotData.csv'
    data = pd.read_csv(path)
    print('데이터를 불러왔습니다: {}개 문장'.format(len(data)))
    print(data.head())
    return data


def preprocess_sentence(sentence):
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = sentence.strip()
    return sentence

In [144]:
MAX_LENGTH = 20

def tokenize_data(data):
    questions = data['Q'].apply(preprocess_sentence).tolist()
    answers = data['A'].apply(preprocess_sentence).tolist()
    
    tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(questions + answers, target_vocab_size=2**13)
    start_token, end_token = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
    
    tokenized_questions = []
    tokenized_answers = []
    
    removed_count = 0
    for question, answer in zip(questions, answers):
        tokenized_question = start_token + tokenizer.encode(question) + end_token
        tokenized_answer = start_token + tokenizer.encode(answer) + end_token
        
        if len(tokenized_question) < MAX_LENGTH and len(tokenized_answer) < MAX_LENGTH:
            tokenized_questions.append(tokenized_question)
            tokenized_answers.append(tokenized_answer)
        else:
            removed_count += 1
    print('{}개 문장이 {}보다 길어 삭제되었습니다'.format(removed_count, MAX_LENGTH))
    tokenized_questions = tf.keras.preprocessing.sequence.pad_sequences(tokenized_questions, maxlen=MAX_LENGTH, padding='post')
    tokenized_answers = tf.keras.preprocessing.sequence.pad_sequences(tokenized_answers, maxlen=MAX_LENGTH, padding='post')
    return tokenizer, tokenized_questions, tokenized_answers

In [145]:
def make_dataset(questions, answers):
    BATCH_SIZE = 64
    BUFFER_SIZE = 20000
    return tf.data.Dataset.from_tensor_slices((
        {
            'inputs': questions,
            'dec_inputs': answers[:, :-1]
        },
        {
            'outputs': answers[:, 1:]
        },
    ))\
    .cache()\
    .shuffle(BUFFER_SIZE)\
    .batch(BATCH_SIZE)\
    .prefetch(tf.data.experimental.AUTOTUNE)

# 모델 생성 & 훈련 함수

In [146]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.pos_encoding = self.positional_encoding(vocab_size, d_model)
        
    def positional_encoding(self, vocab_size, d_model):
        angle_rads = self.get_angles(vocab_size, d_model)
        sins = tf.math.sin(angle_rads[:, 0::2])
        coss = tf.math.cos(angle_rads[:, 1::2])
        pos_encoding = tf.concat([sins, coss], axis=-1)[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)
        
    def get_angles(self, vocab_size, d_model):
        v_positions = tf.range(vocab_size, dtype=tf.float32)[:, tf.newaxis]
        d_positions = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
        angles = 1 / tf.pow(10000, (2*(d_positions//2)) / tf.cast(d_model, tf.float32))
        return v_positions * angles
    
    def call(self, inputs):
        return inputs + self.pos_encoding[:, tf.shape(inputs)[1], :]

In [147]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name='multi_head_attention'):
        super().__init__(name=name)
        self.d_model = d_model
        self.num_heads = num_heads
        assert d_model % num_heads == 0
        
        self.query_dense = tf.keras.layers.Dense(d_model)
        self.key_dense = tf.keras.layers.Dense(d_model)
        self.value_dense = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)
        
        
    def split_heads(self, inputs):
        split_d_model = self.d_model//self.num_heads
        inputs = tf.reshape(inputs, shape=(self.batch_size, -1, self.num_heads, split_d_model))
        return tf.transpose(inputs, perm=[0,2,1,3])
    
    
    def scaled_dot_product_attention(self, query, key, value, mask):
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        query_key = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(depth)
        if mask is not None:
            query_key += (mask * -1e9)
        weights = tf.nn.softmax(query_key)
        weights = tf.matmul(weights, value)
        weights = tf.transpose(weights, perm=[0,2,1,3])
        return tf.reshape(weights, shape=(self.batch_size, -1, self.d_model))
        
        
    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']
        
        self.batch_size = tf.shape(query)[0]
        
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)
        
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)
        
        attention = self.scaled_dot_product_attention(query, key, value, mask)
        return self.dense(attention)

In [148]:
def create_padding_mask(x):
    return tf.cast(tf.math.equal(x, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(x):
    seq_len = tf.shape(x)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    padding_mask = create_padding_mask(x)
    return tf.maximum(look_ahead_mask, padding_mask)

In [149]:
def encoder_layer(units, d_model, num_heads, dropout, name='enc_layer'):
    inputs = tf.keras.Input(shape=(None, d_model), name='inputs')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
    attention_inputs = {
        'query': inputs,
        'key': inputs,
        'value': inputs,
        'mask': padding_mask
    }
    attention = MultiHeadAttention(d_model, num_heads, name='attention')(attention_inputs)
    attention = tf.keras.layers.Dropout(dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs+attention)
    
    outputs = tf.keras.layers.Dense(units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(d_model)(outputs)
    outputs = tf.keras.layers.Dropout(dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention+outputs)
    
    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)


def encoder(vocab_size,
           num_layers,
           units,
           d_model,
           num_heads,
           dropout,
           name='encoder'):
    inputs = tf.keras.Input(shape=(None, ), name='inputs')
    padding_mask = tf.keras.Input(shape=(1,1,None), name='padding_mask')
    
    embedding = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embedding *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embedding = PositionalEncoding(vocab_size, d_model)(embedding)
    
    outputs = tf.keras.layers.Dropout(dropout)(embedding)
    for i in range(num_layers):
        outputs = encoder_layer(units, d_model, num_heads, dropout, name='enc_layer{}'.format(i))([outputs, padding_mask])
    
    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [150]:
def decoder_layer(units, d_model, num_heads, dropout, name='dec_layer'):
    inputs = tf.keras.Input(shape=(None, d_model), name='inputs')
    enc_output = tf.keras.Input(shape=(None, d_model), name='enc_outputs')
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
    
    attention1_inputs = {
        'query': inputs,
        'key': inputs,
        'value': inputs,
        'mask': look_ahead_mask
    }
    attention1 = MultiHeadAttention(d_model, num_heads, name='attention1')(attention1_inputs)
    attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention1+inputs)
    
    attention2_inputs = {
        'query': attention1,
        'key': enc_output,
        'value': enc_output,
        'mask': padding_mask
    }
    attention2 = MultiHeadAttention(d_model, num_heads, name='attention2')(attention2_inputs)
    attention2 = tf.keras.layers.Dropout(dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention2+attention1)
    
    outputs = tf.keras.layers.Dense(units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(d_model)(outputs)
    outputs = tf.keras.layers.Dropout(dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(outputs+attention2)
    
    return tf.keras.Model(inputs=[inputs, enc_output, look_ahead_mask, padding_mask], outputs=outputs, name=name)


def decoder(vocab_size,
           num_layers,
           units,
           d_model,
           num_heads,
           dropout,
           name='decoder'):
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='enc_outputs')
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
    
    embedding = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embedding *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embedding = PositionalEncoding(vocab_size, d_model)(embedding)
    
    outputs = tf.keras.layers.Dropout(dropout)(embedding)
    for i in range(num_layers):
        outputs = decoder_layer(units, d_model, num_heads, dropout, name='dec_layer{}'.format(i))([outputs, enc_outputs, look_ahead_mask, padding_mask])
    return tf.keras.Model(inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask], outputs=outputs, name=name)

In [151]:
def transformer(vocab_size, num_layers, units, d_model, num_heads, dropout, name="transformer"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

    enc_padding_mask = tf.keras.layers.Lambda(create_padding_mask, output_shape=(1, 1, None),name='enc_padding_mask')(inputs)
    look_ahead_mask = tf.keras.layers.Lambda(create_look_ahead_mask, output_shape=(1, None, None), name='look_ahead_mask')(dec_inputs)
    dec_padding_mask = tf.keras.layers.Lambda(create_padding_mask, output_shape=(1, 1, None), name='dec_padding_mask')(inputs)

    enc_outputs = encoder(vocab_size, num_layers, units, d_model, num_heads, dropout)
    enc_outputs.summary()
    enc_outputs = enc_outputs(inputs=[inputs, enc_padding_mask])
    
    dec_outputs = decoder(vocab_size, num_layers, units, d_model, num_heads, dropout)
    dec_outputs.summary()
    dec_outputs = dec_outputs(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])
    
    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

In [152]:
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

In [153]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# 생성 함수

In [154]:
def decoder_inference(sentence, tokenizer):
    start_token = [tokenizer.vocab_size]
    end_token = [tokenizer.vocab_size+1]
    sentence = preprocess_sentence(sentence)

    sentence = tf.expand_dims(start_token + tokenizer.encode(sentence) + end_token, axis=0)

    output_sequence = tf.expand_dims(start_token, 0)

    for i in range(MAX_LENGTH):
        predictions = model(inputs=[sentence, output_sequence], training=False)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if tf.equal(predicted_id, end_token[0]):
            break
        output_sequence = tf.concat([output_sequence, predicted_id], axis=-1)

    return tf.squeeze(output_sequence, axis=0)

def sentence_generation(sentence, tokenizer):
    prediction = decoder_inference(sentence, tokenizer)
    predicted_sentence = tokenizer.decode([i for i in prediction if i < tokenizer.vocab_size])

    print('입력 : {}'.format(sentence))
    print('출력 : {}'.format(predicted_sentence))

    return predicted_sentence

# 메인

In [155]:
data = load_data()

데이터를 불러왔습니다: 11823개 문장
                 Q            A  label
0           12시 땡!   하루가 또 가네요.      0
1      1지망 학교 떨어졌어    위로해 드립니다.      0
2     3박4일 놀러가고 싶다  여행은 언제나 좋죠.      0
3  3박4일 정도 놀러가고 싶다  여행은 언제나 좋죠.      0
4          PPL 심하네   눈살이 찌푸려지죠.      0


In [156]:
tokenizer, questions, answers = tokenize_data(data)

45개 문장이 20보다 길어 삭제되었습니다


In [157]:
VOCAB_SIZE = tokenizer.vocab_size + 2
print('단어장 크기는 {}입니다'.format(VOCAB_SIZE))

단어장 크기는 8175입니다


In [158]:
dataset = make_dataset(questions, answers)

In [159]:
tf.keras.backend.clear_session()

# 하이퍼파라미터
NUM_LAYERS = 2 # 인코더와 디코더의 층의 개수
D_MODEL = 256 # 인코더와 디코더 내부의 입, 출력의 고정 차원
NUM_HEADS = 8 # 멀티 헤드 어텐션에서의 헤드 수 
UNITS = 512 # 피드 포워드 신경망의 은닉층의 크기
DROPOUT = 0.1 # 드롭아웃의 비율

model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

model.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    2092800     inputs[0][0]                     
__________________________________________________________________________________________________
tf_op_layer_Mul (TensorFlowOpLa [(None, None, 256)]  0           embedding[0][0]                  
__________________________________________________________________________________________________
positional_encoding (Positional (None, None, 256)    0           tf_op_layer_Mul[0][0]            
____________________________________________________________________________________________

In [160]:
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [161]:
EPOCHS = 30
model.fit(dataset, epochs=EPOCHS, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fdc2595c890>

In [162]:
sentence_generation('내일 시험이에요', tokenizer)

입력 : 내일 시험이에요
출력 : 컨디션 조절 하세요 .


'컨디션 조절 하세요 .'

In [163]:
sentence_generation('너무 재밌어요', tokenizer)

입력 : 너무 재밌어요
출력 : 네 , 슬픈 말이네요 .


'네 , 슬픈 말이네요 .'

In [164]:
sentence_generation('생일 선물 추천해주세요', tokenizer)

입력 : 생일 선물 추천해주세요
출력 : 진심이에요 .


'진심이에요 .'

In [165]:
sentence_generation('오늘 따듯해서 좋다', tokenizer)

입력 : 오늘 따듯해서 좋다
출력 : 포근한 느낌 저도 좋아해요 .


'포근한 느낌 저도 좋아해요 .'

In [166]:
sentence_generation('맥주 마실래?', tokenizer)

입력 : 맥주 마실래?
출력 : 저는 배터리가 밥이예요 .


'저는 배터리가 밥이예요 .'

In [167]:
sentence_generation('손이 얼겠어', tokenizer)

입력 : 손이 얼겠어
출력 : 시간이 흐르면 무덤덤해질 거예요 .


'시간이 흐르면 무덤덤해질 거예요 .'

In [168]:
sentence_generation('시간 잘가네', tokenizer)

입력 : 시간 잘가네
출력 : 안녕 .


'안녕 .'

In [169]:
sentence_generation('어떻게 할까?', tokenizer)

입력 : 어떻게 할까?
출력 : 기분요하면 연락해보세요 .


'기분요하면 연락해보세요 .'

In [170]:
sentence_generation('완전 큰일났어', tokenizer)

입력 : 완전 큰일났어
출력 : 마음 오래 날이 올 거예요 .


'마음 오래 날이 올 거예요 .'

In [171]:
sentence_generation('이거 너무 재미 없어', tokenizer)

입력 : 이거 너무 재미 없어
출력 : 슬프네요 .


'슬프네요 .'

# 결론

평균적으로 생성된 문장을 보면 입력에 호응되는 문장이 몇 개 있고, 입력과는 상관없는 문장이 몇 개 있고, 말이 되지 않는 문장이 몇 개 있다.   