# 1. Import libraries

In [None]:
try:
  # Colab only: the `%tensorflow_version` notebook magic selects TensorFlow 2.x.
  # NOTE(review): any failure of the magic is swallowed by the handler below,
  # presumably so the cell also runs where the magic is unavailable - confirm.
  %tensorflow_version 2.x
except Exception:
  pass

In [None]:
!pip install konlpy

In [None]:
!pip install gensim

In [None]:
# If running on Colab: clone the project repository and switch the working
# directory into it (presumably where `vectorizer.py` and `train_intent.csv`
# live - confirm), so the relative import and CSV path used below resolve.
!git clone https://github.com/hukim1112/comment_classifier.git
import os
os.chdir('/content/comment_classifier')

In [None]:
import tensorflow as tf
from konlpy.tag import Twitter
from collections import Counter
import pandas as pd
import numpy as np
import gensim
from matplotlib import pyplot as plt

keras = tf.keras
t = Twitter()

# 2. fit tokenizer to our datasets

In [None]:
from vectorizer import BaseVectorizer
tokenizer = BaseVectorizer(t.morphs)

In [None]:
df = pd.read_csv('train_intent.csv')

In [None]:
df[500:520]

In [None]:
tokenizer.fit(df['question'].values)

In [None]:
tokenizer.vocabulary_

# 3. data preprocessing

In [None]:
# Two-way lookup between intent names and integer class ids; ids follow the
# order in which each intent first appears in the data.
id_to_label = dict(enumerate(df.intent.unique()))
label_to_id = {label: idx for idx, label in id_to_label.items()}

In [None]:
print(label_to_id)

In [None]:
# Not needed: tokenize_and_filter maps intent labels to ids via label_to_id.
# df.intent = df.intent.map(label_to_id)
# print(df.head(10))

In [None]:
MAX_LENGTH = 40  # maximum number of tokens kept per sentence
def tokenize_and_filter(sentences, labels):
  """Encode sentences to padded token-id rows and labels to class ids.

  Each sentence is encoded with ``tokenizer.encode_a_doc_to_list``. Pairs whose
  encoded sentence is longer than ``MAX_LENGTH`` tokens are dropped, so the
  result may contain fewer rows than were passed in.

  Args:
    sentences: iterable of raw question strings.
    labels: iterable of intent names, aligned with ``sentences``.

  Returns:
    A tuple ``(padded_inputs, outputs)``: the post-padded id array of width
    ``MAX_LENGTH`` and a list of class ids looked up in ``label_to_id``.

  Raises:
    KeyError: if a kept label is missing from ``label_to_id``.
  """
  kept_token_ids, kept_label_ids = [], []

  for text, intent in zip(sentences, labels):
    token_ids = tokenizer.encode_a_doc_to_list(text)
    # Skip over-long sentences instead of truncating them.
    if len(token_ids) > MAX_LENGTH:
      continue
    kept_token_ids.append(token_ids)
    kept_label_ids.append(label_to_id[intent])

  # Right-pad every row to MAX_LENGTH with the vocabulary's padding id.
  pad_id = tokenizer.vocabulary_['_PAD_']
  padded = tf.keras.preprocessing.sequence.pad_sequences(
      kept_token_ids, maxlen=MAX_LENGTH, padding='post', value=pad_id)

  return padded, kept_label_ids

In [None]:
inputs, outputs = tokenize_and_filter(df.question, df.intent)

In [None]:
print('encoded input : ', inputs[0], 'label : ', outputs[0], 'original input sentence : ', tokenizer.decode_from_list(inputs[0]))

In [None]:
BATCH_SIZE = 16
# Size the shuffle buffer from the data so each epoch is a full uniform
# shuffle regardless of how many rows survived filtering; shuffle() needs a
# buffer of at least 1.
BUFFER_SIZE = max(len(inputs), 1)

# Training pipeline over (padded token ids, intent class id) pairs.
dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
# Overlap input preparation with training.
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
for x, y in dataset.take(1):
    print(x, y)
    print('-----------------------------------------------')
    print(x.shape, y.shape)

# 4. model design

In [None]:
print(len(label_to_id.values()))

In [None]:
def get_model():
    """Build the intent classifier: embedding -> BiLSTM -> dense -> softmax.

    The embedding input size is read from ``tokenizer.n_vocabs`` and the
    number of output classes from ``label_to_id`` at call time, so the model
    reflects whatever vocabulary and label set are current when it is built.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    num_classes = len(label_to_id)
    stack = [
        layers.Embedding(tokenizer.n_vocabs, 64),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]
    return tf.keras.Sequential(stack)

In [None]:
model = get_model()

In [None]:
LEARNING_RATE = 0.0001

In [None]:
# Targets are integer class ids (not one-hot vectors), hence the sparse
# categorical loss and accuracy.
model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])

In [None]:
model.fit(dataset, epochs=10)

In [None]:
def question_processing(sentences):
    """Encode raw questions into a padded id array ready for ``predict``.

    Each sentence is encoded with ``tokenizer.encode_a_doc_to_list``. A
    sentence longer than ``MAX_LENGTH`` tokens is skipped after printing a
    notice, so the returned array can have fewer rows than ``sentences`` and
    row positions then no longer line up with the input list.

    Args:
        sentences: iterable of question strings.

    Returns:
        Array of post-padded token ids with ``MAX_LENGTH`` columns.
    """
    encoded = []
    for text in sentences:
        token_ids = tokenizer.encode_a_doc_to_list(text)
        if len(token_ids) > MAX_LENGTH:
            # Tell the user the input was too long and leave it out.
            print('입력이 너무 길어요.')
            continue
        encoded.append(token_ids)

    # Right-pad every row to MAX_LENGTH with the vocabulary's padding id.
    pad_id = tokenizer.vocabulary_['_PAD_']
    return tf.keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=MAX_LENGTH, padding='post', value=pad_id)

In [None]:
input_sentence = question_processing(['서울 날씨 어때?', 
                                      '나는 전주 날씨 궁금함',
                                      '안중근 의사는 누구야?',
                                      '이순신 장군은 어떤 분이니?',
                                      '명동 맛있는 음식점 있니?'
                                     ])

In [None]:
model.predict(input_sentence)

In [None]:
prediction = np.argmax(model.predict(input_sentence), axis=1)
print(prediction)

In [None]:
for p in prediction:
    print(id_to_label[p])

In [None]:
del model

# 데이터 추가해보기

In [None]:
names = ['안중근', '이순신', '세종대왕', '김광석', '아이유', '에미넴', '이건희', '고아라', '유재석', '한석희', '최민성']
def question_generator(names):
    """Generate three templated "who is this person" questions per name.

    Every name gets the same three sentence endings appended, in a fixed
    order, regardless of which Korean particle would be grammatical for it.

    Args:
        names: iterable of name strings.

    Returns:
        A flat list with three questions for each name, in input order.
    """
    questions = []
    for name in names:
        # extend() grows the list in place instead of rebuilding it with `+`
        # on every iteration.
        questions.extend((
            name + '는 어떤 분이야?',
            name + '은 어떤 사람이니?',
            name + '이란 사람에 대해 궁금해',
        ))
    return questions
question = question_generator(names)

In [None]:
question

In [None]:
new_data = {'question' : question, 'intent' : ['인물']*len(question)}
add_df = pd.DataFrame(new_data, columns=('question', 'intent'))

In [None]:
add_df.head(5)

In [None]:
print(len(df), len(add_df))

In [None]:
# ignore_index=True renumbers the rows 0..n-1; without it the appended rows
# keep their own 0-based labels, which duplicate labels already present in df.
new_df = pd.concat([df, add_df], ignore_index=True)
print(len(new_df))

In [None]:
tokenizer.fit(new_df['question'].values)

In [None]:
# Rebuild the label maps from the extended data so an intent that only occurs
# in the added rows also gets an id. unique() keeps first-appearance order and
# the original rows come first in new_df, so existing ids are unchanged.
label_to_id = {label: idx for idx, label in enumerate(new_df.intent.unique())}
id_to_label = {idx: label for label, idx in label_to_id.items()}

new_inputs, new_outputs = tokenize_and_filter(new_df.question, new_df.intent)

BATCH_SIZE = 16
# Size the shuffle buffer from the extended data; a buffer smaller than the
# dataset would only shuffle within a sliding window. shuffle() needs >= 1.
BUFFER_SIZE = max(len(new_inputs), 1)

# Training pipeline over (padded token ids, intent class id) pairs.
dataset = tf.data.Dataset.from_tensor_slices((new_inputs, new_outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
# Overlap input preparation with training.
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Train a fresh classifier on the extended dataset, using the same learning
# rate, loss, metric and epoch count as the first model.
new_model = get_model()
LEARNING_RATE = 0.0001
# Targets are integer class ids, hence the sparse categorical loss/metric.
new_model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])
new_model.fit(dataset, epochs=10)

In [None]:
input_sentence = question_processing(['서울 날씨 어때?', 
                                      '나는 전주 날씨 궁금함',
                                      '안중근 의사는 누구야?',
                                      '박소희는 어떤 사람인지 궁금해.',
                                      '명동 맛있는 음식점 있니?'
                                     ])

In [None]:
new_model.predict(input_sentence)

In [None]:
prediction = np.argmax(new_model.predict(input_sentence), axis=1)

In [None]:
for p in prediction:
    print(id_to_label[p])