### 문장 등장하는 단어의 빈도수를 계산해서 문장 분류하기

[원본](https://blog.eduonix.com/internet-of-things/simple-nlp-based-chatbot-python/)

In [1]:
from keras.models import Sequential
from keras.losses import categorical_crossentropy
from keras.optimizers import SGD
from keras.layers import Dense

from numpy import argmax
import numpy as np
import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
X = ['Hi',
     'Hello',
     'How are you?',
     'I am studying',
     'studying',
     'see you later',
     'bye',
     'goodbye']

In [3]:
Y = ['greeting',
     'greeting',
     'greeting',
     'studying',
     'studying',
     'bye',
     'bye',
     'bye']

In [4]:
# 문자, 숫자를 제외한 문장을 반환하는 함수
def remove_non_alpha_numeric_characters(sentence):
    new_sentence = ''
    for alphabet in sentence:
        if alphabet.isalpha() or alphabet == ' ':
            new_sentence += alphabet
    return new_sentence

In [5]:
# 전처리
# 소문자, 단어 앞뒤 공백제거 
def preprocess_data(X):
    X = [data_point.lower() for data_point in X]
    X = [remove_non_alpha_numeric_characters(
        sentence) for sentence in X]
    X = [data_point.strip() for data_point in X]
    X = [re.sub(' +', ' ',
                data_point) for data_point in X]
    return X

In [6]:
# 단어사전 생성
X = preprocess_data(X)

vocabulary = set()
for data_point in X:
    for word in data_point.split(' '):
        vocabulary.add(word)

vocabulary = list(vocabulary)

In [7]:
# 인코딩
X_encoded = []

def encode_sentence(sentence):
    sentence = preprocess_data([sentence])[0]
    sentence_encoded = [0] * len(vocabulary)
    for i in range(len(vocabulary)):
        if vocabulary[i] in sentence.split(' '):
            sentence_encoded[i] = 1
    return sentence_encoded

X_encoded = [encode_sentence(sentence) for sentence in X]

In [8]:
# intent 인코팅
classes = list(set(Y))

Y_encoded = []
for data_point in Y:
    data_point_encoded = [0] * len(classes)
    for i in range(len(classes)):
        if classes[i] == data_point:
            data_point_encoded[i] = 1
    Y_encoded.append(data_point_encoded)

In [9]:
# 데이터 분할, 학습 데이터, 테스트 데이터
X_train = np.array(X_encoded)
y_train = np.array(Y_encoded)
X_test = np.array(X_encoded)
y_test = np.array(Y_encoded)

In [11]:
X_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [13]:
model = Sequential()
model.add(Dense(units=64, activation='sigmoid',
                input_dim=len(X_train[0])))
model.add(Dense(units=len(y_train[0]), activation='softmax'))
model.compile(loss=categorical_crossentropy,
              optimizer=SGD(lr=0.01,
                            momentum=0.9, nesterov=True))
model.fit(X_train, y_train, epochs=100, batch_size=16)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x20803f4cc88>

In [14]:
predictions = [argmax(pred) for pred in model.predict(X_test)]

In [17]:
correct = 0
for i in range(len(predictions)):
    if predictions[i] == argmax(y_test[i]):
        correct += 1

print("Correct:", correct)
print("Total:", len(predictions))

Correct: 8
Total: 8


In [None]:
while True:
    print("Enter a sentence")
    # python 2.7
    # sentence = raw_input() 
    sentence = input()
    prediction= model.predict(np.array([encode_sentence(sentence)]))
    print(classes[argmax(prediction)]) 

Enter a sentence
hi
greeting
Enter a sentence
hello
greeting
Enter a sentence
demo
bye
Enter a sentence
i am study
greeting
Enter a sentence
i am studying
studying
Enter a sentence
bye
bye
Enter a sentence
