In [1]:
# coding: utf-8
from eunjeon import Mecab
from gensim.models import word2vec
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.utils import np_utils
print("load done")



load done


# Parameters

In [2]:
vector_size = 50
encode_length = 4
label_size = 3
MAX_NB_WORDS = 0
MAX_SEQUENCE_LENGTH = 0

tokenizer = Tokenizer()

# Choose multi test
filter_type = "multi"
filter_sizes = [2,3,4,2,3,4,2,3,4]
fully_connected_layers = [1024, 1024]
dropout_p = 0.1
optimizer = "adam"
loss = "categorical_crossentropy"
num_filters = len(filter_sizes)

train_data_list =  {
                'encode' : ['판교에 오늘 피자 주문해줘','오늘 날짜에 호텔 예약 해줄레','모래 날짜에 판교 여행 정보 알려줘'],
                'decode' : ['0','1','2']
             }
train_data_list.get('encode')

['판교에 오늘 피자 주문해줘', '오늘 날짜에 호텔 예약 해줄레', '모래 날짜에 판교 여행 정보 알려줘']

# Vector model

In [3]:
def train_vector_model(str_buf):

    mecab = Mecab()
    str_buf = train_data_list['encode']
    pos1 = mecab.pos(''.join(str_buf))
    pos2 = ' '.join(list(map(lambda x : '\n' if x[1] in ['SF'] else x[0], pos1))).split('\n')
    morphs = list(map(lambda x : mecab.morphs(x) , pos2))
    print(str_buf)
    model = word2vec.Word2Vec(size=vector_size, window=2, min_count=1)
    model.build_vocab(morphs)
    model.train(morphs, epochs=model.epochs, total_examples=model.corpus_count)
    return model

model = train_vector_model(train_data_list)
print(model)

['판교에 오늘 피자 주문해줘', '오늘 날짜에 호텔 예약 해줄레', '모래 날짜에 판교 여행 정보 알려줘']
Word2Vec(vocab=15, size=50, alpha=0.025)


# Test data Embedding

In [4]:
def embed(data) :
    mecab = Mecab()
    inputs = []
    labels = []

    for encode_raw in data['encode'] :
        encode_raw = mecab.morphs(encode_raw)
        tokenizer.fit_on_texts(encode_raw)
        inputs.append(encode_raw)        

    x_train = tokenizer.texts_to_sequences(data['encode'])

    for decode_raw in data['decode']:
        labels.append(decode_raw)

    y_train = np_utils.to_categorical(labels, label_size)
    return x_train, y_train

x_train, y_train = embed(train_data_list)
print(x_train)
print(y_train)

MAX_NB_WORDS = len(tokenizer.word_index) + 1
MAX_SEQUENCE_LENGTH = max([len(seq) for seq in x_train])

print(MAX_NB_WORDS, MAX_SEQUENCE_LENGTH)

[[3, 6], [3, 9, 10], [12, 2, 13, 14, 15]]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
16 5


# import keras modules

In [5]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, Concatenate
from tensorflow.python.keras.layers import Convolution1D
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import AlphaDropout
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.callbacks import TensorBoard

x_train = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)

print(x_train)

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), name='sent_input', dtype='int64')
x = Embedding(vector_size, MAX_NB_WORDS, input_length=MAX_SEQUENCE_LENGTH)(inputs)

[[ 0  0  0  3  6]
 [ 0  0  3  9 10]
 [12  2 13 14 15]]


# Model design

In [6]:
convolution_output = []
inindex = 0
for filter_width in filter_sizes:
    conv = Convolution1D(filters=256,
                         kernel_size=filter_width,
                         activation='relu',
                         name='Conv1D_{}_{}_{}'.format(256, filter_width,inindex))(x)
    pool = GlobalMaxPooling1D(name='MaxPoolingOverTime_{}_{}_{}'.format(256, filter_width,inindex))(conv)
    convolution_output.append(pool)
    inindex = inindex+1

x = Concatenate()(convolution_output)
for fl in fully_connected_layers:
    x = Dense(fl, activation='relu', kernel_initializer='lecun_normal')(x)
    x = AlphaDropout(dropout_p)(x)

predictions = Dense(label_size, activation='softmax')(x)
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sent_input (InputLayer)         (None, 5)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 5, 16)        800         sent_input[0][0]                 
__________________________________________________________________________________________________
Conv1D_256_2_0 (Conv1D)         (None, 4, 256)       8448        embedding[0][0]                  
__________________________________________________________________________________________________
Conv1D_256_3_1 (Conv1D)         (None, 3, 256)       12544       embedding[0][0]                  
__________________________________________________________________________________________________
Conv1D_256

# Model run

In [7]:
model.fit(x_train, y_train,
               validation_data=(x_train, y_train),
               epochs=100,
               batch_size=128,
               verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3 samples, validate on 3 samples
Epoch 1/100
 - 3s - loss: 0.8369 - val_loss: 1.0530
Epoch 2/100
 - 0s - loss: 1.5614 - val_loss: 0.9197
Epoch 3/100
 - 0s - loss: 1.4228 - val_loss: 0.9437
Epoch 4/100
 - 0s - loss: 0.8239 - val_loss: 1.0048
Epoch 5/100
 - 0s - loss: 1.7202 - val_loss: 0.8495
Epoch 6/100
 - 0s - loss: 1.1343 - val_loss: 0.7950
Epoch 7/100
 - 0s - loss: 1.3154 - val_loss: 0.4708
Epoch 8/100
 - 0s - loss: 0.6163 - val_loss: 0.3380
Epoch 9/100
 - 0s - loss: 0.5599 - val_loss: 0.3753
Epoch 10/100
 - 0s - loss: 0.3008 - val_loss: 0.3010
Epoch 11/100
 - 0s - loss: 0.5434 - val_loss: 0.1489
Epoch 12/100
 - 0s - loss: 0.1348 - val_loss: 0.0573
Epoch 13/100
 - 0s - loss: 0.4518 - val_loss: 0.0352
Epoch 14/100
 - 0s - loss: 0.0502 - val_loss: 0.0291
Epoch 15/100
 - 0s - loss: 0.0975 - val_loss: 0.0165
Epoch 16/100
 - 0s - loss: 0.0575 - val_loss: 0.0060
Epoch 17/100
 - 0s - loss: 0.0441 - val_loss: 0.0027
Epoch 18/100
 - 0s - loss: 0.0074 - val_loss: 0.0014
Epoch 19/100


<tensorflow.python.keras.callbacks.History at 0x176058529e8>

# Predict

In [8]:
def inference_embed(data) :
    mecab = Mecab()
    encode_raw = mecab.morphs(data)
    output = tokenizer.texts_to_sequences([encode_raw])
    return output

def predict(data):
    x_predict = inference_embed(data)
    x_predict = pad_sequences(x_predict, maxlen=MAX_SEQUENCE_LENGTH)
    y = model.predict(x_predict, 128, 2)
    print("result : {0}".format(y))
    print("result : {0}".format(y.argmax()))

predict('판교에 오늘 피자 주문해줘')
predict('오늘 날짜에 호텔 예약 해줄수있어')
predict('모래 날짜에 판교 여행 정보 알려줘')

result : [[2.7916927e-04 9.9417830e-01 5.5424403e-03]]
result : 1
result : [[7.6869748e-07 9.9999440e-01 4.9428463e-06]]
result : 1
result : [[1.743006e-14 7.012635e-12 1.000000e+00]]
result : 2
