In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from konlpy.tag import Mecab,Okt
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, Conv1D, LSTM, Dropout, GlobalMaxPooling1D, GRU

In [2]:
# 데이터 불러오기
data = pd.read_csv('train.csv')

# class 열을 숫자로 변환
encoder = LabelEncoder()
data['class'] = encoder.fit_transform(data['class'])

In [3]:
mecab = Mecab()
data['conversation'] = data['conversation'].apply(mecab.morphs)

In [4]:
# 케라스를 이용한 정수 인코딩
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['conversation'])

sequences = tokenizer.texts_to_sequences(data['conversation'])

In [5]:
# 입력 데이터 패딩 처리
data_pad = pad_sequences(sequences)

In [6]:
# 데이터셋 분리
X_train, X_val, y_train, y_val = train_test_split(data_pad, data['class'], test_size=0.1, random_state=42)

In [7]:
def get_mlp():
    # MLP 모델 생성
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1, 128, input_length=data_pad.shape[1]))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(len(data['class'].unique()), activation='softmax'))
    return model

In [8]:
def get_cnn():
    # CNN 모델 생성
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1, 128, input_length=data_pad.shape[1]))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(len(data['class'].unique()), activation='softmax'))

    return model

In [9]:
def get_lstm():
    # LSTM 모델 생성
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1, 128, input_length=data_pad.shape[1]))
    model.add(LSTM(64))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(len(data['class'].unique()), activation='softmax'))

    return model

In [10]:
def get_gru():
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1, 128, input_length=data_pad.shape[1]))
    model.add(GRU(64))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(len(data['class'].unique()), activation='softmax'))
    return model

In [11]:
model = get_gru()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 447, 128)          1674240   
_________________________________________________________________
gru (GRU)                    (None, 64)                37248     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 260       
Total params: 1,715,908
Trainable params: 1,715,908
Non-trainable params: 0
_________________________________________________________________


In [12]:
# 모델 컴파일
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [13]:
from datetime import datetime

start = datetime.now()
# 모델 학습
history = model.fit(X_train, y_train, validation_split=0.15, epochs=10)

print(f'\n\nEnd of train at: {datetime.now()-start}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


End of train at: 0:00:17.850524


In [14]:
print(model.evaluate(X_val, y_val))

[1.1251415014266968, 0.8075949549674988]
