In [None]:
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

# Seed the NumPy and TensorFlow random number generators.
# NumPy is imported as `np`; the bare name `numpy` is not bound in this file,
# so the seed call must go through the alias.
seed = 0
np.random.seed(seed)
tf.random.set_seed(3)

# Read the training and test CSV files.
train = pd.read_csv('./dataset/train_SectionDataset.csv')
test = pd.read_csv('./dataset/test_SectionDataset.csv')

# For each split, column 0 of a row is taken as the title and column 1 as the
# label.
train_text = np.array(train)
x_train = [row[0] for row in train_text]  # training-set titles
y_train = [row[1] for row in train_text]  # training-set labels

test_text = np.array(test)
x_test = [row[0] for row in test_text]  # test-set titles
y_test = [row[1] for row in test_text]  # test-set labels

# Build the vocabulary from every title (training + test).
# A single fit is enough: fitting again on the same texts adds no new words
# and only inflates the tokenizer's internal word/document counts.
token = Tokenizer()
token.fit_on_texts(x_train + x_test)

# Integer sequences for the combined corpus, and the vocabulary size.
x_token = token.texts_to_sequences(x_train + x_test)
word_size = len(token.word_index)

# Integer sequences for each split separately.
train_token = token.texts_to_sequences(x_train)
test_token = token.texts_to_sequences(x_test)

# One-hot encode the labels.
# Both splits are encoded with the same number of classes so that the two
# matrices always have the same width, even if one split happens not to
# contain the highest class id.
# NOTE(review): assumes labels are non-negative integer class ids, as
# to_categorical requires -- confirm against the CSV.
y = y_train + y_test
num_classes = int(max(y)) + 1
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

# Pad (or truncate) every token sequence to a fixed length of 18.
max_len = 18
padded_x_train = pad_sequences(train_token, maxlen=max_len)
padded_x_test = pad_sequences(test_token, maxlen=max_len)
padded_x = pad_sequences(x_token, maxlen=max_len)

# Model definition: Embedding -> Dropout -> Conv1D -> MaxPooling1D -> LSTM,
# followed by a softmax classifier with one output unit per class.
# The output width is taken from the one-hot training targets so that the
# network's output matches the shape categorical_crossentropy expects.
num_classes = y_train_encoded.shape[1]

model = Sequential()
model.add(Embedding(word_size + 1, 100))  # +1 because word indices start at 1
model.add(Dropout(0.5))
model.add(Conv1D(64, 5, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(55))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# Compile the model.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Keep the array versions of the raw labels available under their old names.
y_train = np.array(y_train)
y_test = np.array(y_test)
y = np.array(y)

# Train on the training split only, using the one-hot targets. Fitting on the
# combined train + test data would leak the test set into training and make
# the accuracy reported below meaningless.
model.fit(padded_x_train, y_train_encoded, batch_size=10, epochs=20)

# Print the test accuracy, evaluated against the one-hot test targets.
print("\n Test Accuracy: %.4f" % (model.evaluate(padded_x_test, y_test_encoded)[1]))

# Show the first sample of each split at every preprocessing stage, along
# with the vocabulary size and the longest token sequence in each split.
print("첫번째 학습셋 입력: ", x_train[0])
print("첫번째 테스트셋 입력: ", x_test[0])
print("첫번째 학습셋 결과 one-hot 출력: ", y_train_encoded[0])
print("첫번째 테스트셋 결과 one-hot 출력: ", y_test_encoded[0])
print("전체 데이터셋 단어 토큰 개수: ", word_size)
print("첫번째 학습셋 토큰 결과: ", train_token[0])
print("첫번째 테스트셋 토큰 결과: ", test_token[0])

# Longest tokenized title in the training split.
trainMax = max(map(len, train_token))
print("학습셋 제목 최대 길이: ", trainMax)

# Longest tokenized title in the test split.
testMax = max(map(len, test_token))
print("테스트셋 제목 최대 길이: ", testMax)

print("첫번째 학습셋 패딩 토큰: ", padded_x_train[0])
print("첫번째 테스트셋 패딩 토큰: ", padded_x_test[0])

import pickle

# Serialize the tokenized test-set sequences to 'test_token.pickle' using the
# highest pickle protocol available.
with open('test_token.pickle', 'wb') as out_file:
    pickle.dump(test_token, out_file, protocol=pickle.HIGHEST_PROTOCOL)