라이브러리 import 및 설정

In [None]:
# Reload the autoreload extension so edits to imported modules are picked up live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings 
warnings.filterwarnings(action='ignore')

In [None]:
# Detect the physical GPUs and expose only the first one to TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPU detected')
else:
    try:
        # Hide every GPU except the first from the runtime.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Visible devices must be set before the GPUs have been initialized.
        print(err)

1 Physical GPUs, 1 Logical GPU


학습데이터 로드

In [None]:
# Training split: tab-separated file read with explicitly supplied column names.
train = pd.read_csv(
    "shuffle_train.tsv",
    sep='\t',
    header=None,
    names=['document', 'label'],
)
train.head()

Unnamed: 0,document,label
0,거실 콘센트에 선풍기 플러그가 꽂혀 있니?,483
1,나에게 물어보고 싶은 게 있는.,58
2,이번 달 발표 몇 번이야?,71
3,구호선 배차 시간 들어 봅시다.,189
4,현관문 잘 잠겼는지 체크하고 말해 줘.,516


In [None]:
# Test split: same tab-separated layout and column names as the training file.
test = pd.read_csv(
    "shuffle_test.tsv",
    sep='\t',
    header=None,
    names=['document', 'label'],
)
test.head()

Unnamed: 0,document,label
0,이 근처 맛집 순위 알려 줘.,34
1,자동 주행으로 얼마나 주행했는지 확인해 줘.,600
2,숙소 주변에서 놀러 갈 만한 길 찾아봐 줘.,37
3,시로 축구 새 소식 알려 줘.,532
4,방금 전원 들어와 있는 플러그 어디 있는 건지 불러 줘.,487


In [None]:
# Name of the label column in the loaded frames.
target_col = 'label'
# Number of cross-validation folds (also the divisor when averaging test predictions).
n_fold = 5
# Width of the softmax output / one-hot targets; equals train['label'].nunique().
n_class = 785
# Random state passed to the fold splitter.
seed = 42

데이터 정제

In [None]:
# Number of distinct utterances and distinct labels in the training frame.
tuple(train[column].nunique() for column in ('document', 'label'))

(59622, 785)

In [None]:
# Keep one row per distinct utterance. Rebinding the name (instead of inplace=True)
# avoids mutating the frame object produced by the load cell.
train = train.drop_duplicates(subset=['document'])

In [None]:
# Report how many rows remain after de-duplication.
print('총 샘플의 수 :', train.shape[0])

총 샘플의 수 : 59623


In [None]:
null_mask = train.isnull()
# Does the frame contain any null value at all?
print(null_mask.values.any())
# Null count per column
print(null_mask.sum())
# Show the rows whose document is null
train[null_mask['document']]

In [None]:
# Remove every row that contains a null, then confirm that none remain.
train = train.dropna(axis=0, how='any')
print(train.isna().values.any())

False


In [None]:
# Raw utterances of both splits plus the training labels, as arrays.
X_train, X_test, y = (
    train['document'].values,
    test['document'].values,
    train['label'].values,
)
print(*(arr.shape for arr in (X_train, X_test, y)))

(59622,) (9228,) (59622,)


In [None]:
# Peek at the first three raw training utterances.
X_train[:3]

array(['거실 콘센트에 선풍기 플러그가 꽂혀 있니?', '나에게 물어보고 싶은 게 있는.', '이번 달 발표 몇 번이야?'],
      dtype=object)

Training

In [None]:
# Passed to Tokenizer(num_words=...) and used as the Embedding input dimension.
vocab_size = 20000
# Size of each embedding vector.
embedding_dim = 64
# Every sequence is padded/truncated to this many tokens.
max_length = 20
# Side on which pad_sequences adds the padding.
padding_type='post'

In [None]:
# Fit the tokenizer on the training utterances only; the test text is never used here.
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_train)
# Word-to-index mapping exposed by the fitted tokenizer.
word_index = tokenizer.word_index

In [None]:
# Encode both splits as integer id sequences with the tokenizer fitted on the training text.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Bring every sequence to max_length tokens using the configured padding side.
pad_kwargs = dict(padding=padding_type, maxlen=max_length)
trn = pad_sequences(train_sequences, **pad_kwargs)
tst = pad_sequences(test_sequences, **pad_kwargs)
print(trn.shape, tst.shape)

(59622, 20) (9228, 20)


In [None]:
# Shuffled, label-stratified n_fold splitter with a fixed random state.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [None]:
def get_model():
    """Build and compile a fresh classifier for one cross-validation fold.

    The network is an embedding layer followed by two stacked bidirectional
    LSTM layers and a softmax layer with ``n_class`` outputs. It is compiled
    with categorical cross-entropy and Adam at a learning rate of 0.01.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    net = Sequential()
    net.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    # The first recurrent layer returns the full sequence so the second one can consume it.
    net.add(Bidirectional(LSTM(64, return_sequences=True)))
    net.add(Bidirectional(LSTM(64)))
    net.add(Dense(n_class, activation='softmax'))

    net.compile(optimizer=Adam(learning_rate=.01), loss='categorical_crossentropy')
    return net

In [None]:
# p_val collects out-of-fold predictions for the training rows;
# p_tst accumulates the test predictions averaged over the folds.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    # A new model per fold so no weights leak between folds.
    clf = get_model()

    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    # Pin the one-hot width to n_class. Without num_classes, to_categorical sizes the
    # matrix from the largest label present in the slice, so a fold that happens to
    # miss the highest label would no longer match the Dense(n_class) output.
    y_trn = to_categorical(y[i_trn], num_classes=n_class)
    y_val = to_categorical(y[i_val], num_classes=n_class)

    clf.fit(trn[i_trn],
            y_trn,
            validation_data=(trn[i_val], y_val),
            epochs=10,
            batch_size=512,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(trn[i_val])
    p_tst += clf.predict(tst) / n_fold

In [None]:
# Score the out-of-fold predictions against the true training labels.
cv_accuracy = accuracy_score(y, p_val.argmax(axis=1))
print(f'Accuracy (CV): {cv_accuracy * 100:8.4f}%')
cv_log_loss = log_loss(pd.get_dummies(y), p_val)
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  76.6311%
Log Loss (CV):   1.1490


In [None]:
def sentiment_predict(new_sentence, model=None):
    """Predict the label id for a single raw sentence.

    The sentence goes through the same preprocessing as the training data:
    the fitted ``tokenizer`` turns it into an id sequence, which is padded
    with the notebook's ``padding_type`` and ``max_length`` settings. No
    morphological analysis is applied, because none was applied to the
    training text either.

    Args:
        new_sentence: The raw input sentence (str).
        model: Model used for the prediction. Defaults to ``clf``, the model
            left over from the last cross-validation fold.

    Returns:
        The index of the highest-probability class, as an int.
    """
    if model is None:
        model = clf
    encoded = tokenizer.texts_to_sequences([new_sentence])
    padded = pad_sequences(encoded, padding=padding_type, maxlen=max_length)
    # predict returns one probability row per input; only one sentence was passed.
    probabilities = model.predict(padded)[0]
    return int(np.argmax(probabilities))

In [None]:
# np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
# np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

시각화

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
clf.summary()

In [None]:
# Draw the layer graph of clf, the model left over from the last CV fold.
plot_model(clf)