# NAVER 영화 리뷰 데이터 분석 - CNN

In [1]:
# 필요한 모듈들을 임포트
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, Dropout, Dense, GlobalMaxPooling1D
from tensorflow.keras.constraints import MaxNorm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import json

from tqdm import tqdm

In [2]:
# Load the data that was preprocessed during EDA
DATA_IN_PATH = './data/'

INPUT_TRAIN_DATA = 'nsmc_train_input.npy'
INPUT_LABEL_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs_kr.json'

# Pass file paths (not bare open() handles) so np.load opens and closes the
# files itself; the previous open(...) handles were never closed.
train_input = np.load(os.path.join(DATA_IN_PATH, INPUT_TRAIN_DATA))
# maxlen is the array's own second dimension, so the sequence length is kept as-is.
train_input = pad_sequences(train_input, maxlen=train_input.shape[1])
train_label = np.load(os.path.join(DATA_IN_PATH, INPUT_LABEL_DATA))

# Preprocessing configuration (read below for its 'vocab_size' entry).
with open(os.path.join(DATA_IN_PATH, DATA_CONFIGS), 'r') as config_file:
    prepro_configs = json.load(config_file)

In [3]:
# Model hyperparameters
model_name = 'cnn_classifier_kr'

# Training-loop settings
BATCH_SIZE = 128
NUM_EPOCHS = 5
VALID_SPLIT = 0.1
MAX_LEN = train_input.shape[1]

# Keyword arguments consumed by the model constructor
kargs = dict(
    model_name=model_name,
    vocab_size=prepro_configs['vocab_size'],
    embedding_dimension=128,
    num_filters=100,
    dropout_rate=0.2,
    hidden_dimension=150,
    output_dimension=1,
)

In [4]:
# Define the model class
class CNNClassifier(Model):
    """Convolutional text classifier built from parallel Conv1D branches.

    The input is embedded, passed through dropout, fed to three Conv1D layers
    (kernel sizes 3, 4 and 5) whose outputs are each global-max-pooled and
    concatenated, and finally passed through two Dense layers ending in a
    sigmoid activation.

    Keyword Args:
        model_name: Name given to the Keras model.
        vocab_size: `input_dim` of the embedding layer.
        embedding_dimension: `output_dim` of the embedding layer.
        num_filters: Number of filters in each Conv1D branch.
        dropout_rate: Rate of the dropout applied after the embedding.
        hidden_dimension: Units of the hidden Dense layer.
        output_dimension: Units of the output Dense layer.
    """

    def __init__(self, **kargs):
        super(CNNClassifier, self).__init__(name=kargs['model_name'])
        self.embedding = Embedding(input_dim=kargs['vocab_size'],
                                  output_dim=kargs['embedding_dimension'])
        # One convolution branch per kernel size; all share the same settings.
        self.conv_list = [Conv1D(filters=kargs['num_filters'],
                                kernel_size=kernel_size,
                                padding='valid',
                                activation='relu',
                                kernel_constraint=MaxNorm(max_value=3.))
                         for kernel_size in [3,4,5]]
        self.max_pooling = GlobalMaxPooling1D()
        self.dropout = Dropout(kargs['dropout_rate'])
        self.fc1 = Dense(units=kargs['hidden_dimension'],
                        activation='relu', kernel_constraint=MaxNorm(max_value=3.))
        self.fc2 = Dense(units=kargs['output_dimension'],
                        activation='sigmoid', kernel_constraint=MaxNorm(max_value=3.))

    def call(self, x, training=None):
        """Run the forward pass.

        Args:
            x: Batch of inputs for the embedding layer
                (presumably integer token ids -- confirm against the data).
            training: Optional flag forwarded to the dropout layer so that
                dropout is applied only in training mode. When left as None,
                Keras decides the mode itself.

        Returns:
            Output of the final sigmoid Dense layer.
        """
        x = self.embedding(x)
        # Forward the mode explicitly instead of relying on implicit propagation.
        x = self.dropout(x, training=training)
        # Pool every convolution branch and join the results feature-wise.
        x = tf.concat([self.max_pooling(conv(x)) for conv in self.conv_list], axis=-1)
        x = self.fc1(x)
        x = self.fc2(x)

        return x

In [5]:
# Build the model and configure it for training
model = CNNClassifier(**kargs)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [6]:
# Guard against overfitting by stopping early.
# min_delta: minimum change that counts as an improvement (accuracy must rise by at least 0.0001)
# patience: number of epochs without improvement tolerated (patience = 1: stop after one such epoch)
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=1)

# The output directory must be defined before the checkpoint path is built;
# otherwise this cell raises NameError on a fresh kernel, because the only
# other assignment of DATA_OUT_PATH lives in the later evaluation cell.
DATA_OUT_PATH = './submission/'

checkpoint_path = DATA_OUT_PATH + model_name + '/weights.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint directory if it does not exist yet
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))


# Save only the weights, and only when val_accuracy improves.
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

./submission/cnn_classifier_kr -- Folder already exists 



In [7]:
# Train the model, holding out part of the training data for validation
history = model.fit(
    x=train_input,
    y=train_label,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_split=VALID_SPLIT,
    callbacks=[earlystop_callback, cp_callback],
)

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.82807, saving model to ./submission/cnn_classifier_kr/weights.h5
Epoch 2/5
Epoch 00002: val_accuracy improved from 0.82807 to 0.82913, saving model to ./submission/cnn_classifier_kr/weights.h5
Epoch 3/5
Epoch 00003: val_accuracy did not improve from 0.82913


In [10]:
# Load the test data
DATA_OUT_PATH = './submission/'
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'
SAVE_FILE_NAME = 'weights.h5'

# Pass file paths (not bare open() handles) so np.load opens and closes the
# files itself; the previous open(...) handles were never closed.
test_input = np.load(os.path.join(DATA_IN_PATH, INPUT_TEST_DATA))
# maxlen is the array's own second dimension, so the sequence length is kept as-is.
test_input = pad_sequences(test_input, maxlen=test_input.shape[1])
test_label_data = np.load(os.path.join(DATA_IN_PATH, LABEL_TEST_DATA))

# Evaluate the model using the weights saved by the checkpoint callback
model.load_weights(os.path.join(DATA_OUT_PATH, model_name, SAVE_FILE_NAME))
model.evaluate(test_input, test_label_data) # accuracy : 0.8273



[0.38986945152282715, 0.8272799849510193]