In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.constraints import MaxNorm

import numpy as np
import pandas as pd

import os
import json

In [2]:
# Load the preprocessed training data and the preprocessing config.
DATA_IN_PATH  = './data/'
DATA_OUT_PATH = './submission/'

TRAIN_INPUT_DATA = 'train_input.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
DATA_CONFIGS = 'data_configs_en.json'

# Hand the paths straight to np.load so NumPy opens and closes the files
# itself; wrapping them in a bare open(...) would leave the handles unclosed.
train_input = np.load(DATA_IN_PATH + TRAIN_INPUT_DATA)
train_label = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA)

# The context manager guarantees the config file is closed after parsing.
with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as config_file:
    prepro_configs = json.load(config_file)

In [3]:
# Load the test inputs as well; they are needed to choose the vocabulary size,
# because test_inputs.npy contains values outside the range ([0, 74067)) given
# by prepro_configs['vocab_size'].
TEST_INPUT_DATA = 'test_inputs.npy'
TEST_ID_DATA = 'test_id.npy'
SAVE_FILE_NAME = 'weights.h5'

# Pass the path directly so np.load manages (and closes) the file handle.
test_input = np.load(DATA_IN_PATH + TEST_INPUT_DATA)
test_input = pad_sequences(test_input, maxlen=test_input.shape[1])

In [4]:
# Fix the random seeds for reproducibility.
SEED_NUM = 1234
# Seed NumPy's global RNG too, so NumPy-based randomness is repeatable as well
# as TensorFlow's.
np.random.seed(SEED_NUM)
tf.random.set_seed(SEED_NUM)

In [14]:
# Model hyperparameters
model_name = 'cnn_classifier_en'
BATCH_SIZE = 128
NUM_EPOCHS = 5
VALID_SPLIT = 0.1
MAX_LEN = train_input.shape[1]

# The embedding table must cover every token id fed to the model. Take the
# largest id seen in BOTH the training and the test inputs (plus one, since ids
# start at 0) and never go below the size recorded at preprocessing time, so
# that no lookup can fall outside the table.
# NOTE(review): assumes prepro_configs['vocab_size'], when present, is an
# integer-like value -- confirm against the preprocessing notebook.
VOCAB_SIZE = int(max(int(prepro_configs.get('vocab_size', 0)),
                     np.max(train_input) + 1,
                     np.max(test_input) + 1))

# Kept for backward compatibility: despite its name, this value is the
# vocabulary size used for the embedding layer, not a sentence length.
MAX_SENTENCE_LENGTH = VOCAB_SIZE

kargs = {'model_name': model_name,
         'vocab_size': VOCAB_SIZE,
         'embedding_dimension': 128,
         'num_filters': 100,
         'dropout_rate': 0.5,
         'hidden_dimension': 100,
         'output_dimension': 1}

In [6]:
# Model definition
class CNNClassifier(Model):
    """Text classifier built from parallel 1-D convolutions over embeddings.

    The input is embedded, passed through dropout, fed to several Conv1D
    layers with different kernel sizes, global-max-pooled per convolution,
    concatenated, and finally passed through two dense layers.

    Keyword Args:
        model_name: Name given to the Keras model.
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dimension: ``output_dim`` of the embedding layer.
        num_filters: Number of filters in each Conv1D layer.
        dropout_rate: Rate of the dropout layer applied after the embedding.
        hidden_dimension: Units of the first (ReLU) dense layer.
        output_dimension: Units of the final (sigmoid) dense layer.
        kernel_sizes: Optional iterable of Conv1D kernel sizes; defaults to
            ``(3, 4, 5)``.
    """

    def __init__(self, **kargs):
        super().__init__(name=kargs['model_name'])
        self.embedding = Embedding(input_dim=kargs['vocab_size'],
                                   output_dim=kargs['embedding_dimension'])
        # One convolution per kernel size; every kernel is max-norm constrained.
        kernel_sizes = kargs.get('kernel_sizes', (3, 4, 5))
        self.conv_list = [Conv1D(filters=kargs['num_filters'],
                                 kernel_size=kernel_size,
                                 padding='valid',
                                 activation='relu',
                                 kernel_constraint=MaxNorm(max_value=3.))
                          for kernel_size in kernel_sizes]
        self.max_pooling = GlobalMaxPooling1D()
        self.dropout = Dropout(kargs['dropout_rate'])
        self.fc1 = Dense(units=kargs['hidden_dimension'],
                         activation='relu', kernel_constraint=MaxNorm(max_value=3.))
        self.fc2 = Dense(units=kargs['output_dimension'],
                         activation='sigmoid', kernel_constraint=MaxNorm(max_value=3.))

    def call(self, x, training=None):
        """Run the forward pass.

        Args:
            x: Batch of token-id sequences accepted by the embedding layer.
            training: Optional flag forwarded to the dropout layer so that
                dropout is only active in training mode. ``None`` lets Keras
                decide from the calling context.

        Returns:
            The output of the final sigmoid dense layer.
        """
        x = self.embedding(x)
        x = self.dropout(x, training=training)
        # Pool each convolution's feature map down to one vector, then join
        # the per-kernel-size vectors along the feature axis.
        pooled = [self.max_pooling(conv(x)) for conv in self.conv_list]
        x = tf.concat(pooled, axis=-1)
        x = self.fc1(x)
        return self.fc2(x)

In [7]:
# Instantiate the classifier and configure it for training.
model = CNNClassifier(**kargs)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [8]:
# Guard against overfitting: stop training once validation accuracy stalls.
# min_delta: smallest change in val_accuracy that counts as an improvement
#            (accuracy has to improve by at least 0.0001).
# patience:  number of epochs without improvement tolerated before stopping
#            (patience=1 stops after one epoch with no improvement).
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=1)

# Build the checkpoint path from SAVE_FILE_NAME with os.path.join, the same
# way the weights are located when they are reloaded after training, so the
# save and load locations cannot drift apart.
checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, SAVE_FILE_NAME)
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint directory if it does not exist yet.
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

# Keep only the weights of the epoch with the best validation accuracy.
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

./submission/cnn_classifier -- Folder already exists 



In [13]:
# Train the model, holding out part of the training data for validation.
history = model.fit(
    train_input,
    train_label,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_split=VALID_SPLIT,
    callbacks=[earlystop_callback, cp_callback],
)

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.87720, saving model to ./submission/cnn_classifier/weights.h5
Epoch 2/5
Epoch 00002: val_accuracy improved from 0.87720 to 0.88040, saving model to ./submission/cnn_classifier/weights.h5
Epoch 3/5
Epoch 00003: val_accuracy did not improve from 0.88040


In [16]:
# Restore the weights of the model that achieved the best validation score.
best_weights_path = os.path.join(DATA_OUT_PATH, model_name, SAVE_FILE_NAME)
model.load_weights(best_weights_path)

In [17]:
# Build the CSV file to submit to Kaggle and save it.
predictions = model.predict(test_input)
# Drop the trailing axis so each row holds a single prediction value.
predictions = predictions.squeeze(-1)

# Pass the path directly so np.load manages (and closes) the file handle.
# NOTE(review): allow_pickle=True unpickles the file's contents, which can
# execute arbitrary code -- only load id files from a trusted source.
test_id = np.load(DATA_IN_PATH + TEST_ID_DATA, allow_pickle=True)

output = pd.DataFrame(data={'id': list(test_id), 'sentiment': list(predictions)})
# quoting=3 is csv.QUOTE_NONE: fields are written without quote characters.
output.to_csv(DATA_OUT_PATH + 'movie_review_sentiment_analysis_CNN.csv', index=False, quoting=3)