# IMDB 영화 리뷰 감성 분석
- 테스트 데이터 25000건 중 10000건은 Validation 데이터로 활용
- Conv1D  
        컨볼루션 : 특징을 추출해내는 것, 이미지의 경우 2차원이니 2d, 텍스트는 1차원이니 1d

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

import warnings
warnings.filterwarnings('ignore')

In [3]:
from tensorflow.keras.datasets import imdb
(X_train, y_train), (X_test,y_test) = imdb.load_data(num_words=None)

X_train.shape, X_test.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


((25000,), (25000,))

In [4]:
index_dict = {}

for key, value in imdb.get_word_index().items():
    index_dict[value] = key

len(index_dict)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


88584

In [6]:
print(X_train[2])

[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12, 16, 3711, 33, 75, 43, 1829, 296, 4, 86, 320, 35, 534, 19, 263, 4821, 1301, 4, 1873, 33, 89, 78, 12, 66, 16, 4, 360, 7, 4, 58, 316, 334, 11, 4, 1716, 43, 645, 662, 8, 257, 85, 1200, 42, 1228, 2578, 83, 68, 3912, 15, 36, 165, 1539, 278, 36, 69, 44076, 780, 8, 106, 14, 6905, 1338, 18, 6, 22, 12, 215, 28, 610, 40, 6, 87, 326, 23, 2300, 21, 23, 22, 12, 272, 40, 57, 31, 11, 4, 22, 47, 6, 2307, 51, 9, 170, 23, 595, 116, 595, 1352, 13, 191, 79, 638, 89, 51428, 14, 9, 8, 106, 607, 624, 35, 534, 6, 227, 7, 129, 113]


In [8]:
' '.join(index_dict[s] for s in X_train[2])

"the as there in at by br of sure many br of proving no only women was than doesn't as you never of hat night that with ignored they bad out superman plays of how star so stories film comes defense date of wide they don't do that had with of hollywood br of my seeing fan this of pop out body shots in having because cause it's stick passing first were enjoys for from look seven sense from me superimposition die in character as cuban issues but is you that isn't one song just is him less are strongly not are you that different just even by this of you there is eight when it part are film's love film's 80's was big also light don't wrangling as it in character looked cinematography so stories is far br man acting"

In [9]:
print('영화평 최대 길이 : ', max(len(l) for l in X_train))
print('영화평 평균 길이 : ', sum(map(len,X_train)) / len(X_train))

영화평 최대 길이 :  2494
영화평 평균 길이 :  238.71364


## Conv1D와 LSTM으로 IMDB 리뷰 감성 분류
- 단어 빈도수 : 5,000(총단어수는 88,584)  
- 문장의 단어수 : 500단어 (최대 : 2,494)  
- Test data 중 10000개는 검증데이터로

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, Dropout, MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [12]:
# 단어 빈도수 : 5,000
(X_train, y_train), (X_test,y_test) = imdb.load_data(num_words=5000)

In [14]:
# 문장의 단어수 : 500단어 
max_len = 500

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test,maxlen=max_len)

In [15]:
# Test데이터 중 만개는 검증 데이터로
from sklearn.model_selection import train_test_split
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, stratify=y_test, test_size=0.4, random_state=seed)

X_test.shape, X_val.shape, y_test.shape, y_val.shape

((15000, 500), (10000, 500), (15000,), (10000,))

### 모델 정의 / 설정 /학습 / 평가

In [19]:
model = Sequential([
    Embedding(5000,120),
    Dropout(0.5),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(60),
    Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 120)         600000    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 120)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 64)          38464     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 64)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 60)                30000     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 61        
Total params: 668,525
Trainable params: 668,525
Non-trainable params: 0
________________________________________________

In [20]:
model.compile(
    optimizer='adam',
    loss = 'binary_crossentropy',metrics=['accuracy']
)

In [22]:
# Callback 함수
model_path = 'model/IMDB_Conv1d_lstm_best.h5'
checkpointer = ModelCheckpoint(
    model_path, monitor='val_loss', verbose=1, save_best_only=True
)
early_stopping = EarlyStopping(patience=5)

In [24]:
# 모델 학습
history = model.fit(X_train, y_train, epochs=50, batch_size = 100, validation_data = (X_val, y_val), verbose=2, callbacks=[checkpointer, early_stopping] )

Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.28942, saving model to model\IMDB_Conv1d_lstm_best.h5
250/250 - 59s - loss: 0.4114 - accuracy: 0.7947 - val_loss: 0.2894 - val_accuracy: 0.8837
Epoch 2/50

Epoch 00002: val_loss improved from 0.28942 to 0.26943, saving model to model\IMDB_Conv1d_lstm_best.h5
250/250 - 55s - loss: 0.2365 - accuracy: 0.9086 - val_loss: 0.2694 - val_accuracy: 0.8907
Epoch 3/50

Epoch 00003: val_loss did not improve from 0.26943
250/250 - 55s - loss: 0.1944 - accuracy: 0.9250 - val_loss: 0.2784 - val_accuracy: 0.8836
Epoch 4/50

Epoch 00004: val_loss did not improve from 0.26943
250/250 - 55s - loss: 0.1577 - accuracy: 0.9414 - val_loss: 0.3079 - val_accuracy: 0.8792
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.26943
250/250 - 52s - loss: 0.1328 - accuracy: 0.9524 - val_loss: 0.2997 - val_accuracy: 0.8798
Epoch 6/50

Epoch 00006: val_loss did not improve from 0.26943
250/250 - 49s - loss: 0.1101 - accuracy: 0.9624 - val_loss: 0.3247 - val

In [26]:
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.2695341408252716, 0.8879333138465881]

_______________

## LSTM의 경우 

In [27]:
model2 = Sequential([
    Embedding(5000,120),
    LSTM(120),
    Dense(1, activation='sigmoid')
])
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 120)         600000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 120)               115680    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 121       
Total params: 715,801
Trainable params: 715,801
Non-trainable params: 0
_________________________________________________________________


In [28]:
model2.compile(
    optimizer='adam',
    loss = 'binary_crossentropy',metrics=['accuracy']
)

In [29]:
# Callback 함수
model_path = 'model/IMDB_lstm_best.h5'
checkpointer = ModelCheckpoint(
    model_path, monitor='val_loss', verbose=1, save_best_only=True
)
early_stopping = EarlyStopping(patience=5)

In [30]:
# 모델 학습
history2 = model2.fit(X_train, y_train, epochs=50, batch_size = 100, validation_data = (X_val, y_val), verbose=2, callbacks=[checkpointer, early_stopping] )

Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.30666, saving model to model\IMDB_lstm_best.h5
250/250 - 284s - loss: 0.4150 - accuracy: 0.8045 - val_loss: 0.3067 - val_accuracy: 0.8744
Epoch 2/50

Epoch 00002: val_loss did not improve from 0.30666
250/250 - 273s - loss: 0.2710 - accuracy: 0.8908 - val_loss: 0.3170 - val_accuracy: 0.8665
Epoch 3/50

Epoch 00003: val_loss did not improve from 0.30666
250/250 - 290s - loss: 0.2248 - accuracy: 0.9128 - val_loss: 0.3182 - val_accuracy: 0.8671
Epoch 4/50

Epoch 00004: val_loss did not improve from 0.30666
250/250 - 305s - loss: 0.1979 - accuracy: 0.9240 - val_loss: 0.3351 - val_accuracy: 0.8692
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.30666
250/250 - 303s - loss: 0.1793 - accuracy: 0.9314 - val_loss: 0.3707 - val_accuracy: 0.8645
Epoch 6/50

Epoch 00006: val_loss did not improve from 0.30666
250/250 - 307s - loss: 0.1415 - accuracy: 0.9476 - val_loss: 0.3887 - val_accuracy: 0.8579


In [33]:
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.3048918545246124, 0.8734666705131531]