# IMDB 영화리뷰 감성분석

In [1]:
import numpy as np
import tensorflow as tf
# One shared seed for the global random generators of TensorFlow and NumPy,
# set before any data splitting or model creation happens.
seed = 2023
tf.random.set_seed(seed)  # TensorFlow global seed
np.random.seed(seed)      # NumPy legacy global seed

In [2]:
from tensorflow.keras.datasets import imdb
# Load the IMDB review dataset with default arguments (no vocabulary cap);
# load_data returns a (train, test) pair of (reviews, labels) tuples.
imdb_train, imdb_test = imdb.load_data()
X_train, y_train = imdb_train
X_test, y_test = imdb_test

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
# Number of reviews in the train and test splits
tuple(len(split) for split in (X_train, X_test))

(25000, 25000)

In [4]:
# Number of distinct words that appear in the IMDB word index
word_index = imdb.get_word_index()
len(word_index)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


88584

In [5]:
# Maximum and mean review length (number of entries per review)
review_lengths = [len(review) for review in X_train]
max(review_lengths), sum(review_lengths) / len(X_train)

(2494, 238.71364)

### LSTM으로 감성분석
- 단어 빈도수: 10,000 (총 88,584)
- 리뷰 단어수: 500
- 데이터 개수
    - 전체 데이터 사용 시: train 25000, test 15000, val 10000개
    - 이 노트북(수업 시간 관계상): train 20000, test 5000, val은 train의 20% (`validation_split=0.2`)

In [6]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [7]:
# Vocabulary cap passed to load_data: only the `num_words` most frequent
# words are kept.
num_words = 10000
# Keep only the first (training) split returned by load_data; the second
# split is not used in this notebook. Indexing with [0] instead of unpacking
# into `(_, _)` avoids rebinding `_`, which IPython reserves for the last
# cell output and stops updating once user code overrides it.
X_train, y_train = imdb.load_data(num_words=num_words)[0]

In [8]:
# Hold out 20% of the loaded reviews as a test set. `stratify` keeps the
# label ratio in both parts and `random_state` makes the split repeatable.
# NOTE: X_train / y_train are overwritten with their own subset, so running
# this cell a second time would split the already-reduced training set again.
split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=seed,
)
X_train, X_test, y_train, y_test = split
tuple(part.shape for part in split)

((20000,), (5000,), (20000,), (5000,))

In [9]:
# Bring every review to exactly `max_len` entries so the data becomes a
# rectangular 2-D array.
max_len = 500
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)
X_train.shape, X_test.shape

((20000, 500), (5000, 500))

#### 모델
- Embedding vector 차원: 100
- LSTM unit 개수: 128

In [10]:
# LSTM classifier: 100-dimensional word embedding -> LSTM with 128 units ->
# single sigmoid unit for the binary sentiment label.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=100, input_length=max_len))
model.add(LSTM(units=128))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,117,377
Trainable params: 1,117,377
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
# Keyword arguments are used on purpose: the parameter that follows `loss`
# in `compile` is `metrics` in tf.keras 2.x but `loss_weights` in Keras 3,
# so a positional ['accuracy'] would be misread on newer versions.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
# File that receives the best checkpoint of the LSTM model
model_path = 'best_model.h5'
# Write the model to model_path only on epochs where the monitored value
# (val_loss, the callback default) improves.
mc = ModelCheckpoint(filepath=model_path, save_best_only=True, verbose=1)
# Stop training after 5 epochs in a row without improvement.
es = EarlyStopping(patience=5)

In [13]:
# Train for at most 30 epochs with 20% of the training arrays held out for
# validation; checkpointing and early stopping are handled by the callbacks.
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=30,
    validation_split=0.2,
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.33873, saving model to best_model.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.33873 to 0.33202, saving model to best_model.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.33202
Epoch 4/30
Epoch 4: val_loss did not improve from 0.33202
Epoch 5/30
Epoch 5: val_loss did not improve from 0.33202
Epoch 6/30
Epoch 6: val_loss did not improve from 0.33202
Epoch 7/30
Epoch 7: val_loss did not improve from 0.33202


In [14]:
# Reload the checkpoint saved during training and score it on the held-out
# test set; evaluate returns [loss, accuracy].
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.3295687139034271, 0.8622000217437744]

#### Conv1D로 학습

In [15]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout

In [16]:
# 1-D convolutional classifier: embedding -> dropout -> two Conv1D/MaxPooling1D
# stages -> global max pooling -> single sigmoid unit.
model2 = Sequential()
model2.add(Embedding(input_dim=num_words, output_dim=100, input_length=max_len))
model2.add(Dropout(rate=0.5))
model2.add(Conv1D(filters=64, kernel_size=7, activation='relu'))
model2.add(MaxPooling1D(pool_size=7))
model2.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model2.add(MaxPooling1D(pool_size=5))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(units=1, activation='sigmoid'))
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout (Dropout)           (None, 500, 100)          0         
                                                                 
 conv1d (Conv1D)             (None, 494, 64)           44864     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 70, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 64)            20544     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 13, 64)           0         
 1D)                                                  

In [17]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
# Keyword arguments are used on purpose: the parameter that follows `loss`
# in `compile` is `metrics` in tf.keras 2.x but `loss_weights` in Keras 3,
# so a positional ['accuracy'] would be misread on newer versions.
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# File that receives the best checkpoint of the Conv1D model
model2_path = 'best_model2.h5'
# Save only on epochs where the monitored value (val_loss by default) improves.
mc2 = ModelCheckpoint(model2_path, verbose=1, save_best_only=True)
# Stop training after 5 epochs in a row without improvement.
es2 = EarlyStopping(patience=5)

In [18]:
# Train the Conv1D model for at most 30 epochs with 20% of the training
# arrays held out for validation.
hist2 = model2.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=30,
    validation_split=0.2,
    callbacks=[mc2, es2],
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.34284, saving model to best_model2.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.34284 to 0.31695, saving model to best_model2.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.31695
Epoch 4/30
Epoch 4: val_loss did not improve from 0.31695
Epoch 5/30
Epoch 5: val_loss did not improve from 0.31695
Epoch 6/30
Epoch 6: val_loss did not improve from 0.31695
Epoch 7/30
Epoch 7: val_loss did not improve from 0.31695


In [19]:
# Reload the best Conv1D checkpoint and score it on the held-out test set;
# evaluate returns [loss, accuracy].
best_model2 = load_model(filepath=model2_path)
best_model2.evaluate(x=X_test, y=y_test)



[0.31364983320236206, 0.8654000163078308]

#### Conv1D + LSTM

In [20]:
# Hybrid classifier: embedding -> dropout -> one Conv1D/MaxPooling1D stage
# that shortens the sequence -> LSTM with 100 units -> single sigmoid unit.
model3 = Sequential()
model3.add(Embedding(input_dim=num_words, output_dim=100, input_length=max_len))
model3.add(Dropout(rate=0.5))
model3.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model3.add(MaxPooling1D(pool_size=5))
model3.add(LSTM(units=100))
model3.add(Dense(units=1, activation='sigmoid'))
model3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_1 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 99, 64)           0         
 1D)                                                             
                                                                 
 lstm_1 (LSTM)               (None, 100)               66000     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                      

In [21]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
# Keyword arguments are used on purpose: the parameter that follows `loss`
# in `compile` is `metrics` in tf.keras 2.x but `loss_weights` in Keras 3,
# so a positional ['accuracy'] would be misread on newer versions.
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# File that receives the best checkpoint of the Conv1D + LSTM model
model3_path = 'best_model3.h5'
# Save only on epochs where the monitored value (val_loss by default) improves.
mc3 = ModelCheckpoint(model3_path, verbose=1, save_best_only=True)
# Stop training after 5 epochs in a row without improvement.
es3 = EarlyStopping(patience=5)

In [22]:
# Train the Conv1D + LSTM model for at most 30 epochs with 20% of the
# training arrays held out for validation.
hist3 = model3.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=30,
    validation_split=0.2,
    callbacks=[mc3, es3],
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.29699, saving model to best_model3.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.29699 to 0.28146, saving model to best_model3.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.28146
Epoch 4/30
Epoch 4: val_loss did not improve from 0.28146
Epoch 5/30
Epoch 5: val_loss did not improve from 0.28146
Epoch 6/30
Epoch 6: val_loss did not improve from 0.28146
Epoch 7/30
Epoch 7: val_loss did not improve from 0.28146


In [23]:
# Reload the best Conv1D + LSTM checkpoint and score it on the held-out
# test set; evaluate returns [loss, accuracy].
best_model3 = load_model(filepath=model3_path)
best_model3.evaluate(x=X_test, y=y_test)



[0.27746376395225525, 0.881600022315979]