In [46]:
import numpy as np 
from keras.datasets import imdb

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding

from keras.preprocessing import sequence
# Fix NumPy's random seed for reproducibility.
np.random.seed(7)

# Considering every word that occurs in the reviews is pointless, so only the
# `top_words` most frequent words are kept as distinct tokens; each word is
# represented by its integer index.
# Reduced settings for a quick run (originally 500 and 5000 respectively).
max_review_length = 100
top_words = 500
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Make every sequence exactly `max_review_length` long. With pad_sequences'
# default settings, longer reviews simply lose their leading part.
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)
print("reading data complete")

In [49]:
# Layer stack:
# - Embedding: maps each integer word index to a trainable dense vector and
#   feeds the result onward. The inputs are vocabulary indices (0, 1, ...),
#   not one-hot vectors; input_dim is the vocabulary size.
# - Conv1D: applies several filters along the sequence to capture local
#   structure.
# - MaxPooling1D: condenses the convolution output (pool_size=2 halves the
#   sequence length).
# - LSTM: makes use of the sequential information.
# - Dense: binary classification, so the output layer is a single sigmoid unit.
embedding_vector_length = 32
model = Sequential([
    Embedding(input_dim=top_words,  # vocabulary size
              output_dim=embedding_vector_length,  # 32
              input_length=max_review_length),
    Conv1D(filters=32, kernel_size=5, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(50),  # originally 100
    Dropout(0.2),
    Dense(25, activation='sigmoid'),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()
# NOTE(review): the test split is also used as validation data here, so the
# final score below is not measured on data unseen during monitoring --
# consider carving a validation split out of the training set instead.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)
print("training complete")
# With metrics=['accuracy'], evaluate() returns [loss, accuracy].
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}".format(scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 32)           16000     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 32)           5152      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 50, 32)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 50)                16600     
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 26        
Total params: 39,053

Accuracy: 81.43
