# LSTM을 사용한 영화 리뷰 감정 분석

영화 리뷰와 같은 데이터는 일반적인 수치 데이터와 달리 사람의 언어, 즉 자연어 데이터입니다. 자연어 데이터는 머신러닝에서 특히 다루기 까다로운 데이터인데요. 데이터 자체가 길고 전처리할 부분이 많을 뿐만 아니라 순서나 단어의 유사성 개념을 이해해야만 하기 때문입니다.

딥러닝에서는 이렇듯 순서가 있는 데이터를 처리하기 위해 RNN(Recurrent Neural Network)을 사용하고, 일반적인 RNN에서 발생하는 기울기 소실(vanishing gradient)과 같은 문제를 완화하기 위해 LSTM처럼 개량된 다양한 레이어를 사용합니다.

단어의 유사성을 학습하는 것은, 단어를 단순히 1, 2, 53과 같은 숫자로 표현하는 대신 [0.3, 0.4, 2.1]과 같은 좌표로 표현함으로써 가능합니다. 좌표 공간에서의 방향이나 거리로, 인간이 이해하는 단어 사이의 관계를 표현하는 것이죠. 이를 word embedding이라고 하며 자연어 처리 분야에서 거의 필수적인 요소입니다.

LSTM(RNN) 소개  
https://brunch.co.kr/@chris-song/9

Word Embedding (Word2Vec)  
https://deeplearning4j.org/kr/word2vec

In [1]:
# Use TensorFlow as the backend framework for Keras
import tensorflow as tf
# Run this cell and wait for the "*" execution marker to disappear before moving on

# Print the local devices that TensorFlow reports
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2818496372077541760
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11287966516
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9339303113628124430
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"
]


In [2]:
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, CuDNNLSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Vocabulary settings: keep only the `top_words` most frequent words. Per the
# Keras docs, rarer words are replaced by the out-of-vocabulary id rather than
# being zeroed or dropped.
index_from_num = 3
top_words = 3000

train_split, test_split = imdb.load_data(num_words=top_words, index_from=index_from_num)
X_train, y_train = train_split
X_test, y_test = test_split

# Truncate or pad every review so it holds exactly max_review_length word ids
max_review_length = 300
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Variants with a trailing feature axis, for recurrent layers fed without an Embedding
X_alt_train, X_alt_test = X_train[..., numpy.newaxis], X_test[..., numpy.newaxis]

# Default inputs; the model-builder functions rebind these two globals
X, X_t = X_train, X_test

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [3]:
# Inspect the first padded test review (an array of word ids)
print(X_t[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [4]:
# Reload the reviews without a vocabulary cap so they can be decoded back to text
(X_original_train, y_original_train), (X_original_test, y_original_test) = imdb.load_data(index_from=index_from_num)

# Shift every word id by the same offset used when loading the data, then
# register the special tokens that occupy the low ids.
word_to_id = {word: rank + index_from_num for word, rank in imdb.get_word_index().items()}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Reverse lookup used to turn id sequences back into words
id_to_word = {idx: word for word, idx in word_to_id.items()}

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [0]:
def dense_only():
    """Build a fully connected baseline that reads the padded word ids directly.

    Side effect: rebinds the module-level ``X`` / ``X_t`` to ``X_train`` /
    ``X_test`` so that later cells feed inputs matching this model.

    Returns:
        A compiled ``Sequential`` model: one Dense(512) layer without an
        activation, three Dense(512) ReLU layers, and a single sigmoid unit.
    """
    global X, X_t
    X, X_t = X_train, X_test

    hidden_units = 512
    layers = [Dense(hidden_units, input_shape=(max_review_length,))]
    layers.extend(Dense(hidden_units, activation='relu') for _ in range(3))
    layers.append(Dense(1, activation='sigmoid'))

    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
  
def simple():
    """Build a single-layer LSTM classifier fed with raw word ids.

    Side effect: rebinds the module-level ``X`` / ``X_t`` to ``X_alt_train`` /
    ``X_alt_test``, the arrays with a trailing feature axis that match the
    ``(max_review_length, 1)`` input shape declared here.

    Returns:
        A compiled ``Sequential`` model: LSTM(128) followed by a sigmoid unit.
    """
    global X, X_t
    X, X_t = X_alt_train, X_alt_test

    model = Sequential([
        LSTM(128, input_shape=(max_review_length, 1)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
  
def simple_cuDNN():
    """Build the same single-layer recurrent classifier using ``CuDNNLSTM``.

    Side effect: rebinds the module-level ``X`` / ``X_t`` to ``X_alt_train`` /
    ``X_alt_test``, the arrays with a trailing feature axis that match the
    ``(max_review_length, 1)`` input shape declared here.

    Returns:
        A compiled ``Sequential`` model: CuDNNLSTM(128) followed by a sigmoid unit.
    """
    global X, X_t
    X, X_t = X_alt_train, X_alt_test

    model = Sequential([
        CuDNNLSTM(128, input_shape=(max_review_length, 1)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
  
def stacked():
    """Build a two-layer ``CuDNNLSTM`` classifier fed with raw word ids.

    Side effect: rebinds the module-level ``X`` / ``X_t`` to ``X_alt_train`` /
    ``X_alt_test``, the arrays with a trailing feature axis that match the
    ``(max_review_length, 1)`` input shape declared here.

    Returns:
        A compiled ``Sequential`` model: two CuDNNLSTM(128) layers and a
        sigmoid unit.
    """
    global X, X_t
    X, X_t = X_alt_train, X_alt_test

    model = Sequential([
        # The first layer emits its full output sequence so the second
        # recurrent layer has a sequence to consume.
        CuDNNLSTM(128, input_shape=(max_review_length, 1), return_sequences=True),
        CuDNNLSTM(128),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Size of the vector learned for each word id by the Embedding layers below
embedding_vector_length = 32


def embedding():
    """Build a word-embedding + ``CuDNNLSTM`` classifier.

    Side effect: rebinds the module-level ``X`` / ``X_t`` to ``X_train`` /
    ``X_test``; the Embedding layer consumes the padded word ids directly.

    Returns:
        A compiled ``Sequential`` model: Embedding over ``top_words`` ids with
        ``embedding_vector_length`` dimensions, CuDNNLSTM(128), and a sigmoid
        unit.
    """
    global X, X_t
    X, X_t = X_train, X_test

    model = Sequential([
        Embedding(top_words, embedding_vector_length, input_length=max_review_length),
        CuDNNLSTM(128),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
  
def embedding_stacked():
    """Build a word-embedding + ``CuDNNLSTM`` classifier with an extra Dense layer.

    Side effect: rebinds the module-level ``X`` / ``X_t`` to ``X_train`` /
    ``X_test``; the Embedding layer consumes the padded word ids directly.

    Returns:
        A compiled ``Sequential`` model: Embedding, CuDNNLSTM(128) returning
        only its final output, Dense(128) ReLU, and a sigmoid unit.
    """
    global X, X_t
    X, X_t = X_train, X_test

    model = Sequential([
        Embedding(top_words, embedding_vector_length, input_length=max_review_length),
        CuDNNLSTM(128, return_sequences=False),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [15]:
# Build the model; the builder also rebinds the globals X / X_t to the input
# arrays that match this architecture.
model = simple_cuDNN()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_1 (CuDNNLSTM)     (None, 128)               67072     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 67,201
Trainable params: 67,201
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
# Train for 5 epochs with mini-batches of 128. X is whichever training array
# the model-builder function called above assigned to the global.
model.fit(X, y_train, epochs=5, batch_size=128)

Epoch 1/5

KeyboardInterrupt: ignored

In [0]:
# Final evaluation of the model on the test split
scores = model.evaluate(X_t, y_test, batch_size=32, verbose=1)
# scores[1] is the accuracy metric requested at compile time (scores[0] is the loss)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [0]:
# Show the predictions for test reviews start_index..end_index (inclusive)
# next to their true labels and the decoded review text.
start_index = 0
end_index = 10
predict = model.predict(X_t[start_index:end_index+1])

for i, p in enumerate(predict):
    # enumerate() counts from 0 within the slice; shift by start_index so the
    # label and the text always belong to the review that was predicted.
    sample_index = start_index + i
    score = p[0]
    label = y_test[sample_index]
    # A score of exactly 0.5 matches neither branch and is reported as wrong.
    if (score < 0.5 and label == 0) or (score > 0.5 and label == 1):
        correct = '맞음'
    else:
        correct = '틀림'
    print("%d번 데이터 (%s) - 예측: %.3f / 실제: %d" % (sample_index, correct, score, label))
    # Skip the <PAD> (0) and <START> (1) ids when rebuilding the text
    print(' '.join(id_to_word[word_id] for word_id in X_original_test[sample_index] if word_id != 0 and word_id != 1))
    print('')