<a href="https://colab.research.google.com/github/changyong93/Natural-language-processing-with-chat-bot/blob/main/%EB%94%A5%EB%9F%AC%EB%8B%9D%EC%9D%84_%EC%9D%B4%EC%9A%A9%ED%95%9C_%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC_%EC%9E%85%EB%AC%B8(11_4_RNN%EC%9D%84_%EC%9D%B4%EC%9A%A9%ED%95%9C_%ED%85%8D%EC%8A%A4%ED%8A%B8%EB%B6%84%EB%A5%98_IMDB_%EB%A6%AC%EB%B7%B0_%EA%B0%90%EC%84%B1_%EB%B6%84%EB%A5%98).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  IMDB 리뷰 감성 분류하기(IMDB Movie Review Sentiment Analysis)

## IMDB 리뷰 데이터에 대한 이해

In [None]:
from tensorflow.keras.datasets import imdb
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# The IMDB dataset is distributed already split 50/50 into train and test
# sets, so load_data() offers no parameter for adjusting the test-set ratio.
(X_train, y_train), (X_test, y_test) = imdb.load_data()

In [None]:
# Report the number of reviews in each split and the distinct label values.
print("훈련용 리뷰 개수: ", len(X_train))
print("테스트용 리뷰 개수: ", len(X_test))
print("카테고리: ", np.unique(y_train))

In [None]:
# Show the first integer-encoded review of each split, separated by a rule of
# 100 asterisks. A single print with a newline separator emits the same three
# lines as three consecutive print calls.
print(X_train[0], "*" * 100, X_test[0], sep="\n")

In [None]:
# Number of tokens in every training review.
len_result = list(map(len, X_train))
print("리뷰의 최대 길이: ", max(len_result))
print("리뷰의 평균 길이: ", np.mean(len_result))

# Left panel: box plot of the review lengths.
# Right panel: histogram of the same lengths using 50 bins.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.boxplot(len_result)
plt.subplot(1, 2, 2)
plt.hist(len_result, bins=50)
plt.show()

In [None]:
# Count how many training reviews carry each label.
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("각 레이블에 대한 빈도수")
# Final expression of the cell: row 0 holds the labels, row 1 their counts.
np.asarray((unique_elements, counts_elements))

In [None]:
word_to_index = imdb.get_word_index()
# Invert the word -> rank mapping. By the IMDB dataset's convention the integer
# that actually appears in the reviews is the stored rank plus 3.
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}

In [None]:
# The IMDB review dataset treats ids 0, 1, 2 and 3 as special tokens, so id 4
# is the most frequent real English word in the corpus.
print("빈도수 상위 1등 단어:", index_to_word[4])
print("빈도수 상위 3938등 단어:", index_to_word[3941])

In [None]:
# Register the special tokens under ids 0, 1 and 2 (list position == id).
index_to_word.update(enumerate(["<pad>", "<sos>", "<unk>"]))

In [None]:
# Decode the first training review from token ids back into words.
print(' '.join(index_to_word[token_id] for token_id in X_train[0]))

## GRU로 IMDB 리뷰 감성 분류하기

In [None]:
import re #정규표현식
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [None]:
# Restrict the vocabulary to 10,000 entries.
vocab_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [None]:
# Limit every review to at most 500 tokens; shorter reviews are padded.
max_len = 500
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)


In [None]:
# Binary sentiment classifier: token ids -> 100-dim embeddings -> GRU with
# 128 units -> a single sigmoid output.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100),
    GRU(units=128),
    Dense(units=1, activation='sigmoid'),
])

In [None]:
# Stop training once the validation loss has not improved for 4 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=1)
# Write GRU_model.h5 only when the validation accuracy reaches a new best.
mc = ModelCheckpoint(
    'GRU_model.h5',
    monitor='val_acc',
    mode='max',
    verbose=1,
    save_best_only=True,
)

In [None]:
# The metric is registered under the name 'acc', which is what makes the
# 'val_acc' key monitored by the checkpoint callback available.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# Hold out 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=60,
    epochs=15,
    validation_split=0.2,
    callbacks=[es, mc],
)

In [None]:
# Load the saved checkpoint; the result is not bound to a name, so in a
# notebook this cell only displays the loaded model object.
load_model(filepath='GRU_model.h5')

In [None]:
# Reload the checkpoint written by ModelCheckpoint during training.
load_model_selected = load_model('GRU_model.h5')
# evaluate() returns [loss, acc] for this model; report the accuracy with four
# decimal places. The previous spec '%4.f' meant width 4 / precision 0 and
# therefore rounded the accuracy to a whole number.
print('\n 테스트 정확도: %.4f' % load_model_selected.evaluate(X_test, y_test)[1])

---
- IMDB 사이트의 영화 블랙팬서의 1점 리뷰 예측 테스트

In [None]:
def sentiment_predict(new_sentence):
  """Print the predicted sentiment of a raw English review.

  The text is stripped of everything except ASCII letters, digits and spaces,
  lower-cased, integer-encoded with the module-level ``word_to_index``,
  padded to ``max_len`` and scored by ``load_model_selected``.

  Args:
    new_sentence: Review text to classify.

  Returns:
    None. The verdict and its confidence are printed.
  """
  # Remove every character that is not a digit, an ASCII letter or a space,
  # then lower-case. The space inside the character class is deliberate: it
  # keeps the word boundaries that split() relies on below.
  new_sentence = re.sub(r'[^0-9a-zA-Z ]', '', new_sentence).lower()

  unk_id = 2      # id used for out-of-vocabulary words
  index_from = 3  # offset between get_word_index() ranks and dataset ids

  # Integer-encode the review.
  encoded = []
  for word in new_sentence.split():
    rank = word_to_index.get(word)
    token_id = unk_id if rank is None else rank + index_from
    # The training data was loaded with num_words=vocab_size and the Embedding
    # layer has input_dim=vocab_size, so only ids strictly below vocab_size
    # are valid; anything else is mapped to <unk>.
    if token_id >= vocab_size:
      token_id = unk_id
    encoded.append(token_id)

  pad_new = pad_sequences([encoded], maxlen=max_len)
  # predict() returns a (1, 1) array for a single review; index the scalar out
  # explicitly instead of relying on implicit array-to-float conversion.
  score = float(load_model_selected.predict(x=pad_new)[0][0])

  if score > 0.5:
    print(f"{score*100:.2f}% 확률로 긍정 리뷰입니다.")
  else:
    print(f"{(1 - score)*100:.2f}% 확률로 부정 리뷰입니다.")

In [None]:
# Sample low-rated review; adjacent literals are concatenated into one string.
temp_str = (
    "This movie was just way too overrated. "
    "The fighting was not professional and in slow motion. "
    "I was expecting more from a 200 million budget movie. "
    "The little sister of T.Challa was just trying too hard to be funny. "
    "The story was really dumb as well. "
    "Don't watch this movie if you are going because others say its great "
    "unless you are a Black Panther fan or Marvels fan."
)

sentiment_predict(temp_str)

---
- IMDB 사이트의 영화 어벤져스의 10점 리뷰 예측 테스트

In [None]:
# Sample top-rated review; adjacent literals are concatenated into one string
# (note the intentional leading space in the first fragment).
temp_str = (
    " I was lucky enough to be included in the group to see the advanced screening in Melbourne on the 15th of April, 2012. "
    "And, firstly, I need to say a big thank-you to Disney and Marvel Studios. "
    "Now, the film... how can I even begin to explain how I feel about this film? "
    "It is, as the title of this review says a 'comic book triumph'. "
    "I went into the film with very, very high expectations and I was not disappointed. "
    "Seeing Joss Whedon's direction and envisioning of the film come to life on the big screen is perfect. "
    "The script is amazingly detailed and laced with sharp wit a humor. "
    "The special effects are literally mind-blowing and the action scenes are both hard-hitting and beautifully choreographed."
)

sentiment_predict(temp_str)