<a href="https://colab.research.google.com/github/changyong93/Natural-language-processing-with-chat-bot/blob/main/%EB%94%A5%EB%9F%AC%EB%8B%9D%EC%9D%84_%EC%9D%B4%EC%9A%A9%ED%95%9C_%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC_%EC%9E%85%EB%AC%B8(11_8_RNN%EC%9D%84_%EC%9D%B4%EC%9A%A9%ED%95%9C_%ED%85%8D%EC%8A%A4%ED%8A%B8%EB%B6%84%EB%A5%98_BiLSTM%EC%9C%BC%EB%A1%9C_%ED%95%9C%EA%B5%AD%EC%96%B4_%EC%8A%A4%ED%8C%80_%EB%A6%AC%EB%B7%B0_%EA%B0%90%EC%84%B1_%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BiLSTM으로 한국어 스팀 리뷰 감성 분류하기
- 다운로드 링크 : https://github.com/bab2min/corpus/tree/master/sentiment

## 스팀 리뷰 데이터에 대한 이해와 전처리

In [None]:
# Install Mecab on Colab.
# Clone the helper repository, change into it, and run its install script.
# NOTE: %cd leaves the working directory inside the cloned repository, so
# relative paths used afterwards resolve there.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

# Install konlpy, the package the Mecab class is imported from below.
!pip install konlpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from collections import Counter
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
# Download the Steam review corpus and save it as steam.txt in the current
# working directory.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/steam.txt",
    filename="steam.txt",
)

In [None]:
# Load the tab-separated corpus; the column names are supplied explicitly.
total_data = pd.read_csv("steam.txt", sep="\t", names=["label", "reviews"])

In [None]:
# Report how many rows were loaded.
print("전체 리뷰 개수: ", total_data.shape[0])

In [None]:
# Preview the first five rows.
total_data.head(n=5)

In [None]:
# Number of distinct values in each column.
total_data.nunique(axis=0)

In [None]:
# Remove duplicated reviews: keep the first occurrence of each review text and
# take a copy so later column assignments act on an independent frame.
total_data = total_data[~total_data.duplicated(subset=["reviews"])].copy()
print("총 샘플 개수: ", len(total_data))

In [None]:
# Count missing values per column.
total_data.isna().sum()

### 훈련셋 테스트셋 나누기

In [None]:
# Hold out 25% of the reviews for testing; the fixed seed keeps the split
# reproducible.
train_data, test_data = train_test_split(
    total_data,
    test_size=0.25,
    random_state=42,
)
print("훈련 리뷰 개수: ", len(train_data))
print("테스트 리뷰 개수: ", len(test_data))

In [None]:
# Bar chart of how many training reviews carry each label.
train_data["label"].value_counts().plot(kind="bar")

In [None]:
# Same label counts as a table, with the count column named "count".
train_data["label"].value_counts().reset_index(name="count")

### 데이터 정제

In [None]:
# Keep only Hangul (jamo and syllables) and spaces; everything else is removed.
# regex=True is passed explicitly: without it, pandas 2.x treats the pattern as
# a literal string and nothing would be stripped.
train_data["reviews"] = train_data["reviews"].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
# Strip leading spaces so that reviews left with only whitespace become "".
# (Series.replace without regex=True only matches whole values, so the string
# accessor is used here.)
train_data["reviews"] = train_data["reviews"].str.replace(r"^ +", "", regex=True)
# Mark emptied reviews as missing and report how many there are per column.
train_data["reviews"] = train_data["reviews"].replace("", np.nan)
print(train_data.isnull().sum())
# Drop the missing reviews: the tokenizer cannot process NaN values.
train_data = train_data.dropna(subset=["reviews"])


In [None]:
# Apply the same cleaning to the test data.
# Keep only Hangul and spaces; regex=True is required for the pattern to be
# interpreted as a regular expression on pandas 2.x.
test_data["reviews"] = test_data["reviews"].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
# Strip leading spaces so whitespace-only reviews become "".
test_data["reviews"] = test_data["reviews"].str.replace(r"^ +", "", regex=True)
# Mark emptied reviews as missing and report how many there are per column.
test_data["reviews"] = test_data["reviews"].replace("", np.nan)
print(test_data.isnull().sum())
# Drop the missing reviews: the tokenizer cannot process NaN values.
test_data = test_data.dropna(subset=["reviews"])

In [None]:
# Stopwords: tokens discarded from every morpheme list after tokenization.
# Each entry appears once (the original list repeated '듯').
stopwords = [
    '도', '는', '다', '의', '가', '이', '은', '한', '에', '하', '고', '을', '를', '인', '듯',
    '과', '와', '네', '들', '지', '임', '게', '만', '게임', '겜', '되', '음', '면',
]

### 토큰화

In [None]:
# Split every review into morphemes with Mecab and discard stopwords in the
# same pass; the result is stored in a new "tokenized" column.
mecab = Mecab()

train_data["tokenized"] = train_data["reviews"].apply(
    lambda review: [morph for morph in mecab.morphs(review) if morph not in stopwords]
)
test_data["tokenized"] = test_data["reviews"].apply(
    lambda review: [morph for morph in mecab.morphs(review) if morph not in stopwords]
)

### 단어 길이와 분포

In [None]:
# Flatten the token lists of each class (label 0 and label 1) into one array.
negative_tokens = np.hstack(train_data[train_data["label"] == 0]["tokenized"].to_numpy())
positive_tokens = np.hstack(train_data[train_data["label"] == 1]["tokenized"].to_numpy())

In [None]:
# Token frequencies per class, then the 20 most frequent tokens of each,
# positive first, separated by a line of asterisks.
positive_tokens_count = Counter(positive_tokens)
negative_tokens_count = Counter(negative_tokens)

print(positive_tokens_count.most_common(n=20))
print("*" * 100)
print(negative_tokens_count.most_common(n=20))

In [None]:
# Length distribution: side-by-side histograms of token counts per review,
# label 1 on the left and label 0 on the right, with the mean length printed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

text_len = train_data.loc[train_data["label"] == 1, "tokenized"].apply(len)
ax1.hist(x=text_len, color="red")
ax1.set(
    xlabel="length of samples",
    ylabel="number of samples",
    title="positive Reviews",
)
print("긍정 리뷰의 평균 길이: ", np.mean(text_len))

text_len = train_data.loc[train_data["label"] == 0, "tokenized"].apply(len)
ax2.hist(x=text_len, color="blue")
ax2.set(
    xlabel="length of samples",
    ylabel="number of samples",
    title="negative Reviews",
)
print("부정 리뷰의 평균 길이: ", np.mean(text_len))

plt.show()

In [None]:
# Pull the token lists (features) and labels (targets) out of each frame as
# NumPy arrays.
X_train, y_train = train_data["tokenized"].to_numpy(), train_data["label"].to_numpy()
X_test, y_test = test_data["tokenized"].to_numpy(), test_data["label"].to_numpy()

In [None]:
# 정수인코딩
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Words that occur fewer than `threshold` times are treated as rare.
threshold = 2

# Size of the full vocabulary.
total_cnt = len(tokenizer.word_index)
# Sum of the occurrence counts of all words.
total_freq = sum(tokenizer.word_counts.values())
# Number of rare words.
rare_cnt = sum(1 for count in tokenizer.word_counts.values() if count < threshold)
# Sum of the occurrence counts of the rare words.
rare_freq = sum(count for count in tokenizer.word_counts.values() if count < threshold)

print("전체 등장 집합(vocabulary) 크기: ", total_cnt)
print(f"전체 단어 중 희귀 단어의 개수: {rare_cnt}")
print(f"전체 단어 중 희귀 단어의 비율 {rare_cnt / total_cnt * 100:.3f}%")
print(f"전체 등장 빈도수 중 희귀 단어 등장 빈도수 비율 {rare_freq / total_freq * 100:.3f}%")

In [None]:
# Vocabulary size after excluding the rare words counted in rare_cnt (words
# occurring fewer than `threshold` times); those words are meant to be handled
# as OOV.
# The +2 accounts for the padding index 0 and the OOV token.
vocab_size = total_cnt - rare_cnt +2
print("단어 집합의 크기: ", vocab_size)

In [None]:
# Rebuild the tokenizer limited to vocab_size entries, with "OOV" as the
# out-of-vocabulary token, and fit it on the training tokens only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="OOV")
tokenizer.fit_on_texts(texts=X_train)

# Convert both splits from token lists to integer sequences.
X_train = tokenizer.texts_to_sequences(texts=X_train)
X_test = tokenizer.texts_to_sequences(texts=X_test)

In [None]:
# Show the first three encoded samples of each split, separated by a line of
# asterisks.
print(X_train[:3], "*" * 100, X_test[:3], sep="\n")

### 패딩

In [None]:
# Longest and mean encoded review length, then a 50-bin histogram of lengths.
print("리뷰의 최대 길이: ", max(len(s) for s in X_train))
print("리뷰의 평균 길이: ", np.mean(list(map(len, X_train))))

plt.hist(list(map(len, X_train)), bins=50)
plt.xlabel("length of samples")
plt.ylabel("number of samples")
plt.show()

In [None]:
# For every candidate length, print the share of training samples whose
# encoded length does not exceed it (used to choose the padding length).
# The lengths are computed once instead of being rebuilt on every iteration.
lengths = np.array([len(s) for s in X_train])
max_len = int(lengths.max())
for idx in range(1, max_len + 1):
  cnt = int((lengths <= idx).sum())
  # The original message started with a stray literal "f"; it is removed here.
  print(f"전체 샘플 중 길이가 {idx}이하인 샘플의 비율 {cnt/len(X_train)*100:.3f}%")

In [None]:
# Pad (or truncate) every encoded review to a fixed length of 60.
max_len = 60
X_train = pad_sequences(sequences=X_train, maxlen=max_len)
X_test = pad_sequences(sequences=X_test, maxlen=max_len)

## BiLSTM으로 스팀 리뷰 감성 분류

In [None]:
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense,LSTM, Bidirectional, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Embedding -> bidirectional LSTM (final state only) -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100),
    Bidirectional(layer=LSTM(units=100, return_sequences=False)),
    Dense(units=1, activation="sigmoid"),
])
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])

# Stop once val_loss has not improved for 4 epochs.
es = EarlyStopping(monitor="val_loss", patience=4, verbose=1, mode="min")
# Write base_model.h5 only when val_acc improves.
mc = ModelCheckpoint(
    filepath="base_model.h5",
    monitor="val_acc",
    verbose=1,
    save_best_only=True,
    mode="max",
)

# Train with 20% of the training data held out for validation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=15,
    validation_split=0.2,
    callbacks=[es, mc],
)

In [None]:
# Reload the checkpoint written during training and evaluate it on the test set.
loaded_model = load_model("base_model.h5")
# evaluate() returns [loss, acc]; acc is a 0-1 fraction, so scale it by 100
# before printing it with a percent sign.
test_loss, test_acc = loaded_model.evaluate(x=X_test, y=y_test)
print(f"테스트 정확도: {test_acc * 100:.3f}%")

## 리뷰 예측해보기

In [None]:
def sentiment_predict(new_sentence):
  """Print the predicted sentiment of a single review string.

  The sentence is cleaned with the same character class used for the training
  data (Hangul plus spaces), split into morphemes with ``mecab``, filtered
  against ``stopwords``, integer-encoded with ``tokenizer``, padded to
  ``max_len`` and scored by ``loaded_model``.

  Args:
    new_sentence: Raw review text.

  Returns:
    None. A message with the predicted class and its probability is printed.
  """
  # Keep spaces in the allowed set: dropping them would glue all words
  # together before tokenization, unlike the training-data cleaning.
  new_sentence = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", new_sentence)
  new_sentence = mecab.morphs(new_sentence)
  new_sentence = [word for word in new_sentence if word not in stopwords]
  encoded = tokenizer.texts_to_sequences([new_sentence])
  pad_new = pad_sequences(sequences = encoded, maxlen = max_len)
  # predict() returns a 2-D array for the single padded sequence; index the
  # one element explicitly because float() on a non-0-d array is deprecated
  # in NumPy.
  score = float(loaded_model.predict(pad_new)[0][0])
  if score > 0.5:
    print(f"{score * 100:.3f}% 확률로 긍정 리뷰입니다.")
  else:
    print(f"{(1-score) * 100:.3f}% 확률로 부정 리뷰입니다.")

In [None]:
# Try a review that reads as negative (roughly: "No fun .. totally not fun, tsk tsk").
sentiment_predict('노잼 ..완전 재미 없음 ㅉㅉ')

In [None]:
# Try a review that reads as positive (roughly: "A bit hard but fun, lol").
sentiment_predict('조금 어렵지만 재밌음ㅋㅋ')

In [None]:
# Try a review that reads as positive (roughly: "I like it because the characters are pretty").
sentiment_predict('케릭터가 예뻐서 좋아요')