In [None]:
#biLSTM 기반 감성 분류기

In [None]:
!pip install konlpy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from collections import Counter
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Download the Steam review sentiment corpus and store it locally as steam.txt.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/steam.txt",
    filename="steam.txt",
)


In [None]:
# Read the tab-separated corpus; column names are supplied explicitly
# (first column is the label, second column is the review text).
total_data = pd.read_table(
    'steam.txt',
    names=['label', 'reviews'],
    encoding='utf-8',
)
print('전체 리뷰 개수 :', total_data.shape[0])

In [None]:
# Display the loaded DataFrame as the cell output.
total_data

In [None]:
# Number of distinct values in each column. A cell only renders its last
# expression, so both counts are computed in a single call to show them together.
total_data[['reviews', 'label']].nunique()

In [None]:
# Keep the first occurrence of every distinct review text. Reassigning instead
# of mutating in place avoids modifying a frame that earlier cells displayed.
total_data = total_data.drop_duplicates(subset=['reviews'])

In [None]:
# Report how many rows remain in total_data.
print('총 샘플의 수 :',len(total_data))


In [None]:
# Prints True if any cell in total_data is null, otherwise False.
print(total_data.isnull().values.any())

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(total_data, test_size=0.25, random_state=42)

# Report the size of each partition.
for caption, frame in (
    ('훈련용 리뷰의 개수 :', train_data),
    ('테스트용 리뷰의 개수 :', test_data),
):
    print(caption, len(frame))

In [None]:
# Bar chart of how many training reviews carry each label.
train_data['label'].value_counts().plot.bar()


In [None]:
# Work on an independent copy so the column assignment below does not write
# into a slice of total_data (avoids SettingWithCopyWarning).
train_data = train_data.copy()

# Strip every character that is not a Hangul jamo, a Hangul syllable or a space.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the reviews unchanged.
train_data['reviews'] = train_data['reviews'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

In [None]:
# Turn reviews that became empty strings into NaN. The result is assigned back
# to the column: an in-place replace on a column selection is chained
# assignment, which warns on recent pandas and has no effect under Copy-on-Write.
train_data['reviews'] = train_data['reviews'].replace('', np.nan)


In [None]:
# Per-column count of missing values in the training frame.
print(train_data.isnull().sum())

# Drop rows with missing values, mirroring what is done for the test set;
# the tokenizer applied later cannot process NaN reviews.
train_data = train_data.dropna(how='any')
print('전처리 후 훈련용 샘플의 개수 :', len(train_data))


In [None]:
# Remove duplicate reviews; .copy() detaches the frame from total_data so the
# column assignments below do not write into a slice.
test_data = test_data.drop_duplicates(subset=['reviews']).copy()

# Keep only Hangul characters and spaces. regex=True is required because
# pandas >= 2.0 treats the pattern as a literal string by default.
test_data['reviews'] = test_data['reviews'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

# Empty reviews become NaN (assigned back rather than a chained in-place replace).
test_data['reviews'] = test_data['reviews'].replace('', np.nan)

# Drop every row that contains a missing value.
test_data = test_data.dropna(how='any')
print('전처리 후 테스트용 샘플의 개수 :',len(test_data))

In [None]:
# Tokens removed after morphological analysis (particles, endings and the
# domain words '게임'/'겜'). The duplicated '듯' entry has been removed.
stopwords = [
    '도', '는', '다', '의', '가', '이', '은', '한', '에', '하',
    '고', '을', '를', '인', '듯', '과', '와', '네', '들', '지',
    '임', '게', '만', '게임', '겜', '되', '음', '면',
]

In [None]:
# Instantiate the KoNLPy Okt analyzer; its morphs() method is used for tokenization below.
okt=Okt()

In [None]:
def tokenize_reviews(reviews):
    """Split each review into morphemes with Okt and drop stopwords.

    Args:
        reviews: pandas Series of cleaned review strings.

    Returns:
        A Series, aligned with ``reviews``, whose values are lists of tokens.
    """
    # Set membership keeps the per-token stopword check constant-time.
    stopword_set = set(stopwords)
    return reviews.apply(
        lambda text: [token for token in okt.morphs(text) if token not in stopword_set]
    )


# Apply the same tokenization to both partitions.
train_data['tokenized'] = tokenize_reviews(train_data['reviews'])
test_data['tokenized'] = tokenize_reviews(test_data['reviews'])

In [None]:
# Show the first rows of the training frame, including the new 'tokenized' column.
train_data.head()

In [None]:
# Flatten the token lists of each class into one long array of words
# (label 0 -> negative_words, label 1 -> positive_words).
is_negative = train_data['label'] == 0
is_positive = train_data['label'] == 1
negative_words = np.hstack(train_data.loc[is_negative, 'tokenized'].values)
positive_words = np.hstack(train_data.loc[is_positive, 'tokenized'].values)

In [None]:
#negative_words = np.hstack(train_data[train_data.label == 0]['tokenized'].values)

In [None]:
# Count token occurrences across the label-0 reviews and print the 20 most frequent.
negative_word_count = Counter(negative_words)
print(negative_word_count.most_common(20))

In [None]:
# Count token occurrences across the label-1 reviews and print the 20 most frequent.
positive_word_count = Counter(positive_words)
print(positive_word_count.most_common(20))

In [None]:
# Side-by-side histograms of tokens per review for each class, plus the mean length.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# (axes, label value, bar colour, panel title, caption printed with the mean)
panels = (
    (ax1, 1, 'red', 'Positive Reviews', '긍정 리뷰의 평균 길이 :'),
    (ax2, 0, 'blue', 'Negative Reviews', '부정 리뷰의 평균 길이 :'),
)
for axis, label_value, bar_color, panel_title, caption in panels:
    text_len = train_data.loc[train_data['label'] == label_value, 'tokenized'].map(len)
    axis.hist(text_len, color=bar_color)
    axis.set_title(panel_title)
    axis.set_xlabel('length of samples')
    axis.set_ylabel('number of samples')
    print(caption, np.mean(text_len))

fig.suptitle('Words in texts')
plt.show()

In [None]:
# Model inputs for training: the array of token lists from the 'tokenized' column.
X_train = train_data['tokenized'].values

In [None]:
# Training targets: the 'label' column as an array.
y_train = train_data['label'].values


In [None]:
# Test inputs (token lists) and targets (labels), taken from the same columns as for training.
X_test= test_data['tokenized'].values
y_test = test_data['label'].values

In [None]:
# Fit an unrestricted tokenizer on the training tokens only, to collect word
# frequencies for the vocabulary-size analysis that follows.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
total_cnt = len(tokenizer.word_index) # number of distinct words seen by the tokenizer

In [None]:
# Words occurring fewer than `threshold` times are treated as rare below.
threshold = 2

In [None]:
rare_cnt = 0 # number of words whose frequency is below threshold
total_freq = 0 # sum of the frequencies of all words in the training data
rare_freq = 0 # sum of the frequencies of the words that are below threshold

In [None]:
#tokenizer.word_counts
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value
    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1 #희귀 단어 카운트
        rare_freq = rare_freq + value         #희귀 단어 빈도수 누적

In [None]:
# Summarise vocabulary size and how much of it (and of all occurrences) is rare.
print('단어 집합(vocabulary)의 크기 :', total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s' % (threshold - 1, rare_cnt))

rare_vocab_pct = (rare_cnt / total_cnt) * 100
print("단어 집합에서 희귀 단어의 비율:", rare_vocab_pct)

rare_freq_pct = (rare_freq / total_freq) * 100
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", rare_freq_pct)

In [None]:
# Vocabulary size = non-rare words plus two reserved indices (see note below).
vocab_size=total_cnt-rare_cnt+2
print('단어 집합의 크기 :',vocab_size)
# Index 0 is used for padding and index 1 for OOV; real words start at index 2.

In [None]:
# Tokenizer restricted to vocab_size indices; words outside it map to the 'OOV' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')

In [None]:
# Build the word index of the restricted tokenizer from the training tokens.
tokenizer.fit_on_texts(X_train)

In [None]:
# Preview the first ten index -> word entries instead of dumping the whole
# vocabulary dictionary into the cell output.
dict(list(tokenizer.index_word.items())[:10])

In [None]:
# Inspect the second training sample (still a list of tokens at this point).
X_train[1]

In [None]:
# Preview the integer encoding on the first three samples only; encoding and
# printing the entire training set here is slow and floods the output.
tokenizer.texts_to_sequences(X_train[:3])

In [None]:
# Integer-encode the token lists. Encoding from the 'tokenized' columns (the
# same data X_train / X_test were taken from) keeps this cell idempotent:
# re-running it never feeds already-encoded integers back into the tokenizer.
X_train = tokenizer.texts_to_sequences(train_data['tokenized'].values)
X_test = tokenizer.texts_to_sequences(test_data['tokenized'].values)

In [None]:
# Longest and average encoded review length in the training set.
review_lengths = [len(sequence) for sequence in X_train]
print('리뷰의 최대 길이 :', max(review_lengths))
print('리뷰의 평균 길이 :', sum(review_lengths) / len(review_lengths))


In [None]:
# Number of training samples.
len(X_train)

In [None]:
# Length of the last encoded training review. A negative index replaces the
# hard-coded position 74918, which raises IndexError whenever the number of
# training samples differs from the run it was written for.
# NOTE(review): assumes 74918 was meant to address the final sample — confirm.
len(X_train[-1])

In [None]:
# map() is lazy, so the bare expression only shows a map object; materialise
# the lengths of the first ten samples to make the output informative.
list(map(len, X_train))[:10]

In [None]:
# Total number of tokens across all encoded training reviews.
sum(map(len, X_train))

In [None]:
# Distribution of encoded review lengths in the training set.
sequence_lengths = list(map(len, X_train))
plt.hist(sequence_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
    """Print the percentage of sequences whose length does not exceed ``max_len``.

    Args:
        max_len: Maximum length (inclusive) a sequence may have to be counted.
        nested_list: Collection of sized sequences to inspect.
    """
    within_limit = sum(1 for sequence in nested_list if len(sequence) <= max_len)
    percentage = (within_limit / len(nested_list)) * 100
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, percentage))

In [None]:
# Sequence length used for padding below; print what share of the training
# samples are at most this long.
max_len = 60
below_threshold_len(max_len, X_train)

In [None]:
# Bring every encoded review in both sets to length max_len with pad_sequences.
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

In [None]:
# Inspect the second training sample after padding.
X_train[1]

In [None]:
import re
from keras.layers import Embedding, Dense, LSTM, Bidirectional
from keras.models import Sequential
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Display the vocabulary size that is passed to the embedding layer below.
vocab_size

In [None]:
# Embedding -> bidirectional LSTM -> single sigmoid unit for binary sentiment.
model = Sequential()
for layer in (
    Embedding(vocab_size, 100),       # each of the vocab_size token ids -> 100-dim vector
    Bidirectional(LSTM(100)),         # LSTM with 100 units, wrapped to read both directions
    Dense(1, activation='sigmoid'),   # single sigmoid output
):
    model.add(layer)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Early stopping watches validation loss (lower is better) with a patience of 4 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)


In [None]:
# Checkpoint to best_model.h5, keeping only the model with the highest validation
# accuracy ('val_acc' pairs with the 'acc' metric name used at compile time).
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [None]:
# Compile for binary classification and train for up to 15 epochs; 20% of the
# training data is held out for validation, which feeds the es/mc callbacks.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=256, validation_split=0.2)

In [None]:
# Load the model stored in best_model.h5 (the path the checkpoint callback writes to).
loaded_model = load_model('best_model.h5')

In [None]:
# Evaluate the reloaded model on the held-out test set.
loaded_model.evaluate(X_test, y_test)

In [None]:
def sentiment_predict(new_sentence):
    """Classify one raw review and print the predicted sentiment with its confidence.

    The sentence goes through the same steps as the training data: non-Hangul
    characters are removed, the text is split into morphemes, stopwords are
    dropped, tokens are integer-encoded and the sequence is padded to ``max_len``.

    Args:
        new_sentence: Raw review text.

    Returns:
        None. The verdict is printed.
    """
    cleaned = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', new_sentence)
    tokens = okt.morphs(cleaned)  # tokenize
    tokens = [word for word in tokens if word not in stopwords]  # remove stopwords
    encoded = tokenizer.texts_to_sequences([tokens])  # integer encoding
    pad_new = pad_sequences(encoded, maxlen = max_len)  # padding
    # predict() returns an array of shape (1, 1); index the single element
    # explicitly, because float() on a non-0-d array is deprecated in NumPy >= 1.25.
    score = float(loaded_model.predict(pad_new)[0][0])
    if score > 0.5:
        print("{:.2f}% 확률로 긍정 리뷰입니다.".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.".format((1 - score) * 100))

In [None]:
# Try the classifier on a sample sentence.
sentiment_predict('재미없어~ 하지만 다시 볼꺼야')
