In [None]:
!pip install soynlp
!pip install konlpy
!pip install glove_python

In [162]:
import pandas as pd
from konlpy.tag import Okt, Komoran
from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix
from glove import Glove
import re
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.metrics import classification_report

In [4]:
# Review dataset: keep the comment text and the star rating as separate series
reviews = pd.read_csv('drive/My Drive/datasets/review_final.csv')
content, star = reviews['댓글내용'], reviews['별점']


# Morphological analyzers
#   Komoran: comparatively faster on large inputs and gives a more detailed analysis
#   Okt:     comparatively more stable, but has no user-dictionary feature
komoran, okt = Komoran(), Okt()

# Komoran hit a bug at runtime -> Okt is used for now


In [108]:
# tokenizing
def tokenize(doc):
    """Convert one raw review into a list of 'surface/POS' tokens.

    The text is cleaned (stand-alone Hangul jamo and punctuation removed),
    tagged with Okt using normalisation and stemming, and only nouns, verbs,
    adverbs and adjectives are kept.

    Args:
        doc: Raw review text. Anything that is not a ``str`` (for example a
            missing value such as ``None`` or NaN) is treated as an empty
            review instead of crashing inside ``re.sub``.

    Returns:
        A list of strings such as ``'word/Noun'``; empty when nothing usable
        is left after cleaning.
    """
    if not isinstance(doc, str):
        return []

    # Remove stand-alone Hangul consonants / vowels (jamo)
    doc = re.sub(pattern='([ㄱ-ㅎㅏ-ㅣ]+)', repl='', string=doc)
    # Remove special characters. Raw string: '\w' and '\s' are not valid
    # string escapes and trigger a warning in a plain literal.
    doc = re.sub(pattern=r'[^\w\s]', repl='', string=doc)

    # Nothing left to analyse: skip the call into the analyzer entirely.
    if not doc.strip():
        return []

    # norm=True normalises the text, stem=True reduces words to their stem
    tagged = okt.pos(doc, norm=True, stem=True)

    # Keep nouns, verbs, adverbs and adjectives only
    kept_tags = ('Noun', 'Verb', 'Adverb', 'Adjective')
    return ['/'.join(pair) for pair in tagged if pair[1] in kept_tags]

In [117]:
# Word tokenization: one list of 'surface/POS' tokens per review
X_data = list(map(tokenize, content))

# Integer encoding (Tokenizer is created with num_words=10000)
encoder = Tokenizer(num_words=10000)
encoder.fit_on_texts(X_data)
X_encoded_data = encoder.texts_to_sequences(X_data)

# One-hot encode the star-rating labels
y_encoded_data = to_categorical(star)

In [138]:
def vectorize(sentence, maxlen=100):
    """Encode a single sentence as a padded integer sequence for the model.

    The sentence goes through the same ``tokenize`` function and the same
    fitted ``encoder`` that were used to build the training data.

    Args:
        sentence: Raw text to encode.
        maxlen: Length the sequence is padded / truncated to. Defaults to 100,
            the value that was previously hard-coded here; it should match
            the length used when padding the training data.

    Returns:
        The result of ``pad_sequences`` for a batch containing this one
        sentence.
    """
    # texts_to_sequences expects a batch, so wrap the single token list
    sequences = encoder.texts_to_sequences([tokenize(sentence)])
    return pad_sequences(sequences, maxlen=maxlen)

In [131]:
# Training data: first 60,000 reviews; test data: the remaining ~27,000
# NOTE(review): the split is positional with no shuffling. If the CSV is
# ordered (e.g. by rating or date), train and test will have different
# distributions. Confirm, or shuffle before splitting.
TRAIN_SIZE = 60000  # single source of truth for the split point

X_train = X_encoded_data[:TRAIN_SIZE]
X_test = X_encoded_data[TRAIN_SIZE:]
y_train = y_encoded_data[:TRAIN_SIZE]
y_test = y_encoded_data[TRAIN_SIZE:]

In [132]:
# Pad every review to the same length
max_len = 100
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train, X_test)
)

In [172]:
# GRU: a variant of LSTM with a simpler structure.

# Must agree with the num_words value given to the Tokenizer (10000)
vocab_size = 10000

model_gru = Sequential()
model_gru.add(Embedding(vocab_size, 100))
model_gru.add(GRU(128))
model_gru.add(Dense(16, activation='relu'))
# 6 output units, one per column of the one-hot labels
# NOTE(review): assumes to_categorical(star) yields 6 columns (ratings 0-5) - confirm
model_gru.add(Dense(6, activation='softmax'))

# Early stopping and checkpointing.
# restore_best_weights=True rolls the in-memory model back to the epoch with
# the best val_loss when training stops early. Without it, model_gru keeps the
# weights of the last (already degrading) epoch, and that is the model used
# for prediction and saved later in the notebook.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3,
                   restore_best_weights=True)
mc = ModelCheckpoint('GRU_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model_gru.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
history = model_gru.fit(X_train, y_train, epochs=10, callbacks=[es, mc], batch_size=100, validation_split=0.2)

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.74142, saving model to GRU_model.h5
Epoch 2/10
Epoch 00002: val_acc improved from 0.74142 to 0.75092, saving model to GRU_model.h5
Epoch 3/10
Epoch 00003: val_acc did not improve from 0.75092
Epoch 4/10
Epoch 00004: val_acc did not improve from 0.75092
Epoch 5/10
Epoch 00005: val_acc did not improve from 0.75092
Epoch 00005: early stopping


In [176]:
# Sequential.predict_classes() is deprecated and was removed in later
# TensorFlow releases; the argmax over the softmax output yields the same
# class index.
np.argmax(model_gru.predict(vectorize('평좋아서 샀는데 저는 너무 달고 느끼했어요')), axis=-1)

array([3])

In [177]:
# Save the in-memory model_gru to the datasets folder under 'drive/My Drive'
model_gru.save('drive/My Drive/datasets/gru_model.h5')

In [None]:
# Build the (word, context) co-occurrence matrix from the raw reviews,
# tokenized with the same tokenize() used for the classifier
x, idx2vocab = sent_to_word_contexts_matrix(
    content, windows=3, min_tf=10, tokenizer=tokenize,
    dynamic_weight=True, verbose=True,
)

# GloVe word embedding
glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
# fit() is given the matrix converted to COO format
glove.fit(x.tocoo(), epochs=5, no_threads=4, verbose=True)

# Attach the word -> row-index dictionary so words can be looked up by name
dictionary = dict(zip(idx2vocab, range(len(idx2vocab))))
glove.add_dictionary(dictionary)

Create (word, contexts) matrix
  - counting word frequency from 87410 sents, mem=0.909 Gb
  - scanning (word, context) pairs from 87410 sents, mem=1.043 Gb
  - (word, context) matrix was constructed. shape = (6006, 6006)                    
  - done
Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


In [None]:
def tokened(sentence):
    """Return the first 'surface/POS' token of ``sentence`` as a GloVe lookup key.

    The GloVe vocabulary is built by passing ``tokenize`` as the tokenizer, so
    lookup keys must be produced the same way (cleaned, normalised, stemmed,
    content words only). Tagging the raw text without those steps yields keys
    such as an inflected verb form that never appear in the vocabulary.

    Args:
        sentence: A word or short phrase.

    Returns:
        The first token produced by ``tokenize(sentence)``.

    Raises:
        IndexError: If ``sentence`` yields no token at all.
    """
    tokens = tokenize(sentence)
    if not tokens:
        raise IndexError('no usable token found in {!r}'.format(sentence))
    return tokens[0]

def get_most_similar(word, number=10):
    """Print the lookup key for ``word`` and its nearest neighbours in the GloVe model.

    Args:
        word: Raw text; converted to a 'surface/POS' key with ``tokened``.
        number: Forwarded to ``glove.most_similar``. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        None. The key and the neighbour list are printed.
    """
    # Tokenise once and reuse: every tokened() call runs the morphological analyzer.
    key = tokened(word)
    print(key)
    print(glove.most_similar(key, number=number))

In [None]:
# Similar-word check: query the trained GloVe model for the neighbours of '볶음'
glove.most_similar(tokened('볶음'), number=10)

[('오뎅/Noun', 0.8337399648625302),
 ('탕/Noun', 0.8073953231965109),
 ('샤브샤브/Noun', 0.7304645581988074),
 ('주꾸미/Noun', 0.7113417844249657),
 ('제육/Noun', 0.7101901277544247),
 ('된장/Noun', 0.7026016259594793),
 ('숙주/Noun', 0.701619828402634),
 ('샌드위치/Noun', 0.7007344901125827),
 ('찌게/Noun', 0.690310771903459)]

In [None]:
# Save the GloVe model to the datasets folder under 'drive/My Drive'
glove.save('drive/My Drive/datasets/glove_test.model')