In [None]:
!pip install konlpy
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import json
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
%matplotlib inline

# Colab-specific setup: mount Google Drive into the runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')
# Base directory for the dataset files. Later cells build file paths by appending
# a file name directly to this string, so the trailing slash is required.
DATA_Path = '/content/drive/My Drive/Colab Notebooks/dataset/Korean movies/'

In [None]:
# Load the training split: tab-separated text whose first row is the header.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside reviews stay literal text.
train_data = pd.read_csv(
    os.path.join(DATA_Path, 'ratings_train.txt'),
    sep='\t',
    header=0,
    quoting=3,
)
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [None]:
# Character count of every review, measured after casting the column to str.
train_length = train_data['document'].astype(str).map(len)
print('길이 중간값 :', np.median(train_length))

In [None]:
# Word usage (plan: remove stopwords at the word level, then train an RNN).
# Keep only the entries that are real strings.
train_review = []
for review in train_data['document']:
    if type(review) is str:
        train_review.append(review)

# Number of single-space-separated pieces per review, after casting to str.
train_word_count = (
    train_data['document']
    .astype(str)
    .map(lambda text: len(text.split(' ')))
)
print('단어 사용량 중간값 :', np.median(train_word_count))

In [None]:
# Keep only Hangul (syllables and standalone jamo) and whitespace from the first review.
first_review_hangul = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", '', train_data.loc[0, 'document'])

# Split the cleaned text into morphemes; stem=True asks Okt to stem the tokens.
okt = Okt()
review_text = okt.morphs(first_review_hangul, stem=True)

In [None]:
# Tokens to discard from the morpheme list.
stopwords = {
    '은', '는', '이', '가', '을', '를', '하', '아', '것',
    '들', '의', '있', '되', '수', '보', '주', '등', '한',
}

# Filter the sample review's tokens against the stopword set.
clean_review = [morph for morph in review_text if morph not in stopwords]
print(clean_review)

In [None]:
# Per-review preprocessing: keep only the Hangul content, split it into
# morphemes, and optionally drop stopwords.
def preprocessing(review, okt, using_stopwords=False, stop_words=None):
    """Clean one raw review string and return its morpheme tokens.

    Args:
        review: Raw review text (str).
        okt: Tokenizer object exposing ``morphs(text, stem=...)``.
        using_stopwords: When True, tokens found in ``stop_words`` are removed.
        stop_words: Iterable of tokens to remove; ``None`` means no stopwords.

    Returns:
        list of token strings. When ``using_stopwords`` is False the full
        token list is returned unchanged.
    """
    # Remove (rather than replace with a comma) every character that is not a
    # Hangul syllable, standalone jamo or whitespace, so no punctuation tokens
    # are injected into the tokenizer input.
    review_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", '', review)

    word_review = okt.morphs(review_text, stem=True)
    if not using_stopwords:
        return word_review

    # Use the caller-supplied stopwords (not a notebook global); a set gives
    # constant-time membership checks.
    excluded = set(stop_words) if stop_words else set()
    return [token for token in word_review if token not in excluded]

In [None]:
# Pre-process every training review. Non-string entries become an empty token
# list so the result keeps one item per row of train_data.
clean_review_data = [
    preprocessing(review, okt, True, stop_words=stopwords) if type(review) is str else []
    for review in train_data['document']
]

In [None]:
# Load the test split and pre-process its reviews the same way as the training reviews.
test_data = pd.read_csv(
    os.path.join(DATA_Path, 'ratings_test.txt'),
    sep='\t',
    header=0,
    quoting=3,
)

# Non-string entries become an empty token list so there is one item per row.
clean_test_review = [
    preprocessing(review, okt, True, stop_words=stopwords) if type(review) is str else []
    for review in test_data['document']
]
# Tokenize the data, then bring every sequence to the same length with post-padding.
# The vocabulary is fitted on the training reviews only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_review_data)
word_vocab = tokenizer.word_index

# Fixed sequence length used for both splits.
MAX_LEN = 15

# Training split: token lists -> integer id sequences -> padded matrix, plus labels.
train_seq = tokenizer.texts_to_sequences(clean_review_data)
train_inputs = pad_sequences(train_seq, maxlen=MAX_LEN, padding='post')
train_labels = np.array(train_data['label'])

# Test split, encoded with the vocabulary learned from the training reviews.
test_seq = tokenizer.texts_to_sequences(clean_test_review)
test_inputs = pad_sequences(test_seq, maxlen=MAX_LEN, padding='post')
test_label = np.array(test_data['label'])

In [None]:
# Save the prepared arrays and the vocabulary configuration.
# np.save is given the path itself so it opens and closes each file on its own;
# the names already end in '.npy', so no extension is appended.
np.save(DATA_Path + 'train_input.npy', train_inputs)
np.save(DATA_Path + 'train_label.npy', train_labels)
np.save(DATA_Path + 'test_input.npy', test_inputs)
np.save(DATA_Path + 'test_label.npy', test_label)

data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab) + 1,
}

# ensure_ascii=False writes the Korean tokens as-is, so the file is opened with
# an explicit UTF-8 encoding and closed deterministically by the with-block.
with open(DATA_Path + 'data_configs.json', 'w', encoding='utf-8') as config_file:
    json.dump(data_configs, config_file, ensure_ascii=False)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# RNN model: embedding -> two stacked LSTMs -> dense head with a sigmoid output.
model = Sequential([
    Embedding(data_configs['vocab_size'], output_dim=128),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True),
    LSTM(256),
    Dropout(0.3),
    Dense(128, activation='relu', kernel_initializer='glorot_normal'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['acc'],
)
model.summary()

In [None]:
# Train for 8 epochs in batches of 512, using 30% of the training data for validation.
history = model.fit(
    x=train_inputs,
    y=train_labels,
    batch_size=512,
    epochs=8,
    validation_split=0.3,
)

In [None]:
# Evaluate the trained model on the test split.
# NOTE(review): the name `eval` shadows the Python builtin; kept because later cells may read it.
eval = model.evaluate(
    x=test_inputs,
    y=test_label,
)

