In [1]:
import os
os.getcwd()
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
imdb_data = pd.read_csv("IMDB Dataset.csv")
print(imdb_data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
# pos, neg 값을 숫자로 변형(1, 0)
imdb_data['sentiment'] = imdb_data['sentiment'].replace('positive', 1)
imdb_data['sentiment'] = imdb_data['sentiment'].replace('negative', 0)


# 전처리---------------------------------------------------------------
# 1) 단어가 아니면 삭제
imdb_data['review'] = imdb_data['review'].str.replace("[^\w]|br", " ")
# 2) 공백만 있는 경우 null array로 변환
imdb_data['review'] = imdb_data['review'].replace('', np.nan)
imdb_data['sentiment'] = imdb_data['sentiment'].replace('', np.nan)
# 3) null인 행 제거
imdb_data = imdb_data.dropna(how='any', axis=0)
print("# preprocessing done")
#---------------------------------------------------------------------

review_train, review_test, y_train, y_test = train_test_split(imdb_data['review'], imdb_data['sentiment'], shuffle=False, random_state=34)
     # default : test_size = 0.25
print("# split done")
#---------------------------------------------------------------------

# 리뷰 문장 -> 토크나이즈(using split)
stopwords = ['a', 'an', 'the']

X_train = []
for stc in review_train:
    token = []
    words = stc.split()
    for word in words:
      if word not in stopwords:
        token.append(word)
    X_train.append(token)

X_test = []
for stc in review_test:
    token = []
    words = stc.split()
    for word in words:
      if word not in stopwords:
        token.append(word)
    X_test.append(token)
print("# tokenization done")
#-----------------------------------------------------------------------

# 단어 -> 정수 인코딩
# 임베딩 전 컴퓨터가 단어를 구별할 수 있도록 함. OHE로 바꿔서 하는 것과 동일한 맥락
tokenizer = Tokenizer(5000)
tokenizer.fit_on_texts(X_train) # 각 단어에 정수 인덱스를 부여
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
print("# int_encoding done")

# preprocessing done
# split done
# tokenization done
# int_encoding done


In [None]:
# 문장마다 길이가 다르므로 길이를 맞춰주기
# 문장마다 embedding layer를 통과하게 할 것이기 때문

# max_len은 데이터셋을 보면서 최대 문장의 길이로 설정
# max_len = max([len(x) for x in X_train]) # 1743
max_len = 500
X_train = pad_sequences(X_train, maxlen=max_len) # 더 길면 자르고, 짧으면 0을 추가
X_test = pad_sequences(X_test, maxlen=max_len)

In [None]:
print(X_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [None]:
# 모델 생성-------------------------------------------

model = Sequential()
# 단어 임베딩 -> 5000개의 단어를 120차원으로 내보내겠다(정수 -> 120 dimension vector)
model.add(Embedding(5000, 120))
# LSTM
model.add(LSTM(120))
# 이진 분류(sigmoid)
model.add(Dense(1, activation='sigmoid'))
#------------------------------------------------------

In [None]:
# 모델 성능 향상---------------------------------------

# validation loss를 계속 보다가 5회 이상 loss가 증가하면, 과적합될 수 있으므로 학습을 조기 종료하겠다.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# epoch를 반복하면서, 가장 검증데이터 정확도가 높았던 순간을 체크포인트(the_best_imdb.h5)로 저장
# 정확도가 낮아지면 모델 버려라
model_check = ModelCheckpoint('the_best_imdb.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[early_stop, model_check])

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.84736, saving model to the_best.h5
Epoch 2/10
Epoch 00002: val_acc improved from 0.84736 to 0.87568, saving model to the_best.h5
Epoch 3/10
Epoch 00003: val_acc did not improve from 0.87568
Epoch 4/10
Epoch 00004: val_acc improved from 0.87568 to 0.87756, saving model to the_best.h5
Epoch 5/10
Epoch 00005: val_acc did not improve from 0.87756
Epoch 6/10
Epoch 00006: val_acc did not improve from 0.87756
Epoch 7/10
Epoch 00007: val_acc did not improve from 0.87756
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f11d4c19278>

In [None]:
# # 모델 불러오기
# from tensorflow.keras.models import load_model

# model = load_model('the_best.h5')

In [None]:
# 정확도 측정
print(model.evaluate(X_test, y_test)) # [loss, acc]

[0.32294711470603943, 0.877560019493103]
