# Using same data preprocessing and tokenizing as “naver_movie_review_sentiment_analysis” in “Recurrent_Neural_Network_Text_Classification”

In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import pickle

import matplotlib.pyplot as plt

import urllib.request

import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from konlpy.tag import Okt

In [None]:
data_path = "../data/"
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename= data_path + "ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename=data_path + "ratings_test.txt")

In [None]:
train_data = pd.read_table(data_path + "ratings_train.txt")
test_data = pd.read_table(data_path + "ratings_test.txt")
print(train_data[:3])
print(test_data[:3])

# Data Preprocessing

In [None]:
# check null and unique values
print(len(train_data))
print(train_data.isnull().values.any())
print(train_data.nunique()) # count number of unique values

In [None]:
# remove null values and duplicates
train_data.dropna(inplace=True) 
train_data.drop_duplicates(subset=['document'], inplace=True) 

print(train_data.isnull().values.any())
print(train_data.nunique())

In [None]:
train_data["label"].value_counts().plot(kind='bar')

In [None]:
train_data["document"] = train_data["document"].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)
train_data[:5]

In [None]:
train_data["document"].replace("^ +", "", regex=True, inplace=True) # remove leading whitespace. e.g., "  안녕하세요" -> "안녕하세요"
train_data["document"].replace("", np.nan, inplace=True) # replace empty string with null

print(train_data["document"].isnull().sum())
train_data.dropna(inplace=True)

print(len(train_data))
print(train_data["document"].isnull().sum())
print(train_data.isnull().values.any())

In [None]:
# apply same preprocessing to test data
test_data.dropna(inplace=True)
test_data.drop_duplicates(subset=['document'], inplace=True)
test_data["document"] = test_data["document"].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)
test_data["document"].replace("^ +", "", regex=True, inplace=True) # remove leading whitespace. e.g., "  안녕하세요" -> "안녕하세요"
test_data["document"].replace("", np.nan, inplace=True) # replace empty string with null
test_data.dropna(inplace=True)

print(len(test_data))
print(test_data["document"].isnull().sum())
print(test_data.isnull().values.any())

# Tokenizing

In [None]:
okt = Okt()

X_train = []
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

# tokenize and remove stopwords from sentences
for sentence in tqdm(train_data['document']):
    tokenized = okt.morphs(sentence, stem=True) # tokenize
    stopwords_removed = [word for word in tokenized if not word in stopwords] # remove stopwords
    X_train.append(stopwords_removed)
    
print(X_train[:5])

In [None]:
X_test = []

for sentence in tqdm(test_data['document']):
    tokenized = okt.morphs(sentence, stem=True) # tokenize
    stopwords_removed = [word for word in tokenized if not word in stopwords] # remove stopwords
    X_test.append(stopwords_removed)
    
print(X_test[:5])

In [None]:
# encode words to integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

print(tokenizer.word_index)

In [None]:
threshold = 3 # set threshold for rare words
total_freq = 0
total_word_num = len(tokenizer.word_index)
rare_freq = 0
rare_word_num = 0

for word, cnt in tokenizer.word_counts.items():
    total_freq += cnt
    
    if cnt < threshold:
        rare_freq += cnt
        rare_word_num += 1
        
        
print("Total number of words: ", total_freq)
print("Number of rare words: ", rare_word_num)
print("Percentage of rare words: ", (rare_word_num / total_word_num) * 100)
print("Percentage of rare words in total frequency: ", (rare_freq / total_freq) * 100)

In [None]:
vocab_size = total_word_num - rare_word_num +1
print(vocab_size)

In [None]:
# ReTokenize with vocab_size
tokenizer = Tokenizer(vocab_size)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
print(X_train_seq[:5])

In [None]:
X_test_seq = tokenizer.texts_to_sequences(X_test) # skip fit_on_texts because it's already fitted
print(X_test_seq[:5])

In [None]:
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

# Additional Preprocessing

In [None]:
# remove empty sentences after removing low-frequency words
print(len(X_train_seq), len(y_train))
drop_train_idx = [index for index, sentence in enumerate(X_train_seq) if len(sentence) < 1]
print(len(drop_train_idx))
X_train_seq_removed = [sentence for index, sentence in enumerate(X_train_seq) if index not in drop_train_idx]
y_train_removed = [label for index, label in enumerate(y_train) if index not in drop_train_idx]
print(len(X_train_seq_removed), len(y_train_removed))

### numpy.delete is not working with different length of sub arrays in 2d array

In [None]:
# padding
max_len = max(len(sentence) for sentence in X_train_seq_removed)
avg_len = sum(map(len, X_train_seq_removed)) / len(X_train_seq_removed)
print(max_len, avg_len)

plt.hist([len(sentence) for sentence in X_train_seq_removed], bins=50)

In [None]:
# set max_len to 30~40 based on the histogram
max_padding = 30

# make list to numpy array
X_train_seq_padded = pad_sequences(X_train_seq_removed, maxlen=max_padding)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=max_padding)

y_train_removed = np.array(y_train_removed)
y_test = np.array(y_test)

print(X_train_seq_padded.shape, X_test_seq_padded.shape)
print(y_train_removed.shape, y_test.shape)

# Modeling

In [None]:
from keras.models import Sequential, Model
from keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

In [None]:
# set hyperparameters
embedding_dim = 128
dropout_ratio = (0.5, 0.8) # use two dropout layers
num_filters = 128
hidden_units = 128

convs = []

input = Input(shape=(max_padding,))
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_padding)(input)
embedding_dropped_out = Dropout(dropout_ratio[0])(embedding)

# use different kernel sizes for convolution
for size in [3,4,5]:
    conv = Conv1D(filters=num_filters, kernel_size=size, padding='valid', activation='relu', strides=1)(embedding_dropped_out)
    conv_pooled = GlobalMaxPooling1D()(conv)
    convs.append(conv_pooled)

output = Concatenate()(convs)
output = Dropout(dropout_ratio[1])(output)
output = Dense(hidden_units, activation='relu')(output)
output = Dense(1, activation='sigmoid')(output)

model = Model(inputs=input, outputs=output)
model.summary()

In [None]:
model_path = "../model/"
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint(filepath=model_path+'review_best_model_cnn.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train_seq_padded, y_train_removed, epochs=10, callbacks=[es, mc], batch_size=64, validation_split=0.2)

In [None]:
loaded_model = load_model(model_path+'review_best_model_cnn.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test_seq_padded, y_test)[1]))

# Inference

In [None]:
def predict(new_sentence):
    new_sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', new_sentence)
    new_sentence = okt.morphs(new_sentence, stem=True) # 토큰화
    new_sentence = [word for word in new_sentence if not word in stopwords] # 불용어 제거
    encoded = tokenizer.texts_to_sequences([new_sentence]) # 정수 인코딩
    pad_new = pad_sequences(encoded, maxlen = max_padding) # 패딩
    score = float(loaded_model.predict(pad_new)) # 예측
    if(score > 0.5):
        print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))

In [40]:
predict("이 영화 개꿀잼 ㅋㅋㅋ")

89.29% 확률로 긍정 리뷰입니다.


  score = float(loaded_model.predict(pad_new)) # 예측
