# 네이버 영화평 감성분석

In [1]:
!pip install konlpy > /dev/null

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Download the train/test review files from the nsmc repository; both are
# tab-separated text files.
train_url = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt'
test_url = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt'
train_df, test_df = (pd.read_csv(url, sep='\t') for url in (train_url, test_url))
train_df.shape, test_df.shape

((150000, 3), (50000, 3))

In [4]:
# Preview the first three rows (columns: id, document, label).
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


### 1. 데이터 전처리

- train dataset

In [5]:
# Count the missing values in each column.
train_df.isna().sum()

id          0
document    5
label       0
dtype: int64

In [6]:
# Drop every row that contains a missing value (in place), then show the
# resulting shape.
train_df.dropna(how='any', inplace=True)
train_df.shape

(149995, 3)

In [7]:
# Check for duplicates: number of distinct review texts. A value lower than
# the row count means some reviews are repeated.
train_df.document.nunique()

146182

In [8]:
# Remove rows with a duplicated review text (in place), leaving one row per
# distinct review, then show the resulting shape.
train_df.drop_duplicates(subset=['document'], inplace=True)
train_df.shape

(146182, 3)

In [9]:
# Distribution of the positive (1) / negative (0) labels.
train_df.label.value_counts()

0    73342
1    72840
Name: label, dtype: int64

- test dataset

In [10]:
# Test set: drop every row that contains a missing value (in place).
test_df.dropna(how='any', inplace=True)
test_df.shape

(49997, 3)

In [11]:
# Test set: remove rows with a duplicated review text (in place).
test_df.drop_duplicates(subset=['document'], inplace=True)
test_df.shape

(49157, 3)

In [12]:
# Test set: distribution of the positive (1) / negative (0) labels.
test_df.label.value_counts()

1    24711
0    24446
Name: label, dtype: int64

### 2. 텍스트 전처리

- train dataset

In [13]:
# Replace every character that is not Hangul (jamo or complete syllable) with
# a space, then strip leading/trailing whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text unchanged.
train_df['document'] = (
    train_df['document']
    .str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', regex=True)
    .str.strip()
)
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [14]:
# Reviews that consisted only of English letters, digits or symbols are now ''.
# Turn '' into NaN so that they can be dropped as missing values.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column accessor: under pandas
# Copy-on-Write that call updates a temporary Series and leaves train_df
# unchanged.
train_df['document'] = train_df['document'].replace('', np.nan)
train_df['document'].isna().sum()

789

In [15]:
# Drop rows with missing values (in place) -- here, the reviews whose text
# became NaN after cleaning.
train_df.dropna(how='any', inplace=True)
train_df.shape

(145393, 3)

- test dataset

In [16]:
# Same text cleaning as for the training set.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently skip the cleaning.
test_df['document'] = (
    test_df['document']
    .str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', regex=True)
    .str.strip()
)
# Assign back instead of replace(..., inplace=True) on the column accessor,
# which does not update test_df under pandas Copy-on-Write.
test_df['document'] = test_df['document'].replace('', np.nan)
test_df.dropna(how='any', inplace=True)
test_df.shape


(48852, 3)

### 3. 한글 형태소 분석

In [17]:
from konlpy.tag import Okt
# Single Okt analyzer instance, reused for every review in the cells below.
okt = Okt()

In [18]:
# Tokens that are discarded after morphological analysis.
stopwords = [
    "의", "가", "이", "은", "들",
    "는", "좀", "잘", "걍", "과",
    "도", "를", "으로", "자", "에",
    "와", "한", "하다", "을",
]

In [19]:
from tqdm.notebook import tqdm

# Split every training review into stemmed morphemes and drop the stopwords;
# tqdm shows progress over the reviews.
X_train = [
    [token for token in okt.morphs(text, stem=True) if token not in stopwords]
    for text in tqdm(train_df.document)
]

  0%|          | 0/145393 [00:00<?, ?it/s]

In [20]:
# Inspect the token lists produced for the first three reviews.
X_train[:3]

[['아', '더빙', '진짜', '짜증나다', '목소리'],
 ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다'],
 ['너', '무재', '밓었', '다그', '래서', '보다', '추천', '다']]

- 시간 관계상 test dataset은 형태소 분석을 하지 않고, train dataset을 train/test로 분리하여 사용

### 4. Keras Tokenizer 적용, pad_sequences, train/test dataset

In [21]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Seed NumPy's and TensorFlow's random generators with the same value.
seed = 2023
tf.random.set_seed(seed)
np.random.seed(seed)

In [22]:
# Fit a tokenizer without a vocabulary limit on the tokenised training
# reviews, so the full vocabulary size can be inspected.
t = Tokenizer()
t.fit_on_texts(X_train)

In [23]:
# Number of distinct tokens seen by the tokenizer.
len(t.word_index)

43121

In [24]:
# Re-create the tokenizer so that encoding uses only the 10,000 most frequent
# words.
num_words = 10000
t = Tokenizer(num_words=num_words)
t.fit_on_texts(X_train)

In [25]:
# Convert each token list into a list of integer word indices.
X_train = t.texts_to_sequences(X_train)

In [26]:
# Maximum and mean length of the encoded reviews.
lengths = [len(seq) for seq in X_train]
max(lengths), sum(lengths) / len(lengths)

(69, 10.361269111993012)

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Bring every encoded review to length 30 (the longest review has 69 tokens,
# the mean is about 10.4).
max_len = 30
X_train = pad_sequences(X_train, maxlen=max_len)

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded reviews as a test split, stratified on the label
# and seeded for a repeatable split.
labels = train_df.label.values
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=seed,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((116314, 30), (29079, 30), (116314,), (29079,))

### 5. LSTM 모델 정의/설정/학습

In [29]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [30]:
# Embedding -> LSTM -> one sigmoid unit for binary sentiment classification.
model = Sequential()
model.add(Embedding(num_words, 100, input_length=max_len))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 100)           1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,117,377
Trainable params: 1,117,377
Non-trainable params: 0
_________________________________________________________________


In [31]:
model_path = 'best_model.h5'
# Save a checkpoint only when the monitored metric improves, and stop training
# after 5 epochs without improvement.
mc = ModelCheckpoint(filepath=model_path, save_best_only=True, verbose=1)
es = EarlyStopping(patience=5)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
# Train for at most 30 epochs, with 20% of the training arrays used for
# validation; the checkpoint and early-stopping callbacks run during training.
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=30,
    validation_split=0.2,
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.36521, saving model to best_model.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.36521 to 0.35454, saving model to best_model.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.35454
Epoch 4/30
Epoch 4: val_loss did not improve from 0.35454
Epoch 5/30
Epoch 5: val_loss did not improve from 0.35454
Epoch 6/30
Epoch 6: val_loss did not improve from 0.35454
Epoch 7/30
Epoch 7: val_loss did not improve from 0.35454


In [33]:
# Reload the checkpoint saved during training and evaluate it on the held-out
# split; the result is [loss, accuracy].
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.3484908640384674, 0.8465559482574463]

### 6. 실제 데이터 예측

In [34]:
# Sample review mixing Hangul and English, used to walk through the
# prediction steps.
review = '이 영화 really 개꿀잼 ㅋㅋ'

In [35]:
import re
# Same cleaning as for the training data: non-Hangul characters become spaces,
# then the ends are stripped.
review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', review).strip()
review

'이 영화 개꿀잼 ㅋㅋ'

In [36]:
# Apply the training-time pipeline to the single review: stemmed morphemes,
# stopword removal, integer encoding with the fitted tokenizer, then padding
# to max_len.
morphs = okt.morphs(review, stem=True)
morphs = [word for word in morphs if word not in stopwords]
encoded = t.texts_to_sequences([morphs])
padded = pad_sequences(encoded, maxlen=max_len)
padded

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    1, 3046,   49]], dtype=int32)

In [37]:
# predict() returns a (1, 1) array for the single padded review. Index the
# element explicitly: calling float() on an array with ndim > 0 is deprecated
# since NumPy 1.25.
score = float(best_model.predict(padded)[0, 0])
score



0.9204555749893188

In [38]:
# A score above 0.5 is reported as positive ('긍정'), otherwise negative ('부정').
print('긍정' if score > 0.5 else '부정')

긍정


In [41]:
def sentiment_predict(review, tokenizer=t, max_len=max_len):
    """Classify a Korean review as positive or negative with ``best_model``.

    The review goes through the same steps as the training data: non-Hangul
    characters are replaced by spaces, the text is split into stemmed
    morphemes with Okt, stopwords are removed, and the tokens are
    integer-encoded and padded to ``max_len``.

    Args:
        review: Raw review text.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``. The
            default is the notebook's ``t`` as bound when this cell is run.
        max_len: Sequence length passed to ``pad_sequences``.

    Returns:
        ``'긍정(xx.xx%)'`` when the model score is above 0.5, otherwise
        ``'부정(xx.xx%)'`` where the percentage is computed from ``1 - score``.
    """
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', review).strip()
    morphs = [
        word for word in okt.morphs(review, stem=True)
        if word not in stopwords
    ]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    # predict() returns a (1, 1) array for the single review. Index the
    # element explicitly: float() on an array with ndim > 0 is deprecated
    # since NumPy 1.25.
    score = float(best_model.predict(padded, verbose=0)[0, 0])
    if score > 0.5:
        return f'긍정({score*100:.2f}%)'
    return f'부정({(1-score)*100:.2f}%)'

In [42]:
# Try the helper on a review that should read as negative.
sentiment_predict('이 영화 핵노잼 ㅠㅠ')

'부정(98.59%)'

In [43]:
# Try the helper on a review that should read as positive.
sentiment_predict('이 영화 개꿀잼 ㅋㅋ')

'긍정(92.05%)'