# 네이버 영화평 감성 분석

In [1]:
# KoNLPy 설치 
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.3MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 49.5MB/s 
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 9.1MB/s 
Installing collected packages: colorama, JPype

In [2]:
import numpy as np
import pandas as pd

In [3]:
from google.colab import files

uploaded = files.upload()

Saving ratings_test.txt to ratings_test.txt


In [4]:
test_df = pd.read_csv(list(uploaded.keys())[0], sep='\t')
test_df.head(3)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0


In [5]:
uploaded = files.upload()

Saving ratings_train.txt to ratings_train.txt


In [6]:
train_df = pd.read_csv(list(uploaded.keys())[0], sep='\t')
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [7]:
train_df.shape, test_df.shape

((150000, 3), (50000, 3))

### 데이터 전처리

In [8]:
# 중복 여부 확인
train_df.document.nunique()

146182

In [9]:
train_df.drop_duplicates(subset=['document'], inplace=True)
train_df.shape

(146183, 3)

In [10]:
# Null값 확인
train_df.isnull().sum()

id          0
document    1
label       0
dtype: int64

In [11]:
# Null값 제거
train_df = train_df.dropna(how='any')
train_df.shape

(146182, 3)

In [12]:
# 0과 1의 분포
train_df.label.value_counts()

0    73342
1    72840
Name: label, dtype: int64

- 테스트 데이터 셋에도 적용

In [13]:
test_df.document.nunique()

49157

In [14]:
test_df.drop_duplicates(subset=['document'], inplace=True)
test_df.shape

(49158, 3)

In [15]:
test_df.isnull().sum()

id          0
document    1
label       0
dtype: int64

In [16]:
test_df = test_df.dropna(how='any')
test_df.shape

(49157, 3)

In [17]:
test_df.label.value_counts()

1    24711
0    24446
Name: label, dtype: int64

### 텍스트 전처리

In [18]:
# 한글 이외의 문자는 제거
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [19]:
# 영문이나 숫자로만 되어 있는 문장은 데이터가 없어짐
train_df['document'].replace('', np.nan, inplace=True)
train_df.isnull().sum()

id            0
document    391
label         0
dtype: int64

In [None]:
train_df = train_df.dropna(how='any')
train_df.shape

(145791, 3)

- 테스트 데이터 셋

In [20]:
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
test_df['document'].replace('', np.nan, inplace=True)
test_df.isnull().sum()

id            0
document    162
label         0
dtype: int64

In [21]:
test_df = test_df.dropna(how='any')
test_df.shape

(48995, 3)

### 한글 형태소 분석 - 토큰화

In [22]:
from konlpy.tag import Okt
okt = Okt()

In [23]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을']

In [24]:
okt.morphs('교도소 이야기구먼 솔직히 재미는 없다평점 조정', stem=True)

['교도소', '이야기', '구먼', '솔직하다', '재미', '는', '없다', '평점', '조정']

In [32]:
from tqdm.notebook import tqdm
X_train = []
for sentence in tqdm(train_df.document):
    morphs = okt.morphs(sentence, stem=True)    # 토큰화
    temp = ' '.join(word for word in morphs if not word in stopwords)   # 불용어 제거
    X_train.append(temp)

HBox(children=(FloatProgress(value=0.0, max=146182.0), HTML(value='')))

TypeError: ignored

In [30]:
X_test = []
for sentence in tqdm(test_df.document):
    morphs = okt.morphs(sentence, stem=True)    # 토큰화
    temp = ' '.join(word for word in morphs if not word in stopwords)   # 불용어 제거
    X_test.append(temp)

HBox(children=(FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [None]:
y_train = train_df.label.values
y_test = test_df.label.values

### Keras 인코딩

In [None]:
# 정수 인코딩
max_words = 35000
tokenizer = Tokenizer(num_words=max_words) # 상위 35,000개의 단어만 보존
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
#  X_train과 X_test의 모든 샘플의 길이를 동일하게 30으로 셋팅
max_len=30
# 전체 데이터의 길이는 30으로 맞춘다.
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

### LSTM 모델 정의/설정/학습/평가

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [34]:
model = Sequential([
    Embedding(max_words, 100),
    LSTM(128),
    Dense(1, activation='sigmoid')
])
model.summary()

NameError: ignored

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath='naver-lstm-best-model.h5',
                               monitor='val_loss', verbose=1, save_best_only=True)

In [None]:
history = model.fit(X_train, y_train, epochs=4, batch_size=60,
                    validation_split=0.2, verbose=1, callbacks=[checkpointer])

In [None]:
# Best Model 선택
from tensorflow.keras.models import load_model
best_model = load_model('naver-lstm-best-model.h5')

In [None]:
_, acc = best_model.evaluate(X_test, y_test, verbose=2)
print(f'Accuracy: {acc:.4f}')