### 7.8.1 (한국어판 부록) KoNLPy를 사용한 영화 리뷰 분석

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import mglearn
import scipy
import os
%matplotlib inline

In [2]:
# Load the tab-separated training reviews. keep_default_na=False stops pandas
# from converting empty or "NA"-like review text into NaN.
df_train = pd.read_csv(
    '../data/ratings_train.txt',
    sep='\t',
    keep_default_na=False,
)
# Preview the first five rows (head() defaults to n=5).
df_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [3]:
# Extract the review text and the label column of the training frame as arrays.
text_train, y_train = (df_train[column].values for column in ('document', 'label'))

In [4]:
# Load the tab-separated test reviews with the same parsing options as the
# training file, then extract the text and label columns as arrays.
df_test = pd.read_csv(
    '../data/ratings_test.txt',
    sep='\t',
    keep_default_na=False,
)
text_test, y_test = (df_test[column].values for column in ('document', 'label'))

In [5]:
# Number of training documents and the count of each label value.
(len(text_train), np.bincount(y_train))

(150000, array([75173, 74827]))

In [6]:
# Number of test documents and the count of each label value.
(len(text_test), np.bincount(y_test))

(50000, array([24827, 25173]))

In [7]:
from konlpy.tag import Okt
# Create one Okt tagger instance up front; okt_tokenizer() reuses this object
# for every document instead of constructing a new tagger on each call.
okt_tag = Okt()

In [8]:
def okt_tokenizer(text):
    """Tokenize one document with the module-level ``okt_tag`` tagger.

    Parameters
    ----------
    text
        The raw document to split (passed straight through to the tagger).

    Returns
    -------
    Whatever ``okt_tag.morphs(text)`` returns, unchanged.
    """
    morphemes = okt_tag.morphs(text)
    return morphemes

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the Okt pipeline. The key prefixes are the step
# names that make_pipeline derives from the lowercased estimator class names.
okt_param_grid = {
    'tfidfvectorizer__min_df': [3, 5],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 3)],
    'logisticregression__C': [0.1, 1, 10],
}

# token_pattern=None: the regex pattern is never consulted once a custom
# tokenizer is supplied, and leaving the default in place makes recent
# scikit-learn versions emit a UserWarning on every fit. Results are unchanged.
okt_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=okt_tokenizer, token_pattern=None),
    LogisticRegression(solver='liblinear', max_iter=500),
)

# 5-fold cross-validated search over every combination in okt_param_grid.
okt_grid = GridSearchCV(okt_pipe, okt_param_grid, cv=5)

# Only the first 1,000 training reviews are used for this search.
# NOTE(review): presumably to keep the Okt-based search fast -- confirm.
okt_grid.fit(text_train[:1000], y_train[:1000])

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...ty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidfvectorizer__min_df': [3, 5], 'tfidfvectorizer__ngram_range': [(1, 1), (1, 3)], 'logisticregression__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
# Report the best cross-validation score and the parameter combination that
# achieved it; one print call with a newline separator yields the same output
# as two consecutive print calls.
print(
    '최상의 교차 검증 점수: {:.3f}'.format(okt_grid.best_score_),
    '최적의 교차 검증 매개변수:\n{}'.format(okt_grid.best_params_),
    sep='\n',
)

최상의 교차 검증 점수: 0.717
최적의 교차 검증 매개변수:
{'logisticregression__C': 1, 'tfidfvectorizer__min_df': 3, 'tfidfvectorizer__ngram_range': (1, 1)}


In [11]:
# Evaluate the refitted best Okt pipeline on the test set step by step:
# vectorize the test documents with the fitted TF-IDF step, then score the
# fitted classifier on the resulting matrix.
okt_steps = okt_grid.best_estimator_.named_steps
X_test_okt = okt_steps['tfidfvectorizer'].transform(text_test)
score = okt_steps['logisticregression'].score(X_test_okt, y_test)
print('테스트 세트 점수: {:.3f}'.format(score))

테스트 세트 점수: 0.705


In [12]:
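# Save the tokenizer below in a separate module file and import it from there
# (from mecab_tokenizer import mecab_tokenizer), as the next cell does.
# NOTE(review): presumably needed so the worker processes started by the
# parallel grid search (n_jobs=-1) can import the tokenizer -- confirm.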

# from konlpy.tag import Mecab
# mecab = Mecab()
# def mecab_tokenizer(text):
#     return mecab.morphs(text)

In [13]:
from mecab_tokenizer import mecab_tokenizer

# Hyper-parameter grid for the Mecab pipeline. The key prefixes are the step
# names that make_pipeline derives from the lowercased estimator class names.
mecab_param_grid = {
    'tfidfvectorizer__min_df': [1, 3, 5],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [0.1, 1, 10, 100],
}

# token_pattern=None: the regex pattern is never consulted once a custom
# tokenizer is supplied, and leaving the default in place makes recent
# scikit-learn versions emit a UserWarning on every one of the many fits
# this search performs. Results are unchanged.
mecab_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=mecab_tokenizer, token_pattern=None),
    LogisticRegression(solver='liblinear', max_iter=500),
)

# 3-fold cross-validated search on the full training set; n_jobs=-1 runs the
# candidate fits in parallel on all available processors.
mecab_grid = GridSearchCV(mecab_pipe, mecab_param_grid, cv=3, n_jobs=-1)
mecab_grid.fit(text_train, y_train)

# Report the best cross-validation score and the parameters that achieved it.
print('최상의 교차 검증 점수: {:.3f}'.format(mecab_grid.best_score_))
print('최적의 교차 검증 매개변수:\n{}'.format(mecab_grid.best_params_))

최상의 교차 검증 점수: 0.872
최적의 교차 검증 매개변수:
{'logisticregression__C': 10, 'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 3)}


In [14]:
# Evaluate the refitted best Mecab pipeline on the test set step by step:
# vectorize the test documents with the fitted TF-IDF step, then score the
# fitted classifier on the resulting matrix.
mecab_steps = mecab_grid.best_estimator_.named_steps
X_test_mecab = mecab_steps['tfidfvectorizer'].transform(text_test)
score = mecab_steps['logisticregression'].score(X_test_mecab, y_test)
print('테스트 세트 점수: {:.3f}'.format(score))

테스트 세트 점수: 0.877
