In [1]:
#2021.06.25. FRI. 
#Hankyeong

#00. 패키지 호출
import pandas as pd 
import numpy as np
import warnings
import re
from konlpy.tag import Okt
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

#00-1. warning message ignore
warnings.filterwarnings(action='ignore')

In [2]:
#21. 네이버 영화 리뷰 데이터로 한글 감성 분석하기. & TF-IDF Vectorizer
#(1) 데이터셋 불러오기. 
review_train = pd.read_csv('D://Python_Project/Hankyeong_DataAnalysis/Data/naver_movie_review_train.csv', sep='\t')
review_test = pd.read_csv('D://Python_Project/Hankyeong_DataAnalysis/Data/naver_movie_review_test.csv', sep='\t')

#(2) train 데이터셋 탐색하기. 
#①행, 열, 타입 확인하기. 
review_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145791 entries, 0 to 145790
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        145791 non-null  int64 
 1   document  145791 non-null  object
 2   label     145791 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.3+ MB


In [3]:
#②결측치 확인하기. 
review_train.isna().sum()

id          0
document    0
label       0
dtype: int64

In [4]:
#(3) test 데이터셋 탐색하기. 
#①행, 열, 타입 확인하기. 
review_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48995 entries, 0 to 48994
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        48995 non-null  int64 
 1   document  48995 non-null  object
 2   label     48995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [5]:
#②결측치 확인하기. 
review_test.isna().sum()

id          0
document    0
label       0
dtype: int64

In [6]:
#22. tokenizer 함수 정의하기. 
#(1) 형태소 분리 객체 설정하기. 
okt = Okt()

#(2) 불용어 설정하기. 
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을']

#(3) 함수 정의하기. 
def okt_tokenizer(text) : 
    tokens = okt.morphs(text, stem=True)
    tokens = [word for word in tokens if not word in stopwords ]
    return tokens

#(4) TF-IDF Vectorizer 객체 설정하기. 
tfid_vect = TfidfVectorizer(tokenizer=okt_tokenizer, 
                            ngram_range=(1,2))

#(5) review_train 데이터셋을 넣어 벡터화하기. 
%time tfid_vect.fit(review_train['document'])

Wall time: 3min 52s


TfidfVectorizer(ngram_range=(1, 2),
                tokenizer=<function okt_tokenizer at 0x0000027AC6D73040>)

In [8]:
#(6) transform() 메서드를 이용해 벡터화하기. 
X_train_tv = tfid_vect.transform(review_train.document)
X_test_tv = tfid_vect.transform(review_test.document)

#(7) y_train, y_test 데이터셋 정의하기. 
y_train = review_train['label'].values
y_test = review_test['label'].values

In [9]:
#23. Naive Bayes 모델로 분류하기. 
#(1) 모델 정의하기. 
nb = MultinomialNB()

#(2) 모델의 하이퍼파라미터 확인하기. 
nb.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [12]:
#(3) GridSearch를 위한 하이퍼파라미터 값 지정하기. (recycle)
params = {
    'alpha'     : [0,0.25,0.5,0.75,1,]
}

#(4) GridsearchCV() 메서드를 이용해 훈련 모델 할당하기. 
gscv_nb = GridSearchCV(nb, param_grid=params)

#(5) 모형 학습하기. 
gscv_nb.fit(X_train_tv,y_train)

GridSearchCV(estimator=MultinomialNB(),
             param_grid={'alpha': [0, 0.25, 0.5, 0.75, 1]})

In [13]:
#(6) 최적 하이퍼 파라미터 확인하기. 
gscv_nb.best_params_

{'alpha': 0.5}

In [14]:
#(7) 최적 파라미터에 대한 평가 점수 확인하기. 
gscv_nb.best_score_

0.8576660877148417

In [16]:
#(8) test 데이터셋으로 모형 예측 및 평가하기. 
accuracy_score(y_test,gscv_nb.predict(X_test_tv))

0.8615368915195428