## 저장내용 불러오기

In [1]:
import pandas as pd

train_df = pd.read_csv('elect_training.csv', header=None, )
print(train_df[1].value_counts())
test_df = pd.read_csv('elect_test.csv', header=None)
print(test_df[1].value_counts())

0    209
1    197
Name: 1, dtype: int64
1    100
0    100
Name: 1, dtype: int64


## 데이터정제

In [3]:
import numpy as np

# train_df에 데이터의 중복이 있는지 확인
train_df[0].nunique(), train_df[1].nunique()
# document 열에서 중복인 내용이 있다면 중복 제거
train_df.drop_duplicates(subset=[0], inplace=True) 
# 한글과 공백을 제외하고 모두 제거
train_df[0] = train_df[0].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]","")
train_df[0].replace('', np.nan, inplace=True)
# 한글이 없는 리뷰 제거
train_df = train_df.dropna(how = 'any')
print(len(train_df))

# test도 마찬가지로 수행
test_df.drop_duplicates(subset = [0], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거
test_df[0] = test_df[0].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]","") # 정규 표현식 수행
test_df[0].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
test_df = test_df.dropna(how='any') # Null 값 제거
print('전처리 후 테스트용 샘플의 개수 :',len(test_df))

388
전처리 후 테스트용 샘플의 개수 : 187


## 단어 토큰화

In [4]:
from konlpy.tag import Twitter

twitter = Twitter()
def tw_tokenizer(text):
    # 입력 인자로 들어온 텍스트를 형태소 단어로 토큰화해 리스트 형태로 반환
    tokens_ko = twitter.morphs(text)
    return tokens_ko

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Twitter 객체의 morphs() 객체를 이용한 tokenizer를 사용. ngram_range는 (1,2)
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
tfidf_vect.fit(train_df[0])
tfidf_matrix_train = tfidf_vect.transform(train_df[0])
tfidf_matrix_test = tfidf_vect.transform(test_df[0])



## 모델생성

In [7]:
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# n_estimators는 100으로, random state는 예제 수행 시마다 동일 예측 결과를 위해 설정
xgb_clf = XGBClassifier(n_estimator=100, random_state=0)

# 성능 평가 지표를 auc로, 조기 중단 파리미터는 30으로 설정하고 학습 수행
xgb_clf.fit(tfidf_matrix_train, train_df[1], early_stopping_rounds=30, eval_metric="auc",
            eval_set=[(tfidf_matrix_train, train_df[1]),(tfidf_matrix_test, test_df[1])])

xgb_roc_score = roc_auc_score(test_df[1], xgb_clf.predict_proba(tfidf_matrix_test)[:,1], average='macro')
print('ROC AUC: {0:.4f}'.format(xgb_roc_score))

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-auc:0.83674	validation_1-auc:0.60557
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 30 rounds.
[1]	validation_0-auc:0.91239	validation_1-auc:0.61207
[2]	validation_0-auc:0.93203	validation_1-auc:0.63103
[3]	validation_0-auc:0.94631	validation_1-auc:0.64856
[4]	validation_0-auc:0.95878	validation_1-auc:0.66718
[5]	validation_0-auc:0.96426	validation_1-auc:0.67368
[6]	validation_0-auc:0.96977	validation_1-auc:0.67897
[7]	validation_0-auc:0.97316	validation_1-auc:0.68845
[8]	validation_0-auc:0.97635	validation_1-auc:0.68448
[9]	validation_0-auc:0.97834	validation_1-auc:0.68793
[10]	validation_

In [8]:
# 파라미터 C 최적화를 위해 GridSearchCV를 이용
params = { 'max_depth': [0.5, 1, 3, 5],
         'min_child_weight': [0.25, 0.5, 1],
         'colsample_bytree': [0.25, 0.5, 0.75]}

grid_cv = GridSearchCV(xgb_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv.fit(tfidf_matrix_train, train_df[1])
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py", line 824, in fit
    callbacks=callbacks)
  File "C:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\xgboost\training.py", line 212, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "C:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\xgboost\training.py", line 75, in _train_internal
    bst.update(dtrain, i, obj)
  File "C:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\xgboost\core.py", line 1369, in update
    dtrain.handle))
  File "C:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\xgboost\core.py", line 190, in _check_call
    raise XGBoostError(py_str(_LIB.X

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
 


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
 

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   16.2s finished


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'colsample_bytree': 0.25, 'max_depth': 3, 'min_child_weight': 0.5} 0.6831


## 모델학습

In [12]:
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import accuracy_score
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [13]:
# 사이킷런 래퍼 XGBoost 클래스인 XGBClassifier 임포트
from xgboost import XGBClassifier

xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_wrapper.fit(tfidf_matrix_train, train_df[1])

best_estimator = grid_cv.best_estimator_
w_preds = xgb_wrapper.predict(tfidf_matrix_test)

In [14]:
print('XGBoost 정확도: ', accuracy_score(test_df[1], w_preds))

XGBoost 정확도:  0.7540106951871658
