# I. data 준비

### 1) 최종적으로 cluster된 파일 불러오기

In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\qls05\OneDrive\바탕 화면\df.csv", encoding = 'cp949') 
#000.ipynb로부터 나온 파일임 github 링크까지 첨부하면 좋을듯

  from pandas.core.computation.check import NUMEXPR_INSTALLED


### 2) 불러온 df ->  tf-idf 벡터화

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = df['preprocessed_송출내용'].fillna('')
vectorizer = TfidfVectorizer(max_features=30)
vectored_df = vectorizer.fit_transform(texts)

In [3]:
dense_df = vectored_df.todense() #vectored_df는 희소행렬이기 때문에 dense 형태로 전환.

feature_names = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(dense_df, columns=feature_names)

### 3) 데이터 분할

In [4]:
from sklearn.model_selection import train_test_split

X = df_tfidf
y = df['label']

# train : val : test = 6 : 2 : 2
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) 


print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_valid shape:", X_valid.shape, "y_valid shape:", y_valid.shape)
print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)

X_train shape: (5608, 30) y_train shape: (5608,)
X_valid shape: (1870, 30) y_valid shape: (1870,)
X_test shape: (1870, 30) y_test shape: (1870,)


### 4) data scaling (model에 따라 scaled 된 data가 필요한 경우가 있음)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# ----------------------------------------------------------------------------

# II. 모델 적용

## 1. SVM

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

svc = SVC()
svc.fit(X_train, y_train)

y_train_hat = svc.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = svc.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.863
valid score: 0.814


###  1) svc 하이퍼파라미터 튜닝 

**for문으로 탐색.** <br/>
**1-1) C, gamma 조절**

In [7]:
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]
results = []

for C in C_settings:
    for gamma in gamma_settings:
        svc = SVC(C=C, gamma=gamma, random_state=20).fit(X_train_scaled, y_train) #C, gamma 조정

        y_train_hat = svc.predict(X_train_scaled)
        y_valid_hat =svc.predict(X_valid_scaled)
        
        train_accuracy = accuracy_score(y_train, y_train_hat)
        valid_accuracy = accuracy_score(y_valid, y_valid_hat)
        

        results.append({'C': C,
                        'gamma': gamma,
                        'train_accuracy': train_accuracy,
                        'valid_accuracy': valid_accuracy})

display(pd.DataFrame(results))

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.810093,0.783957
1,0.1,0.01,0.759094,0.734225
2,0.1,0.001,0.596113,0.572727
3,0.1,0.0001,0.324358,0.324599
4,1.0,0.1,0.895506,0.824064
5,1.0,0.01,0.826498,0.788235
6,1.0,0.001,0.748039,0.724064
7,1.0,0.0001,0.603067,0.579144
8,10.0,0.1,0.921362,0.829947
9,10.0,0.01,0.87393,0.815508


**1-2) kernel 조절**

In [8]:
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
results = []

for kernel in kernels:
        svc = SVC(kernel=kernel, random_state=20).fit(X_train_scaled, y_train) #kernel 조정

        y_train_hat = svc.predict(X_train_scaled)
        y_valid_hat =svc.predict(X_valid_scaled)
        
        train_accuracy = accuracy_score(y_train, y_train_hat)
        valid_accuracy = accuracy_score(y_valid, y_valid_hat)
        

        results.append({'kernel': kernel,
                        'train_accuracy': train_accuracy,
                        'valid_accuracy': valid_accuracy})

display(pd.DataFrame(results))

Unnamed: 0,kernel,train_accuracy,valid_accuracy
0,linear,0.808131,0.770588
1,rbf,0.864836,0.813904
2,poly,0.882668,0.817647
3,sigmoid,0.62607,0.616043


**1-3) kernel = poly로 정하고 C랑 gamma 다시 튜닝**

In [9]:
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]
results = []

for C in C_settings:
    for gamma in gamma_settings:
        svc = SVC(C=C, gamma=gamma, kernel='poly', random_state=20).fit(X_train_scaled, y_train)

        y_train_hat = svc.predict(X_train_scaled)
        y_valid_hat =svc.predict(X_valid_scaled)
        
        train_accuracy = accuracy_score(y_train, y_train_hat)
        valid_accuracy = accuracy_score(y_valid, y_valid_hat)
        

        results.append({'C': C,
                        'gamma': gamma,
                        'train_accuracy': train_accuracy,
                        'valid_accuracy': valid_accuracy})

display(pd.DataFrame(results))

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.901213,0.818182
1,0.1,0.01,0.324358,0.324599
2,0.1,0.001,0.324358,0.324599
3,0.1,0.0001,0.324358,0.324599
4,1.0,0.1,0.918688,0.826738
5,1.0,0.01,0.729494,0.711765
6,1.0,0.001,0.324358,0.324599
7,1.0,0.0001,0.324358,0.324599
8,10.0,0.1,0.921719,0.830481
9,10.0,0.01,0.846291,0.798396


**최적 hyperparameter:  kernel = poly, C = 50, gamma=0.1**

### 2) 최적의 hyperparameter를 적용한 SVC model 및 평가

In [10]:
#모델 적용
best_svc = SVC(kernel='poly', C=50, gamma=0.1).fit(X_train_scaled, y_train)

#모델 평가
y_train_hat = best_svc.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_svc.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.922
valid score: 0.830


In [11]:
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.92      0.93      0.92       152
           1       0.67      0.73      0.70       162
           2       0.58      0.64      0.61        89
           3       0.62      0.49      0.55        51
           4       0.83      0.81      0.82       607
           5       0.95      0.95      0.95       464
           6       0.76      0.84      0.80       194
           7       0.84      0.78      0.81       151

    accuracy                           0.83      1870
   macro avg       0.77      0.77      0.77      1870
weighted avg       0.83      0.83      0.83      1870



## 2. 나이브베이즈

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_train_hat = nb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = nb.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.683
valid score: 0.660


###  1) naive baise 하이퍼파라미터 튜닝 

**이후 gird search으로 탐색**

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0],
    'fit_prior': [True, False]
    }

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터와 최고의 점수
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'alpha': 0.01, 'fit_prior': True}
Best score: 0.6813487533249596


### 2) 최적의 hyperparameter를 적용한 Naive Baise model 및 평가

In [14]:
#모델 적용
best_nb = MultinomialNB(alpha=grid_search.best_params_['alpha'], fit_prior=grid_search.best_params_['fit_prior'])
best_nb.fit(X_train, y_train)

#모델 평가
y_train_hat = best_nb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_nb.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.689
valid score: 0.672


In [15]:
#상세보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       152
           1       0.68      0.57      0.62       162
           2       0.70      0.08      0.14        89
           3       0.00      0.00      0.00        51
           4       0.58      0.68      0.62       607
           5       0.74      0.90      0.81       464
           6       0.65      0.87      0.74       194
           7       0.76      0.17      0.28       151

    accuracy                           0.67      1870
   macro avg       0.63      0.52      0.51      1870
weighted avg       0.66      0.67      0.64      1870



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 3. RandomForest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf = RandomForestClassifier().fit(X_train, y_train)

y_train_hat = rf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = rf.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.923
valid score: 0.830


### 1) random forest 하이퍼파라미터 튜닝

**gird search로 조정**

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=0, n_jobs=-1)

grid_search.fit(X_train, y_train)

# 최적의 파라미터와 최고의 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
139 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParame

Best parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best score: 0.8393369060256038


### 2) 최적의 hyperparameter를 적용한 RandomForest model 및 평가

In [18]:
#모델 적용
best_rf = RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_features=grid_search.best_params_['max_features'],
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_split=grid_search.best_params_['min_samples_split'],
    min_samples_leaf=grid_search.best_params_['min_samples_leaf'])
best_rf.fit(X_train, y_train)

#모델 평가
y_train_hat = best_rf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_rf.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.917
valid score: 0.832


In [19]:
#상세 보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       152
           1       0.71      0.73      0.72       162
           2       0.72      0.56      0.63        89
           3       0.68      0.49      0.57        51
           4       0.77      0.86      0.81       607
           5       0.97      0.92      0.95       464
           6       0.79      0.86      0.82       194
           7       0.90      0.72      0.80       151

    accuracy                           0.83      1870
   macro avg       0.81      0.76      0.78      1870
weighted avg       0.84      0.83      0.83      1870



## 4. XGBoost

In [20]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [21]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xg.fit(X_train, y_train)

y_train_hat = xg.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = xg.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.922
valid score: 0.827


### 1) XGBoost 하이퍼파라미터 튜닝

**gird search로 조정**

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xg= xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

grid_search = GridSearchCV(estimator=xg, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터와 최고의 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
Best score: 0.8375539933372055


### 2) 최적의 hyperparameter를 적용한 XGBoost model 및 평가

In [23]:
#모델 적용                                                                                                                                                                                                                        4                                                   
best_xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',
    max_depth=grid_search.best_params_['max_depth'],
    learning_rate=grid_search.best_params_['learning_rate'],
    n_estimators=grid_search.best_params_['n_estimators'],
    subsample=grid_search.best_params_['subsample'],
    colsample_bytree=grid_search.best_params_['colsample_bytree'])
best_xg.fit(X_train, y_train)

#모델 평가
y_train_hat = best_xg.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_xg.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.910
valid score: 0.824


In [24]:
    #상세 보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       152
           1       0.71      0.73      0.72       162
           2       0.70      0.56      0.62        89
           3       0.69      0.47      0.56        51
           4       0.78      0.83      0.81       607
           5       0.95      0.92      0.94       464
           6       0.77      0.86      0.81       194
           7       0.82      0.74      0.78       151

    accuracy                           0.82      1870
   macro avg       0.79      0.75      0.77      1870
weighted avg       0.82      0.82      0.82      1870



## 5. CatBoost

In [25]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [26]:
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report

cat = CatBoostClassifier(loss_function='MultiClass', verbose=0)
cat.fit(X_train, y_train, eval_set=(X_valid, y_valid))

y_train_hat = cat.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = cat.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.897
valid score: 0.822


### 1) CatBoost 하이퍼파라미터 튜닝

**gird search로 조정**

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'iterations': [500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5]
}
cat = CatBoostClassifier(early_stopping_rounds=50, verbose=False)
grid_search = GridSearchCV(cat, param_grid, cv=3, scoring='accuracy')

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'depth': 10, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
Best score: 0.84


### 2) 최적의 hyperparameter를 적용한 CatBoost model 및 평가

In [28]:
#모델 적용
best_cat = CatBoostClassifier(verbose=0,
    iterations=grid_search.best_params_['iterations'],
    learning_rate=grid_search.best_params_['learning_rate'],
    depth=grid_search.best_params_['depth'],
    l2_leaf_reg=grid_search.best_params_['l2_leaf_reg'])
best_cat.fit(X_train, y_train)

#모델 평가
y_train_hat = best_cat.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_cat.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.896
valid score: 0.828


In [29]:
#상세 보고서
print(metrics.classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.94      0.89      0.92       152
           1       0.70      0.73      0.72       162
           2       0.74      0.57      0.65        89
           3       0.63      0.47      0.54        51
           4       0.78      0.85      0.81       607
           5       0.94      0.92      0.93       464
           6       0.79      0.86      0.82       194
           7       0.88      0.74      0.80       151

    accuracy                           0.83      1870
   macro avg       0.80      0.75      0.77      1870
weighted avg       0.83      0.83      0.83      1870



# III. 최종 모델 선정

각모델의 valid score<br/><br/>

svc = 0.830<br/>
나이브베이즈 = 0.672<br/>
랜덤포레스트 = 0.830<br/>
XGBoost = 0.824<br/>
catBoost = 0.828 

**나이브 베이즈를 제외한 4가지 모델의 성능이0.01정도의 차이를 보이므로 cross validuation으로 재평가 (fold=5)**

In [30]:
from sklearn.model_selection import train_test_split

#train: test = 8: 2로 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [32]:
from sklearn.model_selection import cross_val_score

#모델별 5개의 fold로 교차검증
svc_cv_score = cross_val_score(best_svc, X_train, y_train, cv=5) 
print("svc model cross validation score: %.3f" %svc_cv_score.mean())

rf_cv_score = cross_val_score(best_rf, X_train, y_train, cv=5) 
print("rf model cross validation score: %.3f" %rf_cv_score.mean())

xg_cv_score = cross_val_score(best_xg, X_train, y_train, cv=5) 
print("XGboost model cross validation score: %.3f" %xg_cv_score.mean())

cat_cv_score = cross_val_score(best_cat, X_train, y_train, cv=5) 
print("CatBoost model cross validation score: %.3f" %cat_cv_score.mean())

svc model cross validation score: 0.762
rf model cross validation score: 0.838
XGboost model cross validation score: 0.838
CatBoost model cross validation score: 0.841


**['depth': 10, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.05]일때의 catBoost 모델이 성능이 가장 높은 것을 알 수 있다.**

### best model 생성 및 최종 평가

In [36]:
#bestmodel 생성
best_model = CatBoostClassifier(depth=10, iterations=500, l2_leaf_reg=3, learning_rate=0.05)
best_model.fit(X_train, y_train)

0:	learn: 1.9208144	total: 1.23s	remaining: 10m 13s
1:	learn: 1.7957177	total: 2.51s	remaining: 10m 24s
2:	learn: 1.6882446	total: 3.68s	remaining: 10m 9s
3:	learn: 1.5955640	total: 4.84s	remaining: 10m
4:	learn: 1.5209007	total: 6.09s	remaining: 10m 3s
5:	learn: 1.4512460	total: 7.44s	remaining: 10m 12s
6:	learn: 1.3906001	total: 8.63s	remaining: 10m 7s
7:	learn: 1.3359669	total: 9.92s	remaining: 10m 9s
8:	learn: 1.2836768	total: 11.2s	remaining: 10m 9s
9:	learn: 1.2387172	total: 12.5s	remaining: 10m 14s
10:	learn: 1.1978746	total: 14.1s	remaining: 10m 28s
11:	learn: 1.1603640	total: 15.5s	remaining: 10m 28s
12:	learn: 1.1250483	total: 16.9s	remaining: 10m 32s
13:	learn: 1.0946970	total: 18.1s	remaining: 10m 26s
14:	learn: 1.0640530	total: 19.2s	remaining: 10m 21s
15:	learn: 1.0366113	total: 20.4s	remaining: 10m 17s
16:	learn: 1.0109435	total: 21.7s	remaining: 10m 16s
17:	learn: 0.9862320	total: 23s	remaining: 10m 15s
18:	learn: 0.9642834	total: 24.4s	remaining: 10m 16s
19:	learn: 0.9

155:	learn: 0.4486363	total: 3m 47s	remaining: 8m 20s
156:	learn: 0.4478823	total: 3m 48s	remaining: 8m 19s
157:	learn: 0.4472611	total: 3m 49s	remaining: 8m 17s
158:	learn: 0.4465636	total: 3m 51s	remaining: 8m 16s
159:	learn: 0.4462417	total: 3m 53s	remaining: 8m 16s
160:	learn: 0.4455467	total: 3m 55s	remaining: 8m 16s
161:	learn: 0.4447235	total: 3m 57s	remaining: 8m 15s
162:	learn: 0.4441007	total: 3m 59s	remaining: 8m 14s
163:	learn: 0.4436340	total: 4m 1s	remaining: 8m 13s
164:	learn: 0.4431906	total: 4m 2s	remaining: 8m 12s
165:	learn: 0.4426075	total: 4m 4s	remaining: 8m 11s
166:	learn: 0.4417544	total: 4m 5s	remaining: 8m 10s
167:	learn: 0.4410355	total: 4m 7s	remaining: 8m 8s
168:	learn: 0.4402824	total: 4m 8s	remaining: 8m 7s
169:	learn: 0.4399454	total: 4m 10s	remaining: 8m 6s
170:	learn: 0.4390238	total: 4m 11s	remaining: 8m 4s
171:	learn: 0.4383416	total: 4m 13s	remaining: 8m 3s
172:	learn: 0.4377242	total: 4m 15s	remaining: 8m 2s
173:	learn: 0.4372289	total: 4m 16s	rema

308:	learn: 0.3881452	total: 7m 39s	remaining: 4m 44s
309:	learn: 0.3878622	total: 7m 41s	remaining: 4m 42s
310:	learn: 0.3874028	total: 7m 42s	remaining: 4m 41s
311:	learn: 0.3872639	total: 7m 43s	remaining: 4m 39s
312:	learn: 0.3870422	total: 7m 45s	remaining: 4m 37s
313:	learn: 0.3868254	total: 7m 46s	remaining: 4m 36s
314:	learn: 0.3865123	total: 7m 48s	remaining: 4m 34s
315:	learn: 0.3863796	total: 7m 49s	remaining: 4m 33s
316:	learn: 0.3861773	total: 7m 50s	remaining: 4m 31s
317:	learn: 0.3857624	total: 7m 52s	remaining: 4m 30s
318:	learn: 0.3853348	total: 7m 53s	remaining: 4m 28s
319:	learn: 0.3851012	total: 7m 55s	remaining: 4m 27s
320:	learn: 0.3846540	total: 7m 56s	remaining: 4m 25s
321:	learn: 0.3845747	total: 7m 58s	remaining: 4m 24s
322:	learn: 0.3841985	total: 7m 59s	remaining: 4m 22s
323:	learn: 0.3841222	total: 8m 1s	remaining: 4m 21s
324:	learn: 0.3839803	total: 8m 2s	remaining: 4m 19s
325:	learn: 0.3836035	total: 8m 3s	remaining: 4m 18s
326:	learn: 0.3834838	total: 8m

461:	learn: 0.3547970	total: 11m 7s	remaining: 54.9s
462:	learn: 0.3545668	total: 11m 8s	remaining: 53.4s
463:	learn: 0.3544337	total: 11m 10s	remaining: 52s
464:	learn: 0.3542948	total: 11m 11s	remaining: 50.6s
465:	learn: 0.3541478	total: 11m 13s	remaining: 49.1s
466:	learn: 0.3540503	total: 11m 14s	remaining: 47.7s
467:	learn: 0.3539083	total: 11m 15s	remaining: 46.2s
468:	learn: 0.3535783	total: 11m 17s	remaining: 44.8s
469:	learn: 0.3533562	total: 11m 18s	remaining: 43.3s
470:	learn: 0.3530115	total: 11m 19s	remaining: 41.9s
471:	learn: 0.3528997	total: 11m 21s	remaining: 40.4s
472:	learn: 0.3528435	total: 11m 22s	remaining: 39s
473:	learn: 0.3526643	total: 11m 23s	remaining: 37.5s
474:	learn: 0.3523901	total: 11m 25s	remaining: 36.1s
475:	learn: 0.3521556	total: 11m 26s	remaining: 34.6s
476:	learn: 0.3518065	total: 11m 28s	remaining: 33.2s
477:	learn: 0.3515334	total: 11m 29s	remaining: 31.7s
478:	learn: 0.3512122	total: 11m 31s	remaining: 30.3s
479:	learn: 0.3510789	total: 11m 3

<catboost.core.CatBoostClassifier at 0x26d58616a90>

### 최종 test score

In [37]:
y_train_hat = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_test_hat = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_hat)

print("train score: %.3f" %train_accuracy)
print("test score: %.3f" %test_accuracy)

train score: 0.890
test score: 0.846


**그러나 catBoost 모델의 경우 계산 비용이 너무커서 random forest 모델을 쓰는 것이 좋을 수도 있겠다.**