# 모델 셀렉션 모듈

### 교차 검증

In [1]:
from sklearn.datasets import load_wine
# Load the wine classification dataset that ships with scikit-learn.
wine = load_wine()

In [2]:
# Shape of the feature matrix as (rows, columns).
wine.data.shape

(178, 13)

In [3]:
# Names of the target classes.
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [4]:
import numpy as np
# Count how many samples carry each class label.
unique, counts = np.unique(wine.target, return_counts=True)
# Bare expression so the notebook displays both arrays.
unique, counts

(array([0, 1, 2]), array([59, 71, 48]))

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
# Decision tree classifier with a fixed random seed.
dt_clf = DecisionTreeClassifier(random_state=2021)

In [7]:
# Evaluate the decision tree on the wine data with 5-fold cross-validation,
# using accuracy as the performance metric.
scores = cross_val_score(
    estimator=dt_clf,
    X=wine.data,
    y=wine.target,
    cv=5,
    scoring='accuracy',
)

In [8]:
# Report the accuracy of every fold and their mean, rounded to 4 decimals.
print('교차 검증별 정확도:', scores.round(4))
print('평균 검증 정확도:', scores.mean().round(4))

교차 검증별 정확도: [0.9167 0.8056 0.8056 0.9143 0.8571]
평균 검증 정확도: 0.8598


### 교차 검증과 최적 하이퍼파라미터 튜닝을 동시에 수행 - GridSearchCV

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the samples as a test set, stratified by class label.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.2,
    random_state=2021,
    stratify=wine.target,
)
# Display the shape of each of the four splits.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((142, 13), (36, 13), (142,), (36,))

In [13]:
# Fresh decision tree (same fixed seed) to serve as the grid-search base estimator.
dt_clf = DecisionTreeClassifier(random_state=2021)

In [14]:
# Candidate values for each hyperparameter; every combination is tried.
param_grid = dict(
    max_depth=[2, 4, 6],
    min_samples_leaf=[2, 4],
)

In [15]:
# Grid search over param_grid, scoring each candidate with 5-fold cross-validation.
grid_dt = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid,
    cv=5,
)

In [16]:
# Run the grid search on the training split only; the test split stays untouched.
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=2021,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [2, 4, 6], 'min_samples_leaf': [2, 4]},


In [17]:
# Results of the search: per-candidate timings, mean test scores and ranks.
grid_dt.cv_results_

{'mean_fit_time': array([0.00122776, 0.00077238, 0.00080452, 0.00079694, 0.00082641,
        0.0008132 ]),
 'mean_score_time': array([0.00034971, 0.00023561, 0.00022869, 0.00022774, 0.00022874,
        0.00023665]),
 'mean_test_score': array([0.80270936, 0.80985222, 0.85197044, 0.85935961, 0.85197044,
        0.85935961]),
 'param_max_depth': masked_array(data=[2, 2, 4, 4, 6, 6],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[2, 4, 2, 4, 2, 4],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2, 'min_samples_leaf': 2},
  {'max_depth': 2, 'min_samples_leaf': 4},
  {'max_depth': 4, 'min_samples_leaf': 2},
  {'max_depth': 4, 'min_samples_leaf': 4},
  {'max_depth': 6, 'min_samples_leaf': 2},
  {'max_depth': 6, 'min_samples_leaf': 4}],
 'rank_test_score': array([6, 5, 3, 1, 3, 1], dtype=i

In [18]:
# Best parameter combination found by the search.
grid_dt.best_params_

{'max_depth': 4, 'min_samples_leaf': 4}

In [19]:
# Best accuracy: the highest mean cross-validation score among the candidates.
grid_dt.best_score_

0.8593596059113301

In [20]:
# Classifier trained with the best-scoring parameters, evaluated on the
# held-out test split.
best_estimator = grid_dt.best_estimator_
best_estimator.score(X_test, y_test)

0.9444444444444444

- 하이퍼파라미터 튜닝은 탐색 범위를 조정해 가며 반복해서 수행해야 함

In [21]:
# Finer grid centred on the previous best (max_depth=4, min_samples_leaf=4).
param_grid = {
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [3, 4, 5],
}
# Repeat the 5-fold grid search with the current param_grid; fit() returns
# the fitted search object, so construction and fitting are chained.
grid_dt = GridSearchCV(estimator=dt_clf, param_grid=param_grid, cv=5).fit(X_train, y_train)
# Display the best parameter combination.
grid_dt.best_params_

{'max_depth': 4, 'min_samples_leaf': 3}

In [22]:
# Take the best classifier from the latest search and score it on the
# held-out test split.
best_estimator = grid_dt.best_estimator_
best_estimator.score(X_test, y_test)

0.9722222222222222