# 교차검증

## 3-Fold 교차검증
- 훈련 세트를 세 부분(폴드)으로 나눈 뒤, 각 폴드를 한 번씩 검증 세트로 삼아 나머지 폴드로 훈련한 모델을 평가하고, 이렇게 얻은 검증 점수들의 평균을 최종 교차 검증 점수로 사용

In [None]:
import pandas as pd

# Download the wine dataset as a DataFrame and preview its first rows.
WINE_CSV_URL = 'https://bit.ly/wine_csv_data'
wine = pd.read_csv(WINE_CSV_URL)
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [None]:
# Feature matrix: the three measurement columns as a 2-D array (one row per sample).
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
# Label vector: select the column with single brackets so the result is a 1-D
# array of shape (n_samples,), the layout scikit-learn expects for `y`.
# Double brackets would produce an (n_samples, 1) column vector instead.
target = wine['class'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the seed is fixed at 42.
split_parts = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)
train_input, test_input, train_target, test_target = split_parts

In [None]:
from sklearn.tree import DecisionTreeClassifier

# No feature normalization is needed for a decision tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_input, train_target)

# Score the fitted tree on the training split and on the held-out test split.
train_accuracy = dt.score(train_input, train_target)
test_accuracy = dt.score(test_input, test_target)
print("train's score: ", train_accuracy)
print("test's score: ", test_accuracy)

train's score:  0.996921300750433
test's score:  0.8584615384615385


In [None]:
from sklearn.model_selection import cross_validate
import numpy as np

# cross_validate performs k-fold cross-validation on the training data itself.
# It returns a dict with one array per key, holding one entry per fold:
#   fit_time   - time spent training the model
#   score_time - time spent scoring on the validation fold
#   test_score - validation score (5 folds are used when `cv` is not given)
# The data was already shuffled by train_test_split above. If that step is
# skipped, pass a splitter through the `cv` argument instead: KFold for a
# regressor, StratifiedKFold for a classifier. The number of folds is the
# splitter's `n_splits` argument.
scores = cross_validate(dt, train_input, train_target)

for metric_name, per_fold_values in scores.items():
    print(metric_name, per_fold_values)

mean_validation_score = np.mean(scores['test_score'])
print("cross validation's score: ", mean_validation_score)

fit_time [0.00868511 0.00714016 0.00770807 0.00741148 0.0070653 ]
score_time [0.00080895 0.00077629 0.00077701 0.00071144 0.00075412]
test_score [0.86923077 0.84615385 0.87680462 0.84889317 0.83541867]
cross validation's score:  0.855300214703487


# GridSearch

- 하이퍼파라미터 탐색과 교차 검증을 한 번에 수행

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values to try for the tree's min_impurity_decrease setting.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

# n_jobs=-1 uses every available CPU core.
base_tree = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(estimator=base_tree, param_grid=params, n_jobs=-1)
gs.fit(train_input, train_target)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
    

In [None]:
# Show the estimator carrying the best hyperparameters found by the search above.
best_tree = gs.best_estimator_
print(best_tree)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0001, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')


In [None]:
# Take the best estimator from the search and score it on the training split.
dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.9615162593804117


In [None]:
# Check which parameter setting was selected as the best one.
winning_params = gs.best_params_
print(winning_params)

{'min_impurity_decrease': 0.0001}


In [None]:
# Mean cross-validation score obtained for each candidate parameter setting.
cv_results = gs.cv_results_
mean_scores = cv_results['mean_test_score']
best_index = np.argmax(mean_scores)

print(cv_results)
print(mean_scores)
print(best_index)
# Look up the parameter dict at the position of the highest mean score.
print(cv_results['params'][best_index])

{'mean_fit_time': array([0.0086668 , 0.00768681, 0.00704589, 0.00679541, 0.00635467]), 'std_fit_time': array([0.00087049, 0.00018591, 0.00010633, 0.00044149, 0.00020597]), 'mean_score_time': array([0.00095606, 0.00088611, 0.00084596, 0.00082707, 0.00081096]), 'std_score_time': array([5.59098431e-05, 4.10145377e-05, 5.23497711e-05, 2.62303782e-05,
       1.01399240e-04]), 'param_min_impurity_decrease': masked_array(data=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'min_impurity_decrease': 0.0001}, {'min_impurity_decrease': 0.0002}, {'min_impurity_decrease': 0.0003}, {'min_impurity_decrease': 0.0004}, {'min_impurity_decrease': 0.0005}], 'split0_test_score': array([0.86923077, 0.87115385, 0.86923077, 0.86923077, 0.86538462]), 'split1_test_score': array([0.86826923, 0.86346154, 0.85961538, 0.86346154, 0.86923077]), 'split2_test_score': array([0.8825794 , 0.87680462, 0.87584216,

# 랜덤서치
- 매개변수 값이 수치일 때, 값의 범위나 간격을 미리 정하기 어려운 경우에 사용
- 매개변수를 샘플링할 수 있는 확률 분포 객체를 전달

In [21]:
# uniform draws real numbers and randint draws integers, both evenly spread.
from scipy.stats import uniform, randint

# Draw 1000 integers and count how often each distinct value appears.
rgen = randint(low=0, high=10)
integer_draws = rgen.rvs(size=1000)
value_counts = np.unique(integer_draws, return_counts=True)
print(value_counts)

# Draw 10 real numbers.
ugen = uniform(loc=0, scale=1)
real_draws = ugen.rvs(size=10)
print(real_draws)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([ 99,  98, 106, 118, 100,  95,  94,  80, 104, 106]))
[0.5147345  0.46595264 0.33209207 0.64346755 0.06084647 0.25502655
 0.38774294 0.91250001 0.26461495 0.21150012]


In [22]:
from sklearn.model_selection import RandomizedSearchCV

# Distributions to sample from, instead of fixed candidate lists.
# scipy's uniform takes (loc, scale), so min_impurity_decrease is drawn from
# [0.0001, 0.0011]; randint(20, 50) draws integers from 20 up to 49.
params = {
    'min_impurity_decrease': uniform(loc=0.0001, scale=0.001),
    'max_depth': randint(low=20, high=50),
}

# n_iter=100: sample 100 parameter combinations, cross-validate each one,
# and keep the best-scoring combination.
base_tree = DecisionTreeClassifier(random_state=42)
gs = RandomizedSearchCV(
    estimator=base_tree,
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=42,
         

In [23]:
# Parameter combination selected as best by the randomized search.
sampled_best_params = gs.best_params_
print(sampled_best_params)

{'max_depth': 29, 'min_impurity_decrease': 0.000437615171403628}


In [24]:
# Highest mean cross-validation score among all sampled candidates.
mean_scores = gs.cv_results_['mean_test_score']
print(np.max(mean_scores))

0.8689635004071962


In [26]:
# Final check: score the best estimator from the search on the held-out test split.
dt = gs.best_estimator_
test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.8638461538461538
