### 검증 세트

In [94]:
import pandas as pd

# Fetch the wine CSV from the short link and parse it into a DataFrame.
wine = pd.read_csv(filepath_or_buffer='http://bit.ly/wine_csv_data')

In [95]:
# Feature matrix: the three columns used as model inputs (kept 2-D).
data = wine[['alcohol', 'sugar', 'pH']]
# Label vector: single brackets give a 1-D Series. Double brackets
# (wine[['class']]) would produce an (n_samples, 1) DataFrame, while
# scikit-learn documents y as a 1-D array-like of shape (n_samples,).
target = wine['class']

In [96]:
# Hold out a test set: 75% train / 25% test. This is train_test_split's
# default ratio (not 8:2); it is spelled out so the comment and the call agree.
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.25, random_state=42
)

In [97]:
# Split the training portion once more: sub_* for fitting, val_* for validation.
(sub_input, val_input,
 sub_target, val_target) = train_test_split(train_input, train_target, random_state=42)

In [98]:
# Show (rows, columns) for the full data and the outer split, then for the inner split.
print(*(frame.shape for frame in (data, train_input, test_input)))
print(*(frame.shape for frame in (sub_input, val_input)))

(6497, 3) (4872, 3) (1625, 3)
(3654, 3) (1218, 3)


In [99]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the sub-training split only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
# Score the fitted tree `dt` on the data it was trained on and on the
# held-out validation split. The model variable is `dt`; there is no `df`
# in this notebook, so calling `df.score` fails on a fresh kernel.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9978106185002736
0.8571428571428571


### K-폴드 교차 검증

In [100]:
from sklearn.model_selection import cross_validate

# cross_validate(): helper that runs cross-validation; K-fold is its default.
# Here it performs 5 rounds (one entry per fold in each result array).
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
scores

{'fit_time': array([0.00700021, 0.00603676, 0.0060029 , 0.00901318, 0.00763392]),
 'score_time': array([0.00200129, 0.00200558, 0.00377846, 0.00200605, 0.00200105]),
 'test_score': array([0.85128205, 0.84820513, 0.8788501 , 0.85112936, 0.84394251])}

In [101]:
# Average of the per-fold validation scores.
import numpy as np
scores['test_score'].mean()

np.float64(0.8546818301479492)

In [102]:
# cross_validate does not shuffle the data by itself.
#     (train_input was already shuffled by train_test_split, though.)
# To shuffle inside cross-validation, pass a splitter with shuffle=True.
# random_state pins that shuffle, so the fold assignment and the resulting
# scores are the same on every run.
from sklearn.model_selection import StratifiedKFold
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cross_validate(dt, train_input, train_target, cv=splitter)

{'fit_time': array([0.00788665, 0.00700331, 0.00800109, 0.00799918, 0.00602078,
        0.00596499, 0.03718472, 0.00999928, 0.01099896, 0.01001096]),
 'score_time': array([0.00199938, 0.00199962, 0.00299788, 0.00203466, 0.0019815 ,
        0.00600004, 0.00400019, 0.0073278 , 0.00500083, 0.0032382 ]),
 'test_score': array([0.84631148, 0.86270492, 0.86858316, 0.85215606, 0.85626283,
        0.86858316, 0.85420945, 0.86652977, 0.87063655, 0.86447639])}

In [103]:
from sklearn.model_selection import GridSearchCV
# Candidate values to try for the tree's min_impurity_decrease setting.
params = dict(min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005])

In [104]:
# Fresh tree, wrapped in a grid search over the candidates in `params`.
dt = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(estimator=dt, param_grid=params)

In [105]:
# Run the grid search on the training portion.
gs.fit(X=train_input, y=train_target)

In [106]:
# Look up the best-performing model found by the search.
gs.best_estimator_
# 0.0003 turned out to be the best value.

In [107]:
# Display the parameter setting the grid search picked as best.
gs.best_params_

{'min_impurity_decrease': 0.0003}

In [108]:
# Display the full results dictionary of the grid search (timings, per-split scores, ...).
gs.cv_results_

{'mean_fit_time': array([0.00607333, 0.00440831, 0.006213  , 0.00379963, 0.00339961]),
 'std_fit_time': array([0.00103669, 0.00049734, 0.00270877, 0.00040004, 0.00048959]),
 'mean_score_time': array([0.00260291, 0.00179968, 0.00240049, 0.00180035, 0.00160046]),
 'std_score_time': array([0.00049242, 0.00039748, 0.00048847, 0.00039997, 0.00049004]),
 'param_min_impurity_decrease': masked_array(data=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
              mask=[False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'min_impurity_decrease': 0.0001},
  {'min_impurity_decrease': 0.0002},
  {'min_impurity_decrease': 0.0003},
  {'min_impurity_decrease': 0.0004},
  {'min_impurity_decrease': 0.0005}],
 'split0_test_score': array([0.87384615, 0.87076923, 0.87282051, 0.86461538, 0.86051282]),
 'split1_test_score': array([0.86666667, 0.86871795, 0.87794872, 0.88512821, 0.87794872]),
 'split2_test_score': array([0.88603696, 0.88295688, 0.8798768 , 0.8788501 , 0.88295688]),
 'spli

In [64]:
# Wider grid: three tree settings searched together.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [65]:
# Fresh tree and a grid search over `params`; n_jobs=-1 is passed through to GridSearchCV.
dt = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(estimator=dt, param_grid=params, n_jobs=-1)

In [66]:
# Run the wider grid search on the training portion.
gs.fit(X=train_input, y=train_target)

In [67]:
# Display the best parameter combination found by the wider grid search.
gs.best_params_

{'max_depth': 15,
 'min_impurity_decrease': np.float64(0.0001),
 'min_samples_split': 22}

In [71]:
# Display only the 'mean_test_score' entry of the search results.
gs.cv_results_['mean_test_score']

array([0.85837161, 0.85837161, 0.85837161, ..., 0.86309693, 0.86309693,
       0.86309693])

### 랜덤 서치

In [72]:
from scipy.stats import uniform, randint

In [73]:
# Quick demo: draw 10 random integers from randint(0, 10).
randint(0, 10).rvs(10)

array([2, 0, 2, 4, 2, 9, 6, 0, 3, 7])

In [77]:
# Keep the randint(0, 10) generator around, draw 1000 samples from it,
# and count how many times each distinct value appears.
rgen = randint(0, 10)
np.unique(rgen.rvs(size=1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([116, 109, 108,  91,  92, 109,  96,  90,  98,  91]))

In [85]:
# Sampling distributions for the random search: one scipy distribution per tree setting.
# NOTE(review): scipy's uniform takes (loc, scale), so the first entry
# presumably spans 0.0001..0.0011 rather than 0.0001..0.001 — confirm this is intended.
params = dict(
    min_impurity_decrease=uniform(0.0001, 0.001),
    max_depth=randint(20, 50),
    min_samples_split=randint(2, 25),
    min_samples_leaf=randint(1, 25),
)

In [86]:
from sklearn.model_selection import RandomizedSearchCV

In [87]:
# Random search: evaluate 100 parameter settings sampled from the
# distributions in `params`. random_state fixes which candidates are sampled,
# so the reported best parameters are reproducible across runs.
gs = RandomizedSearchCV(dt, params, n_iter=100, n_jobs=-1, random_state=42)
gs.fit(train_input, train_target)

In [88]:
# Display the best model found by the random search.
gs.best_estimator_

In [89]:
# Display the best parameter combination found by the random search.
gs.best_params_

{'max_depth': 46,
 'min_impurity_decrease': np.float64(0.00022671017546484532),
 'min_samples_leaf': 6,
 'min_samples_split': 24}