<a href="https://colab.research.google.com/github/chasubeen/python_selfstudy/blob/master/%E1%84%92%E1%85%A9%E1%86%AB%E1%84%8C%E1%85%A1%20%E1%84%80%E1%85%A9%E1%86%BC%E1%84%87%E1%85%AE%E1%84%92%E1%85%A1%E1%84%82%E1%85%B3%E1%86%AB%20%E1%84%86%E1%85%A5%E1%84%89%E1%85%B5%E1%86%AB%E1%84%85%E1%85%A5%E1%84%82%E1%85%B5%E1%86%BC%2B%E1%84%83%E1%85%B5%E1%86%B8%E1%84%85%E1%85%A5%E1%84%82%E1%85%B5%E1%86%BC%20%E1%84%89%E1%85%B5%E1%86%AF%E1%84%89%E1%85%B3%E1%86%B8/5-2.%20%EA%B5%90%EC%B0%A8%20%EA%B2%80%EC%A6%9D%EA%B3%BC%20%EA%B7%B8%EB%A6%AC%EB%93%9C%20%EC%84%9C%EC%B9%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###**검증 세트(validation set)**

In [1]:
import pandas as pd
wine = pd.read_csv('https://bit.ly/wine_csv_data')

In [2]:
data = wine[['alcohol','sugar','pH']].to_numpy()
target = wine['class'].to_numpy()

In [3]:
# 훈련 세트와 테스트 세트 나누기
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size = 0.2, random_state = 42
)

In [4]:
# 훈련 세트와 검증 세트 나누기(훈련 세트에서 분리)
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size = 0.2, random_state = 42
)

In [5]:
# 훈련 세트와 검증 세트의 크기 확인하기
print(sub_input.shape,val_input.shape)

(4157, 3) (1040, 3)


In [6]:
# 모델 생성, 훈련, 평가
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state = 42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


###**교차 검증**

In [7]:
# 5 - 폴드 교차 검증 시행하기
from sklearn.model_selection import cross_validate
scores = cross_validate(dt,train_input, train_target)
print(scores)

{'fit_time': array([0.01272297, 0.01032472, 0.01121831, 0.01070595, 0.0101099 ]), 'score_time': array([0.00142932, 0.00125241, 0.00127864, 0.00124168, 0.00123334]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [8]:
# 검증 폴드의 점수 계산
import numpy as np
print(np.mean(scores['test_score']))

0.855300214703487


In [9]:
# 분할기를 명시적으로 지정해주기
# 분류 문제 -> StratifiedKFold
from sklearn.model_selection import StratifiedKFold
scores = cross_validate(dt,train_input, train_target, cv = StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [10]:
# 훈련 세트를 섞고 10 - 폴드 교차 검증 수행하기
splitter = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
scores = cross_validate(dt,train_input, train_target, cv = splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


###**하이퍼파라미터 튜닝**

In [11]:
# 매개변수 최적값 찾기
from sklearn.model_selection import GridSearchCV
params = {'min_impurity_decrease' : [0.0001,0.0002,0.0003,0.0004,0.0005]} # 매개변수

In [15]:
# 그리드 서치 객체 만들기
gs = GridSearchCV(DecisionTreeClassifier(random_state = 42), params, n_jobs = -1)

In [16]:
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [17]:
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [18]:
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [20]:
# 5번의 교차 검증으로 얻은 점수 평균
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [21]:
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [22]:
# 복잡한 매개변수 조합 탐색하기
# min_impurity_decrease : 노드를 분할하기 위한 불순도 감소 최소량
# max_depth : 트리 깊이 제한
# min_samples_split : 노드를 나누기 위한 최소 샘플 수

params = {'min_impurity_decrease' : np.arange(0.0001, 0.001, 0.0001),
          'max_depth' : range(5,20,1),
          'min_samples_split' : range(2,100,10)
          }

In [23]:
# 그리드 서치 수행하기
gs = GridSearchCV(DecisionTreeClassifier(random_state = 42), params, n_jobs = -1)
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [24]:
# 최상의 매개변수 조합 확인하기
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [25]:
# 최상의 교차 검증 점수 확인하기
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


**랜덤 서치**

In [26]:
from scipy.stats import uniform, randint

In [27]:
# 매개변수 탐색하기
params = {'min_impurity_decrease' : uniform(0.0001,0.001),
          'max_depth' : randint(20,50),
          'min_samples_split' : randint(2,25),
          'min_samples_leaf' : randint(1,25)}

In [29]:
# 정의된 매개변수 범위에서 샘플링하여 교차 검증 수행
from sklearn.model_selection import RandomizedSearchCV
gs = RandomizedSearchCV(DecisionTreeClassifier(random_state = 42),params, n_iter = 100, n_jobs = -1, random_state = 42)
gs.fit(train_input, train_target)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f553d362610>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f553d3626d0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f553d362910>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f553d362710>},
                   random_state=42)

In [30]:
# 최적의 매개변수 조합
print(gs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173, 'min_samples_leaf': 7, 'min_samples_split': 13}


In [31]:
# 최고의 교차 검증 점수 확인
print(np.max(gs.cv_results_['mean_test_score']))

0.8695428296438884


In [32]:
# 최적 모델로 테스트 세트의 성능 확인하기
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

0.86
