<a href="https://colab.research.google.com/github/chasubeen/python_selfstudy/blob/master/5_2.%EA%B5%90%EC%B0%A8%20%EA%B2%80%EC%A6%9D%EA%B3%BC%20%EA%B7%B8%EB%A6%AC%EB%93%9C%20%EC%84%9C%EC%B9%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **검증 세트**

In [None]:
# Load the wine dataset (CSV fetched over HTTP) into a DataFrame.
import pandas as pd

wine = pd.read_csv("https://bit.ly/wine_csv_data")

In [None]:
# Show the loaded DataFrame; the notebook renders the value of a cell's last expression.
wine

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.20,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0
...,...,...,...,...
6492,11.2,1.6,3.27,1.0
6493,9.6,8.0,3.15,1.0
6494,9.4,1.2,2.99,1.0
6495,12.8,1.1,3.34,1.0


In [None]:
# Feature matrix: the alcohol, sugar and pH columns as a NumPy array.
data = wine[["alcohol", "sugar", "pH"]].to_numpy()
# Label vector: the class column as a NumPy array.
target = wine["class"].to_numpy()

In [None]:
# Split the data into a training set and a test set (20% held out for testing).
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Split the training set once more: 80% stays as the sub-training set and
# 20% becomes the validation set.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Check the sizes of the sub-training set and the validation set.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [None]:
# Build a decision tree, fit it on the sub-training set, and compare its
# accuracy on the data it was fitted to against the validation set.
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))  # sub-training set score
print(dt.score(val_input, val_target))  # validation set score

# Conclusion: the sub-training score is far higher than the validation
# score, i.e. this model is overfit to the training data.

0.9971133028626413
0.864423076923077


### **교차 검증**

In [None]:
# Cross-validate the tree on the whole training set. cross_validate returns a
# dict with per-fold 'fit_time', 'score_time' and 'test_score' arrays.
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_input, train_target)
print(scores)

{'fit_time': array([0.01241183, 0.01109576, 0.01117349, 0.01079774, 0.01037288]), 'score_time': array([0.00152349, 0.00146508, 0.00140882, 0.00131464, 0.0013721 ]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [None]:
# The final cross-validation score is the average of the per-fold scores.
import numpy as np

print(np.mean(scores["test_score"]))

0.855300214703487


In [None]:
# Run the cross-validation again, this time passing a StratifiedKFold
# splitter (default settings) explicitly through the cv argument.
from sklearn.model_selection import StratifiedKFold

scores = cross_validate(
    dt,
    train_input,
    train_target,
    cv=StratifiedKFold(),
)
print(np.mean(scores["test_score"]))

0.855300214703487


In [None]:
# 10-fold cross-validation with the training set shuffled before splitting;
# random_state makes the shuffle reproducible.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores["test_score"]))

0.8574181117533719


### **하이퍼 파라미터 튜닝**

In [None]:
from sklearn.model_selection import GridSearchCV 
# Search space for the grid search: a dict mapping each parameter name to
# the list of candidate values to try.
params = {
    "min_impurity_decrease": [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

In [None]:
# Create the grid search object.
# n_jobs sets how many CPU cores are used for parallel execution;
# n_jobs=-1 means "use every core available on the system".
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training set (the fitted object is displayed).
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [None]:
# best_estimator_ is the attribute that holds the best-performing model
# found by the search; score it on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [None]:
# Best parameter combination found by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [None]:
# Mean cross-validation score for each candidate parameter value, in the
# same order as the candidates were listed.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [None]:
# Recover the best parameters by hand: locate the highest mean validation
# score and look up the parameter dict stored at the same position.
best_index = np.argmax(gs.cv_results_["mean_test_score"])
print(gs.cv_results_["params"][best_index])

{'min_impurity_decrease': 0.0001}


**과정 정리**
1. 탐색할 매개변수 지정
2. 훈련 세트에서 그리드 서치를 수행하여 최상의 평균 검증 점수가 나오는 매개변수 조합 찾기 -> 그리드 서치 객체에 저장됨
3. 그리드 서치는 최상의 매개변수에서 전체 훈련 세트를 사용해 최종 모델을 훈련함 -> 그리드 서치 객체에 저장됨


In [None]:
# Larger search space combining three parameters:
#   min_impurity_decrease: from 0.0001 up to (not including) 0.001, step 0.0001
#   max_depth:             integers 5 .. 19
#   min_samples_split:     2, 12, 22, ... (step 10, below 100)
params = {
    "min_impurity_decrease": np.arange(0.0001, 0.001, 0.0001),
    "max_depth": range(5, 20),
    "min_samples_split": range(2, 100, 10),
}

In [None]:
# Grid search over every combination in params, using all CPU cores.
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [None]:
# Show the best parameter combination found by the search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [None]:
# Highest mean cross-validation score among all candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


### **랜덤 서치**

In [None]:
from scipy.stats import uniform,randint

In [None]:
# Integer random-number generator: randint(0, 10) draws from 0 .. 9.
rgen = randint(0, 10)
# Sample ten numbers from it.
rgen.rvs(10)

array([3, 5, 2, 2, 3, 4, 9, 9, 2, 1])

In [None]:
# Draw 1000 samples and count how often each distinct value occurs.
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([106,  96, 103,  92,  87, 108, 107,  80, 124,  97]))

In [None]:
# Real-valued generator: uniform(0, 1) draws floats between 0 and 1.
ugen = uniform(0, 1)
# Sample ten numbers from it.
ugen.rvs(10)

array([0.05328966, 0.57622324, 0.28015169, 0.79245741, 0.24769538,
       0.62991564, 0.59154752, 0.28784718, 0.95563939, 0.70420869])

In [None]:
# Parameter distributions for the random search.
#
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], so
# uniform(0.0001, 0.001) draws min_impurity_decrease from [0.0001, 0.0011],
# the region the grid searches above explored. The earlier
# uniform(0.001, 0.001) sampled only [0.001, 0.002] and therefore could
# never try the small values (0.0001, 0.0004) those grid searches selected.
#
# randint(low, high) draws integers from low up to high - 1.
#
# NOTE: the outputs recorded below this cell were produced with the old
# range; re-run the following cells to refresh them.
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

In [None]:
# Random search: draw 100 parameter combinations from the distributions in
# params and cross-validate each one, using all CPU cores. random_state
# fixes the sampling so the search is reproducible.
from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8055ada110>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f80558dc9d0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8055bcc810>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8055ada7d0>},
                   random_state=42)

In [None]:
# Print the best parameter combination found by the random search.
print(gs.best_params_)

{'max_depth': 45, 'min_impurity_decrease': 0.001025419126744095, 'min_samples_leaf': 13, 'min_samples_split': 20}


In [None]:
# Check the highest mean cross-validation score.
print(np.max(gs.cv_results_['mean_test_score']))

0.8647301399274451


In [None]:
# Adopt the best model from the search as the final model and measure its
# performance on the held-out test set.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

0.8561538461538462
