<a href="https://colab.research.google.com/github/choisungmin123/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [1]:
import pandas as pd

# Read the wine CSV from a remote URL into a DataFrame.
wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [4]:
# Preview the first five rows of wine.
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [6]:
# Total number of rows in wine.
print(len(wine))

6497


In [8]:
# Summary statistics for each column (count, mean, std, min, quartiles, max).
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [9]:
# Number of rows per class label (white wine vs. red wine).
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [10]:
# The 'class' column is the label; alcohol, sugar and pH are the input features.
target = wine['class'].to_numpy()
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Split the training set again: 20% becomes the validation set, the rest the sub-training set.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Shapes of the sub-training and validation feature arrays.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [46]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the sub-training split; the seed is fixed so results repeat.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Accuracy on the data the tree was fitted on, then on the validation split.
for inputs, targets in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(inputs, targets))

0.9971133028626413
0.864423076923077


## 교차 검증

In [15]:
from sklearn.model_selection import cross_validate

# Cross-validate dt on the whole training set; the result dict holds per-fold
# fit times, score times and test scores.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.01272297, 0.00874186, 0.00944948, 0.01125169, 0.00866961]), 'score_time': array([0.00196815, 0.00136256, 0.00153136, 0.00179124, 0.0013299 ]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [16]:
import numpy as np

# Average the per-fold test scores into a single cross-validation score.
print(scores['test_score'].mean())

0.855300214703487


In [17]:
from sklearn.model_selection import StratifiedKFold

# Run cross-validation with an explicitly supplied StratifiedKFold splitter
# and report the mean of the fold scores.
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=StratifiedKFold())
print(scores['test_score'].mean())

0.855300214703487


In [18]:
# 10-fold stratified cross-validation with shuffling; the seed fixes the shuffle order.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
print(scores['test_score'].mean())

0.8574181117533719


## 하이퍼파라미터 튜닝

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate min_impurity_decrease values 0.0001 .. 0.0005 for the grid search.
# Dividing an integer by 10000 is correctly rounded, so each entry is the same
# float as the corresponding decimal literal.
params = {'min_impurity_decrease': [k / 10000 for k in range(1, 6)]}

In [20]:
# Grid search over `params` for a seeded decision tree, using all CPU cores (n_jobs=-1).
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [21]:
# Run the grid search on the full training set.
gs.fit(train_input, train_target)

In [22]:
# Take the best model found by the grid search and score it on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [23]:
# Parameter combination selected by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [24]:
# Mean cross-validation score for each of the five candidate values.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [25]:
# Locate the best candidate by hand: position of the highest mean CV score,
# then look up the parameters stored at that position.
mean_scores = gs.cv_results_['mean_test_score']
best_index = mean_scores.argmax()
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [26]:
# Search grid for the decision tree: 9 x 15 x 10 = 1350 parameter combinations.
params = {
    # 0.0001 .. 0.0009 as exactly 9 evenly spaced values. np.linspace fixes the
    # number of points explicitly, whereas np.arange with a float step can gain
    # or lose the final element through rounding.
    'min_impurity_decrease': np.linspace(0.0001, 0.0009, 9),
    'max_depth': range(5, 20, 1),            # 5, 6, ..., 19
    'min_samples_split': range(2, 100, 10),  # 2, 12, ..., 92
}

In [27]:
# Grid search over the three-parameter grid, fitted on the full training set.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

In [28]:
# Best combination found across the three-parameter grid.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [29]:
# Highest mean cross-validation score among all grid candidates.
print(gs.cv_results_['mean_test_score'].max())

0.8683865773302731


In [30]:
# Show the mean fit times recorded by the grid search's cross-validation.
gs.cv_results_['mean_fit_time']

array([0.00756269, 0.00822544, 0.00908861, ..., 0.01293764, 0.01453285,
       0.01492162])

### 랜덤 서치

In [31]:
from scipy.stats import uniform, randint

In [32]:
# Sample from a discrete uniform distribution over the integers 0..9
# (randint's upper bound, 10, is exclusive).
rgen = randint(0, 10)
# Seed the draw so Restart & Run All reproduces the same ten samples.
rgen.rvs(10, random_state=42)

array([2, 6, 0, 7, 8, 6, 4, 8, 2, 2])

In [33]:
# Draw 1000 seeded samples and return the distinct values together with how
# often each one occurred; the seed makes the counts reproducible.
np.unique(rgen.rvs(1000, random_state=42), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 89,  99, 107,  93,  96, 110,  84, 121, 103,  98]))

In [34]:
# Continuous uniform distribution on [0, 1] (loc=0, scale=1).
ugen = uniform(0, 1)
# Seed the draw so Restart & Run All reproduces the same ten samples.
ugen.rvs(10, random_state=42)

array([0.93896148, 0.63280727, 0.64248769, 0.12153956, 0.57671181,
       0.20761635, 0.97205877, 0.86952983, 0.2981871 , 0.58995888])

In [35]:
# Sampling distributions for the randomized search (scipy.stats frozen distributions).
# uniform(loc, scale) spans [loc, loc + scale], so min_impurity_decrease is drawn
# from [0.0001, 0.0011]; randint(low, high) excludes the upper bound.
params = dict(
    min_impurity_decrease=uniform(0.0001, 0.001),
    max_depth=randint(20, 50),
    min_samples_split=randint(2, 25),
    min_samples_leaf=randint(1, 25),
)

In [36]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: 100 parameter draws from `params`, seeded so the same
# candidates are sampled on every run; fitted on the full training set.
rs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    random_state=42,
    n_jobs=-1,
)
rs.fit(train_input, train_target)

In [37]:
# Parameter combination selected by the randomized search.
print(rs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [38]:
# Highest mean cross-validation score among the sampled candidates.
print(rs.cv_results_['mean_test_score'].max())

0.8695428296438884


In [39]:
# Evaluate the best estimator from the randomized search on the held-out test set.
dt = rs.best_estimator_

print(dt.score(test_input, test_target))

0.86


In [40]:
# Mean fit times recorded by the randomized search's cross-validation.
rs.cv_results_['mean_fit_time']

array([0.007478  , 0.00711803, 0.00810614, 0.00827179, 0.00673704,
       0.0076448 , 0.00711579, 0.00703945, 0.00727849, 0.0075459 ,
       0.00743098, 0.00741529, 0.00763903, 0.00778708, 0.01071177,
       0.01094346, 0.00735593, 0.01009068, 0.01333041, 0.00792141,
       0.01517262, 0.00843244, 0.01321063, 0.01203609, 0.01362095,
       0.01755133, 0.00775046, 0.00771627, 0.00756216, 0.01342249,
       0.01350069, 0.01448483, 0.00780501, 0.0111412 , 0.01208344,
       0.01636958, 0.01408634, 0.01520047, 0.01512465, 0.01603022,
       0.01223645, 0.01475024, 0.02355938, 0.01924019, 0.01085448,
       0.0102798 , 0.01796551, 0.02040215, 0.01883097, 0.0208189 ,
       0.01241856, 0.01745663, 0.00842037, 0.00776377, 0.01441011,
       0.0139596 , 0.01518345, 0.01819105, 0.01313038, 0.01702366,
       0.01674418, 0.01554422, 0.01513276, 0.01515288, 0.01152182,
       0.01622024, 0.01497703, 0.01194115, 0.01184692, 0.01341786,
       0.01274095, 0.01430702, 0.00902791, 0.01454353, 0.01287

In [41]:
# Average of the recorded mean fit times, as one number for comparison.
print(rs.cv_results_['mean_fit_time'].mean())

0.012369283199310306


### 결정트리 분할 옵션 변경

In [42]:
# Same randomized search as before, but the tree uses splitter='random'.
rs2 = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(splitter='random', random_state=42),
    param_distributions=params,
    n_iter=100,
    random_state=42,
    n_jobs=-1,
)
rs2.fit(train_input, train_target)

In [43]:
# Report the selected parameters, the best mean CV score, and the test-set
# accuracy of the best random-splitter tree (in that order).
dt = rs2.best_estimator_

print(rs2.best_params_)
print(rs2.cv_results_['mean_test_score'].max())
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [44]:
# Mean fit times recorded by the random-splitter search's cross-validation.
rs2.cv_results_['mean_fit_time']

array([0.00347304, 0.00363317, 0.00351958, 0.00335422, 0.00338154,
       0.00355992, 0.00317478, 0.00310831, 0.00371408, 0.00365586,
       0.00353131, 0.00311379, 0.00351744, 0.00354443, 0.00337629,
       0.00348454, 0.0032876 , 0.0038908 , 0.00393863, 0.00358233,
       0.00399647, 0.00430217, 0.00345726, 0.0037025 , 0.0042295 ,
       0.00504088, 0.00313282, 0.00342021, 0.00322289, 0.00329547,
       0.00310378, 0.00351563, 0.00310726, 0.00361695, 0.00362725,
       0.00350847, 0.0031559 , 0.00429049, 0.00326314, 0.00308323,
       0.00324378, 0.00341473, 0.00389738, 0.0033257 , 0.00348802,
       0.00324898, 0.00348511, 0.00328455, 0.00379567, 0.00478415,
       0.00412765, 0.0033474 , 0.00387416, 0.00312304, 0.00319638,
       0.00405941, 0.00332594, 0.00344205, 0.0038569 , 0.00318222,
       0.00349693, 0.00317092, 0.00504708, 0.00620146, 0.00481267,
       0.00304828, 0.00291572, 0.0033536 , 0.00320253, 0.00403476,
       0.00337949, 0.00323591, 0.00316081, 0.00311127, 0.00350

In [45]:
# Average of the recorded mean fit times for the random-splitter search.
print(rs2.cv_results_['mean_fit_time'].mean())

0.00356235408782959


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.

모델 성능뿐만 아니라 학습 시간도 함께 평가함.    rs2.cv_results_['mean_fit_time']

결정트리가 각 노드에서 최적의 분할을 탐색하는 대신, 무작위로 뽑은 분할 후보 중 가장 좋은 것을 선택하도록 설정함(불순도 기준인 criterion은 그대로임). splitter='random'