In [1]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

iris = load_iris()
features = iris.data
targets = iris.target

target_df = pd.DataFrame(targets, columns=['target'])
target_df.value_counts()

target
0         50
1         50
2         50
Name: count, dtype: int64

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

dtc = DecisionTreeClassifier(random_state=124, min_samples_leaf=6)
kfold = KFold(n_splits=5)

In [4]:
features.shape

(150, 4)

In [8]:
count = 0
for train_index, test_index in kfold.split(features):
    # 분리
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    # 학습 및 예측
    dtc.fit(X_train, y_train)
    prediction = dtc.predict(X_test)

    # 평가
    accuracy = np.round(accuracy_score(y_test, prediction), 4)

    # 검증
    train_targets = pd.DataFrame(y_train)
    test_targets = pd.DataFrame(y_test)

    count += 1
    
    print(f'{count} 회차')
    print(f'학습 타겟 데이터 분포: \n{train_targets.value_counts()}')
    print(f'검증 타겟 데이터 분포: \n{test_targets.value_counts()}')
    print(f'정확도: {accuracy}')

1 회차
학습 타겟 데이터 분포: 
1    50
2    50
0    20
Name: count, dtype: int64
검증 타겟 데이터 분포: 
0    30
Name: count, dtype: int64
정확도: 1.0
2 회차
학습 타겟 데이터 분포: 
2    50
1    40
0    30
Name: count, dtype: int64
검증 타겟 데이터 분포: 
0    20
1    10
Name: count, dtype: int64
정확도: 1.0
3 회차
학습 타겟 데이터 분포: 
0    50
2    50
1    20
Name: count, dtype: int64
검증 타겟 데이터 분포: 
1    30
Name: count, dtype: int64
정확도: 0.8333
4 회차
학습 타겟 데이터 분포: 
0    50
1    40
2    30
Name: count, dtype: int64
검증 타겟 데이터 분포: 
2    20
1    10
Name: count, dtype: int64
정확도: 0.9333
5 회차
학습 타겟 데이터 분포: 
0    50
1    50
2    20
Name: count, dtype: int64
검증 타겟 데이터 분포: 
2    30
Name: count, dtype: int64
정확도: 0.8333


In [9]:
from sklearn.model_selection import StratifiedKFold

s_kfold = StratifiedKFold(n_splits=5)

In [13]:
count = 0
accuracy_list = []
for train_index, test_index in s_kfold.split(features, targets):
    # 분리
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    # 학습 및 예측
    dtc.fit(X_train, y_train)
    prediction = dtc.predict(X_test)

    # 평가
    accuracy = np.round(accuracy_score(y_test, prediction), 4)
    accuracy_list.append(accuracy)

    # 검증
    train_targets = pd.DataFrame(y_train)
    test_targets = pd.DataFrame(y_test)

    count += 1
    
    print(f'{count} 회차')
    print(f'학습 타겟 데이터 분포: \n{train_targets.value_counts()}')
    print(f'검증 타겟 데이터 분포: \n{test_targets.value_counts()}')
    print(f'정확도: {accuracy}')

print(f'평균 정확도: {np.mean(accuracy_list)}')

1 회차
학습 타겟 데이터 분포: 
0    40
1    40
2    40
Name: count, dtype: int64
검증 타겟 데이터 분포: 
0    10
1    10
2    10
Name: count, dtype: int64
정확도: 0.9667
2 회차
학습 타겟 데이터 분포: 
0    40
1    40
2    40
Name: count, dtype: int64
검증 타겟 데이터 분포: 
0    10
1    10
2    10
Name: count, dtype: int64
정확도: 0.9667
3 회차
학습 타겟 데이터 분포: 
0    40
1    40
2    40
Name: count, dtype: int64
검증 타겟 데이터 분포: 
0    10
1    10
2    10
Name: count, dtype: int64
정확도: 0.9
4 회차
학습 타겟 데이터 분포: 
0    40
1    40
2    40
Name: count, dtype: int64
검증 타겟 데이터 분포: 
0    10
1    10
2    10
Name: count, dtype: int64
정확도: 0.8667
5 회차
학습 타겟 데이터 분포: 
0    40
1    40
2    40
Name: count, dtype: int64
검증 타겟 데이터 분포: 
0    10
1    10
2    10
Name: count, dtype: int64
정확도: 1.0
평균 정확도: 0.94002


In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
dtc = DecisionTreeClassifier(random_state=124, min_samples_leaf=6)

features = iris.data
targets = iris.target

score = cross_val_score(dtc, features, targets, cv=5, scoring='accuracy')
print(np.round(np.mean(score), 4))

0.94


In [17]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()

features = iris.data
targets = iris.target

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

dtc = DecisionTreeClassifier()
parameters = {'max_depth': [2, 3, 4], 'min_samples_split': [6, 7]}

In [26]:
g_dtc = GridSearchCV(dtc, 
                     param_grid=parameters, 
                     cv=5, 
                     refit=True, 
                     return_train_score=True, 
                     n_jobs=-1)

In [27]:
g_dtc.fit(X_train, y_train)

In [28]:
g_dtc.cv_results_

{'mean_fit_time': array([0.0021945 , 0.00119586, 0.00219994, 0.00239244, 0.00139651,
        0.00179806]),
 'std_fit_time': array([0.00073533, 0.00039676, 0.00097977, 0.00102183, 0.00048922,
        0.00074461]),
 'mean_score_time': array([0.00179472, 0.00119581, 0.00240035, 0.00119095, 0.00059824,
        0.00119791]),
 'std_score_time': array([0.00039432, 0.00039881, 0.00048419, 0.00038633, 0.00048846,
        0.00040194]),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 4, 4],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[6, 7, 6, 7, 6, 7],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2, 'min_samples_split': 6},
  {'max_depth': 2, 'min_samples_split': 7},
  {'max_depth': 3, 'min_samples_split': 6},
  {'max_depth': 3, 'min_samples_split': 7},
  {'max_depth': 4, 'min_sample

In [29]:
result_df = pd.DataFrame(g_dtc.cv_results_)
result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.002194,0.000735,0.001795,0.000394,2,6,"{'max_depth': 2, 'min_samples_split': 6}",1.0,0.958333,0.958333,1.0,0.958333,0.975,0.020412,5,0.96875,0.979167,0.979167,0.96875,0.979167,0.975,0.005103
1,0.001196,0.000397,0.001196,0.000399,2,7,"{'max_depth': 2, 'min_samples_split': 7}",1.0,0.958333,0.958333,1.0,0.958333,0.975,0.020412,5,0.96875,0.979167,0.979167,0.96875,0.979167,0.975,0.005103
2,0.0022,0.00098,0.0024,0.000484,3,6,"{'max_depth': 3, 'min_samples_split': 6}",1.0,1.0,0.958333,1.0,1.0,0.991667,0.016667,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,0.002392,0.001022,0.001191,0.000386,3,7,"{'max_depth': 3, 'min_samples_split': 7}",1.0,1.0,0.958333,1.0,1.0,0.991667,0.016667,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.001397,0.000489,0.000598,0.000488,4,6,"{'max_depth': 4, 'min_samples_split': 6}",1.0,1.0,0.958333,1.0,1.0,0.991667,0.016667,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.001798,0.000745,0.001198,0.000402,4,7,"{'max_depth': 4, 'min_samples_split': 7}",1.0,1.0,0.958333,1.0,1.0,0.991667,0.016667,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [32]:
print(g_dtc.best_params_, g_dtc.best_score_, sep="\n")

{'max_depth': 3, 'min_samples_split': 6}
0.9916666666666668


In [33]:
g_dtc.best_estimator_

In [34]:
dtc = g_dtc.best_estimator_
prediction = dtc.predict(X_test)
accuracy_score(y_test, prediction)

0.9