In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

In [2]:
iris = load_iris()
features = iris.data
label = iris.target

In [3]:
kfold = KFold(n_splits=5)

In [4]:
features.shape

(150, 4)

In [10]:
dt_clf = DecisionTreeClassifier(random_state=11)
n_iter = 0
cv_accuracy = []

for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]
    #print(X_train.shape, X_test.shape)
    
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    print(n_iter, accuracy)
    
    cv_accuracy.append(accuracy)
    
print("평균 정확도 :", np.mean(cv_accuracy))

1 1.0
2 0.9667
3 0.8667
4 0.9333
5 0.8333
평균 정확도 : 0.9200000000000002


In [12]:
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [13]:
iris_df['label'] = iris.target
iris_df['label'].value_counts()

2    50
1    50
0    50
Name: label, dtype: int64

In [14]:
iris_df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [17]:
kfold = KFold(n_splits=3)
n_iter = 0
for train_index, test_index in kfold.split(iris_df):
    n_iter = n_iter + 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print(f'{n_iter+1}번째 교차 검증')
    print('학습 레이블 분포:\n', label_train.value_counts())
    print('검증 레이블 분포:\n', label_test.value_counts())

2번째 교차 검증
학습 레이블 분포:
 2    50
1    50
Name: label, dtype: int64
검증 레이블 분포:
 0    50
Name: label, dtype: int64
3번째 교차 검증
학습 레이블 분포:
 2    50
0    50
Name: label, dtype: int64
검증 레이블 분포:
 1    50
Name: label, dtype: int64
4번째 교차 검증
학습 레이블 분포:
 1    50
0    50
Name: label, dtype: int64
검증 레이블 분포:
 2    50
Name: label, dtype: int64


In [20]:
skf = StratifiedKFold(n_splits=3)
n_iter = 0

for train_index, test_index in skf.split(iris_df, iris_df['label']):
    n_iter = n_iter + 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print(f'{n_iter+1}번째 교차 검증')
    print('학습 레이블 분포:\n', label_train.value_counts())
    print('검증 레이블 분포:\n', label_test.value_counts())

2번째 교차 검증
학습 레이블 분포:
 2    34
1    33
0    33
Name: label, dtype: int64
검증 레이블 분포:
 1    17
0    17
2    16
Name: label, dtype: int64
3번째 교차 검증
학습 레이블 분포:
 1    34
2    33
0    33
Name: label, dtype: int64
검증 레이블 분포:
 2    17
0    17
1    16
Name: label, dtype: int64
4번째 교차 검증
학습 레이블 분포:
 0    34
2    33
1    33
Name: label, dtype: int64
검증 레이블 분포:
 2    17
1    17
0    16
Name: label, dtype: int64


In [21]:
dt_clf = DecisionTreeClassifier(random_state=11)
n_iter = 0
cv_accuracy = []
skfold = StratifiedKFold(n_splits=5)

for train_index, test_index in skfold.split(features, label):
    X_train = features[train_index]
    X_test = features[test_index]
    y_train, y_test = label[train_index], label[test_index]
    
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    print(n_iter, accuracy)
    
    cv_accuracy.append(accuracy)
    
print("평균 정확도 :", np.mean(cv_accuracy))

1 0.9667
2 0.9667
3 0.9
4 0.9667
5 1.0
평균 정확도 : 0.9600200000000001


# cross_val_score
- label 파라미터를 입력하면 stratified kfold, 입력하지 않으면 normal kfold

In [None]:
scores = cross_val_score()

In [23]:
cross_val_score?

In [24]:
dt_clf = DecisionTreeClassifier(random_state=11)
scores = cross_val_score(dt_clf, features, label, scoring='accuracy', cv=5)
print(scores)
print(np.mean(scores))

[0.96666667 0.96666667 0.9        0.96666667 1.        ]
0.9600000000000002


# GridSearchCV
### Stratified Kfold vs. Normal Kfold

```
grid_dtree = GridSearchCV(dt_clf, param_grid=parameters, cv=5, refit=True, return_train_score=True)
grid_dtree_skf = GridSearchCV(dt_clf, param_grid=parameters, cv=StratifiedKFold(n_splits=5), refit=True, return_train_score=True)
```
`cv`
- int, cross-validation generator or an iterable, default=None
- Determines the cross-validation splitting strategy. Possible inputs for cv are:
    - None, to use the default 5-fold cross validation,
    - integer, to specify the number of folds in a (Stratified)KFold,
    - CV splitter,
    - An iterable yielding (train, test) splits as arrays of indices.

For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used.

In [42]:
GridSearchCV?

In [29]:
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.2, random_state=11)
dt_clf = DecisionTreeClassifier()
parameters = {'max_depth':[1,2,3],
             'min_samples_split':[2,3]}

In [63]:
grid_dtree = GridSearchCV(dt_clf, param_grid=parameters, cv=5, refit=True, return_train_score=True)

In [64]:
grid_dtree_skf = GridSearchCV(dt_clf, param_grid=parameters, cv=StratifiedKFold(n_splits=5), refit=True, return_train_score=True)

In [65]:
grid_dtree

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]},
             return_train_score=True)

In [66]:
grid_dtree_skf

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]},
             return_train_score=True)

In [67]:
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]},
             return_train_score=True)

In [68]:
grid_dtree_skf.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]},
             return_train_score=True)

In [69]:
scores_df = pd.DataFrame(grid_dtree.cv_results_)

In [70]:
grid_dtree.best_params_, grid_dtree.best_score_

({'max_depth': 3, 'min_samples_split': 2}, 0.9666666666666668)

In [71]:
grid_dtree_skf.best_params_, grid_dtree_skf.best_score_

({'max_depth': 3, 'min_samples_split': 2}, 0.9666666666666668)