# KFold

In [22]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold , train_test_split
import numpy as np

# Load the bundled Iris dataset and keep the feature matrix and the
# target vector under separate names for the cross-validation cells.
iris = load_iris()
features, label = iris.data, iris.target

# Report the number of samples, then the full shape of the feature matrix.
print('Iris Data Size : {}'.format(len(features)))
print(features.shape)

(150,)
Iris Data Size : 150
(150, 4)


In [15]:
# 10-fold cross-validation of a decision tree on the Iris data.
dt_clf = DecisionTreeClassifier(random_state=156)
# shuffle=True without a seed draws a different partition on every run, so the
# per-fold accuracies could not be reproduced; pin random_state to fix them.
kfold = KFold(n_splits=10, shuffle=True, random_state=156)
cv_accuracy = []   # unrounded accuracy of each fold
n_iter = 0

# KFold.split() yields one (train indices, validation indices) pair of
# integer arrays per fold.
for n_iter, (train_index, test_index) in enumerate(kfold.split(features), start=1):

    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    # Refit the same estimator on this fold's training rows, then score it
    # on the held-out rows.
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Keep the raw value for averaging; round only for display so the
    # reported mean is not distorted by per-fold rounding.
    accuracy = accuracy_score(y_test, pred)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print('\n#{0} cross_validation_accuracy :{1}, train data size: {2}, validation data size: {3}' \
          .format(n_iter, np.round(accuracy, 4), train_size, test_size))
    print('#{0} validation_index:{1}'.format(n_iter, test_index))
    cv_accuracy.append(accuracy)

# Mean accuracy over all folds.
print('\n## average_cross_validation_accuracy:', np.mean(cv_accuracy))


#1 cross_validation_accuracy :1.0, train data size: 135, validation data size: 15
#1 validation_index:[  7  11  24  32  39  66 109 113 122 127 132 137 141 143 145]

#2 cross_validation_accuracy :0.9333, train data size: 135, validation data size: 15
#2 validation_index:[ 22  36  41  45  47  49  51  55  78  80  82  89 104 131 138]

#3 cross_validation_accuracy :0.8667, train data size: 135, validation data size: 15
#3 validation_index:[  4  25  56  60  67  70  71  91 103 106 114 115 123 147 149]

#4 cross_validation_accuracy :0.9333, train data size: 135, validation data size: 15
#4 validation_index:[ 13  14  15  18  19  52  65  73  96  99 102 118 130 133 146]

#5 cross_validation_accuracy :0.9333, train data size: 135, validation data size: 15
#5 validation_index:[  0   8  28  33  37  42  53  58  83  93 101 107 110 117 135]

#6 cross_validation_accuracy :0.9333, train data size: 135, validation data size: 15
#6 validation_index:[  6  17  20  21  23  38  40  59  61  77  86 116 128 140 

# Stratified KFold

In [6]:
import pandas as pd

# Build a DataFrame of the Iris features with the class label appended
# as an extra 'label' column.
iris = load_iris()

iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target

# A notebook cell only auto-displays its LAST expression, so the class
# counts have to be printed explicitly; as a bare expression in the middle
# of the cell they were computed and silently thrown away.
print(iris_df['label'].value_counts())
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
from sklearn.model_selection import StratifiedKFold

# Seeded, shuffled 12-way stratified splitter.
skf = StratifiedKFold(n_splits=12, shuffle=True, random_state=120)

# Map each 1-based fold number to its (train indices, validation indices) pair.
fold = {}
n_iter = 0
splits = skf.split(iris_df, iris_df['label'])
for n_iter, (train_index, test_index) in enumerate(splits, start=1):
    fold[n_iter] = (train_index, test_index)

# Display the collected folds.
fold
    

{1: (array([  0,   1,   2,   3,   5,   6,   7,   8,   9,  10,  11,  12,  13,
          14,  15,  16,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
          28,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,
          43,  44,  45,  46,  47,  49,  50,  51,  52,  53,  54,  55,  56,
          57,  59,  60,  61,  62,  63,  64,  65,  66,  68,  69,  70,  71,
          72,  73,  74,  75,  76,  77,  78,  79,  81,  82,  83,  84,  85,
          86,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
         100, 101, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 114,
         115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128,
         129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         143, 144, 145, 146, 147, 148, 149]),
  array([  4,  17,  29,  33,  48,  58,  67,  80,  87, 109, 113, 124, 132])),
 2: (array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  14,
          15,  16,  17,  20,  21,  22,  23,  24,  25,  26

In [27]:
# 5-fold stratified cross-validation of a decision tree on the Iris data.
dt_clf = DecisionTreeClassifier(random_state=156)

# shuffle=True without a seed produces different folds on every run, which
# makes the accuracies below impossible to reproduce; pin random_state.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=156)
n_iter = 0
cv_accuracy = []   # unrounded accuracy of each fold

# Stratification needs the target values, so split() must be given the
# labels (y) together with the features (X).
for n_iter, (train_index, test_index) in enumerate(skfold.split(features, label), start=1):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    # Train on this fold's training rows and predict the held-out rows.
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Store the raw accuracy and round only when printing, so the average
    # is computed from exact values rather than 4-decimal truncations.
    accuracy = accuracy_score(y_test, pred)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print('\n#{0} cross_validation_accuracy :{1}, train data size: {2}, validation data size: {3}'\
          .format(n_iter, np.round(accuracy, 4), train_size, test_size))

    print('#{0} train_index:{1}'.format(n_iter, train_index))
    print('#{0} validation_index:{1}'.format(n_iter, test_index))
    cv_accuracy.append(accuracy)

# Per-fold accuracies (rounded for display) followed by their mean.
print('\n## each_cross_validation_accuracy:', np.round(cv_accuracy, 4))
print('\n## average_cross_validation_accuracy:', np.mean(cv_accuracy))


#1 cross_validation_accuracy :0.9, train data size: 120, validation data size: 30
#1 train_index:[  0   1   2   3   4   5   6   7  10  11  12  13  14  15  16  17  18  19
  20  21  22  23  25  26  27  28  29  30  32  34  37  38  39  40  41  42
  43  46  47  48  50  52  53  54  55  56  57  58  59  60  62  63  64  65
  67  69  71  72  73  75  77  78  79  80  81  83  84  85  86  87  88  89
  90  91  93  95  96  97  98  99 101 103 104 107 108 109 110 111 112 113
 114 115 116 118 120 121 122 123 124 125 126 127 128 129 130 131 133 134
 135 136 139 140 141 142 143 144 145 147 148 149]
#1 validation_index:[  8   9  24  31  33  35  36  44  45  49  51  61  66  68  70  74  76  82
  92  94 100 102 105 106 117 119 132 137 138 146]

#2 cross_validation_accuracy :0.9667, train data size: 120, validation data size: 30
#2 train_index:[  0   1   2   3   4   5   6   7   8   9  10  11  13  14  15  16  17  19
  20  22  23  24  30  31  32  33  34  35  36  37  38  39  40  41  42  44
  45  46  48  49  50  51