## Cross Validation

In [4]:
import pandas as pd

# Read the iris table from a CSV located relative to the notebook's working
# directory. Later cells use the `class` column as the label and the
# remaining columns as features.
IRIS_CSV = 'datasets/iris.csv'
df = pd.read_csv(IRIS_CSV)

# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### Example #1: Splitting into K-Folds

In [22]:
# Show how KFold partitions the row positions of `df` into 10 train/test folds.
from sklearn.model_selection import KFold

# Shuffling is enabled with a fixed seed, so the test positions printed below
# are scattered across the frame instead of forming contiguous runs.
kfolds = KFold(random_state=42, shuffle=True, n_splits=10)

fold_report = 'train: %s\ntest: %s\n\n'
for train, test in kfolds.split(df):
    # `train` and `test` are arrays of integer row positions, not labels.
    print(fold_report % (train, test))
    # Positional lookup of each side of the fold. Both names are rebound on
    # every pass, so after the loop they hold the rows of the last fold only.
    df_train, df_test = df.iloc[train], df.iloc[test]

train: [  0   1   2   3   4   5   6   7   8  10  11  13  14  15  16  17  20  21
  22  23  24  25  26  27  28  29  30  32  33  34  35  37  38  39  40  41
  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59
  60  61  62  63  65  66  67  69  70  71  72  74  75  77  79  80  81  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 111 112 113 114 115 116 117 119 120 121
 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
 140 142 143 144 145 146 147 148 149]
test: [  9  12  18  19  31  36  64  68  73  76  78  82 110 118 141]


train: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  27  28  31  32  33  34  35  36  37  38
  39  40  41  42  43  44  46  47  48  49  50  51  52  53  54  57  58  59
  60  61  62  63  64  65  66  67  68  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  

### Example #2: K-Fold Cross Validation with Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Seed the forest so the per-fold scores are the same on every
# Restart & Run All. Without random_state each run grows different trees
# and the displayed numbers cannot be reproduced.
model = RandomForestClassifier(criterion='gini', n_estimators=100, random_state=42)

# Features are every column except the target; the target is the `class` column.
X_features = df.drop(columns=["class"])
y_label = df["class"]

# cv=5 yields five folds and therefore five accuracy values, one per fold.
# Per the scikit-learn docs, an integer cv with a classifier uses stratified,
# unshuffled folds.
scores = cross_val_score(model, X_features, y_label, cv=5, scoring='accuracy')
scores

array([0.96666667, 0.96666667, 0.93333333, 0.96666667, 1.        ])

### Example #3: K-Fold Cross Validation with a Shuffled K-Folds Split

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# Seed the forest so the per-fold scores are the same on every
# Restart & Run All; the splitter below is already seeded.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# A KFold instance lets us customize the split: five folds, shuffled with a
# fixed seed so the fold membership is the same on every run.
kfolds = KFold(n_splits=5, shuffle=True, random_state=42)

# Features are every column except the target; the target is the `class` column.
X_features = df.drop(columns=["class"])
y_label = df["class"]

# Hand cross_val_score the splitter itself rather than kfolds.split(df).
# The generator form is single-use and is built from `df`, not from the
# X_features actually being scored. With an integer random_state the splitter
# produces the same index splits, and it stays tied to the scored data.
scores = cross_val_score(model, X_features, y_label, cv=kfolds, scoring='accuracy')
scores

array([1.        , 0.96666667, 0.93333333, 0.93333333, 0.96666667])