# Cross validation with KFold

Split the training set into k folds, then use each fold in turn as the validation set while the remaining k - 1 folds form the training set.

The k validation scores are then averaged.

Every observation ends up in the validation (test) set exactly once.

In [1]:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

In [2]:
# Toy dataset: four samples with two features each (the two distinct rows
# appear twice), and one distinct target value per sample.
X = np.array([[1, 2], [3, 4]] * 2)
y = np.arange(1, 5)

In [3]:
# Two folds: each half of the samples serves once as the test set while the
# other half is used for training.
two_fold = KFold(n_splits=2)
for train_index, test_index in two_fold.split(X):
    print("Train:", train_index, "Test:", test_index)
    # split() yields index arrays; index X and y with them to get the subsets.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

Train: [2 3] Test: [0 1]
Train: [0 1] Test: [2 3]


In [4]:
# Four folds on four samples: every test set holds exactly one sample and the
# remaining three samples are used for training.
four_fold = KFold(n_splits=4)
for train_index, test_index in four_fold.split(X):
    print("Train:", train_index, "Test:", test_index)
    # split() yields index arrays; index X and y with them to get the subsets.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

Train: [1 2 3] Test: [0]
Train: [0 2 3] Test: [1]
Train: [0 1 3] Test: [2]
Train: [0 1 2] Test: [3]


## Using K-Fold cross validation with a classifier

In [5]:
# Load the iris dataset as a (features, labels) pair.
X, y = datasets.load_iris(return_X_y=True)

# Hold out 40% of the samples for testing; the fixed random_state makes the
# split reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [6]:
# Baseline without cross validation: fit a linear SVM on the training split...
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

# ...and display its score on the single held-out test split.
clf.score(X_test, y_test)

0.9666666666666667

In [7]:
# Start from a fresh, unfitted estimator: cross_val_score fits its own copy of
# it on each of the 5 splits and returns one score per split.
# NOTE: per the scikit-learn docs, an integer `cv` combined with a classifier
# uses StratifiedKFold (class proportions preserved in each fold), not plain KFold.
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)

# Display the per-fold scores.
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [8]:
# Summarise the per-fold scores as their mean, with twice the standard
# deviation as a rough indication of the spread across folds.
mean_accuracy = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

Accuracy: 0.98 (+/- 0.03)
