In [None]:
# ------------------------------------------
# cross-validation methods for model selection
# method 1: cross_val_score
# method 2: cross_validate
# method 3: cross_val_predict
# method 4: KFold
# method 5: LeaveOneOut
# ------------------------------------------

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the Iris dataset once; the cells below read its feature matrix
# (iris.data) and its class labels (iris.target).
iris = datasets.load_iris()

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# cross_val_score runs repeated fit/evaluate rounds for one chosen metric
# (macro-averaged F1 here) and returns one score per round.
clf = svm.SVC(C=1, kernel='linear')
scores = cross_val_score(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    scoring='f1_macro',
    cv=5,
)
scores

array([ 0.96658312,  1.        ,  0.96658312,  0.96658312,  1.        ])

In [21]:
# cross_val_score also accepts a splitter object as `cv`, which controls both
# the number of validation rounds and how samples are selected for each round.
from sklearn.model_selection import ShuffleSplit
# n_splits: number of shuffle-and-split rounds.
# test_size=0.3: fraction of the samples held out for testing in each round.
# random_state=0: fixes the shuffling so reruns give the same splits.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
cross_val_score(clf, iris.data, iris.target, cv=cv)

array([ 0.97777778,  0.97777778,  1.        ,  0.95555556,  1.        ])

In [23]:
# cross_validate differs from cross_val_score in that it can evaluate multiple
# metrics in one run and also records the fit and scoring times.
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(clf, iris.data, iris.target, scoring=scoring, cv=5, return_train_score=False)
# `scores` is a dict; each requested metric's per-fold test results are stored
# under 'test_<metric>'. To list every key, make `sorted(scores.keys())` the
# last expression of a cell -- as a mid-cell expression its value is discarded.
scores['test_recall_macro']

array([ 0.96666667,  1.        ,  0.96666667,  0.96666667,  1.        ])

In [38]:
# Obtain one cross-validated prediction per sample (10 folds), then score
# those predictions against the true labels.
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(estimator=clf, X=iris.data, y=iris.target, cv=10)
# Sanity check: there should be exactly one prediction per label.
print(len(iris.target), len(predicted))
metrics.accuracy_score(y_true=iris.target, y_pred=predicted)

150 150


0.97333333333333338

In [42]:
# K-fold: show the train/test index arrays produced for a 4-item toy list
# when it is split into 2 folds.
from sklearn.model_selection import KFold
X = ['a', 'b', 'c', 'd']
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print(train, test)

# Leave-one-out: same toy list, one split per sample, each holding out a
# single index for testing.
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
for train, test in loo.split(X):
    print(train, test)

[2 3] [0 1]
[0 1] [2 3]
[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]
