# cross_validate

From the scikit-learn user guide on cross-validation: https://scikit-learn.org/stable/modules/cross_validation.html

In [2]:
import numpy as np
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split

# Fetch the iris data directly as a (features, target) pair of arrays
# and display both shapes as the cell result.
X, y = datasets.load_iris(return_X_y=True)
(X.shape, y.shape)

((150, 4), (150,))

In [6]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

# Sanity-check the split. A bare `X_train.shape, y_train.shape` expression in
# the middle of a cell is evaluated and then discarded (only the last
# expression of a cell is displayed), so it verified nothing; assert instead.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]
assert X_train.shape[0] + X_test.shape[0] == X.shape[0]

# Fit a linear-kernel SVM on the training part and report its score on the
# held-out part as the cell result.
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9666666666666667

In [8]:
# The simplest way to use cross-validation is to call the
# cross_val_score helper function on the estimator and the dataset.
from sklearn.model_selection import cross_val_score
# Unfitted linear-kernel SVM; cross_val_score is given the full dataset and
# cv=5, and the resulting per-split scores are shown as the cell result.
clf = svm.SVC(C=1, kernel='linear', random_state=42)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [9]:
# When the cv argument is an integer, cross_val_score uses
# the KFold or StratifiedKFold strategies by default,
# the latter being used if the estimator derives from ClassifierMixin.

# It is also possible to use other cross validation strategies 
# by passing a cross validation iterator instead, for instance:
from sklearn.model_selection import ShuffleSplit
# Five shuffled splits with 30% of the samples held out in each; the seed
# makes the shuffling repeatable. `clf` is the estimator built in the
# previous cell.
cv = ShuffleSplit(random_state=0, test_size=0.3, n_splits=5)
n_samples = len(X)
cross_val_score(estimator=clf, X=X, y=y, cv=cv)

array([0.97777778, 0.97777778, 1.        , 0.95555556, 1.        ])

In [11]:
# Another option is to use an iterable yielding 
# (train, test) splits as arrays of indices, for example:
def custom_cv_2folds(X):
    """Yield two (train, test) index pairs, one per contiguous half of ``X``.

    Parameters
    ----------
    X : object with a ``shape`` attribute
        Only ``X.shape[0]`` (the number of samples) is read.

    Yields
    ------
    (train_idx, test_idx) : tuple of ndarray of int
        First the indices of the first half of the samples, then the indices
        of the second half. For an odd number of samples the second half gets
        the extra sample.

    Notes
    -----
    Both members of each pair are the *same* array, so when this generator is
    used as a ``cv`` iterable every fold is evaluated on the very samples it
    was trained on. NOTE(review): presumably intentional, to demonstrate the
    "iterable of splits" protocol only -- confirm before using the resulting
    scores as a generalization estimate.
    """
    n = X.shape[0]
    for fold in (1, 2):
        # Use integer bounds. With float bounds (n * fold / 2) and dtype=int,
        # np.arange rounds the length up for odd n, which put the middle
        # sample into both halves (e.g. n=5 -> [0, 1, 2] and [2, 3, 4]).
        start = n * (fold - 1) // 2
        stop = n * fold // 2
        idx = np.arange(start, stop, dtype=int)
        yield idx, idx

# custom_cv_2folds is a generator function, so the object it returns can be
# iterated only once: build a fresh one right before passing it as `cv`.
custom_cv = custom_cv_2folds(X)
cross_val_score(estimator=clf, X=X, y=y, cv=custom_cv)

array([1.        , 0.97333333])

In [12]:
# Just as it is important to test a predictor on data held out from
# training, preprocessing steps (standardization, feature selection, etc.)
# and other data transformations should be learnt from the training set
# only and then applied to the held-out data for prediction:
from sklearn import preprocessing
# Same 60/40 split and seed as the earlier hold-out cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.4)

# The scaler is fitted on the training part only; the fitted scaler is then
# applied to both the training and the held-out part.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Train on the scaled training data and score on the scaled held-out data.
clf = svm.SVC(C=1)
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

0.9333333333333333

In [14]:
# A Pipeline makes it easier to compose estimators, providing this behavior 
# under cross-validation:
from sklearn.pipeline import make_pipeline
# Scaling and the SVM are bundled into one estimator, which is what gets
# cross-validated. `cv` is the ShuffleSplit iterator created in an earlier cell.
clf = make_pipeline(
    preprocessing.StandardScaler(),
    svm.SVC(C=1),
)
cross_val_score(estimator=clf, X=X, y=y, cv=cv)

array([0.97777778, 0.93333333, 0.95555556, 0.93333333, 0.97777778])

In [15]:
# The cross_validate function differs from cross_val_score in two ways:
#
# - It allows specifying multiple metrics for evaluation.
#
# - It returns a dict containing fit-times, score-times (and optionally
#   training scores, fitted estimators, train-test split indices) in addition
#   to the test score.

# For single metric evaluation, where the scoring parameter is a string, 
# callable or None, the keys will be - ['test_score', 'fit_time', 
# 'score_time']

# And for multiple metric evaluation, the return value is a dict with the 
# following keys - ['test_<scorer1_name>', 'test_<scorer2_name>', 
# 'test_<scorer...>', 'fit_time', 'score_time']

# return_train_score is set to False by default to save computation time. To 
# evaluate the scores on the training set as well you need to set it to 
# True. You may also retain the estimator fitted on each training set by 
# setting return_estimator=True. Similarly, you may set return_indices=True 
# to retain the training and testing indices used to split the dataset into 
# train and test sets for each cv split.

# The multiple metrics can be specified either as a list, tuple or set of 
# predefined scorer names:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
# Evaluate two predefined scorers at once; cross_validate returns a dict with
# one entry per scorer plus the timing entries.
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, X, y, scoring=scoring)

# Check the returned keys explicitly. A bare `sorted(scores.keys())` in the
# middle of a cell is evaluated and discarded (only the last expression of a
# cell is displayed), so the expected keys were documented but never verified.
assert sorted(scores) == [
    'fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']

# Per-split macro-averaged recall, shown as the cell result.
scores['test_recall_macro']


array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])