Tratando de ver las diferentes formas de aplicar Cross-Validation (CV):

https://scikit-learn.org/stable/modules/cross_validation.html 

# Usando scikit-learn:


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

import numpy as np

from sklearn.model_selection import cross_val_score


In [None]:
# Load the iris dataset and unpack it into the feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Show both shapes as the cell result.
(X.shape, y.shape)

((150, 4), (150,))

In [None]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)
# Report the shapes of the training pair first, then the test pair.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(90, 4) (90,)
(60, 4) (60,)


## Clasificación usando SVM:

In [None]:
# Linear-kernel SVM trained on the hold-out training portion.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out test portion is the cell result.
clf.score(X_test, y_test)

0.8666666666666667

## Caso 1-1: CROSS_VAL_SCORE

Este es el modo más sencillo de aplicar Cross-Validation con sklearn.

Simplemente calcula el modelo tantas veces como le dices y te regresa cada resultado en un vector. 

In [None]:
# Fresh, unfitted classifier: cross_val_score receives the estimator and the full data.
clf = svm.SVC(kernel='linear', C=1)
# Other scoring names can be passed instead of 'accuracy', e.g. 'f1_macro'.
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(scores)

# Summarise the per-fold scores as mean and two standard deviations.
mean_accuracy = scores.mean()
two_sigma = 2 * scores.std()
print("Accuracy and 95%% confidence interval: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sigma))

[0.96666667 1.         0.96666667 0.96666667 1.        ]
Accuracy and 95% confidence interval: 0.98 (+/- 0.03)


Puedes encontrar más opciones de la evaluación en: 
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter


## Caso 1-2: SHUFFLE_SPLIT

Simplemente repite n_splits veces la partición aleatoria dada por test_size:

In [None]:
from sklearn.model_selection import ShuffleSplit

# Sample count taken from the feature matrix; not used again within this cell.
n_samples = len(X)
# Five random partitions, each with test_size=0.3, seeded for repeatability.
cv = ShuffleSplit(test_size=0.3, n_splits=5, random_state=0)
# Reuses the `clf` estimator defined in an earlier cell.
shuffle_scores = cross_val_score(clf, X, y, cv=cv)
shuffle_scores

array([0.97777778, 0.97777778, 1.        , 0.95555556, 1.        ])

## CASO 1-3: KFold:

Este genera lo que se llama un iterator_index, es decir, genera los índices de los folds, y estos son los que usamos para aplicarlos a los datos... Es uno de los casos más prácticos para utilizar, ya que te permite manipular la información en cada iteración/fold.

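Observa que, por defecto, los índices generados son consecutivos (sin barajar), por lo que deberás aleatorizar primero todo el conjunto de datos original o bien usar `KFold(shuffle=True, random_state=...)`. Ojo: el train_test_split() que usaste al inicio no reordena X ni y (sólo devuelve subconjuntos nuevos), así que en este ejemplo en particular los folds se generan sobre los datos originales, que están ordenados por clase.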

In [None]:
kf = KFold(n_splits=5)

# Each iteration yields the index arrays that define one train/test fold.
for train_index, test_index in kf.split(X):
    print("\nTRAIN:", train_index, "\nTEST:", test_index)

    # Slice features and labels with the fold's indices.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Train a new linear SVM on this fold and report its test accuracy.
    clf = svm.SVC(kernel='linear', C=1)
    clf.fit(X_train, y_train)
    fold_accuracy = clf.score(X_test, y_test)
    print(fold_accuracy)


TRAIN: [ 30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149] TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
1.0

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126

## RepeatedKFold

Cuando deseas repetir varias veces el KFold:

In [None]:
# 3 folds repeated 2 times; the seed fixes the partitions across runs.
rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=17)

for train_index, test_index in rkf.split(X):
    print("\nTRAIN:", train_index, "TEST:", test_index)

    # Slice features and labels with the fold's indices.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Train a new linear SVM on this fold and report its test accuracy.
    clf = svm.SVC(kernel='linear', C=1)
    clf.fit(X_train, y_train)
    fold_accuracy = clf.score(X_test, y_test)
    print(fold_accuracy)


TRAIN: [  1   2   3   4   6   7   8  10  12  13  14  15  17  18  20  21  22  24
  25  26  27  30  31  32  33  35  36  39  40  41  42  43  44  45  46  47
  49  50  51  52  54  56  57  58  59  60  61  62  63  64  65  68  72  74
  76  77  79  82  83  87  88  90  91  93  94 101 102 103 104 105 106 107
 108 109 111 114 115 118 119 120 121 122 123 124 128 129 130 131 134 135
 136 137 138 139 140 141 143 146 147 148] TEST: [  0   5   9  11  16  19  23  28  29  34  37  38  48  53  55  66  67  69
  70  71  73  75  78  80  81  84  85  86  89  92  95  96  97  98  99 100
 110 112 113 116 117 125 126 127 132 133 142 144 145 149]
0.98

TRAIN: [  0   2   5   6   7   8   9  10  11  13  16  17  19  22  23  26  27  28
  29  30  31  32  34  37  38  39  41  44  48  49  50  51  53  54  55  56
  57  60  61  63  66  67  68  69  70  71  72  73  74  75  78  79  80  81
  83  84  85  86  87  88  89  91  92  95  96  97  98  99 100 103 106 107
 110 111 112 113 116 117 118 121 122 123 125 126 127 128 129 131 132 1

# Otros...

# Stratified KFold cases

Para aplicar Cross-Validation con partición estratificada.

In [None]:
# Stratified 3-fold splitter; the loop in the following cell consumes it via skf.split(X, y).
# No shuffle or random_state argument is passed here.
skf = StratifiedKFold(n_splits=3)

In [None]:
# Evaluate a linear SVM on every stratified fold produced by `skf`.
for train_index, test_index in skf.split(X, y):
    print("\nTRAIN:", train_index, "TEST:", test_index)

    # Build this fold's train/test data from the generated indices. Without
    # this step the model would be fitted and scored on whatever
    # X_train/X_test an earlier cell left behind, and every fold would
    # print the same score.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a new classifier per fold and report its accuracy on the fold's test part.
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    print(clf.score(X_test, y_test))


TRAIN: [ 17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  67  68  69
  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
  88  89  90  91  92  93  94  95  96  97  98  99 116 117 118 119 120 121
 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
 140 141 142 143 144 145 146 147 148 149] TEST: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  50
  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
0.9666666666666667

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  83  84  85  86
  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104
 105 106 107 108 109 110 111 112 113 114 115 133 134 135

# Stratified Shuffle Split

In [None]:
# Stratified random splitter producing 3 train/test partitions; the loop in
# the following cell consumes it via sss.split(X, y).
# random_state is fixed so the random partitions (and therefore the printed
# scores) are reproducible across runs, matching the other seeded splitters
# in this notebook.
sss = StratifiedShuffleSplit(n_splits=3, random_state=0)

In [None]:
# Evaluate a linear SVM on every stratified random partition produced by `sss`.
for train_index, test_index in sss.split(X, y):
    print("\nTRAIN:", train_index, "TEST:", test_index)

    # Build this partition's train/test data from the generated indices.
    # Without this step the model would be fitted and scored on whatever
    # X_train/X_test an earlier cell left behind, so the printed score would
    # not correspond to the indices shown above it.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a new classifier per partition and report its accuracy on the test part.
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    print(clf.score(X_test, y_test))


TRAIN: [ 54 118 111 106 107 129  39  76  89 109 128  92  25  31 125 116  65  36
  87 141  77  47  50 145  73  33 131  28 134 121  53   8  97 110 104  75
  42  94   7 138  74  24   6  35  43   4 139  78 100  67  95   2  12 115
 147  40  32 148 108  21 127  14  83  17  22  49   9  51  37  44  70  41
  34  30  85 149 124  29  64  72 123 117 130  59  84 114  66  57 126  56
  90 103  99  81  71 105  60  19 143  48  63  91  13  93 122 101 137 119
  20  61  80  68 142  98  58  88  55 112 113 146  69  15  23 144  45   3
  46  62  10  38   5  11 140  26 132] TEST: [ 52 102 133  79  18 120  82 136  16  96 135   0  86   1  27]
0.98

TRAIN: [120  96  70   0   4   8 112  12  61  60  84  58  20  82  35  97  43 103
 149 101  33  63  69 119  28  27  42  64  34 136  67  99  89 125   1  73
  68  29 147  24  25 111  77   2  39  52  50  54 107 117  38  18 141  90
  40  76  44 121 115 143 140 135  41  80  88   7  56 105  94  32  91 146
  62 114  37  55 137  75  23  92  93  79 100  36  78  86 129 128  26 1