**Корректность проверена на Python 3.6:**
+ numpy 1.15.4
+ sklearn 0.20.2

# Sklearn

## sklearn.model_selection

Документация: https://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
# Load the iris dataset shipped with sklearn.datasets; the split below reads
# its .data and .target attributes.
iris = datasets.load_iris()

In [3]:
# One-off split of the iris data: 30% of the objects go to the test part,
# the remaining 70% stay in the training part.
(train_data, test_data,
 train_labels, test_labels) = model_selection.train_test_split(
    iris.data, iris.target, test_size=0.3)

In [4]:
# Make sure the test part really accounts for 0.3 of all the data
# (Python 3 division of two ints already yields a float).
len(test_labels) / len(iris.data)

0.3

In [5]:
# Report how many objects landed in the training and in the test part.
print(f'Размер обучающей выборки: {len(train_data)} объектов \n'
      f'Размер тестовой выборки: {len(test_data)} объектов')

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
# Show the first five rows of each part; the custom `end` reproduces the
# two empty lines that separate the previews.
print('Обучающая выборка:\n', train_data[:5], end='\n\n\n')
print('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[6.3 2.5 5.  1.9]
 [5.5 4.2 1.4 0.2]
 [6.5 3.2 5.1 2. ]
 [5.  3.2 1.2 0.2]
 [6.9 3.1 4.9 1.5]]


Тестовая выборка:
 [[5.7 3.8 1.7 0.3]
 [4.9 3.  1.4 0.2]
 [5.9 3.2 4.8 1.8]
 [6.4 2.7 5.3 1.9]
 [5.4 3.9 1.7 0.4]]


In [7]:
# Print the class labels of both parts; the custom `end` reproduces the
# two empty lines that separate them.
print('Метки классов на обучающей выборке:\n', train_labels, end='\n\n\n')
print('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [2 0 2 0 1 0 1 0 1 1 1 2 0 1 2 1 2 2 2 0 2 1 1 2 0 2 1 0 2 2 2 2 0 0 0 1 1
 1 2 2 1 2 0 1 0 0 0 1 1 1 0 2 0 2 1 1 0 0 0 1 1 0 1 1 1 0 0 2 0 1 0 2 1 2
 2 0 2 1 2 2 1 1 2 2 1 2 1 0 2 2 0 1 2 0 1 1 2 0 0 2 2 1 0 0 0]


Метки классов на тестовой выборке:
 [0 0 1 2 0 1 1 0 1 2 2 1 0 2 1 2 2 2 0 2 2 0 0 0 0 2 1 1 1 2 1 0 2 0 2 1 0
 2 2 0 0 1 1 0 1]


### Стратегии проведения кросс-валидации

In [44]:
# Build a small stand-in for a dataset in which every element equals its
# own position.
X = range(50)
print(X)

range(0, 50)


#### KFold

In [10]:
# Split X into five folds; every fold is used as the test part exactly once.
kf = model_selection.KFold(n_splits=5)
for fold in kf.split(X):
    # Each fold is a (train indices, test indices) pair.
    print(*fold)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [11]:
# Two folds with shuffling switched on; no seed is given, so the folds
# differ from run to run.
kf = model_selection.KFold(n_splits=2, shuffle=True)
for train_part, test_part in kf.split(X):
    print(train_part, test_part)

[1 4 5 7 9] [0 2 3 6 8]
[0 2 3 6 8] [1 4 5 7 9]


In [12]:
# Same shuffled two-fold split, but with a fixed seed so that the folds are
# reproducible.
kf = model_selection.KFold(n_splits=2, shuffle=True, random_state=1)
for train_part, test_part in kf.split(X):
    print(train_part, test_part)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [37]:
# Ten labels: five objects of class 0 followed by five objects of class 1.
y = np.array([0] * 5 + [1] * 5)
print(y)

# Stratified two-fold split with a fixed seed for reproducibility.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
# The folds are derived from the labels, so a placeholder with one entry per
# label is passed as the data argument. Passing X itself fails: it holds 50
# elements while y holds 10, and split() rejects inputs whose numbers of
# samples differ (ValueError).
for train_indices, test_indices in skf.split(np.zeros(len(y)), y):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [31]:
# Ten labels with the two classes alternating: 0, 1, 0, 1, ...
target = np.array([0, 1] * 5)
print(target)

# Stratified two-fold split; no seed is given, so the folds vary between runs.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True)
# The folds are derived from the labels, so a placeholder with one entry per
# label is passed as the data argument. Passing X itself fails: it holds 50
# elements while target holds 10, and split() rejects inputs whose numbers
# of samples differ (ValueError).
for train_indices, test_indices in skf.split(np.zeros(len(target)), target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[1 2 8 9] [0 3 4 5 6 7]
[0 3 4 5 6 7] [1 2 8 9]


#### ShuffleSplit

In [48]:
# Draw 40 random train/test splits of X, each with a test part of 10 objects.
ss = model_selection.ShuffleSplit(n_splits=40, test_size=10)

for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)

# A bare trailing expression: the notebook displays the test indices of the
# last split.
test_indices

[49 37 31 46  3 38 26 25 40 36 17 14  1 42  2 41 18 35  8 34 43  5 32 30
  4  9 16 27 20 23  7  0 28 19 15 47 44 12 11 10] [33 29 39 22 21 24 13 45  6 48]
[49 42 37  3 43 11 19 34 14 13  8 29  2 48  9 36 32  1 35 40 24 21 33 30
 38  4  7 17 45  6 39 10 46 26 20 15 31 28 25 18] [23 22 16  5 27 44  0 12 41 47]
[45 43 18 15 21  5 22 48 17 36 37 33 46 49 13 12 11 32 40 16  0  7 35 47
  4 23  2 10  9 30 39 25 44 26  3 19 20 28 38 24] [42 34 29 27  1 14 31  8 41  6]
[24 21  8 40 32 14 42 47 45 18 39 19 28 17 12 33 29 41 46 44 37  0  6  2
 38 11 31  7 48 26 23 13 27 20  9  5 43 35 25  3] [34 10 30 22 36  4  1 49 16 15]
[23  9 21  2 39 12  5 32 47 18 33 31 29 13  8 48 26 35 44 22  6 27 10 41
 43 15 38 42 37 16 17 30 49 45  0  4 28 24 36  7] [ 3  1 40 25 46 20 11 34 19 14]
[34  5 39  2 47 31 29  4 30 15 48 14 32 40  7 45 20 18  9 10 41 27 11 35
 28 42 24 43 33 12 22  0 38 19 16 49 21  3 13 25] [23 36  6  1  8 44 17 26 46 37]
[39 15  2 36 45  8 21 22 25  9  5 44 49 35 19 48 26 47  3  7 13 38 43 

array([26,  4, 25,  1, 13,  5, 40, 28,  3, 43])

#### StratifiedShuffleSplit

In [42]:
# Ten labels: five objects of class 0 followed by five objects of class 1.
target = np.array([0] * 5 + [1] * 5)
print(target)

# Four random stratified splits, each holding out 20% of the objects.
sss = model_selection.StratifiedShuffleSplit(n_splits=4, test_size=0.2)
# The splits are derived from the labels, so a placeholder with one entry per
# label is passed as the data argument. Passing X itself fails: it holds 50
# elements while target holds 10, and split() rejects inputs whose numbers
# of samples differ (ValueError).
for train_indices, test_indices in sss.split(np.zeros(len(target)), target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[5 9 4 3 6 7 2 1] [8 0]
[0 6 1 2 3 8 9 7] [5 4]
[6 1 9 8 7 2 4 3] [0 5]
[5 7 0 4 8 3 9 2] [1 6]


#### Leave-One-Out

In [43]:
# Leave-one-out: every split keeps a single object as the test part.
loo = model_selection.LeaveOneOut()

for split in loo.split(X):
    # Each split is a (train indices, one-element test index array) pair.
    print(*split)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators