# Cross-validation behavior in scikit-learn

## sklearn.model_selection의 CV함수 특징
* 분할기(splitter) 객체이며, `split()` 메서드가 generator (생성함수)를 반환함. 단 train_test_split은 데이터셋을 직접 분할함
* tr/ts 인덱스를 생성함 
* CV 분할기 객체를 각종 알고리듬 함수의 `cv` 인자로 직접 제공하여 튜닝에 사용함 (예: `cross_val_score(..., cv=kf)`)

## KFold : K-fold CV

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
kf = KFold(n_splits=2)  # builds a KFold splitter object; kf.split(...) is what yields the train/test index pairs

In [41]:
kf

KFold(n_splits=2, random_state=None, shuffle=False)

In [38]:
type(kf)

sklearn.model_selection._split.KFold

In [39]:
isinstance(kf, KFold)  # kf is an instance of KFold

True

In [44]:
# kf was created with the default shuffle=False, so each fold is a
# contiguous, in-order run of indices.
for tr, ts in kf.split(range(10)):
    # tr / ts are the train / test index arrays of the current split.
    print('{} {}'.format(tr, ts))

[5 6 7 8 9] [0 1 2 3 4]
[0 1 2 3 4] [5 6 7 8 9]


In [45]:
# shuffle=True: the sample indices are shuffled before being divided into
# folds, so the folds are no longer contiguous index ranges.
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run -- confirm before relying on the recorded output.
kf = KFold(n_splits=2, shuffle=True)
for tr, ts in kf.split(range(10)):   # shuffle=True here, unlike the previous cell
    print('%s %s' % (tr, ts))

[0 1 2 5 6] [3 4 7 8 9]
[3 4 7 8 9] [0 1 2 5 6]


In [47]:
# Five-row demo frame: integer column 'x' (1..5) and letter column 'y' (a..e).
dfr = pd.DataFrame({'x': list(range(1, 6)), 'y': list('abcde')})
dfr

Unnamed: 0,x,y
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e


In [48]:
# Splitting a DataFrame yields positional row indices, just as with range().
# tr and ts stay bound to the last split's indices once the loop finishes.
for tr, ts in kf.split(dfr):
    print(' '.join(map(str, (tr, ts))))

[0 4] [1 2 3]
[1 2 3] [0 4]


In [49]:
tr

array([1, 2, 3])

In [50]:
ts

array([0, 4])