Skip to content

Commit

Permalink
option to shuffle data in mknfolds (#1459)
Browse files Browse the repository at this point in the history
* option to shuffle data in mknfolds

* removed possibility to run as stand alone test

* split function def in 2 lines for lint

* option to shuffle data in mknfolds

* removed possibility to run as stand alone test

* split function def in 2 lines for lint
  • Loading branch information
jokari69 authored and terrytangyuan committed Dec 22, 2016
1 parent b49b339 commit fb0fc0c
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 6 deletions.
19 changes: 13 additions & 6 deletions python-package/xgboost/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,17 +222,21 @@ def eval(self, iteration, feval):
return self.bst.eval_set(self.watchlist, iteration, feval)


def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, folds=None):
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
folds=None, shuffle=True):
"""
Make an n-fold list of CVPack from random indices.
"""
evals = list(evals)
np.random.seed(seed)

if stratified is False and folds is None:
randidx = np.random.permutation(dall.num_row())
kstep = int(len(randidx) / nfold)
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
if shuffle is True:
idx = np.random.permutation(dall.num_row())
else:
idx = np.arange(dall.num_row())
kstep = int(len(idx) / nfold)
idset = [idx[(i * kstep): min(len(idx), (i + 1) * kstep)] for i in range(nfold)]
elif folds is not None and isinstance(folds, list):
idset = [x[1] for x in folds]
nfold = len(idset)
Expand Down Expand Up @@ -289,7 +293,7 @@ def aggcv(rlist):
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,
fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True,
seed=0, callbacks=None):
seed=0, callbacks=None, shuffle=True):
# pylint: disable = invalid-name
"""Cross-validation with given parameters.
Expand Down Expand Up @@ -339,6 +343,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
List of callback functions that are applied at end of each iteration.
It is possible to use predefined callbacks by using xgb.callback module.
Example: [xgb.callback.reset_learning_rate(custom_rates)]
shuffle : bool
Shuffle data before creating folds.
Returns
-------
Expand Down Expand Up @@ -367,7 +373,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
params.pop("eval_metric", None)

results = {}
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds)
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
stratified, folds, shuffle)

# setup callbacks
callbacks = [] if callbacks is None else callbacks
Expand Down
9 changes: 9 additions & 0 deletions tests/python/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,12 @@ def test_cv(self):
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
assert isinstance(cv, dict)
assert len(cv) == (4)

def test_cv_no_shuffle(self):
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# return np.ndarray
cv = xgb.cv(params, dm, num_boost_round=10, shuffle=False, nfold=10, as_pandas=False)
assert isinstance(cv, dict)
assert len(cv) == (4)

0 comments on commit fb0fc0c

Please sign in to comment.