option to shuffle data in mknfolds (#1459)

* option to shuffle data in mknfolds

* removed possibility to run as stand alone test

* split function def in 2 lines for lint

1 parent b49b339, commit fb0fc0c580bbf4c3a2030aeabc644eeda103fcfa, committed by @jokari69 with terrytangyuan on Dec 22, 2016
Showing with 22 additions and 6 deletions.
  1. +13 −6 python-package/xgboost/training.py
  2. +9 −0 tests/python/test_basic.py
python-package/xgboost/training.py
@@ -222,17 +222,21 @@ def eval(self, iteration, feval):
         return self.bst.eval_set(self.watchlist, iteration, feval)
 
 
-def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, folds=None):
+def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
+            folds=None, shuffle=True):
     """
     Make an n-fold list of CVPack from random indices.
     """
     evals = list(evals)
     np.random.seed(seed)
     if stratified is False and folds is None:
-        randidx = np.random.permutation(dall.num_row())
-        kstep = int(len(randidx) / nfold)
-        idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
+        if shuffle is True:
+            idx = np.random.permutation(dall.num_row())
+        else:
+            idx = np.arange(dall.num_row())
+        kstep = int(len(idx) / nfold)
+        idset = [idx[(i * kstep): min(len(idx), (i + 1) * kstep)] for i in range(nfold)]
     elif folds is not None and isinstance(folds, list):
         idset = [x[1] for x in folds]
         nfold = len(idset)
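
The new branch above only changes how the row indices are ordered before they are sliced into folds. A minimal standalone sketch of that logic, for illustration only (make_fold_indices and num_row are illustrative names, not part of the xgboost API):

import numpy as np

def make_fold_indices(num_row, nfold, seed, shuffle=True):
    # Mirrors the index construction in mknfold: permute the row indices when
    # shuffling, otherwise keep them in their original order, then slice the
    # array into nfold contiguous chunks.
    np.random.seed(seed)
    if shuffle:
        idx = np.random.permutation(num_row)
    else:
        idx = np.arange(num_row)
    kstep = int(len(idx) / nfold)
    return [idx[(i * kstep): min(len(idx), (i + 1) * kstep)] for i in range(nfold)]

# With shuffle=False the folds are consecutive row blocks, e.g.
# make_fold_indices(10, 3, seed=0, shuffle=False)
# -> [array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]
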
@@ -289,7 +293,7 @@ def aggcv(rlist):
 
 def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
        metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,
        fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True,
-       seed=0, callbacks=None):
+       seed=0, callbacks=None, shuffle=True):
     # pylint: disable = invalid-name
     """Cross-validation with given parameters.
@@ -339,6 +343,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
         List of callback functions that are applied at end of each iteration.
         It is possible to use predefined callbacks by using xgb.callback module.
         Example: [xgb.callback.reset_learning_rate(custom_rates)]
+    shuffle : bool
+        Shuffle data before creating folds.
 
     Returns
     -------
@@ -367,7 +373,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
     params.pop("eval_metric", None)
 
     results = {}
-    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds)
+    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
+                      stratified, folds, shuffle)
 
     # setup callbacks
     callbacks = [] if callbacks is None else callbacks
tests/python/test_basic.py
@@ -241,3 +241,12 @@ def test_cv(self):
         cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
         assert isinstance(cv, dict)
         assert len(cv) == (4)
+
+    def test_cv_no_shuffle(self):
+        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
+
+        # return np.ndarray
+        cv = xgb.cv(params, dm, num_boost_round=10, shuffle=False, nfold=10, as_pandas=False)
+        assert isinstance(cv, dict)
+        assert len(cv) == (4)
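
For reference, a minimal usage sketch of the new option, assuming a local copy of the agaricus demo data (the path is illustrative, adjust it for your checkout):

import xgboost as xgb

# Illustrative path to the demo data shipped with xgboost.
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# With shuffle=False the rows keep their original order, so each fold is a
# contiguous block of the training data rather than a random subset.
res = xgb.cv(params, dtrain, num_boost_round=10, nfold=5, shuffle=False, as_pandas=False)
print(len(res))  # one entry per reported metric/statistic, as in the test above
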
