Skip to content

Commit

Permalink
Refactor fast-hist, add tests for some updaters. (#3836)
Browse files Browse the repository at this point in the history
Add unittest for prune.

Add unittest for refresh.

Refactor fast_hist.

* Remove fast_hist_param.
* Rename to quantile_hist.

Add unittests for QuantileHist.

* Refactor QuantileHist into .h and .cc file.
* Remove sync.h.
* Remove MGPU_mock test.

Rename fast hist method to quantile hist.
  • Loading branch information
trivialfis committed Nov 7, 2018
1 parent 2b045aa commit 19ee0a3
Show file tree
Hide file tree
Showing 30 changed files with 1,366 additions and 983 deletions.
2 changes: 1 addition & 1 deletion amalgamation/xgboost-all0.cc
Expand Up @@ -48,7 +48,7 @@
#include "../src/tree/tree_model.cc"
#include "../src/tree/tree_updater.cc"
#include "../src/tree/updater_colmaker.cc"
#include "../src/tree/updater_fast_hist.cc"
#include "../src/tree/updater_quantile_hist.cc"
#include "../src/tree/updater_prune.cc"
#include "../src/tree/updater_refresh.cc"
#include "../src/tree/updater_sync.cc"
Expand Down
1 change: 0 additions & 1 deletion src/cli_main.cc
Expand Up @@ -19,7 +19,6 @@
#include <cstdio>
#include <cstring>
#include <vector>
#include "./common/sync.h"
#include "./common/config.h"


Expand Down
9 changes: 5 additions & 4 deletions src/common/hist_util.cc
Expand Up @@ -4,10 +4,11 @@
* \brief Utilities to store histograms
* \author Philip Cho, Tianqi Chen
*/
#include <rabit/rabit.h>
#include <dmlc/omp.h>
#include <numeric>
#include <vector>
#include "./sync.h"

#include "./random.h"
#include "./column_matrix.h"
#include "./hist_util.h"
Expand Down Expand Up @@ -216,7 +217,7 @@ FindGroups(const std::vector<unsigned>& feature_list,
const std::vector<size_t>& feature_nnz,
const ColumnMatrix& colmat,
size_t nrow,
const FastHistParam& param) {
const tree::TrainParam& param) {
/* Goal: Bundle features together that have little or no "overlap", i.e.
only a few data points should have nonzero values for
member features.
Expand Down Expand Up @@ -278,7 +279,7 @@ FindGroups(const std::vector<unsigned>& feature_list,
inline std::vector<std::vector<unsigned>>
FastFeatureGrouping(const GHistIndexMatrix& gmat,
const ColumnMatrix& colmat,
const FastHistParam& param) {
const tree::TrainParam& param) {
const size_t nrow = gmat.row_ptr.size() - 1;
const size_t nfeature = gmat.cut.row_ptr.size() - 1;

Expand Down Expand Up @@ -332,7 +333,7 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,

void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
const ColumnMatrix& colmat,
const FastHistParam& param) {
const tree::TrainParam& param) {
cut_ = &gmat.cut;

const size_t nrow = gmat.row_ptr.size() - 1;
Expand Down
5 changes: 1 addition & 4 deletions src/common/hist_util.h
Expand Up @@ -11,16 +11,13 @@
#include <limits>
#include <vector>
#include "row_set.h"
#include "../tree/fast_hist_param.h"
#include "../tree/param.h"
#include "./quantile.h"

namespace xgboost {

namespace common {

using tree::FastHistParam;

/*! \brief sums of gradient statistics corresponding to a histogram bin */
struct GHistEntry {
/*! \brief sum of first-order gradient statistics */
Expand Down Expand Up @@ -145,7 +142,7 @@ class GHistIndexBlockMatrix {
public:
void Init(const GHistIndexMatrix& gmat,
const ColumnMatrix& colmat,
const FastHistParam& param);
const tree::TrainParam& param);

inline GHistIndexBlock operator[](size_t i) const {
return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
Expand Down
2 changes: 1 addition & 1 deletion src/common/io.h
Expand Up @@ -9,9 +9,9 @@
#define XGBOOST_COMMON_IO_H_

#include <dmlc/io.h>
#include <rabit/rabit.h>
#include <string>
#include <cstring>
#include "./sync.h"

namespace xgboost {
namespace common {
Expand Down
13 changes: 0 additions & 13 deletions src/common/sync.h

This file was deleted.

4 changes: 2 additions & 2 deletions src/learner.cc
Expand Up @@ -211,8 +211,8 @@ class LearnerImpl : public Learner {
break;
case TreeMethod::kHist:
LOG(CONSOLE) << "Tree method is selected to be 'hist', which uses a "
"single updater grow_fast_histmaker.";
cfg_["updater"] = "grow_fast_histmaker";
"single updater grow_quantile_histmaker.";
cfg_["updater"] = "grow_quantile_histmaker";
break;
case TreeMethod::kGPUExact:
this->AssertGPUSupport();
Expand Down
2 changes: 1 addition & 1 deletion src/logging.cc
Expand Up @@ -4,9 +4,9 @@
* \brief Implementation of loggers.
* \author Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/logging.h>
#include <iostream>
#include "./common/sync.h"

#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
// Override logging mechanism for non-R interfaces
Expand Down
2 changes: 1 addition & 1 deletion src/metric/elementwise_metric.cc
Expand Up @@ -4,11 +4,11 @@
* \brief evaluation metrics for elementwise binary or regression.
* \author Kailong Chen, Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/metric.h>
#include <dmlc/registry.h>
#include <cmath>
#include "../common/math.h"
#include "../common/sync.h"

namespace xgboost {
namespace metric {
Expand Down
2 changes: 1 addition & 1 deletion src/metric/multiclass_metric.cc
Expand Up @@ -4,9 +4,9 @@
* \brief evaluation metrics for multiclass classification.
* \author Kailong Chen, Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/metric.h>
#include <cmath>
#include "../common/sync.h"
#include "../common/math.h"

namespace xgboost {
Expand Down
2 changes: 1 addition & 1 deletion src/metric/rank_metric.cc
Expand Up @@ -4,10 +4,10 @@
* \brief prediction rank based metrics.
* \author Kailong Chen, Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/metric.h>
#include <dmlc/registry.h>
#include <cmath>
#include "../common/sync.h"
#include "../common/math.h"

namespace xgboost {
Expand Down
54 changes: 0 additions & 54 deletions src/tree/fast_hist_param.h

This file was deleted.

37 changes: 36 additions & 1 deletion src/tree/param.h
Expand Up @@ -81,6 +81,23 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
int gpu_batch_nrows;
// the criteria to use for ranking splits
std::string split_evaluator;

// ------ From cpu quantile histogram -------.
// percentage threshold for treating a feature as sparse
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
double sparse_threshold;
// use feature grouping? (default 0 = disabled; any value > 0 enables it)
int enable_feature_grouping;
// when grouping features, how many "conflicts" to allow.
// conflict is when an instance has nonzero values for two or more features
// default is 0, meaning features should be strictly complementary
double max_conflict_rate;
// when grouping features, how much effort to expend to prevent singleton groups
// we'll try to insert each feature into existing groups before creating a new group
// for that feature; to save time, only up to (max_search_group) of existing groups
// will be considered. If set to zero, ALL existing groups will be examined
unsigned max_search_group;

// declare the parameters
DMLC_DECLARE_PARAMETER(TrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
Expand Down Expand Up @@ -196,6 +213,24 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
DMLC_DECLARE_FIELD(split_evaluator)
.set_default("elastic_net,monotonic,interaction")
.describe("The criteria to use for ranking splits");

// ------ From cpu quantile histogram -------.
DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
.describe("percentage threshold for treating a feature as sparse");
DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
.describe("if >0, enable feature grouping to ameliorate work imbalance "
"among worker threads");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
.describe("when grouping features, how many \"conflicts\" to allow."
"conflict is when an instance has nonzero values for two or more features."
"default is 0, meaning features should be strictly complementary.");
DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
.describe("when grouping features, how much effort to expend to prevent "
"singleton groups. We'll try to insert each feature into existing "
"groups before creating a new group for that feature; to save time, "
"only up to (max_search_group) of existing groups will be "
"considered. If set to zero, ALL existing groups will be examined.");

// add alias of parameters
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
Expand Down Expand Up @@ -518,7 +553,7 @@ struct SplitEntry {
this->loss_chg = new_loss_chg;
if (default_left) {
split_index |= (1U << 31);
}
}
this->sindex = split_index;
this->split_value = new_split_value;
return true;
Expand Down
2 changes: 1 addition & 1 deletion src/tree/tree_updater.cc
Expand Up @@ -31,7 +31,7 @@ DMLC_REGISTRY_LINK_TAG(updater_colmaker);
DMLC_REGISTRY_LINK_TAG(updater_skmaker);
DMLC_REGISTRY_LINK_TAG(updater_refresh);
DMLC_REGISTRY_LINK_TAG(updater_prune);
DMLC_REGISTRY_LINK_TAG(updater_fast_hist);
DMLC_REGISTRY_LINK_TAG(updater_quantile_hist);
DMLC_REGISTRY_LINK_TAG(updater_histmaker);
DMLC_REGISTRY_LINK_TAG(updater_sync);
#ifdef XGBOOST_USE_CUDA
Expand Down
4 changes: 3 additions & 1 deletion src/tree/updater_basemaker-inl.h
Expand Up @@ -7,15 +7,17 @@
#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_
#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_

#include <rabit/rabit.h>

#include <xgboost/base.h>
#include <xgboost/tree_updater.h>
#include <vector>
#include <algorithm>
#include <string>
#include <limits>
#include <utility>

#include "./param.h"
#include "../common/sync.h"
#include "../common/io.h"
#include "../common/random.h"
#include "../common/quantile.h"
Expand Down
3 changes: 2 additions & 1 deletion src/tree/updater_colmaker.cc
Expand Up @@ -4,15 +4,16 @@
* \brief use columnwise update to construct a tree
* \author Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/tree_updater.h>
#include <memory>
#include <vector>
#include <cmath>
#include <algorithm>

#include "./param.h"
#include "../common/random.h"
#include "../common/bitmap.h"
#include "../common/sync.h"
#include "split_evaluator.h"

namespace xgboost {
Expand Down

0 comments on commit 19ee0a3

Please sign in to comment.