
Merge branch 'master' of https://github.com/dmlc/xgboost into optimization_part_applysplit
ShvetsKS committed Jun 26, 2022
2 parents 7780172 + 0725fd6 commit 7b7ca83
Showing 197 changed files with 2,981 additions and 3,185 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/r_tests.yml
@@ -13,7 +13,7 @@ jobs:
strategy:
matrix:
config:
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
- {os: ubuntu-latest, r: 'release'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
@@ -49,8 +49,9 @@ jobs:
- name: Run lintr
run: |
cd R-package
R.exe CMD INSTALL .
Rscript.exe tests/helper_scripts/run_lint.R
R CMD INSTALL .
# Disable lintr errors for now: https://github.com/dmlc/xgboost/issues/8012
Rscript tests/helper_scripts/run_lint.R || true
test-with-R:
runs-on: ${{ matrix.config.os }}
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -159,6 +159,11 @@ if (USE_OPENMP)
endif (APPLE)
find_package(OpenMP REQUIRED)
endif (USE_OPENMP)
#Add for IBM i
if (${CMAKE_SYSTEM_NAME} MATCHES "OS400")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> -X64 qc <TARGET> <OBJECTS>")
endif()

if (USE_NCCL)
find_package(Nccl REQUIRED)
@@ -201,6 +206,10 @@ endif (JVM_BINDINGS)
# Plugin
add_subdirectory(${xgboost_SOURCE_DIR}/plugin)

if (PLUGIN_RMM)
find_package(rmm REQUIRED)
endif (PLUGIN_RMM)

#-- library
if (BUILD_STATIC_LIB)
add_library(xgboost STATIC)
4 changes: 2 additions & 2 deletions Jenkinsfile
@@ -397,7 +397,7 @@ def TestCppGPU(args) {
node(nodeReq) {
unstash name: "xgboost_cpp_tests_cuda${artifact_cuda_version}"
unstash name: 'srcs'
echo "Test C++, CUDA ${args.host_cuda_version}"
echo "Test C++, CUDA ${args.host_cuda_version}, rmm: ${args.test_rmm}"
def container_type = "gpu"
def docker_binary = "nvidia-docker"
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
@@ -410,7 +410,7 @@ def TestCppGPU(args) {
docker_binary = "nvidia-docker"
docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --use-rmm-pool --gtest_filter=-*DeathTest.*"
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --use-rmm-pool"
"""
}
deleteDir()
2 changes: 1 addition & 1 deletion R-package/tests/helper_scripts/run_lint.R
@@ -13,7 +13,7 @@ my_linters <- list(
object_usage_linter = lintr::object_usage_linter,
object_length_linter = lintr::object_length_linter,
open_curly_linter = lintr::open_curly_linter,
semicolon = lintr::semicolon_terminator_linter,
semicolon = lintr::semicolon_terminator_linter(semicolon = c("compound", "trailing")),
seq = lintr::seq_linter,
spaces_inside_linter = lintr::spaces_inside_linter,
spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter,
1 change: 0 additions & 1 deletion amalgamation/xgboost-all0.cc
@@ -55,7 +55,6 @@
#include "../src/tree/tree_updater.cc"
#include "../src/tree/updater_approx.cc"
#include "../src/tree/updater_colmaker.cc"
#include "../src/tree/updater_histmaker.cc"
#include "../src/tree/updater_prune.cc"
#include "../src/tree/updater_quantile_hist.cc"
#include "../src/tree/updater_refresh.cc"
52 changes: 43 additions & 9 deletions cmake/Utils.cmake
@@ -144,6 +144,15 @@ function(xgboost_set_cuda_flags target)
set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")

if (FORCE_COLORED_OUTPUT)
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
target_compile_options(${target} PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fdiagnostics-color=always>)
endif()
endif (FORCE_COLORED_OUTPUT)

if (USE_DEVICE_DEBUG)
target_compile_options(${target} PRIVATE
$<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>)
@@ -169,10 +178,17 @@ function(xgboost_set_cuda_flags target)
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
endif (MSVC)

set_target_properties(${target} PROPERTIES
CUDA_STANDARD 14
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
if (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
else ()
set_target_properties(${target} PROPERTIES
CUDA_STANDARD 14
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
endif (PLUGIN_RMM)
endfunction(xgboost_set_cuda_flags)

macro(xgboost_link_nccl target)
@@ -189,10 +205,18 @@ endmacro(xgboost_link_nccl)

# compile options
macro(xgboost_target_properties target)
set_target_properties(${target} PROPERTIES
CXX_STANDARD 14
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
if (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
else ()
set_target_properties(${target} PROPERTIES
CXX_STANDARD 14
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
endif (PLUGIN_RMM)

if (HIDE_CXX_SYMBOLS)
#-- Hide all C++ symbols
set_target_properties(${target} PROPERTIES
@@ -204,7 +228,9 @@ macro(xgboost_target_properties target)

if (ENABLE_ALL_WARNINGS)
target_compile_options(${target} PUBLIC
$<IF:$<COMPILE_LANGUAGE:CUDA>,-Xcompiler=-Wall -Xcompiler=-Wextra,-Wall -Wextra>
$<IF:$<COMPILE_LANGUAGE:CUDA>,
-Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined,
-Wall -Wextra -Wno-expansion-to-defined>
)
endif(ENABLE_ALL_WARNINGS)

@@ -247,6 +273,10 @@ macro(xgboost_target_defs target)
PRIVATE
-DXGBOOST_BUILTIN_PREFETCH_PRESENT=1)
endif (XGBOOST_BUILTIN_PREFETCH_PRESENT)

if (PLUGIN_RMM)
target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
endif (PLUGIN_RMM)
endmacro(xgboost_target_defs)

# handles dependencies
@@ -269,6 +299,10 @@ macro(xgboost_target_link_libraries target)
xgboost_set_cuda_flags(${target})
endif (USE_CUDA)

if (PLUGIN_RMM)
target_link_libraries(${target} PRIVATE rmm::rmm)
endif (PLUGIN_RMM)

if (USE_NCCL)
xgboost_link_nccl(${target})
endif (USE_NCCL)
11 changes: 11 additions & 0 deletions demo/nvflare/README.md
@@ -3,6 +3,8 @@
This directory contains a demo of Federated Learning using
[NVFlare](https://nvidia.github.io/NVFlare/).

## Training with CPU only

To run the demo, first build XGBoost with the federated learning plugin enabled (see the
[README](../../plugin/federated/README.md)).

@@ -53,3 +55,12 @@ Finally, shutdown everything from the admin CLI:
shutdown client
shutdown server
```

## Training with GPUs

To run the Federated Learning demo with GPUs, make sure your machine has at least 2 GPUs.
Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
turned off (see the [README](../../plugin/federated/README.md)).

Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
above.
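
For reference, the parameter change driven by `use_gpus` amounts to selecting the GPU histogram tree method and pinning each federated worker to the GPU matching its rank. Below is a minimal sketch of that logic in Python; it assumes the federated communicator has already assigned this worker a `rank` and that `dtrain` and `watchlist` are prepared as in the CPU demo:

```python
import xgboost as xgb


def make_params(use_gpus: bool, rank: int) -> dict:
    # Baseline parameters, identical to the CPU-only demo.
    params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
    if use_gpus:
        # GPU histogram tree method; each worker trains on the GPU matching its rank.
        params["tree_method"] = "gpu_hist"
        params["gpu_id"] = rank
    return params


# Example: the worker with rank 1 trains on GPU 1.
# bst = xgb.train(make_params(use_gpus=True, rank=1), dtrain,
#                 num_boost_round=20, evals=watchlist)
```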
3 changes: 2 additions & 1 deletion demo/nvflare/config/config_fed_client.json
@@ -12,7 +12,8 @@
"world_size": 2,
"server_cert_path": "server-cert.pem",
"client_key_path": "client-key.pem",
"client_cert_path": "client-cert.pem"
"client_cert_path": "client-cert.pem",
"use_gpus": "false"
}
}
}
7 changes: 6 additions & 1 deletion demo/nvflare/custom/trainer.py
@@ -16,7 +16,7 @@ class SupportedTasks(object):

class XGBoostTrainer(Executor):
def __init__(self, server_address: str, world_size: int, server_cert_path: str,
client_key_path: str, client_cert_path: str):
client_key_path: str, client_cert_path: str, use_gpus: bool):
"""Trainer for federated XGBoost.
Args:
@@ -32,6 +32,7 @@ def __init__(self, server_address: str, world_size: int, server_cert_path: str,
self._server_cert_path = server_cert_path
self._client_key_path = client_key_path
self._client_cert_path = client_cert_path
self._use_gpus = use_gpus

def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
abort_signal: Signal) -> Shareable:
@@ -66,6 +67,10 @@ def _do_training(self, fl_ctx: FLContext):

# Specify parameters via map, definition are same as c++ version
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
if self._use_gpus:
self.log_info(fl_ctx, f'Training with GPU {rank}')
param['tree_method'] = 'gpu_hist'
param['gpu_id'] = rank

# Specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
4 changes: 2 additions & 2 deletions doc/build.rst
@@ -136,9 +136,9 @@ From the command line on Linux starting from the XGBoost directory:

To speed up compilation, the compute version specific to your GPU could be passed to cmake as, e.g., ``-DGPU_COMPUTE_VER=50``. A quick explanation and numbers for some architectures can be found `in this page <https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/>`_.

.. note:: Enabling distributed GPU training
.. note:: Faster distributed GPU training with NCCL

By default, distributed GPU training is disabled and only a single GPU will be used. To enable distributed GPU training, set the option ``USE_NCCL=ON``. Distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **distributed GPU training is available only for Linux**.
By default, distributed GPU training is enabled and uses Rabit for communication. For faster training, set the option ``USE_NCCL=ON``. Faster distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **faster distributed GPU training is available only for Linux**.

.. code-block:: bash
23 changes: 23 additions & 0 deletions doc/contrib/ci.rst
@@ -37,3 +37,26 @@ machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes c
task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
``setup.py bdist_wheel``. Since XGBoost has a native library component, ``setup.py`` contains
glue code to call CMake and a C++ compiler to build the native library on the fly.)

*******************************
Reproducing errors from Jenkins
*******************************

It is often useful to reproduce the particular testing environment from our Jenkins server for
the purpose of troubleshooting a failing test. We use Docker containers heavily to package
the testing environment, so you can use Docker to reproduce it on your own machine.

1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
The runtime lets you access NVIDIA GPUs inside a Docker container.
3. In a build log, all tests are invoked via the wrapper script ``tests/ci_build/ci_build.sh``.
Identify the test you'd like to reproduce locally, and note how the wrapper script was invoked for that test.
The invocation should look like this:

.. code-block:: bash

  CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' tests/ci_build/ci_build.sh gpu nvidia-docker \
    --build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/test_python.sh mgpu --use-rmm-pool
4. You can now run the same command on your own machine. The wrapper script will automatically download and
set up the correct Docker container(s).
10 changes: 10 additions & 0 deletions doc/jvm/xgboost4j_spark_tutorial.rst
@@ -345,6 +345,16 @@ and then loading the model in another session:
val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath)
xgbClassificationModel2.transform(xgbInput)
.. note::

Besides dumping the model in raw format, users are able to save the model in JSON or UBJ format from ``version 2.0.0+``.

.. code-block:: scala

  val xgbClassificationModelPath = "/tmp/xgbClassificationModel"
  xgbClassificationModel.write.overwrite().option("format", "json").save(xgbClassificationModelPath)
With regard to ML pipeline save and load, please refer to the next section.

Interact with Other Bindings of XGBoost
31 changes: 18 additions & 13 deletions doc/parameter.rst
@@ -151,15 +151,6 @@ Parameters for Tree Booster
- ``hist``: Faster histogram optimized approximate greedy algorithm.
- ``gpu_hist``: GPU implementation of ``hist`` algorithm.

* ``sketch_eps`` [default=0.03]

- Only used for ``updater=grow_local_histmaker``.
- This roughly translates into ``O(1 / sketch_eps)`` number of bins.
Compared to directly select number of bins, this comes with theoretical guarantee with sketch accuracy.
- Usually user does not have to tune this.
But consider setting to a lower number for more accurate enumeration of split candidates.
- range: (0, 1)

* ``scale_pos_weight`` [default=1]

- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See :doc:`Parameters Tuning </tutorials/param_tuning>` for more discussion. Also, see Higgs Kaggle competition demo for examples: `R <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R>`_, `py1 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py>`_, `py2 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py>`_, `py3 <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py>`_.
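
  For example, with a 0/1-encoded label vector, the suggested value can be computed directly (a minimal illustration, not taken from the demos above):

  .. code-block:: python

    import numpy as np

    y = np.array([0, 0, 0, 0, 1])  # toy labels: 4 negatives, 1 positive
    scale_pos_weight = float((y == 0).sum()) / float((y == 1).sum())  # 4.0 here
    params = {"objective": "binary:logistic", "scale_pos_weight": scale_pos_weight}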
@@ -170,7 +161,6 @@ Parameters for Tree Booster

- ``grow_colmaker``: non-distributed column-based construction of trees.
- ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
- ``grow_local_histmaker``: based on local histogram counting.
- ``grow_quantile_histmaker``: Grow tree using quantized histogram.
- ``grow_gpu_hist``: Grow tree with GPU.
- ``sync``: synchronizes trees in all distributed nodes.
@@ -235,21 +225,36 @@ Parameters for Tree Booster
list is a group of indices of features that are allowed to interact with each other.
See :doc:`/tutorials/feature_interaction_constraint` for more information.

Additional parameters for ``hist``, ``gpu_hist`` and ``approx`` tree method
===========================================================================
.. _cat-param:

Parameters for Categorical Feature
==================================

These parameters are only used for training with categorical data. See
:doc:`/tutorials/categorical` for more information. A short usage sketch follows the
parameter list below.

* ``max_cat_to_onehot``

.. versionadded:: 1.6

.. note:: The support for this parameter is experimental.
.. note:: This parameter is experimental. ``exact`` tree method is not supported yet.

- A threshold for deciding whether XGBoost should use one-hot encoding based splits for
categorical data. When the number of categories is less than the threshold, one-hot
encoding is chosen; otherwise the categories will be partitioned into children nodes.
Only relevant for regression and binary classification. Also, the ``exact`` tree method
is not supported.

* ``max_cat_threshold``

.. versionadded:: 2.0

.. note:: This parameter is experimental. ``exact`` and ``gpu_hist`` tree methods are
not supported yet.

- Maximum number of categories considered for each split. Used only by partition-based
splits for preventing over-fitting.
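
As a usage sketch covering both parameters (the values 16 and 64 are illustrative, not
defaults, and ``max_cat_threshold`` additionally requires XGBoost 2.0+):

.. code-block:: python

  import numpy as np
  import pandas as pd
  import xgboost as xgb

  rng = np.random.default_rng(0)
  # Toy data with one categorical and one numerical feature.
  X = pd.DataFrame({
      "cat": pd.Categorical(rng.choice(["a", "b", "c", "d"], size=256)),
      "num": rng.normal(size=256),
  })
  y = rng.integers(0, 2, size=256)

  dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
  params = {
      "tree_method": "hist",
      "objective": "binary:logistic",
      "max_cat_to_onehot": 16,   # one-hot splits for features with fewer than 16 categories
      "max_cat_threshold": 64,   # cap on categories considered per partition-based split
  }
  bst = xgb.train(params, dtrain, num_boost_round=10)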

Additional parameters for Dart Booster (``booster=dart``)
=========================================================
