diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml
index 1c4e98010310..de8d7b25bac7 100644
--- a/.github/workflows/i386.yml
+++ b/.github/workflows/i386.yml
@@ -19,7 +19,7 @@ jobs:
ports:
- 5000:5000
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- name: Set up Docker Buildx
diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index 9ef314ca5b0b..c2b6ca6a7483 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -16,26 +16,23 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [windows-latest, ubuntu-latest, macos-11]
+ os: [windows-latest, ubuntu-latest, macos-13]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/setup-micromamba@422500192359a097648154e8db4e39bdb6c6eed7 # v1.8.1
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- micromamba-version: '1.5.6-0'
- environment-name: jvm_tests
- create-args: >-
- python=3.10
- awscli
- cache-downloads: true
- cache-environment: true
- init-shell: bash powershell
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: jvm_tests
+ environment-file: tests/ci_build/conda_env/jvm_tests.yml
+ use-mamba: true
- name: Cache Maven packages
- uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0
+ uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
@@ -61,7 +58,7 @@ jobs:
id: extract_branch
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
- (matrix.os == 'windows-latest' || matrix.os == 'macos-11')
+ (matrix.os == 'windows-latest' || matrix.os == 'macos-13')
- name: Publish artifact xgboost4j.dll to S3
run: |
@@ -85,7 +82,7 @@ jobs:
python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
- matrix.os == 'macos-11'
+ matrix.os == 'macos-13'
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 4755f9aaaad8..f5ecb94f68b1 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -21,9 +21,9 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [macos-11]
+ os: [macos-12]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- name: Install system packages
@@ -33,7 +33,7 @@ jobs:
run: |
mkdir build
cd build
- cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON
+ cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo
ninja -v
- name: Run gtest binary
run: |
@@ -49,7 +49,7 @@ jobs:
matrix:
os: [ubuntu-latest]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- name: Install system packages
@@ -76,16 +76,16 @@ jobs:
os: [ubuntu-latest]
python-version: ["3.8"]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/provision-with-micromamba@3c96c0c27676490c63c18bc81f5c51895ac3e0e6 # v16
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- cache-downloads: true
- cache-env: true
- environment-name: linux_sycl_test
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: linux_sycl_test
environment-file: tests/ci_build/conda_env/linux_sycl_test.yml
-
+ use-mamba: true
- name: Display Conda env
run: |
conda info
@@ -118,15 +118,16 @@ jobs:
os: ["ubuntu-latest"]
python-version: ["3.8"]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/provision-with-micromamba@3c96c0c27676490c63c18bc81f5c51895ac3e0e6 # v16
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- cache-downloads: true
- cache-env: true
- environment-name: cpp_test
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: cpp_test
environment-file: tests/ci_build/conda_env/cpp_test.yml
+ use-mamba: true
- name: Display Conda env
run: |
conda info
@@ -155,8 +156,9 @@ jobs:
- name: Build and install XGBoost shared library
run: |
cd build
- cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
+ cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON
ninja -v install
+ ./testxgboost
cd -
- name: Build and run C API demo with shared
run: |
@@ -175,10 +177,10 @@ jobs:
runs-on: ubuntu-latest
name: Code linting for C++
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+ - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
with:
python-version: "3.8"
architecture: 'x64'
diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index f0cad6382d87..e6eec86c8606 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -21,15 +21,16 @@ jobs:
matrix:
os: [ubuntu-latest]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/provision-with-micromamba@3c96c0c27676490c63c18bc81f5c51895ac3e0e6 # v16
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- cache-downloads: true
- cache-env: true
- environment-name: python_lint
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: python_lint
environment-file: tests/ci_build/conda_env/python_lint.yml
+ use-mamba: true
- name: Display Conda env
run: |
conda info
@@ -52,15 +53,16 @@ jobs:
matrix:
os: [ubuntu-latest]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/provision-with-micromamba@3c96c0c27676490c63c18bc81f5c51895ac3e0e6 # v16
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- cache-downloads: true
- cache-env: true
- environment-name: sdist_test
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: sdist_test
environment-file: tests/ci_build/conda_env/sdist_test.yml
+ use-mamba: true
- name: Display Conda env
run: |
conda info
@@ -81,14 +83,14 @@ jobs:
name: Test installing XGBoost Python source package on ${{ matrix.os }}
strategy:
matrix:
- os: [macos-11, windows-latest]
+ os: [macos-13, windows-latest]
python-version: ["3.8"]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- name: Install osx system dependencies
- if: matrix.os == 'macos-11'
+ if: matrix.os == 'macos-13'
run: |
brew install ninja libomp
- uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
@@ -119,19 +121,20 @@ jobs:
strategy:
matrix:
config:
- - {os: macos-11}
+ - {os: macos-13}
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/provision-with-micromamba@3c96c0c27676490c63c18bc81f5c51895ac3e0e6 # v16
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- cache-downloads: true
- cache-env: true
- environment-name: macos_test
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: macos_cpu_test
environment-file: tests/ci_build/conda_env/macos_cpu_test.yml
+ use-mamba: true
- name: Display Conda env
run: |
@@ -174,7 +177,7 @@ jobs:
- {os: windows-latest, python-version: '3.8'}
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
@@ -218,16 +221,17 @@ jobs:
- {os: ubuntu-latest, python-version: "3.8"}
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/provision-with-micromamba@3c96c0c27676490c63c18bc81f5c51895ac3e0e6 # v16
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- cache-downloads: true
- cache-env: true
- environment-name: linux_cpu_test
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: linux_cpu_test
environment-file: tests/ci_build/conda_env/linux_cpu_test.yml
+ use-mamba: true
- name: Display Conda env
run: |
@@ -270,16 +274,17 @@ jobs:
- {os: ubuntu-latest, python-version: "3.8"}
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: mamba-org/provision-with-micromamba@3c96c0c27676490c63c18bc81f5c51895ac3e0e6 # v16
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
- cache-downloads: true
- cache-env: true
- environment-name: linux_sycl_test
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: linux_sycl_test
environment-file: tests/ci_build/conda_env/linux_sycl_test.yml
+ use-mamba: true
- name: Display Conda env
run: |
@@ -309,12 +314,12 @@ jobs:
os: [ubuntu-latest]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- name: Set up Python 3.8
- uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+ uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
with:
python-version: 3.8
diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml
index 090b1f830213..f3e7d5817479 100644
--- a/.github/workflows/python_wheels.yml
+++ b/.github/workflows/python_wheels.yml
@@ -25,10 +25,10 @@ jobs:
- os: macos-14
platform_id: macosx_arm64
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- - uses: conda-incubator/setup-miniconda@v3.0.4
+ - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
with:
miniforge-variant: Mambaforge
miniforge-version: latest
diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml
index 887470190035..4b506927e06c 100644
--- a/.github/workflows/r_nold.yml
+++ b/.github/workflows/r_nold.yml
@@ -27,7 +27,7 @@ jobs:
run: |
apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index f3d83b823aff..9fb9d4684ad1 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -25,7 +25,7 @@ jobs:
RSPM: ${{ matrix.config.rspm }}
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
@@ -34,7 +34,7 @@ jobs:
r-version: ${{ matrix.config.r }}
- name: Cache R packages
- uses: actions/cache@937d24475381cd9c75ae6db12cb4e79714b926ed # v3.0.11
+ uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
@@ -69,7 +69,7 @@ jobs:
sudo apt update
sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev
if: matrix.config.os == 'ubuntu-latest'
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
@@ -78,13 +78,13 @@ jobs:
r-version: ${{ matrix.config.r }}
- name: Cache R packages
- uses: actions/cache@937d24475381cd9c75ae6db12cb4e79714b926ed # v3.0.11
+ uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
- - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+ - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
with:
python-version: "3.8"
architecture: 'x64'
@@ -123,7 +123,7 @@ jobs:
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 4651e2ac0dff..222700da4a58 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -22,7 +22,7 @@ jobs:
steps:
- name: "Checkout code"
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
persist-credentials: false
@@ -41,7 +41,7 @@ jobs:
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
- uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
+ uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
with:
name: SARIF file
path: results.sarif
diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml
index 9f9c85f62e28..9490926cfcaf 100644
--- a/.github/workflows/update_rapids.yml
+++ b/.github/workflows/update_rapids.yml
@@ -25,7 +25,7 @@ jobs:
name: Check latest RAPIDS
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
with:
submodules: 'true'
- name: Check latest RAPIDS and update conftest.sh
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index fb7c8dbe69e7..e7fa372d89f9 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -12,7 +12,7 @@ submodules:
build:
os: ubuntu-22.04
tools:
- python: "3.8"
+ python: "3.10"
apt_packages:
- graphviz
- cmake
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c69b0d2a3dc7..e718d88ab1c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ if(PLUGIN_SYCL)
string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()
-project(xgboost LANGUAGES CXX C VERSION 2.1.0)
+project(xgboost LANGUAGES CXX C VERSION 2.2.0)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
@@ -346,7 +346,6 @@ if(BUILD_DEPRECATED_CLI)
PRIVATE
${xgboost_SOURCE_DIR}/include
${xgboost_SOURCE_DIR}/dmlc-core/include
- ${xgboost_SOURCE_DIR}/rabit/include
)
set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost)
xgboost_target_properties(runxgboost)
diff --git a/R-package/CMakeLists.txt b/R-package/CMakeLists.txt
index 37c5dbf4c1ed..75c3e2d77449 100644
--- a/R-package/CMakeLists.txt
+++ b/R-package/CMakeLists.txt
@@ -29,7 +29,6 @@ target_compile_definitions(
-DDMLC_LOG_BEFORE_THROW=0
-DDMLC_DISABLE_STDIN=1
-DDMLC_LOG_CUSTOMIZE=1
- -DRABIT_STRICT_CXX98_
)
target_include_directories(
@@ -37,7 +36,6 @@ target_include_directories(
${LIBR_INCLUDE_DIRS}
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/dmlc-core/include
- ${PROJECT_SOURCE_DIR}/rabit/include
)
target_link_libraries(xgboost-r PUBLIC ${LIBR_CORE_LIBRARY})
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index b4072aff0b41..82d7011de3a4 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
-Version: 2.1.0.0
-Date: 2023-08-19
+Version: 2.2.0.0
+Date: 2024-06-03
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 7b6a20f704dd..69f358751dc8 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -27,8 +27,7 @@ NVL <- function(x, val) {
}
.RANKING_OBJECTIVES <- function() {
- return(c('binary:logistic', 'binary:logitraw', 'binary:hinge', 'multi:softmax',
- 'multi:softprob'))
+ return(c('rank:pairwise', 'rank:ndcg', 'rank:map'))
}
@@ -213,7 +212,7 @@ xgb.iter.eval <- function(bst, evals, iter, feval) {
res <- sapply(seq_along(evals), function(j) {
w <- evals[[j]]
## predict using all trees
- preds <- predict(bst, w, outputmargin = TRUE, iterationrange = "all")
+ preds <- predict(bst, w, outputmargin = TRUE, reshape = TRUE, iterationrange = "all")
eval_res <- feval(preds, w)
out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric)
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 77d75fa9c2a5..77b33f16db44 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -249,7 +249,7 @@ xgb.get.handle <- function(object) {
#' summary(rowSums(pred_contr) - qlogis(pred))
#' # for the 1st record, let's inspect its features that had non-zero contribution to prediction:
#' contr1 <- pred_contr[1,]
-#' contr1 <- contr1[-length(contr1)] # drop BIAS
+#' contr1 <- contr1[-length(contr1)] # drop intercept
#' contr1 <- contr1[contr1 != 0] # drop non-contributing features
#' contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
#' old_mar <- par("mar")
@@ -473,7 +473,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
.Call(XGSetArrayDimInplace_R, arr, rev(shape))
- cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+ cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "(Intercept)") else NULL
n_groups <- shape[2]
## Needed regardless of whether strict shape is being used.
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 4cea088e0e45..0aa3cdcf1df0 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -122,11 +122,23 @@
#' printed out during the training.
#' E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track
#' the performance of each round's model on mat1 and mat2.
-#' @param obj customized objective function. Returns gradient and second order
-#' gradient with given prediction and dtrain.
-#' @param feval customized evaluation function. Returns
-#' \code{list(metric='metric-name', value='metric-value')} with given
-#' prediction and dtrain.
+#' @param obj customized objective function. Should take two arguments: the first one will be the
+#' current predictions (either a numeric vector or matrix depending on the number of targets / classes),
+#' and the second one will be the `data` DMatrix object that is used for training.
+#'
+#' It should return a list with two elements `grad` and `hess` (in that order), as either
+#' numeric vectors or numeric matrices depending on the number of targets / classes (same
+#' dimension as the predictions that are passed as first argument).
+#' @param feval customized evaluation function. Just like `obj`, should take two arguments, with
+#' the first one being the predictions and the second one the `data` DMatrix.
+#'
+#' Should return a list with two elements `metric` (name that will be displayed for this metric,
+#' should be a string / character), and `value` (the number that the function calculates, should
+#' be a numeric scalar).
+#'
+#' Note that even if passing `feval`, objectives also have an associated default metric that
+#' will be evaluated in addition to it. In order to disable the built-in metric, one can pass
+#' parameter `disable_default_eval_metric = TRUE`.
#' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance.
#' If 2, some additional information will be printed out.
#' Note that setting \code{verbose > 0} automatically engages the
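The roxygen text above documents the R contract for `obj` and `feval`. As a cross-reference, a minimal Python sketch of the same contract in the Python package, where the objective returns a `(grad, hess)` tuple and the metric a `(name, value)` pair; the squared-log-error math follows the official custom-objective tutorial, and the toy data is made up:

```python
import numpy as np
import xgboost as xgb

def squared_log(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Objective: gradient and Hessian of squared log error, same shape as predt."""
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    grad = (np.log1p(predt) - np.log1p(y)) / (predt + 1)
    hess = (-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2)
    return grad, hess

def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Metric: returns a (name, value) pair, the analogue of R's list(metric=, value=)."""
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    return "my-rmsle", float(np.sqrt(np.mean(np.power(np.log1p(predt) - np.log1p(y), 2))))

rng = np.random.default_rng(0)
X = rng.uniform(size=(256, 16))
y = X.sum(axis=1)
dtrain = xgb.DMatrix(X, label=y)
booster = xgb.train(
    # disable_default_eval_metric turns off the objective's built-in metric.
    {"tree_method": "hist", "disable_default_eval_metric": 1},
    dtrain,
    num_boost_round=4,
    obj=squared_log,
    custom_metric=rmsle,
    evals=[(dtrain, "train")],
)
```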
diff --git a/R-package/configure b/R-package/configure
index 3bbfa71503fb..395ea9ee5d5d 100755
--- a/R-package/configure
+++ b/R-package/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for xgboost 2.1.0.
+# Generated by GNU Autoconf 2.71 for xgboost 2.2.0.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost'
-PACKAGE_VERSION='2.1.0'
-PACKAGE_STRING='xgboost 2.1.0'
+PACKAGE_VERSION='2.2.0'
+PACKAGE_STRING='xgboost 2.2.0'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures xgboost 2.1.0 to adapt to many kinds of systems.
+\`configure' configures xgboost 2.2.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of xgboost 2.1.0:";;
+ short | recursive ) echo "Configuration of xgboost 2.2.0:";;
esac
cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-xgboost configure 2.1.0
+xgboost configure 2.2.0
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by xgboost $as_me 2.1.0, which was
+It was created by xgboost $as_me 2.2.0, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by xgboost $as_me 2.1.0, which was
+This file was extended by xgboost $as_me 2.2.0, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
-xgboost config.status 2.1.0
+xgboost config.status 2.2.0
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"
diff --git a/R-package/configure.ac b/R-package/configure.ac
index 89f8635fe315..ee9ce823a0c1 100644
--- a/R-package/configure.ac
+++ b/R-package/configure.ac
@@ -2,7 +2,7 @@
AC_PREREQ(2.69)
-AC_INIT([xgboost],[2.1.0],[],[xgboost],[])
+AC_INIT([xgboost],[2.2.0],[],[xgboost],[])
: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 88a2f203efcd..9c2e434d0625 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -211,7 +211,7 @@ str(pred_contr)
summary(rowSums(pred_contr) - qlogis(pred))
# for the 1st record, let's inspect its features that had non-zero contribution to prediction:
contr1 <- pred_contr[1,]
-contr1 <- contr1[-length(contr1)] # drop BIAS
+contr1 <- contr1[-length(contr1)] # drop intercept
contr1 <- contr1[contr1 != 0] # drop non-contributing features
contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
old_mar <- par("mar")
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 21c8dbe16413..937020e0dd38 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -167,12 +167,26 @@ printed out during the training.
E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track
the performance of each round's model on mat1 and mat2.}
-\item{obj}{customized objective function. Returns gradient and second order
-gradient with given prediction and dtrain.}
+\item{obj}{customized objective function. Should take two arguments: the first one will be the
+current predictions (either a numeric vector or matrix depending on the number of targets / classes),
+and the second one will be the \code{data} DMatrix object that is used for training.
-\item{feval}{customized evaluation function. Returns
-\code{list(metric='metric-name', value='metric-value')} with given
-prediction and dtrain.}
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ It should return a list with two elements `grad` and `hess` (in that order), as either
+ numeric vectors or numeric matrices depending on the number of targets / classes (same
+ dimension as the predictions that are passed as first argument).
+}\if{html}{\out{</div>}}}
+
+\item{feval}{customized evaluation function. Just like \code{obj}, should take two arguments, with
+the first one being the predictions and the second one the \code{data} DMatrix.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ Should return a list with two elements `metric` (name that will be displayed for this metric,
+ should be a string / character), and `value` (the number that the function calculates, should
+ be a numeric scalar).
+
+ Note that even if passing `feval`, objectives also have an associated default metric that
+ will be evaluated in addition to it. In order to disable the built-in metric, one can pass
+ parameter `disable_default_eval_metric = TRUE`.
+}\if{html}{\out{</div>}}}
\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance.
If 2, some additional information will be printed out.
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 93cfb8e5b4c1..0cabffcad3c8 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -21,7 +21,6 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))
PKG_CPPFLAGS = \
-I$(PKGROOT)/include \
-I$(PKGROOT)/dmlc-core/include \
- -I$(PKGROOT)/rabit/include \
-I$(PKGROOT) \
$(XGB_RFLAGS)
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index f160930e8a4a..c49006c5e0a6 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -21,7 +21,6 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))
PKG_CPPFLAGS = \
-I$(PKGROOT)/include \
-I$(PKGROOT)/dmlc-core/include \
- -I$(PKGROOT)/rabit/include \
-I$(PKGROOT) \
$(XGB_RFLAGS)
diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R
index d3050b152aa0..cf3a347d4d9d 100644
--- a/R-package/tests/testthat/test_custom_objective.R
+++ b/R-package/tests/testthat/test_custom_objective.R
@@ -147,3 +147,34 @@ test_that("custom objective with multi-class works", {
expect_equal(custom_predt, builtin_predt)
})
+
+test_that("custom metric with multi-target passes reshaped data to feval", {
+ x <- as.matrix(iris[, -5])
+ y <- as.numeric(iris$Species) - 1
+ dtrain <- xgb.DMatrix(data = x, label = y)
+
+ multinomial.ll <- function(predt, dtrain) {
+ expect_equal(dim(predt), c(nrow(iris), 3L))
+ y <- getinfo(dtrain, "label")
+ probs <- apply(predt, 1, softmax) |> t()
+ probs.y <- probs[cbind(seq(1L, nrow(predt)), y + 1L)]
+ ll <- sum(log(probs.y))
+ return(list(metric = "multinomial-ll", value = -ll))
+ }
+
+ model <- xgb.train(
+ params = list(
+ objective = "multi:softmax",
+ num_class = 3L,
+ base_score = 0,
+ disable_default_eval_metric = TRUE,
+ max_depth = 123,
+ seed = 123
+ ),
+ data = dtrain,
+ nrounds = 2L,
+ evals = list(Train = dtrain),
+ eval_metric = multinomial.ll,
+ verbose = 0
+ )
+})
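A rough Python counterpart of the new R test, assuming the Python-side convention that `custom_metric` receives transformed predictions, which for `multi:softprob` is a probability matrix shaped `(n_samples, n_classes)`; the toy data is made up:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.uniform(size=(150, 4))
y = rng.integers(0, 3, size=150).astype(np.float64)
dtrain = xgb.DMatrix(X, label=y)

def multinomial_ll(predt: np.ndarray, dtrain: xgb.DMatrix):
    # One probability column per class, mirroring the reshape checked in the R test.
    assert predt.shape == (150, 3)
    labels = dtrain.get_label().astype(int)
    ll = np.log(predt[np.arange(predt.shape[0]), labels]).sum()
    return "multinomial-ll", float(-ll)

xgb.train(
    {"objective": "multi:softprob", "num_class": 3, "disable_default_eval_metric": 1},
    dtrain,
    num_boost_round=2,
    evals=[(dtrain, "Train")],
    custom_metric=multinomial_ll,
)
```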
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 38b5ca0667bf..c619bc50b5da 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -101,7 +101,7 @@ test_that("predict feature contributions works", {
# gbtree binary classifier
expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE), regexp = NA)
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
- expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+ expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "(Intercept)"))
pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
# must work with data that has no column names
@@ -114,14 +114,14 @@ test_that("predict feature contributions works", {
# gbtree binary classifier (approximate method)
expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
- expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+ expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "(Intercept)"))
pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
# gblinear binary classifier
expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
- expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+ expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "(Intercept)"))
pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
# manual calculation of linear terms
@@ -137,7 +137,7 @@ test_that("predict feature contributions works", {
expect_is(pred_contr, "list")
expect_length(pred_contr, 3)
for (g in seq_along(pred_contr)) {
- expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
+ expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "(Intercept)"))
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 1e-5)
}
@@ -151,7 +151,7 @@ test_that("predict feature contributions works", {
byrow = TRUE
)
for (g in seq_along(pred_contr)) {
- expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
+ expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "(Intercept)"))
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance)
# manual calculation of linear terms
coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
diff --git a/R-package/tests/testthat/test_interactions.R b/R-package/tests/testthat/test_interactions.R
index 645efc12a14c..60cf9d80039a 100644
--- a/R-package/tests/testthat/test_interactions.R
+++ b/R-package/tests/testthat/test_interactions.R
@@ -48,7 +48,7 @@ test_that("predict feature interactions works", {
intr <- predict(b, dm, predinteraction = TRUE)
expect_equal(dim(intr), c(N, P + 1, P + 1))
# check assigned colnames
- cn <- c(letters[1:P], "BIAS")
+ cn <- c(letters[1:P], "(Intercept)")
expect_equal(dimnames(intr), list(NULL, cn, cn))
# check the symmetry
@@ -60,7 +60,7 @@ test_that("predict feature interactions works", {
# diagonal terms for features 3,4,5 must be close to zero
expect_lt(Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))), 0.05)
- # BIAS must have no interactions
+ # Intercept must have no interactions
expect_lt(max(abs(intr[, 1:P, P + 1])), 0.00001)
# interactions other than 2 x 3 must be close to zero
diff --git a/README.md b/README.md
index 234bd7dba76e..220e94637fe1 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-
eXtreme Gradient Boosting
+
eXtreme Gradient Boosting
===========
[![Build Status](https://badge.buildkite.com/aca47f40a32735c00a8550540c5eeff6a4c1d246a580cae9b0.svg?branch=master)](https://buildkite.com/xgboost/xgboost-ci)
@@ -11,6 +11,7 @@
[![Optuna](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org)
[![Twitter](https://img.shields.io/badge/@XGBoostProject--_.svg?style=social&logo=twitter)](https://twitter.com/XGBoostProject)
[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/dmlc/xgboost/badge)](https://api.securityscorecards.dev/projects/github.com/dmlc/xgboost)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/comet-ml/comet-examples/blob/master/integrations/model-training/xgboost/notebooks/how_to_use_comet_with_xgboost_tutorial.ipynb)
[Community](https://xgboost.ai/community) |
[Documentation](https://xgboost.readthedocs.org) |
@@ -49,6 +50,7 @@ Become a sponsor and get a logo here. See details at [Sponsoring the XGBoost Pro
+
### Backers
[[Become a backer](https://opencollective.com/xgboost#backer)]
diff --git a/demo/c-api/basic/Makefile b/demo/c-api/basic/Makefile
index 345079fa9a75..dceb9bc73a11 100644
--- a/demo/c-api/basic/Makefile
+++ b/demo/c-api/basic/Makefile
@@ -4,7 +4,7 @@ TGT=c-api-demo
cc=cc
CFLAGS ?=-O3
XGBOOST_ROOT ?=../..
-INCLUDE_DIR=-I$(XGBOOST_ROOT)/include -I$(XGBOOST_ROOT)/dmlc-core/include -I$(XGBOOST_ROOT)/rabit/include
+INCLUDE_DIR=-I$(XGBOOST_ROOT)/include -I$(XGBOOST_ROOT)/dmlc-core/include
LIB_DIR=-L$(XGBOOST_ROOT)/lib
build: $(TGT)
diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py
index f53835ffbee9..d964d78e20aa 100644
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -3,7 +3,7 @@
====================================
"""
-import cupy as cp
+import dask
import dask_cudf
from dask import array as da
from dask import dataframe as dd
@@ -24,12 +24,8 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
# history obtained from evaluation metrics.
output = dxgb.train(
client,
- {
- "verbosity": 2,
- "tree_method": "hist",
- # Golden line for GPU training
- "device": "cuda",
- },
+ # Make sure the device is set to CUDA.
+ {"tree_method": "hist", "device": "cuda"},
dtrain,
num_boost_round=4,
evals=[(dtrain, "train")],
@@ -50,18 +46,17 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
.. versionadded:: 1.2.0
"""
- X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
- y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))
-
# `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful that it can not
# be used for anything else other than training unless a reference is specified. See
# the `ref` argument of `DaskQuantileDMatrix`.
dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
output = dxgb.train(
client,
- {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
+ # Make sure the device is set to CUDA.
+ {"tree_method": "hist", "device": "cuda"},
dtrain,
num_boost_round=4,
+ evals=[(dtrain, "train")],
)
prediction = dxgb.predict(client, output, X)
@@ -72,15 +67,23 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
# `LocalCUDACluster` is used for assigning GPU to XGBoost processes. Here
# `n_workers` represents the number of GPUs since we use one GPU per worker process.
with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
- with Client(cluster) as client:
- # generate some random data for demonstration
+ # Create client from cluster, set the backend to GPU array (cupy).
+ with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
+ # Generate some random data for demonstration
rng = da.random.default_rng(1)
- m = 100000
+ m = 2**18
n = 100
- X = rng.normal(size=(m, n))
+ X = rng.uniform(size=(m, n), chunks=(128**2, -1))
y = X.sum(axis=1)
+ X = dd.from_dask_array(X)
+ y = dd.from_dask_array(y)
+ # XGBoost can take arrays. This is to show that DataFrame uses the GPU
+ # backend as well.
+ assert isinstance(X, dask_cudf.DataFrame)
+ assert isinstance(y, dask_cudf.Series)
+
print("Using DaskQuantileDMatrix")
from_ddqdm = using_quantile_device_dmatrix(client, X, y)
print("Using DMatrix")
diff --git a/demo/dask/sklearn_gpu_training.py b/demo/dask/sklearn_gpu_training.py
index 6161bf9a3402..56f1be7151c4 100644
--- a/demo/dask/sklearn_gpu_training.py
+++ b/demo/dask/sklearn_gpu_training.py
@@ -3,6 +3,7 @@
===================================================================
"""
+import dask
from dask import array as da
from dask.distributed import Client
@@ -13,17 +14,18 @@
def main(client: Client) -> dxgb.Booster:
- # generate some random data for demonstration
+ # Generate some random data for demonstration
+ rng = da.random.default_rng(1)
+
+ m = 2**18
n = 100
- m = 1000000
- partition_size = 10000
- X = da.random.random((m, n), partition_size)
- y = da.random.random(m, partition_size)
+ X = rng.uniform(size=(m, n), chunks=(128**2, -1))
+ y = X.sum(axis=1)
regressor = dxgb.DaskXGBRegressor(verbosity=1)
- # set the device to CUDA
+ # Set the device to CUDA
regressor.set_params(tree_method="hist", device="cuda")
- # assigning client here is optional
+ # Assigning client here is optional
regressor.client = client
regressor.fit(X, y, eval_set=[(X, y)])
@@ -42,5 +44,6 @@ def main(client: Client) -> dxgb.Booster:
# With dask cuda, one can scale up XGBoost to arbitrary GPU clusters.
# `LocalCUDACluster` used here is only for demonstration purpose.
with LocalCUDACluster() as cluster:
- with Client(cluster) as client:
+ # Create client from cluster, set the backend to GPU array (cupy).
+ with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
main(client)
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index d5f28f6fc0ca..9f27d35738c6 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -230,7 +230,6 @@ def release_note(
) -> None:
"""Generate a note for GitHub release description."""
r_gpu_linux_url = r_urls["linux"]
- r_gpu_win64_url = r_urls["win64"]
src_tarball = (
f"https://github.com/dmlc/xgboost/releases/download/v{release}/{tarname}"
)
@@ -251,7 +250,6 @@ def release_note(
**Experimental binary packages for R with CUDA enabled**
* xgboost_r_gpu_linux_{release}.tar.gz: [Download]({r_gpu_linux_url})
-* xgboost_r_gpu_win64_{release}.tar.gz: [Download]({r_gpu_win64_url})
**Source tarball**
* xgboost.tar.gz: [Download]({src_tarball})"""
@@ -297,6 +295,8 @@ def main(args: argparse.Namespace) -> None:
commit_hash = latest_hash()
outdir = os.path.abspath(args.outdir)
+ if outdir.find(str(ROOT)) != -1:
+ raise ValueError("output dir must be outside of the source tree.")
if not os.path.exists(outdir):
os.mkdir(outdir)
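The new guard uses a substring test, which can misfire on sibling directories whose names share a prefix with the source tree (e.g. `/tmp/xgboost-out` vs a `ROOT` of `/tmp/xgboost`). A stricter sketch, assuming Python >= 3.9 for `Path.is_relative_to`; not part of this patch:

```python
from pathlib import Path

def ensure_outside_source_tree(outdir: str, root: str) -> None:
    # Resolve both paths, then test true containment rather than substring match.
    if Path(outdir).resolve().is_relative_to(Path(root).resolve()):
        raise ValueError("output dir must be outside of the source tree.")
```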
diff --git a/doc/build.rst b/doc/build.rst
index cba75ff57d2b..fda64820ad04 100644
--- a/doc/build.rst
+++ b/doc/build.rst
@@ -134,11 +134,11 @@ From the command line on Linux starting from the XGBoost directory:
.. note:: Specifying compute capability
- To speed up compilation, the compute version specific to your GPU could be passed to cmake as, e.g., ``-DGPU_COMPUTE_VER=50``. A quick explanation and numbers for some architectures can be found `in this page <https://developer.nvidia.com/cuda-gpus>`_.
+ To speed up compilation, the compute version specific to your GPU could be passed to cmake as, e.g., ``-DCMAKE_CUDA_ARCHITECTURES=75``. A quick explanation and numbers for some architectures can be found `in this page <https://developer.nvidia.com/cuda-gpus>`_.
.. note:: Faster distributed GPU training with NCCL
- By default, distributed GPU training is enabled and uses Rabit for communication. For faster training, set the option ``USE_NCCL=ON``. Faster distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **faster distributed GPU training is available only for Linux**.
+ By default, distributed GPU training is enabled with the option ``USE_NCCL=ON``. Distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **distributed GPU training is available only for Linux**.
.. code-block:: bash
@@ -147,6 +147,8 @@ From the command line on Linux starting from the XGBoost directory:
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DNCCL_ROOT=/path/to/nccl2
make -j4
+Some additional flags are available for NCCL: ``BUILD_WITH_SHARED_NCCL`` enables building XGBoost with NCCL as a shared library, while ``USE_DLOPEN_NCCL`` enables XGBoost to load NCCL at runtime using ``dlopen``.
+
On Windows, run CMake as follows:
.. code-block:: bash
@@ -165,6 +167,17 @@ The above cmake configuration run will create an ``xgboost.sln`` solution file i
To speed up compilation, run multiple jobs in parallel by appending option ``-- /MP``.
+Federated Learning
+==================
+
+The federated learning plugin requires ``grpc`` and ``protobuf``. To install grpc, refer
+to the `installation guide from the gRPC website
+`_. Alternatively, one can use the
+``libgrpc`` and ``protobuf`` packages from conda-forge if conda is available. After
+obtaining the required dependencies, enable the flag ``-DPLUGIN_FEDERATED=ON`` when running
+CMake. Please note that only Linux is supported for the federated plugin.
+
+
.. _build_python:
***********************************
@@ -228,11 +241,12 @@ There are several ways to build and install the package from source:
3. Editable installation
- To further enable rapid development and iteration, we provide an **editable installation**.
- In an editable installation, the installed package is simply a symbolic link to your
- working copy of the XGBoost source code. So every changes you make to your source
- directory will be immediately visible to the Python interpreter. Here is how to
- install XGBoost as editable installation:
+ To further enable rapid development and iteration, we provide an **editable
+ installation**. In an editable installation, the installed package is simply a symbolic
+ link to your working copy of the XGBoost source code. So every change you make to your
+ source directory will be immediately visible to the Python interpreter. To install
+ XGBoost as an editable installation, first build the shared library as previously
+ described, then install the Python package:
.. code-block:: bash
diff --git a/doc/conf.py b/doc/conf.py
index ec58c5a5d456..0a90fa297bef 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -21,8 +21,6 @@
import warnings
from urllib.error import HTTPError
-from sh.contrib import git
-
CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
TMP_DIR = os.path.join(CURR_PATH, "tmp")
@@ -61,6 +59,49 @@ def run_doxygen():
os.chdir(curdir)
+def build_jvm_docs():
+ """Build docs for the JVM packages"""
+ git_branch = os.getenv("READTHEDOCS_VERSION_NAME", default=None)
+ print(f"READTHEDOCS_VERSION_NAME = {git_branch}")
+
+ if not git_branch:
+ git_branch = "master"
+ elif git_branch == "latest":
+ git_branch = "master"
+ elif git_branch == "stable":
+ git_branch = f"release_{version}"
+ print(f"git_branch = {git_branch}")
+
+ def try_fetch_jvm_doc(branch):
+ """
+ Attempt to fetch JVM docs for a given branch.
+ Returns True if successful
+ """
+ try:
+ url = f"https://s3-us-west-2.amazonaws.com/xgboost-docs/{branch}.tar.bz2"
+ filename, _ = urllib.request.urlretrieve(url)
+ if not os.path.exists(TMP_DIR):
+ print(f"Create directory {TMP_DIR}")
+ os.mkdir(TMP_DIR)
+ jvm_doc_dir = os.path.join(TMP_DIR, "jvm_docs")
+ if os.path.exists(jvm_doc_dir):
+ print(f"Delete directory {jvm_doc_dir}")
+ shutil.rmtree(jvm_doc_dir)
+ print(f"Create directory {jvm_doc_dir}")
+ os.mkdir(jvm_doc_dir)
+
+ with tarfile.open(filename, "r:bz2") as t:
+ t.extractall(jvm_doc_dir)
+ return True
+ except HTTPError:
+ print(f"JVM doc not found at {url}. Skipping...")
+ return False
+
+ if not try_fetch_jvm_doc(git_branch):
+ print(f"Falling back to the master branch...")
+ try_fetch_jvm_doc("master")
+
+
def is_readthedocs_build():
if os.environ.get("READTHEDOCS", None) == "True":
return True
@@ -75,40 +116,9 @@ def is_readthedocs_build():
if is_readthedocs_build():
run_doxygen()
+ build_jvm_docs()
-git_branch = os.getenv("SPHINX_GIT_BRANCH", default=None)
-if not git_branch:
- # If SPHINX_GIT_BRANCH environment variable is not given, run git
- # to determine branch name
- git_branch = [
- re.sub(r"origin/", "", x.lstrip(" "))
- for x in str(git.branch("-r", "--contains", "HEAD")).rstrip("\n").split("\n")
- ]
- git_branch = [x for x in git_branch if "HEAD" not in x]
-else:
- git_branch = [git_branch]
-print("git_branch = {}".format(git_branch[0]))
-
-try:
- filename, _ = urllib.request.urlretrieve(
- f"https://s3-us-west-2.amazonaws.com/xgboost-docs/{git_branch[0]}.tar.bz2"
- )
- if not os.path.exists(TMP_DIR):
- print(f"Create directory {TMP_DIR}")
- os.mkdir(TMP_DIR)
- jvm_doc_dir = os.path.join(TMP_DIR, "jvm")
- if os.path.exists(jvm_doc_dir):
- print(f"Delete directory {jvm_doc_dir}")
- shutil.rmtree(jvm_doc_dir)
- print(f"Create directory {jvm_doc_dir}")
- os.mkdir(jvm_doc_dir)
-
- with tarfile.open(filename, "r:bz2") as t:
- t.extractall(jvm_doc_dir)
-except HTTPError:
- print("JVM doc not found. Skipping...")
-
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -152,7 +162,7 @@ def is_readthedocs_build():
"../demo/dask",
"../demo/aft_survival",
"../demo/gpu_acceleration",
- "../demo/rmm_plugin"
+ "../demo/rmm_plugin",
],
# path to where to save gallery generated output
"gallery_dirs": [
@@ -250,7 +260,7 @@ def is_readthedocs_build():
html_theme_options = {"logo_only": True}
-html_logo = "https://xgboost.ai/images/logo/xgboost-logo-ng.png"
+html_logo = "https://xgboost.ai/images/logo/xgboost-logo.png"
html_css_files = ["css/custom.css"]
diff --git a/doc/contrib/featuremap.rst b/doc/contrib/featuremap.rst
new file mode 100644
index 000000000000..66b87129e774
--- /dev/null
+++ b/doc/contrib/featuremap.rst
@@ -0,0 +1,69 @@
+############################
+XGBoost Internal Feature Map
+############################
+
+The following is a reference to the features supported by XGBoost. It is not a beginner's guide, but rather a list meant to help those looking to add new features to XGBoost understand what needs to be covered.
+
+*************
+Core Features
+*************
+Core features do not depend on any particular language binding, and each language binding can choose to support them.
+
+-------------
+Data Storage
+-------------
+The primary data structure in XGBoost for storing user inputs is ``DMatrix``; it's a container for all data that XGBoost can use. ``QuantileDMatrix`` is a variant specifically designed for the ``hist`` tree method. Both can take GPU-based inputs. They take an optional parameter ``missing`` to specify which input value should be ignored. For external memory support, please refer to :doc:`/tutorials/external_memory`.
+
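A minimal Python sketch of the two containers, for illustration only (toy data, not part of this document):

```python
import numpy as np
import xgboost as xgb

X = np.array([[1.0, -999.0], [2.0, 3.0]])
y = np.array([0.0, 1.0])
# Values equal to `missing` are treated as absent.
dmat = xgb.DMatrix(X, label=y, missing=-999.0)           # general-purpose container
qdmat = xgb.QuantileDMatrix(X, label=y, missing=-999.0)  # for the hist tree method
```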
+---------------------
+Single Node Training
+---------------------
+There are two different model types in XGBoost: the tree model, which we primarily focus on, and the linear model. For the tree model, we have various methods to build decision trees; please see :doc:`/treemethod` for a complete reference. In addition to the tree method, we have many hyper-parameters for tuning the model and injecting prior knowledge into the training process. Two noteworthy examples are :doc:`monotonic constraints </tutorials/monotonic>` and :doc:`feature interaction constraints </tutorials/feature_interaction_constraint>`. These two constraints require special treatment during tree construction. Both the ``hist`` and the ``approx`` tree methods support GPU acceleration. Also, XGBoost on GPU supports gradient-based sampling, which works with external-memory data as well.
+
+The objective function plays an important role in training. It not only provides the gradient, but is also responsible for estimating a good starting point for Newton optimization. Please note that users can define custom objective functions for the task at hand.
+In addition to numerical features, XGBoost also supports categorical features with two different algorithms: one-hot encoding and optimal partitioning. For more information, refer to the :doc:`categorical feature tutorial </tutorials/categorical>`. Both the ``hist`` and the ``approx`` tree methods support categorical features on CPU and GPU.
+
+There is work-in-progress support for vector leaves, which are decision tree leaves that contain multiple values. This type of tree is used to support efficient multi-class and multi-target models.
+
+----------
+Inference
+----------
+By inference, we specifically mean obtaining model predictions for the response variable. XGBoost supports two inference methods. The first one is prediction on the ``DMatrix`` object (or ``QuantileDMatrix``, which is a subclass). Using a ``DMatrix`` object allows XGBoost to cache the prediction, which yields faster performance when running prediction on the same data with new trees. The second method is ``inplace_predict``, which bypasses the construction of ``DMatrix``. It's more efficient but doesn't support cached prediction. In addition to returning the estimated response, we also support returning the leaf index, which can be used to analyze the model and as a feature for another model.
+
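A sketch of the two prediction paths just described, reusing `X` and `dmat` from the previous snippet; illustrative only:

```python
booster = xgb.train({"tree_method": "hist"}, dmat, num_boost_round=4)
pred_cached = booster.predict(dmat)                  # DMatrix path, results can be cached
pred_inplace = booster.inplace_predict(X)            # bypasses DMatrix construction
leaf_index = booster.predict(dmat, pred_leaf=True)   # leaf indices for model analysis
```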
+----------
+Model IO
+----------
+We have a set of interfaces for model serialization, including complete serialization, saving to a file, and saving to a buffer. For more, refer to :doc:`/tutorials/saving_model`.
+
+-------------------
+Model Explanation
+-------------------
+XGBoost includes features designed to improve understanding of the model. Here's a list:
+
+- Global feature importance.
+- SHAP value, including contribution and intervention.
+- Tree dump.
+- Tree visualization.
+- Tree as dataframe.
+
+For GPU support, the SHAP value uses the `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ project from RAPIDS. All of the above support categorical features, while vector-leaf support is still in progress.
+
+----------
+Evaluation
+----------
+XGBoost has built-in support for a wide range of metrics, from basic regression to learning to rank and survival modeling. All of them work with distributed training and GPU-based acceleration. Custom metrics are supported as well; please see :doc:`/tutorials/custom_metric_obj`.
+
+--------------------
+Distributed Training
+--------------------
+XGBoost has built-in support for three distributed frameworks: ``Dask``, ``PySpark``, and ``Spark (Scala)``. In addition, there's ``flink`` support for the Java binding and the ``ray-xgboost`` project. Please see the respective tutorials on how to use them. By default, XGBoost uses sample-based parallelism for distributed training. The column-based split is still a work in progress and needs to be supported in these high-level framework integrations. On top of distributed training, we are also working on federated learning for both sample-based and column-based splits.
+
+Distributed training works with custom objective functions and metrics as well. XGBoost aggregates the evaluation result automatically during training.
+
+The distributed training is enabled by a built-in implementation of a collective library. It's based on the RABIT project and has evolved significantly since its early adoption. The collective implementation supports GPU via NCCL, and has variants for federated learning on both CPU and GPU.
+
+Inference normally doesn't require any special treatment since we are using sample-based split. However, with column-based data split, we need to initialize the communicator context as well.
+
+*****************
+Language Bindings
+*****************
+We have a list of bindings for various languages. Inside the XGBoost repository, there are Python, R, Java, Scala, and C bindings. All language bindings are built on top of the C version. Some others, like Julia and Rust, have their own repositories. For guidelines on adding a new binding, please see :doc:`/contrib/consistency`.
\ No newline at end of file
diff --git a/doc/contrib/index.rst b/doc/contrib/index.rst
index feac865fbe34..75bd37094e89 100644
--- a/doc/contrib/index.rst
+++ b/doc/contrib/index.rst
@@ -27,6 +27,7 @@ Here are guidelines for contributing to various aspect of the XGBoost project:
python_packaging
unit_tests
Docs and Examples
+ featuremap
git_guide
release
ci
diff --git a/doc/faq.rst b/doc/faq.rst
index 4fe63076c18b..cdfb8bc2cb3c 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -37,7 +37,7 @@ The ultimate question will still come back to how to push the limit of each comp
and use less resources to complete the task (thus with less communication and chance of failure).
To achieve these, we decide to reuse the optimizations in the single node XGBoost and build the distributed version on top of it.
-The demand for communication in machine learning is rather simple, in the sense that we can depend on a limited set of APIs (in our case rabit).
+The demand for communication in machine learning is rather simple, in the sense that we can depend on a limited set of APIs.
Such design allows us to reuse most of the code, while being portable to major platforms such as Hadoop/Yarn, MPI, SGE.
Most importantly, it pushes the limit of the computation resources we can use.
diff --git a/doc/jvm/api.rst b/doc/jvm/api.rst
new file mode 100644
index 000000000000..b9e7821aa6fa
--- /dev/null
+++ b/doc/jvm/api.rst
@@ -0,0 +1,8 @@
+#############################
+API Docs for the JVM packages
+#############################
+
+* `XGBoost4J Java API <../jvm_docs/javadocs/index.html>`_
+* `XGBoost4J Scala API <../jvm_docs/scaladocs/xgboost4j/index.html>`_
+* `XGBoost4J-Spark Scala API <../jvm_docs/scaladocs/xgboost4j-spark/index.html>`_
+* `XGBoost4J-Flink Scala API <../jvm_docs/scaladocs/xgboost4j-flink/index.html>`_
diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst
index a92834d747e0..0a2e947ea586 100644
--- a/doc/jvm/index.rst
+++ b/doc/jvm/index.rst
@@ -37,10 +37,7 @@ Contents
XGBoost4J-Spark Tutorial
XGBoost4J-Spark-GPU Tutorial
Code Examples
- XGBoost4J Java API
- XGBoost4J Scala API
- XGBoost4J-Spark Scala API
- XGBoost4J-Flink Scala API
+ API docs
.. note::
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 4b60fe01a546..85897412f9a6 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -114,7 +114,7 @@ XGB_DLL int XGBGetGlobalConfig(char const **out_config);
/**
* @defgroup DMatrix DMatrix
*
- * @brief DMatrix is the baisc data storage for XGBoost used by all XGBoost algorithms
+ * @brief DMatrix is the basic data storage for XGBoost used by all XGBoost algorithms
* including both training, prediction and explanation. There are a few variants of
* `DMatrix` including normal `DMatrix`, which is a CSR matrix, `QuantileDMatrix`,
* which is used by histogram-based tree methods for saving memory, and lastly the
@@ -1265,13 +1265,11 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
* \param handle handle
* \param config JSON encoded string storing parameters for the function. Following
* keys are expected in the JSON document:
- *
- * "format": str
- * - json: Output booster will be encoded as JSON.
- * - ubj: Output booster will be encoded as Univeral binary JSON.
- * - deprecated: Output booster will be encoded as old custom binary format. Do not use
- * this format except for compatibility reasons.
- *
+ * - "format": str
+ * - json: Output booster will be encoded as JSON.
+ * - ubj: Output booster will be encoded as Universal binary JSON.
+ * - deprecated: Output booster will be encoded as the old custom binary format. Do not use
+ * this format except for compatibility reasons.
* \param out_len The argument to hold the output length
* \param out_dptr The argument to hold the output data pointer
*
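
On the JVM the same "format" switch surfaces when serializing a booster. A hedged sketch, assuming the Booster#toByteArray(String) overload available in recent xgboost4j releases:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;

    import ml.dmlc.xgboost4j.java.Booster;
    import ml.dmlc.xgboost4j.java.XGBoost;
    import ml.dmlc.xgboost4j.java.XGBoostError;

    // Round-trip a trained booster through the UBJSON encoding;
    // "json" and "deprecated" select the other two formats.
    static Booster roundTrip(Booster booster) throws XGBoostError, IOException {
      byte[] raw = booster.toByteArray("ubj");
      return XGBoost.loadModel(new ByteArrayInputStream(raw));
    }
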
diff --git a/rabit/include/rabit/internal/socket.h b/include/xgboost/collective/poll_utils.h
similarity index 97%
rename from rabit/include/rabit/internal/socket.h
rename to include/xgboost/collective/poll_utils.h
index 3701146d4577..514e0a5c6633 100644
--- a/rabit/include/rabit/internal/socket.h
+++ b/include/xgboost/collective/poll_utils.h
@@ -3,8 +3,7 @@
* \file socket.h
* \author Tianqi Chen
*/
-#ifndef RABIT_INTERNAL_SOCKET_H_
-#define RABIT_INTERNAL_SOCKET_H_
+#pragma once
#include "xgboost/collective/result.h"
#include "xgboost/collective/socket.h"
@@ -61,8 +60,8 @@ using sock_size_t = size_t; // NOLINT
#pragma message("Distributed training on mingw is not supported.")
typedef struct pollfd {
SOCKET fd;
- short events;
- short revents;
+ short events; // NOLINT
+ short revents; // NOLINT
} WSAPOLLFD, *PWSAPOLLFD, *LPWSAPOLLFD;
// POLLRDNORM | POLLRDBAND
@@ -97,7 +96,8 @@ std::enable_if_t, xgboost::collective::Result> PollError(E
if ((revents & POLLERR) != 0) {
auto err = errno;
auto str = strerror(err);
- return xgboost::system::FailWithCode(std::string{"Poll error condition:"} + std::string{str} +
+ return xgboost::system::FailWithCode(std::string{"Poll error condition:"} + // NOLINT
+ std::string{str} + // NOLINT
" code:" + std::to_string(err));
}
if ((revents & POLLNVAL) != 0) {
@@ -229,5 +229,3 @@ struct PollHelper {
#undef POLLPRI
#undef POLLOUT
#endif // IS_MINGW()
-
-#endif // RABIT_INTERNAL_SOCKET_H_
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 05e2cb0080f0..7ae1c4ebcc09 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -473,10 +473,7 @@ class BatchIterator {
return *(*impl_);
}
- bool operator!=(const BatchIterator&) const {
- CHECK(impl_ != nullptr);
- return !impl_->AtEnd();
- }
+ [[nodiscard]] bool operator!=(const BatchIterator&) const { return !this->AtEnd(); }
[[nodiscard]] bool AtEnd() const {
CHECK(impl_ != nullptr);
@@ -511,13 +508,13 @@ class DMatrix {
public:
/*! \brief default constructor */
DMatrix() = default;
- /*! \brief meta information of the dataset */
- virtual MetaInfo& Info() = 0;
+ /** @brief meta information of the dataset */
+ [[nodiscard]] virtual MetaInfo& Info() = 0;
virtual void SetInfo(const char* key, std::string const& interface_str) {
auto const& ctx = *this->Ctx();
this->Info().SetInfo(ctx, key, StringView{interface_str});
}
- /*! \brief meta information of the dataset */
+ /** @brief meta information of the dataset */
[[nodiscard]] virtual const MetaInfo& Info() const = 0;
/*! \brief Get thread local memory for returning data from DMatrix. */
diff --git a/include/xgboost/version_config.h b/include/xgboost/version_config.h
index 70e5417af779..b20753b03548 100644
--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@@ -5,7 +5,7 @@
#define XGBOOST_VERSION_CONFIG_H_
#define XGBOOST_VER_MAJOR 2 /* NOLINT */
-#define XGBOOST_VER_MINOR 1 /* NOLINT */
+#define XGBOOST_VER_MINOR 2 /* NOLINT */
#define XGBOOST_VER_PATCH 0 /* NOLINT */
#endif // XGBOOST_VERSION_CONFIG_H_
diff --git a/jvm-packages/CMakeLists.txt b/jvm-packages/CMakeLists.txt
index 36ed61a6b063..c6353d4b7400 100644
--- a/jvm-packages/CMakeLists.txt
+++ b/jvm-packages/CMakeLists.txt
@@ -21,7 +21,6 @@ target_include_directories(xgboost4j
${JNI_INCLUDE_DIRS}
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j/src/native
${PROJECT_SOURCE_DIR}/include
- ${PROJECT_SOURCE_DIR}/dmlc-core/include
- ${PROJECT_SOURCE_DIR}/rabit/include)
+ ${PROJECT_SOURCE_DIR}/dmlc-core/include)
set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 17afbe48d2cc..8b26af4f2190 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>2.1.0-SNAPSHOT</version>
+    <version>2.2.0-SNAPSHOT</version>
     <packaging>pom</packaging>
     <name>XGBoost JVM Package</name>
     <description>JVM Package for XGBoost</description>
@@ -35,16 +35,17 @@
         <maven.compiler.target>1.8</maven.compiler.target>
         <flink.version>1.19.0</flink.version>
         <junit.version>4.13.2</junit.version>
-        <spark.version>3.4.1</spark.version>
-        <spark.version.gpu>3.4.1</spark.version.gpu>
+        <spark.version>3.5.1</spark.version>
+        <spark.version.gpu>3.5.1</spark.version.gpu>
+        <fasterxml.jackson.version>2.15.2</fasterxml.jackson.version>
         <scala.version>2.12.18</scala.version>
         <scala.binary.version>2.12</scala.binary.version>
         <hadoop.version>3.4.0</hadoop.version>
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
         <log.capi.invocation>OFF</log.capi.invocation>
         <use.cuda>OFF</use.cuda>
-        <cudf.version>23.12.1</cudf.version>
-        <spark.rapids.version>23.12.1</spark.rapids.version>
+        <cudf.version>24.04.0</cudf.version>
+        <spark.rapids.version>24.04.1</spark.rapids.version>
         <cudf.classifier>cuda12</cudf.classifier>
         <scalatest.version>3.2.18</scalatest.version>
         <scala-collection-compat.version>2.12.0</scala-collection-compat.version>
@@ -179,7 +180,7 @@
                 <groupId>org.sonatype.plugins</groupId>
                 <artifactId>nexus-staging-maven-plugin</artifactId>
-                <version>1.6.13</version>
+                <version>1.7.0</version>
                 <extensions>true</extensions>
                 <serverId>ossrh</serverId>
@@ -410,7 +411,7 @@
                 <groupId>net.alchim31.maven</groupId>
                 <artifactId>scala-maven-plugin</artifactId>
-                <version>4.9.0</version>
+                <version>4.9.1</version>
                 <goal>compile</goal>
@@ -473,7 +474,7 @@
                 <groupId>net.alchim31.maven</groupId>
                 <artifactId>scala-maven-plugin</artifactId>
-                <version>4.9.0</version>
+                <version>4.9.1</version>
                 <jvmArg>-Xms64m</jvmArg>
@@ -489,11 +490,6 @@
             <artifactId>kryo</artifactId>
             <version>5.6.0</version>
-        <dependency>
-            <groupId>com.fasterxml.jackson.core</groupId>
-            <artifactId>jackson-databind</artifactId>
-            <version>2.14.2</version>
-        </dependency>
             <groupId>commons-logging</groupId>
             <artifactId>commons-logging</artifactId>
diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
index 431c6766a8be..eda453041fa3 100644
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,11 +6,11 @@
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>2.1.0-SNAPSHOT</version>
+        <version>2.2.0-SNAPSHOT</version>
     <name>xgboost4j-example</name>
     <artifactId>xgboost4j-example_2.12</artifactId>
-    <version>2.1.0-SNAPSHOT</version>
+    <version>2.2.0-SNAPSHOT</version>
     <packaging>jar</packaging>
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index e3dfb383041f..10ebfe36a6e8 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,12 +6,12 @@
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>2.1.0-SNAPSHOT</version>
+        <version>2.2.0-SNAPSHOT</version>
     <name>xgboost4j-flink</name>
     <artifactId>xgboost4j-flink_2.12</artifactId>
-    <version>2.1.0-SNAPSHOT</version>
+    <version>2.2.0-SNAPSHOT</version>
         <flink.ml.version>2.2.0</flink.ml.version>
diff --git a/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
index 99608b927489..a660bca8806c 100644
--- a/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
+++ b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
@@ -176,7 +176,7 @@ public static XGBoostModel train(DataSet> dtrain,
new RabitTracker(dtrain.getExecutionEnvironment().getParallelism());
if (tracker.start()) {
return dtrain
- .mapPartition(new MapFunction(params, numBoostRound, tracker.workerArgs()))
+ .mapPartition(new MapFunction(params, numBoostRound, tracker.getWorkerArgs()))
.reduce((x, y) -> x)
.collect()
.get(0);
diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index 25b44d6b2d2d..bd26acd688cd 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -6,11 +6,11 @@
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>2.1.0-SNAPSHOT</version>
+        <version>2.2.0-SNAPSHOT</version>
     <artifactId>xgboost4j-gpu_2.12</artifactId>
     <name>xgboost4j-gpu</name>
-    <version>2.1.0-SNAPSHOT</version>
+    <version>2.2.0-SNAPSHOT</version>
     <packaging>jar</packaging>
@@ -72,7 +72,7 @@
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-javadoc-plugin</artifactId>
-                <version>3.6.3</version>
+                <version>3.7.0</version>
                 <show>protected</show>
                 <nohelp>true</nohelp>
@@ -88,7 +88,7 @@
                 <artifactId>exec-maven-plugin</artifactId>
                 <groupId>org.codehaus.mojo</groupId>
-                <version>3.2.0</version>
+                <version>3.3.0</version>
                 <id>native</id>
@@ -113,7 +113,7 @@
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-jar-plugin</artifactId>
-                <version>3.4.0</version>
+                <version>3.4.1</version>
diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml
index 149f2f3a326a..c97924105f29 100644
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -6,7 +6,7 @@
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>2.1.0-SNAPSHOT</version>
+        <version>2.2.0-SNAPSHOT</version>
     <name>xgboost4j-spark-gpu</name>
     <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
index 7e83dc6f17b0..00c547aa8758 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2021-2022 by Contributors
+ Copyright (c) 2021-2024 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -29,7 +29,7 @@ import org.apache.spark.{SparkContext, TaskContext}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}
@@ -444,7 +444,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
.groupBy(groupName)
.agg(collect_list(struct(schema.fieldNames.map(col): _*)) as "list")
- implicit val encoder = RowEncoder(schema)
+ implicit val encoder = ExpressionEncoder(RowEncoder.encoderFor(schema, false))
// Expand the grouped rows after repartition
repartitionInputData(groupedDF, nWorkers).mapPartitions(iter => {
new Iterator[Row] {
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala
index c88aefa4eb0a..79a8d5449606 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala
@@ -89,9 +89,13 @@ private[spark] object GpuUtils {
val featureNameSet = featureNames.distinct
validateSchema(dataset.schema, featureNameSet, labelName, weightName, marginName, fitting)
- val castToFloat = (ds: Dataset[_], colName: String) => {
- val colMeta = ds.schema(colName).metadata
- ds.withColumn(colName, col(colName).as(colName, colMeta).cast(FloatType))
+ val castToFloat = (df: DataFrame, colName: String) => {
+ if (df.schema(colName).dataType.isInstanceOf[FloatType]) {
+ df
+ } else {
+ val colMeta = df.schema(colName).metadata
+ df.withColumn(colName, col(colName).as(colName, colMeta).cast(FloatType))
+ }
}
val colNames = if (fitting) {
var names = featureNameSet :+ labelName
diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml
index 6f16335f013d..5412642549d6 100644
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>2.1.0-SNAPSHOT</version>
+        <version>2.2.0-SNAPSHOT</version>
     <name>xgboost4j-spark</name>
     <artifactId>xgboost4j-spark_2.12</artifactId>
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index e17c68355c5b..10c4b5a72992 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -233,24 +233,6 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
xgbExecParam.setRawParamMap(overridedParams)
xgbExecParam
}
-
- private[spark] def buildRabitParams : Map[String, String] = Map(
- "rabit_reduce_ring_mincount" ->
- overridedParams.getOrElse("rabit_ring_reduce_threshold", 32 << 10).toString,
- "rabit_debug" ->
- (overridedParams.getOrElse("verbosity", 0).toString.toInt == 3).toString,
- "rabit_timeout" ->
- (overridedParams.getOrElse("rabit_timeout", -1).toString.toInt >= 0).toString,
- "rabit_timeout_sec" -> {
- if (overridedParams.getOrElse("rabit_timeout", -1).toString.toInt >= 0) {
- overridedParams.get("rabit_timeout").toString
- } else {
- "1800"
- }
- },
- "DMLC_WORKER_CONNECT_RETRY" ->
- overridedParams.getOrElse("dmlc_worker_connect_retry", 5).toString
- )
}
/**
@@ -475,17 +457,15 @@ object XGBoost extends XGBoostStageLevel {
}
}
- /** visiable for testing */
- private[scala] def getTracker(nWorkers: Int, trackerConf: TrackerConf): ITracker = {
- val tracker: ITracker = new RabitTracker(
- nWorkers, trackerConf.hostIp, trackerConf.port, trackerConf.timeout)
- tracker
- }
-
- private def startTracker(nWorkers: Int, trackerConf: TrackerConf): ITracker = {
- val tracker = getTracker(nWorkers, trackerConf)
+ // Executes the provided code block inside a tracker and then stops the tracker
+ private def withTracker[T](nWorkers: Int, conf: TrackerConf)(block: ITracker => T): T = {
+ val tracker = new RabitTracker(nWorkers, conf.hostIp, conf.port, conf.timeout)
require(tracker.start(), "FAULT: Failed to start tracker")
- tracker
+ try {
+ block(tracker)
+ } finally {
+ tracker.stop()
+ }
}
/**
@@ -501,28 +481,27 @@ object XGBoost extends XGBoostStageLevel {
logger.info(s"Running XGBoost ${spark.VERSION} with parameters:\n${params.mkString("\n")}")
val xgbParamsFactory = new XGBoostExecutionParamsFactory(params, sc)
- val xgbExecParams = xgbParamsFactory.buildXGBRuntimeParams
- val xgbRabitParams = xgbParamsFactory.buildRabitParams.asJava
+ val runtimeParams = xgbParamsFactory.buildXGBRuntimeParams
- val prevBooster = xgbExecParams.checkpointParam.map { checkpointParam =>
+ val prevBooster = runtimeParams.checkpointParam.map { checkpointParam =>
val checkpointManager = new ExternalCheckpointManager(
checkpointParam.checkpointPath,
FileSystem.get(sc.hadoopConfiguration))
- checkpointManager.cleanUpHigherVersions(xgbExecParams.numRounds)
+ checkpointManager.cleanUpHigherVersions(runtimeParams.numRounds)
checkpointManager.loadCheckpointAsScalaBooster()
}.orNull
// Get the training data RDD and the cachedRDD
- val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)
+ val (trainingRDD, optionalCachedRDD) = buildTrainingData(runtimeParams)
try {
- // Train for every ${savingRound} rounds and save the partially completed booster
- val tracker = startTracker(xgbExecParams.numWorkers, xgbExecParams.trackerConf)
- val (booster, metrics) = try {
- tracker.workerArgs().putAll(xgbRabitParams)
- val rabitEnv = tracker.workerArgs
+ val (booster, metrics) = withTracker(
+ runtimeParams.numWorkers,
+ runtimeParams.trackerConf
+ ) { tracker =>
+ val rabitEnv = tracker.getWorkerArgs()
- val boostersAndMetrics = trainingRDD.barrier().mapPartitions { iter => {
+ val boostersAndMetrics = trainingRDD.barrier().mapPartitions { iter =>
var optionWatches: Option[() => Watches] = None
// take the first Watches to train
@@ -530,26 +509,25 @@ object XGBoost extends XGBoostStageLevel {
optionWatches = Some(iter.next())
}
- optionWatches.map { buildWatches => buildDistributedBooster(buildWatches,
- xgbExecParams, rabitEnv, xgbExecParams.obj, xgbExecParams.eval, prevBooster)}
- .getOrElse(throw new RuntimeException("No Watches to train"))
-
- }}
+ optionWatches.map { buildWatches =>
+ buildDistributedBooster(buildWatches,
+ runtimeParams, rabitEnv, runtimeParams.obj, runtimeParams.eval, prevBooster)
+ }.getOrElse(throw new RuntimeException("No Watches to train"))
+ }
- val boostersAndMetricsWithRes = tryStageLevelScheduling(sc, xgbExecParams,
+ val boostersAndMetricsWithRes = tryStageLevelScheduling(sc, runtimeParams,
boostersAndMetrics)
// The repartition step is to make training stage as ShuffleMapStage, so that when one
// of the training task fails the training stage can retry. ResultStage won't retry when
// it fails.
val (booster, metrics) = boostersAndMetricsWithRes.repartition(1).collect()(0)
(booster, metrics)
- } finally {
- tracker.stop()
}
+
// we should delete the checkpoint directory after a successful training
- xgbExecParams.checkpointParam.foreach {
+ runtimeParams.checkpointParam.foreach {
cpParam =>
- if (!xgbExecParams.checkpointParam.get.skipCleanCheckpoint) {
+ if (!runtimeParams.checkpointParam.get.skipCleanCheckpoint) {
val checkpointManager = new ExternalCheckpointManager(
cpParam.checkpointPath,
FileSystem.get(sc.hadoopConfiguration))
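
The withTracker refactor above is the classic loan pattern: the tracker's lifetime brackets the code that borrows it, and stop() runs even when training throws. A minimal Java analogue (a hypothetical helper; it assumes ITracker exposes stop() as the Scala call sites imply):

    import java.util.function.Function;

    import ml.dmlc.xgboost4j.java.ITracker;
    import ml.dmlc.xgboost4j.java.RabitTracker;
    import ml.dmlc.xgboost4j.java.XGBoostError;

    // Run `block` with a started tracker and always stop the tracker afterwards.
    static <T> T withTracker(int nWorkers, Function<ITracker, T> block) throws XGBoostError {
      ITracker tracker = new RabitTracker(nWorkers);
      if (!tracker.start()) {
        throw new XGBoostError("FAULT: Failed to start tracker");
      }
      try {
        return block.apply(tracker);
      } finally {
        tracker.stop();
      }
    }
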
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
index 108053af5d76..d3f3901ad704 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
@@ -45,7 +45,7 @@ class CommunicatorRobustnessSuite extends AnyFunSuite with PerTest {
val tracker = new RabitTracker(numWorkers)
tracker.start()
- val trackerEnvs = tracker. workerArgs
+ val trackerEnvs = tracker.getWorkerArgs
val workerCount: Int = numWorkers
/*
@@ -84,7 +84,7 @@ class CommunicatorRobustnessSuite extends AnyFunSuite with PerTest {
val rdd = sc.parallelize(1 to numWorkers, numWorkers).cache()
val tracker = new RabitTracker(numWorkers)
tracker.start()
- val trackerEnvs = tracker.workerArgs
+ val trackerEnvs = tracker.getWorkerArgs
val workerCount: Int = numWorkers
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index 5a83a400c50b..5c5648b6d23a 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -6,11 +6,11 @@
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>2.1.0-SNAPSHOT</version>
+        <version>2.2.0-SNAPSHOT</version>
     <name>xgboost4j</name>
     <artifactId>xgboost4j_2.12</artifactId>
-    <version>2.1.0-SNAPSHOT</version>
+    <version>2.2.0-SNAPSHOT</version>
     <packaging>jar</packaging>
@@ -53,6 +53,12 @@
             <version>${scalatest.version}</version>
             <scope>provided</scope>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>${fasterxml.jackson.version}</version>
+            <scope>provided</scope>
+        </dependency>
@@ -60,7 +66,7 @@
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-javadoc-plugin</artifactId>
-                <version>3.6.3</version>
+                <version>3.7.0</version>
                 <show>protected</show>
                 <nohelp>true</nohelp>
@@ -76,7 +82,7 @@
                 <artifactId>exec-maven-plugin</artifactId>
                 <groupId>org.codehaus.mojo</groupId>
-                <version>3.2.0</version>
+                <version>3.3.0</version>
                 <id>native</id>
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ITracker.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ITracker.java
index 1bfef677d45c..84e535a269e2 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ITracker.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ITracker.java
@@ -7,7 +7,7 @@
*
* - start(timeout): Start the tracker awaiting for worker connections, with a given
* timeout value (in seconds).
- * - workerArgs(): Return the arguments needed to initialize Rabit clients.
+ * - getWorkerArgs(): Return the arguments needed to initialize Rabit clients.
* - waitFor(timeout): Wait for the task execution by the worker nodes for at most `timeout`
* milliseconds.
*
@@ -21,21 +21,8 @@
* brokers connections between workers.
*/
public interface ITracker extends Thread.UncaughtExceptionHandler {
- enum TrackerStatus {
- SUCCESS(0), INTERRUPTED(1), TIMEOUT(2), FAILURE(3);
- private int statusCode;
-
- TrackerStatus(int statusCode) {
- this.statusCode = statusCode;
- }
-
- public int getStatusCode() {
- return this.statusCode;
- }
- }
-
-  Map<String, Object> workerArgs() throws XGBoostError;
+  Map<String, Object> getWorkerArgs() throws XGBoostError;
boolean start() throws XGBoostError;
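
Putting the interface together: the driver starts the tracker and hands getWorkerArgs() to every worker, which uses them to join the collective. A hedged sketch (Communicator.init/shutdown follow the xgboost4j wrapper naming and should be treated as assumptions):

    import java.util.Map;

    import ml.dmlc.xgboost4j.java.Communicator;
    import ml.dmlc.xgboost4j.java.ITracker;
    import ml.dmlc.xgboost4j.java.RabitTracker;
    import ml.dmlc.xgboost4j.java.XGBoostError;

    // Driver process: start the tracker, then ship its args to the workers.
    static Map<String, Object> startDriver(int nWorkers) throws XGBoostError {
      ITracker tracker = new RabitTracker(nWorkers);
      if (!tracker.start()) {
        throw new XGBoostError("Failed to start tracker");
      }
      return tracker.getWorkerArgs(); // later: tracker.waitFor(timeoutMillis)
    }

    // Each worker process: join the collective, train, then leave.
    static void runWorker(Map<String, Object> workerArgs) throws XGBoostError {
      Communicator.init(workerArgs);
      // ... distributed training happens here ...
      Communicator.shutdown();
    }
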
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java
index 914a493cc8d1..48b163a7753b 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java
@@ -1,3 +1,19 @@
+/*
+ Copyright (c) 2014-2024 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
package ml.dmlc.xgboost4j.java;
import java.util.Map;
@@ -10,14 +26,12 @@
/**
* Java implementation of the Rabit tracker to coordinate distributed workers.
- *
- * The tracker must be started on driver node before running distributed jobs.
*/
public class RabitTracker implements ITracker {
// Maybe per tracker logger?
private static final Log logger = LogFactory.getLog(RabitTracker.class);
private long handle = 0;
- private Thread tracker_daemon;
+ private Thread trackerDaemon;
public RabitTracker(int numWorkers) throws XGBoostError {
this(numWorkers, "");
@@ -44,7 +58,7 @@ public void uncaughtException(Thread t, Throwable e) {
} catch (InterruptedException ex) {
logger.error(ex);
} finally {
- this.tracker_daemon.interrupt();
+ this.trackerDaemon.interrupt();
}
}
@@ -52,16 +66,14 @@ public void uncaughtException(Thread t, Throwable e) {
* Get environments that can be used to pass to worker.
* @return The environment settings.
*/
-  public Map<String, Object> workerArgs() throws XGBoostError {
+  public Map<String, Object> getWorkerArgs() throws XGBoostError {
// fixme: timeout
String[] args = new String[1];
XGBoostJNI.checkCall(XGBoostJNI.TrackerWorkerArgs(this.handle, 0, args));
ObjectMapper mapper = new ObjectMapper();
- TypeReference