Merge branch 'master' into untyped_storage

dmlc · Mar 7, 2024 · c259d1b · c259d1b
2 parents 77473a7 + 34ae70b
commit c259d1b
Show file tree

Hide file tree

Showing 116 changed files with 5,635 additions and 1,159 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -25,3 +25,6 @@
 [submodule "third_party/cccl"]
 	path = third_party/cccl
 	url = https://github.com/NVIDIA/cccl.git
+[submodule "third_party/liburing"]
+	path = third_party/liburing
+	url = https://github.com/axboe/liburing.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -139,8 +139,6 @@ endif()
 if(USE_CUDA)
   message(STATUS "Build with CUDA support")
   project(dgl C CXX)
-  # see https://github.com/NVIDIA/thrust/issues/1401
-  add_definitions(-DTHRUST_CUB_WRAPPED_NAMESPACE=dgl)
   include(cmake/modules/CUDA.cmake)
   message(STATUS "Use external CCCL library for a consistent API and performance.")
   cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/cccl/thrust")
@@ -351,7 +349,23 @@ endif(EXTERNAL_PHMAP_PATH)
 target_include_directories(dgl PRIVATE "tensoradapter/include")
 target_include_directories(dgl PRIVATE "third_party/pcg/include")
 
-
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")  # The bundled liburing submodule is only built on Linux.
+  include(ExternalProject)
+  set(LIBURING_INSTALL_DIR ${CMAKE_BINARY_DIR}/third_party/liburing)  # Install root inside the build tree.
+  ExternalProject_Add(
+    liburing
+    SOURCE_DIR ${CMAKE_SOURCE_DIR}/third_party/liburing
+    CONFIGURE_COMMAND <SOURCE_DIR>/configure --prefix=${LIBURING_INSTALL_DIR}
+    BUILD_COMMAND bash -c "make -j 4"
+    BUILD_IN_SOURCE ON
+    INSTALL_COMMAND make install  # No DESTDIR: it is prepended to the prefix and would nest the install dir twice.
+    BUILD_BYPRODUCTS ${LIBURING_INSTALL_DIR}/lib/liburing.a
+    BUILD_BYPRODUCTS ${LIBURING_INSTALL_DIR}/include
+    DOWNLOAD_EXTRACT_TIMESTAMP true
+  )
+  set(LIBURING_INCLUDE ${LIBURING_INSTALL_DIR}/include)  # Header directory populated by the install step.
+  set(LIBURING ${LIBURING_INSTALL_DIR}/lib/liburing.a)  # Static library populated by the install step.
+endif()
 
 if(EXTERNAL_NANOFLANN_PATH)
   include_directories(SYSTEM ${EXTERNAL_NANOFLANN_PATH})

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -318,7 +318,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v240123_1000"
+                  image "dgllib/dgl-ci-cpu:v240227_1200"
                   args "-u root"
                   alwaysPull true
                 }
@@ -337,7 +337,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
+                  image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
                   args "-u root"
                   alwaysPull true
                 }
@@ -392,7 +392,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v240123_1000"
+                  image "dgllib/dgl-ci-cpu:v240227_1200"
                   args "-u root"
                   alwaysPull true
                 }
@@ -411,7 +411,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-gpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
+                  image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
                   args "-u root --runtime nvidia"
                   alwaysPull true
                 }
@@ -466,7 +466,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-gpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
+                  image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
                   args "-u root --runtime nvidia"
                   alwaysPull true
                 }
@@ -491,7 +491,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v240123_1000"
+                  image "dgllib/dgl-ci-cpu:v240227_1200"
                   args "-u root --shm-size=4gb"
                   alwaysPull true
                 }
@@ -544,7 +544,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-gpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
+                  image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
                   args "-u root --runtime nvidia --shm-size=8gb"
                   alwaysPull true
                 }
@@ -573,7 +573,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v240123_1000"
+                  image "dgllib/dgl-ci-cpu:v240227_1200"
                   args "-u root --shm-size=4gb"
                   alwaysPull true
                 }
@@ -620,7 +620,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v240123_1000"
+                  image "dgllib/dgl-ci-cpu:v240227_1200"
                   args "-u root"
                   alwaysPull true
                 }

diff --git a/conda/dgl/meta.yaml b/conda/dgl/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: dgl{{ environ.get('DGL_PACKAGE_SUFFIX', '') }}
-  version: 2.1{{ environ.get('DGL_VERSION_SUFFIX', '') }}
+  version: 2.2{{ environ.get('DGL_VERSION_SUFFIX', '') }}
 
 source:
   git_rev: {{ environ.get('DGL_RELEASE_BRANCH', 'master') }}

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
@@ -1,6 +1,6 @@
 # CI docker CPU env
 # Adapted from github.com/dmlc/tvm/docker/Dockerfile.ci_cpu
-FROM ubuntu:18.04
+FROM ubuntu:20.04
 
 ENV TZ=US
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
@@ -1,14 +1,9 @@
 # CI docker GPU env
-FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu18.04
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
 
 ENV TZ=US
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
-# Update outdated public key from NVIDIA
-RUN apt-key del 3bf863cc
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
-RUN apt-get update --fix-missing
-
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 

diff --git a/docker/install/conda_env/torch_cpu.yml b/docker/install/conda_env/torch_cpu.yml
@@ -1,6 +1,6 @@
 name: pytorch-ci
 dependencies:
-  - python=3.8
+  - python=3.10
   - pip
   - pip:
     - --find-links https://download.pytorch.org/whl/torch_stable.html

diff --git a/docker/install/conda_env/torch_cpu_pip.txt b/docker/install/conda_env/torch_cpu_pip.txt
@@ -17,7 +17,7 @@ rdflib
 requests[security]==2.28
 scikit-learn
 scipy
-torch==1.13.0+cpu
+torch==2.0.0+cpu
 torchdata
 torcheval
 torchmetrics

diff --git a/docker/install/conda_env/torch_gpu.yml b/docker/install/conda_env/torch_gpu.yml
@@ -1,6 +1,6 @@
 name: pytorch-ci
 dependencies:
-  - python=3.8
+  - python=3.10
   - pip
   - pip:
     - --find-links https://download.pytorch.org/whl/torch_stable.html

diff --git a/docker/install/conda_env/torch_gpu_pip.txt b/docker/install/conda_env/torch_gpu_pip.txt
@@ -15,7 +15,7 @@ rdflib
 requests[security]==2.28
 scikit-learn
 scipy
-torch==1.13.0+cu116
+torch==2.0.0+cu118
 torchdata
 torcheval
 torchmetrics

diff --git a/docs/source/api/python/dgl.distributed.rst b/docs/source/api/python/dgl.distributed.rst
@@ -104,3 +104,4 @@ Split and Load Partitions
     load_partition_feats
     load_partition_book
     partition_graph
+    dgl_partition_to_graphbolt
diff --git a/docs/source/api/python/nn-pytorch.rst b/docs/source/api/python/nn-pytorch.rst
@@ -132,14 +132,7 @@ Utility Modules
     ~dgl.nn.pytorch.explain.PGExplainer
     ~dgl.nn.pytorch.explain.HeteroPGExplainer
     ~dgl.nn.pytorch.utils.LabelPropagation
-    ~dgl.nn.pytorch.graph_transformer.DegreeEncoder
     ~dgl.nn.pytorch.utils.LaplacianPosEnc
-    ~dgl.nn.pytorch.graph_transformer.BiasedMultiheadAttention
-    ~dgl.nn.pytorch.graph_transformer.EGTLayer
-    ~dgl.nn.pytorch.graph_transformer.GraphormerLayer
-    ~dgl.nn.pytorch.graph_transformer.PathEncoder
-    ~dgl.nn.pytorch.graph_transformer.SpatialEncoder
-    ~dgl.nn.pytorch.graph_transformer.SpatialEncoder3d
 
 Network Embedding Modules
 ----------------------------------------

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -194,6 +194,8 @@
 # -- Extension configuration -------------------------------------------------
 autosummary_generate = True
 autodoc_member_order = "alphabetical"
+# Mock the following modules so autodoc does not try to import them.
+autodoc_mock_imports = ["dgl.nn.mxnet", "dgl.nn.tensorflow"]
 
 intersphinx_mapping = {
     "python": (

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -48,8 +48,6 @@ Welcome to Deep Graph Library Tutorials and Documentation
    api/python/dgl.geometry
    api/python/dgl.graphbolt
    api/python/nn-pytorch
-   api/python/nn-tensorflow
-   api/python/nn-mxnet
    api/python/nn.functional
    api/python/dgl.ops
    api/python/dgl.optim

diff --git a/examples/README.md b/examples/README.md
@@ -6,6 +6,13 @@ The folder contains example implementations of selected research papers related
 
 To quickly locate the examples of your interest, search for the tagged keywords or use the search tool on [dgl.ai](https://www.dgl.ai/).
 
+## 2024
+
+- <a name="argo"></a> Lin et al. ARGO: An Auto-Tuning Runtime System for Scalable GNN Training on Multi-Core Processor. [Paper link](https://arxiv.org/abs/2402.03671)
+  - Example code: [PyTorch](https://github.com/dmlc/dgl/tree/master/examples/pytorch/argo)
+
+  - Tags: semi-supervised node classification
+
 ## 2023
 
 - <a name="labor"></a> Zheng Wang et al. From Cluster Assumption to Graph Convolution: Graph-based Semi-Supervised Learning Revisited. [Paper link](https://arxiv.org/abs/2210.13339)

diff --git a/examples/distributed/graphsage/README.md b/examples/distributed/graphsage/README.md
@@ -138,3 +138,63 @@ python3 ~/workspace/dgl/tools/launch.py \
 --ip_config ip_config.txt \
 "python3 node_classification.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 30 --batch_size 1000 --num_gpus 4"
 ```
+
+### Running with GraphBolt
+
+In order to run with `GraphBolt`, we need to partition the graph into `GraphBolt` data formats. Please note that both `DGL` and `GraphBolt` partitions are saved together.
+
+```
+python3 partition_graph.py --dataset ogbn-products --num_parts 2 --balance_train --balance_edges --use_graphbolt
+```
+
+#### Partition sizes compared to DGL
+
+Compared to `DGL`, `GraphBolt` partitions are much smaller (reduced to **16%** and **19%** of the `DGL` partition size for `ogbn-products` and `ogbn-papers100M` respectively).
+
+`ogbn-products`
+
+| Data Formats |         File Name            | Part 0 | Part 1 |
+| ------------ | ---------------------------- | ------ | ------ |
+| DGL          | graph.dgl                    | 1.5GB  | 1.6GB  |
+| GraphBolt    | fused_csc_sampling_graph.pt  | 255MB  | 265MB  |
+
+`ogbn-papers100M`
+
+| Data Formats |         File Name            | Part 0 | Part 1 |
+| ------------ | ---------------------------- | ------ | ------ |
+| DGL          | graph.dgl                    | 23GB   | 22GB   |
+| GraphBolt    | fused_csc_sampling_graph.pt  | 4.4GB  | 4.1GB  |
+
+Then run the example with `--use_graphbolt`.
+
+```
+python3 ~/workspace/dgl/tools/launch.py \
+--workspace ~/workspace/dgl/examples/pytorch/graphsage/dist/ \
+--num_trainers 4 \
+--num_samplers 0 \
+--num_servers 2 \
+--part_config data/ogbn-products.json \
+--ip_config ip_config.txt \
+"python3 node_classification.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --use_graphbolt"
+```
+
+#### Performance compared to `DGL`
+
+Compared to `DGL`, `GraphBolt`'s sampler works faster (sampling time reduced to **80%** and **77%** for `ogbn-products` and `ogbn-papers100M` respectively). `Min` and `Max` are statistics of all trainers on all nodes (machines).
+
+As for RAM usage, the shared memory usage (measured by the **shared** field of the `free` command) is decreased due to smaller graph partitions in `GraphBolt`, though the peak memory used by processes (measured by the **used** field of the `free` command) does not decrease.
+
+`ogbn-products`
+
+| Data Formats | Sample Time Per Epoch (CPU) |      Test Accuracy (10 epochs)   |  shared | used (peak) |
+| ------------ | --------------------------- | -------------------------------- |  -----  | ---- |
+|     DGL      | Min: 1.2884s, Max: 1.4159s  | Min: 64.38%, Max: 70.42%         |  2.4GB  | 7.8GB|
+|   GraphBolt  | Min: 1.0589s, Max: 1.1400s  | Min: 61.68%, Max: 71.23%         |  1.1GB  | 7.8GB|
+
+
+`ogbn-papers100M`
+
+| Data Formats | Sample Time Per Epoch (CPU) |      Test Accuracy (10 epochs)   |  shared | used (peak) |
+| ------------ | --------------------------- | -------------------------------- |  -----  | ---- |
+|     DGL      | Min: 5.5570s, Max: 6.1900s  | Min: 29.12%, Max: 34.33%         |  84GB   | 43GB |
+|   GraphBolt  | Min: 4.5046s, Max: 4.7718s  | Min: 29.11%, Max: 33.49%         |  67GB   | 43GB |
diff --git a/examples/distributed/graphsage/node_classification.py b/examples/distributed/graphsage/node_classification.py
@@ -340,7 +340,7 @@ def main(args):
     """
     host_name = socket.gethostname()
     print(f"{host_name}: Initializing DistDGL.")
-    dgl.distributed.initialize(args.ip_config)
+    dgl.distributed.initialize(args.ip_config, use_graphbolt=args.use_graphbolt)
     print(f"{host_name}: Initializing PyTorch process group.")
     th.distributed.init_process_group(backend=args.backend)
     print(f"{host_name}: Initializing DistGraph.")
@@ -457,6 +457,11 @@ def main(args):
         help="Pad train nid to the same length across machine, to ensure num "
         "of batches to be the same.",
     )
+    parser.add_argument(
+        "--use_graphbolt",
+        action="store_true",
+        help="Use GraphBolt for distributed train.",
+    )
     args = parser.parse_args()
     print(f"Arguments: {args}")
     main(args)
diff --git a/examples/distributed/graphsage/partition_graph.py b/examples/distributed/graphsage/partition_graph.py
@@ -87,6 +87,11 @@ def load_ogb(name, root="dataset"):
         default="data",
         help="Output path of partitioned graph.",
     )
+    argparser.add_argument(
+        "--use_graphbolt",
+        action="store_true",
+        help="Use GraphBolt for distributed train.",
+    )
     args = argparser.parse_args()
 
     start = time.time()
@@ -127,4 +132,5 @@ def load_ogb(name, root="dataset"):
         balance_ntypes=balance_ntypes,
         balance_edges=args.balance_edges,
         num_trainers_per_machine=args.num_trainers_per_machine,
+        use_graphbolt=args.use_graphbolt,
     )