Commit 9ea5525

Merge branch 'master' into untyped_storage

drivanov committed Jan 29, 2024
2 parents 7cedd70 + fe78093 commit 9ea5525
Showing 33 changed files with 1,050 additions and 409 deletions.
20 changes: 11 additions & 9 deletions Jenkinsfile
@@ -318,7 +318,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root"
alwaysPull true
}
@@ -337,7 +337,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root"
alwaysPull true
}
@@ -392,7 +392,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root"
alwaysPull true
}
@@ -411,7 +411,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-gpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root --runtime nvidia"
alwaysPull true
}
@@ -451,6 +451,8 @@ pipeline {
steps {
unit_test_linux('tensorflow', 'cpu')
}
// Tensorflow is deprecated.
when { expression { false } }
}
}
post {
@@ -464,7 +466,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-gpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root --runtime nvidia"
alwaysPull true
}
@@ -489,7 +491,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root --shm-size=4gb"
alwaysPull true
}
@@ -542,7 +544,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-gpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root --runtime nvidia --shm-size=8gb"
alwaysPull true
}
@@ -571,7 +573,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root --shm-size=4gb"
alwaysPull true
}
@@ -618,7 +620,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root"
alwaysPull true
}
1 change: 1 addition & 0 deletions docker/install/conda_env/torch_cpu_pip.txt
@@ -21,4 +21,5 @@ torch==1.13.0+cpu
torchdata
torcheval
torchmetrics
torch_geometric
tqdm
1 change: 1 addition & 0 deletions docker/install/conda_env/torch_gpu_pip.txt
@@ -19,4 +19,5 @@ torch==1.13.0+cu116
torchdata
torcheval
torchmetrics
torch_geometric
tqdm
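
Both environment files now pull in `torch_geometric` alongside the pinned torch stack. A quick, purely illustrative sanity check one could run inside the CI image to confirm the new dependency resolves (the exact versions are whatever the pinned files install):

```python
# Illustrative import check for the newly added dependency; not part of the CI scripts.
import torch
import torch_geometric

print("torch:", torch.__version__)                      # 1.13.0+cpu or 1.13.0+cu116 per the pins
print("torch_geometric:", torch_geometric.__version__)
```
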
16 changes: 8 additions & 8 deletions examples/core/rgcn/README.md
@@ -20,15 +20,15 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~1.1GB | ~5GB | 0 | 0GB | ~243s |
| ~1.1GB | ~3GB | 1 | 4.4GB | ~81s |
| ~1.1GB | ~7GB | 0 | 0GB | ~233s |
| ~1.1GB | ~5GB | 1 | 4.5GB | ~73.6s |

### Accuracies
```
Epoch: 01, Loss: 2.3302, Valid: 47.76%, Test: 46.58%
Epoch: 02, Loss: 1.5486, Valid: 48.31%, Test: 47.12%
Epoch: 03, Loss: 1.1469, Valid: 46.43%, Test: 45.18%
Test accuracy 45.1227
Epoch: 01, Loss: 2.3386, Valid: 47.67%, Test: 46.96%
Epoch: 02, Loss: 1.5563, Valid: 47.66%, Test: 47.02%
Epoch: 03, Loss: 1.1557, Valid: 46.58%, Test: 45.42%
Test accuracy 45.3850
```

## Run on `ogb-lsc-mag240m` dataset
@@ -54,8 +54,8 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~404GB | ~60GB | 0 | 0GB | ~216s |
| ~404GB | ~60GB | 1 | 7GB | ~157s |
| ~404GB | ~72GB | 0 | 0GB | ~325s |
| ~404GB | ~61GB | 1 | 14GB | ~178s |

### Accuracies
```
51 changes: 21 additions & 30 deletions examples/multigpu/graphbolt/node_classification.py
@@ -89,9 +89,7 @@ def create_dataloader(
features,
itemset,
device,
drop_last=False,
shuffle=True,
drop_uneven_inputs=False,
is_train,
):
############################################################################
# [HIGHLIGHT]
@@ -122,9 +120,9 @@ def create_dataloader(
datapipe = gb.DistributedItemSampler(
item_set=itemset,
batch_size=args.batch_size,
drop_last=drop_last,
shuffle=shuffle,
drop_uneven_inputs=drop_uneven_inputs,
drop_last=is_train,
shuffle=is_train,
drop_uneven_inputs=is_train,
)
############################################################################
# [Note]:
@@ -141,7 +139,10 @@ def create_dataloader(
if args.storage_device == "cpu":
datapipe = datapipe.copy_to(device)

dataloader = gb.DataLoader(datapipe, args.num_workers)
# Until https://github.com/dmlc/dgl/issues/7008, overlap should be False.
dataloader = gb.DataLoader(
datapipe, args.num_workers, overlap_feature_fetch=False
)

# Return the fully-initialized DataLoader object.
return dataloader
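
The hunks above collapse the three sampler flags into a single `is_train` switch and keep `overlap_feature_fetch` off until dmlc/dgl#7008 is resolved. A minimal sketch of the resulting helper, assuming the usual `gb` alias and an `args` namespace with `batch_size` and `num_workers`; the intermediate sampling and feature-fetch stages elided in the diff are omitted here as well:

```python
import dgl.graphbolt as gb

def build_dataloader(itemset, args, device, is_train):
    # One flag now drives every training-only sampler behaviour.
    datapipe = gb.DistributedItemSampler(
        item_set=itemset,
        batch_size=args.batch_size,
        drop_last=is_train,           # drop the ragged tail batch only when training
        shuffle=is_train,             # shuffle only when training
        drop_uneven_inputs=is_train,  # keep ranks in lockstep only when training
    )
    # The example guards this step with `if args.storage_device == "cpu"`.
    datapipe = datapipe.copy_to(device)
    # Feature-fetch overlap stays disabled until dmlc/dgl#7008 is fixed.
    return gb.DataLoader(datapipe, args.num_workers, overlap_feature_fetch=False)
```
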
@@ -187,7 +188,7 @@ def train(
epoch_start = time.time()

model.train()
total_loss = torch.tensor(0, dtype=torch.float).to(device)
total_loss = torch.tensor(0, dtype=torch.float, device=device)
########################################################################
# (HIGHLIGHT) Use Join Context Manager to solve uneven input problem.
#
@@ -227,20 +228,17 @@ def train(
loss.backward()
optimizer.step()

total_loss += loss
total_loss += loss.detach()

# Evaluate the model.
if rank == 0:
print("Validating...")
acc = (
evaluate(
rank,
model,
valid_dataloader,
num_classes,
device,
)
/ world_size
acc = evaluate(
rank,
model,
valid_dataloader,
num_classes,
device,
)
########################################################################
# (HIGHLIGHT) Collect accuracy and loss values from sub-processes and
@@ -252,14 +250,13 @@ def train(
dist.reduce(tensor=acc, dst=0)
total_loss /= step + 1
dist.reduce(tensor=total_loss, dst=0)
dist.barrier()

epoch_end = time.time()
if rank == 0:
print(
f"Epoch {epoch:05d} | "
f"Average Loss {total_loss.item() / world_size:.4f} | "
f"Accuracy {acc.item():.4f} | "
f"Accuracy {acc.item() / world_size:.4f} | "
f"Time {epoch_end - epoch_start:.4f}"
)
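
The two train-loop hunks above keep the running loss graph-free via `loss.detach()` and move the division by `world_size` out of `evaluate` and into the rank-0 print after `dist.reduce`. A compact sketch of that reduction pattern, assuming the process group is already initialized and both tensors live on the local device:

```python
import torch.distributed as dist

def report_metrics(rank, world_size, total_loss, acc, num_batches):
    # total_loss must be accumulated from detached values; summing the raw loss
    # would keep every iteration's autograd graph alive.
    total_loss = total_loss / num_batches
    dist.reduce(tensor=acc, dst=0)         # sum of per-rank accuracies on rank 0
    dist.reduce(tensor=total_loss, dst=0)  # sum of per-rank mean losses on rank 0
    if rank == 0:
        print(
            f"Average Loss {total_loss.item() / world_size:.4f} | "
            f"Accuracy {acc.item() / world_size:.4f}"
        )
```
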

@@ -301,29 +298,23 @@ def run(rank, world_size, args, devices, dataset):
dataset.feature,
train_set,
device,
drop_last=False,
shuffle=True,
drop_uneven_inputs=False,
is_train=True,
)
valid_dataloader = create_dataloader(
args,
dataset.graph,
dataset.feature,
valid_set,
device,
drop_last=False,
shuffle=False,
drop_uneven_inputs=False,
is_train=False,
)
test_dataloader = create_dataloader(
args,
dataset.graph,
dataset.feature,
test_set,
device,
drop_last=False,
shuffle=False,
drop_uneven_inputs=False,
is_train=False,
)

# Model training.
@@ -354,7 +345,7 @@ def run(rank, world_size, args, devices, dataset):
/ world_size
)
dist.reduce(tensor=test_acc, dst=0)
dist.barrier()
torch.cuda.synchronize()
if rank == 0:
print(f"Test Accuracy {test_acc.item():.4f}")

16 changes: 10 additions & 6 deletions examples/multigpu/node_classification_sage.py
@@ -171,12 +171,16 @@ def train(
use_uva,
):
# Instantiate a neighbor sampler
sampler = NeighborSampler(
[10, 10, 10],
prefetch_node_feats=["feat"],
prefetch_labels=["label"],
fused=(args.mode != "benchmark"),
)
if args.mode == "benchmark":
# A work-around to prevent CUDA running error. For more details, please
# see https://github.com/dmlc/dgl/issues/6697.
sampler = NeighborSampler([10, 10, 10], fused=False)
else:
sampler = NeighborSampler(
[10, 10, 10],
prefetch_node_feats=["feat"],
prefetch_labels=["label"],
)
train_dataloader = DataLoader(
g,
train_idx,
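
For reference, the fused-sampling workaround above (dmlc/dgl#6697) condensed into a standalone helper; `mode` stands in for the example's `--mode` argument, and the fanouts and prefetch keys are the ones used in the script:

```python
from dgl.dataloading import NeighborSampler

def build_sampler(mode: str) -> NeighborSampler:
    if mode == "benchmark":
        # Work-around for dmlc/dgl#6697: fall back to non-fused sampling and
        # skip feature prefetching in benchmark mode.
        return NeighborSampler([10, 10, 10], fused=False)
    return NeighborSampler(
        [10, 10, 10],
        prefetch_node_feats=["feat"],
        prefetch_labels=["label"],
    )
```
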
6 changes: 4 additions & 2 deletions examples/sampling/graphbolt/node_classification.py
@@ -287,7 +287,9 @@ def evaluate(args, model, graph, features, itemset, num_classes):


def train(args, graph, features, train_set, valid_set, num_classes, model):
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
optimizer = torch.optim.Adam(
model.parameters(), lr=args.lr, weight_decay=5e-4
)
dataloader = create_dataloader(
graph=graph,
features=features,
@@ -343,7 +345,7 @@ def parse_args():
parser.add_argument(
"--lr",
type=float,
default=0.0005,
default=1e-3,
help="Learning rate for optimization.",
)
parser.add_argument(
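
The two hunks above add L2 regularization to the optimizer and raise the default learning rate from 5e-4 to 1e-3. A tiny sketch of the new optimizer construction under those defaults; the `make_optimizer` helper itself is illustrative, not part of the example:

```python
import torch

def make_optimizer(model: torch.nn.Module, lr: float = 1e-3):
    # Commit defaults: lr 1e-3 (previously 5e-4) plus weight_decay 5e-4.
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
```
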
24 changes: 12 additions & 12 deletions examples/sampling/graphbolt/rgcn/README.md
@@ -19,15 +19,15 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~1.1GB | ~4.5GB | 0 | 0GB | ~235s |
| ~1.1GB | ~2GB | 1 | 4.4GB | ~60s |
| ~1.1GB | ~5.3GB | 0 | 0GB | ~230s |
| ~1.1GB | ~3GB | 1 | 3.87GB | ~64.6s |

### Accuracies
```
Epoch: 01, Loss: 2.6736, Valid accuracy: 42.21%
Epoch: 02, Loss: 2.0809, Valid accuracy: 42.51%
Epoch: 03, Loss: 1.8143, Valid accuracy: 42.76%
Test accuracy 41.4817
Epoch: 01, Loss: 2.3434, Valid accuracy: 48.23%
Epoch: 02, Loss: 1.5646, Valid accuracy: 48.49%
Epoch: 03, Loss: 1.1633, Valid accuracy: 45.79%
Test accuracy 44.6792
```

## Run on `ogb-lsc-mag240m` dataset
@@ -47,17 +47,17 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

> **note:**
`buffer/cache` are highly used during train, it's about 300GB. If more RAM is available, more `buffer/cache` will be consumed as graph size is about 55GB and feature data is about 350GB.
One more thing, first epoch is quite slow as `buffer/cache` is not ready yet. For GPU train, first epoch takes **34:56min, 1.93s/it**.
One more thing, first epoch is quite slow as `buffer/cache` is not ready yet. For GPU train, first epoch takes **1030s**.
Even in following epochs, time consumption varies.

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~404GB | ~55GB | 0 | 0GB | ~197s |
| ~404GB | ~55GB | 1 | 7GB | ~119s |
| ~404GB | ~67GB | 0 | 0GB | ~248s |
| ~404GB | ~60GB | 1 | 15GB | ~166s |

### Accuracies
```
Epoch: 01, Loss: 2.3038, Valid accuracy: 46.33%
Epoch: 02, Loss: 2.1160, Valid accuracy: 46.47%
Epoch: 03, Loss: 2.0847, Valid accuracy: 48.38%
Epoch: 01, Loss: 2.1432, Valid accuracy: 50.21%
Epoch: 02, Loss: 1.9267, Valid accuracy: 50.77%
Epoch: 03, Loss: 1.8797, Valid accuracy: 53.38%
```
5 changes: 3 additions & 2 deletions graphbolt/include/graphbolt/cuda_ops.h
@@ -113,15 +113,16 @@ std::tuple<torch::Tensor, torch::Tensor> IndexSelectCSCImpl(
* given nodes and their indptr values.
*
* @param indptr The indptr tensor.
* @param nodes The nodes to read from indptr
* @param nodes The nodes to read from indptr. If not provided, assumed to be
* equal to torch.arange(indptr.size(0) - 1).
*
* @return Tuple of tensors with values:
* (indptr[nodes + 1] - indptr[nodes], indptr[nodes]), the returned indegrees
* tensor (first one) has size nodes.size(0) + 1 so that calling ExclusiveCumSum
* on it gives the output indptr.
*/
std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
torch::Tensor indptr, torch::Tensor nodes);
torch::Tensor indptr, torch::optional<torch::Tensor> nodes);

/**
* @brief Given the compacted sub_indptr tensor, edge type tensor and
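
The updated doc comment spells out both the return value of `SliceCSCIndptr` and what an omitted `nodes` argument means. A pure-PyTorch illustration of that contract (names here are illustrative; the real operator is a CUDA kernel and additionally sizes the in-degree tensor `nodes.size(0) + 1` so an exclusive cumsum yields the output indptr):

```python
from typing import Optional

import torch

def slice_csc_indptr(indptr: torch.Tensor, nodes: Optional[torch.Tensor] = None):
    if nodes is None:
        # The newly optional argument: omitted nodes mean "all nodes".
        nodes = torch.arange(indptr.size(0) - 1, device=indptr.device)
    in_degree = indptr[nodes + 1] - indptr[nodes]
    sliced_indptr = indptr[nodes]
    return in_degree, sliced_indptr

# A 4-node CSC graph with 2, 0, 3 and 1 incoming edges per node.
indptr = torch.tensor([0, 2, 2, 5, 6])
print(slice_csc_indptr(indptr))                        # (tensor([2, 0, 3, 1]), tensor([0, 2, 2, 5]))
print(slice_csc_indptr(indptr, torch.tensor([2, 3])))  # (tensor([3, 1]), tensor([2, 5]))
```
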
9 changes: 5 additions & 4 deletions graphbolt/include/graphbolt/cuda_sampling_ops.h
@@ -19,7 +19,8 @@ namespace ops {
*
* @param indptr Index pointer array of the CSC.
* @param indices Indices array of the CSC.
* @param nodes The nodes from which to sample neighbors.
* @param nodes The nodes from which to sample neighbors. If not provided,
* assumed to be equal to torch.arange(indptr.size(0) - 1).
* @param fanouts The number of edges to be sampled for each node with or
* without considering edge types.
* - When the length is 1, it indicates that the fanout applies to all
Expand Down Expand Up @@ -49,9 +50,9 @@ namespace ops {
* the sampled graph's information.
*/
c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes,
const std::vector<int64_t>& fanouts, bool replace, bool layer,
bool return_eids,
torch::Tensor indptr, torch::Tensor indices,
torch::optional<torch::Tensor> nodes, const std::vector<int64_t>& fanouts,
bool replace, bool layer, bool return_eids,
torch::optional<torch::Tensor> type_per_edge = torch::nullopt,
torch::optional<torch::Tensor> probs_or_mask = torch::nullopt);

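
`SampleNeighbors` adopts the same convention: with `nodes` omitted, neighbors are sampled for every node of the CSC graph. A rough CPU reference of that interface under a single uniform fanout, with no edge types, probability masks or layer sampling; it mirrors the contract only, not the CUDA implementation:

```python
from typing import Optional

import torch

def sample_neighbors(
    indptr: torch.Tensor,
    indices: torch.Tensor,
    nodes: Optional[torch.Tensor] = None,
    fanout: int = 2,
    replace: bool = False,
):
    if nodes is None:
        # Same default as the new C++ signature: all nodes in the graph.
        nodes = torch.arange(indptr.size(0) - 1)
    sampled = []
    for n in nodes.tolist():
        start, end = indptr[n].item(), indptr[n + 1].item()
        neighbors = indices[start:end]
        if replace and neighbors.numel() > 0:
            pick = torch.randint(neighbors.numel(), (fanout,))
        else:
            # Without replacement: take at most `fanout` distinct neighbors.
            pick = torch.randperm(neighbors.numel())[:fanout]
        sampled.append(neighbors[pick])
    return sampled

indptr = torch.tensor([0, 2, 2, 5, 6])
indices = torch.tensor([1, 3, 0, 1, 2, 0])
print(sample_neighbors(indptr, indices, fanout=2))  # neighbor samples for all 4 nodes
```
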
