From 80203e34cd21516d2a503c671b0708f0b343ff45 Mon Sep 17 00:00:00 2001
From: hutiechuan <47250730+hutiechuan@users.noreply.github.com>
Date: Thu, 11 Jan 2024 07:50:32 +0800
Subject: [PATCH 01/22] [GraphBolt] add node classification example for pyg
 model (#6872)

Co-authored-by: Muhammed Fatih BALIN
---
 examples/sampling/pyg/README.md              |  57 +++++
 examples/sampling/pyg/node_classification.py | 235 +++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 examples/sampling/pyg/README.md
 create mode 100644 examples/sampling/pyg/node_classification.py

diff --git a/examples/sampling/pyg/README.md b/examples/sampling/pyg/README.md
new file mode 100644
index 000000000000..c75aa9b90dfe
--- /dev/null
+++ b/examples/sampling/pyg/README.md
@@ -0,0 +1,57 @@
+## Overview
+
+This project demonstrates the training and evaluation of a GraphSAGE model for node classification on large graphs. The example uses GraphBolt for efficient data handling and PyG for GNN training.
+
+
+## Node classification on graphs
+
+This example demonstrates how to run a node classification task on large graphs with **GraphBolt**.
+
+## Model
+
+The model is a three-layer GraphSAGE network implemented using PyTorch Geometric's SAGEConv layers.
+
+
+## Default run on the `ogbn-arxiv` dataset
+
+```
+python node_classification.py
+```
+
+## Accuracies
+```
+Final performance (for ogbn-arxiv):
+All runs:
+Highest Train: 62.26
+Highest Valid: 59.89
+Final Train: 62.26
+Final Test: 52.78
+```
+
+
+## Run on the `ogbn-products` dataset
+
+### Sample on CPU and train/infer on CPU
+
+```
+python node_classification.py --dataset ogbn-products
+```
+
+## Accuracies
+```
+Final performance (for ogbn-products):
+All runs:
+Highest Train: 90.79
+Highest Valid: 89.86
+Final Train: 90.79
+Final Test: 75.24
+```
+
+
diff --git a/examples/sampling/pyg/node_classification.py b/examples/sampling/pyg/node_classification.py
new file mode 100644
index 000000000000..923025298c83
--- /dev/null
+++ b/examples/sampling/pyg/node_classification.py
@@ -0,0 +1,235 @@
+"""
+This script demonstrates node classification with GraphSAGE on large graphs,
+combining GraphBolt (GB) and PyTorch Geometric (PyG). GraphBolt efficiently
+manages data loading for large datasets, which is crucial for mini-batch
+processing. After data loading, PyG's user-friendly framework takes over for
+training, showcasing seamless integration with GraphBolt. This combination
+offers an efficient alternative to traditional Deep Graph Library (DGL)
+methods, highlighting adaptability and scalability in handling large-scale
+graph data for diverse real-world applications.
+
+
+
+Key Features:
+- Implements the GraphSAGE model, a scalable GNN, for node classification on large graphs.
+- Utilizes GraphBolt, an efficient framework for large-scale graph data processing.
+- Integrates with PyTorch Geometric for building and training the GraphSAGE model.
+- The script is well-documented, providing clear explanations at each step.
+
+The flowchart below describes the main functional sequence of the example.
+main:
+
+main
+│
+├───> Load and preprocess dataset (GraphBolt)
+│     │
+│     └───> Utilize GraphBolt's BuiltinDataset for dataset handling
+│
+├───> Instantiate the SAGE model (PyTorch Geometric)
+│     │
+│     └───> Define the GraphSAGE model architecture
+│
+├───> Train the model
+│     │
+│     ├───> Mini-Batch Processing with GraphBolt
+│     │     │
+│     │     └───> Efficient handling of mini-batches using GraphBolt's utilities
+│     │
+│     └───> Training Loop
+│           │
+│           ├───> Forward and backward passes
+│           │
+│           └───> Parameter optimization
+│
+└───> Evaluate the model
+      │
+      └───> Performance assessment on validation and test datasets
+            │
+            └───> Calculation of accuracy and other relevant metrics
+
+
+"""
+
+import argparse
+
+import dgl.graphbolt as gb
+import torch
+import torch.nn.functional as F
+import torchmetrics.functional as MF
+from torch_geometric.nn import SAGEConv
+
+
+class GraphSAGE(torch.nn.Module):
+    #####################################################################
+    # (HIGHLIGHT) Define the GraphSAGE model architecture.
+    #
+    # - This class inherits from `torch.nn.Module`.
+    # - Three convolutional layers are created using the SAGEConv class from PyG.
+    # - 'in_size', 'hidden_size', 'out_size' are the sizes of
+    #   the input, hidden, and output features, respectively.
+    # - The forward method defines the computation performed at every call.
+    #####################################################################
+    def __init__(self, in_size, hidden_size, out_size):
+        super(GraphSAGE, self).__init__()
+        self.layers = torch.nn.ModuleList()
+        self.layers.append(SAGEConv(in_size, hidden_size))
+        self.layers.append(SAGEConv(hidden_size, hidden_size))
+        self.layers.append(SAGEConv(hidden_size, out_size))
+
+    def forward(self, blocks, x, device):
+        h = x
+        for i, (layer, block) in enumerate(zip(self.layers, blocks)):
+            src, dst = block.edges()
+            edge_index = torch.stack([src, dst], dim=0)
+            h_src, h_dst = h, h[: block.number_of_dst_nodes()]
+            h = layer((h_src, h_dst), edge_index)
+            if i != len(blocks) - 1:
+                h = F.relu(h)
+        return h
+
+
+def create_dataloader(dataset_set, graph, feature, device, is_train):
+    #####################################################################
+    # (HIGHLIGHT) Create a data loader for efficiently loading graph data.
+    #
+    # - 'ItemSampler' samples mini-batches of node IDs from the dataset.
+    # - 'sample_neighbor' performs neighbor sampling on the graph.
+    # - 'FeatureFetcher' fetches node features based on the sampled subgraph.
+    # - 'CopyTo' copies the fetched data to the specified device.
+
+    #####################################################################
+    # Create a datapipe for mini-batch sampling with a specific neighbor fanout.
+    # Here, [10, 10, 10] specifies the number of neighbors sampled for each node at each layer.
+    # We're using `sample_neighbor` for consistency with DGL's sampling API.
+    # Note: GraphBolt offers additional sampling methods, such as `sample_layer_neighbor`,
+    # which could provide further optimization and efficiency for GNN training.
+    # Users are encouraged to explore these advanced features for potentially improved performance.
+
+    # Initialize an ItemSampler to sample mini-batches from the dataset.
+    datapipe = gb.ItemSampler(
+        dataset_set, batch_size=1024, shuffle=is_train, drop_last=is_train
+    )
+    # Sample neighbors for each node in the mini-batch.
+    datapipe = datapipe.sample_neighbor(graph, [10, 10, 10])
+    # Fetch node features for the sampled subgraph.
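+    # The key "feat" below must match a feature name registered in the
+    # feature store; the fetched tensors appear on each mini-batch as
+    # `minibatch.node_features["feat"]` (used in `train` and `evaluate`).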
+ datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"]) + # Copy the data to the specified device. + datapipe = datapipe.copy_to(device=device) + # Create and return a DataLoader to handle data loading. + dataloader = gb.DataLoader(datapipe, num_workers=0) + + return dataloader + + +def train(model, dataloader, optimizer, criterion, device, num_classes): + ##################################################################### + # (HIGHLIGHT) Train the model for one epoch. + # + # - Iterates over the data loader, fetching mini-batches of graph data. + # - For each mini-batch, it performs a forward pass, computes loss, and + # updates the model parameters. + # - The function returns the average loss and accuracy for the epoch. + # + # Parameters: + # model: The GraphSAGE model. + # dataloader: DataLoader that provides mini-batches of graph data. + # optimizer: Optimizer used for updating model parameters. + # criterion: Loss function used for training. + # device: The device (CPU/GPU) to run the training on. + ##################################################################### + + model.train() # Set the model to training mode + total_loss = 0 # Accumulator for the total loss + total_correct = 0 # Accumulator for the total number of correct predictions + total_samples = 0 # Accumulator for the total number of samples processed + num_batches = 0 # Counter for the number of mini-batches processed + + for minibatch in dataloader: + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + optimizer.zero_grad() + out = model(minibatch.blocks, node_features, device) + loss = criterion(out, labels) + total_loss += loss.item() + total_correct += MF.accuracy( + out, labels, task="multiclass", num_classes=num_classes + ) * labels.size(0) + total_samples += labels.size(0) + loss.backward() + optimizer.step() + num_batches += 1 + avg_loss = total_loss / num_batches + avg_accuracy = total_correct / total_samples + return avg_loss, avg_accuracy + + +@torch.no_grad() +def evaluate(model, dataloader, device, num_classes): + model.eval() + y_hats = [] + ys = [] + for minibatch in dataloader: + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + out = model(minibatch.blocks, node_features, device) + y_hats.append(out) + ys.append(labels) + + return MF.accuracy( + torch.cat(y_hats), + torch.cat(ys), + task="multiclass", + num_classes=num_classes, + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Which dataset are you going to use?" 
+ ) + parser.add_argument( + "--dataset", + type=str, + default="ogbn-arxiv", + help='Name of the dataset to use (e.g., "ogbn-products", "ogbn-arxiv")', + ) + args = parser.parse_args() + dataset_name = args.dataset + dataset = gb.BuiltinDataset(dataset_name).load() + graph = dataset.graph + feature = dataset.feature + train_set = dataset.tasks[0].train_set + valid_set = dataset.tasks[0].validation_set + test_set = dataset.tasks[0].test_set + num_classes = dataset.tasks[0].metadata["num_classes"] + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + train_dataloader = create_dataloader( + train_set, graph, feature, device, is_train=True + ) + valid_dataloader = create_dataloader( + valid_set, graph, feature, device, is_train=False + ) + test_dataloader = create_dataloader( + test_set, graph, feature, device, is_train=False + ) + in_channels = feature.size("node", None, "feat")[0] + hidden_channels = 128 + model = GraphSAGE(in_channels, hidden_channels, num_classes).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) + criterion = torch.nn.CrossEntropyLoss() + for epoch in range(10): + train_loss, train_accuracy = train( + model, train_dataloader, optimizer, criterion, device, num_classes + ) + + valid_accuracy = evaluate(model, valid_dataloader, device, num_classes) + print( + f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {valid_accuracy:.4f}, " + f"Valid Accuracy: {valid_accuracy:.4f}" + ) + test_accuracy = evaluate(model, test_dataloader, device, num_classes) + print(f"Test Accuracy: {test_accuracy:.4f}") + + +if __name__ == "__main__": + main() From b6087efd7f59a71ed691f07370939f5cdc3706bc Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 10 Jan 2024 22:04:22 -0500 Subject: [PATCH 02/22] [GraphBolt] PyG example typo fix (#6931) --- examples/sampling/pyg/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sampling/pyg/node_classification.py b/examples/sampling/pyg/node_classification.py index 923025298c83..a34fbf4abecc 100644 --- a/examples/sampling/pyg/node_classification.py +++ b/examples/sampling/pyg/node_classification.py @@ -224,7 +224,7 @@ def main(): valid_accuracy = evaluate(model, valid_dataloader, device, num_classes) print( - f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {valid_accuracy:.4f}, " + f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, " f"Valid Accuracy: {valid_accuracy:.4f}" ) test_accuracy = evaluate(model, test_dataloader, device, num_classes) From 3ff7ad9d2de4b520138b2a7fccfaa00801712d05 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:34:21 +0800 Subject: [PATCH 03/22] [GraphBolt] update notebooks about numpy edges (#6936) --- .../stochastic_training/ondisk-dataset-specification.rst | 6 ++++-- .../stochastic_training/ondisk_dataset_heterograph.ipynb | 7 ++++--- .../stochastic_training/ondisk_dataset_homograph.ipynb | 5 +++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/source/stochastic_training/ondisk-dataset-specification.rst b/docs/source/stochastic_training/ondisk-dataset-specification.rst index b567247ef2d3..0587b26a8806 100644 --- a/docs/source/stochastic_training/ondisk-dataset-specification.rst +++ b/docs/source/stochastic_training/ondisk-dataset-specification.rst @@ -122,8 +122,10 @@ The ``graph`` field is used to specify the graph structure. It has two fields: homogeneous graphs. 
   For heterogeneous graphs, it is the edge type.
 - ``format``: ``string``
-  The ``format`` field is used to specify the format of the edge data. It can
-  only be ``csv`` for now.
+  The ``format`` field is used to specify the format of the edge data. It
+  can be ``csv`` or ``numpy``. If it is ``csv``, no ``index`` and ``header``
+  fields are needed. If it is ``numpy``, the array is required to be of shape
+  ``(2, num_edges)``. The ``numpy`` format is recommended for large graphs.
 - ``path``: ``string``
   The ``path`` field is used to specify the path of the edge data. It is
diff --git a/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb b/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb
index 0c28e8c402e2..8b2cb3ead2d5 100644
--- a/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb
+++ b/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb
@@ -103,10 +103,11 @@
   "cell_type": "markdown",
   "source": [
     "### Generate graph structure data\n",
-    "For heterogeneous graph, we need to save different edge edges(namely node pairs) into separate **CSV** files.\n",
+    "For heterogeneous graphs, we need to save the edges (namely node pairs) of each edge type into separate **Numpy** or **CSV** files.\n",
     "\n",
-    "**Note**:\n",
-    "when saving to file, do not save index and header.\n"
+    "Note:\n",
+    "- when saving to **Numpy**, the array is required to be of shape `(2, N)`. This format is recommended, as constructing a graph from it is much faster than from a **CSV** file.\n",
+    "- when saving to a **CSV** file, do not save the index and header.\n"
   ],
   "metadata": {
     "id": "qhNtIn_xhlnl"
diff --git a/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb b/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb
index 0f2e99b4442e..5ce4c8f168a6 100644
--- a/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb
+++ b/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb
@@ -103,10 +103,11 @@
   "cell_type": "markdown",
   "source": [
     "### Generate graph structure data\n",
-    "For homogeneous graph, we just need to save edges(namely node pairs) into **CSV** file.\n",
+    "For homogeneous graphs, we just need to save the edges (namely node pairs) into a **Numpy** or **CSV** file.\n",
     "\n",
     "Note:\n",
-    "when saving to file, do not save index and header.\n"
+    "- when saving to **Numpy**, the array is required to be of shape `(2, N)`. This format is recommended, as constructing a graph from it is much faster than from a **CSV** file.\n",
+    "- when saving to a **CSV** file, do not save the index and header.\n"
   ],
   "metadata": {
     "id": "qhNtIn_xhlnl"

From afca11142771a0d99c3fbdbe1c17ad4dc775b934 Mon Sep 17 00:00:00 2001
From: peizhou001 <110809584+peizhou001@users.noreply.github.com>
Date: Thu, 11 Jan 2024 16:29:50 +0800
Subject: [PATCH 04/22] [Graphbolt]Fix negative sampler (#6933)

Co-authored-by: Ubuntu
---
 .../graphbolt/fused_csc_sampling_graph.h      |  26 ---
 graphbolt/src/fused_csc_sampling_graph.cc     |  12 --
 graphbolt/src/python_binding.cc               |   3 -
 .../impl/fused_csc_sampling_graph.py          |  32 ++--
 .../impl/uniform_negative_sampler.py          |  15 +-
 .../graphbolt/impl/test_negative_sampler.py   |   8 +-
 .../pytorch/graphbolt/test_integration.py     | 153 ++++++++----------
 7 files changed, 98 insertions(+), 151 deletions(-)

diff --git a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h
index 6c60df2e4a33..2423ed982c5d 100644
--- a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h
+++ b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h
@@ -359,32 +359,6 @@ class FusedCSCSamplingGraph : public torch::CustomClassHolder {
       torch::optional<std::string> node_timestamp_attr_name,
       torch::optional<std::string> edge_timestamp_attr_name) const;
 
-  /**
-   * @brief Sample negative edges by randomly choosing negative
-   * source-destination pairs according to a uniform distribution. For each edge
-   * ``(u, v)``, it is supposed to generate `negative_ratio` pairs of negative
-   * edges ``(u, v')``, where ``v'`` is chosen uniformly from all the nodes in
-   * the graph.
-   *
-   * @param node_pairs A tuple of two 1D tensors that represent the source and
-   * destination of positive edges, with 'positive' indicating that these edges
-   * are present in the graph. It's important to note that within the context of
-   * a heterogeneous graph, the ids in these tensors signify heterogeneous ids.
-   * @param negative_ratio The ratio of the number of negative samples to
-   * positive samples.
-   * @param max_node_id The maximum ID of the node to be selected. It
-   * should correspond to the number of nodes of a specific type.
-   *
-   * @return A tuple consisting of two 1D tensors represents the source and
-   * destination of negative edges. In the context of a heterogeneous
-   * graph, both the input nodes and the selected nodes are represented
-   * by heterogeneous IDs. Note that negative refers to false negatives,
-   * which means the edge could be present or not present in the graph.
-   */
-  std::tuple<torch::Tensor, torch::Tensor> SampleNegativeEdgesUniform(
-      const std::tuple<torch::Tensor, torch::Tensor>& node_pairs,
-      int64_t negative_ratio, int64_t max_node_id) const;
-
   /**
    * @brief Copy the graph to shared memory.
    * @param shared_memory_name The name of the shared memory.
diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index 8432d8a2da01..431ee2f752b0 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -712,18 +712,6 @@ FusedCSCSamplingGraph::TemporalSampleNeighbors( } } -std::tuple -FusedCSCSamplingGraph::SampleNegativeEdgesUniform( - const std::tuple& node_pairs, - int64_t negative_ratio, int64_t max_node_id) const { - torch::Tensor pos_src; - std::tie(pos_src, std::ignore) = node_pairs; - auto neg_len = pos_src.size(0) * negative_ratio; - auto neg_src = pos_src.repeat(negative_ratio); - auto neg_dst = torch::randint(0, max_node_id, {neg_len}, pos_src.options()); - return std::make_tuple(neg_src, neg_dst); -} - static c10::intrusive_ptr BuildGraphFromSharedMemoryHelper(SharedMemoryHelper&& helper) { helper.InitializeRead(); diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index c60ad4b91180..44b6306d890d 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -52,9 +52,6 @@ TORCH_LIBRARY(graphbolt, m) { .def( "temporal_sample_neighbors", &FusedCSCSamplingGraph::TemporalSampleNeighbors) - .def( - "sample_negative_edges_uniform", - &FusedCSCSamplingGraph::SampleNegativeEdgesUniform) .def("copy_to_shared_memory", &FusedCSCSamplingGraph::CopyToSharedMemory) .def_pickle( // __getstate__ diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 80ff42433934..486bd59caccb 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -876,7 +876,8 @@ def sample_negative_edges_uniform( pairs according to a uniform distribution. For each edge ``(u, v)``, it is supposed to generate `negative_ratio` pairs of negative edges ``(u, v')``, where ``v'`` is chosen uniformly from all the nodes in - the graph. + the graph. As ``u`` is exactly same as the corresponding positive edges, + it returns None for negative sources. Parameters ---------- @@ -903,23 +904,22 @@ def sample_negative_edges_uniform( `edge_type`. Note that negative refers to false negatives, which means the edge could be present or not present in the graph. """ - if edge_type is not None: - assert ( - self.node_type_offset is not None - ), "The 'node_type_offset' array is necessary for performing \ - negative sampling by edge type." 
- _, _, dst_node_type = etype_str_to_tuple(edge_type) - dst_node_type_id = self.node_type_to_id[dst_node_type] - offset = self._node_type_offset_list - max_node_id = ( - offset[dst_node_type_id + 1] - offset[dst_node_type_id] - ) + if edge_type: + _, _, dst_ntype = etype_str_to_tuple(edge_type) + max_node_id = self.num_nodes[dst_ntype] else: max_node_id = self.total_num_nodes - return self._c_csc_graph.sample_negative_edges_uniform( - node_pairs, - negative_ratio, - max_node_id, + pos_src, _ = node_pairs + num_negative = pos_src.size(0) * negative_ratio + return ( + None, + torch.randint( + 0, + max_node_id, + (num_negative,), + dtype=pos_src.dtype, + device=pos_src.device, + ), ) def copy_to_shared_memory(self, shared_memory_name: str): diff --git a/python/dgl/graphbolt/impl/uniform_negative_sampler.py b/python/dgl/graphbolt/impl/uniform_negative_sampler.py index 512bd7ab5bc9..f979fd603249 100644 --- a/python/dgl/graphbolt/impl/uniform_negative_sampler.py +++ b/python/dgl/graphbolt/impl/uniform_negative_sampler.py @@ -32,20 +32,23 @@ class UniformNegativeSampler(NegativeSampler): Examples -------- >>> from dgl import graphbolt as gb - >>> indptr = torch.LongTensor([0, 2, 4, 5]) - >>> indices = torch.LongTensor([1, 2, 0, 2, 0]) + >>> indptr = torch.LongTensor([0, 1, 2, 3, 4]) + >>> indices = torch.LongTensor([1, 2, 3, 0]) >>> graph = gb.fused_csc_sampling_graph(indptr, indices) - >>> node_pairs = (torch.tensor([0, 1]), torch.tensor([1, 2])) + >>> node_pairs = torch.tensor([[0, 1], [1, 2], [2, 3], [3, 0]]) >>> item_set = gb.ItemSet(node_pairs, names="node_pairs") >>> item_sampler = gb.ItemSampler( - ... item_set, batch_size=1,) + ... item_set, batch_size=4,) >>> neg_sampler = gb.UniformNegativeSampler( ... item_sampler, graph, 2) >>> for minibatch in neg_sampler: ... print(minibatch.negative_srcs) ... 
print(minibatch.negative_dsts) - (tensor([0, 0, 0]), tensor([1, 1, 2]), tensor([1, 0, 0])) - (tensor([1, 1, 1]), tensor([2, 1, 2]), tensor([1, 0, 0])) + None + tensor([[2, 1], + [2, 1], + [3, 2], + [1, 3]]) """ def __init__( diff --git a/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py b/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py index 7905f32798f1..577ade0e6f3f 100644 --- a/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py @@ -46,8 +46,7 @@ def test_UniformNegativeSampler_invoke(): def _verify(negative_sampler): for data in negative_sampler: # Assertation - assert data.negative_srcs.size(0) == batch_size - assert data.negative_srcs.size(1) == negative_ratio + assert data.negative_srcs is None assert data.negative_dsts.size(0) == batch_size assert data.negative_dsts.size(1) == negative_ratio @@ -90,12 +89,9 @@ def test_Uniform_NegativeSampler(negative_ratio): # Assertation assert len(pos_src) == batch_size assert len(pos_dst) == batch_size - assert len(neg_src) == batch_size assert len(neg_dst) == batch_size - assert neg_src.numel() == batch_size * negative_ratio + assert neg_src is None assert neg_dst.numel() == batch_size * negative_ratio - expected_src = pos_src.repeat(negative_ratio).view(-1, negative_ratio) - assert torch.equal(expected_src, neg_src) def get_hetero_graph(): diff --git a/tests/python/pytorch/graphbolt/test_integration.py b/tests/python/pytorch/graphbolt/test_integration.py index e6c16567b8ba..fe3b5c0a2daa 100644 --- a/tests/python/pytorch/graphbolt/test_integration.py +++ b/tests/python/pytorch/graphbolt/test_integration.py @@ -48,7 +48,7 @@ def test_integration_link_prediction(): } feature_store = gb.BasicFeatureStore(features) datapipe = gb.ItemSampler(item_set, batch_size=4) - datapipe = datapipe.sample_uniform_negative(graph, 1) + datapipe = datapipe.sample_uniform_negative(graph, 2) fanouts = torch.LongTensor([1]) datapipe = datapipe.sample_neighbor(graph, [fanouts, fanouts], replace=True) datapipe = datapipe.transform(gb.exclude_seed_edges) @@ -62,23 +62,23 @@ def test_integration_link_prediction(): str( """MiniBatch(seed_nodes=None, sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 1, 2]), - indices=tensor([5, 4]), + indices=tensor([0, 4]), ), original_row_node_ids=tensor([5, 3, 1, 2, 0, 4]), original_edge_ids=None, original_column_node_ids=tensor([5, 3, 1, 2, 0, 4]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 1]), - indices=tensor([5]), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 1, 2]), + indices=tensor([5, 4]), ), original_row_node_ids=tensor([5, 3, 1, 2, 0, 4]), original_edge_ids=None, - original_column_node_ids=tensor([5, 3, 1, 2, 0]), + original_column_node_ids=tensor([5, 3, 1, 2, 0, 4]), )], positive_node_pairs=(tensor([0, 1, 1, 1]), tensor([2, 3, 3, 1])), - node_pairs_with_labels=((tensor([0, 1, 1, 1, 0, 1, 1, 1]), tensor([2, 3, 3, 1, 4, 4, 1, 4])), - tensor([1., 1., 1., 1., 0., 0., 0., 0.])), + node_pairs_with_labels=((tensor([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1]), tensor([2, 3, 3, 1, 4, 4, 1, 4, 0, 1, 1, 5])), + tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.])), node_pairs=(tensor([5, 3, 3, 3]), tensor([1, 2, 2, 3])), node_features={'feat': tensor([[0.5160, 0.2486], @@ -87,131 +87,120 @@ def test_integration_link_prediction(): [0.2109, 0.1089], [0.9634, 0.2294], [0.5503, 0.8223]])}, - negative_srcs=tensor([[5], - [3], - [3], 
- [3]]), - negative_node_pairs=(tensor([0, 1, 1, 1]), - tensor([4, 4, 1, 4])), - negative_dsts=tensor([[0], - [0], - [3], - [0]]), + negative_srcs=None, + negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 1, 1]), + tensor([4, 4, 1, 4, 0, 1, 1, 5])), + negative_dsts=tensor([[0, 0], + [3, 0], + [5, 3], + [3, 4]]), labels=None, input_nodes=tensor([5, 3, 1, 2, 0, 4]), edge_features=[{}, {}], compacted_node_pairs=(tensor([0, 1, 1, 1]), tensor([2, 3, 3, 1])), - compacted_negative_srcs=tensor([[0], - [1], - [1], - [1]]), - compacted_negative_dsts=tensor([[4], - [4], - [1], - [4]]), + compacted_negative_srcs=None, + compacted_negative_dsts=tensor([[4, 4], + [1, 4], + [0, 1], + [1, 5]]), blocks=[Block(num_src_nodes=6, num_dst_nodes=6, num_edges=2), - Block(num_src_nodes=6, num_dst_nodes=5, num_edges=1)], + Block(num_src_nodes=6, num_dst_nodes=6, num_edges=2)], )""" ), str( """MiniBatch(seed_nodes=None, - sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2]), - indices=tensor([1, 3]), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2, 3]), + indices=tensor([4, 1, 0]), ), - original_row_node_ids=tensor([3, 4, 0, 5, 1]), + original_row_node_ids=tensor([3, 4, 0, 1, 5, 2]), original_edge_ids=None, - original_column_node_ids=tensor([3, 4, 0, 5, 1]), + original_column_node_ids=tensor([3, 4, 0, 1, 5, 2]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2]), - indices=tensor([1, 3]), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2, 3]), + indices=tensor([4, 4, 0]), ), - original_row_node_ids=tensor([3, 4, 0, 5, 1]), + original_row_node_ids=tensor([3, 4, 0, 1, 5, 2]), original_edge_ids=None, - original_column_node_ids=tensor([3, 4, 0, 5, 1]), + original_column_node_ids=tensor([3, 4, 0, 1, 5, 2]), )], positive_node_pairs=(tensor([0, 1, 1, 2]), tensor([0, 0, 1, 1])), - node_pairs_with_labels=((tensor([0, 1, 1, 2, 0, 1, 1, 2]), tensor([0, 0, 1, 1, 1, 1, 3, 4])), - tensor([1., 1., 1., 1., 0., 0., 0., 0.])), + node_pairs_with_labels=((tensor([0, 1, 1, 2, 0, 0, 1, 1, 1, 1, 2, 2]), tensor([0, 0, 1, 1, 3, 4, 5, 4, 1, 0, 3, 4])), + tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.])), node_pairs=(tensor([3, 4, 4, 0]), tensor([3, 3, 4, 4])), node_features={'feat': tensor([[0.8672, 0.2276], [0.5503, 0.8223], [0.9634, 0.2294], + [0.6172, 0.7865], [0.5160, 0.2486], - [0.6172, 0.7865]])}, - negative_srcs=tensor([[3], - [4], - [4], - [0]]), - negative_node_pairs=(tensor([0, 1, 1, 2]), - tensor([1, 1, 3, 4])), - negative_dsts=tensor([[4], - [4], - [5], - [1]]), + [0.2109, 0.1089]])}, + negative_srcs=None, + negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 2, 2]), + tensor([3, 4, 5, 4, 1, 0, 3, 4])), + negative_dsts=tensor([[1, 5], + [2, 5], + [4, 3], + [1, 5]]), labels=None, - input_nodes=tensor([3, 4, 0, 5, 1]), + input_nodes=tensor([3, 4, 0, 1, 5, 2]), edge_features=[{}, {}], compacted_node_pairs=(tensor([0, 1, 1, 2]), tensor([0, 0, 1, 1])), - compacted_negative_srcs=tensor([[0], - [1], - [1], - [2]]), - compacted_negative_dsts=tensor([[1], - [1], - [3], - [4]]), - blocks=[Block(num_src_nodes=5, num_dst_nodes=5, num_edges=2), - Block(num_src_nodes=5, num_dst_nodes=5, num_edges=2)], + compacted_negative_srcs=None, + compacted_negative_dsts=tensor([[3, 4], + [5, 4], + [1, 0], + [3, 4]]), + blocks=[Block(num_src_nodes=6, num_dst_nodes=6, num_edges=3), + Block(num_src_nodes=6, num_dst_nodes=6, num_edges=3)], )""" ), str( """MiniBatch(seed_nodes=None, - 
sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1]), - indices=tensor([1]), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1, 2]), + indices=tensor([1, 0]), ), - original_row_node_ids=tensor([5, 4]), + original_row_node_ids=tensor([5, 4, 0, 1]), original_edge_ids=None, - original_column_node_ids=tensor([5, 4]), + original_column_node_ids=tensor([5, 4, 0, 1]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1]), - indices=tensor([1]), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1, 2]), + indices=tensor([1, 0]), ), - original_row_node_ids=tensor([5, 4]), + original_row_node_ids=tensor([5, 4, 0, 1]), original_edge_ids=None, - original_column_node_ids=tensor([5, 4]), + original_column_node_ids=tensor([5, 4, 0, 1]), )], positive_node_pairs=(tensor([0, 1]), tensor([0, 0])), - node_pairs_with_labels=((tensor([0, 1, 0, 1]), tensor([0, 0, 0, 0])), - tensor([1., 1., 0., 0.])), + node_pairs_with_labels=((tensor([0, 1, 0, 0, 1, 1]), tensor([0, 0, 2, 1, 2, 3])), + tensor([1., 1., 0., 0., 0., 0.])), node_pairs=(tensor([5, 4]), tensor([5, 5])), node_features={'feat': tensor([[0.5160, 0.2486], - [0.5503, 0.8223]])}, - negative_srcs=tensor([[5], - [4]]), - negative_node_pairs=(tensor([0, 1]), - tensor([0, 0])), - negative_dsts=tensor([[5], - [5]]), + [0.5503, 0.8223], + [0.9634, 0.2294], + [0.6172, 0.7865]])}, + negative_srcs=None, + negative_node_pairs=(tensor([0, 0, 1, 1]), + tensor([2, 1, 2, 3])), + negative_dsts=tensor([[0, 4], + [0, 1]]), labels=None, - input_nodes=tensor([5, 4]), + input_nodes=tensor([5, 4, 0, 1]), edge_features=[{}, {}], compacted_node_pairs=(tensor([0, 1]), tensor([0, 0])), - compacted_negative_srcs=tensor([[0], - [1]]), - compacted_negative_dsts=tensor([[0], - [0]]), - blocks=[Block(num_src_nodes=2, num_dst_nodes=2, num_edges=1), - Block(num_src_nodes=2, num_dst_nodes=2, num_edges=1)], + compacted_negative_srcs=None, + compacted_negative_dsts=tensor([[2, 1], + [2, 3]]), + blocks=[Block(num_src_nodes=4, num_dst_nodes=4, num_edges=2), + Block(num_src_nodes=4, num_dst_nodes=4, num_edges=2)], )""" ), ] From c40f54e643e952dab4c37600d110eede882a64ac Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 12 Jan 2024 08:54:11 +0800 Subject: [PATCH 05/22] [release] upgrade version to next one --- python/update_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/update_version.py b/python/update_version.py index e5dd2b3d8f32..00b4ef0c5f77 100644 --- a/python/update_version.py +++ b/python/update_version.py @@ -16,7 +16,7 @@ # (usually "aYYMMDD") # The environment variable DGL_VERSION_SUFFIX is the local version label # suffix for indicating CPU and CUDA versions as in PEP 440 (e.g. 
"+cu102") -__version__ = "2.0" + os.getenv("DGL_PRERELEASE", "") +__version__ = "2.1" + os.getenv("DGL_PRERELEASE", "") __version__ += os.getenv("DGL_VERSION_SUFFIX", "") print(__version__) From 47a1d6a85df1602833eedeb8cd3fb618c5e30cea Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 12 Jan 2024 08:58:00 +0800 Subject: [PATCH 06/22] [release] upgrade version to 2.1 on master (#6940) --- conda/dgl/meta.yaml | 2 +- include/dgl/runtime/c_runtime_api.h | 2 +- python/dgl/_ffi/libinfo.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conda/dgl/meta.yaml b/conda/dgl/meta.yaml index fdf912349d03..f6acfa9a8c1a 100644 --- a/conda/dgl/meta.yaml +++ b/conda/dgl/meta.yaml @@ -1,6 +1,6 @@ package: name: dgl{{ environ.get('DGL_PACKAGE_SUFFIX', '') }} - version: 2.0{{ environ.get('DGL_VERSION_SUFFIX', '') }} + version: 2.1{{ environ.get('DGL_VERSION_SUFFIX', '') }} source: git_rev: {{ environ.get('DGL_RELEASE_BRANCH', 'master') }} diff --git a/include/dgl/runtime/c_runtime_api.h b/include/dgl/runtime/c_runtime_api.h index d9e9d4edc054..6dc7e5d2d299 100644 --- a/include/dgl/runtime/c_runtime_api.h +++ b/include/dgl/runtime/c_runtime_api.h @@ -33,7 +33,7 @@ #endif // DGL version -#define DGL_VERSION "2.0" +#define DGL_VERSION "2.1" #ifdef __cplusplus extern "C" { diff --git a/python/dgl/_ffi/libinfo.py b/python/dgl/_ffi/libinfo.py index f82a6f700033..6f841e246bd5 100644 --- a/python/dgl/_ffi/libinfo.py +++ b/python/dgl/_ffi/libinfo.py @@ -105,4 +105,4 @@ def find_lib_path(name=None, search_path=None, optional=False): # We use the version of the incoming release for code # that is under development. # The following line is set by dgl/python/update_version.py -__version__ = "2.0" +__version__ = "2.1" From f86212edb5012f36f30ed8f79513b2b3f54cf1ea Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 12 Jan 2024 03:51:53 -0500 Subject: [PATCH 07/22] [GraphBolt][CUDA] Enable tests for weighted sampling (#6919) --- graphbolt/src/cuda/neighbor_sampler.cu | 58 ++++++++++++++++--- .../impl/test_fused_csc_sampling_graph.py | 30 ++++------ 2 files changed, 60 insertions(+), 28 deletions(-) diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu index 7cede6b5a471..96070303c1ce 100644 --- a/graphbolt/src/cuda/neighbor_sampler.cu +++ b/graphbolt/src/cuda/neighbor_sampler.cu @@ -43,7 +43,7 @@ template < __global__ void _ComputeRandoms( const int64_t num_edges, const indptr_t* const sliced_indptr, const indptr_t* const sub_indptr, const indices_t* const csr_rows, - const weights_t* const weights, const indices_t* const indices, + const weights_t* const sliced_weights, const indices_t* const indices, const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) { int64_t i = blockIdx.x * blockDim.x + threadIdx.x; const int stride = gridDim.x * blockDim.x; @@ -65,7 +65,8 @@ __global__ void _ComputeRandoms( } const auto rnd = curand_uniform(&rng); - const auto prob = weights ? weights[in_idx] : static_cast(1); + const auto prob = + sliced_weights ? sliced_weights[i] : static_cast(1); const auto exp_rnd = -__logf(rnd); const float_t adjusted_rnd = prob > 0 ? 
static_cast<float_t>(exp_rnd / prob)
                                      : std::numeric_limits<float_t>::infinity();
   }
 }
 
+struct IsPositive {
+  template <typename probs_t>
+  __host__ __device__ auto operator()(probs_t x) {
+    return x > 0;
+  }
+};
+
 template <typename indptr_t>
 struct MinInDegreeFanout {
   const indptr_t* in_degree;
@@ -152,7 +160,18 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
   auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
   auto sliced_indptr = std::get<1>(in_degree_and_sliced_indptr);
-  auto sub_indptr = ExclusiveCumSum(in_degree);
+  torch::Tensor sub_indptr;
+  // @todo mfbalin, refactor IndexSelectCSCImpl so that it does not have to take
+  // nodes as input
+  torch::optional<torch::Tensor> sliced_probs_or_mask;
+  if (probs_or_mask.has_value()) {
+    torch::Tensor sliced_probs_or_mask_tensor;
+    std::tie(sub_indptr, sliced_probs_or_mask_tensor) =
+        IndexSelectCSCImpl(indptr, probs_or_mask.value(), nodes);
+    sliced_probs_or_mask = sliced_probs_or_mask_tensor;
+  } else {
+    sub_indptr = ExclusiveCumSum(in_degree);
+  }
   if (fanouts.size() > 1) {
     torch::Tensor sliced_type_per_edge;
     std::tie(sub_indptr, sliced_type_per_edge) =
@@ -187,6 +206,29 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   AT_DISPATCH_INDEX_TYPES(
       indptr.scalar_type(), "SampleNeighborsIndptr", ([&] {
         using indptr_t = index_t;
+        if (probs_or_mask.has_value()) {  // Count nonzero probs into in_degree.
+          GRAPHBOLT_DISPATCH_ALL_TYPES(
+              probs_or_mask.value().scalar_type(),
+              "SampleNeighborsPositiveProbs", ([&] {
+                using probs_t = scalar_t;
+                auto is_nonzero = thrust::make_transform_iterator(
+                    sliced_probs_or_mask.value().data_ptr<probs_t>(),
+                    IsPositive{});
+                size_t tmp_storage_size = 0;
+                cub::DeviceSegmentedReduce::Sum(
+                    nullptr, tmp_storage_size, is_nonzero,
+                    in_degree.data_ptr<indptr_t>(), num_rows,
+                    sub_indptr.data_ptr<indptr_t>(),
+                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
+                auto tmp_storage =
+                    allocator.AllocateStorage<char>(tmp_storage_size);
+                cub::DeviceSegmentedReduce::Sum(
+                    tmp_storage.get(), tmp_storage_size, is_nonzero,
+                    in_degree.data_ptr<indptr_t>(), num_rows,
+                    sub_indptr.data_ptr<indptr_t>(),
+                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
+              }));
+        }
         thrust::counting_iterator<int64_t> iota(0);
         auto sampled_degree = thrust::make_transform_iterator(
             iota, MinInDegreeFanout{
@@ -246,10 +288,10 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                       probs_or_mask_scalar_type, "SampleNeighborsProbs",
                       ([&] {
                         using probs_t = scalar_t;
-                        probs_t* probs_ptr = nullptr;
-                        if (probs_or_mask.has_value()) {
-                          probs_ptr =
-                              probs_or_mask.value().data_ptr<probs_t>();
+                        probs_t* sliced_probs_ptr = nullptr;
+                        if (sliced_probs_or_mask.has_value()) {
+                          sliced_probs_ptr = sliced_probs_or_mask.value()
+                                                 .data_ptr<probs_t>();
                         }
                         const indices_t* indices_ptr =
                             layer ? indices.data_ptr<indices_t>() : nullptr;
@@ -261,7 +303,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                             _ComputeRandoms, grid, block, 0, stream, num_edges,
                             sliced_indptr.data_ptr<indptr_t>(),
                             sub_indptr.data_ptr<indptr_t>(),
-                            coo_rows.data_ptr<indices_t>(), probs_ptr,
+                            coo_rows.data_ptr<indices_t>(), sliced_probs_ptr,
                             indices_ptr, random_seed, randoms.get(),
                             edge_id_segments.get());
                       }));
diff --git a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py
index 34a508deb728..3e13a1ce2f5a 100644
--- a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py
+++ b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py
@@ -1797,10 +1797,6 @@ def test_sample_neighbors_fanouts(
     assert subgraph.sampled_csc["n2:e2:n1"].indptr.size(0) == 2
 
 
-@unittest.skipIf(
-    F._default_context_str == "gpu",
-    reason="Sampling with replacement not yet supported on GPU.",
-)
 @pytest.mark.parametrize(
     "replace, expected_sampled_num1, expected_sampled_num2",
     [(False, 2, 2), (True, 4, 4)],
@@ -1808,6 +1804,8 @@ def test_sample_neighbors_fanouts(
 def test_sample_neighbors_replace(
     replace, expected_sampled_num1, expected_sampled_num2
 ):
+    if F._default_context_str == "gpu" and replace == True:
+        pytest.skip("Sampling with replacement not yet supported on GPU.")
     """Original graph in COO:
     "n1:e1:n2":[0, 0, 1, 1, 1], [0, 2, 0, 1, 2]
     "n2:e2:n1":[0, 0, 1, 2], [0, 1, 1 ,0]
@@ -1966,14 +1964,12 @@ def test_sample_neighbors_return_eids_hetero(labor):
     )
 
 
-@unittest.skipIf(
-    F._default_context_str == "gpu",
-    reason="Sampling with replacement not yet supported on GPU.",
-)
 @pytest.mark.parametrize("replace", [True, False])
 @pytest.mark.parametrize("labor", [False, True])
 @pytest.mark.parametrize("probs_name", ["weight", "mask"])
 def test_sample_neighbors_probs(replace, labor, probs_name):
+    if F._default_context_str == "gpu" and replace == True:
+        pytest.skip("Sampling with replacement not yet supported on GPU.")
     """Original graph in COO:
     1   0   1   0   1
     1   0   1   1   0
@@ -2020,10 +2016,6 @@ def test_sample_neighbors_probs(replace, labor, probs_name):
             assert sampled_num == 4
 
 
-@unittest.skipIf(
-    F._default_context_str == "gpu",
-    reason="Sampling with replacement not yet supported on GPU.",
-)
 @pytest.mark.parametrize("replace", [True, False])
 @pytest.mark.parametrize("labor", [False, True])
 @pytest.mark.parametrize(
     "probs_or_mask",
    [
@@ -2034,6 +2026,8 @@ def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask):
     ],
 )
 def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask):
+    if F._default_context_str == "gpu" and replace == True:
+        pytest.skip("Sampling with replacement not yet supported on GPU.")
     # Initialize data.
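     # All edge probabilities/mask values in this test are zero, so every
     # sampled subgraph must be empty (asserted via `sampled_num == 0` below).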
total_num_nodes = 5 total_num_edges = 12 @@ -2065,10 +2059,6 @@ def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask): assert sampled_num == 0 -@unittest.skipIf( - F._default_context_str == "gpu", - reason="Sampling with replacement not yet supported on GPU.", -) @pytest.mark.parametrize("replace", [False, True]) @pytest.mark.parametrize("labor", [False, True]) @pytest.mark.parametrize( @@ -2089,6 +2079,8 @@ def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask): ], ) def test_sample_neighbors_homo_pick_number(fanouts, replace, labor, probs_name): + if F._default_context_str == "gpu" and replace == True: + pytest.skip("Sampling with replacement not yet supported on GPU.") """Original graph in COO: 1 1 1 1 1 1 0 0 0 0 0 0 @@ -2150,10 +2142,6 @@ def test_sample_neighbors_homo_pick_number(fanouts, replace, labor, probs_name): assert sampled_num == min(fanouts[0], 6) -@unittest.skipIf( - F._default_context_str == "gpu", - reason="Sampling with replacement not yet supported on GPU.", -) @pytest.mark.parametrize("replace", [False, True]) @pytest.mark.parametrize("labor", [False, True]) @pytest.mark.parametrize( @@ -2171,6 +2159,8 @@ def test_sample_neighbors_homo_pick_number(fanouts, replace, labor, probs_name): def test_sample_neighbors_hetero_pick_number( fanouts, replace, labor, probs_name ): + if F._default_context_str == "gpu" and replace == True: + pytest.skip("Sampling with replacement not yet supported on GPU.") # Initialize data. total_num_nodes = 10 total_num_edges = 9 From 3795a006b91c94291b911f0daa261c0598d7ffd8 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 12 Jan 2024 05:00:24 -0500 Subject: [PATCH 08/22] [GraphBolt][CUDA] Refactor codebase with `CUB_CALL` macro (#6870) --- graphbolt/src/cuda/common.h | 34 ++++-- graphbolt/src/cuda/csr_to_coo.cu | 18 +-- graphbolt/src/cuda/cumsum.cu | 19 +--- graphbolt/src/cuda/index_select_csc_impl.cu | 56 +++------- graphbolt/src/cuda/index_select_impl.cu | 17 +-- graphbolt/src/cuda/insubgraph.cu | 2 - graphbolt/src/cuda/isin.cu | 9 +- graphbolt/src/cuda/neighbor_sampler.cu | 104 +++++------------- graphbolt/src/cuda/sampling_utils.cu | 29 ++--- graphbolt/src/cuda/sort_impl.cu | 25 +---- graphbolt/src/cuda/unique_and_compact_impl.cu | 78 +++++-------- 11 files changed, 128 insertions(+), 263 deletions(-) diff --git a/graphbolt/src/cuda/common.h b/graphbolt/src/cuda/common.h index b2d5991b3f50..cc9980dccda5 100644 --- a/graphbolt/src/cuda/common.h +++ b/graphbolt/src/cuda/common.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -82,15 +83,34 @@ inline bool is_zero(dim3 size) { #define CUDA_CALL(func) C10_CUDA_CHECK((func)) -#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \ - { \ - if (!graphbolt::cuda::is_zero((nblks)) && \ - !graphbolt::cuda::is_zero((nthrs))) { \ - (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); \ - } \ +#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \ + { \ + if (!graphbolt::cuda::is_zero((nblks)) && \ + !graphbolt::cuda::is_zero((nthrs))) { \ + auto stream = graphbolt::cuda::GetCurrentStream(); \ + (kernel)<<<(nblks), (nthrs), (shmem), stream>>>(__VA_ARGS__); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + } \ } +#define CUB_CALL(fn, ...) 
                                                                          \
  {                                                                          \
    auto allocator = graphbolt::cuda::GetAllocator();                        \
    auto stream = graphbolt::cuda::GetCurrentStream();                       \
    size_t workspace_size = 0;                                               \
    CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream));        \
    auto workspace = allocator.AllocateStorage<char>(workspace_size);        \
    CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
  }
+
+#define THRUST_CALL(fn, ...)                                                 \
+  [&] {                                                                      \
+    auto allocator = graphbolt::cuda::GetAllocator();                        \
+    auto stream = graphbolt::cuda::GetCurrentStream();                       \
+    const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); \
+    return thrust::fn(exec_policy, __VA_ARGS__);                             \
+  }()
+
 /**
  * @brief This class is designed to handle the copy operation of a single
  * scalar_t item from a given CUDA device pointer. Later, if the object is cast
diff --git a/graphbolt/src/cuda/csr_to_coo.cu b/graphbolt/src/cuda/csr_to_coo.cu
index 524f56eb058e..42765986bb25 100644
--- a/graphbolt/src/cuda/csr_to_coo.cu
+++ b/graphbolt/src/cuda/csr_to_coo.cu
@@ -39,8 +39,6 @@ struct AdjacentDifference {
 };
 
 torch::Tensor CSRToCOO(torch::Tensor indptr, torch::ScalarType output_dtype) {
-  auto allocator = cuda::GetAllocator();
-  auto stream = cuda::GetCurrentStream();
   const auto num_rows = indptr.size(0) - 1;
   thrust::counting_iterator<int64_t> iota(0);
@@ -69,19 +67,9 @@ torch::Tensor CSRToCOO(torch::Tensor indptr, torch::ScalarType output_dtype) {
           constexpr int64_t max_copy_at_once =
               std::numeric_limits<int32_t>::max();
 
           for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
-            std::size_t tmp_storage_size = 0;
-            CUDA_CALL(cub::DeviceCopy::Batched(
-                nullptr, tmp_storage_size, input_buffer + i,
-                output_buffer + i, buffer_sizes + i,
-                std::min(num_rows - i, max_copy_at_once), stream));
-
-            auto tmp_storage =
-                allocator.AllocateStorage<char>(tmp_storage_size);
-
-            CUDA_CALL(cub::DeviceCopy::Batched(
-                tmp_storage.get(), tmp_storage_size, input_buffer + i,
-                output_buffer + i, buffer_sizes + i,
-                std::min(num_rows - i, max_copy_at_once), stream));
+            CUB_CALL(
+                DeviceCopy::Batched, input_buffer + i, output_buffer + i,
+                buffer_sizes + i, std::min(num_rows - i, max_copy_at_once));
           }
         }));
   return csr_rows;
diff --git a/graphbolt/src/cuda/cumsum.cu b/graphbolt/src/cuda/cumsum.cu
index ed50e6914e84..3537697ac4c7 100644
--- a/graphbolt/src/cuda/cumsum.cu
+++ b/graphbolt/src/cuda/cumsum.cu
@@ -12,21 +12,14 @@ namespace graphbolt {
 namespace ops {
 
 torch::Tensor ExclusiveCumSum(torch::Tensor input) {
-  auto allocator = cuda::GetAllocator();
-  auto stream = cuda::GetCurrentStream();
   auto result = torch::empty_like(input);
 
-  AT_DISPATCH_INTEGRAL_TYPES(
-      input.scalar_type(), "ExclusiveCumSum", ([&] {
-        size_t tmp_storage_size = 0;
-        cub::DeviceScan::ExclusiveSum(
-            nullptr, tmp_storage_size, input.data_ptr<scalar_t>(),
-            result.data_ptr<scalar_t>(), input.size(0), stream);
-        auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-        cub::DeviceScan::ExclusiveSum(
-            tmp_storage.get(), tmp_storage_size, input.data_ptr<scalar_t>(),
-            result.data_ptr<scalar_t>(), input.size(0), stream);
-      }));
+  AT_DISPATCH_INTEGRAL_TYPES(input.scalar_type(), "ExclusiveCumSum", ([&] {
+                               CUB_CALL(
+                                   DeviceScan::ExclusiveSum,
+                                   input.data_ptr<scalar_t>(),
+                                   result.data_ptr<scalar_t>(), input.size(0));
+                             }));
 
   return result;
 }
diff --git a/graphbolt/src/cuda/index_select_csc_impl.cu b/graphbolt/src/cuda/index_select_csc_impl.cu
index c24f2a7f0d07..da0b80584482 100644
--- a/graphbolt/src/cuda/index_select_csc_impl.cu
+++ b/graphbolt/src/cuda/index_select_csc_impl.cu
@@ -5,11 +5,10 @@
  * @brief Index select csc operator implementation on CUDA.
  */
 #include 
-#include 
 #include 
-#include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -88,7 +87,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
     torch::Tensor indices, const int64_t num_nodes,
     const indptr_t* const in_degree, const indptr_t* const sliced_indptr,
     const int64_t* const perm, torch::TensorOptions nodes_options,
-    torch::ScalarType indptr_scalar_type, cudaStream_t stream) {
+    torch::ScalarType indptr_scalar_type) {
   auto allocator = cuda::GetAllocator();
   thrust::counting_iterator<int64_t> iota(0);
@@ -109,14 +108,9 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
         output_indptr.data_ptr<indptr_t>(), output_indptr_aligned.get());
     thrust::tuple<indptr_t, indptr_t> zero_value{};
     // Compute the prefix sum over actual and modified indegrees.
-    size_t tmp_storage_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveScan(
-        nullptr, tmp_storage_size, modified_in_degree, output_indptr_pair,
-        PairSum{}, zero_value, num_nodes + 1, stream));
-    auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveScan(
-        tmp_storage.get(), tmp_storage_size, modified_in_degree,
-        output_indptr_pair, PairSum{}, zero_value, num_nodes + 1, stream));
+    CUB_CALL(
+        DeviceScan::ExclusiveScan, modified_in_degree, output_indptr_pair,
+        PairSum{}, zero_value, num_nodes + 1);
   }
 
   // Copy the actual total number of edges.
@@ -138,7 +132,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
   // Perform the actual copying, of the indices array into
   // output_indices in an aligned manner.
   CUDA_KERNEL_CALL(
-      _CopyIndicesAlignedKernel, grid, block, 0, stream,
+      _CopyIndicesAlignedKernel, grid, block, 0,
       static_cast<indptr_t>(edge_count_aligned), num_nodes, sliced_indptr,
       output_indptr.data_ptr<indptr_t>(), output_indptr_aligned.get(),
      reinterpret_cast<indices_t*>(indices.data_ptr()),
@@ -151,7 +145,6 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCImpl(
   // Sorting nodes so that accesses over PCI-e are more regular.
   const auto sorted_idx =
      Sort(nodes, cuda::NumberOfBits(indptr.size(0) - 1)).second;
-  auto stream = cuda::GetCurrentStream();
 
   const int64_t num_nodes = nodes.size(0);
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
@@ -167,7 +160,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCImpl(
           return UVAIndexSelectCSCCopyIndices<indptr_t, indices_t>(
               indices, num_nodes, in_degree, sliced_indptr,
               sorted_idx.data_ptr<int64_t>(), nodes.options(),
-              indptr.scalar_type(), stream);
+              indptr.scalar_type());
         }));
   }));
 }
@@ -191,9 +184,7 @@ template <typename indptr_t, typename indices_t>
 void IndexSelectCSCCopyIndices(
     const int64_t num_nodes, indices_t* const indices,
     indptr_t* const sliced_indptr, const indptr_t* const in_degree,
-    indptr_t* const output_indptr, indices_t* const output_indices,
-    cudaStream_t stream) {
-  auto allocator = cuda::GetAllocator();
+    indptr_t* const output_indptr, indices_t* const output_indices) {
   thrust::counting_iterator<int64_t> iota(0);
 
   auto input_buffer_it = thrust::make_transform_iterator(
@@ -206,21 +197,14 @@ void IndexSelectCSCCopyIndices(
 
   // Performs the copy from indices into output_indices.
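   // Issue the batched copies in bounded chunks (at most max_copy_at_once
   // buffers per launch) so each cub::DeviceMemcpy::Batched call stays within
   // the range its batch-count parameter can represent.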
   for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
-    size_t tmp_storage_size = 0;
-    CUDA_CALL(cub::DeviceMemcpy::Batched(
-        nullptr, tmp_storage_size, input_buffer_it + i, output_buffer_it + i,
-        buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once), stream));
-    auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-    CUDA_CALL(cub::DeviceMemcpy::Batched(
-        tmp_storage.get(), tmp_storage_size, input_buffer_it + i,
-        output_buffer_it + i, buffer_sizes + i,
-        std::min(num_nodes - i, max_copy_at_once), stream));
+    CUB_CALL(
+        DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
+        buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once));
   }
 }
 
 std::tuple<torch::Tensor, torch::Tensor> DeviceIndexSelectCSCImpl(
     torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) {
-  auto stream = cuda::GetCurrentStream();
   const int64_t num_nodes = nodes.size(0);
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
   return AT_DISPATCH_INTEGRAL_TYPES(
@@ -234,17 +218,10 @@ std::tuple<torch::Tensor, torch::Tensor> DeviceIndexSelectCSCImpl(
         torch::Tensor output_indptr = torch::empty(
             num_nodes + 1, nodes.options().dtype(indptr.scalar_type()));
 
-        {  // Compute the output indptr, output_indptr.
-          size_t tmp_storage_size = 0;
-          CUDA_CALL(cub::DeviceScan::ExclusiveSum(
-              nullptr, tmp_storage_size, in_degree,
-              output_indptr.data_ptr<indptr_t>(), num_nodes + 1, stream));
-          auto allocator = cuda::GetAllocator();
-          auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-          CUDA_CALL(cub::DeviceScan::ExclusiveSum(
-              tmp_storage.get(), tmp_storage_size, in_degree,
-              output_indptr.data_ptr<indptr_t>(), num_nodes + 1, stream));
-        }
+        // Compute the output indptr, output_indptr.
+        CUB_CALL(
+            DeviceScan::ExclusiveSum, in_degree,
+            output_indptr.data_ptr<indptr_t>(), num_nodes + 1);
 
         // Number of edges being copied.
         auto edge_count =
@@ -259,8 +236,7 @@ std::tuple<torch::Tensor, torch::Tensor> DeviceIndexSelectCSCImpl(
           IndexSelectCSCCopyIndices(
               num_nodes, reinterpret_cast<indices_t*>(indices.data_ptr()),
               sliced_indptr, in_degree, output_indptr.data_ptr<indptr_t>(),
-              reinterpret_cast<indices_t*>(output_indices.data_ptr()),
-              stream);
+              reinterpret_cast<indices_t*>(output_indices.data_ptr()));
         }));
         return std::make_tuple(output_indptr, output_indices);
       }));
diff --git a/graphbolt/src/cuda/index_select_impl.cu b/graphbolt/src/cuda/index_select_impl.cu
index 0c937e9030f0..af2c9fe96a24 100644
--- a/graphbolt/src/cuda/index_select_impl.cu
+++ b/graphbolt/src/cuda/index_select_impl.cu
@@ -5,13 +5,8 @@
  * @brief Index select operator implementation on CUDA.
  */
 #include 
-#include 
 #include 
 
-#include 
-#include 
-#include 
-#include 
 #include 
 
 #include "./common.h"
@@ -124,14 +119,12 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
   const IdType* index_sorted_ptr = sorted_index.data_ptr<IdType>();
   const int64_t* permutation_ptr = permutation.data_ptr<int64_t>();
 
-  auto stream = cuda::GetCurrentStream();
-
   if (aligned_feature_size == 1) {
     // Use a single thread to process each output row to avoid wasting threads.
     const int num_threads = cuda::FindNumThreads(return_len);
     const int num_blocks = (return_len + num_threads - 1) / num_threads;
     CUDA_KERNEL_CALL(
-        IndexSelectSingleKernel, num_blocks, num_threads, 0, stream, input_ptr,
+        IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr,
         input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr);
   } else {
     dim3 block(512, 1);
@@ -144,15 +137,15 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
       // When feature size is smaller than GPU cache line size, use unaligned
      // version for less SM usage, which is more resource efficient.
       CUDA_KERNEL_CALL(
-          IndexSelectMultiKernel, grid, block, 0, stream, input_ptr, input_len,
+          IndexSelectMultiKernel, grid, block, 0, input_ptr, input_len,
           aligned_feature_size, index_sorted_ptr, return_len, ret_ptr,
           permutation_ptr);
     } else {
       // Use aligned version to improve the memory access pattern.
       CUDA_KERNEL_CALL(
-          IndexSelectMultiKernelAligned, grid, block, 0, stream, input_ptr,
-          input_len, aligned_feature_size, index_sorted_ptr, return_len,
-          ret_ptr, permutation_ptr);
+          IndexSelectMultiKernelAligned, grid, block, 0, input_ptr, input_len,
+          aligned_feature_size, index_sorted_ptr, return_len, ret_ptr,
+          permutation_ptr);
     }
   }
diff --git a/graphbolt/src/cuda/insubgraph.cu b/graphbolt/src/cuda/insubgraph.cu
index 33473aad8bbc..d3526486e6dd 100644
--- a/graphbolt/src/cuda/insubgraph.cu
+++ b/graphbolt/src/cuda/insubgraph.cu
@@ -8,8 +8,6 @@
 #include 
 #include 
 
-#include 
-
 #include "./common.h"
 
 namespace graphbolt {
diff --git a/graphbolt/src/cuda/isin.cu b/graphbolt/src/cuda/isin.cu
index dca4a594afaa..d6b36696f5f5 100644
--- a/graphbolt/src/cuda/isin.cu
+++ b/graphbolt/src/cuda/isin.cu
@@ -7,8 +7,6 @@
 #include 
 #include 
 
-#include 
-
 #include "./common.h"
 
 namespace graphbolt {
@@ -16,15 +14,12 @@ namespace ops {
 
 torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) {
   auto sorted_test_elements = Sort(test_elements);
-  auto allocator = cuda::GetAllocator();
-  auto stream = cuda::GetCurrentStream();
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
   auto result = torch::empty_like(elements, torch::kBool);
 
   AT_DISPATCH_INTEGRAL_TYPES(
       elements.scalar_type(), "IsInOperation", ([&] {
-        thrust::binary_search(
-            exec_policy, sorted_test_elements.data_ptr<scalar_t>(),
+        THRUST_CALL(
+            binary_search, sorted_test_elements.data_ptr<scalar_t>(),
             sorted_test_elements.data_ptr<scalar_t>() +
                 sorted_test_elements.size(0),
             elements.data_ptr<scalar_t>(),
diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu
index 96070303c1ce..e40a6c909d45 100644
--- a/graphbolt/src/cuda/neighbor_sampler.cu
+++ b/graphbolt/src/cuda/neighbor_sampler.cu
@@ -5,12 +5,10 @@
  * @brief Index select operator implementation on CUDA.
  */
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -18,7 +16,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -142,7 +139,6 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   // are all resident on the GPU. If not, it is better to first extract them
  // before calling this function.
   auto allocator = cuda::GetAllocator();
-  const auto stream = cuda::GetCurrentStream();
   auto num_rows = nodes.size(0);
   auto fanouts_pinned = torch::empty(
       fanouts.size(),
@@ -156,7 +152,8 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
   CUDA_CALL(cudaMemcpyAsync(
       fanouts_device.get(), fanouts_pinned_ptr,
-      sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice, stream));
+      sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice,
+      cuda::GetCurrentStream()));
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
   auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
   auto sliced_indptr = std::get<1>(in_degree_and_sliced_indptr);
@@ -185,14 +182,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
       c10::TensorOptions().dtype(in_degree.scalar_type()).pinned_memory(true));
   AT_DISPATCH_INDEX_TYPES(
       indptr.scalar_type(), "SampleNeighborsInDegree", ([&] {
-        size_t tmp_storage_size = 0;
-        cub::DeviceReduce::Max(
-            nullptr, tmp_storage_size, in_degree.data_ptr<index_t>(),
-            max_in_degree.data_ptr<index_t>(), num_rows, stream);
-        auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-        cub::DeviceReduce::Max(
-            tmp_storage.get(), tmp_storage_size, in_degree.data_ptr<index_t>(),
-            max_in_degree.data_ptr<index_t>(), num_rows, stream);
+        CUB_CALL(
+            DeviceReduce::Max, in_degree.data_ptr<index_t>(),
+            max_in_degree.data_ptr<index_t>(), num_rows);
       }));
   auto coo_rows = CSRToCOO(sub_indptr, indices.scalar_type());
   const auto num_edges = coo_rows.size(0);
@@ -214,19 +206,11 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                 auto is_nonzero = thrust::make_transform_iterator(
                     sliced_probs_or_mask.value().data_ptr<probs_t>(),
                     IsPositive{});
-                size_t tmp_storage_size = 0;
-                cub::DeviceSegmentedReduce::Sum(
-                    nullptr, tmp_storage_size, is_nonzero,
+                CUB_CALL(
+                    DeviceSegmentedReduce::Sum, is_nonzero,
                     in_degree.data_ptr<indptr_t>(), num_rows,
                     sub_indptr.data_ptr<indptr_t>(),
-                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
-                auto tmp_storage =
-                    allocator.AllocateStorage<char>(tmp_storage_size);
-                cub::DeviceSegmentedReduce::Sum(
-                    tmp_storage.get(), tmp_storage_size, is_nonzero,
-                    in_degree.data_ptr<indptr_t>(), num_rows,
-                    sub_indptr.data_ptr<indptr_t>(),
-                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
+                    sub_indptr.data_ptr<indptr_t>() + 1);
               }));
         }
         thrust::counting_iterator<int64_t> iota(0);
         auto sampled_degree = thrust::make_transform_iterator(
             iota, MinInDegreeFanout{
                 in_degree.data_ptr<indptr_t>(), fanouts_device.get(),
                 fanouts.size()});
-        {  // Compute output_indptr.
-          size_t tmp_storage_size = 0;
-          cub::DeviceScan::ExclusiveSum(
-              nullptr, tmp_storage_size, sampled_degree,
-              output_indptr.data_ptr<indptr_t>(), num_rows + 1, stream);
-          auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-          cub::DeviceScan::ExclusiveSum(
-              tmp_storage.get(), tmp_storage_size, sampled_degree,
-              output_indptr.data_ptr<indptr_t>(), num_rows + 1, stream);
-        }
+        // Compute output_indptr.
+        CUB_CALL(
+            DeviceScan::ExclusiveSum, sampled_degree,
+            output_indptr.data_ptr<indptr_t>(), num_rows + 1);
 
         auto num_sampled_edges =
             cuda::CopyScalar{output_indptr.data_ptr<indptr_t>() + num_rows};
@@ -300,8 +278,8 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                             (num_edges + BLOCK_SIZE - 1) / BLOCK_SIZE);
                         // Compute row and random number pairs.
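                        // Each edge draws the key exp_rnd / prob, i.e. an
                        // Exp(prob) variate; keeping the smallest keys per row
                        // in the segmented sort below realizes weighted
                        // sampling without replacement.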
CUDA_KERNEL_CALL( - _ComputeRandoms, grid, block, 0, stream, - num_edges, sliced_indptr.data_ptr(), + _ComputeRandoms, grid, block, 0, num_edges, + sliced_indptr.data_ptr(), sub_indptr.data_ptr(), coo_rows.data_ptr(), sliced_probs_ptr, indices_ptr, random_seed, randoms.get(), @@ -312,21 +290,12 @@ c10::intrusive_ptr SampleNeighbors( // Sort the random numbers along with edge ids, after // sorting the first fanout elements of each row will // give us the sampled edges. - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceSegmentedSort::SortPairs( - nullptr, tmp_storage_size, randoms.get(), + CUB_CALL( + DeviceSegmentedSort::SortPairs, randoms.get(), randoms_sorted.get(), edge_id_segments.get(), sorted_edge_id_segments.get(), num_edges, num_rows, sub_indptr.data_ptr(), - sub_indptr.data_ptr() + 1, stream)); - auto tmp_storage = - allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceSegmentedSort::SortPairs( - tmp_storage.get(), tmp_storage_size, randoms.get(), - randoms_sorted.get(), edge_id_segments.get(), - sorted_edge_id_segments.get(), num_edges, num_rows, - sub_indptr.data_ptr(), - sub_indptr.data_ptr() + 1, stream)); + sub_indptr.data_ptr() + 1); picked_eids = torch::empty( static_cast(num_sampled_edges), @@ -341,19 +310,11 @@ c10::intrusive_ptr SampleNeighbors( auto sampled_segment_end_it = thrust::make_transform_iterator( iota, SegmentEndFunc{ sub_indptr.data_ptr(), sampled_degree}); - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceSegmentedSort::SortKeys( - nullptr, tmp_storage_size, edge_id_segments.get(), - sorted_edge_id_segments.get(), picked_eids.size(0), - num_rows, sub_indptr.data_ptr(), - sampled_segment_end_it, stream)); - auto tmp_storage = - allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceSegmentedSort::SortKeys( - tmp_storage.get(), tmp_storage_size, edge_id_segments.get(), + CUB_CALL( + DeviceSegmentedSort::SortKeys, edge_id_segments.get(), sorted_edge_id_segments.get(), picked_eids.size(0), num_rows, sub_indptr.data_ptr(), - sampled_segment_end_it, stream)); + sampled_segment_end_it); } auto input_buffer_it = thrust::make_transform_iterator( @@ -370,17 +331,10 @@ c10::intrusive_ptr SampleNeighbors( // Copy the sampled edge ids into picked_eids tensor. 
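The segmented sort of random keys above realizes per-node sampling without replacement: after sorting each node's segment, the first `fanout` entries are the sampled edges. A CPU sketch of the same idea (not the CUDA implementation); the batched copy loop below then gathers the picked ids in bounded chunks.

```
import torch

torch.manual_seed(0)
sub_indptr = torch.tensor([0, 5, 5, 8])  # 3 seed nodes, 8 candidate edges
fanout = 2
picked = []
for i in range(sub_indptr.numel() - 1):
    begin, end = sub_indptr[i].item(), sub_indptr[i + 1].item()
    keys = torch.rand(end - begin)      # one random key per candidate edge
    order = torch.argsort(keys)         # sort keys within the segment
    picked.append(torch.arange(begin, end)[order][:fanout])
picked_eids = torch.cat(picked)
```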
for (int64_t i = 0; i < num_rows; i += max_copy_at_once) { - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceCopy::Batched( - nullptr, tmp_storage_size, input_buffer_it + i, - output_buffer_it + i, sampled_degree + i, - std::min(num_rows - i, max_copy_at_once), stream)); - auto tmp_storage = - allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceCopy::Batched( - tmp_storage.get(), tmp_storage_size, input_buffer_it + i, + CUB_CALL( + DeviceCopy::Batched, input_buffer_it + i, output_buffer_it + i, sampled_degree + i, - std::min(num_rows - i, max_copy_at_once), stream)); + std::min(num_rows - i, max_copy_at_once)); } })); @@ -392,10 +346,8 @@ c10::intrusive_ptr SampleNeighbors( AT_DISPATCH_INDEX_TYPES( indices.scalar_type(), "SampleNeighborsOutputIndices", ([&] { using indices_t = index_t; - const auto exec_policy = - thrust::cuda::par_nosync(allocator).on(stream); - thrust::gather( - exec_policy, picked_eids.data_ptr(), + THRUST_CALL( + gather, picked_eids.data_ptr(), picked_eids.data_ptr() + picked_eids.size(0), indices.data_ptr(), output_indices.data_ptr()); @@ -412,10 +364,8 @@ c10::intrusive_ptr SampleNeighbors( picked_eids.options().dtype(types.scalar_type())); AT_DISPATCH_INTEGRAL_TYPES( types.scalar_type(), "SampleNeighborsOutputTypePerEdge", ([&] { - const auto exec_policy = - thrust::cuda::par_nosync(allocator).on(stream); - thrust::gather( - exec_policy, picked_eids.data_ptr(), + THRUST_CALL( + gather, picked_eids.data_ptr(), picked_eids.data_ptr() + picked_eids.size(0), types.data_ptr(), output_type_per_edge.value().data_ptr()); diff --git a/graphbolt/src/cuda/sampling_utils.cu b/graphbolt/src/cuda/sampling_utils.cu index 0589bd47e643..4db17a5e4436 100644 --- a/graphbolt/src/cuda/sampling_utils.cu +++ b/graphbolt/src/cuda/sampling_utils.cu @@ -4,7 +4,7 @@ * @file cuda/sampling_utils.cu * @brief Sampling utility function implementations on CUDA. */ -#include +#include #include #include @@ -36,9 +36,6 @@ struct SliceFunc { // Returns (indptr[nodes + 1] - indptr[nodes], indptr[nodes]) std::tuple SliceCSCIndptr( torch::Tensor indptr, torch::Tensor nodes) { - auto allocator = cuda::GetAllocator(); - const auto exec_policy = - thrust::cuda::par_nosync(allocator).on(cuda::GetCurrentStream()); const int64_t num_nodes = nodes.size(0); // Read indptr only once in case it is pinned and access is slow. 
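As the comment in `SliceCSCIndptr` above documents, the function returns per-node degrees together with the nodes' start offsets into `indptr`. In PyTorch indexing terms, the returned pair is simply:

```
import torch

indptr = torch.tensor([0, 2, 2, 5, 9])
nodes = torch.tensor([2, 0])
in_degree = indptr[nodes + 1] - indptr[nodes]  # tensor([3, 2])
sliced_indptr = indptr[nodes]                  # tensor([2, 0])
```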
auto sliced_indptr = @@ -53,8 +50,8 @@ std::tuple SliceCSCIndptr( AT_DISPATCH_INDEX_TYPES( nodes.scalar_type(), "IndexSelectCSCNodes", ([&] { using nodes_t = index_t; - thrust::for_each( - exec_policy, iota, iota + num_nodes, + THRUST_CALL( + for_each, iota, iota + num_nodes, SliceFunc{ nodes.data_ptr(), indptr.data_ptr(), in_degree.data_ptr(), @@ -92,9 +89,6 @@ std::tuple SliceCSCIndptrHetero( auto new_sub_indptr = torch::empty(num_rows + 1, sub_indptr.options()); auto new_indegree = torch::empty(num_rows + 2, sub_indptr.options()); auto new_sliced_indptr = torch::empty(num_rows, sliced_indptr.options()); - auto allocator = cuda::GetAllocator(); - auto stream = cuda::GetCurrentStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); thrust::counting_iterator iota(0); AT_DISPATCH_INTEGRAL_TYPES( sub_indptr.scalar_type(), "SliceCSCIndptrHeteroIndptr", ([&] { @@ -102,8 +96,8 @@ std::tuple SliceCSCIndptrHetero( AT_DISPATCH_INTEGRAL_TYPES( etypes.scalar_type(), "SliceCSCIndptrHeteroTypePerEdge", ([&] { using etype_t = scalar_t; - thrust::for_each( - exec_policy, iota, iota + num_rows, + THRUST_CALL( + for_each, iota, iota + num_rows, EdgeTypeSearch{ sub_indptr.data_ptr(), sliced_indptr.data_ptr(), @@ -111,17 +105,10 @@ std::tuple SliceCSCIndptrHetero( new_sub_indptr.data_ptr(), new_sliced_indptr.data_ptr()}); })); - size_t tmp_storage_size = 0; - cub::DeviceAdjacentDifference::SubtractLeftCopy( - nullptr, tmp_storage_size, new_sub_indptr.data_ptr(), - new_indegree.data_ptr(), num_rows + 1, cub::Difference{}, - stream); - auto tmp_storage = allocator.AllocateStorage(tmp_storage_size); - cub::DeviceAdjacentDifference::SubtractLeftCopy( - tmp_storage.get(), tmp_storage_size, + CUB_CALL( + DeviceAdjacentDifference::SubtractLeftCopy, new_sub_indptr.data_ptr(), - new_indegree.data_ptr(), num_rows + 1, cub::Difference{}, - stream); + new_indegree.data_ptr(), num_rows + 1, cub::Difference{}); })); // Discard the first element of the SubtractLeftCopy result and ensure that // new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is diff --git a/graphbolt/src/cuda/sort_impl.cu b/graphbolt/src/cuda/sort_impl.cu index a8eb10879726..c097e14b52b8 100644 --- a/graphbolt/src/cuda/sort_impl.cu +++ b/graphbolt/src/cuda/sort_impl.cu @@ -5,7 +5,6 @@ * @brief Sort implementation on CUDA. 
*/ #include -#include #include @@ -21,8 +20,6 @@ std::conditional_t< torch::Tensor> Sort(const scalar_t* input_keys, int64_t num_items, int num_bits) { const auto options = torch::TensorOptions().device(c10::DeviceType::CUDA); - auto allocator = cuda::GetAllocator(); - auto stream = cuda::GetCurrentStream(); constexpr c10::ScalarType dtype = c10::CppTypeToScalarType::value; auto sorted_array = torch::empty(num_items, options.dtype(dtype)); auto sorted_keys = sorted_array.data_ptr(); @@ -36,24 +33,14 @@ Sort(const scalar_t* input_keys, int64_t num_items, int num_bits) { auto sorted_idx = torch::empty_like(original_idx); const int64_t* input_values = original_idx.data_ptr(); int64_t* sorted_values = sorted_idx.data_ptr(); - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( - nullptr, tmp_storage_size, input_keys, sorted_keys, input_values, - sorted_values, num_items, 0, num_bits, stream)); - auto tmp_storage = allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( - tmp_storage.get(), tmp_storage_size, input_keys, sorted_keys, - input_values, sorted_values, num_items, 0, num_bits, stream)); + CUB_CALL( + DeviceRadixSort::SortPairs, input_keys, sorted_keys, input_values, + sorted_values, num_items, 0, num_bits); return std::make_pair(sorted_array, sorted_idx); } else { - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceRadixSort::SortKeys( - nullptr, tmp_storage_size, input_keys, sorted_keys, num_items, 0, - num_bits, stream)); - auto tmp_storage = allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceRadixSort::SortKeys( - tmp_storage.get(), tmp_storage_size, input_keys, sorted_keys, num_items, - 0, num_bits, stream)); + CUB_CALL( + DeviceRadixSort::SortKeys, input_keys, sorted_keys, num_items, 0, + num_bits); return sorted_array; } } diff --git a/graphbolt/src/cuda/unique_and_compact_impl.cu b/graphbolt/src/cuda/unique_and_compact_impl.cu index 013f6d752f43..7a92d8173cf2 100644 --- a/graphbolt/src/cuda/unique_and_compact_impl.cu +++ b/graphbolt/src/cuda/unique_and_compact_impl.cu @@ -4,15 +4,11 @@ * @file cuda/unique_and_compact_impl.cu * @brief Unique and compact operator implementation on CUDA. 
*/ -#include #include #include #include #include -#include #include -#include -#include #include #include @@ -33,23 +29,17 @@ struct EqualityFunc { } }; -#define DefineReductionFunction(reduce_fn, name) \ - template \ - auto name(const scalar_iterator_t input, int64_t size) { \ - auto allocator = cuda::GetAllocator(); \ - auto stream = cuda::GetCurrentStream(); \ - using scalar_t = std::remove_reference_t; \ - cuda::CopyScalar result; \ - size_t workspace_size = 0; \ - reduce_fn(nullptr, workspace_size, input, result.get(), size, stream); \ - auto tmp_storage = allocator.AllocateStorage(workspace_size); \ - reduce_fn( \ - tmp_storage.get(), workspace_size, input, result.get(), size, stream); \ - return result; \ +#define DefineCubReductionFunction(cub_reduce_fn, name) \ + template \ + auto name(const scalar_iterator_t input, int64_t size) { \ + using scalar_t = std::remove_reference_t; \ + cuda::CopyScalar result; \ + CUB_CALL(cub_reduce_fn, input, result.get(), size); \ + return result; \ } -DefineReductionFunction(cub::DeviceReduce::Max, Max); -DefineReductionFunction(cub::DeviceReduce::Min, Min); +DefineCubReductionFunction(DeviceReduce::Max, Max); +DefineCubReductionFunction(DeviceReduce::Min, Min); std::tuple UniqueAndCompact( const torch::Tensor src_ids, const torch::Tensor dst_ids, @@ -60,7 +50,6 @@ std::tuple UniqueAndCompact( "Dtypes of tensors passed to UniqueAndCompact need to be identical."); auto allocator = cuda::GetAllocator(); auto stream = cuda::GetCurrentStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); return AT_DISPATCH_INTEGRAL_TYPES( src_ids.scalar_type(), "unique_and_compact", ([&] { auto src_ids_ptr = src_ids.data_ptr(); @@ -84,8 +73,8 @@ std::tuple UniqueAndCompact( // Mark dst nodes in the src_ids tensor. auto is_dst = allocator.AllocateStorage(src_ids.size(0)); - thrust::binary_search( - exec_policy, sorted_unique_dst_ids_ptr, + THRUST_CALL( + binary_search, sorted_unique_dst_ids_ptr, sorted_unique_dst_ids_ptr + unique_dst_ids.size(0), src_ids_ptr, src_ids_ptr + src_ids.size(0), is_dst.get()); @@ -96,16 +85,10 @@ std::tuple UniqueAndCompact( auto is_src = thrust::make_transform_iterator( is_dst.get(), thrust::logical_not{}); cuda::CopyScalar only_src_size; - size_t workspace_size = 0; - cub::DeviceSelect::Flagged( - nullptr, workspace_size, src_ids_ptr, is_src, + CUB_CALL( + DeviceSelect::Flagged, src_ids_ptr, is_src, only_src.data_ptr(), only_src_size.get(), - src_ids.size(0), stream); - auto tmp_storage = allocator.AllocateStorage(workspace_size); - cub::DeviceSelect::Flagged( - tmp_storage.get(), workspace_size, src_ids_ptr, is_src, - only_src.data_ptr(), only_src_size.get(), - src_ids.size(0), stream); + src_ids.size(0)); stream.synchronize(); only_src = only_src.slice(0, 0, static_cast(only_src_size)); } @@ -129,16 +112,10 @@ std::tuple UniqueAndCompact( { // Compute the unique operation on the only_src tensor. 
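`DeviceSelect::Flagged` with the negated `is_dst` flags keeps exactly those src ids that are not also dst ids; in torch terms it is boolean masking (the values here are made up). The `DeviceSelect::Unique` call below then deduplicates the sorted remainder.

```
import torch

src_ids = torch.tensor([5, 7, 5, 9])
is_dst = torch.tensor([False, True, False, False])
only_src = src_ids[~is_dst]  # tensor([5, 5, 9])
```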
cuda::CopyScalar unique_only_src_size; - size_t workspace_size = 0; - CUDA_CALL(cub::DeviceSelect::Unique( - nullptr, workspace_size, sorted_only_src.data_ptr(), - unique_only_src_ptr, unique_only_src_size.get(), only_src.size(0), - stream)); - auto tmp_storage = allocator.AllocateStorage(workspace_size); - CUDA_CALL(cub::DeviceSelect::Unique( - tmp_storage.get(), workspace_size, - sorted_only_src.data_ptr(), unique_only_src_ptr, - unique_only_src_size.get(), only_src.size(0), stream)); + CUB_CALL( + DeviceSelect::Unique, sorted_only_src.data_ptr(), + unique_only_src_ptr, unique_only_src_size.get(), + only_src.size(0)); stream.synchronize(); unique_only_src = unique_only_src.slice( 0, 0, static_cast(unique_only_src_size)); @@ -146,7 +123,8 @@ std::tuple UniqueAndCompact( auto real_order = torch::cat({unique_dst_ids, unique_only_src}); // Sort here so that binary search can be used to lookup new_ids. - auto [sorted_order, new_ids] = Sort(real_order, num_bits); + torch::Tensor sorted_order, new_ids; + std::tie(sorted_order, new_ids) = Sort(real_order, num_bits); auto sorted_order_ptr = sorted_order.data_ptr(); auto new_ids_ptr = new_ids.data_ptr(); // Holds the found locations of the src and dst ids in the sorted_order. @@ -154,8 +132,8 @@ std::tuple UniqueAndCompact( // tensors. auto new_dst_ids_loc = allocator.AllocateStorage(dst_ids.size(0)); - thrust::lower_bound( - exec_policy, sorted_order_ptr, + THRUST_CALL( + lower_bound, sorted_order_ptr, sorted_order_ptr + sorted_order.size(0), dst_ids_ptr, dst_ids_ptr + dst_ids.size(0), new_dst_ids_loc.get()); @@ -172,16 +150,16 @@ std::tuple UniqueAndCompact( auto new_src_ids_loc = allocator.AllocateStorage(src_ids.size(0)); - thrust::lower_bound( - exec_policy, sorted_order_ptr, + THRUST_CALL( + lower_bound, sorted_order_ptr, sorted_order_ptr + sorted_order.size(0), src_ids_ptr, src_ids_ptr + src_ids.size(0), new_src_ids_loc.get()); // Finally, lookup the new compact ids of the src and dst tensors via // gather operations. auto new_src_ids = torch::empty_like(src_ids); - thrust::gather( - exec_policy, new_src_ids_loc.get(), + THRUST_CALL( + gather, new_src_ids_loc.get(), new_src_ids_loc.get() + src_ids.size(0), new_ids.data_ptr(), new_src_ids.data_ptr()); // Perform check before we gather for the dst indices. 
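The sort, `lower_bound`, and `gather` sequence above maps every original id to its position in `real_order`, which serves as its compact id. A CPU reference of the same mapping; `compact` is a hypothetical helper, not part of the patch.

```
import torch

def compact(real_order, ids):
    sorted_order, perm = torch.sort(real_order)
    loc = torch.searchsorted(sorted_order, ids)  # the lower_bound step
    return perm[loc]                             # the gather step

real_order = torch.tensor([7, 3, 9])  # unique dst ids, then src-only ids
print(compact(real_order, torch.tensor([9, 7, 7])))  # tensor([2, 0, 0])
```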
@@ -189,8 +167,8 @@ std::tuple UniqueAndCompact( throw std::out_of_range("Some ids not found."); } auto new_dst_ids = torch::empty_like(dst_ids); - thrust::gather( - exec_policy, new_dst_ids_loc.get(), + THRUST_CALL( + gather, new_dst_ids_loc.get(), new_dst_ids_loc.get() + dst_ids.size(0), new_ids.data_ptr(), new_dst_ids.data_ptr()); return std::make_tuple(real_order, new_src_ids, new_dst_ids); From 0f3bfd7ecbd359f222e835c208ea8dce44570ad6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 12 Jan 2024 06:10:26 -0500 Subject: [PATCH 09/22] [GraphBolt][CUDA] Refactor `IndexSelectCSC` and add `output_size` argument (#6927) --- graphbolt/include/graphbolt/cuda_ops.h | 25 +++++- graphbolt/src/cuda/index_select_csc_impl.cu | 88 +++++++++++-------- graphbolt/src/cuda/insubgraph.cu | 12 +-- graphbolt/src/cuda/neighbor_sampler.cu | 21 +++-- graphbolt/src/index_select.cc | 5 +- graphbolt/src/index_select.h | 4 +- .../impl/test_in_subgraph_sampler.py | 13 ++- 7 files changed, 110 insertions(+), 58 deletions(-) diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h index a045a933d1df..f0f48f75e603 100644 --- a/graphbolt/include/graphbolt/cuda_ops.h +++ b/graphbolt/include/graphbolt/cuda_ops.h @@ -68,6 +68,27 @@ Sort(torch::Tensor input, int num_bits = 0); */ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements); +/** + * @brief Select columns for a sparse matrix in a CSC format according to nodes + * tensor. + * + * NOTE: The shape of all tensors must be 1-D. + * + * @param in_degree Indegree tensor containing degrees of nodes being copied. + * @param sliced_indptr Sliced_indptr tensor containing indptr values of nodes + * being copied. + * @param indices Indices tensor with edge information of shape (indptr[N],). + * @param nodes Nodes tensor with shape (M,). + * @param nodes_max An upperbound on `nodes.max()`. + * @param output_size The total number of edges being copied. + * @return (torch::Tensor, torch::Tensor) Output indptr and indices tensors of + * shapes (M + 1,) and ((indptr[nodes + 1] - indptr[nodes]).sum(),). + */ +std::tuple IndexSelectCSCImpl( + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::Tensor nodes, int64_t nodes_max, + torch::optional output_size = torch::nullopt); + /** * @brief Select columns for a sparse matrix in a CSC format according to nodes * tensor. @@ -77,11 +98,13 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements); * @param indptr Indptr tensor containing offsets with shape (N,). * @param indices Indices tensor with edge information of shape (indptr[N],). * @param nodes Nodes tensor with shape (M,). + * @param output_size The total number of edges being copied. * @return (torch::Tensor, torch::Tensor) Output indptr and indices tensors of * shapes (M + 1,) and ((indptr[nodes + 1] - indptr[nodes]).sum(),). 
*/ std::tuple IndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes); + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size = torch::nullopt); /** * @brief Slices the indptr tensor with nodes and returns the indegrees of the diff --git a/graphbolt/src/cuda/index_select_csc_impl.cu b/graphbolt/src/cuda/index_select_csc_impl.cu index da0b80584482..cd42bb987da9 100644 --- a/graphbolt/src/cuda/index_select_csc_impl.cu +++ b/graphbolt/src/cuda/index_select_csc_impl.cu @@ -86,14 +86,15 @@ template std::tuple UVAIndexSelectCSCCopyIndices( torch::Tensor indices, const int64_t num_nodes, const indptr_t* const in_degree, const indptr_t* const sliced_indptr, - const int64_t* const perm, torch::TensorOptions nodes_options, - torch::ScalarType indptr_scalar_type) { + const int64_t* const perm, torch::TensorOptions options, + torch::ScalarType indptr_scalar_type, + torch::optional output_size) { auto allocator = cuda::GetAllocator(); thrust::counting_iterator iota(0); // Output indptr for the slice indexed by nodes. auto output_indptr = - torch::empty(num_nodes + 1, nodes_options.dtype(indptr_scalar_type)); + torch::empty(num_nodes + 1, options.dtype(indptr_scalar_type)); auto output_indptr_aligned = allocator.AllocateStorage(num_nodes + 1); @@ -114,16 +115,18 @@ std::tuple UVAIndexSelectCSCCopyIndices( } // Copy the actual total number of edges. - auto edge_count = - cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + if (!output_size.has_value()) { + auto edge_count = + cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + output_size = static_cast(edge_count); + } // Copy the modified number of edges. auto edge_count_aligned = cuda::CopyScalar{output_indptr_aligned.get() + num_nodes}; // Allocate output array with actual number of edges. - torch::Tensor output_indices = torch::empty( - static_cast(edge_count), - nodes_options.dtype(indices.scalar_type())); + torch::Tensor output_indices = + torch::empty(output_size.value(), options.dtype(indices.scalar_type())); const dim3 block(BLOCK_SIZE); const dim3 grid( (static_cast(edge_count_aligned) + BLOCK_SIZE - 1) / @@ -141,26 +144,22 @@ std::tuple UVAIndexSelectCSCCopyIndices( } std::tuple UVAIndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::Tensor nodes, int num_bits, torch::optional output_size) { // Sorting nodes so that accesses over PCI-e are more regular. 
- const auto sorted_idx = - Sort(nodes, cuda::NumberOfBits(indptr.size(0) - 1)).second; + const auto sorted_idx = Sort(nodes, num_bits).second; const int64_t num_nodes = nodes.size(0); - auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes); return AT_DISPATCH_INTEGRAL_TYPES( - indptr.scalar_type(), "UVAIndexSelectCSCIndptr", ([&] { + sliced_indptr.scalar_type(), "UVAIndexSelectCSCIndptr", ([&] { using indptr_t = scalar_t; - auto in_degree = - std::get<0>(in_degree_and_sliced_indptr).data_ptr(); - auto sliced_indptr = - std::get<1>(in_degree_and_sliced_indptr).data_ptr(); return GRAPHBOLT_DISPATCH_ELEMENT_SIZES( indices.element_size(), "UVAIndexSelectCSCCopyIndices", ([&] { return UVAIndexSelectCSCCopyIndices( - indices, num_nodes, in_degree, sliced_indptr, + indices, num_nodes, in_degree.data_ptr(), + sliced_indptr.data_ptr(), sorted_idx.data_ptr(), nodes.options(), - indptr.scalar_type()); + sliced_indptr.scalar_type(), output_size); })); })); } @@ -204,38 +203,39 @@ void IndexSelectCSCCopyIndices( } std::tuple DeviceIndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { - const int64_t num_nodes = nodes.size(0); - auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes); + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::TensorOptions options, torch::optional output_size) { + const int64_t num_nodes = sliced_indptr.size(0); return AT_DISPATCH_INTEGRAL_TYPES( - indptr.scalar_type(), "IndexSelectCSCIndptr", ([&] { + sliced_indptr.scalar_type(), "IndexSelectCSCIndptr", ([&] { using indptr_t = scalar_t; - auto in_degree = - std::get<0>(in_degree_and_sliced_indptr).data_ptr(); - auto sliced_indptr = - std::get<1>(in_degree_and_sliced_indptr).data_ptr(); + auto in_degree_ptr = in_degree.data_ptr(); + auto sliced_indptr_ptr = sliced_indptr.data_ptr(); // Output indptr for the slice indexed by nodes. torch::Tensor output_indptr = torch::empty( - num_nodes + 1, nodes.options().dtype(indptr.scalar_type())); + num_nodes + 1, options.dtype(sliced_indptr.scalar_type())); // Compute the output indptr, output_indptr. CUB_CALL( - DeviceScan::ExclusiveSum, in_degree, + DeviceScan::ExclusiveSum, in_degree_ptr, output_indptr.data_ptr(), num_nodes + 1); // Number of edges being copied. - auto edge_count = - cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + if (!output_size.has_value()) { + auto edge_count = + cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + output_size = static_cast(edge_count); + } // Allocate output array of size number of copied edges. 
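The `cuda::CopyScalar` reads of the edge count above are device-to-host copies, which is precisely what the new `output_size` argument lets callers skip when they already know how many edges will be copied. In torch terms, the step being avoided is roughly:

```
import torch

output_indptr = torch.tensor([0, 4, 4, 7, 11])
# On a CUDA tensor, this .item() synchronizes with the device; a known
# output_size allows the allocation that follows to proceed without it.
num_edges = output_indptr[-1].item()
output_indices = torch.empty(num_edges, dtype=torch.long)
```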
torch::Tensor output_indices = torch::empty( - static_cast(edge_count), - nodes.options().dtype(indices.scalar_type())); + output_size.value(), options.dtype(indices.scalar_type())); GRAPHBOLT_DISPATCH_ELEMENT_SIZES( indices.element_size(), "IndexSelectCSCCopyIndices", ([&] { using indices_t = element_size_t; IndexSelectCSCCopyIndices( num_nodes, reinterpret_cast(indices.data_ptr()), - sliced_indptr, in_degree, output_indptr.data_ptr(), + sliced_indptr_ptr, in_degree_ptr, + output_indptr.data_ptr(), reinterpret_cast(output_indices.data_ptr())); })); return std::make_tuple(output_indptr, output_indices); @@ -243,13 +243,27 @@ std::tuple DeviceIndexSelectCSCImpl( } std::tuple IndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::Tensor nodes, int64_t nodes_max, + torch::optional output_size) { if (indices.is_pinned()) { - return UVAIndexSelectCSCImpl(indptr, indices, nodes); + int num_bits = cuda::NumberOfBits(nodes_max + 1); + return UVAIndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes, num_bits, output_size); } else { - return DeviceIndexSelectCSCImpl(indptr, indices, nodes); + return DeviceIndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes.options(), output_size); } } +std::tuple IndexSelectCSCImpl( + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size) { + auto [in_degree, sliced_indptr] = SliceCSCIndptr(indptr, nodes); + return IndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes, indptr.size(0) - 2, + output_size); +} + } // namespace ops } // namespace graphbolt diff --git a/graphbolt/src/cuda/insubgraph.cu b/graphbolt/src/cuda/insubgraph.cu index d3526486e6dd..bd72ba9a93c9 100644 --- a/graphbolt/src/cuda/insubgraph.cu +++ b/graphbolt/src/cuda/insubgraph.cu @@ -16,15 +16,17 @@ namespace ops { c10::intrusive_ptr InSubgraph( torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, torch::optional type_per_edge) { - auto [output_indptr, output_indices] = - IndexSelectCSCImpl(indptr, indices, nodes); + auto [in_degree, sliced_indptr] = SliceCSCIndptr(indptr, nodes); + auto [output_indptr, output_indices] = IndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes, indptr.size(0) - 2); + const int64_t num_edges = output_indices.size(0); torch::optional output_type_per_edge; if (type_per_edge) { - output_type_per_edge = - std::get<1>(IndexSelectCSCImpl(indptr, type_per_edge.value(), nodes)); + output_type_per_edge = std::get<1>(IndexSelectCSCImpl( + in_degree, sliced_indptr, type_per_edge.value(), nodes, + indptr.size(0) - 2, num_edges)); } auto rows = CSRToCOO(output_indptr, indices.scalar_type()); - auto [in_degree, sliced_indptr] = SliceCSCIndptr(indptr, nodes); auto i = torch::arange(output_indices.size(0), output_indptr.options()); auto edge_ids = i - output_indptr.gather(0, rows) + sliced_indptr.gather(0, rows); diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu index e40a6c909d45..a31a0f673ed7 100644 --- a/graphbolt/src/cuda/neighbor_sampler.cu +++ b/graphbolt/src/cuda/neighbor_sampler.cu @@ -157,25 +157,30 @@ c10::intrusive_ptr SampleNeighbors( auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes); auto in_degree = std::get<0>(in_degree_and_sliced_indptr); auto sliced_indptr = std::get<1>(in_degree_and_sliced_indptr); + torch::optional num_edges_; torch::Tensor sub_indptr; - // @todo mfbalin, refactor 
IndexSelectCSCImpl so that it does not have to take - // nodes as input torch::optional sliced_probs_or_mask; if (probs_or_mask.has_value()) { torch::Tensor sliced_probs_or_mask_tensor; - std::tie(sub_indptr, sliced_probs_or_mask_tensor) = - IndexSelectCSCImpl(indptr, probs_or_mask.value(), nodes); + std::tie(sub_indptr, sliced_probs_or_mask_tensor) = IndexSelectCSCImpl( + in_degree, sliced_indptr, probs_or_mask.value(), nodes, + indptr.size(0) - 2, num_edges_); sliced_probs_or_mask = sliced_probs_or_mask_tensor; - } else { - sub_indptr = ExclusiveCumSum(in_degree); + num_edges_ = sliced_probs_or_mask_tensor.size(0); } if (fanouts.size() > 1) { torch::Tensor sliced_type_per_edge; - std::tie(sub_indptr, sliced_type_per_edge) = - IndexSelectCSCImpl(indptr, type_per_edge.value(), nodes); + std::tie(sub_indptr, sliced_type_per_edge) = IndexSelectCSCImpl( + in_degree, sliced_indptr, type_per_edge.value(), nodes, + indptr.size(0) - 2, num_edges_); std::tie(sub_indptr, in_degree, sliced_indptr) = SliceCSCIndptrHetero( sub_indptr, sliced_type_per_edge, sliced_indptr, fanouts.size()); num_rows = sliced_indptr.size(0); + num_edges_ = sliced_type_per_edge.size(0); + } + // If sub_indptr was not computed in the two code blocks above: + if (!probs_or_mask.has_value() && fanouts.size() <= 1) { + sub_indptr = ExclusiveCumSum(in_degree); } auto max_in_degree = torch::empty( 1, diff --git a/graphbolt/src/index_select.cc b/graphbolt/src/index_select.cc index 00257061c675..598bd043b165 100644 --- a/graphbolt/src/index_select.cc +++ b/graphbolt/src/index_select.cc @@ -22,14 +22,15 @@ torch::Tensor IndexSelect(torch::Tensor input, torch::Tensor index) { } std::tuple IndexSelectCSC( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size) { TORCH_CHECK( indices.sizes().size() == 1, "IndexSelectCSC only supports 1d tensors"); if (utils::is_on_gpu(nodes) && utils::is_accessible_from_gpu(indptr) && utils::is_accessible_from_gpu(indices)) { GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE( c10::DeviceType::CUDA, "IndexSelectCSCImpl", - { return IndexSelectCSCImpl(indptr, indices, nodes); }); + { return IndexSelectCSCImpl(indptr, indices, nodes, output_size); }); } // @todo: The CPU supports only integer dtypes for indices tensor. TORCH_CHECK( diff --git a/graphbolt/src/index_select.h b/graphbolt/src/index_select.h index 9f95e051ab53..29fa6db2e751 100644 --- a/graphbolt/src/index_select.h +++ b/graphbolt/src/index_select.h @@ -25,11 +25,13 @@ namespace ops { * @param indptr Indptr tensor containing offsets with shape (N,). * @param indices Indices tensor with edge information of shape (indptr[N],). * @param nodes Nodes tensor with shape (M,). + * @param output_size The total number of edges being copied. * @return (torch::Tensor, torch::Tensor) Output indptr and indices tensors of * shapes (M + 1,) and ((indptr[nodes + 1] - indptr[nodes]).sum(),). */ std::tuple IndexSelectCSC( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes); + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size = torch::nullopt); /** * @brief Select rows from input tensor according to index tensor. 
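Before the updated test below, a reference for what `IndexSelectCSC` computes, matching the documented output shapes; `index_select_csc_reference` is an illustrative CPU stand-in, not the library implementation.

```
import torch

def index_select_csc_reference(indptr, indices, nodes):
    in_degree = indptr[nodes + 1] - indptr[nodes]
    output_indptr = torch.zeros(nodes.numel() + 1, dtype=indptr.dtype)
    output_indptr[1:] = in_degree.cumsum(0)
    output_indices = torch.cat(
        [indices[indptr[n] : indptr[n + 1]] for n in nodes.tolist()]
    )
    return output_indptr, output_indices

indptr = torch.tensor([0, 2, 2, 5])
indices = torch.tensor([1, 2, 0, 1, 2])
print(index_select_csc_reference(indptr, indices, torch.tensor([2, 0])))
# (tensor([0, 3, 5]), tensor([0, 1, 2, 1, 2]))
```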
diff --git a/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py b/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py index 6ae5c47ffc97..9f4c021b8928 100644 --- a/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py @@ -22,7 +22,10 @@ ) @pytest.mark.parametrize("idtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("is_pinned", [False, True]) -def test_index_select_csc(indptr_dtype, indices_dtype, idtype, is_pinned): +@pytest.mark.parametrize("output_size", [None, True]) +def test_index_select_csc( + indptr_dtype, indices_dtype, idtype, is_pinned, output_size +): """Original graph in COO: 1 0 1 0 1 0 1 0 0 1 0 1 @@ -38,7 +41,7 @@ def test_index_select_csc(indptr_dtype, indices_dtype, idtype, is_pinned): index = torch.tensor([0, 5, 3], dtype=idtype) cpu_indptr, cpu_indices = torch.ops.graphbolt.index_select_csc( - indptr, indices, index + indptr, indices, index, None ) if is_pinned: indptr = indptr.pin_memory() @@ -48,10 +51,12 @@ def test_index_select_csc(indptr_dtype, indices_dtype, idtype, is_pinned): indices = indices.cuda() index = index.cuda() + if output_size: + output_size = len(cpu_indices) + gpu_indptr, gpu_indices = torch.ops.graphbolt.index_select_csc( - indptr, indices, index + indptr, indices, index, output_size ) - assert not cpu_indptr.is_cuda assert not cpu_indices.is_cuda From dfff53bc24ecc07e67fb396120308da34add72da Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Mon, 15 Jan 2024 12:11:40 +0800 Subject: [PATCH 10/22] [GraphBolt] add test for PR#6873 (#6923) --- python/dgl/graphbolt/impl/ondisk_dataset.py | 2 +- .../python/pytorch/graphbolt/gb_test_utils.py | 41 +++++++++++++++++++ .../graphbolt/impl/test_ondisk_dataset.py | 27 ++++++++++-- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index fde717a6b6bb..8fb198c4875a 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -158,7 +158,7 @@ def preprocess_ondisk_dataset( graph_feature["name"] ] = edge_data if not is_homogeneous: - # For homogeneous graph, a node/edge feature must cover all + # For heterogeneous graph, a node/edge feature must cover all # node/edge types. for feat_name, feat_data in g.ndata.items(): existing_types = set(feat_data.keys()) diff --git a/tests/python/pytorch/graphbolt/gb_test_utils.py b/tests/python/pytorch/graphbolt/gb_test_utils.py index 14661ac7bff7..dd7abc74da0c 100644 --- a/tests/python/pytorch/graphbolt/gb_test_utils.py +++ b/tests/python/pytorch/graphbolt/gb_test_utils.py @@ -165,6 +165,12 @@ def random_homo_graphbolt_graph( - format: {edge_fmt} path: {edge_path} feature_data: + - domain: node + type: null + name: feat + format: numpy + in_memory: true + path: {node_feat_path} - domain: edge type: null name: feat @@ -250,6 +256,16 @@ def genereate_raw_data_for_hetero_dataset( np.save(os.path.join(test_dir, node_feat_path), node_feats) node_feats_path[ntype] = node_feat_path + # Generate edge features.
+ edge_feats_path = {} + os.makedirs(os.path.join(test_dir, "data"), exist_ok=True) + for etype, num_edge in num_edges.items(): + src_ntype, etype_str, dst_ntype = etype + edge_feat_path = os.path.join("data", f"{etype_str}-feat.npy") + edge_feats = np.random.rand(num_edge, num_classes) + np.save(os.path.join(test_dir, edge_feat_path), edge_feats) + edge_feats_path[etype_str] = edge_feat_path + # Generate train/test/valid set. os.makedirs(os.path.join(test_dir, "set"), exist_ok=True) user_ids = torch.arange(num_nodes["user"]) @@ -285,6 +301,31 @@ def genereate_raw_data_for_hetero_dataset( - type: "user:click:item" format: {edge_fmt} path: {edges_path["click"]} + feature_data: + - domain: node + type: user + name: feat + format: numpy + in_memory: true + path: {node_feats_path["user"]} + - domain: node + type: item + name: feat + format: numpy + in_memory: true + path: {node_feats_path["item"]} + - domain: edge + type: "user:follow:user" + name: feat + format: numpy + in_memory: true + path: {edge_feats_path["follow"]} + - domain: edge + type: "user:click:item" + name: feat + format: numpy + in_memory: true + path: {edge_feats_path["click"]} feature_data: - domain: node type: user diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 8f261652c2bd..1eb70ef1bfe3 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -1136,9 +1136,14 @@ def test_OnDiskDataset_preprocess_homogeneous(edge_fmt): assert fused_csc_sampling_graph.total_num_nodes == num_nodes assert fused_csc_sampling_graph.total_num_edges == num_edges assert ( - fused_csc_sampling_graph.edge_attributes is None - or gb.ORIGINAL_EDGE_ID + fused_csc_sampling_graph.node_attributes is not None + and "feat" in fused_csc_sampling_graph.node_attributes + ) + assert ( + fused_csc_sampling_graph.edge_attributes is not None + and gb.ORIGINAL_EDGE_ID not in fused_csc_sampling_graph.edge_attributes + and "feat" in fused_csc_sampling_graph.edge_attributes ) num_samples = 100 @@ -2147,7 +2152,14 @@ def test_OnDiskDataset_homogeneous(include_original_edge_id, edge_fmt): assert isinstance(graph, gb.FusedCSCSamplingGraph) assert graph.total_num_nodes == num_nodes assert graph.total_num_edges == num_edges - assert graph.edge_attributes is not None + assert ( + graph.node_attributes is not None + and "feat" in graph.node_attributes + ) + assert ( + graph.edge_attributes is not None + and "feat" in graph.edge_attributes + ) assert ( not include_original_edge_id ) or gb.ORIGINAL_EDGE_ID in graph.edge_attributes @@ -2220,7 +2232,14 @@ def test_OnDiskDataset_heterogeneous(include_original_edge_id, edge_fmt): assert graph.total_num_edges == sum( num_edge for num_edge in num_edges.values() ) - assert graph.edge_attributes is not None + assert ( + graph.node_attributes is not None + and "feat" in graph.node_attributes + ) + assert ( + graph.edge_attributes is not None + and "feat" in graph.edge_attributes + ) assert ( not include_original_edge_id ) or gb.ORIGINAL_EDGE_ID in graph.edge_attributes From 982f20284ba55f20f4c65d0968fe2b4da3e50dd9 Mon Sep 17 00:00:00 2001 From: rudongyu Date: Mon, 15 Jan 2024 13:50:02 +0800 Subject: [PATCH 11/22] [Doc Fix] fix the format of gt doc (#6949) --- docs/source/graphtransformer/data.rst | 1 + docs/source/graphtransformer/index.rst | 6 +++--- docs/source/graphtransformer/model.rst | 7 ++++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git 
a/docs/source/graphtransformer/data.rst b/docs/source/graphtransformer/data.rst index a8d09dd89a71..e0123e103632 100644 --- a/docs/source/graphtransformer/data.rst +++ b/docs/source/graphtransformer/data.rst @@ -5,6 +5,7 @@ In this section, we will prepare the data for the Graphormer model introduced be .. code:: python + def collate(graphs): # compute shortest path features, can be done in advance for g in graphs: diff --git a/docs/source/graphtransformer/index.rst b/docs/source/graphtransformer/index.rst index 691403d13fdb..f8e6363f8eb6 100644 --- a/docs/source/graphtransformer/index.rst +++ b/docs/source/graphtransformer/index.rst @@ -1,8 +1,8 @@ -🆕 Tutorial: GraphTransformer +🆕 Tutorial: Graph Transformer ========== -This tutorial introduces the **graphtransformer** module, which is a set of -utility modules for building and training graph transformer models. +This tutorial introduces the **graph transformer** (:mod:`~dgl.nn.gt`) module, +which is a set of utility modules for building and training graph transformer models. .. toctree:: :maxdepth: 2 diff --git a/docs/source/graphtransformer/model.rst b/docs/source/graphtransformer/model.rst index eb5d74a0e2ad..78919d48f089 100644 --- a/docs/source/graphtransformer/model.rst +++ b/docs/source/graphtransformer/model.rst @@ -12,6 +12,7 @@ Degree Encoding The degree encoder is a learnable embedding layer that encodes the degree of each node into a vector. It takes as input the batched input and output degrees of graph nodes, and outputs the degree embeddings of the nodes. .. code:: python + degree_encoder = dgl.nn.DegreeEncoder( max_degree=8, # the maximum degree to cut off embedding_dim=512 # the dimension of the degree embedding @@ -22,6 +23,7 @@ Path Encoding The path encoder encodes the edge features on the shortest path between two nodes to get attention bias for the self-attention module. It takes as input the batched edge features in shape and outputs the attention bias based on path encoding. .. code:: python + path_encoder = PathEncoder( max_len=5, # the maximum length of the shortest path feat_dim=512, # the dimension of the edge feature @@ -33,6 +35,7 @@ Spatial Encoding The spatial encoder encodes the shortest distance between two nodes to get attention bias for the self-attention module. It takes as input the shortest distance between two nodes and outputs the attention bias based on spatial encoding. .. code:: python + spatial_encoder = SpatialEncoder( max_dist=5, # the maximum distance between two nodes num_heads=8, # the number of attention heads @@ -46,6 +49,7 @@ The Graphormer layer is like a Transformer encoder layer with the Multi-head Att We can stack multiple Graphormer layers as a list just like implementing a Transformer encoder in PyTorch. .. code:: python + layers = th.nn.ModuleList([ GraphormerLayer( feat_size=512, # the dimension of the input node features @@ -63,6 +67,7 @@ Model Forward Grouping the modules above defines the primary components of the Graphormer model. We then can define the forward process as follows: .. code:: python + node_feat, in_degree, out_degree, attn_mask, path_data, dist = \ next(iter(dataloader)) # we will use the first batch as an example num_graphs, max_num_nodes, _ = node_feat.shape @@ -84,6 +89,6 @@ Grouping the modules above defines the primary components of the Graphormer mode attn_bias=attn_bias, ) -For simplicity, we omit some details in the forward process. For the complete implementation, please refer to the `Graphormer example `_. 
You can also explore other `utility modules `_ to customize your own graph transformer model. In the next section, we will show how to prepare the data for training. From b003732d318e3c43996d79018eccb41bc90588fc Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:16:58 +0800 Subject: [PATCH 12/22] [GraphBolt] Update `__repr__` of `TorchBasedFeature` and `TorchBasedFeatureStore` (#6945) --- .../impl/torch_based_feature_store.py | 85 ++++++++++--------- .../impl/test_torch_based_feature_store.py | 65 +++++++------- 2 files changed, 83 insertions(+), 67 deletions(-) diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index c33b562dd849..292b0b1d4e59 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -1,4 +1,6 @@ """Torch-based feature store for GraphBolt.""" + +import textwrap from typing import Dict, List import numpy as np @@ -169,7 +171,37 @@ def pin_memory_(self): self._tensor = self._tensor.pin_memory() def __repr__(self) -> str: - return _torch_based_feature_str(self) + ret = ( + "TorchBasedFeature(\n" + " feature={feature},\n" + " metadata={metadata},\n" + ")" + ) + + feature_str = str(self._tensor) + feature_str_lines = feature_str.splitlines() + if len(feature_str_lines) > 1: + feature_str = ( + feature_str_lines[0] + + "\n" + + textwrap.indent( + "\n".join(feature_str_lines[1:]), " " * len(" feature=") + ) + ) + + metadata_str = str(self.metadata()) + metadata_str_lines = metadata_str.splitlines() + if len(metadata_str_lines) > 1: + metadata_str = ( + metadata_str_lines[0] + + "\n" + + textwrap.indent( + "\n".join(metadata_str_lines[1:]), + " " * len(" metadata="), + ) + ) + + return ret.format(feature=feature_str, metadata=metadata_str) class TorchBasedFeatureStore(BasicFeatureStore): @@ -236,40 +268,17 @@ def pin_memory_(self): feature.pin_memory_() def __repr__(self) -> str: - return _torch_based_feature_store_str(self._features) - - -def _torch_based_feature_str(feature: TorchBasedFeature) -> str: - final_str = "TorchBasedFeature(" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - feature_str = "feature=" + _add_indent( - str(feature._tensor), indent_len + len("feature=") - ) - final_str += feature_str + ",\n" + " " * indent_len - metadata_str = "metadata=" + _add_indent( - str(feature.metadata()), indent_len + len("metadata=") - ) - final_str += metadata_str + ",\n)" - return final_str - - -def _torch_based_feature_store_str( - features: Dict[str, TorchBasedFeature] -) -> str: - final_str = "TorchBasedFeatureStore" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - features_str = _add_indent(str(features), indent_len) - final_str += features_str - return final_str + ret = "TorchBasedFeatureStore(\n" + " {features}\n" + ")" + + features_str = str(self._features) + features_str_lines = features_str.splitlines() + if len(features_str_lines) > 1: + features_str = ( + features_str_lines[0] + + "\n" + + textwrap.indent( + "\n".join(features_str_lines[1:]), " " * len(" ") + ) + ) + + return ret.format(features=features_str) diff --git 
a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py index d8ce37a5580d..3de5fe3a0a82 100644 --- a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py +++ b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py @@ -296,23 +296,27 @@ def test_torch_based_feature_repr(in_memory): feature_a = gb.TorchBasedFeature(a, metadata=metadata) feature_b = gb.TorchBasedFeature(b) - expected_str_feature_a = str( - """TorchBasedFeature(feature=tensor([[1, 2, 3], - [4, 5, 6]]), - metadata={'max_value': 3}, -)""" + expected_str_feature_a = ( + "TorchBasedFeature(\n" + " feature=tensor([[1, 2, 3],\n" + " [4, 5, 6]]),\n" + " metadata={'max_value': 3},\n" + ")" ) - expected_str_feature_b = str( - """TorchBasedFeature(feature=tensor([[[1, 2], - [3, 4]], - - [[4, 5], - [6, 7]]]), - metadata={}, -)""" + expected_str_feature_b = ( + "TorchBasedFeature(\n" + " feature=tensor([[[1, 2],\n" + " [3, 4]],\n" + "\n" + " [[4, 5],\n" + " [6, 7]]]),\n" + " metadata={},\n" + ")" ) - assert str(feature_a) == expected_str_feature_a - assert str(feature_b) == expected_str_feature_b + + assert repr(feature_a) == expected_str_feature_a, feature_a + assert repr(feature_b) == expected_str_feature_b, feature_b + a = b = metadata = None feature_a = feature_b = None expected_str_feature_a = expected_str_feature_b = None @@ -345,21 +349,24 @@ def test_torch_based_feature_store_repr(in_memory): ] feature_store = gb.TorchBasedFeatureStore(feature_data) - expected_feature_store_str = str( - """TorchBasedFeatureStore{(, 'paper', 'a'): TorchBasedFeature(feature=tensor([[1, 2, 4], - [2, 5, 3]]), - metadata={}, - ), (, 'paper:cites:paper', 'b'): TorchBasedFeature(feature=tensor([[[1, 2], - [3, 4]], - - [[2, 5], - [3, 4]]]), - metadata={}, - )}""" - ) - assert str(feature_store) == expected_feature_store_str, print( - feature_store + expected_feature_store_str = ( + "TorchBasedFeatureStore(\n" + " {(, 'paper', 'a'): TorchBasedFeature(\n" + " feature=tensor([[1, 2, 4],\n" + " [2, 5, 3]]),\n" + " metadata={},\n" + " ), (, 'paper:cites:paper', 'b'): TorchBasedFeature(\n" + " feature=tensor([[[1, 2],\n" + " [3, 4]],\n" + "\n" + " [[2, 5],\n" + " [3, 4]]]),\n" + " metadata={},\n" + " )}\n" + ")" ) + assert repr(feature_store) == expected_feature_store_str, feature_store + a = b = feature_data = None feature_store = expected_feature_store_str = None From 90e57e741f249d43d2643ab327d07d4749ed847a Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:17:27 +0800 Subject: [PATCH 13/22] [GraphBolt] Update `__repr__` of `ItemSet` and `ItemSetDict` (#6944) --- python/dgl/graphbolt/itemset.py | 54 ++++++------ .../graphbolt/impl/test_ondisk_dataset.py | 84 +++++++++++-------- .../python/pytorch/graphbolt/test_itemset.py | 65 +++++++------- 3 files changed, 110 insertions(+), 93 deletions(-) diff --git a/python/dgl/graphbolt/itemset.py b/python/dgl/graphbolt/itemset.py index 5976fe7798b0..8697ac9c59e4 100644 --- a/python/dgl/graphbolt/itemset.py +++ b/python/dgl/graphbolt/itemset.py @@ -1,5 +1,6 @@ """GraphBolt Itemset.""" +import textwrap from typing import Dict, Iterable, Iterator, Sized, Tuple, Union import torch @@ -175,7 +176,14 @@ def names(self) -> Tuple[str]: return self._names def __repr__(self) -> str: - return _itemset_str(self, "ItemSet") + ret = ( + f"ItemSet(\n" + f" items={self._items},\n" + f" names={self._names},\n" + f")" + ) + + return ret class 
ItemSetDict: @@ -330,31 +338,19 @@ def names(self) -> Tuple[str]: return self._names def __repr__(self) -> str: - return _itemset_str(self, "ItemSetDict") - - -def _itemset_str(itemset: Union[ItemSet, ItemSetDict], name) -> str: - final_str = f"{name}(" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - items = ( - itemset._items if isinstance(itemset, ItemSet) else itemset._itemsets - ) - item_str = ( - "items=" - + _add_indent(str(items), indent_len + len("items=")) - + ",\n" - + " " * indent_len - ) - name_str = ( - "names=" - + _add_indent(str(itemset._names), indent_len + len("items=")) - + ",\n)" - ) - final_str += item_str + name_str - return final_str + ret = ( + "ItemSetDict(\n" + " itemsets={itemsets},\n" + " names={names},\n" + ")" + ) + + itemsets_str = repr(self._itemsets) + lines = itemsets_str.splitlines() + itemsets_str = ( + lines[0] + + "\n" + + textwrap.indent("\n".join(lines[1:]), " " * len(" itemsets=")) + ) + + return ret.format(itemsets=itemsets_str, names=self._names) diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 1eb70ef1bfe3..c3a9beab9749 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -2348,18 +2348,21 @@ def test_OnDiskTask_repr_homogeneous(): ) metadata = {"name": "node_classification"} task = gb.OnDiskTask(metadata, item_set, item_set, item_set) - expected_str = str( - """OnDiskTask(validation_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), - train_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), - test_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), - metadata={'name': 'node_classification'}, -)""" + expected_str = ( + "OnDiskTask(validation_set=ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " train_set=ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " test_set=ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " metadata={'name': 'node_classification'},\n" + ")" ) assert str(task) == expected_str, print(task) @@ -2373,30 +2376,39 @@ def test_OnDiskTask_repr_heterogeneous(): ) metadata = {"name": "node_classification"} task = gb.OnDiskTask(metadata, item_set, item_set, item_set) - expected_str = str( - """OnDiskTask(validation_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), - ), - train_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), - ), - test_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), - ), - metadata={'name': 'node_classification'}, -)""" + 
expected_str = ( + "OnDiskTask(validation_set=ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " train_set=ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " test_set=ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " metadata={'name': 'node_classification'},\n" + ")" ) assert str(task) == expected_str, print(task) diff --git a/tests/python/pytorch/graphbolt/test_itemset.py b/tests/python/pytorch/graphbolt/test_itemset.py index 3174c2d42910..58238aeec0f4 100644 --- a/tests/python/pytorch/graphbolt/test_itemset.py +++ b/tests/python/pytorch/graphbolt/test_itemset.py @@ -529,24 +529,27 @@ def test_ItemSetDict_iteration_node_pairs_neg_dsts(): def test_ItemSet_repr(): # ItemSet with single name. item_set = gb.ItemSet(torch.arange(0, 5), names="seed_nodes") - expected_str = str( - """ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), -)""" + expected_str = ( + "ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + + assert str(item_set) == expected_str, item_set # ItemSet with multiple names. item_set = gb.ItemSet( (torch.arange(0, 5), torch.arange(5, 10)), names=("seed_nodes", "labels"), ) - expected_str = str( - """ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), -)""" + expected_str = ( + "ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + assert str(item_set) == expected_str, item_set def test_ItemSetDict_repr(): @@ -557,16 +560,19 @@ def test_ItemSetDict_repr(): "item": gb.ItemSet(torch.arange(5, 10), names="seed_nodes"), } ) - expected_str = str( - """ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), -)""" + expected_str = ( + "ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + assert str(item_set) == expected_str, item_set # ItemSetDict with multiple names. 
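These expected strings encode the alignment convention of the new `__repr__` implementations: the first line of a nested value stays inline, and its continuation lines are indented under the field name via `textwrap.indent`. A standalone sketch of the pattern; the multiple-name case below follows it as well.

```
import textwrap

import torch

value = str(torch.tensor([[1, 2], [3, 4]]))
lines = value.splitlines()
# Indent continuation lines so they line up under the "items=" field.
aligned = lines[0] + "\n" + textwrap.indent(
    "\n".join(lines[1:]), " " * len("    items=")
)
print("    items=" + aligned)
```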
item_set = gb.ItemSetDict( @@ -581,13 +587,16 @@ def test_ItemSetDict_repr(): ), } ) - expected_str = str( - """ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]), tensor([10, 11, 12, 13, 14])), - names=('seed_nodes', 'labels'), - )}, - names=('seed_nodes', 'labels'), -)""" + expected_str = ( + "ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]), tensor([10, 11, 12, 13, 14])),\n" + " names=('seed_nodes', 'labels'),\n" + " )},\n" + " names=('seed_nodes', 'labels'),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + assert str(item_set) == expected_str, item_set From 9a8aa8fa0f42d4f75512178d47b3537044216046 Mon Sep 17 00:00:00 2001 From: peizhou001 <110809584+peizhou001@users.noreply.github.com> Date: Mon, 15 Jan 2024 17:50:47 +0800 Subject: [PATCH 14/22] [GraphBolt] Negative node pairs should be 2D (#6951) Co-authored-by: Ubuntu --- python/dgl/graphbolt/minibatch.py | 42 ++++++++++--------- python/dgl/graphbolt/subgraph_sampler.py | 6 +++ .../pytorch/graphbolt/impl/test_minibatch.py | 22 ++++++---- .../pytorch/graphbolt/test_integration.py | 26 +++++++++--- 4 files changed, 63 insertions(+), 33 deletions(-) diff --git a/python/dgl/graphbolt/minibatch.py b/python/dgl/graphbolt/minibatch.py index f21095418dc0..ec7ead0c36a8 100644 --- a/python/dgl/graphbolt/minibatch.py +++ b/python/dgl/graphbolt/minibatch.py @@ -299,15 +299,15 @@ def negative_node_pairs(self): # For homogeneous graph. if isinstance(self.compacted_negative_srcs, torch.Tensor): negative_node_pairs = ( - self.compacted_negative_srcs.view(-1), - self.compacted_negative_dsts.view(-1), + self.compacted_negative_srcs, + self.compacted_negative_dsts, ) # For heterogeneous graph. else: negative_node_pairs = { etype: ( - neg_src.view(-1), - self.compacted_negative_dsts[etype].view(-1), + neg_src, + self.compacted_negative_dsts[etype], ) for etype, neg_src in self.compacted_negative_srcs.items() } @@ -319,10 +319,10 @@ if isinstance(self.compacted_negative_srcs, torch.Tensor): negative_ratio = self.compacted_negative_srcs.size(1) negative_node_pairs = ( - self.compacted_negative_srcs.view(-1), - self.compacted_node_pairs[1].repeat_interleave( - negative_ratio - ), + self.compacted_negative_srcs, + self.compacted_node_pairs[1] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), ) # For heterogeneous graph. else: negative_ratio = self.compacted_negative_srcs[ ].size(1) negative_node_pairs = { etype: ( - neg_src.view(-1), - self.compacted_node_pairs[etype][1].repeat_interleave( - negative_ratio - ), + neg_src, + self.compacted_node_pairs[etype][1] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), ) for etype, neg_src in self.compacted_negative_srcs.items() } @@ -346,10 +346,10 @@ def negative_node_pairs(self): if isinstance(self.compacted_negative_dsts, torch.Tensor): negative_ratio = self.compacted_negative_dsts.size(1) negative_node_pairs = ( - self.compacted_node_pairs[0].repeat_interleave( - negative_ratio - ), - self.compacted_negative_dsts.view(-1), + self.compacted_node_pairs[0] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), + self.compacted_negative_dsts, ) # For heterogeneous graph.
else: @@ -358,10 +358,10 @@ def negative_node_pairs(self): ].size(1) negative_node_pairs = { etype: ( - self.compacted_node_pairs[etype][0].repeat_interleave( - negative_ratio - ), - neg_dst.view(-1), + self.compacted_node_pairs[etype][0] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), + neg_dst, ) for etype, neg_dst in self.compacted_negative_dsts.items() } @@ -396,6 +396,7 @@ def node_pairs_with_labels(self): for etype in positive_node_pairs: pos_src, pos_dst = positive_node_pairs[etype] neg_src, neg_dst = negative_node_pairs[etype] + neg_src, neg_dst = neg_src.view(-1), neg_dst.view(-1) node_pairs_by_etype[etype] = ( torch.cat((pos_src, neg_src), dim=0), torch.cat((pos_dst, neg_dst), dim=0), @@ -410,6 +411,7 @@ def node_pairs_with_labels(self): # Homogeneous graph. pos_src, pos_dst = positive_node_pairs neg_src, neg_dst = negative_node_pairs + neg_src, neg_dst = neg_src.view(-1), neg_dst.view(-1) node_pairs = ( torch.cat((pos_src, neg_src), dim=0), torch.cat((pos_dst, neg_dst), dim=0), diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index ce22f607c663..3e3c3d9b507c 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -130,10 +130,16 @@ def _node_pairs_preprocess(self, minibatch): for etype, _ in neg_src.items(): src_type, _, _ = etype_str_to_tuple(etype) compacted_negative_srcs[etype] = compacted[src_type].pop(0) + compacted_negative_srcs[etype] = compacted_negative_srcs[ + etype + ].view(neg_src[etype].shape) if has_neg_dst: for etype, _ in neg_dst.items(): _, _, dst_type = etype_str_to_tuple(etype) compacted_negative_dsts[etype] = compacted[dst_type].pop(0) + compacted_negative_dsts[etype] = compacted_negative_dsts[ + etype + ].view(neg_dst[etype].shape) else: # Collect nodes from all types of input. 
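The reshapes above establish the new convention that negative node pairs are 2-D with shape `(num_positive_edges, negative_ratio)`; the repeated positive side is expanded to match. A small sketch of that expansion:

```
import torch

pos_dst = torch.tensor([10, 11, 12])
negative_ratio = 2
expanded = pos_dst.repeat_interleave(negative_ratio).view(-1, negative_ratio)
print(expanded)
# tensor([[10, 10],
#         [11, 11],
#         [12, 12]])
```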
nodes = list(node_pairs) diff --git a/tests/python/pytorch/graphbolt/impl/test_minibatch.py b/tests/python/pytorch/graphbolt/impl/test_minibatch.py index a32e3e4a6e53..79e75df6bb56 100644 --- a/tests/python/pytorch/graphbolt/impl/test_minibatch.py +++ b/tests/python/pytorch/graphbolt/impl/test_minibatch.py @@ -125,8 +125,12 @@ def test_minibatch_representation_homo(): negative_srcs=tensor([[8], [1], [6]]), - negative_node_pairs=(tensor([0, 1, 2]), - tensor([6, 0, 0])), + negative_node_pairs=(tensor([[0], + [1], + [2]]), + tensor([[6], + [0], + [0]])), negative_dsts=tensor([[2], [8], [8]]), @@ -278,7 +282,11 @@ def test_minibatch_representation_hetero(): negative_srcs={'B': tensor([[8], [1], [6]])}, - negative_node_pairs={'A:r:B': (tensor([0, 1, 2]), tensor([6, 0, 0]))}, + negative_node_pairs={'A:r:B': (tensor([[0], + [1], + [2]]), tensor([[6], + [0], + [0]]))}, negative_dsts={'B': tensor([[2], [8], [8]])}, @@ -773,12 +781,12 @@ def test_dgl_link_predication_homo(mode): if mode == "neg_graph" or mode == "neg_src": assert torch.equal( minibatch.negative_node_pairs[0], - minibatch.compacted_negative_srcs.view(-1), + minibatch.compacted_negative_srcs, ) if mode == "neg_graph" or mode == "neg_dst": assert torch.equal( minibatch.negative_node_pairs[1], - minibatch.compacted_negative_dsts.view(-1), + minibatch.compacted_negative_dsts, ) ( node_pairs, @@ -834,11 +842,11 @@ def test_dgl_link_predication_hetero(mode): for etype, src in minibatch.compacted_negative_srcs.items(): assert torch.equal( minibatch.negative_node_pairs[etype][0], - src.view(-1), + src, ) if mode == "neg_graph" or mode == "neg_dst": for etype, dst in minibatch.compacted_negative_dsts.items(): assert torch.equal( minibatch.negative_node_pairs[etype][1], - minibatch.compacted_negative_dsts[etype].view(-1), + minibatch.compacted_negative_dsts[etype], ) diff --git a/tests/python/pytorch/graphbolt/test_integration.py b/tests/python/pytorch/graphbolt/test_integration.py index fe3b5c0a2daa..bea5f234869c 100644 --- a/tests/python/pytorch/graphbolt/test_integration.py +++ b/tests/python/pytorch/graphbolt/test_integration.py @@ -88,8 +88,14 @@ def test_integration_link_prediction(): [0.9634, 0.2294], [0.5503, 0.8223]])}, negative_srcs=None, - negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 1, 1]), - tensor([4, 4, 1, 4, 0, 1, 1, 5])), + negative_node_pairs=(tensor([[0, 0], + [1, 1], + [1, 1], + [1, 1]]), + tensor([[4, 4], + [1, 4], + [0, 1], + [1, 5]])), negative_dsts=tensor([[0, 0], [3, 0], [5, 3], @@ -138,8 +144,14 @@ def test_integration_link_prediction(): [0.5160, 0.2486], [0.2109, 0.1089]])}, negative_srcs=None, - negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 2, 2]), - tensor([3, 4, 5, 4, 1, 0, 3, 4])), + negative_node_pairs=(tensor([[0, 0], + [1, 1], + [1, 1], + [2, 2]]), + tensor([[3, 4], + [5, 4], + [1, 0], + [3, 4]])), negative_dsts=tensor([[1, 5], [2, 5], [4, 3], @@ -186,8 +198,10 @@ def test_integration_link_prediction(): [0.9634, 0.2294], [0.6172, 0.7865]])}, negative_srcs=None, - negative_node_pairs=(tensor([0, 0, 1, 1]), - tensor([2, 1, 2, 3])), + negative_node_pairs=(tensor([[0, 0], + [1, 1]]), + tensor([[2, 1], + [2, 3]])), negative_dsts=tensor([[0, 4], [0, 1]]), labels=None, From 40816f6e4069b353b25cbed5e0956257843f5b65 Mon Sep 17 00:00:00 2001 From: yxy235 <77922129+yxy235@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:04:19 +0800 Subject: [PATCH 15/22] [GraphBolt] Automatically force preprocess on-disk dataset. 
(#6937) Co-authored-by: Ubuntu --- python/dgl/graphbolt/impl/ondisk_dataset.py | 47 +++- python/dgl/graphbolt/internal/utils.py | 51 ++++ .../graphbolt/impl/test_ondisk_dataset.py | 230 +++++++++++++++++- .../pytorch/graphbolt/utils/test_internal.py | 66 +++++ 4 files changed, 381 insertions(+), 13 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 8fb198c4875a..11245c458061 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -1,5 +1,6 @@ """GraphBolt OnDiskDataset.""" +import json import os import shutil from copy import deepcopy @@ -15,6 +16,8 @@ from ..base import etype_str_to_tuple from ..dataset import Dataset, Task from ..internal import ( + calculate_dir_hash, + check_dataset_change, copy_or_convert_data, get_attributes, read_data, @@ -37,7 +40,7 @@ def preprocess_ondisk_dataset( dataset_dir: str, include_original_edge_id: bool = False, - force_preprocess: bool = False, + force_preprocess: bool = None, ) -> str: """Preprocess the on-disk dataset. Parse the input config file, load the data, and save the data in the format that GraphBolt supports. @@ -72,6 +75,20 @@ def preprocess_ondisk_dataset( processed_dir_prefix, "metadata.yaml" ) if os.path.exists(os.path.join(dataset_dir, preprocess_metadata_path)): + if force_preprocess is None: + with open( + os.path.join(dataset_dir, preprocess_metadata_path), "r" + ) as f: + preprocess_config = yaml.safe_load(f) + if ( + preprocess_config.get("include_original_edge_id", None) + == include_original_edge_id + ): + force_preprocess = check_dataset_change( + dataset_dir, processed_dir_prefix + ) + else: + force_preprocess = True if force_preprocess: shutil.rmtree(os.path.join(dataset_dir, processed_dir_prefix)) print( @@ -180,7 +197,10 @@ def preprocess_ondisk_dataset( g, is_homogeneous, include_original_edge_id ) - # 5. Save the FusedCSCSamplingGraph and modify the output_config. + # 5. Record value of include_original_edge_id. + output_config["include_original_edge_id"] = include_original_edge_id + + # 6. Save the FusedCSCSamplingGraph and modify the output_config. output_config["graph_topology"] = {} output_config["graph_topology"]["type"] = "FusedCSCSamplingGraph" output_config["graph_topology"]["path"] = os.path.join( @@ -196,7 +216,7 @@ def preprocess_ondisk_dataset( ) del output_config["graph"] - # 6. Load the node/edge features and do necessary conversion. + # 7. Load the node/edge features and do necessary conversion. if input_config.get("feature_data", None): for feature, out_feature in zip( input_config["feature_data"], output_config["feature_data"] @@ -218,7 +238,7 @@ def preprocess_ondisk_dataset( is_feature=True, ) - # 7. Save tasks and train/val/test split according to the output_config. + # 8. Save tasks and train/val/test split according to the output_config. if input_config.get("tasks", None): for input_task, output_task in zip( input_config["tasks"], output_config["tasks"] @@ -245,13 +265,24 @@ def preprocess_ondisk_dataset( output_data["format"], ) - # 8. Save the output_config. + # 9. Save the output_config. output_config_path = os.path.join(dataset_dir, preprocess_metadata_path) with open(output_config_path, "w") as f: yaml.dump(output_config, f) print("Finish preprocessing the on-disk dataset.") - # 9. Return the absolute path of the preprocessing yaml file. + # 10. Calculate and save the hash value of the dataset directory. 
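+ # Any hash file left over from a previous run is removed first so it
+ # cannot contaminate the new directory fingerprint.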
+ hash_value_file = "dataset_hash_value.txt" + hash_value_file_path = os.path.join( + dataset_dir, processed_dir_prefix, hash_value_file + ) + if os.path.exists(hash_value_file_path): + os.remove(hash_value_file_path) + dir_hash = calculate_dir_hash(dataset_dir) + with open(hash_value_file_path, "w") as f: + f.write(json.dumps(dir_hash, indent=4)) + + # 11. Return the absolute path of the preprocessing yaml file. return output_config_path @@ -398,7 +429,7 @@ def __init__( self, path: str, include_original_edge_id: bool = False, - force_preprocess: bool = False, + force_preprocess: bool = None, ) -> None: # Always call the preprocess function first. If already preprocessed, # the function will return the original path directly. @@ -720,7 +751,7 @@ def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset: download(url, path=zip_file_path) extract_archive(zip_file_path, root, overwrite=True) os.remove(zip_file_path) - super().__init__(dataset_dir) + super().__init__(dataset_dir, force_preprocess=False) def _ondisk_task_str(task: OnDiskTask) -> str: diff --git a/python/dgl/graphbolt/internal/utils.py b/python/dgl/graphbolt/internal/utils.py index ee61037ffd7b..af5bcdb6db93 100644 --- a/python/dgl/graphbolt/internal/utils.py +++ b/python/dgl/graphbolt/internal/utils.py @@ -1,7 +1,10 @@ """Utility functions for GraphBolt.""" +import hashlib +import json import os import shutil +from typing import List, Union import numpy as np import pandas as pd @@ -145,3 +148,51 @@ def read_edges(dataset_dir, edge_fmt, edge_path): ) src, dst = edge_data["src"].to_numpy(), edge_data["dst"].to_numpy() return (src, dst) + + +def calculate_file_hash(file_path, hash_algo="md5"): + """Calculate the hash value of a file.""" + hash_algos = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] + if hash_algo in hash_algos: + hash_obj = getattr(hashlib, hash_algo)() + else: + raise ValueError( + f"Hash algorithm must be one of: {hash_algos}, but got `{hash_algo}`." 
+ )
+ with open(file_path, "rb") as file:
+ for chunk in iter(lambda: file.read(4096), b""):
+ hash_obj.update(chunk)
+ return hash_obj.hexdigest()
+
+
+def calculate_dir_hash(
+ dir_path, hash_algo="md5", ignore: Union[str, List[str]] = None
+):
+ """Calculate the hash values of all files under the directory."""
+ hashes = {}
+ for dirpath, _, filenames in os.walk(dir_path):
+ for filename in filenames:
+ if ignore and filename in ignore:
+ continue
+ filepath = os.path.join(dirpath, filename)
+ file_hash = calculate_file_hash(filepath, hash_algo=hash_algo)
+ hashes[filepath] = file_hash
+ return hashes
+
+
+def check_dataset_change(dataset_dir, processed_dir):
+ """Check whether dataset has been changed by checking its hash value."""
+ hash_value_file = "dataset_hash_value.txt"
+ hash_value_file_path = os.path.join(
+ dataset_dir, processed_dir, hash_value_file
+ )
+ if not os.path.exists(hash_value_file_path):
+ return True
+ with open(hash_value_file_path, "r") as f:
+ original_hash_value = json.load(f)
+ present_hash_value = calculate_dir_hash(dataset_dir, ignore=hash_value_file)
+ if original_hash_value == present_hash_value:
+ force_preprocess = False
+ else:
+ force_preprocess = True
+ return force_preprocess
diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
index c3a9beab9749..1e5fdd4fbabe 100644
--- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
+++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
@@ -32,9 +32,11 @@ def load_dataset(dataset):
 return dataset.load()


-def write_yaml_and_load_dataset(yaml_content, dir):
+def write_yaml_and_load_dataset(yaml_content, dir, force_preprocess=False):
 write_yaml_file(yaml_content, dir)
- return load_dataset(gb.OnDiskDataset(dir))
+ return load_dataset(
+ gb.OnDiskDataset(dir, force_preprocess=force_preprocess)
+ )


 def test_OnDiskDataset_TVTSet_exceptions():
@@ -52,7 +54,7 @@ def test_OnDiskDataset_TVTSet_exceptions():
 """
 write_yaml_file(yaml_content, test_dir)
 with pytest.raises(pydantic.ValidationError):
- _ = gb.OnDiskDataset(test_dir).load()
+ _ = gb.OnDiskDataset(test_dir, force_preprocess=False).load()

 # Case 2: ``type`` is not specified while multiple TVT sets are
 # specified.
@@ -74,7 +76,7 @@ def test_OnDiskDataset_TVTSet_exceptions():
 AssertionError,
 match=r"Only one TVT set is allowed if type is not specified.",
 ):
- _ = gb.OnDiskDataset(test_dir).load()
+ _ = gb.OnDiskDataset(test_dir, force_preprocess=False).load()


 def test_OnDiskDataset_multiple_tasks():
@@ -1001,7 +1003,7 @@ def test_OnDiskDataset_Graph_Exceptions():
 pydantic.ValidationError,
 match="1 validation error for OnDiskMetaData",
 ):
- _ = gb.OnDiskDataset(test_dir).load()
+ _ = gb.OnDiskDataset(test_dir, force_preprocess=False).load()


 def test_OnDiskDataset_Graph_homogeneous():
@@ -1359,6 +1361,7 @@ def test_OnDiskDataset_preprocess_yaml_content_unix():
 data:
 - format: numpy
 path: preprocessed/set/test.npy
+ include_original_edge_id: False
 """
 target_yaml_data = yaml.safe_load(target_yaml_content)
 # Check yaml content.
@@ -1513,6 +1516,7 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
 data:
 - format: numpy
 path: preprocessed\\set\\test.npy
+ include_original_edge_id: False
 """
 target_yaml_data = yaml.safe_load(target_yaml_content)
 # Check yaml content.
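The helpers above fingerprint a dataset directory by streaming every file through the chosen digest in 4 KiB chunks, so change detection never loads a whole feature file into memory. A self-contained sketch of the same pattern (the file name and payloads here are illustrative only):

```python
import hashlib
import os
import tempfile

def file_md5(path, chunk_size=4096):
    # Stream the file so arbitrarily large files hash in constant memory.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

with tempfile.TemporaryDirectory() as root:
    path = os.path.join(root, "edge-feat.npy")
    with open(path, "wb") as f:
        f.write(b"raw feature bytes")
    before = file_md5(path)
    with open(path, "ab") as f:
        f.write(b" plus an edit")
    # Any byte-level change flips the fingerprint, which is what triggers
    # the automatic re-preprocess above.
    assert file_md5(path) != before
```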
@@ -1609,6 +1613,119 @@ def test_OnDiskDataset_preprocess_force_preprocess(capsys): assert target_yaml_data["tasks"][0]["name"] == "fake_name" +def test_OnDiskDataset_preprocess_auto_force_preprocess(capsys): + """Test force preprocess of OnDiskDataset.""" + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. + yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + # First preprocess on-disk dataset. + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["tasks"][0]["name"] == "link_prediction" + + # 1. Change yaml_data. + with open(yaml_file, "r") as f: + yaml_data = yaml.safe_load(f) + yaml_data["tasks"][0]["name"] = "fake_name" + with open(yaml_file, "w") as f: + yaml.dump(yaml_data, f) + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["tasks"][0]["name"] == "fake_name" + + # 2. Change edge feature. + edge_feats = np.random.rand(num_edges, num_classes) + edge_feat_path = os.path.join("data", "edge-feat.npy") + np.save(os.path.join(test_dir, edge_feat_path), edge_feats) + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + preprocessed_edge_feat = np.load( + os.path.join(test_dir, "preprocessed", edge_feat_path) + ) + assert preprocessed_edge_feat.all() == edge_feats.all() + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["include_original_edge_id"] == False + + # 3. Change include_original_edge_id. + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=True + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["include_original_edge_id"] == True + + # 4. Change nothing. 
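+ # Same inputs and options as the previous run: the stored hash still
+ # matches, so the preprocessed result is reused as-is.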
+ preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=True + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == ["The dataset is already preprocessed.", ""] + + @pytest.mark.parametrize("edge_fmt", ["csv", "numpy"]) def test_OnDiskDataset_load_name(edge_fmt): """Test preprocess of OnDiskDataset.""" @@ -2341,6 +2458,109 @@ def test_OnDiskDataset_force_preprocess(capsys): dataset = None +def test_OnDiskDataset_auto_force_preprocess(capsys): + """Test force preprocess of OnDiskDataset.""" + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. + yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + # First preprocess on-disk dataset. + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=False + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + tasks = dataset.tasks + assert tasks[0].metadata["name"] == "link_prediction" + + # 1. Change yaml_data. + with open(yaml_file, "r") as f: + yaml_data = yaml.safe_load(f) + yaml_data["tasks"][0]["name"] = "fake_name" + with open(yaml_file, "w") as f: + yaml.dump(yaml_data, f) + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=False + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + tasks = dataset.tasks + assert tasks[0].metadata["name"] == "fake_name" + + # 2. Change edge feature. + edge_feats = np.random.rand(num_edges, num_classes) + edge_feat_path = os.path.join("data", "edge-feat.npy") + np.save(os.path.join(test_dir, edge_feat_path), edge_feats) + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=False + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + assert torch.equal( + dataset.feature.read("edge", None, "feat"), + torch.from_numpy(edge_feats), + ) + graph = dataset.graph + assert gb.ORIGINAL_EDGE_ID not in graph.edge_attributes + + # 3. Change include_original_edge_id. + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=True + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + graph = dataset.graph + assert gb.ORIGINAL_EDGE_ID in graph.edge_attributes + + # 4. Change Nothing. 
+ dataset = gb.OnDiskDataset(
+ test_dir, include_original_edge_id=True
+ ).load()
+ captured = capsys.readouterr().out.split("\n")
+ assert captured == ["The dataset is already preprocessed.", ""]
+
+ graph = None
+ tasks = None
+ dataset = None
+
+
 def test_OnDiskTask_repr_homogeneous():
 item_set = gb.ItemSet(
 (torch.arange(0, 5), torch.arange(5, 10)),
diff --git a/tests/python/pytorch/graphbolt/utils/test_internal.py b/tests/python/pytorch/graphbolt/utils/test_internal.py
index 939c10a2a438..c1274f1a9adf 100644
--- a/tests/python/pytorch/graphbolt/utils/test_internal.py
+++ b/tests/python/pytorch/graphbolt/utils/test_internal.py
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import tempfile
@@ -200,3 +201,68 @@ def test_read_edges_error():
 ),
 ):
 internal.read_edges(test_dir, "numpy", edge_path)
+
+
+def test_calculate_file_hash():
+ with tempfile.TemporaryDirectory() as test_dir:
+ test_file_path = os.path.join(test_dir, "test.txt")
+ with open(test_file_path, "w") as file:
+ file.write("test content")
+ hash_value = internal.calculate_file_hash(
+ test_file_path, hash_algo="md5"
+ )
+ expected_hash_value = "9473fdd0d880a43c21b7778d34872157"
+ assert expected_hash_value == hash_value
+ with pytest.raises(
+ ValueError,
+ match=re.escape(
+ "Hash algorithm must be one of: ['md5', 'sha1', 'sha224', "
+ + "'sha256', 'sha384', 'sha512'], but got `fake`."
+ ),
+ ):
+ hash_value = internal.calculate_file_hash(
+ test_file_path, hash_algo="fake"
+ )
+
+
+def test_calculate_dir_hash():
+ with tempfile.TemporaryDirectory() as test_dir:
+ test_file_path_1 = os.path.join(test_dir, "test_1.txt")
+ test_file_path_2 = os.path.join(test_dir, "test_2.txt")
+ with open(test_file_path_1, "w") as file:
+ file.write("test content")
+ with open(test_file_path_2, "w") as file:
+ file.write("test contents of directory")
+ hash_value = internal.calculate_dir_hash(test_dir, hash_algo="md5")
+ expected_hash_value = [
+ "56e708a2bdf92887d4a7f25cbc13c555",
+ "9473fdd0d880a43c21b7778d34872157",
+ ]
+ assert len(hash_value) == 2
+ for val in hash_value.values():
+ assert val in expected_hash_value
+
+
+def test_check_dataset_change():
+ with tempfile.TemporaryDirectory() as test_dir:
+ # Generate directory and record its hash value.
+ test_file_path_1 = os.path.join(test_dir, "test_1.txt")
+ test_file_path_2 = os.path.join(test_dir, "test_2.txt")
+ with open(test_file_path_1, "w") as file:
+ file.write("test content")
+ with open(test_file_path_2, "w") as file:
+ file.write("test contents of directory")
+ hash_value = internal.calculate_dir_hash(test_dir, hash_algo="md5")
+ hash_value_file = "dataset_hash_value.txt"
+ hash_value_file_path = os.path.join(
+ test_dir, "preprocessed", hash_value_file
+ )
+ os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
+ with open(hash_value_file_path, "w") as file:
+ file.write(json.dumps(hash_value, indent=4))
+
+ # Modify the content of a file.
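+ # The recomputed directory hash will no longer match the stored one,
+ # so a change must be reported.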
+ with open(test_file_path_2, "w") as file: + file.write("test contents of directory changed") + + assert internal.check_dataset_change(test_dir, "preprocessed") From 80f36134c15464d7ad92350519e9c8c0d124b2ac Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Tue, 16 Jan 2024 11:14:04 +0800 Subject: [PATCH 16/22] [GraphBolt] Update the docstring of `ItemSet` (#6943) --- python/dgl/graphbolt/itemset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/itemset.py b/python/dgl/graphbolt/itemset.py index 8697ac9c59e4..8f93e9fed414 100644 --- a/python/dgl/graphbolt/itemset.py +++ b/python/dgl/graphbolt/itemset.py @@ -25,7 +25,10 @@ class requires each input itemset to be iterable. items. names: Union[str, Tuple[str]], optional The names of the items. If it is a tuple, each name corresponds to an - item in the tuple. + item in the tuple. The naming is arbitrary, but in general practice, + the names should be chosen from ['seed_nodes', 'node_pairs', 'labels', + 'negative_srcs', 'negative_dsts'] to align with the attributes of + class `dgl.graphbolt.MiniBatch`. Examples -------- From 905321f855268b7ff65df056ba7f5d4ae5b60aae Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:38:58 +0800 Subject: [PATCH 17/22] [GraphBolt] Modify `__repr__` (#6953) --- python/dgl/graphbolt/impl/ondisk_dataset.py | 42 +++++----- .../impl/torch_based_feature_store.py | 57 ++++--------- python/dgl/graphbolt/itemset.py | 20 ++--- .../graphbolt/impl/test_ondisk_dataset.py | 82 +++++++++---------- 4 files changed, 87 insertions(+), 114 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 11245c458061..c86476a41961 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -3,6 +3,7 @@ import json import os import shutil +import textwrap from copy import deepcopy from typing import Dict, List, Union @@ -339,7 +340,24 @@ def test_set(self) -> Union[ItemSet, ItemSetDict]: return self._test_set def __repr__(self) -> str: - return _ondisk_task_str(self) + ret = "{Classname}({attributes})" + + attributes_str = "" + + attributes = get_attributes(self) + attributes.reverse() + for attribute in attributes: + if attribute[0] == "_": + continue + value = getattr(self, attribute) + attributes_str += f"{attribute}={value},\n" + attributes_str = textwrap.indent( + attributes_str, " " * len("OnDiskTask(") + ).strip() + + return ret.format( + Classname=self.__class__.__name__, attributes=attributes_str + ) class OnDiskDataset(Dataset): @@ -752,25 +770,3 @@ def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset: extract_archive(zip_file_path, root, overwrite=True) os.remove(zip_file_path) super().__init__(dataset_dir, force_preprocess=False) - - -def _ondisk_task_str(task: OnDiskTask) -> str: - final_str = "OnDiskTask(" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - attributes = get_attributes(task) - attributes.reverse() - for name in attributes: - if name[0] == "_": - continue - val = getattr(task, name) - final_str += ( - f"{name}={_add_indent(str(val), indent_len + len(name) + 1)},\n" - + " " * indent_len - ) - return final_str[:-indent_len] + ")" diff --git 
a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 292b0b1d4e59..9e0718a1d9fe 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -172,36 +172,24 @@ def pin_memory_(self): def __repr__(self) -> str: ret = ( - "TorchBasedFeature(\n" + "{Classname}(\n" " feature={feature},\n" " metadata={metadata},\n" ")" ) - feature_str = str(self._tensor) - feature_str_lines = feature_str.splitlines() - if len(feature_str_lines) > 1: - feature_str = ( - feature_str_lines[0] - + "\n" - + textwrap.indent( - "\n".join(feature_str_lines[1:]), " " * len(" feature=") - ) - ) - - metadata_str = str(self.metadata()) - metadata_str_lines = metadata_str.splitlines() - if len(metadata_str_lines) > 1: - metadata_str = ( - metadata_str_lines[0] - + "\n" - + textwrap.indent( - "\n".join(metadata_str_lines[1:]), - " " * len(" metadata="), - ) - ) - - return ret.format(feature=feature_str, metadata=metadata_str) + feature_str = textwrap.indent( + str(self._tensor), " " * len(" feature=") + ).strip() + metadata_str = textwrap.indent( + str(self.metadata()), " " * len(" metadata=") + ).strip() + + return ret.format( + Classname=self.__class__.__name__, + feature=feature_str, + metadata=metadata_str, + ) class TorchBasedFeatureStore(BasicFeatureStore): @@ -268,17 +256,8 @@ def pin_memory_(self): feature.pin_memory_() def __repr__(self) -> str: - ret = "TorchBasedFeatureStore(\n" + " {features}\n" + ")" - - features_str = str(self._features) - features_str_lines = features_str.splitlines() - if len(features_str_lines) > 1: - features_str = ( - features_str_lines[0] - + "\n" - + textwrap.indent( - "\n".join(features_str_lines[1:]), " " * len(" ") - ) - ) - - return ret.format(features=features_str) + ret = "{Classname}(\n" + " {features}\n" + ")" + features_str = textwrap.indent(str(self._features), " ").strip() + return ret.format( + Classname=self.__class__.__name__, features=features_str + ) diff --git a/python/dgl/graphbolt/itemset.py b/python/dgl/graphbolt/itemset.py index 8f93e9fed414..14dc49b523cb 100644 --- a/python/dgl/graphbolt/itemset.py +++ b/python/dgl/graphbolt/itemset.py @@ -180,7 +180,7 @@ def names(self) -> Tuple[str]: def __repr__(self) -> str: ret = ( - f"ItemSet(\n" + f"{self.__class__.__name__}(\n" f" items={self._items},\n" f" names={self._names},\n" f")" @@ -342,18 +342,18 @@ def names(self) -> Tuple[str]: def __repr__(self) -> str: ret = ( - "ItemSetDict(\n" + "{Classname}(\n" " itemsets={itemsets},\n" " names={names},\n" ")" ) - itemsets_str = repr(self._itemsets) - lines = itemsets_str.splitlines() - itemsets_str = ( - lines[0] - + "\n" - + textwrap.indent("\n".join(lines[1:]), " " * len(" itemsets=")) - ) + itemsets_str = textwrap.indent( + repr(self._itemsets), " " * len(" itemsets=") + ).strip() - return ret.format(itemsets=itemsets_str, names=self._names) + return ret.format( + Classname=self.__class__.__name__, + itemsets=itemsets_str, + names=self._names, + ) diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 1e5fdd4fbabe..b669f2e7d1f4 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -2570,21 +2570,20 @@ def test_OnDiskTask_repr_homogeneous(): task = gb.OnDiskTask(metadata, item_set, item_set, item_set) expected_str = ( "OnDiskTask(validation_set=ItemSet(\n" - " 
items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" - " names=('seed_nodes', 'labels'),\n" - " ),\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" " train_set=ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" - " names=('seed_nodes', 'labels'),\n" - " ),\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" " test_set=ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" - " names=('seed_nodes', 'labels'),\n" - " ),\n" - " metadata={'name': 'node_classification'},\n" - ")" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " metadata={'name': 'node_classification'},)" ) - assert str(task) == expected_str, print(task) + assert repr(task) == expected_str, task def test_OnDiskTask_repr_heterogeneous(): @@ -2598,39 +2597,38 @@ def test_OnDiskTask_repr_heterogeneous(): task = gb.OnDiskTask(metadata, item_set, item_set, item_set) expected_str = ( "OnDiskTask(validation_set=ItemSetDict(\n" - " itemsets={'user': ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]),),\n" - " names=('seed_nodes',),\n" - " ), 'item': ItemSet(\n" - " items=(tensor([5, 6, 7, 8, 9]),),\n" - " names=('seed_nodes',),\n" - " )},\n" - " names=('seed_nodes',),\n" - " ),\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" " train_set=ItemSetDict(\n" - " itemsets={'user': ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]),),\n" - " names=('seed_nodes',),\n" - " ), 'item': ItemSet(\n" - " items=(tensor([5, 6, 7, 8, 9]),),\n" - " names=('seed_nodes',),\n" - " )},\n" - " names=('seed_nodes',),\n" - " ),\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" " test_set=ItemSetDict(\n" - " itemsets={'user': ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]),),\n" - " names=('seed_nodes',),\n" - " ), 'item': ItemSet(\n" - " items=(tensor([5, 6, 7, 8, 9]),),\n" - " names=('seed_nodes',),\n" - " )},\n" - " names=('seed_nodes',),\n" - " ),\n" - " metadata={'name': 'node_classification'},\n" - ")" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " metadata={'name': 'node_classification'},)" ) - assert str(task) == expected_str, print(task) + assert repr(task) == expected_str, task def test_OnDiskDataset_load_tasks_selectively(): From de1eedc6e85b2726b2ae2adc6f79b9c8343907f3 Mon Sep 17 00:00:00 2001 From: yxy235 <77922129+yxy235@users.noreply.github.com> Date: Tue, 16 Jan 2024 15:05:58 +0800 Subject: [PATCH 18/22] [GraphBolt] Add check about whether edge IDs are saved when edge feature is stored. 
(#6948) Co-authored-by: Ubuntu --- python/dgl/graphbolt/impl/ondisk_dataset.py | 5 ++ .../graphbolt/impl/test_ondisk_dataset.py | 56 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index c86476a41961..5da99fe74f23 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -219,6 +219,7 @@ def preprocess_ondisk_dataset( # 7. Load the node/edge features and do necessary conversion. if input_config.get("feature_data", None): + has_edge_feature_data = False for feature, out_feature in zip( input_config["feature_data"], output_config["feature_data"] ): @@ -230,6 +231,8 @@ def preprocess_ondisk_dataset( in_memory = ( True if "in_memory" not in feature else feature["in_memory"] ) + if not has_edge_feature_data and feature["domain"] == "edge": + has_edge_feature_data = True copy_or_convert_data( os.path.join(dataset_dir, feature["path"]), os.path.join(dataset_dir, out_feature["path"]), @@ -238,6 +241,8 @@ def preprocess_ondisk_dataset( in_memory=in_memory, is_feature=True, ) + if has_edge_feature_data and not include_original_edge_id: + dgl_warning("Edge feature is stored, but edge IDs are not saved.") # 8. Save tasks and train/val/test split according to the output_config. if input_config.get("tasks", None): diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index b669f2e7d1f4..237ab0c590d1 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -1726,6 +1726,35 @@ def test_OnDiskDataset_preprocess_auto_force_preprocess(capsys): assert captured == ["The dataset is already preprocessed.", ""] +def test_OnDiskDataset_preprocess_not_include_eids(): + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. + yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + with pytest.warns( + DGLWarning, + match="Edge feature is stored, but edge IDs are not saved.", + ): + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + + @pytest.mark.parametrize("edge_fmt", ["csv", "numpy"]) def test_OnDiskDataset_load_name(edge_fmt): """Test preprocess of OnDiskDataset.""" @@ -2586,6 +2615,33 @@ def test_OnDiskTask_repr_homogeneous(): assert repr(task) == expected_str, task +def test_OnDiskDataset_not_include_eids(): + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. 
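+ # The helper writes the raw node/edge files under test_dir and returns
+ # the matching metadata.yaml content for them.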
+ yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + with pytest.warns( + DGLWarning, + match="Edge feature is stored, but edge IDs are not saved.", + ): + gb.OnDiskDataset(test_dir, include_original_edge_id=False) + + def test_OnDiskTask_repr_heterogeneous(): item_set = gb.ItemSetDict( { From 6f9db8134567ce24def548f4de3cac5a63a92865 Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Tue, 16 Jan 2024 15:43:57 +0800 Subject: [PATCH 19/22] [Misc] Correct `test_OnDiskDataset_preprocess_homogeneous` (#6922) --- tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 237ab0c590d1..ef15453463e7 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -1175,9 +1175,9 @@ def test_OnDiskDataset_preprocess_homogeneous(edge_fmt): yaml_file = os.path.join(test_dir, "metadata.yaml") with open(yaml_file, "w") as f: f.write(yaml_content) - # Test do not generate original_edge_id. + # Test generating original_edge_id. output_file = gb.ondisk_dataset.preprocess_ondisk_dataset( - test_dir, include_original_edge_id=False + test_dir, include_original_edge_id=True ) with open(output_file, "rb") as f: processed_dataset = yaml.load(f, Loader=yaml.Loader) @@ -1186,8 +1186,7 @@ def test_OnDiskDataset_preprocess_homogeneous(edge_fmt): ) assert ( fused_csc_sampling_graph.edge_attributes is not None - and gb.ORIGINAL_EDGE_ID - not in fused_csc_sampling_graph.edge_attributes + and gb.ORIGINAL_EDGE_ID in fused_csc_sampling_graph.edge_attributes ) fused_csc_sampling_graph = None From bd74c44c39e037e645dfd3e7e580d4395b211a14 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 16 Jan 2024 03:27:48 -0500 Subject: [PATCH 20/22] [GraphBolt][CUDA] Add `.to()` method to Graph and FeatureStore. (#6957) --- .../impl/fused_csc_sampling_graph.py | 16 +++++- .../impl/torch_based_feature_store.py | 18 +++++++ .../impl/test_fused_csc_sampling_graph.py | 53 ++++++++++++------- .../impl/test_torch_based_feature_store.py | 53 +++++++++++++++++++ 4 files changed, 120 insertions(+), 20 deletions(-) diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 486bd59caccb..8f026b3c5095 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -956,7 +956,21 @@ def to(self, device: torch.device) -> None: # pylint: disable=invalid-name def _to(x): return x.to(device) if hasattr(x, "to") else x - return self._apply_to_members(_to) + def _pin(x): + return x.pin_memory() if hasattr(x, "pin_memory") else x + + # Create a copy of self. 
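+ # The copy shares the original tensors; _apply_to_members below then
+ # moves (or pins) only the copy's members, so `self` is left untouched.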
+ self2 = fused_csc_sampling_graph( + self.csc_indptr, + self.indices, + self.node_type_offset, + self.type_per_edge, + self.node_type_to_id, + self.edge_type_to_id, + self.node_attributes, + self.edge_attributes, + ) + return self2._apply_to_members(_pin if device == "pinned" else _to) def pin_memory_(self): """Copy `FusedCSCSamplingGraph` to the pinned memory in-place.""" diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 9e0718a1d9fe..3952eb0a84b4 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -1,5 +1,6 @@ """Torch-based feature store for GraphBolt.""" +import copy import textwrap from typing import Dict, List @@ -170,6 +171,16 @@ def pin_memory_(self): """In-place operation to copy the feature to pinned memory.""" self._tensor = self._tensor.pin_memory() + def to(self, device): # pylint: disable=invalid-name + """Copy `TorchBasedFeature` to the specified device.""" + # copy.copy is a shallow copy so it does not copy tensor memory. + self2 = copy.copy(self) + if device == "pinned": + self2.pin_memory_() + else: + self2._tensor = self2._tensor.to(device) + return self2 + def __repr__(self) -> str: ret = ( "{Classname}(\n" @@ -255,6 +266,13 @@ def pin_memory_(self): for feature in self._features.values(): feature.pin_memory_() + def to(self, device): # pylint: disable=invalid-name + """Copy `TorchBasedFeatureStore` to the specified device.""" + # copy.copy is a shallow copy so it does not copy tensor memory. + self2 = copy.copy(self) + self2._features = {k: v.to(device) for k, v in self2._features.items()} + return self2 + def __repr__(self) -> str: ret = "{Classname}(\n" + " {features}\n" + ")" features_str = textwrap.indent(str(self._features), " ").strip() diff --git a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py index 3e13a1ce2f5a..b2f240e6279b 100644 --- a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py +++ b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py @@ -1552,25 +1552,46 @@ def create_fused_csc_sampling_graph(): ) +def is_graph_on_device_type(graph, device_type): + assert graph.csc_indptr.device.type == device_type + assert graph.indices.device.type == device_type + assert graph.node_type_offset.device.type == device_type + assert graph.type_per_edge.device.type == device_type + assert graph.csc_indptr.device.type == device_type + for key in graph.edge_attributes: + assert graph.edge_attributes[key].device.type == device_type + + +def is_graph_pinned(graph): + assert graph.csc_indptr.is_pinned() + assert graph.indices.is_pinned() + assert graph.node_type_offset.is_pinned() + assert graph.type_per_edge.is_pinned() + assert graph.csc_indptr.is_pinned() + for key in graph.edge_attributes: + assert graph.edge_attributes[key].is_pinned() + + @unittest.skipIf( F._default_context_str == "cpu", reason="`to` function needs GPU to test.", ) -def test_csc_sampling_graph_to_device(): +@pytest.mark.parametrize("device", ["pinned", "cuda"]) +def test_csc_sampling_graph_to_device(device): # Construct FusedCSCSamplingGraph. graph = create_fused_csc_sampling_graph() # Copy to device. - graph = graph.to("cuda") - - # Check. 
- assert graph.csc_indptr.device.type == "cuda" - assert graph.indices.device.type == "cuda" - assert graph.node_type_offset.device.type == "cuda" - assert graph.type_per_edge.device.type == "cuda" - assert graph.csc_indptr.device.type == "cuda" - for key in graph.edge_attributes: - assert graph.edge_attributes[key].device.type == "cuda" + graph2 = graph.to(device) + + if device == "cuda": + is_graph_on_device_type(graph2, "cuda") + elif device == "pinned": + is_graph_on_device_type(graph2, "cpu") + is_graph_pinned(graph2) + + # The original variable should be untouched. + is_graph_on_device_type(graph, "cpu") @unittest.skipIf( @@ -1584,14 +1605,8 @@ def test_csc_sampling_graph_to_pinned_memory(): # Copy to pinned_memory in-place. graph.pin_memory_() - # Check. - assert graph.csc_indptr.is_pinned() - assert graph.indices.is_pinned() - assert graph.node_type_offset.is_pinned() - assert graph.type_per_edge.is_pinned() - assert graph.csc_indptr.is_pinned() - for key in graph.edge_attributes: - assert graph.edge_attributes[key].is_pinned() + is_graph_on_device_type(graph, "cpu") + is_graph_pinned(graph) @pytest.mark.parametrize("labor", [False, True]) diff --git a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py index 3de5fe3a0a82..be4b43b79461 100644 --- a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py +++ b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py @@ -136,6 +136,59 @@ def test_torch_based_feature(in_memory): feature_a = feature_b = None +def is_feature_store_pinned(store): + for feature in store._features.values(): + assert feature._tensor.is_pinned() + + +def is_feature_store_on_cuda(store): + for feature in store._features.values(): + assert feature._tensor.is_cuda + + +def is_feature_store_on_cpu(store): + for feature in store._features.values(): + assert not feature._tensor.is_cuda + + +@unittest.skipIf( + F._default_context_str == "cpu", + reason="Tests for pinned memory are only meaningful on GPU.", +) +@pytest.mark.parametrize("device", ["pinned", "cuda"]) +def test_feature_store_to_device(device): + with tempfile.TemporaryDirectory() as test_dir: + a = torch.tensor([[1, 2, 4], [2, 5, 3]]) + b = torch.tensor([[[1, 2], [3, 4]], [[2, 5], [3, 4]]]) + write_tensor_to_disk(test_dir, "a", a, fmt="torch") + write_tensor_to_disk(test_dir, "b", b, fmt="numpy") + feature_data = [ + gb.OnDiskFeatureData( + domain="node", + type="paper", + name="a", + format="torch", + path=os.path.join(test_dir, "a.pt"), + ), + gb.OnDiskFeatureData( + domain="edge", + type="paper:cites:paper", + name="b", + format="numpy", + path=os.path.join(test_dir, "b.npy"), + ), + ] + feature_store = gb.TorchBasedFeatureStore(feature_data) + feature_store2 = feature_store.to(device) + if device == "pinned": + is_feature_store_pinned(feature_store2) + elif device == "cuda": + is_feature_store_on_cuda(feature_store2) + + # The original variable should be untouched. 
+ is_feature_store_on_cpu(feature_store) + + @unittest.skipIf( F._default_context_str == "cpu", reason="Tests for pinned memory are only meaningful on GPU.", From c81ff6ad2da04fbf3c4f819744d3906108fe073c Mon Sep 17 00:00:00 2001 From: Ramon Zhou Date: Tue, 16 Jan 2024 18:20:37 +0800 Subject: [PATCH 21/22] [GraphBolt] Fix fanouts setting in rgcn example (#6959) --- examples/sampling/graphbolt/rgcn/hetero_rgcn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/sampling/graphbolt/rgcn/hetero_rgcn.py b/examples/sampling/graphbolt/rgcn/hetero_rgcn.py index 9b1b4bebcf4a..5a664a73a7d6 100644 --- a/examples/sampling/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/sampling/graphbolt/rgcn/hetero_rgcn.py @@ -430,6 +430,7 @@ def evaluate( else: evaluator = MAG240MEvaluator() + num_etype = len(g.num_edges) data_loader = create_dataloader( name, g, @@ -437,7 +438,7 @@ def evaluate( item_set, device, batch_size=4096, - fanouts=[25, 10], + fanouts=[torch.full((num_etype,), 25), torch.full((num_etype,), 10)], shuffle=False, num_workers=num_workers, ) @@ -491,6 +492,7 @@ def train( print("Start to train...") category = "paper" + num_etype = len(g.num_edges) data_loader = create_dataloader( name, g, @@ -498,7 +500,7 @@ def train( train_set, device, batch_size=1024, - fanouts=[25, 10], + fanouts=[torch.full((num_etype,), 25), torch.full((num_etype,), 10)], shuffle=True, num_workers=num_workers, ) From 053c822175b70f7202dd793713476789ec02f8c6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 16 Jan 2024 11:16:13 -0500 Subject: [PATCH 22/22] [GraphBolt][CUDA] Enable GPU sampling in examples (#6861) --- .../sampling/graphbolt/link_prediction.py | 39 ++++++++++----- .../sampling/graphbolt/node_classification.py | 48 ++++++++++++------- .../graphbolt/quickstart/link_prediction.py | 14 ++++-- .../quickstart/node_classification.py | 14 ++++-- 4 files changed, 80 insertions(+), 35 deletions(-) diff --git a/examples/sampling/graphbolt/link_prediction.py b/examples/sampling/graphbolt/link_prediction.py index 45d5e525b28d..20e169b570a5 100644 --- a/examples/sampling/graphbolt/link_prediction.py +++ b/examples/sampling/graphbolt/link_prediction.py @@ -144,6 +144,16 @@ def create_dataloader(args, graph, features, itemset, is_train=True): shuffle=is_train, ) + ############################################################################ + # [Input]: + # 'device': The device to copy the data to. + # [Output]: + # A CopyTo object to copy the data to the specified device. Copying here + # ensures that the rest of the operations run on the GPU. + ############################################################################ + if args.storage_device != "cpu": + datapipe = datapipe.copy_to(device=args.device) + ############################################################################ # [Input]: # 'args.neg_ratio': Specify the ratio of negative to positive samples. @@ -216,7 +226,8 @@ def create_dataloader(args, graph, features, itemset, is_train=True): # [Output]: # A CopyTo object to copy the data to the specified device. 
############################################################################ - datapipe = datapipe.copy_to(device=args.device) + if args.storage_device == "cpu": + datapipe = datapipe.copy_to(device=args.device) ############################################################################ # [Input]: @@ -304,11 +315,11 @@ def train(args, model, graph, features, train_set): optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) dataloader = create_dataloader(args, graph, features, train_set) - for epoch in tqdm.trange(args.epochs): + for epoch in range(args.epochs): model.train() total_loss = 0 start_epoch_time = time.time() - for step, data in enumerate(dataloader): + for step, data in tqdm.tqdm(enumerate(dataloader)): # Get node pairs with labels for loss calculation. compacted_pairs, labels = data.node_pairs_with_labels @@ -366,24 +377,30 @@ def parse_args(): help="Whether to exclude reverse edges during sampling. Default: 1", ) parser.add_argument( - "--device", - default="cpu", - choices=["cpu", "cuda"], - help="Train device: 'cpu' for CPU, 'cuda' for GPU.", + "--mode", + default="pinned-cuda", + choices=["cpu-cpu", "cpu-cuda", "pinned-cuda", "cuda-cuda"], + help="Dataset storage placement and Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", ) return parser.parse_args() def main(args): if not torch.cuda.is_available(): - args.device = "cpu" - print(f"Training in {args.device} mode.") + args.mode = "cpu-cpu" + print(f"Training in {args.mode} mode.") + args.storage_device, args.device = args.mode.split("-") + args.device = torch.device(args.device) # Load and preprocess dataset. print("Loading data") dataset = gb.BuiltinDataset("ogbl-citation2").load() - graph = dataset.graph - features = dataset.feature + + # Move the dataset to the selected storage. + graph = dataset.graph.to(args.storage_device) + features = dataset.feature.to(args.storage_device) + train_set = dataset.tasks[0].train_set args.fanout = list(map(int, args.fanout.split(","))) diff --git a/examples/sampling/graphbolt/node_classification.py b/examples/sampling/graphbolt/node_classification.py index 56c9b9ddbc08..f589e667b455 100644 --- a/examples/sampling/graphbolt/node_classification.py +++ b/examples/sampling/graphbolt/node_classification.py @@ -92,6 +92,19 @@ def create_dataloader( ############################################################################ # [Step-2]: + # self.copy_to() + # [Input]: + # 'device': The device to copy the data to. + # 'extra_attrs': The extra attributes to copy. + # [Output]: + # A CopyTo object to copy the data to the specified device. Copying here + # ensures that the rest of the operations run on the GPU. + ############################################################################ + if args.storage_device != "cpu": + datapipe = datapipe.copy_to(device=device, extra_attrs=["seed_nodes"]) + + ############################################################################ + # [Step-3]: # self.sample_neighbor() # [Input]: # 'graph': The network topology for sampling. @@ -109,7 +122,7 @@ def create_dataloader( ) ############################################################################ - # [Step-3]: + # [Step-4]: # self.fetch_feature() # [Input]: # 'features': The node features. 
@@ -125,17 +138,18 @@ def create_dataloader( datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) ############################################################################ - # [Step-4]: + # [Step-5]: # self.copy_to() # [Input]: # 'device': The device to copy the data to. # [Output]: # A CopyTo object to copy the data to the specified device. ############################################################################ - datapipe = datapipe.copy_to(device=device) + if args.storage_device == "cpu": + datapipe = datapipe.copy_to(device=device) ############################################################################ - # [Step-5]: + # [Step-6]: # gb.DataLoader() # [Input]: # 'datapipe': The datapipe object to be used for data loading. @@ -259,7 +273,7 @@ def evaluate(args, model, graph, features, itemset, num_classes): job="evaluate", ) - for step, data in tqdm(enumerate(dataloader)): + for step, data in tqdm(enumerate(dataloader), "Evaluating"): x = data.node_features["feat"] y.append(data.labels) y_hats.append(model(data.blocks, x)) @@ -289,7 +303,7 @@ def train(args, graph, features, train_set, valid_set, num_classes, model): t0 = time.time() model.train() total_loss = 0 - for step, data in enumerate(dataloader): + for step, data in tqdm(enumerate(dataloader), "Training"): # The input features from the source nodes in the first layer's # computation graph. x = data.node_features["feat"] @@ -349,28 +363,30 @@ def parse_args(): " identical with the number of layers in your model. Default: 10,10,10", ) parser.add_argument( - "--device", - default="cpu", - choices=["cpu", "cuda"], - help="Train device: 'cpu' for CPU, 'cuda' for GPU.", + "--mode", + default="pinned-cuda", + choices=["cpu-cpu", "cpu-cuda", "pinned-cuda", "cuda-cuda"], + help="Dataset storage placement and Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", ) return parser.parse_args() def main(args): if not torch.cuda.is_available(): - args.device = "cpu" - print(f"Training in {args.device} mode.") + args.mode = "cpu-cpu" + print(f"Training in {args.mode} mode.") + args.storage_device, args.device = args.mode.split("-") args.device = torch.device(args.device) # Load and preprocess dataset. print("Loading data...") dataset = gb.BuiltinDataset("ogbn-products").load() - graph = dataset.graph - # Currently the neighbor-sampling process can only be done on the CPU, - # therefore there is no need to copy the graph to the GPU. - features = dataset.feature + # Move the dataset to the selected storage. + graph = dataset.graph.to(args.storage_device) + features = dataset.feature.to(args.storage_device) + train_set = dataset.tasks[0].train_set valid_set = dataset.tasks[0].validation_set test_set = dataset.tasks[0].test_set diff --git a/examples/sampling/graphbolt/quickstart/link_prediction.py b/examples/sampling/graphbolt/quickstart/link_prediction.py index 03d15446fed3..350c541ca92d 100644 --- a/examples/sampling/graphbolt/quickstart/link_prediction.py +++ b/examples/sampling/graphbolt/quickstart/link_prediction.py @@ -18,7 +18,7 @@ ############################################################################ # (HIGHLIGHT) Create a single process dataloader with dgl graphbolt package. ############################################################################ -def create_dataloader(dateset, device, is_train=True): +def create_dataloader(dataset, device, is_train=True): # The second of two tasks in the dataset is link prediction. 
task = dataset.tasks[1] itemset = task.train_set if is_train else task.test_set @@ -26,6 +26,9 @@ def create_dataloader(dateset, device, is_train=True): # Sample seed edges from the itemset. datapipe = gb.ItemSampler(itemset, batch_size=256) + # Copy the mini-batch to the designated device for sampling and training. + datapipe = datapipe.copy_to(device) + if is_train: # Sample negative edges for the seed edges. datapipe = datapipe.sample_uniform_negative( @@ -47,9 +50,6 @@ def create_dataloader(dateset, device, is_train=True): dataset.feature, node_feature_keys=["feat"] ) - # Copy the mini-batch to the designated device for training. - datapipe = datapipe.copy_to(device) - # Initiate the dataloader for the datapipe. return gb.DataLoader(datapipe) @@ -158,6 +158,12 @@ def train(model, dataset, device): print("Loading data...") dataset = gb.BuiltinDataset("cora").load() + # If a CUDA device is selected, we pin the graph and the features so that + # the GPU can access them. + if device == torch.device("cuda:0"): + dataset.graph.pin_memory_() + dataset.feature.pin_memory_() + in_size = dataset.feature.size("node", None, "feat")[0] model = GraphSAGE(in_size).to(device) diff --git a/examples/sampling/graphbolt/quickstart/node_classification.py b/examples/sampling/graphbolt/quickstart/node_classification.py index 823d08d5b447..0c824e880111 100644 --- a/examples/sampling/graphbolt/quickstart/node_classification.py +++ b/examples/sampling/graphbolt/quickstart/node_classification.py @@ -13,10 +13,13 @@ ############################################################################ # (HIGHLIGHT) Create a single process dataloader with dgl graphbolt package. ############################################################################ -def create_dataloader(dateset, itemset, device): +def create_dataloader(dataset, itemset, device): # Sample seed nodes from the itemset. datapipe = gb.ItemSampler(itemset, batch_size=16) + # Copy the mini-batch to the designated device for sampling and training. + datapipe = datapipe.copy_to(device, extra_attrs=["seed_nodes"]) + # Sample neighbors for the seed nodes. datapipe = datapipe.sample_neighbor(dataset.graph, fanouts=[4, 2]) @@ -25,9 +28,6 @@ def create_dataloader(dateset, itemset, device): dataset.feature, node_feature_keys=["feat"] ) - # Copy the mini-batch to the designated device for training. - datapipe = datapipe.copy_to(device) - # Initiate the dataloader for the datapipe. return gb.DataLoader(datapipe) @@ -119,6 +119,12 @@ def train(model, dataset, device): print("Loading data...") dataset = gb.BuiltinDataset("cora").load() + # If a CUDA device is selected, we pin the graph and the features so that + # the GPU can access them. + if device == torch.device("cuda:0"): + dataset.graph.pin_memory_() + dataset.feature.pin_memory_() + in_size = dataset.feature.size("node", None, "feat")[0] out_size = dataset.tasks[0].metadata["num_classes"] model = GCN(in_size, out_size).to(device)
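Taken together, the example updates in this last patch follow a single pattern: place the graph and features in GPU-accessible memory once (`.to("pinned")`, `.to("cuda")`, or the in-place `pin_memory_()`), then move `copy_to` in front of neighbor sampling whenever storage is not plain CPU, so that sampling and feature fetching also run on the device. A condensed sketch of that branching (batch size, fanouts, and feature keys are placeholders):

```python
import dgl.graphbolt as gb

def build_dataloader(itemset, graph, features, storage_device, device):
    datapipe = gb.ItemSampler(itemset, batch_size=1024, shuffle=True)
    if storage_device != "cpu":
        # Graph/features are already GPU-accessible: move the seeds first so
        # neighbor sampling and feature fetching both execute on `device`.
        datapipe = datapipe.copy_to(device=device, extra_attrs=["seed_nodes"])
    datapipe = datapipe.sample_neighbor(graph, [10, 10])
    datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
    if storage_device == "cpu":
        # CPU sampling path: only the finished mini-batch goes to `device`.
        datapipe = datapipe.copy_to(device=device)
    return gb.DataLoader(datapipe)
```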