From df6b3250f456d6512dadf622b44dee6a6ba105e1 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 1 Feb 2024 09:39:50 +0800 Subject: [PATCH 01/45] [DistGB] enable DistGraph to load graphbolt partitions (#7048) --- python/dgl/distributed/dist_graph.py | 40 +++- tests/distributed/test_dist_graph_store.py | 211 ++++++++++++++++----- 2 files changed, 189 insertions(+), 62 deletions(-) diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index 4defa3223937..5bf76498ec97 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -60,18 +60,21 @@ class InitGraphRequest(rpc.Request): with shared memory. """ - def __init__(self, graph_name): + def __init__(self, graph_name, use_graphbolt): self._graph_name = graph_name + self._use_graphbolt = use_graphbolt def __getstate__(self): - return self._graph_name + return self._graph_name, self._use_graphbolt def __setstate__(self, state): - self._graph_name = state + self._graph_name, self._use_graphbolt = state def process_request(self, server_state): if server_state.graph is None: - server_state.graph = _get_graph_from_shared_mem(self._graph_name) + server_state.graph = _get_graph_from_shared_mem( + self._graph_name, self._use_graphbolt + ) return InitGraphResponse(self._graph_name) @@ -153,13 +156,15 @@ def _exist_shared_mem_array(graph_name, name): return exist_shared_mem_array(_get_edata_path(graph_name, name)) -def _get_graph_from_shared_mem(graph_name): +def _get_graph_from_shared_mem(graph_name, use_graphbolt): """Get the graph from the DistGraph server. The DistGraph server puts the graph structure of the local partition in the shared memory. The client can access the graph structure and some metadata on nodes and edges directly through shared memory to reduce the overhead of data access. """ + if use_graphbolt: + return gb.load_from_shared_memory(graph_name) g, ntypes, etypes = heterograph_index.create_heterograph_from_shared_memory( graph_name ) @@ -524,6 +529,8 @@ class DistGraph: part_config : str, optional The path of partition configuration file generated by :py:meth:`dgl.distributed.partition.partition_graph`. It's used in the standalone mode. + use_graphbolt : bool, optional + Whether to load GraphBolt partition. Default: False. Examples -------- @@ -557,9 +564,15 @@ class DistGraph: manually setting up servers and trainers. The setup is not fully tested yet. """ - def __init__(self, graph_name, gpb=None, part_config=None): + def __init__( + self, graph_name, gpb=None, part_config=None, use_graphbolt=False + ): self.graph_name = graph_name + self._use_graphbolt = use_graphbolt if os.environ.get("DGL_DIST_MODE", "standalone") == "standalone": + assert ( + use_graphbolt is False + ), "GraphBolt is not supported in standalone mode." assert ( part_config is not None ), "When running in the standalone model, the partition config file is required" @@ -600,7 +613,9 @@ def __init__(self, graph_name, gpb=None, part_config=None): self._init(gpb) # Tell the backup servers to load the graph structure from shared memory. for server_id in range(self._client.num_servers): - rpc.send_request(server_id, InitGraphRequest(graph_name)) + rpc.send_request( + server_id, InitGraphRequest(graph_name, use_graphbolt) + ) for server_id in range(self._client.num_servers): rpc.recv_response() self._client.barrier() @@ -625,7 +640,9 @@ def _init(self, gpb): assert ( self._client is not None ), "Distributed module is not initialized. 
Please call dgl.distributed.initialize." - self._g = _get_graph_from_shared_mem(self.graph_name) + self._g = _get_graph_from_shared_mem( + self.graph_name, self._use_graphbolt + ) self._gpb = get_shared_mem_partition_book(self.graph_name) if self._gpb is None: self._gpb = gpb @@ -682,10 +699,10 @@ def _init_edata_store(self): self._edata_store[etype] = data def __getstate__(self): - return self.graph_name, self._gpb + return self.graph_name, self._gpb, self._use_graphbolt def __setstate__(self, state): - self.graph_name, gpb = state + self.graph_name, gpb, self._use_graphbolt = state self._init(gpb) self._init_ndata_store() @@ -1230,6 +1247,9 @@ def find_edges(self, edges, etype=None): tensor The destination node ID array. """ + assert ( + self._use_graphbolt is False + ), "find_edges is not supported in GraphBolt." if etype is None: assert ( len(self.etypes) == 1 diff --git a/tests/distributed/test_dist_graph_store.py b/tests/distributed/test_dist_graph_store.py index b473ef163215..63b70c3cd2be 100644 --- a/tests/distributed/test_dist_graph_store.py +++ b/tests/distributed/test_dist_graph_store.py @@ -13,11 +13,13 @@ import backend as F import dgl +import dgl.graphbolt as gb import numpy as np import pytest import torch as th from dgl.data.utils import load_graphs, save_graphs from dgl.distributed import ( + dgl_partition_to_graphbolt, DistEmbedding, DistGraph, DistGraphServer, @@ -38,12 +40,33 @@ import struct +def _verify_dist_graph_server_dgl(g): + # verify dtype of underlying graph + cg = g.client_g + for k, dtype in dgl.distributed.dist_graph.RESERVED_FIELD_DTYPE.items(): + if k in cg.ndata: + assert ( + F.dtype(cg.ndata[k]) == dtype + ), "Data type of {} in ndata should be {}.".format(k, dtype) + if k in cg.edata: + assert ( + F.dtype(cg.edata[k]) == dtype + ), "Data type of {} in edata should be {}.".format(k, dtype) + + +def _verify_dist_graph_server_graphbolt(g): + graph = g.client_g + assert isinstance(graph, gb.FusedCSCSamplingGraph) + # [Rui][TODO] verify dtype of underlying graph. 
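Taken together, the client-side change in this patch reduces to one new flag on DistGraph. A minimal sketch of the flow, assuming a running server and already-converted GraphBolt partitions; the file paths and partition ID are placeholders:

    import dgl
    from dgl.distributed import DistGraph, load_partition_book

    # Connect this process to the distributed runtime.
    dgl.distributed.initialize("ip_config.txt")

    # Load the partition book for partition 0, then attach to the GraphBolt
    # (FusedCSCSamplingGraph) structure the server placed in shared memory.
    gpb, graph_name, _, _ = load_partition_book("/tmp/dist_graph/graph.json", 0)
    g = DistGraph(graph_name, gpb=gpb, use_graphbolt=True)

APIs that depend on find_edges, such as g.edge_subgraph(...), raise an AssertionError under GraphBolt, which the tests below check with pytest.raises.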
+ + def run_server( graph_name, server_id, server_count, num_clients, shared_mem, + use_graphbolt=False, ): g = DistGraphServer( server_id, @@ -53,19 +76,15 @@ def run_server( "/tmp/dist_graph/{}.json".format(graph_name), disable_shared_mem=not shared_mem, graph_format=["csc", "coo"], + use_graphbolt=use_graphbolt, ) - print("start server", server_id) - # verify dtype of underlying graph - cg = g.client_g - for k, dtype in dgl.distributed.dist_graph.RESERVED_FIELD_DTYPE.items(): - if k in cg.ndata: - assert ( - F.dtype(cg.ndata[k]) == dtype - ), "Data type of {} in ndata should be {}.".format(k, dtype) - if k in cg.edata: - assert ( - F.dtype(cg.edata[k]) == dtype - ), "Data type of {} in edata should be {}.".format(k, dtype) + print(f"Starting server[{server_id}] with use_graphbolt={use_graphbolt}") + _verify = ( + _verify_dist_graph_server_graphbolt + if use_graphbolt + else _verify_dist_graph_server_dgl + ) + _verify(g) g.start() @@ -110,18 +129,26 @@ def check_dist_graph_empty(g, num_clients, num_nodes, num_edges): def run_client_empty( - graph_name, part_id, server_count, num_clients, num_nodes, num_edges + graph_name, + part_id, + server_count, + num_clients, + num_nodes, + num_edges, + use_graphbolt=False, ): os.environ["DGL_NUM_SERVER"] = str(server_count) dgl.distributed.initialize("kv_ip_config.txt") gpb, graph_name, _, _ = load_partition_book( "/tmp/dist_graph/{}.json".format(graph_name), part_id ) - g = DistGraph(graph_name, gpb=gpb) + g = DistGraph(graph_name, gpb=gpb, use_graphbolt=use_graphbolt) check_dist_graph_empty(g, num_clients, num_nodes, num_edges) -def check_server_client_empty(shared_mem, num_servers, num_clients): +def check_server_client_empty( + shared_mem, num_servers, num_clients, use_graphbolt=False +): prepare_dist(num_servers) g = create_random_graph(10000) @@ -129,6 +156,9 @@ def check_server_client_empty(shared_mem, num_servers, num_clients): num_parts = 1 graph_name = "dist_graph_test_1" partition_graph(g, graph_name, num_parts, "/tmp/dist_graph") + if use_graphbolt: + part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") + dgl_partition_to_graphbolt(part_config) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. 
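The conversion step used by this helper also works standalone on partitions produced earlier (patch 02 below folds it into partition_graph itself). A sketch with illustrative paths:

    import os
    from dgl.distributed import dgl_partition_to_graphbolt, partition_graph

    # Produce regular DGL partitions first ...
    partition_graph(g, "dist_graph_test_1", 1, "/tmp/dist_graph")

    # ... then rewrite each partition's graph as a GraphBolt
    # FusedCSCSamplingGraph, driven by the partition config file.
    part_config = os.path.join("/tmp/dist_graph", "dist_graph_test_1.json")
    dgl_partition_to_graphbolt(part_config)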
@@ -137,7 +167,14 @@ def check_server_client_empty(shared_mem, num_servers, num_clients): for serv_id in range(num_servers): p = ctx.Process( target=run_server, - args=(graph_name, serv_id, num_servers, num_clients, shared_mem), + args=( + graph_name, + serv_id, + num_servers, + num_clients, + shared_mem, + use_graphbolt, + ), ) serv_ps.append(p) p.start() @@ -154,6 +191,7 @@ def check_server_client_empty(shared_mem, num_servers, num_clients): num_clients, g.num_nodes(), g.num_edges(), + use_graphbolt, ), ) p.start() @@ -178,6 +216,7 @@ def run_client( num_nodes, num_edges, group_id, + use_graphbolt=False, ): os.environ["DGL_NUM_SERVER"] = str(server_count) os.environ["DGL_GROUP_ID"] = str(group_id) @@ -185,8 +224,10 @@ def run_client( gpb, graph_name, _, _ = load_partition_book( "/tmp/dist_graph/{}.json".format(graph_name), part_id ) - g = DistGraph(graph_name, gpb=gpb) - check_dist_graph(g, num_clients, num_nodes, num_edges) + g = DistGraph(graph_name, gpb=gpb, use_graphbolt=use_graphbolt) + check_dist_graph( + g, num_clients, num_nodes, num_edges, use_graphbolt=use_graphbolt + ) def run_emb_client( @@ -270,14 +311,20 @@ def check_dist_optim_store(rank, num_nodes, optimizer_states, save): def run_client_hierarchy( - graph_name, part_id, server_count, node_mask, edge_mask, return_dict + graph_name, + part_id, + server_count, + node_mask, + edge_mask, + return_dict, + use_graphbolt=False, ): os.environ["DGL_NUM_SERVER"] = str(server_count) dgl.distributed.initialize("kv_ip_config.txt") gpb, graph_name, _, _ = load_partition_book( "/tmp/dist_graph/{}.json".format(graph_name), part_id ) - g = DistGraph(graph_name, gpb=gpb) + g = DistGraph(graph_name, gpb=gpb, use_graphbolt=use_graphbolt) node_mask = F.tensor(node_mask) edge_mask = F.tensor(edge_mask) nodes = node_split( @@ -355,7 +402,7 @@ def check_dist_emb(g, num_clients, num_nodes, num_edges): sys.exit(-1) -def check_dist_graph(g, num_clients, num_nodes, num_edges): +def check_dist_graph(g, num_clients, num_nodes, num_edges, use_graphbolt=False): # Test API assert g.num_nodes() == num_nodes assert g.num_edges() == num_edges @@ -373,9 +420,15 @@ def check_dist_graph(g, num_clients, num_nodes, num_edges): assert np.all(F.asnumpy(feats == eids)) # Test edge_subgraph - sg = g.edge_subgraph(eids) - assert sg.num_edges() == len(eids) - assert F.array_equal(sg.edata[dgl.EID], eids) + if use_graphbolt: + with pytest.raises( + AssertionError, match="find_edges is not supported in GraphBolt." + ): + g.edge_subgraph(eids) + else: + sg = g.edge_subgraph(eids) + assert sg.num_edges() == len(eids) + assert F.array_equal(sg.edata[dgl.EID], eids) # Test init node data new_shape = (g.num_nodes(), 2) @@ -522,7 +575,9 @@ def check_dist_emb_server_client( print("clients have terminated") -def check_server_client(shared_mem, num_servers, num_clients, num_groups=1): +def check_server_client( + shared_mem, num_servers, num_clients, num_groups=1, use_graphbolt=False +): prepare_dist(num_servers) g = create_random_graph(10000) @@ -532,6 +587,9 @@ def check_server_client(shared_mem, num_servers, num_clients, num_groups=1): g.ndata["features"] = F.unsqueeze(F.arange(0, g.num_nodes()), 1) g.edata["features"] = F.unsqueeze(F.arange(0, g.num_edges()), 1) partition_graph(g, graph_name, num_parts, "/tmp/dist_graph") + if use_graphbolt: + part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") + dgl_partition_to_graphbolt(part_config) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. 
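The server half of these tests is run_server above; stripped down, bringing up one GraphBolt-backed server looks like the following sketch (all values are placeholders, and the converted partitions are assumed to exist on disk):

    from dgl.distributed import DistGraphServer

    serv = DistGraphServer(
        0,                      # server_id
        "ip_config.txt",        # IP config shared with the clients
        1,                      # number of servers per machine
        1,                      # total number of clients to expect
        "/tmp/dist_graph/graph.json",
        disable_shared_mem=False,   # keep the partition in shared memory
        graph_format=["csc", "coo"],
        use_graphbolt=True,         # serve the FusedCSCSamplingGraph partition
    )
    serv.start()  # serves requests until the clients shut down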
@@ -546,6 +604,7 @@ def check_server_client(shared_mem, num_servers, num_clients, num_groups=1): num_servers, num_clients, shared_mem, + use_graphbolt, ), ) serv_ps.append(p) @@ -566,6 +625,7 @@ def check_server_client(shared_mem, num_servers, num_clients, num_groups=1): g.num_nodes(), g.num_edges(), group_id, + use_graphbolt, ), ) p.start() @@ -582,7 +642,12 @@ def check_server_client(shared_mem, num_servers, num_clients, num_groups=1): print("clients have terminated") -def check_server_client_hierarchy(shared_mem, num_servers, num_clients): +def check_server_client_hierarchy( + shared_mem, num_servers, num_clients, use_graphbolt=False +): + if num_clients == 1: + # skip this test if there is only one client. + return prepare_dist(num_servers) g = create_random_graph(10000) @@ -598,6 +663,9 @@ def check_server_client_hierarchy(shared_mem, num_servers, num_clients): "/tmp/dist_graph", num_trainers_per_machine=num_clients, ) + if use_graphbolt: + part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") + dgl_partition_to_graphbolt(part_config) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. @@ -606,7 +674,14 @@ def check_server_client_hierarchy(shared_mem, num_servers, num_clients): for serv_id in range(num_servers): p = ctx.Process( target=run_server, - args=(graph_name, serv_id, num_servers, num_clients, shared_mem), + args=( + graph_name, + serv_id, + num_servers, + num_clients, + shared_mem, + use_graphbolt, + ), ) serv_ps.append(p) p.start() @@ -633,6 +708,7 @@ def check_server_client_hierarchy(shared_mem, num_servers, num_clients): node_mask, edge_mask, return_dict, + use_graphbolt, ), ) p.start() @@ -658,15 +734,23 @@ def check_server_client_hierarchy(shared_mem, num_servers, num_clients): def run_client_hetero( - graph_name, part_id, server_count, num_clients, num_nodes, num_edges + graph_name, + part_id, + server_count, + num_clients, + num_nodes, + num_edges, + use_graphbolt=False, ): os.environ["DGL_NUM_SERVER"] = str(server_count) dgl.distributed.initialize("kv_ip_config.txt") gpb, graph_name, _, _ = load_partition_book( "/tmp/dist_graph/{}.json".format(graph_name), part_id ) - g = DistGraph(graph_name, gpb=gpb) - check_dist_graph_hetero(g, num_clients, num_nodes, num_edges) + g = DistGraph(graph_name, gpb=gpb, use_graphbolt=use_graphbolt) + check_dist_graph_hetero( + g, num_clients, num_nodes, num_edges, use_graphbolt=use_graphbolt + ) def create_random_hetero(): @@ -701,7 +785,9 @@ def create_random_hetero(): return g -def check_dist_graph_hetero(g, num_clients, num_nodes, num_edges): +def check_dist_graph_hetero( + g, num_clients, num_nodes, num_edges, use_graphbolt=False +): # Test API for ntype in num_nodes: assert ntype in g.ntypes @@ -754,12 +840,18 @@ def check_dist_graph_hetero(g, num_clients, num_nodes, num_edges): assert expect_except # Test edge_subgraph - sg = g.edge_subgraph({"r1": eids}) - assert sg.num_edges() == len(eids) - assert F.array_equal(sg.edata[dgl.EID], eids) - sg = g.edge_subgraph({("n1", "r1", "n2"): eids}) - assert sg.num_edges() == len(eids) - assert F.array_equal(sg.edata[dgl.EID], eids) + if use_graphbolt: + with pytest.raises( + AssertionError, match="find_edges is not supported in GraphBolt." 
+ ): + g.edge_subgraph({"r1": eids}) + else: + sg = g.edge_subgraph({"r1": eids}) + assert sg.num_edges() == len(eids) + assert F.array_equal(sg.edata[dgl.EID], eids) + sg = g.edge_subgraph({("n1", "r1", "n2"): eids}) + assert sg.num_edges() == len(eids) + assert F.array_equal(sg.edata[dgl.EID], eids) # Test init node data new_shape = (g.num_nodes("n1"), 2) @@ -827,7 +919,9 @@ def check_dist_graph_hetero(g, num_clients, num_nodes, num_edges): print("end") -def check_server_client_hetero(shared_mem, num_servers, num_clients): +def check_server_client_hetero( + shared_mem, num_servers, num_clients, use_graphbolt=False +): prepare_dist(num_servers) g = create_random_hetero() @@ -835,6 +929,9 @@ def check_server_client_hetero(shared_mem, num_servers, num_clients): num_parts = 1 graph_name = "dist_graph_test_3" partition_graph(g, graph_name, num_parts, "/tmp/dist_graph") + if use_graphbolt: + part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") + dgl_partition_to_graphbolt(part_config) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. @@ -843,7 +940,14 @@ def check_server_client_hetero(shared_mem, num_servers, num_clients): for serv_id in range(num_servers): p = ctx.Process( target=run_server, - args=(graph_name, serv_id, num_servers, num_clients, shared_mem), + args=( + graph_name, + serv_id, + num_servers, + num_clients, + shared_mem, + use_graphbolt, + ), ) serv_ps.append(p) p.start() @@ -862,6 +966,7 @@ def check_server_client_hetero(shared_mem, num_servers, num_clients): num_clients, num_nodes, num_edges, + use_graphbolt, ), ) p.start() @@ -886,21 +991,23 @@ def check_server_client_hetero(shared_mem, num_servers, num_clients): @unittest.skipIf( dgl.backend.backend_name == "mxnet", reason="Turn off Mxnet support" ) -def test_server_client(): +@pytest.mark.parametrize("shared_mem", [True]) +@pytest.mark.parametrize("num_servers", [1]) +@pytest.mark.parametrize("num_clients", [1, 4]) +@pytest.mark.parametrize("use_graphbolt", [True, False]) +def test_server_client(shared_mem, num_servers, num_clients, use_graphbolt): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" - check_server_client_hierarchy(False, 1, 4) - check_server_client_empty(True, 1, 1) - check_server_client_hetero(True, 1, 1) - check_server_client_hetero(False, 1, 1) - check_server_client(True, 1, 1) - check_server_client(False, 1, 1) - # [TODO][Rhett] Tests for multiple groups may fail sometimes and - # root cause is unknown. Let's disable them for now. - # check_server_client(True, 2, 2) - # check_server_client(True, 1, 1, 2) - # check_server_client(False, 1, 1, 2) - # check_server_client(True, 2, 2, 2) + # [Rui] + # 1. `disable_shared_mem=False` is not supported yet. Skip it. + # 2. `num_servers` > 1 does not work on single machine. Skip it. 
+ for func in [ + check_server_client, + check_server_client_hetero, + check_server_client_empty, + check_server_client_hierarchy, + ]: + func(shared_mem, num_servers, num_clients, use_graphbolt=use_graphbolt) @unittest.skip(reason="Skip due to glitch in CI") From 571340dac63d3a09e5d66d45244f9f13bb175d00 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:37:43 +0800 Subject: [PATCH 02/45] [DistGB] update partition_graph to support graphbolt (#7053) --- python/dgl/distributed/partition.py | 15 +- tests/distributed/test_dist_graph_store.py | 26 ++-- tests/distributed/test_partition.py | 162 +++++++++++++++++++++ 3 files changed, 186 insertions(+), 17 deletions(-) diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 028935086d48..6928d24da534 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -638,6 +638,8 @@ def partition_graph( num_trainers_per_machine=1, objtype="cut", graph_formats=None, + use_graphbolt=False, + **kwargs, ): """Partition a graph for distributed training and store the partitions on files. @@ -811,6 +813,10 @@ def partition_graph( ``csc`` and ``csr``. If not specified, save one format only according to what format is available. If multiple formats are available, selection priority from high to low is ``coo``, ``csc``, ``csr``. + use_graphbolt : bool, optional + Whether to save partitions in GraphBolt format. Default: False. + kwargs : dict + Other keyword arguments for converting DGL partitions to GraphBolt. Returns ------- @@ -1298,7 +1304,8 @@ def get_homogeneous(g, balance_ntypes): ) ) - _dump_part_config(f"{out_path}/{graph_name}.json", part_metadata) + part_config = os.path.join(out_path, graph_name + ".json") + _dump_part_config(part_config, part_metadata) num_cuts = sim_g.num_edges() - tot_num_inner_edges if num_parts == 1: @@ -1309,6 +1316,12 @@ def get_homogeneous(g, balance_ntypes): ) ) + if use_graphbolt: + dgl_partition_to_graphbolt( + part_config, + **kwargs, + ) + if return_mapping: return orig_nids, orig_eids diff --git a/tests/distributed/test_dist_graph_store.py b/tests/distributed/test_dist_graph_store.py index 63b70c3cd2be..8ba98ef4f47c 100644 --- a/tests/distributed/test_dist_graph_store.py +++ b/tests/distributed/test_dist_graph_store.py @@ -19,7 +19,6 @@ import torch as th from dgl.data.utils import load_graphs, save_graphs from dgl.distributed import ( - dgl_partition_to_graphbolt, DistEmbedding, DistGraph, DistGraphServer, @@ -155,10 +154,9 @@ def check_server_client_empty( # Partition the graph num_parts = 1 graph_name = "dist_graph_test_1" - partition_graph(g, graph_name, num_parts, "/tmp/dist_graph") - if use_graphbolt: - part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") - dgl_partition_to_graphbolt(part_config) + partition_graph( + g, graph_name, num_parts, "/tmp/dist_graph", use_graphbolt=use_graphbolt + ) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. 
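With this patch the explicit conversion call becomes unnecessary: partition_graph performs it when use_graphbolt=True, forwarding any extra keyword arguments to dgl_partition_to_graphbolt. A sketch with illustrative values (the store_* flags are exercised by the new tests below):

    from dgl.distributed import partition_graph

    partition_graph(
        g,
        "test",
        4,                      # num_parts
        "/tmp/test_dir",        # out_path
        part_method="metis",
        use_graphbolt=True,
        # Forwarded to dgl_partition_to_graphbolt():
        store_eids=True,        # keep original edge IDs on the sampled graph
        store_inner_node=True,  # keep the "inner_node" partition membership
        store_inner_edge=True,  # keep the "inner_edge" partition membership
    )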
@@ -586,10 +584,9 @@ def check_server_client( graph_name = f"check_server_client_{shared_mem}_{num_servers}_{num_clients}_{num_groups}" g.ndata["features"] = F.unsqueeze(F.arange(0, g.num_nodes()), 1) g.edata["features"] = F.unsqueeze(F.arange(0, g.num_edges()), 1) - partition_graph(g, graph_name, num_parts, "/tmp/dist_graph") - if use_graphbolt: - part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") - dgl_partition_to_graphbolt(part_config) + partition_graph( + g, graph_name, num_parts, "/tmp/dist_graph", use_graphbolt=use_graphbolt + ) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. @@ -662,10 +659,8 @@ def check_server_client_hierarchy( num_parts, "/tmp/dist_graph", num_trainers_per_machine=num_clients, + use_graphbolt=use_graphbolt, ) - if use_graphbolt: - part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") - dgl_partition_to_graphbolt(part_config) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. @@ -928,10 +923,9 @@ def check_server_client_hetero( # Partition the graph num_parts = 1 graph_name = "dist_graph_test_3" - partition_graph(g, graph_name, num_parts, "/tmp/dist_graph") - if use_graphbolt: - part_config = os.path.join("/tmp/dist_graph", f"{graph_name}.json") - dgl_partition_to_graphbolt(part_config) + partition_graph( + g, graph_name, num_parts, "/tmp/dist_graph", use_graphbolt=use_graphbolt + ) # let's just test on one partition for now. # We cannot run multiple servers and clients on the same machine. diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py index 6b2df3fdc038..30a85f7df025 100644 --- a/tests/distributed/test_partition.py +++ b/tests/distributed/test_partition.py @@ -944,3 +944,165 @@ def test_not_sorted_node_edge_map(): gpb, _, _, _ = load_partition_book(part_config, 1) assert gpb.local_ntype_offset == [0, 300, 700] assert gpb.local_etype_offset == [0, 500, 1100, 1800, 2600] + + +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("store_eids", [True, False]) +@pytest.mark.parametrize("store_inner_node", [True, False]) +@pytest.mark.parametrize("store_inner_edge", [True, False]) +@pytest.mark.parametrize("debug_mode", [True, False]) +def test_partition_graph_graphbolt_homo( + part_method, + num_parts, + store_eids, + store_inner_node, + store_inner_edge, + debug_mode, +): + reset_envs() + if debug_mode: + os.environ["DGL_DIST_DEBUG"] = "1" + with tempfile.TemporaryDirectory() as test_dir: + g = create_random_graph(1000) + graph_name = "test" + partition_graph( + g, + graph_name, + num_parts, + test_dir, + part_method=part_method, + use_graphbolt=True, + store_eids=store_eids, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + ) + part_config = os.path.join(test_dir, f"{graph_name}.json") + for part_id in range(num_parts): + orig_g = dgl.load_graphs( + os.path.join(test_dir, f"part{part_id}/graph.dgl") + )[0][0] + new_g = load_partition( + part_config, part_id, load_feats=False, use_graphbolt=True + )[0] + orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() + assert th.equal(orig_indptr, new_g.csc_indptr) + assert th.equal(orig_indices, new_g.indices) + assert new_g.node_type_offset is None + assert th.equal( + orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] + ) + if store_inner_node or debug_mode: + assert th.equal( + orig_g.ndata["inner_node"], + 
new_g.node_attributes["inner_node"], + ) + else: + assert "inner_node" not in new_g.node_attributes + if store_eids or debug_mode: + assert th.equal( + orig_g.edata[dgl.EID][orig_eids], + new_g.edge_attributes[dgl.EID], + ) + else: + assert dgl.EID not in new_g.edge_attributes + if store_inner_edge or debug_mode: + assert th.equal( + orig_g.edata["inner_edge"][orig_eids], + new_g.edge_attributes["inner_edge"], + ) + else: + assert "inner_edge" not in new_g.edge_attributes + assert new_g.type_per_edge is None + assert new_g.node_type_to_id is None + assert new_g.edge_type_to_id is None + + +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("store_eids", [True, False]) +@pytest.mark.parametrize("store_inner_node", [True, False]) +@pytest.mark.parametrize("store_inner_edge", [True, False]) +@pytest.mark.parametrize("debug_mode", [True, False]) +def test_partition_graph_graphbolt_hetero( + part_method, + num_parts, + store_eids, + store_inner_node, + store_inner_edge, + debug_mode, +): + reset_envs() + if debug_mode: + os.environ["DGL_DIST_DEBUG"] = "1" + with tempfile.TemporaryDirectory() as test_dir: + g = create_random_hetero() + graph_name = "test" + partition_graph( + g, + graph_name, + num_parts, + test_dir, + part_method=part_method, + use_graphbolt=True, + store_eids=store_eids, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + ) + part_config = os.path.join(test_dir, f"{graph_name}.json") + for part_id in range(num_parts): + orig_g = dgl.load_graphs( + os.path.join(test_dir, f"part{part_id}/graph.dgl") + )[0][0] + new_g = load_partition( + part_config, part_id, load_feats=False, use_graphbolt=True + )[0] + orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() + assert th.equal(orig_indptr, new_g.csc_indptr) + assert th.equal(orig_indices, new_g.indices) + assert th.equal( + orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] + ) + if store_inner_node or debug_mode: + assert th.equal( + orig_g.ndata["inner_node"], + new_g.node_attributes["inner_node"], + ) + else: + assert "inner_node" not in new_g.node_attributes + if debug_mode: + assert th.equal( + orig_g.ndata[dgl.NTYPE], new_g.node_attributes[dgl.NTYPE] + ) + else: + assert dgl.NTYPE not in new_g.node_attributes + if store_eids or debug_mode: + assert th.equal( + orig_g.edata[dgl.EID][orig_eids], + new_g.edge_attributes[dgl.EID], + ) + else: + assert dgl.EID not in new_g.edge_attributes + if store_inner_edge or debug_mode: + assert th.equal( + orig_g.edata["inner_edge"], + new_g.edge_attributes["inner_edge"], + ) + else: + assert "inner_edge" not in new_g.edge_attributes + if debug_mode: + assert th.equal( + orig_g.edata[dgl.ETYPE][orig_eids], + new_g.edge_attributes[dgl.ETYPE], + ) + else: + assert dgl.ETYPE not in new_g.edge_attributes + assert th.equal( + orig_g.edata[dgl.ETYPE][orig_eids], new_g.type_per_edge + ) + + for node_type, type_id in new_g.node_type_to_id.items(): + assert g.get_ntype_id(node_type) == type_id + for edge_type, type_id in new_g.edge_type_to_id.items(): + assert g.get_etype_id(_etype_str_to_tuple(edge_type)) == type_id + assert new_g.node_type_offset is None From e117adac250db0052ba2a56d5b654dcf381631cd Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:41:05 +0800 Subject: [PATCH 03/45] [GraphBolt] fix testcases on warning messages (#7054) --- tests/python/pytorch/graphbolt/test_base.py | 8 +++++--- 
.../pytorch/graphbolt/test_feature_fetcher.py | 10 ++++++---- .../python/pytorch/graphbolt/test_item_sampler.py | 15 +++++++++++---- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py index 34c57c469ef6..b25b28166294 100644 --- a/tests/python/pytorch/graphbolt/test_base.py +++ b/tests/python/pytorch/graphbolt/test_base.py @@ -13,17 +13,19 @@ @unittest.skipIf(F._default_context_str == "cpu", "CopyTo needs GPU to test") def test_CopyTo(): - item_sampler = gb.ItemSampler(gb.ItemSet(torch.randn(20)), 4) + item_sampler = gb.ItemSampler( + gb.ItemSet(torch.arange(20), names="seed_nodes"), 4 + ) # Invoke CopyTo via class constructor. dp = gb.CopyTo(item_sampler, "cuda") for data in dp: - assert data.device.type == "cuda" + assert data.seed_nodes.device.type == "cuda" # Invoke CopyTo via functional form. dp = item_sampler.copy_to("cuda") for data in dp: - assert data.device.type == "cuda" + assert data.seed_nodes.device.type == "cuda" @pytest.mark.parametrize( diff --git a/tests/python/pytorch/graphbolt/test_feature_fetcher.py b/tests/python/pytorch/graphbolt/test_feature_fetcher.py index bd14716bb188..63d990dc5eaa 100644 --- a/tests/python/pytorch/graphbolt/test_feature_fetcher.py +++ b/tests/python/pytorch/graphbolt/test_feature_fetcher.py @@ -77,7 +77,8 @@ def test_FeatureFetcher_with_edges_homo(): [[random.randint(0, 10)] for _ in range(graph.total_num_edges)] ) - def add_node_and_edge_ids(seeds): + def add_node_and_edge_ids(minibatch): + seeds = minibatch.seed_nodes subgraphs = [] for _ in range(3): sampled_csc = gb.CSCFormatBase( @@ -103,7 +104,7 @@ def add_node_and_edge_ids(seeds): features[keys[1]] = gb.TorchBasedFeature(b) feature_store = gb.BasicFeatureStore(features) - itemset = gb.ItemSet(torch.arange(10)) + itemset = gb.ItemSet(torch.arange(10), names="seed_nodes") item_sampler_dp = gb.ItemSampler(itemset, batch_size=2) converter_dp = Mapper(item_sampler_dp, add_node_and_edge_ids) fetcher_dp = gb.FeatureFetcher(converter_dp, feature_store, ["a"], ["b"]) @@ -170,7 +171,8 @@ def test_FeatureFetcher_with_edges_hetero(): a = torch.tensor([[random.randint(0, 10)] for _ in range(20)]) b = torch.tensor([[random.randint(0, 10)] for _ in range(50)]) - def add_node_and_edge_ids(seeds): + def add_node_and_edge_ids(minibatch): + seeds = minibatch.seed_nodes subgraphs = [] original_edge_ids = { "n1:e1:n2": torch.randint(0, 50, (10,)), @@ -213,7 +215,7 @@ def add_node_and_edge_ids(seeds): itemset = gb.ItemSetDict( { - "n1": gb.ItemSet(torch.randint(0, 20, (10,))), + "n1": gb.ItemSet(torch.randint(0, 20, (10,)), names="seed_nodes"), } ) item_sampler_dp = gb.ItemSampler(itemset, batch_size=2) diff --git a/tests/python/pytorch/graphbolt/test_item_sampler.py b/tests/python/pytorch/graphbolt/test_item_sampler.py index fc4764df026f..264e489726cf 100644 --- a/tests/python/pytorch/graphbolt/test_item_sampler.py +++ b/tests/python/pytorch/graphbolt/test_item_sampler.py @@ -204,9 +204,16 @@ def test_ItemSet_graphs(batch_size, shuffle, drop_last): dgl.rand_graph(num_nodes * (i + 1), num_edges * (i + 1)) for i in range(num_graphs) ] - item_set = gb.ItemSet(graphs) + item_set = gb.ItemSet(graphs, names="graphs") + # DGLGraph is not supported in gb.MiniBatch yet. Let's use a customized + # minibatcher to return the original graphs. 
+ customized_minibatcher = lambda batch, names: batch item_sampler = gb.ItemSampler( - item_set, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last + item_set, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + minibatcher=customized_minibatcher, ) minibatch_num_nodes = [] minibatch_num_edges = [] @@ -459,13 +466,13 @@ def test_ItemSet_seeds_labels(batch_size, shuffle, drop_last): def test_append_with_other_datapipes(): num_ids = 100 batch_size = 4 - item_set = gb.ItemSet(torch.arange(0, num_ids)) + item_set = gb.ItemSet(torch.arange(0, num_ids), names="seed_nodes") data_pipe = gb.ItemSampler(item_set, batch_size) # torchdata.datapipes.iter.Enumerator data_pipe = data_pipe.enumerate() for i, (idx, data) in enumerate(data_pipe): assert i == idx - assert len(data) == batch_size + assert len(data.seed_nodes) == batch_size @pytest.mark.parametrize("batch_size", [1, 4]) From bf8f05df36280dffed45fb3548c1595dbeefec78 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 1 Feb 2024 09:46:50 +0300 Subject: [PATCH 04/45] [GraphBolt][CUDA] Use deque instead of Queue for Bufferer. (#7050) --- python/dgl/graphbolt/dataloader.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/dgl/graphbolt/dataloader.py b/python/dgl/graphbolt/dataloader.py index f89e43e0c8d9..b0dd9daccfaf 100644 --- a/python/dgl/graphbolt/dataloader.py +++ b/python/dgl/graphbolt/dataloader.py @@ -1,6 +1,6 @@ """Graph Bolt DataLoaders""" -from queue import Queue +from collections import deque import torch import torch.utils.data @@ -69,18 +69,18 @@ def __init__(self, datapipe, buffer_size=1): raise ValueError( "'buffer_size' is required to be a positive integer." ) - self.buffer = Queue(buffer_size) + self.buffer = deque(maxlen=buffer_size) def __iter__(self): for data in self.datapipe: - if not self.buffer.full(): - self.buffer.put(data) + if len(self.buffer) < self.buffer.maxlen: + self.buffer.append(data) else: - return_data = self.buffer.get() - self.buffer.put(data) + return_data = self.buffer.popleft() + self.buffer.append(data) yield return_data - while not self.buffer.empty(): - yield self.buffer.get() + while len(self.buffer) > 0: + yield self.buffer.popleft() class Awaiter(dp.iter.IterDataPipe): From e602ab1b56889c8f999f07aeddb55d641fba1014 Mon Sep 17 00:00:00 2001 From: "Hongzhi (Steve), Chen" Date: Thu, 1 Feb 2024 14:48:35 +0800 Subject: [PATCH 05/45] [Misc] Move test_minibatch to the correct dir. (#7055) Co-authored-by: Ubuntu --- tests/python/pytorch/graphbolt/{impl => }/test_minibatch.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/python/pytorch/graphbolt/{impl => }/test_minibatch.py (100%) diff --git a/tests/python/pytorch/graphbolt/impl/test_minibatch.py b/tests/python/pytorch/graphbolt/test_minibatch.py similarity index 100% rename from tests/python/pytorch/graphbolt/impl/test_minibatch.py rename to tests/python/pytorch/graphbolt/test_minibatch.py From 50eb1014ac3b0e3bfd4268454c8a2f37e3a232a2 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 1 Feb 2024 10:07:30 +0300 Subject: [PATCH 06/45] [GraphBolt] Refactor NeighborSampler and expose fine-grained datapipes. 
(#6983) --- python/dgl/graphbolt/impl/neighbor_sampler.py | 146 ++++++++++++------ python/dgl/graphbolt/subgraph_sampler.py | 57 +++++-- 2 files changed, 146 insertions(+), 57 deletions(-) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index ef10d49d7584..605da8ff5ce3 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -1,9 +1,12 @@ """Neighbor subgraph samplers for GraphBolt.""" +from functools import partial + import torch from torch.utils.data import functional_datapipe from ..internal import compact_csc_format, unique_and_compact_csc_formats +from ..minibatch_transformer import MiniBatchTransformer from ..subgraph_sampler import SubgraphSampler from .sampled_subgraph_impl import SampledSubgraphImpl @@ -12,8 +15,66 @@ __all__ = ["NeighborSampler", "LayerNeighborSampler"] +@functional_datapipe("sample_per_layer") +class SamplePerLayer(MiniBatchTransformer): + """Sample neighbor edges from a graph for a single layer.""" + + def __init__(self, datapipe, sampler, fanout, replace, prob_name): + super().__init__(datapipe, self._sample_per_layer) + self.sampler = sampler + self.fanout = fanout + self.replace = replace + self.prob_name = prob_name + + def _sample_per_layer(self, minibatch): + subgraph = self.sampler( + minibatch._seed_nodes, self.fanout, self.replace, self.prob_name + ) + minibatch.sampled_subgraphs.insert(0, subgraph) + return minibatch + + +@functional_datapipe("compact_per_layer") +class CompactPerLayer(MiniBatchTransformer): + """Compact the sampled edges for a single layer.""" + + def __init__(self, datapipe, deduplicate): + super().__init__(datapipe, self._compact_per_layer) + self.deduplicate = deduplicate + + def _compact_per_layer(self, minibatch): + subgraph = minibatch.sampled_subgraphs[0] + seeds = minibatch._seed_nodes + if self.deduplicate: + ( + original_row_node_ids, + compacted_csc_format, + ) = unique_and_compact_csc_formats(subgraph.sampled_csc, seeds) + subgraph = SampledSubgraphImpl( + sampled_csc=compacted_csc_format, + original_column_node_ids=seeds, + original_row_node_ids=original_row_node_ids, + original_edge_ids=subgraph.original_edge_ids, + ) + else: + ( + original_row_node_ids, + compacted_csc_format, + ) = compact_csc_format(subgraph.sampled_csc, seeds) + subgraph = SampledSubgraphImpl( + sampled_csc=compacted_csc_format, + original_column_node_ids=seeds, + original_row_node_ids=original_row_node_ids, + original_edge_ids=subgraph.original_edge_ids, + ) + minibatch._seed_nodes = original_row_node_ids + minibatch.sampled_subgraphs[0] = subgraph + return minibatch + + @functional_datapipe("sample_neighbor") class NeighborSampler(SubgraphSampler): + # pylint: disable=abstract-method """Sample neighbor edges from a graph and return a subgraph. Functional name: :obj:`sample_neighbor`. @@ -95,6 +156,7 @@ class NeighborSampler(SubgraphSampler): )] """ + # pylint: disable=useless-super-delegation def __init__( self, datapipe, @@ -103,26 +165,19 @@ def __init__( replace=False, prob_name=None, deduplicate=True, + sampler=None, ): - super().__init__(datapipe) - self.graph = graph - # Convert fanouts to a list of tensors. 
- self.fanouts = [] - for fanout in fanouts: - if not isinstance(fanout, torch.Tensor): - fanout = torch.LongTensor([int(fanout)]) - self.fanouts.insert(0, fanout) - self.replace = replace - self.prob_name = prob_name - self.deduplicate = deduplicate - self.sampler = graph.sample_neighbors + if sampler is None: + sampler = graph.sample_neighbors + super().__init__( + datapipe, graph, fanouts, replace, prob_name, deduplicate, sampler + ) - def sample_subgraphs(self, seeds, seeds_timestamp): - subgraphs = [] - num_layers = len(self.fanouts) + def _prepare(self, node_type_to_id, minibatch): + seeds = minibatch._seed_nodes # Enrich seeds with all node types. if isinstance(seeds, dict): - ntypes = list(self.graph.node_type_to_id.keys()) + ntypes = list(node_type_to_id.keys()) # Loop over different seeds to extract the device they are on. device = None dtype = None @@ -134,42 +189,37 @@ def sample_subgraphs(self, seeds, seeds_timestamp): seeds = { ntype: seeds.get(ntype, default_tensor) for ntype in ntypes } - for hop in range(num_layers): - subgraph = self.sampler( - seeds, - self.fanouts[hop], - self.replace, - self.prob_name, + minibatch._seed_nodes = seeds + minibatch.sampled_subgraphs = [] + return minibatch + + @staticmethod + def _set_input_nodes(minibatch): + minibatch.input_nodes = minibatch._seed_nodes + return minibatch + + # pylint: disable=arguments-differ + def sampling_stages( + self, datapipe, graph, fanouts, replace, prob_name, deduplicate, sampler + ): + datapipe = datapipe.transform( + partial(self._prepare, graph.node_type_to_id) + ) + for fanout in reversed(fanouts): + # Convert fanout to tensor. + if not isinstance(fanout, torch.Tensor): + fanout = torch.LongTensor([int(fanout)]) + datapipe = datapipe.sample_per_layer( + sampler, fanout, replace, prob_name ) - if self.deduplicate: - ( - original_row_node_ids, - compacted_csc_format, - ) = unique_and_compact_csc_formats(subgraph.sampled_csc, seeds) - subgraph = SampledSubgraphImpl( - sampled_csc=compacted_csc_format, - original_column_node_ids=seeds, - original_row_node_ids=original_row_node_ids, - original_edge_ids=subgraph.original_edge_ids, - ) - else: - ( - original_row_node_ids, - compacted_csc_format, - ) = compact_csc_format(subgraph.sampled_csc, seeds) - subgraph = SampledSubgraphImpl( - sampled_csc=compacted_csc_format, - original_column_node_ids=seeds, - original_row_node_ids=original_row_node_ids, - original_edge_ids=subgraph.original_edge_ids, - ) - subgraphs.insert(0, subgraph) - seeds = original_row_node_ids - return seeds, subgraphs + datapipe = datapipe.compact_per_layer(deduplicate) + + return datapipe.transform(self._set_input_nodes) @functional_datapipe("sample_layer_neighbor") class LayerNeighborSampler(NeighborSampler): + # pylint: disable=abstract-method """Sample layer neighbor edges from a graph and return a subgraph. Functional name: :obj:`sample_layer_neighbor`. @@ -280,5 +330,5 @@ def __init__( replace, prob_name, deduplicate, + graph.sample_layer_neighbors, ) - self.sampler = graph.sample_layer_neighbors diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index 3e3c3d9b507c..b05b8ca30619 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -22,21 +22,44 @@ class SubgraphSampler(MiniBatchTransformer): Functional name: :obj:`sample_subgraph`. This class is the base class of all subgraph samplers. Any subclass of - SubgraphSampler should implement the :meth:`sample_subgraphs` method. 
+ SubgraphSampler should implement either the :meth:`sample_subgraphs` method + or the :meth:`sampling_stages` method to define the fine-grained sampling + stages to take advantage of optimizations provided by the GraphBolt + DataLoader. Parameters ---------- datapipe : DataPipe The datapipe. + args : Non-Keyword Arguments + Arguments to be passed into sampling_stages. + kwargs : Keyword Arguments + Arguments to be passed into sampling_stages. """ def __init__( self, datapipe, + *args, + **kwargs, ): - super().__init__(datapipe, self._sample) + datapipe = datapipe.transform(self._preprocess) + datapipe = self.sampling_stages(datapipe, *args, **kwargs) + datapipe = datapipe.transform(self._postprocess) + super().__init__(datapipe, self._identity) - def _sample(self, minibatch): + @staticmethod + def _identity(minibatch): + return minibatch + + @staticmethod + def _postprocess(minibatch): + delattr(minibatch, "_seed_nodes") + delattr(minibatch, "_seeds_timestamp") + return minibatch + + @staticmethod + def _preprocess(minibatch): if minibatch.node_pairs is not None: ( seeds, @@ -44,7 +67,7 @@ def _sample(self, minibatch): minibatch.compacted_node_pairs, minibatch.compacted_negative_srcs, minibatch.compacted_negative_dsts, - ) = self._node_pairs_preprocess(minibatch) + ) = SubgraphSampler._node_pairs_preprocess(minibatch) elif minibatch.seed_nodes is not None: seeds = minibatch.seed_nodes seeds_timestamp = ( @@ -55,13 +78,12 @@ def _sample(self, minibatch): f"Invalid minibatch {minibatch}: Either `node_pairs` or " "`seed_nodes` should have a value." ) - ( - minibatch.input_nodes, - minibatch.sampled_subgraphs, - ) = self.sample_subgraphs(seeds, seeds_timestamp) + minibatch._seed_nodes = seeds + minibatch._seeds_timestamp = seeds_timestamp return minibatch - def _node_pairs_preprocess(self, minibatch): + @staticmethod + def _node_pairs_preprocess(minibatch): use_timestamp = hasattr(minibatch, "timestamp") node_pairs = minibatch.node_pairs neg_src, neg_dst = minibatch.negative_srcs, minibatch.negative_dsts @@ -191,6 +213,23 @@ def _node_pairs_preprocess(self, minibatch): compacted_negative_dsts if has_neg_dst else None, ) + def _sample(self, minibatch): + ( + minibatch.input_nodes, + minibatch.sampled_subgraphs, + ) = self.sample_subgraphs( + minibatch._seed_nodes, minibatch._seeds_timestamp + ) + return minibatch + + def sampling_stages(self, datapipe): + """The sampling stages are defined here by chaining to the datapipe. The + default implementation expects :meth:`sample_subgraphs` to be + implemented. To define fine-grained stages, this method should be + overridden. + """ + return datapipe.transform(self._sample) + def sample_subgraphs(self, seeds, seeds_timestamp): """Sample subgraphs from the given seeds, possibly with temporal constraints. From 177dc133d685d69a89a775eb9d1ca720094e14c7 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 2 Feb 2024 08:16:52 +0300 Subject: [PATCH 07/45] [GraphBolt][CUDA] IndexSelectCSC kernel launch config change. 
(#7056) --- graphbolt/src/cuda/index_select_csc_impl.cu | 8 ++++++-- graphbolt/src/cuda/index_select_impl.cu | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/graphbolt/src/cuda/index_select_csc_impl.cu b/graphbolt/src/cuda/index_select_csc_impl.cu index d1a6a89af18f..ce8af7a9f615 100644 --- a/graphbolt/src/cuda/index_select_csc_impl.cu +++ b/graphbolt/src/cuda/index_select_csc_impl.cu @@ -14,12 +14,13 @@ #include #include "./common.h" +#include "./max_uva_threads.h" #include "./utils.h" namespace graphbolt { namespace ops { -constexpr int BLOCK_SIZE = 128; +constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS; // Given the in_degree array and a permutation, returns in_degree of the output // and the permuted and modified in_degree of the input. The modified in_degree @@ -130,7 +131,10 @@ std::tuple UVAIndexSelectCSCCopyIndices( torch::Tensor output_indices = torch::empty(output_size.value(), options.dtype(indices.scalar_type())); const dim3 block(BLOCK_SIZE); - const dim3 grid((edge_count_aligned + BLOCK_SIZE - 1) / BLOCK_SIZE); + const dim3 grid( + (std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) + + BLOCK_SIZE - 1) / + BLOCK_SIZE); // Find the smallest integer type to store the coo_aligned_rows tensor. const int num_bits = cuda::NumberOfBits(num_nodes); diff --git a/graphbolt/src/cuda/index_select_impl.cu b/graphbolt/src/cuda/index_select_impl.cu index 389d2430f227..43fd144848b0 100644 --- a/graphbolt/src/cuda/index_select_impl.cu +++ b/graphbolt/src/cuda/index_select_impl.cu @@ -131,7 +131,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) { IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr, input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr); } else { - constexpr int BLOCK_SIZE = 512; + constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS; dim3 block(BLOCK_SIZE, 1); while (static_cast(block.x) >= 2 * aligned_feature_size) { block.x >>= 1; From 8568386911075fe9113e4b5e40bfe3b36400831e Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 2 Feb 2024 16:49:59 +0800 Subject: [PATCH 08/45] [dev] update system requirements (#7072) --- docs/source/install/index.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst index 25249e8e36b9..23391ebce1cd 100644 --- a/docs/source/install/index.rst +++ b/docs/source/install/index.rst @@ -5,11 +5,13 @@ System requirements ------------------- DGL works with the following operating systems: -* Ubuntu 16.04 +* Ubuntu 20.04+ +* CentOS 8+ +* RHEL 8+ * macOS X * Windows 10 -DGL requires Python version 3.6, 3.7, 3.8 or 3.9. +DGL requires Python version 3.7, 3.8, 3.9, 3.10, 3.11. DGL supports multiple tensor libraries as backends, e.g., PyTorch, MXNet. For requirements on backends and how to select one, see :ref:`backends`. From d5b03bcb27a8467ff6a7738c895287b520eee36b Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sat, 3 Feb 2024 01:49:54 +0300 Subject: [PATCH 09/45] [GraphBolt][CUDA] GPUCache performance fix. 
(#7073) --- graphbolt/src/cuda/gpu_cache.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/graphbolt/src/cuda/gpu_cache.cu b/graphbolt/src/cuda/gpu_cache.cu index 0a47bbbddc18..7c479fcc0c10 100644 --- a/graphbolt/src/cuda/gpu_cache.cu +++ b/graphbolt/src/cuda/gpu_cache.cu @@ -43,20 +43,19 @@ std::tuple GpuCache::Query( torch::empty(keys.size(0), keys.options().dtype(torch::kLong)); auto missing_keys = torch::empty(keys.size(0), keys.options().dtype(torch::kLong)); - cuda::CopyScalar missing_len; - auto stream = cuda::GetCurrentStream(); + auto allocator = cuda::GetAllocator(); + auto missing_len_device = allocator.AllocateStorage(1); cache_->Query( reinterpret_cast(keys.data_ptr()), keys.size(0), values.data_ptr(), reinterpret_cast(missing_index.data_ptr()), - reinterpret_cast(missing_keys.data_ptr()), missing_len.get(), - stream); + reinterpret_cast(missing_keys.data_ptr()), + missing_len_device.get(), cuda::GetCurrentStream()); values = values.view(torch::kByte) .slice(1, 0, num_bytes_) .view(dtype_) .view(shape_); - // To safely read missing_len, we synchronize - stream.synchronize(); + cuda::CopyScalar missing_len(missing_len_device.get()); missing_index = missing_index.slice(0, 0, static_cast(missing_len)); missing_keys = missing_keys.slice(0, 0, static_cast(missing_len)); return std::make_tuple(values, missing_index, missing_keys); From 0a42d863b740e3e13d79ee081d3792a4a04aed87 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sat, 3 Feb 2024 01:50:06 +0300 Subject: [PATCH 10/45] [GraphBolt][CUDA] GPUCachedFeature for multiGPU example. (#7074) --- examples/multigpu/graphbolt/node_classification.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index b9fa73353300..4186ba4f15c9 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -284,6 +284,12 @@ def run(rank, world_size, args, devices, dataset): hidden_size = 256 out_size = num_classes + if args.gpu_cache_size > 0: + dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature( + dataset.feature._features[("node", None, "feat")], + args.gpu_cache_size, + ) + # Create GraphSAGE model. It should be copied onto a GPU as a replica. model = SAGE(in_size, hidden_size, out_size).to(device) model = DDP(model) @@ -381,6 +387,12 @@ def parse_args(): parser.add_argument( "--num-workers", type=int, default=0, help="The number of processes." ) + parser.add_argument( + "--gpu-cache-size", + type=int, + default=0, + help="The GPU cache size for input features.", + ) parser.add_argument( "--mode", default="pinned-cuda", From 15695ed0ecc9eb39f862a8c8b2c23e453ee7b8f2 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sat, 3 Feb 2024 10:03:06 +0300 Subject: [PATCH 11/45] [GraphBolt][CUDA] Handle edge case of %100 cache hit rate. 
(#7080) --- graphbolt/src/cuda/gpu_cache.cu | 1 + .../graphbolt/impl/test_gpu_cached_feature.py | 25 ++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/graphbolt/src/cuda/gpu_cache.cu b/graphbolt/src/cuda/gpu_cache.cu index 7c479fcc0c10..f72446ec2626 100644 --- a/graphbolt/src/cuda/gpu_cache.cu +++ b/graphbolt/src/cuda/gpu_cache.cu @@ -78,6 +78,7 @@ void GpuCache::Replace(torch::Tensor keys, torch::Tensor values) { "Values should have the correct dimensions."); TORCH_CHECK( values.scalar_type() == dtype_, "Values should have the correct dtype."); + if (keys.numel() == 0) return; keys = keys.to(torch::kLong); torch::Tensor float_values; if (num_bytes_ % sizeof(float) != 0) { diff --git a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py index d251701cdaf9..eb9a62babff1 100644 --- a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py +++ b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py @@ -28,14 +28,16 @@ torch.float64, ], ) -def test_gpu_cached_feature(dtype): +@pytest.mark.parametrize("cache_size_a", [1, 1024]) +@pytest.mark.parametrize("cache_size_b", [1, 1024]) +def test_gpu_cached_feature(dtype, cache_size_a, cache_size_b): a = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=dtype, pin_memory=True) b = torch.tensor( [[[1, 2], [3, 4]], [[4, 5], [6, 7]]], dtype=dtype, pin_memory=True ) - feat_store_a = gb.GPUCachedFeature(gb.TorchBasedFeature(a), 2) - feat_store_b = gb.GPUCachedFeature(gb.TorchBasedFeature(b), 1) + feat_store_a = gb.GPUCachedFeature(gb.TorchBasedFeature(a), cache_size_a) + feat_store_b = gb.GPUCachedFeature(gb.TorchBasedFeature(b), cache_size_b) # Test read the entire feature. assert torch.equal(feat_store_a.read(), a.to("cuda")) @@ -52,6 +54,23 @@ def test_gpu_cached_feature(dtype): "cuda" ), ) + assert torch.equal( + feat_store_a.read(torch.tensor([1, 1]).to("cuda")), + torch.tensor([[4, 5, 6], [4, 5, 6]], dtype=dtype).to("cuda"), + ) + assert torch.equal( + feat_store_b.read(torch.tensor([0]).to("cuda")), + torch.tensor([[[1, 2], [3, 4]]], dtype=dtype).to("cuda"), + ) + # The cache should be full now for the large cache sizes, %100 hit expected. + if cache_size_a >= 1024: + total_miss = feat_store_a._feature.total_miss + feat_store_a.read(torch.tensor([0, 1]).to("cuda")) + assert total_miss == feat_store_a._feature.total_miss + if cache_size_b >= 1024: + total_miss = feat_store_b._feature.total_miss + feat_store_b.read(torch.tensor([0, 1]).to("cuda")) + assert total_miss == feat_store_b._feature.total_miss # Test get the size of the entire feature with ids. assert feat_store_a.size() == torch.Size([3]) From 1e6fa711f2ed163075d4c308a0cd8b418096254c Mon Sep 17 00:00:00 2001 From: czkkkkkk Date: Sat, 3 Feb 2024 17:05:58 +0800 Subject: [PATCH 12/45] [Graphbolt] Add fast path for tamporal sampling. (#7078) --- graphbolt/src/fused_csc_sampling_graph.cc | 72 +++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index 66de586f683e..1306193e2b66 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -810,12 +810,71 @@ torch::Tensor TemporalMask( return mask; } +/** + * @brief Fast path for temporal sampling without probability. It is used when + * the number of neighbors is large. It randomly samples neighbors and checks + * the timestamp of the neighbors. 
It is successful if the number of sampled + * neighbors in kTriedThreshold trials is equal to the fanout. + */ +std::pair> FastTemporalPick( + torch::Tensor seed_timestamp, torch::Tensor csc_indices, int64_t fanout, + bool replace, const torch::optional& node_timestamp, + const torch::optional& edge_timestamp, int64_t seed_offset, + int64_t offset, int64_t num_neighbors) { + constexpr int64_t kTriedThreshold = 1000; + auto timestamp = utils::GetValueByIndex(seed_timestamp, seed_offset); + std::vector sampled_edges; + sampled_edges.reserve(fanout); + std::set sampled_edge_set; + int64_t sample_count = 0; + int64_t tried = 0; + while (sample_count < fanout && tried < kTriedThreshold) { + int64_t edge_id = + RandomEngine::ThreadLocal()->RandInt(offset, offset + num_neighbors); + ++tried; + if (!replace && sampled_edge_set.count(edge_id) > 0) { + continue; + } + if (node_timestamp.has_value()) { + int64_t neighbor_id = + utils::GetValueByIndex(csc_indices, edge_id); + if (utils::GetValueByIndex( + node_timestamp.value(), neighbor_id) >= timestamp) + continue; + } + if (edge_timestamp.has_value() && + utils::GetValueByIndex(edge_timestamp.value(), edge_id) >= + timestamp) { + continue; + } + if (!replace) { + sampled_edge_set.insert(edge_id); + } + sampled_edges.push_back(edge_id); + sample_count++; + } + if (sample_count < fanout) { + return {false, {}}; + } + return {true, sampled_edges}; +} + int64_t TemporalNumPick( torch::Tensor seed_timestamp, torch::Tensor csc_indics, int64_t fanout, bool replace, const torch::optional& probs_or_mask, const torch::optional& node_timestamp, const torch::optional& edge_timestamp, int64_t seed_offset, int64_t offset, int64_t num_neighbors) { + constexpr int64_t kFastPathThreshold = 1000; + if (num_neighbors > kFastPathThreshold && !probs_or_mask.has_value()) { + // TODO: Currently we use the fast path both in TemporalNumPick and + // TemporalPick. We may only sample once in TemporalNumPick and use the + // sampled edges in TemporalPick to avoid sampling twice. 
+ auto [success, sampled_edges] = FastTemporalPick( + seed_timestamp, csc_indics, fanout, replace, node_timestamp, + edge_timestamp, seed_offset, offset, num_neighbors); + if (success) return sampled_edges.size(); + } auto mask = TemporalMask( utils::GetValueByIndex(seed_timestamp, seed_offset), csc_indics, probs_or_mask, node_timestamp, edge_timestamp, @@ -1183,6 +1242,19 @@ int64_t TemporalPick( const torch::optional& node_timestamp, const torch::optional& edge_timestamp, SamplerArgs args, PickedType* picked_data_ptr) { + constexpr int64_t kFastPathThreshold = 1000; + if (S == SamplerType::NEIGHBOR && num_neighbors > kFastPathThreshold && + !probs_or_mask.has_value()) { + auto [success, sampled_edges] = FastTemporalPick( + seed_timestamp, csc_indices, fanout, replace, node_timestamp, + edge_timestamp, seed_offset, offset, num_neighbors); + if (success) { + for (size_t i = 0; i < sampled_edges.size(); ++i) { + picked_data_ptr[i] = static_cast(sampled_edges[i]); + } + return sampled_edges.size(); + } + } auto mask = TemporalMask( utils::GetValueByIndex(seed_timestamp, seed_offset), csc_indices, probs_or_mask, node_timestamp, edge_timestamp, From 9273387ead0545e60c9e09ad2fa40e6e5ce3188c Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Sun, 4 Feb 2024 10:37:45 +0800 Subject: [PATCH 13/45] [GraphBolt] move return_eids check to internal python API (#7071) --- .../impl/fused_csc_sampling_graph.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index b8cee5e18a7c..958bad29bc6e 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -625,8 +625,16 @@ def sample_neighbors( if isinstance(nodes, dict): nodes = self._convert_to_homogeneous_nodes(nodes) + return_eids = ( + self.edge_attributes is not None + and ORIGINAL_EDGE_ID in self.edge_attributes + ) C_sampled_subgraph = self._sample_neighbors( - nodes, fanouts, replace, probs_name + nodes, + fanouts, + replace=replace, + probs_name=probs_name, + return_eids=return_eids, ) return self._convert_to_sampled_subgraph(C_sampled_subgraph) @@ -679,6 +687,7 @@ def _sample_neighbors( fanouts: torch.Tensor, replace: bool = False, probs_name: Optional[str] = None, + return_eids: bool = False, ) -> torch.ScriptObject: """Sample neighboring edges of the given nodes and return the induced subgraph. @@ -714,6 +723,9 @@ def _sample_neighbors( corresponding to each neighboring edge of a node. It must be a 1D floating-point or boolean tensor, with the number of elements equalling the total number of edges. + return_eids: bool, optional + Boolean indicating whether to return the original edge IDs of the + sampled edges. Returns ------- @@ -722,16 +734,12 @@ def _sample_neighbors( """ # Ensure nodes is 1-D tensor. 
        self._check_sampler_arguments(nodes, fanouts, probs_name)
-        has_original_eids = (
-            self.edge_attributes is not None
-            and ORIGINAL_EDGE_ID in self.edge_attributes
-        )
         return self._c_csc_graph.sample_neighbors(
             nodes,
             fanouts.tolist(),
             replace,
             False,
-            has_original_eids,
+            return_eids,
             probs_name,
         )

From 6459a688ae15d797dd4d0586f2f8ad2e46d58145 Mon Sep 17 00:00:00 2001
From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com>
Date: Sun, 4 Feb 2024 10:39:44 +0800
Subject: [PATCH 14/45] [DistGB] enable GB sampling on homograph (#7061)

---
 python/dgl/distributed/graph_services.py      | 138 +++++++++++++++++-
 .../distributed/test_distributed_sampling.py  |  32 +++-
 2 files changed, 157 insertions(+), 13 deletions(-)

diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py
index 0a732ca0e7b0..4dea7c206691 100644
--- a/python/dgl/distributed/graph_services.py
+++ b/python/dgl/distributed/graph_services.py
@@ -3,7 +3,9 @@
 
 import numpy as np
 
-from .. import backend as F
+import torch
+
+from .. import backend as F, graphbolt as gb
 from ..base import EID, NID
 from ..convert import graph, heterograph
 from ..sampling import (
@@ -65,6 +67,81 @@ def __getstate__(self):
         return self.global_src, self.global_dst, self.order_id
 
 
+def _sample_neighbors_graphbolt(
+    g, gpb, nodes, fanout, prob=None, replace=False
+):
+    """Sample from local partition via graphbolt.
+
+    The input nodes use global IDs. We need to map the global node IDs to local
+    node IDs, perform sampling and map the sampled results to the global IDs
+    space again. The sampled results are stored in vectors that store
+    source nodes, destination nodes, etype IDs and edge IDs.
+
+    [Rui][TODO] edge IDs are not returned as not supported yet.
+
+    Parameters
+    ----------
+    g : FusedCSCSamplingGraph
+        The local partition.
+    gpb : GraphPartitionBook
+        The graph partition book.
+    nodes : tensor
+        The nodes to sample neighbors from.
+    fanout : tensor or int
+        The number of edges to be sampled for each node.
+    prob : tensor, optional
+        The probability associated with each neighboring edge of a node.
+    replace : bool, optional
+        If True, sample with replacement.
+
+    Returns
+    -------
+    tensor
+        The source node ID array.
+    tensor
+        The destination node ID array.
+    tensor
+        The edge type ID array.
+    tensor
+        The edge ID array.
+    """
+    # 1. Map global node IDs to local node IDs.
+    nodes = gpb.nid2localnid(nodes, gpb.partid)
+
+    # 2. Perform sampling.
+    # [Rui][TODO] `prob` and `replace` are not tested yet. Skip for now.
+    assert (
+        prob is None
+    ), "DistGraphBolt does not support sampling with probability."
+    assert (
+        not replace
+    ), "DistGraphBolt does not support sampling with replacement."
+
+    # Sanity checks.
+    assert isinstance(
+        g, gb.FusedCSCSamplingGraph
+    ), "Expect a FusedCSCSamplingGraph."
+    assert isinstance(nodes, torch.Tensor), "Expect a tensor of nodes."
+    if isinstance(fanout, int):
+        fanout = torch.LongTensor([fanout])
+    assert isinstance(fanout, torch.Tensor), "Expect a tensor of fanout."
+    # [Rui][TODO] Support multiple fanouts.
+    assert fanout.numel() == 1, "Expect a single fanout."
+
+    subgraph = g._sample_neighbors(nodes, fanout)
+
+    # 3. Map local node IDs to global node IDs.
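+    # (Editor's note: below, each seed in `original_column_node_ids` is
+    # repeated once per sampled in-edge, i.e. by the degrees encoded in
+    # `indptr`, so it lines up with `indices`; the NID node attribute then
+    # maps both endpoints back to global IDs.)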
+ local_src = subgraph.indices + local_dst = torch.repeat_interleave( + subgraph.original_column_node_ids, torch.diff(subgraph.indptr) + ) + global_nid_mapping = g.node_attributes[NID] + global_src = global_nid_mapping[local_src] + global_dst = global_nid_mapping[local_dst] + + return global_src, global_dst, subgraph.type_per_edge + + def _sample_neighbors( local_g, partition_book, seed_nodes, fan_out, edge_dir, prob, replace ): @@ -212,12 +289,21 @@ def _in_subgraph(local_g, partition_book, seed_nodes): class SamplingRequest(Request): """Sampling Request""" - def __init__(self, nodes, fan_out, edge_dir="in", prob=None, replace=False): + def __init__( + self, + nodes, + fan_out, + edge_dir="in", + prob=None, + replace=False, + use_graphbolt=False, + ): self.seed_nodes = nodes self.edge_dir = edge_dir self.prob = prob self.replace = replace self.fan_out = fan_out + self.use_graphbolt = use_graphbolt def __setstate__(self, state): ( @@ -226,6 +312,7 @@ def __setstate__(self, state): self.prob, self.replace, self.fan_out, + self.use_graphbolt, ) = state def __getstate__(self): @@ -235,6 +322,7 @@ def __getstate__(self): self.prob, self.replace, self.fan_out, + self.use_graphbolt, ) def process_request(self, server_state): @@ -245,6 +333,16 @@ def process_request(self, server_state): prob = [kv_store.data_store[self.prob]] else: prob = None + if self.use_graphbolt: + global_src, global_dst, etype_ids = _sample_neighbors_graphbolt( + local_g, + partition_book, + self.seed_nodes, + self.fan_out, + prob, + self.replace, + ) + return SubgraphResponse(global_src, global_dst, etype_ids) global_src, global_dst, global_eids = _sample_neighbors( local_g, partition_book, @@ -449,13 +547,14 @@ def merge_graphs(res_list, num_nodes): eids.append(res.global_eids) src_tensor = F.cat(srcs, 0) dst_tensor = F.cat(dsts, 0) - eid_tensor = F.cat(eids, 0) + eid_tensor = None if eids[0] is None else F.cat(eids, 0) else: src_tensor = res_list[0].global_src dst_tensor = res_list[0].global_dst eid_tensor = res_list[0].global_eids g = graph((src_tensor, dst_tensor), num_nodes=num_nodes) - g.edata[EID] = eid_tensor + if eid_tensor is not None: + g.edata[EID] = eid_tensor return g @@ -491,7 +590,8 @@ def _distributed_access(g, nodes, issue_remote_req, local_access): """ req_list = [] partition_book = g.get_partition_book() - nodes = toindex(nodes).tousertensor() + if not isinstance(nodes, torch.Tensor): + nodes = toindex(nodes).tousertensor() partition_id = partition_book.nid2partid(nodes) local_nids = None for pid in range(partition_book.num_partitions()): @@ -721,7 +821,15 @@ def local_access(local_g, partition_book, local_nids): return frontier -def sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False): +def sample_neighbors( + g, + nodes, + fanout, + edge_dir="in", + prob=None, + replace=False, + use_graphbolt=False, +): """Sample from the neighbors of the given nodes from a distributed graph. For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges @@ -764,6 +872,8 @@ def sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False): For sampling without replacement, if fanout > the number of neighbors, all the neighbors are sampled. If fanout == -1, all neighbors are collected. + use_graphbolt : bool, optional + Whether to use GraphBolt for sampling. 
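+        Default: False.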
Returns ------- @@ -795,12 +905,26 @@ def issue_remote_req(node_ids): else: _prob = None return SamplingRequest( - node_ids, fanout, edge_dir=edge_dir, prob=_prob, replace=replace + node_ids, + fanout, + edge_dir=edge_dir, + prob=_prob, + replace=replace, + use_graphbolt=use_graphbolt, ) def local_access(local_g, partition_book, local_nids): # See NOTE 1 _prob = [g.edata[prob].local_partition] if prob is not None else None + if use_graphbolt: + return _sample_neighbors_graphbolt( + local_g, + partition_book, + local_nids, + fanout, + prob=_prob, + replace=replace, + ) return _sample_neighbors( local_g, partition_book, diff --git a/tests/distributed/test_distributed_sampling.py b/tests/distributed/test_distributed_sampling.py index 9eb47342455b..c8d51b398846 100644 --- a/tests/distributed/test_distributed_sampling.py +++ b/tests/distributed/test_distributed_sampling.py @@ -31,6 +31,7 @@ def start_server( disable_shared_mem, graph_name, graph_format=["csc", "coo"], + use_graphbolt=False, ): g = DistGraphServer( rank, @@ -40,6 +41,7 @@ def start_server( tmpdir / (graph_name + ".json"), disable_shared_mem=disable_shared_mem, graph_format=graph_format, + use_graphbolt=use_graphbolt, ) g.start() @@ -72,6 +74,7 @@ def start_sample_client_shuffle( group_id, orig_nid, orig_eid, + use_graphbolt=False, ): os.environ["DGL_GROUP_ID"] = str(group_id) gpb = None @@ -80,17 +83,26 @@ def start_sample_client_shuffle( tmpdir / "test_sampling.json", rank ) dgl.distributed.initialize("rpc_ip_config.txt") - dist_graph = DistGraph("test_sampling", gpb=gpb) - sampled_graph = sample_neighbors(dist_graph, [0, 10, 99, 66, 1024, 2008], 3) + dist_graph = DistGraph( + "test_sampling", gpb=gpb, use_graphbolt=use_graphbolt + ) + sampled_graph = sample_neighbors( + dist_graph, [0, 10, 99, 66, 1024, 2008], 3, use_graphbolt=use_graphbolt + ) src, dst = sampled_graph.edges() src = orig_nid[src] dst = orig_nid[dst] assert sampled_graph.num_nodes() == g.num_nodes() assert np.all(F.asnumpy(g.has_edges_between(src, dst))) - eids = g.edge_ids(src, dst) - eids1 = orig_eid[sampled_graph.edata[dgl.EID]] - assert np.array_equal(F.asnumpy(eids1), F.asnumpy(eids)) + if use_graphbolt: + assert ( + dgl.EID not in sampled_graph.edata + ), "EID should not be in sampled graph if use_graphbolt=True." 
+ else: + eids = g.edge_ids(src, dst) + eids1 = orig_eid[sampled_graph.edata[dgl.EID]] + assert np.array_equal(F.asnumpy(eids1), F.asnumpy(eids)) def start_find_edges_client(rank, tmpdir, disable_shared_mem, eids, etype=None): @@ -378,7 +390,9 @@ def test_rpc_sampling(): check_rpc_sampling(Path(tmpdirname), 1) -def check_rpc_sampling_shuffle(tmpdir, num_server, num_groups=1): +def check_rpc_sampling_shuffle( + tmpdir, num_server, num_groups=1, use_graphbolt=False +): generate_ip_config("rpc_ip_config.txt", num_server, num_server) g = CitationGraphDataset("cora")[0] @@ -393,6 +407,7 @@ def check_rpc_sampling_shuffle(tmpdir, num_server, num_groups=1): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, ) pserver_list = [] @@ -406,6 +421,7 @@ def check_rpc_sampling_shuffle(tmpdir, num_server, num_groups=1): num_server > 1, "test_sampling", ["csc", "coo"], + use_graphbolt, ), ) p.start() @@ -427,6 +443,7 @@ def check_rpc_sampling_shuffle(tmpdir, num_server, num_groups=1): group_id, orig_nids, orig_eids, + use_graphbolt, ), ) p.start() @@ -1012,6 +1029,9 @@ def test_rpc_sampling_shuffle(num_server): os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: + check_rpc_sampling_shuffle( + Path(tmpdirname), num_server, use_graphbolt=True + ) check_rpc_sampling_shuffle(Path(tmpdirname), num_server) # [TODO][Rhett] Tests for multiple groups may fail sometimes and # root cause is unknown. Let's disable them for now. From 3d854a6b9b7e8b705a7e429f365370abec7243ee Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Sun, 4 Feb 2024 11:39:43 +0800 Subject: [PATCH 15/45] [Misc] Correct docstrings in `python/dgl/convert.py` (#7060) --- python/dgl/convert.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/dgl/convert.py b/python/dgl/convert.py index 49578a168831..1ab64ddbb116 100644 --- a/python/dgl/convert.py +++ b/python/dgl/convert.py @@ -1,4 +1,5 @@ """Module for converting graph from/to other object.""" + from collections import defaultdict from collections.abc import Mapping @@ -296,9 +297,9 @@ def heterograph(data_dict, num_nodes_dict=None, idtype=None, device=None): >>> g = dgl.heterograph(data_dict) >>> g Graph(num_nodes={'game': 5, 'topic': 3, 'user': 4}, - num_edges={('user', 'follows', 'user'): 2, ('user', 'follows', 'topic'): 2, + num_edges={('user', 'follows', 'topic'): 2, ('user', 'follows', 'user'): 2, ('user', 'plays', 'game'): 2}, - metagraph=[('user', 'user', 'follows'), ('user', 'topic', 'follows'), + metagraph=[('user', 'topic', 'follows'), ('user', 'user', 'follows'), ('user', 'game', 'plays')]) Explicitly specify the number of nodes for each node type in the graph. @@ -1810,11 +1811,11 @@ def to_networkx( ... ('user', 'follows', 'topic'): (torch.tensor([1, 1]), torch.tensor([1, 2])), ... ('user', 'plays', 'game'): (torch.tensor([0, 3]), torch.tensor([3, 4])) ... }) - ... g.ndata['n'] = { + >>> g.ndata['n'] = { ... 'game': torch.zeros(5, 1), ... 'user': torch.ones(4, 1) ... } - ... g.edata['e'] = { + >>> g.edata['e'] = { ... ('user', 'follows', 'user'): torch.zeros(2, 1), ... 'plays': torch.ones(2, 1) ... } From af0b63eda6315dcd0db6bdb25f2db84a0db12754 Mon Sep 17 00:00:00 2001 From: yxy235 <77922129+yxy235@users.noreply.github.com> Date: Sun, 4 Feb 2024 13:24:34 +0800 Subject: [PATCH 16/45] [GraphBolt] Fix gpu `NegativeSampler` for seeds. 
(#7068) Co-authored-by: Ubuntu --- .../impl/fused_csc_sampling_graph.py | 8 +- .../impl/uniform_negative_sampler.py | 21 +++++- .../graphbolt/impl/test_negative_sampler.py | 75 +++++++++++++------ 3 files changed, 79 insertions(+), 25 deletions(-) diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 958bad29bc6e..de81c137833b 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -1026,7 +1026,13 @@ def sample_negative_edges_uniform_2( torch.cat( ( pos_src.repeat_interleave(negative_ratio), - torch.randint(0, max_node_id, (num_negative,)), + torch.randint( + 0, + max_node_id, + (num_negative,), + dtype=node_pairs.dtype, + device=node_pairs.device, + ), ), ) .view(2, num_negative) diff --git a/python/dgl/graphbolt/impl/uniform_negative_sampler.py b/python/dgl/graphbolt/impl/uniform_negative_sampler.py index cc7fa4e8fb1f..1b95d07f6601 100644 --- a/python/dgl/graphbolt/impl/uniform_negative_sampler.py +++ b/python/dgl/graphbolt/impl/uniform_negative_sampler.py @@ -76,15 +76,30 @@ def _sample_with_etype(self, node_pairs, etype=None, use_seeds=False): # Construct indexes for all node pairs. num_pos_node_pairs = node_pairs.shape[0] negative_ratio = self.negative_ratio - pos_indexes = torch.arange(0, num_pos_node_pairs) + pos_indexes = torch.arange( + 0, + num_pos_node_pairs, + device=seeds.device, + ) neg_indexes = pos_indexes.repeat_interleave(negative_ratio) indexes = torch.cat((pos_indexes, neg_indexes)) # Construct labels for all node pairs. pos_num = node_pairs.shape[0] neg_num = seeds.shape[0] - pos_num labels = torch.cat( - (torch.ones(pos_num), torch.zeros(neg_num)) - ).bool() + ( + torch.ones( + pos_num, + dtype=torch.bool, + device=seeds.device, + ), + torch.zeros( + neg_num, + dtype=torch.bool, + device=seeds.device, + ), + ), + ) return seeds, labels, indexes else: return self.graph.sample_negative_edges_uniform( diff --git a/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py b/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py index 44aab2d8b8bb..9b2f783b7d86 100644 --- a/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py @@ -1,5 +1,7 @@ import re +import backend as F + import dgl.graphbolt as gb import pytest import torch @@ -14,7 +16,9 @@ def test_NegativeSampler_invoke(): torch.arange(0, 2 * num_seeds).reshape(-1, 2), names="node_pairs" ) batch_size = 10 - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) negative_ratio = 2 # Invoke NegativeSampler via class constructor. @@ -35,13 +39,17 @@ def test_NegativeSampler_invoke(): def test_UniformNegativeSampler_invoke(): # Instantiate graph and required datapipes. - graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True) + graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True).to( + F.ctx() + ) num_seeds = 30 item_set = gb.ItemSet( torch.arange(0, 2 * num_seeds).reshape(-1, 2), names="seeds" ) batch_size = 10 - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) negative_ratio = 2 def _verify(negative_sampler): @@ -70,13 +78,17 @@ def _verify(negative_sampler): def test_UniformNegativeSampler_node_pairs_invoke(): # Instantiate graph and required datapipes. 
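+    # (Editor's note: the `.to(F.ctx())` and `.copy_to(F.ctx())` calls added
+    # below let the same test body run on both the CPU and GPU backends.)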
- graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True) + graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True).to( + F.ctx() + ) num_seeds = 30 item_set = gb.ItemSet( torch.arange(0, 2 * num_seeds).reshape(-1, 2), names="node_pairs" ) batch_size = 10 - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) negative_ratio = 2 # Verify iteration over UniformNegativeSampler. @@ -106,13 +118,17 @@ def _verify(negative_sampler): @pytest.mark.parametrize("negative_ratio", [1, 5, 10, 20]) def test_Uniform_NegativeSampler_node_pairs(negative_ratio): # Construct FusedCSCSamplingGraph. - graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True) + graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True).to( + F.ctx() + ) num_seeds = 30 item_set = gb.ItemSet( torch.arange(0, num_seeds * 2).reshape(-1, 2), names="node_pairs" ) batch_size = 10 - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) # Construct NegativeSampler. negative_sampler = gb.UniformNegativeSampler( item_sampler, @@ -134,13 +150,17 @@ def test_Uniform_NegativeSampler_node_pairs(negative_ratio): @pytest.mark.parametrize("negative_ratio", [1, 5, 10, 20]) def test_Uniform_NegativeSampler(negative_ratio): # Construct FusedCSCSamplingGraph. - graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True) + graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True).to( + F.ctx() + ) num_seeds = 30 item_set = gb.ItemSet( torch.arange(0, num_seeds * 2).reshape(-1, 2), names="seeds" ) batch_size = 10 - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) # Construct NegativeSampler. negative_sampler = gb.UniformNegativeSampler( item_sampler, @@ -159,12 +179,15 @@ def test_Uniform_NegativeSampler(negative_ratio): neg_src = data.seeds[batch_size:, 0] assert torch.equal(pos_src.repeat_interleave(negative_ratio), neg_src) # Check labels. - assert torch.equal(data.labels[:batch_size], torch.ones(batch_size)) assert torch.equal( - data.labels[batch_size:], torch.zeros(batch_size * negative_ratio) + data.labels[:batch_size], torch.ones(batch_size).to(F.ctx()) + ) + assert torch.equal( + data.labels[batch_size:], + torch.zeros(batch_size * negative_ratio).to(F.ctx()), ) # Check indexes. - pos_indexes = torch.arange(0, batch_size) + pos_indexes = torch.arange(0, batch_size).to(F.ctx()) neg_indexes = pos_indexes.repeat_interleave(negative_ratio) expected_indexes = torch.cat((pos_indexes, neg_indexes)) assert torch.equal(data.indexes, expected_indexes) @@ -173,13 +196,17 @@ def test_Uniform_NegativeSampler(negative_ratio): def test_Uniform_NegativeSampler_error_shape(): # 1. seeds with shape N*3. # Construct FusedCSCSamplingGraph. - graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True) + graph = gb_test_utils.rand_csc_graph(100, 0.05, bidirection_edge=True).to( + F.ctx() + ) num_seeds = 30 item_set = gb.ItemSet( torch.arange(0, num_seeds * 3).reshape(-1, 3), names="seeds" ) batch_size = 10 - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) negative_ratio = 2 # Construct NegativeSampler. 
negative_sampler = gb.UniformNegativeSampler( @@ -201,7 +228,9 @@ def test_Uniform_NegativeSampler_error_shape(): item_set = gb.ItemSet( torch.arange(0, num_seeds * 2).reshape(-1, 2, 1), names="seeds" ) - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) # Construct NegativeSampler. negative_sampler = gb.UniformNegativeSampler( item_sampler, @@ -220,7 +249,9 @@ def test_Uniform_NegativeSampler_error_shape(): # 3. seeds with shape N. # Construct FusedCSCSamplingGraph. item_set = gb.ItemSet(torch.arange(0, num_seeds), names="seeds") - item_sampler = gb.ItemSampler(item_set, batch_size=batch_size) + item_sampler = gb.ItemSampler(item_set, batch_size=batch_size).copy_to( + F.ctx() + ) # Construct NegativeSampler. negative_sampler = gb.UniformNegativeSampler( item_sampler, @@ -260,7 +291,7 @@ def get_hetero_graph(): def test_NegativeSampler_Hetero_node_pairs_Data(): - graph = get_hetero_graph() + graph = get_hetero_graph().to(F.ctx()) itemset = gb.ItemSetDict( { "n1:e1:n2": gb.ItemSet( @@ -274,13 +305,13 @@ def test_NegativeSampler_Hetero_node_pairs_Data(): } ) - item_sampler = gb.ItemSampler(itemset, batch_size=2) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) negative_dp = gb.UniformNegativeSampler(item_sampler, graph, 1) assert len(list(negative_dp)) == 5 def test_NegativeSampler_Hetero_Data(): - graph = get_hetero_graph() + graph = get_hetero_graph().to(F.ctx()) itemset = gb.ItemSetDict( { "n1:e1:n2": gb.ItemSet( @@ -295,7 +326,9 @@ def test_NegativeSampler_Hetero_Data(): ) batch_size = 2 negative_ratio = 1 - item_sampler = gb.ItemSampler(itemset, batch_size=batch_size) + item_sampler = gb.ItemSampler(itemset, batch_size=batch_size).copy_to( + F.ctx() + ) negative_dp = gb.UniformNegativeSampler(item_sampler, graph, negative_ratio) assert len(list(negative_dp)) == 5 # Perform negative sampling. @@ -311,5 +344,5 @@ def test_NegativeSampler_Hetero_Data(): for etype, seeds_data in data.seeds.items(): neg_src = seeds_data[batch_size:, 0] neg_dst = seeds_data[batch_size:, 1] - assert torch.equal(expected_neg_src[i][etype], neg_src) + assert torch.equal(expected_neg_src[i][etype].to(F.ctx()), neg_src) assert (neg_dst < 3).all(), neg_dst From 95142b844e3c3050959d66ff6267b09dd4a17794 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Sun, 4 Feb 2024 00:06:06 -0800 Subject: [PATCH 17/45] [GraphBolt] Fixing two issues in GraphBolt tests. (#7059) Co-authored-by: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> --- tests/python/pytorch/graphbolt/gb_test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/pytorch/graphbolt/gb_test_utils.py b/tests/python/pytorch/graphbolt/gb_test_utils.py index dd7abc74da0c..59c4c3a90276 100644 --- a/tests/python/pytorch/graphbolt/gb_test_utils.py +++ b/tests/python/pytorch/graphbolt/gb_test_utils.py @@ -269,7 +269,7 @@ def genereate_raw_data_for_hetero_dataset( # Generate train/test/valid set. 
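+    # (Editor's note: np.random.shuffle permutes its argument in place, so
+    # shuffling the tensor's NumPy view below reorders user_ids without
+    # handing a torch.Tensor to NumPy.)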
os.makedirs(os.path.join(test_dir, "set"), exist_ok=True)
     user_ids = torch.arange(num_nodes["user"])
-    np.random.shuffle(user_ids)
+    np.random.shuffle(user_ids.numpy())
     num_train = int(num_nodes["user"] * 0.6)
     num_validation = int(num_nodes["user"] * 0.2)
     num_test = num_nodes["user"] - num_train - num_validation

From 308f099b4e31a0cc61ba787e5997ec38d5dcf921 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih BALIN
Date: Sun, 4 Feb 2024 14:29:29 +0300
Subject: [PATCH 18/45] [GraphBolt][CUDA] Improve GPUCache help string in multiGPU example. (#7081)

---
 examples/multigpu/graphbolt/node_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py
index 4186ba4f15c9..b144fdb1d5cc 100644
--- a/examples/multigpu/graphbolt/node_classification.py
+++ b/examples/multigpu/graphbolt/node_classification.py
@@ -391,7 +391,7 @@ def parse_args():
         "--gpu-cache-size",
         type=int,
         default=0,
-        help="The GPU cache size for input features.",
+        help="The capacity of the GPU cache, the number of features to store.",
     )
     parser.add_argument(
         "--mode",

From f4989867713acae87e11993c479723251a0fd942 Mon Sep 17 00:00:00 2001
From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com>
Date: Sun, 4 Feb 2024 20:10:45 +0800
Subject: [PATCH 19/45] [GraphBolt] Add error messages for attribute size check (#7079)

Co-authored-by: Muhammed Fatih BALIN
---
 graphbolt/src/fused_csc_sampling_graph.cc | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc
index 1306193e2b66..4dacb9792448 100644
--- a/graphbolt/src/fused_csc_sampling_graph.cc
+++ b/graphbolt/src/fused_csc_sampling_graph.cc
@@ -97,12 +97,21 @@ c10::intrusive_ptr<FusedCSCSamplingGraph> FusedCSCSamplingGraph::Create(
   }
   if (node_attributes.has_value()) {
     for (const auto& pair : node_attributes.value()) {
-      TORCH_CHECK(pair.value().size(0) == indptr.size(0) - 1);
+      TORCH_CHECK(
+          pair.value().size(0) == indptr.size(0) - 1,
+          "Expected node_attribute.size(0) and num_nodes to be equal, "
+          "but node_attribute.size(0) was ",
+          pair.value().size(0), ", and num_nodes was ", indptr.size(0) - 1,
+          ".");
     }
   }
   if (edge_attributes.has_value()) {
     for (const auto& pair : edge_attributes.value()) {
-      TORCH_CHECK(pair.value().size(0) == indices.size(0));
+      TORCH_CHECK(
+          pair.value().size(0) == indices.size(0),
+          "Expected edge_attribute.size(0) and num_edges to be equal, "
+          "but edge_attribute.size(0) was ",
+          pair.value().size(0), ", and num_edges was ", indices.size(0), ".");
     }
   }
   return c10::make_intrusive<FusedCSCSamplingGraph>(

From 346197c47722993cf8cd9f41891d1457ef82decc Mon Sep 17 00:00:00 2001
From: Muhammed Fatih BALIN
Date: Sun, 4 Feb 2024 19:56:40 +0300
Subject: [PATCH 20/45] [GraphBolt][CUDA] Add `gb.index_select` and fix example inferencing.
(#7051) --- docs/source/api/python/dgl.graphbolt.rst | 1 + .../sampling/graphbolt/link_prediction.py | 36 ++++++++----------- .../sampling/graphbolt/node_classification.py | 29 ++++++--------- python/dgl/graphbolt/base.py | 28 +++++++++++++++ .../impl/torch_based_feature_store.py | 8 ++--- tests/python/pytorch/graphbolt/test_base.py | 28 +++++++++++++++ 6 files changed, 84 insertions(+), 46 deletions(-) diff --git a/docs/source/api/python/dgl.graphbolt.rst b/docs/source/api/python/dgl.graphbolt.rst index 156b3a5712d7..ba7c7e129d9c 100644 --- a/docs/source/api/python/dgl.graphbolt.rst +++ b/docs/source/api/python/dgl.graphbolt.rst @@ -187,6 +187,7 @@ Utilities etype_tuple_to_str isin seed + index_select expand_indptr add_reverse_edges exclude_seed_edges diff --git a/examples/sampling/graphbolt/link_prediction.py b/examples/sampling/graphbolt/link_prediction.py index 20e169b570a5..bc01ac171439 100644 --- a/examples/sampling/graphbolt/link_prediction.py +++ b/examples/sampling/graphbolt/link_prediction.py @@ -79,14 +79,10 @@ def forward(self, blocks, x): hidden_x = F.relu(hidden_x) return hidden_x - def inference(self, graph, features, dataloader, device): + def inference(self, graph, features, dataloader, storage_device): """Conduct layer-wise inference to get all the node embeddings.""" - feature = features.read("node", None, "feat") - - buffer_device = torch.device("cpu") - # Enable pin_memory for faster CPU to GPU data transfer if the - # model is running on a GPU. - pin_memory = buffer_device != device + pin_memory = storage_device == "pinned" + buffer_device = torch.device("cpu" if pin_memory else storage_device) print("Start node embedding inference.") for layer_idx, layer in enumerate(self.layers): @@ -99,17 +95,17 @@ def inference(self, graph, features, dataloader, device): device=buffer_device, pin_memory=pin_memory, ) - feature = feature.to(device) - for step, data in tqdm.tqdm(enumerate(dataloader)): - x = feature[data.input_nodes] - hidden_x = layer(data.blocks[0], x) # len(blocks) = 1 + for data in tqdm.tqdm(dataloader): + # len(blocks) = 1 + hidden_x = layer(data.blocks[0], data.node_features["feat"]) if not is_last_layer: hidden_x = F.relu(hidden_x) # By design, our seed nodes are contiguous. y[data.seed_nodes[0] : data.seed_nodes[-1] + 1] = hidden_x.to( buffer_device, non_blocking=True ) - feature = y + if not is_last_layer: + features.update("node", None, "feat", y) return y @@ -185,7 +181,9 @@ def create_dataloader(args, graph, features, itemset, is_train=True): # [Role]: # Initialize a neighbor sampler for sampling the neighborhoods of nodes. ############################################################################ - datapipe = datapipe.sample_neighbor(graph, args.fanout) + datapipe = datapipe.sample_neighbor( + graph, args.fanout if is_train else [-1] + ) ############################################################################ # [Input]: @@ -213,12 +211,9 @@ def create_dataloader(args, graph, features, itemset, is_train=True): # A FeatureFetcher object to fetch node features. # [Role]: # Initialize a feature fetcher for fetching features of the sampled - # subgraphs. This step is skipped in evaluation/inference because features - # are updated as a whole during it, thus storing features in minibatch is - # unnecessary. + # subgraphs. 
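+    # (Editor's note: the fetcher is now kept for evaluation/inference too,
+    # since layer-wise inference reads node features from each minibatch
+    # instead of materializing the whole feature tensor up front.)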
############################################################################ - if is_train: - datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) + datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) ############################################################################ # [Input]: @@ -286,15 +281,12 @@ def evaluate(args, model, graph, features, all_nodes_set, valid_set, test_set): model.eval() evaluator = Evaluator(name="ogbl-citation2") - # Since we need to use all neghborhoods for evaluation, we set the fanout - # to -1. - args.fanout = [-1] dataloader = create_dataloader( args, graph, features, all_nodes_set, is_train=False ) # Compute node embeddings for the entire graph. - node_emb = model.inference(graph, features, dataloader, args.device) + node_emb = model.inference(graph, features, dataloader, args.storage_device) results = [] # Loop over both validation and test sets. diff --git a/examples/sampling/graphbolt/node_classification.py b/examples/sampling/graphbolt/node_classification.py index c8eaf9a47f79..e5496e23a567 100644 --- a/examples/sampling/graphbolt/node_classification.py +++ b/examples/sampling/graphbolt/node_classification.py @@ -131,11 +131,9 @@ def create_dataloader( # A FeatureFetcher object to fetch node features. # [Role]: # Initialize a feature fetcher for fetching features of the sampled - # subgraphs. This step is skipped in inference because features are updated - # as a whole during it, thus storing features in minibatch is unnecessary. + # subgraphs. ############################################################################ - if job != "infer": - datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) + datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) ############################################################################ # [Step-5]: @@ -194,14 +192,10 @@ def forward(self, blocks, x): hidden_x = self.dropout(hidden_x) return hidden_x - def inference(self, graph, features, dataloader, device): + def inference(self, graph, features, dataloader, storage_device): """Conduct layer-wise inference to get all the node embeddings.""" - feature = features.read("node", None, "feat") - - buffer_device = torch.device("cpu") - # Enable pin_memory for faster CPU to GPU data transfer if the - # model is running on a GPU. 
- pin_memory = buffer_device != device + pin_memory = storage_device == "pinned" + buffer_device = torch.device("cpu" if pin_memory else storage_device) for layer_idx, layer in enumerate(self.layers): is_last_layer = layer_idx == len(self.layers) - 1 @@ -213,11 +207,9 @@ def inference(self, graph, features, dataloader, device): device=buffer_device, pin_memory=pin_memory, ) - feature = feature.to(device) - - for step, data in tqdm(enumerate(dataloader)): - x = feature[data.input_nodes] - hidden_x = layer(data.blocks[0], x) # len(blocks) = 1 + for data in tqdm(dataloader): + # len(blocks) = 1 + hidden_x = layer(data.blocks[0], data.node_features["feat"]) if not is_last_layer: hidden_x = F.relu(hidden_x) hidden_x = self.dropout(hidden_x) @@ -225,7 +217,8 @@ def inference(self, graph, features, dataloader, device): y[data.seed_nodes[0] : data.seed_nodes[-1] + 1] = hidden_x.to( buffer_device ) - feature = y + if not is_last_layer: + features.update("node", None, "feat", y) return y @@ -245,7 +238,7 @@ def layerwise_infer( num_workers=args.num_workers, job="infer", ) - pred = model.inference(graph, features, dataloader, args.device) + pred = model.inference(graph, features, dataloader, args.storage_device) pred = pred[test_set._items[0]] label = test_set._items[1].to(pred.device) diff --git a/python/dgl/graphbolt/base.py b/python/dgl/graphbolt/base.py index f9086ef1e888..77f72660832c 100644 --- a/python/dgl/graphbolt/base.py +++ b/python/dgl/graphbolt/base.py @@ -15,6 +15,7 @@ "etype_tuple_to_str", "CopyTo", "isin", + "index_select", "expand_indptr", "CSCFormatBase", "seed", @@ -102,6 +103,33 @@ def expand_indptr(indptr, dtype=None, node_ids=None, output_size=None): ) +def index_select(tensor, index): + """Returns a new tensor which indexes the input tensor along dimension dim + using the entries in index. + + The returned tensor has the same number of dimensions as the original tensor + (tensor). The first dimension has the same size as the length of index; + other dimensions have the same size as in the original tensor. + + When tensor is a pinned tensor and index.is_cuda is True, the operation runs + on the CUDA device and the returned tensor will also be on CUDA. + + Parameters + ---------- + tensor : torch.Tensor + The input tensor. + index : torch.Tensor + The 1-D tensor containing the indices to index. + + Returns + ------- + torch.Tensor + The indexed input tensor, equivalent to tensor[index]. + """ + assert index.dim() == 1, "Index should be 1D tensor." + return torch.ops.graphbolt.index_select(tensor, index) + + def etype_tuple_to_str(c_etype): """Convert canonical etype from tuple to string. diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 0799c93ea93a..577e29b7325b 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -7,6 +7,7 @@ import numpy as np import torch +from ..base import index_select from ..feature_store import Feature from .basic_feature_store import BasicFeatureStore from .ondisk_metadata import OnDiskFeatureData @@ -117,7 +118,7 @@ def read(self, ids: torch.Tensor = None): if self._tensor.is_pinned(): return self._tensor.cuda() return self._tensor - return torch.ops.graphbolt.index_select(self._tensor, ids) + return index_select(self._tensor, ids) def size(self): """Get the size of the feature. @@ -144,11 +145,6 @@ def update(self, value: torch.Tensor, ids: torch.Tensor = None): updated. 
""" if ids is None: - assert self.size() == value.size()[1:], ( - f"ids is None, so the entire feature will be updated. " - f"But the size of the feature is {self.size()}, " - f"while the size of the value is {value.size()[1:]}." - ) self._tensor = value else: assert ids.shape[0] == value.shape[0], ( diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py index b25b28166294..5d7d6c477c33 100644 --- a/tests/python/pytorch/graphbolt/test_base.py +++ b/tests/python/pytorch/graphbolt/test_base.py @@ -250,6 +250,34 @@ def test_isin_non_1D_dim(): gb.isin(elements, test_elements) +@pytest.mark.parametrize( + "dtype", + [ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.float16, + torch.bfloat16, + torch.float32, + torch.float64, + ], +) +@pytest.mark.parametrize("idtype", [torch.int32, torch.int64]) +@pytest.mark.parametrize("pinned", [False, True]) +def test_index_select(dtype, idtype, pinned): + if F._default_context_str != "gpu" and pinned: + pytest.skip("Pinned tests are available only on GPU.") + tensor = torch.tensor([[2, 3], [5, 5], [20, 13]], dtype=dtype) + tensor = tensor.pin_memory() if pinned else tensor.to(F.ctx()) + index = torch.tensor([0, 2], dtype=idtype, device=F.ctx()) + gb_result = gb.index_select(tensor, index) + torch_result = tensor.to(F.ctx())[index.long()] + assert torch.equal(torch_result, gb_result) + + def torch_expand_indptr(indptr, dtype, nodes=None): if nodes is None: nodes = torch.arange(len(indptr) - 1, dtype=dtype, device=indptr.device) From f3af2a9fd4666ce16af87750da4c5b32d0eb7f05 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Mon, 5 Feb 2024 09:48:20 +0800 Subject: [PATCH 21/45] [DistGB] return eids together with etype_ids in sampling (#7084) --- python/dgl/distributed/graph_services.py | 158 ++++++++++++------ .../distributed/test_distributed_sampling.py | 90 +++++++--- 2 files changed, 171 insertions(+), 77 deletions(-) diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 4dea7c206691..76be905a054f 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -6,7 +6,7 @@ import torch from .. import backend as F, graphbolt as gb -from ..base import EID, NID +from ..base import EID, ETYPE, NID from ..convert import graph, heterograph from ..sampling import ( sample_etype_neighbors as local_sample_etype_neighbors, @@ -40,16 +40,29 @@ class SubgraphResponse(Response): """The response for sampling and in_subgraph""" - def __init__(self, global_src, global_dst, global_eids): + def __init__( + self, global_src, global_dst, *, global_eids=None, etype_ids=None + ): self.global_src = global_src self.global_dst = global_dst self.global_eids = global_eids + self.etype_ids = etype_ids def __setstate__(self, state): - self.global_src, self.global_dst, self.global_eids = state + ( + self.global_src, + self.global_dst, + self.global_eids, + self.etype_ids, + ) = state def __getstate__(self): - return self.global_src, self.global_dst, self.global_eids + return ( + self.global_src, + self.global_dst, + self.global_eids, + self.etype_ids, + ) class FindEdgeResponse(Response): @@ -68,7 +81,7 @@ def __getstate__(self): def _sample_neighbors_graphbolt( - g, gpb, nodes, fanout, prob=None, replace=False + g, gpb, nodes, fanout, edge_dir="in", prob=None, replace=False ): """Sample from local partition via graphbolt. 
@@ -77,8 +90,6 @@ def _sample_neighbors_graphbolt(
     space again. The sampled results are stored in vectors that store
     source nodes, destination nodes, etype IDs and edge IDs.
 
-    [Rui][TODO] edge IDs are not returned as not supported yet.
-
     Parameters
     ----------
     g : FusedCSCSamplingGraph
         The local partition.
@@ -89,6 +100,8 @@ def _sample_neighbors_graphbolt(
         The nodes to sample neighbors from.
     fanout : tensor or int
         The number of edges to be sampled for each node.
+    edge_dir : str, optional
+        Determines whether to sample inbound or outbound edges.
     prob : tensor, optional
         The probability associated with each neighboring edge of a node.
     replace : bool, optional
@@ -100,11 +113,15 @@ def _sample_neighbors_graphbolt(
         The source node ID array.
     tensor
         The destination node ID array.
-    tensor
-        The edge type ID array.
     tensor
         The edge ID array.
+    tensor
+        The edge type ID array.
     """
+    assert (
+        edge_dir == "in"
+    ), f"GraphBolt only supports inbound edge sampling but got {edge_dir}."
+
     # 1. Map global node IDs to local node IDs.
     nodes = gpb.nid2localnid(nodes, gpb.partid)
 
@@ -139,11 +156,20 @@ def _sample_neighbors_graphbolt(
     global_src = global_nid_mapping[local_src]
     global_dst = global_nid_mapping[local_dst]
 
-    return global_src, global_dst, subgraph.type_per_edge
+    # [Rui][TODO] edge IDs are not supported yet.
+    return LocalSampledGraph(
+        global_src, global_dst, None, subgraph.type_per_edge
+    )
 
 
-def _sample_neighbors(
-    local_g, partition_book, seed_nodes, fan_out, edge_dir, prob, replace
+def _sample_neighbors_dgl(
+    local_g,
+    partition_book,
+    seed_nodes,
+    fan_out,
+    edge_dir="in",
+    prob=None,
+    replace=False,
 ):
     """Sample from local partition.
 
@@ -170,7 +196,38 @@ def _sample_neighbors(
         global_nid_mapping, src
     ), F.gather_row(global_nid_mapping, dst)
     global_eids = F.gather_row(local_g.edata[EID], sampled_graph.edata[EID])
-    return global_src, global_dst, global_eids
+    return LocalSampledGraph(global_src, global_dst, global_eids)
+
+
+def _sample_neighbors(use_graphbolt, *args, **kwargs):
+    """Wrapper for sampling neighbors.
+
+    The actual sampling function depends on whether to use GraphBolt.
+
+    Parameters
+    ----------
+    use_graphbolt : bool
+        Whether to use GraphBolt for sampling.
+    args : list
+        The arguments for the sampling function.
+    kwargs : dict
+        The keyword arguments for the sampling function.
+
+    Returns
+    -------
+    tensor
+        The source node ID array.
+    tensor
+        The destination node ID array.
+    tensor
+        The edge ID array.
+    tensor
+        The edge type ID array.
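+
+    Examples
+    --------
+    Editor's sketch, illustrative only (`local_g`, `gpb`, `seeds` and
+    `fanout` are assumed to be in scope):
+
+    >>> res = _sample_neighbors(True, local_g, gpb, seeds, fanout)
+    >>> src, dst, etypes = res.global_src, res.global_dst, res.etype_ids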
+ """ + func = ( + _sample_neighbors_graphbolt if use_graphbolt else _sample_neighbors_dgl + ) + return func(*args, **kwargs) def _sample_etype_neighbors( @@ -211,7 +268,7 @@ def _sample_etype_neighbors( global_nid_mapping, src ), F.gather_row(global_nid_mapping, dst) global_eids = F.gather_row(local_g.edata[EID], sampled_graph.edata[EID]) - return global_src, global_dst, global_eids + return LocalSampledGraph(global_src, global_dst, global_eids) def _find_edges(local_g, partition_book, seed_edges): @@ -257,7 +314,7 @@ def _in_subgraph(local_g, partition_book, seed_nodes): src, dst = sampled_graph.edges() global_src, global_dst = global_nid_mapping[src], global_nid_mapping[dst] global_eids = F.gather_row(local_g.edata[EID], sampled_graph.edata[EID]) - return global_src, global_dst, global_eids + return LocalSampledGraph(global_src, global_dst, global_eids) # --- NOTE 1 --- @@ -333,26 +390,22 @@ def process_request(self, server_state): prob = [kv_store.data_store[self.prob]] else: prob = None - if self.use_graphbolt: - global_src, global_dst, etype_ids = _sample_neighbors_graphbolt( - local_g, - partition_book, - self.seed_nodes, - self.fan_out, - prob, - self.replace, - ) - return SubgraphResponse(global_src, global_dst, etype_ids) - global_src, global_dst, global_eids = _sample_neighbors( + res = _sample_neighbors( + self.use_graphbolt, local_g, partition_book, self.seed_nodes, self.fan_out, - self.edge_dir, - prob, - self.replace, + edge_dir=self.edge_dir, + prob=prob, + replace=self.replace, + ) + return SubgraphResponse( + res.global_src, + res.global_dst, + global_eids=res.global_eids, + etype_ids=res.etype_ids, ) - return SubgraphResponse(global_src, global_dst, global_eids) class SamplingRequestEtype(Request): @@ -407,7 +460,7 @@ def process_request(self, server_state): ] else: probs = None - global_src, global_dst, global_eids = _sample_etype_neighbors( + res = _sample_etype_neighbors( local_g, partition_book, self.seed_nodes, @@ -418,7 +471,12 @@ def process_request(self, server_state): self.replace, self.etype_sorted, ) - return SubgraphResponse(global_src, global_dst, global_eids) + return SubgraphResponse( + res.global_src, + res.global_dst, + global_eids=res.global_eids, + etype_ids=res.etype_ids, + ) class EdgesRequest(Request): @@ -532,7 +590,7 @@ def process_request(self, server_state): global_src, global_dst, global_eids = _in_subgraph( local_g, partition_book, self.seed_nodes ) - return SubgraphResponse(global_src, global_dst, global_eids) + return SubgraphResponse(global_src, global_dst, global_eids=global_eids) def merge_graphs(res_list, num_nodes): @@ -541,25 +599,33 @@ def merge_graphs(res_list, num_nodes): srcs = [] dsts = [] eids = [] + etype_ids = [] for res in res_list: srcs.append(res.global_src) dsts.append(res.global_dst) eids.append(res.global_eids) + etype_ids.append(res.etype_ids) src_tensor = F.cat(srcs, 0) dst_tensor = F.cat(dsts, 0) eid_tensor = None if eids[0] is None else F.cat(eids, 0) + etype_id_tensor = None if etype_ids[0] is None else F.cat(etype_ids, 0) else: src_tensor = res_list[0].global_src dst_tensor = res_list[0].global_dst eid_tensor = res_list[0].global_eids + etype_id_tensor = res_list[0].etype_ids g = graph((src_tensor, dst_tensor), num_nodes=num_nodes) if eid_tensor is not None: g.edata[EID] = eid_tensor + if etype_id_tensor is not None: + g.edata[ETYPE] = etype_id_tensor return g -LocalSampledGraph = namedtuple( - "LocalSampledGraph", "global_src global_dst global_eids" +LocalSampledGraph = namedtuple( # pylint: 
disable=unexpected-keyword-arg + "LocalSampledGraph", + "global_src global_dst global_eids etype_ids", + defaults=(None, None, None, None), ) @@ -615,10 +681,8 @@ def _distributed_access(g, nodes, issue_remote_req, local_access): # sample neighbors for the nodes in the local partition. res_list = [] if local_nids is not None: - src, dst, eids = local_access( - g.local_partition, partition_book, local_nids - ) - res_list.append(LocalSampledGraph(src, dst, eids)) + res = local_access(g.local_partition, partition_book, local_nids) + res_list.append(res) # receive responses from remote machines. if msgseq2pos is not None: @@ -916,23 +980,15 @@ def issue_remote_req(node_ids): def local_access(local_g, partition_book, local_nids): # See NOTE 1 _prob = [g.edata[prob].local_partition] if prob is not None else None - if use_graphbolt: - return _sample_neighbors_graphbolt( - local_g, - partition_book, - local_nids, - fanout, - prob=_prob, - replace=replace, - ) return _sample_neighbors( + use_graphbolt, local_g, partition_book, local_nids, fanout, - edge_dir, - _prob, - replace, + edge_dir=edge_dir, + prob=_prob, + replace=replace, ) frontier = _distributed_access(g, nodes, issue_remote_req, local_access) diff --git a/tests/distributed/test_distributed_sampling.py b/tests/distributed/test_distributed_sampling.py index c8d51b398846..7d10dc43f122 100644 --- a/tests/distributed/test_distributed_sampling.py +++ b/tests/distributed/test_distributed_sampling.py @@ -1,7 +1,7 @@ import multiprocessing as mp import os import random -import sys +import tempfile import time import traceback import unittest @@ -1013,47 +1013,85 @@ def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): assert np.all(F.asnumpy(orig_dst1) == orig_dst) -# Wait non shared memory graph store -@unittest.skipIf(os.name == "nt", reason="Do not support windows yet") -@unittest.skipIf( - dgl.backend.backend_name == "tensorflow", - reason="Not support tensorflow for now", -) -@unittest.skipIf( - dgl.backend.backend_name == "mxnet", reason="Turn off Mxnet support" -) @pytest.mark.parametrize("num_server", [1]) -def test_rpc_sampling_shuffle(num_server): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +def test_rpc_sampling_shuffle(num_server, use_graphbolt): reset_envs() - import tempfile - os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_sampling_shuffle( - Path(tmpdirname), num_server, use_graphbolt=True + Path(tmpdirname), num_server, use_graphbolt=use_graphbolt ) - check_rpc_sampling_shuffle(Path(tmpdirname), num_server) - # [TODO][Rhett] Tests for multiple groups may fail sometimes and - # root cause is unknown. Let's disable them for now. 
- # check_rpc_sampling_shuffle(Path(tmpdirname), num_server, num_groups=2) + + +@pytest.mark.parametrize("num_server", [1]) +def test_rpc_hetero_sampling_shuffle(num_server): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server) + + +@pytest.mark.parametrize("num_server", [1]) +def test_rpc_hetero_sampling_empty_shuffle(num_server): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_hetero_sampling_empty_shuffle(Path(tmpdirname), num_server) - check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server) - check_rpc_hetero_etype_sampling_shuffle( - Path(tmpdirname), num_server, ["csc"] - ) - check_rpc_hetero_etype_sampling_shuffle( - Path(tmpdirname), num_server, ["csr"] - ) + + +@pytest.mark.parametrize("num_server", [1]) +@pytest.mark.parametrize( + "graph_formats", [None, ["csc"], ["csr"], ["csc", "coo"]] +) +def test_rpc_hetero_etype_sampling_shuffle(num_server, graph_formats): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_hetero_etype_sampling_shuffle( - Path(tmpdirname), num_server, ["csc", "coo"] + Path(tmpdirname), num_server, graph_formats=graph_formats ) + + +@pytest.mark.parametrize("num_server", [1]) +def test_rpc_hetero_etype_sampling_empty_shuffle(num_server): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_hetero_etype_sampling_empty_shuffle( Path(tmpdirname), num_server ) + + +@pytest.mark.parametrize("num_server", [1]) +def test_rpc_bipartite_sampling_empty_shuffle(num_server): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_bipartite_sampling_empty(Path(tmpdirname), num_server) + + +@pytest.mark.parametrize("num_server", [1]) +def test_rpc_bipartite_sampling_shuffle(num_server): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_bipartite_sampling_shuffle(Path(tmpdirname), num_server) + + +@pytest.mark.parametrize("num_server", [1]) +def test_rpc_bipartite_etype_sampling_empty_shuffle(num_server): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_bipartite_etype_sampling_empty(Path(tmpdirname), num_server) + + +@pytest.mark.parametrize("num_server", [1]) +def test_rpc_bipartite_etype_sampling_shuffle(num_server): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_bipartite_etype_sampling_shuffle(Path(tmpdirname), num_server) From 4b265390f45e6aa8b40d8b090c4c94ffc5402cdc Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 5 Feb 2024 07:06:44 +0300 Subject: [PATCH 22/45] [GraphBolt][CUDA] Fix link prediction early-stop. 
(#7083) --- examples/sampling/graphbolt/link_prediction.py | 2 ++ python/dgl/graphbolt/feature_fetcher.py | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/sampling/graphbolt/link_prediction.py b/examples/sampling/graphbolt/link_prediction.py index bc01ac171439..0794f79ceb8b 100644 --- a/examples/sampling/graphbolt/link_prediction.py +++ b/examples/sampling/graphbolt/link_prediction.py @@ -332,6 +332,8 @@ def train(args, model, graph, features, train_set): total_loss += loss.item() if step + 1 == args.early_stop: + # Early stopping requires a new dataloader to reset its state. + dataloader = create_dataloader(args, graph, features, train_set) break end_epoch_time = time.time() diff --git a/python/dgl/graphbolt/feature_fetcher.py b/python/dgl/graphbolt/feature_fetcher.py index 7b94d1e1b3d8..01ff25af8c15 100644 --- a/python/dgl/graphbolt/feature_fetcher.py +++ b/python/dgl/graphbolt/feature_fetcher.py @@ -174,10 +174,5 @@ def _read(self, data): with torch.cuda.stream(self.stream): data = self._read_data(data, current_stream) if self.stream is not None: - event = torch.cuda.current_stream().record_event() - - def _wait(): - event.wait() - - data.wait = _wait + data.wait = torch.cuda.current_stream().record_event().wait return data From badeaf19dc3d8c888aa8aed3fac6b0384858c559 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 5 Feb 2024 07:20:15 +0300 Subject: [PATCH 23/45] [GraphBolt][CUDA] Pipelined sampling optimization (#7039) --- python/dgl/graphbolt/base.py | 75 +++++++++ python/dgl/graphbolt/dataloader.py | 87 ++++------ python/dgl/graphbolt/impl/neighbor_sampler.py | 152 +++++++++++++++++- python/dgl/graphbolt/minibatch_transformer.py | 8 +- python/dgl/graphbolt/subgraph_sampler.py | 6 +- .../graphbolt/impl/test_neighbor_sampler.py | 73 +++++++++ .../pytorch/graphbolt/test_dataloader.py | 26 ++- 7 files changed, 352 insertions(+), 75 deletions(-) create mode 100644 tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py diff --git a/python/dgl/graphbolt/base.py b/python/dgl/graphbolt/base.py index 77f72660832c..a32357cea760 100644 --- a/python/dgl/graphbolt/base.py +++ b/python/dgl/graphbolt/base.py @@ -1,5 +1,6 @@ """Base types and utilities for Graph Bolt.""" +from collections import deque from dataclasses import dataclass import torch @@ -14,6 +15,10 @@ "etype_str_to_tuple", "etype_tuple_to_str", "CopyTo", + "FutureWaiter", + "Waiter", + "Bufferer", + "EndMarker", "isin", "index_select", "expand_indptr", @@ -247,6 +252,76 @@ def __iter__(self): yield data +@functional_datapipe("mark_end") +class EndMarker(IterDataPipe): + """Used to mark the end of a datapipe and is a no-op.""" + + def __init__(self, datapipe): + self.datapipe = datapipe + + def __iter__(self): + yield from self.datapipe + + +@functional_datapipe("buffer") +class Bufferer(IterDataPipe): + """Buffers items before yielding them. + + Parameters + ---------- + datapipe : DataPipe + The data pipeline. + buffer_size : int, optional + The size of the buffer which stores the fetched samples. If data coming + from datapipe has latency spikes, consider setting to a higher value. + Default is 1. + """ + + def __init__(self, datapipe, buffer_size=1): + self.datapipe = datapipe + if buffer_size <= 0: + raise ValueError( + "'buffer_size' is required to be a positive integer." 
+ ) + self.buffer = deque(maxlen=buffer_size) + + def __iter__(self): + for data in self.datapipe: + if len(self.buffer) < self.buffer.maxlen: + self.buffer.append(data) + else: + return_data = self.buffer.popleft() + self.buffer.append(data) + yield return_data + while len(self.buffer) > 0: + yield self.buffer.popleft() + + +@functional_datapipe("wait") +class Waiter(IterDataPipe): + """Calls the wait function of all items.""" + + def __init__(self, datapipe): + self.datapipe = datapipe + + def __iter__(self): + for data in self.datapipe: + data.wait() + yield data + + +@functional_datapipe("wait_future") +class FutureWaiter(IterDataPipe): + """Calls the result function of all items and returns their results.""" + + def __init__(self, datapipe): + self.datapipe = datapipe + + def __iter__(self): + for data in self.datapipe: + yield data.result() + + @dataclass class CSCFormatBase: r"""Basic class representing data in Compressed Sparse Column (CSC) format. diff --git a/python/dgl/graphbolt/dataloader.py b/python/dgl/graphbolt/dataloader.py index b0dd9daccfaf..cffb24070a06 100644 --- a/python/dgl/graphbolt/dataloader.py +++ b/python/dgl/graphbolt/dataloader.py @@ -1,6 +1,6 @@ """Graph Bolt DataLoaders""" -from collections import deque +from concurrent.futures import ThreadPoolExecutor import torch import torch.utils.data @@ -9,6 +9,7 @@ from .base import CopyTo from .feature_fetcher import FeatureFetcher +from .impl.neighbor_sampler import SamplePerLayer from .internal import datapipe_graph_to_adjlist from .item_sampler import ItemSampler @@ -16,8 +17,6 @@ __all__ = [ "DataLoader", - "Awaiter", - "Bufferer", ] @@ -40,61 +39,6 @@ def _find_and_wrap_parent(datapipe_graph, target_datapipe, wrapper, **kwargs): return datapipe_graph -class EndMarker(dp.iter.IterDataPipe): - """Used to mark the end of a datapipe and is a no-op.""" - - def __init__(self, datapipe): - self.datapipe = datapipe - - def __iter__(self): - yield from self.datapipe - - -class Bufferer(dp.iter.IterDataPipe): - """Buffers items before yielding them. - - Parameters - ---------- - datapipe : DataPipe - The data pipeline. - buffer_size : int, optional - The size of the buffer which stores the fetched samples. If data coming - from datapipe has latency spikes, consider setting to a higher value. - Default is 1. - """ - - def __init__(self, datapipe, buffer_size=1): - self.datapipe = datapipe - if buffer_size <= 0: - raise ValueError( - "'buffer_size' is required to be a positive integer." - ) - self.buffer = deque(maxlen=buffer_size) - - def __iter__(self): - for data in self.datapipe: - if len(self.buffer) < self.buffer.maxlen: - self.buffer.append(data) - else: - return_data = self.buffer.popleft() - self.buffer.append(data) - yield return_data - while len(self.buffer) > 0: - yield self.buffer.popleft() - - -class Awaiter(dp.iter.IterDataPipe): - """Calls the wait function of all items.""" - - def __init__(self, datapipe): - self.datapipe = datapipe - - def __iter__(self): - for data in self.datapipe: - data.wait() - yield data - - class MultiprocessingWrapper(dp.iter.IterDataPipe): """Wraps a datapipe with multiprocessing. @@ -156,6 +100,10 @@ class DataLoader(torch.utils.data.DataLoader): If True, the data loader will overlap the UVA feature fetcher operations with the rest of operations by using an alternative CUDA stream. Default is True. + overlap_graph_fetch : bool, optional + If True, the data loader will overlap the UVA graph fetching operations + with the rest of operations by using an alternative CUDA stream. 
Default
+        is False.
     max_uva_threads : int, optional
         Limits the number of CUDA threads used for UVA copies so that the rest
         of the computations can run simultaneously with it. Setting it to a too
@@ -170,6 +118,7 @@ def __init__(
         num_workers=0,
         persistent_workers=True,
         overlap_feature_fetch=True,
+        overlap_graph_fetch=False,
         max_uva_threads=6144,
     ):
         # Multiprocessing requires two modifications to the datapipe:
@@ -179,7 +128,7 @@ def __init__(
         # 2. Cut the datapipe at FeatureFetcher, and wrap the inner datapipe
         #    of the FeatureFetcher with a multiprocessing PyTorch DataLoader.
 
-        datapipe = EndMarker(datapipe)
+        datapipe = datapipe.mark_end()
         datapipe_graph = dp_utils.traverse_dps(datapipe)
 
         # (1) Insert minibatch distribution.
@@ -223,7 +172,25 @@ def __init__(
             datapipe_graph = dp_utils.replace_dp(
                 datapipe_graph,
                 feature_fetcher,
-                Awaiter(Bufferer(feature_fetcher, buffer_size=1)),
+                feature_fetcher.buffer(1).wait(),
+            )
+
+        if (
+            overlap_graph_fetch
+            and num_workers == 0
+            and torch.cuda.is_available()
+        ):
+            torch.ops.graphbolt.set_max_uva_threads(max_uva_threads)
+            samplers = dp_utils.find_dps(
+                datapipe_graph,
+                SamplePerLayer,
+            )
+            executor = ThreadPoolExecutor(max_workers=1)
+            for sampler in samplers:
+                datapipe_graph = dp_utils.replace_dp(
+                    datapipe_graph,
+                    sampler,
+                    sampler.fetch_and_sample(_get_uva_stream(), executor, 1),
             )
 
         # (4) Cut datapipe at CopyTo and wrap with prefetcher. This enables the
diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py
index 605da8ff5ce3..887b1864d83f 100644
--- a/python/dgl/graphbolt/impl/neighbor_sampler.py
+++ b/python/dgl/graphbolt/impl/neighbor_sampler.py
@@ -1,18 +1,152 @@
 """Neighbor subgraph samplers for GraphBolt."""
 
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 
 import torch
 from torch.utils.data import functional_datapipe
+from torchdata.datapipes.iter import Mapper
 
 from ..internal import compact_csc_format, unique_and_compact_csc_formats
 from ..minibatch_transformer import MiniBatchTransformer
 
 from ..subgraph_sampler import SubgraphSampler
+from .fused_csc_sampling_graph import fused_csc_sampling_graph
 from .sampled_subgraph_impl import SampledSubgraphImpl
 
 
-__all__ = ["NeighborSampler", "LayerNeighborSampler"]
+__all__ = [
+    "NeighborSampler",
+    "LayerNeighborSampler",
+    "SamplePerLayer",
+    "SamplePerLayerFromFetchedSubgraph",
+    "FetchInsubgraphData",
+]
+
+
+@functional_datapipe("fetch_insubgraph_data")
+class FetchInsubgraphData(Mapper):
+    """Fetches the insubgraph and wraps it in a FusedCSCSamplingGraph object. If
+    the provided sample_per_layer_obj has a valid prob_name, then it reads the
+    probabilities of all the fetched edges. Furthermore, if the type_per_edge
+    tensor exists in the underlying graph, then the types of all the fetched
+    edges are read as well."""
+
+    def __init__(
+        self, datapipe, sample_per_layer_obj, stream=None, executor=None
+    ):
+        super().__init__(datapipe, self._fetch_per_layer)
+        self.graph = sample_per_layer_obj.sampler.__self__
+        self.prob_name = sample_per_layer_obj.prob_name
+        self.stream = stream
+        if executor is None:
+            self.executor = ThreadPoolExecutor(max_workers=1)
+        else:
+            self.executor = executor
+
+    def _fetch_per_layer_impl(self, minibatch, stream):
+        with torch.cuda.stream(self.stream):
+            index = minibatch._seed_nodes
+            if isinstance(index, dict):
+                index = self.graph._convert_to_homogeneous_nodes(index)
+
+            index, original_positions = index.sort()
+            if (original_positions.diff() == 1).all().item():  # is_sorted
+                minibatch._subgraph_seed_nodes = None
+            else:
+                minibatch._subgraph_seed_nodes = original_positions
+            index.record_stream(torch.cuda.current_stream())
+            index_select_csc_with_indptr = partial(
+                torch.ops.graphbolt.index_select_csc, self.graph.csc_indptr
+            )
+
+            def record_stream(tensor):
+                if stream is not None and tensor.is_cuda:
+                    tensor.record_stream(stream)
+
+            indptr, indices = index_select_csc_with_indptr(
+                self.graph.indices, index, None
+            )
+            record_stream(indptr)
+            record_stream(indices)
+            output_size = len(indices)
+            if self.graph.type_per_edge is not None:
+                _, type_per_edge = index_select_csc_with_indptr(
+                    self.graph.type_per_edge, index, output_size
+                )
+                record_stream(type_per_edge)
+            else:
+                type_per_edge = None
+            if self.graph.edge_attributes is not None:
+                probs_or_mask = self.graph.edge_attributes.get(
+                    self.prob_name, None
+                )
+                if probs_or_mask is not None:
+                    _, probs_or_mask = index_select_csc_with_indptr(
+                        probs_or_mask, index, output_size
+                    )
+                    record_stream(probs_or_mask)
+            else:
+                probs_or_mask = None
+            if self.graph.node_type_offset is not None:
+                node_type_offset = torch.searchsorted(
+                    index, self.graph.node_type_offset
+                )
+            else:
+                node_type_offset = None
+            subgraph = fused_csc_sampling_graph(
+                indptr,
+                indices,
+                node_type_offset=node_type_offset,
+                type_per_edge=type_per_edge,
+                node_type_to_id=self.graph.node_type_to_id,
+                edge_type_to_id=self.graph.edge_type_to_id,
+            )
+            if self.prob_name is not None and probs_or_mask is not None:
+                subgraph.edge_attributes = {self.prob_name: probs_or_mask}
+
+            minibatch.sampled_subgraphs.insert(0, subgraph)
+
+            if self.stream is not None:
+                minibatch.wait = torch.cuda.current_stream().record_event().wait
+
+        return minibatch
+
+    def _fetch_per_layer(self, minibatch):
+        current_stream = None
+        if self.stream is not None:
+            current_stream = torch.cuda.current_stream()
+            self.stream.wait_stream(current_stream)
+        return self.executor.submit(
+            self._fetch_per_layer_impl, minibatch, current_stream
+        )
+
+
+@functional_datapipe("sample_per_layer_from_fetched_subgraph")
+class SamplePerLayerFromFetchedSubgraph(MiniBatchTransformer):
+    """Sample neighbor edges from a graph for a single layer."""
+
+    def __init__(self, datapipe, sample_per_layer_obj):
+        super().__init__(datapipe, self._sample_per_layer_from_fetched_subgraph)
+        self.sampler_name = sample_per_layer_obj.sampler.__name__
+        self.fanout = sample_per_layer_obj.fanout
+        self.replace = sample_per_layer_obj.replace
+        self.prob_name = sample_per_layer_obj.prob_name
+
+    def _sample_per_layer_from_fetched_subgraph(self, minibatch):
+        subgraph = minibatch.sampled_subgraphs[0]
+
+        sampled_subgraph = getattr(subgraph, self.sampler_name)(
minibatch._subgraph_seed_nodes, + self.fanout, + self.replace, + self.prob_name, + ) + delattr(minibatch, "_subgraph_seed_nodes") + sampled_subgraph.original_column_node_ids = minibatch._seed_nodes + minibatch.sampled_subgraphs[0] = sampled_subgraph + + return minibatch @functional_datapipe("sample_per_layer") @@ -72,6 +206,19 @@ def _compact_per_layer(self, minibatch): return minibatch +@functional_datapipe("fetch_and_sample") +class FetcherAndSampler(MiniBatchTransformer): + """Overlapped graph sampling operation replacement.""" + + def __init__(self, sampler, stream, executor, buffer_size): + datapipe = sampler.datapipe.fetch_insubgraph_data( + sampler, stream, executor + ) + datapipe = datapipe.buffer(buffer_size).wait_future().wait() + datapipe = datapipe.sample_per_layer_from_fetched_subgraph(sampler) + super().__init__(datapipe) + + @functional_datapipe("sample_neighbor") class NeighborSampler(SubgraphSampler): # pylint: disable=abstract-method @@ -173,7 +320,8 @@ def __init__( datapipe, graph, fanouts, replace, prob_name, deduplicate, sampler ) - def _prepare(self, node_type_to_id, minibatch): + @staticmethod + def _prepare(node_type_to_id, minibatch): seeds = minibatch._seed_nodes # Enrich seeds with all node types. if isinstance(seeds, dict): diff --git a/python/dgl/graphbolt/minibatch_transformer.py b/python/dgl/graphbolt/minibatch_transformer.py index 8822f2ac6203..b7b00b7a1b29 100644 --- a/python/dgl/graphbolt/minibatch_transformer.py +++ b/python/dgl/graphbolt/minibatch_transformer.py @@ -29,10 +29,10 @@ class MiniBatchTransformer(Mapper): def __init__( self, datapipe, - transformer, + transformer=None, ): super().__init__(datapipe, self._transformer) - self.transformer = transformer + self.transformer = transformer or self._identity def _transformer(self, minibatch): minibatch = self.transformer(minibatch) @@ -40,3 +40,7 @@ def _transformer(self, minibatch): minibatch, (MiniBatch,) ), "The transformer output should be an instance of MiniBatch" return minibatch + + @staticmethod + def _identity(minibatch): + return minibatch diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index b05b8ca30619..ab7c969063c9 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -46,11 +46,7 @@ def __init__( datapipe = datapipe.transform(self._preprocess) datapipe = self.sampling_stages(datapipe, *args, **kwargs) datapipe = datapipe.transform(self._postprocess) - super().__init__(datapipe, self._identity) - - @staticmethod - def _identity(minibatch): - return minibatch + super().__init__(datapipe) @staticmethod def _postprocess(minibatch): diff --git a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py new file mode 100644 index 000000000000..f4f1084c8e5c --- /dev/null +++ b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py @@ -0,0 +1,73 @@ +import unittest +from functools import partial + +import backend as F + +import dgl +import dgl.graphbolt as gb +import pytest +import torch + + +def get_hetero_graph(): + # COO graph: + # [0, 0, 1, 1, 2, 2, 3, 3, 4, 4] + # [2, 4, 2, 3, 0, 1, 1, 0, 0, 1] + # [1, 1, 1, 1, 0, 0, 0, 0, 0] - > edge type. 
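+    # (node IDs 0-1 are type n1 and IDs 2-4 are type n2, per the
+    # node_type_offset below.)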
+ # num_nodes = 5, num_n1 = 2, num_n2 = 3 + ntypes = {"n1": 0, "n2": 1} + etypes = {"n1:e1:n2": 0, "n2:e2:n1": 1} + indptr = torch.LongTensor([0, 2, 4, 6, 8, 10]) + indices = torch.LongTensor([2, 4, 2, 3, 0, 1, 1, 0, 0, 1]) + type_per_edge = torch.LongTensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) + edge_attributes = { + "weight": torch.FloatTensor( + [2.5, 0, 8.4, 0, 0.4, 1.2, 2.5, 0, 8.4, 0.5] + ), + "mask": torch.BoolTensor([1, 0, 1, 0, 1, 1, 1, 0, 1, 1]), + } + node_type_offset = torch.LongTensor([0, 2, 5]) + return gb.fused_csc_sampling_graph( + indptr, + indices, + node_type_offset=node_type_offset, + type_per_edge=type_per_edge, + node_type_to_id=ntypes, + edge_type_to_id=etypes, + edge_attributes=edge_attributes, + ) + + +@unittest.skipIf(F._default_context_str != "gpu", reason="Enabled only on GPU.") +@pytest.mark.parametrize("hetero", [False, True]) +@pytest.mark.parametrize("prob_name", [None, "weight", "mask"]) +def test_NeighborSampler_GraphFetch(hetero, prob_name): + items = torch.arange(3) + names = "seed_nodes" + itemset = gb.ItemSet(items, names=names) + graph = get_hetero_graph().to(F.ctx()) + if hetero: + itemset = gb.ItemSetDict({"n2": itemset}) + else: + graph.type_per_edge = None + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + fanout = torch.LongTensor([2]) + datapipe = item_sampler.map(gb.SubgraphSampler._preprocess) + datapipe = datapipe.map( + partial(gb.NeighborSampler._prepare, graph.node_type_to_id) + ) + sample_per_layer = gb.SamplePerLayer( + datapipe, graph.sample_neighbors, fanout, False, prob_name + ) + compact_per_layer = sample_per_layer.compact_per_layer(True) + gb.seed(123) + expected_results = list(compact_per_layer) + datapipe = gb.FetchInsubgraphData(datapipe, sample_per_layer) + datapipe = datapipe.wait_future() + datapipe = gb.SamplePerLayerFromFetchedSubgraph(datapipe, sample_per_layer) + datapipe = datapipe.compact_per_layer(True) + gb.seed(123) + new_results = list(datapipe) + assert len(expected_results) == len(new_results) + for a, b in zip(expected_results, new_results): + assert repr(a) == repr(b) diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 80a8c7164a57..2ee78bf3be6a 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -47,11 +47,21 @@ def test_DataLoader(): F._default_context_str != "gpu", reason="This test requires the GPU.", ) -@pytest.mark.parametrize("overlap_feature_fetch", [True, False]) +@pytest.mark.parametrize( + "sampler_name", ["NeighborSampler", "LayerNeighborSampler"] +) @pytest.mark.parametrize("enable_feature_fetch", [True, False]) -def test_gpu_sampling_DataLoader(overlap_feature_fetch, enable_feature_fetch): +@pytest.mark.parametrize("overlap_feature_fetch", [True, False]) +@pytest.mark.parametrize("overlap_graph_fetch", [True, False]) +def test_gpu_sampling_DataLoader( + sampler_name, + enable_feature_fetch, + overlap_feature_fetch, + overlap_graph_fetch, +): N = 40 B = 4 + num_layers = 2 itemset = dgl.graphbolt.ItemSet(torch.arange(N), names="seed_nodes") graph = gb_test_utils.rand_csc_graph(200, 0.15, bidirection_edge=True).to( F.ctx() @@ -68,10 +78,10 @@ def test_gpu_sampling_DataLoader(overlap_feature_fetch, enable_feature_fetch): datapipe = dgl.graphbolt.ItemSampler(itemset, batch_size=B) datapipe = datapipe.copy_to(F.ctx(), extra_attrs=["seed_nodes"]) - datapipe = dgl.graphbolt.NeighborSampler( + datapipe = getattr(dgl.graphbolt, sampler_name)( datapipe, 
graph, - fanouts=[torch.LongTensor([2]) for _ in range(2)], + fanouts=[torch.LongTensor([2]) for _ in range(num_layers)], ) if enable_feature_fetch: datapipe = dgl.graphbolt.FeatureFetcher( @@ -81,14 +91,18 @@ def test_gpu_sampling_DataLoader(overlap_feature_fetch, enable_feature_fetch): ) dataloader = dgl.graphbolt.DataLoader( - datapipe, overlap_feature_fetch=overlap_feature_fetch + datapipe, + overlap_feature_fetch=overlap_feature_fetch, + overlap_graph_fetch=overlap_graph_fetch, ) bufferer_awaiter_cnt = int(enable_feature_fetch and overlap_feature_fetch) + if overlap_graph_fetch: + bufferer_awaiter_cnt += num_layers datapipe = dataloader.dataset datapipe_graph = dp_utils.traverse_dps(datapipe) awaiters = dp_utils.find_dps( datapipe_graph, - dgl.graphbolt.Awaiter, + dgl.graphbolt.Waiter, ) assert len(awaiters) == bufferer_awaiter_cnt bufferers = dp_utils.find_dps( From 4ee0a8bddbd93963b5f078c475381f4ab521d2e1 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Mon, 5 Feb 2024 13:22:44 +0800 Subject: [PATCH 24/45] [DistGB] return global eids from GB sampling on homograph (#7085) --- python/dgl/distributed/graph_services.py | 9 ++++++--- tests/distributed/test_distributed_sampling.py | 15 +++++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 76be905a054f..58eeb6de1f89 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -145,7 +145,8 @@ def _sample_neighbors_graphbolt( # [Rui][TODO] Support multiple fanouts. assert fanout.numel() == 1, "Expect a single fanout." - subgraph = g._sample_neighbors(nodes, fanout) + return_eids = g.edge_attributes is not None and EID in g.edge_attributes + subgraph = g._sample_neighbors(nodes, fanout, return_eids=return_eids) # 3. Map local node IDs to global node IDs. local_src = subgraph.indices @@ -156,9 +157,11 @@ def _sample_neighbors_graphbolt( global_src = global_nid_mapping[local_src] global_dst = global_nid_mapping[local_dst] - # [Rui][TODO] edge IDs are not supported yet. + global_eids = None + if return_eids: + global_eids = g.edge_attributes[EID][subgraph.original_edge_ids] return LocalSampledGraph( - global_src, global_dst, None, subgraph.type_per_edge + global_src, global_dst, global_eids, subgraph.type_per_edge ) diff --git a/tests/distributed/test_distributed_sampling.py b/tests/distributed/test_distributed_sampling.py index 7d10dc43f122..0795d4a03d25 100644 --- a/tests/distributed/test_distributed_sampling.py +++ b/tests/distributed/test_distributed_sampling.py @@ -75,6 +75,7 @@ def start_sample_client_shuffle( orig_nid, orig_eid, use_graphbolt=False, + return_eids=False, ): os.environ["DGL_GROUP_ID"] = str(group_id) gpb = None @@ -95,7 +96,7 @@ def start_sample_client_shuffle( dst = orig_nid[dst] assert sampled_graph.num_nodes() == g.num_nodes() assert np.all(F.asnumpy(g.has_edges_between(src, dst))) - if use_graphbolt: + if use_graphbolt and not return_eids: assert ( dgl.EID not in sampled_graph.edata ), "EID should not be in sampled graph if use_graphbolt=True." 
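
A minimal sketch of how a caller consumes the global edge IDs that this patch
starts returning; it assumes partitions built with `store_eids=True`, that
`orig_eid` is the edge-ID mapping returned by
`partition_graph(..., return_mapping=True)`, and that `g` is the input graph
(names follow the tests in this file):

    sampled = sample_neighbors(
        dist_graph, [0, 10, 99], 3, use_graphbolt=True
    )
    if dgl.EID in sampled.edata:  # present only when EIDs were stored
        # Shuffled (partition) edge IDs -> IDs in the input graph.
        eids = orig_eid[sampled.edata[dgl.EID]]
        u, v = g.find_edges(eids)  # endpoints recoverable from EIDs alone
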
@@ -391,7 +392,7 @@ def test_rpc_sampling(): def check_rpc_sampling_shuffle( - tmpdir, num_server, num_groups=1, use_graphbolt=False + tmpdir, num_server, num_groups=1, use_graphbolt=False, return_eids=False ): generate_ip_config("rpc_ip_config.txt", num_server, num_server) @@ -408,6 +409,7 @@ def check_rpc_sampling_shuffle( part_method="metis", return_mapping=True, use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -444,6 +446,7 @@ def check_rpc_sampling_shuffle( orig_nids, orig_eids, use_graphbolt, + return_eids, ), ) p.start() @@ -1015,12 +1018,16 @@ def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): @pytest.mark.parametrize("num_server", [1]) @pytest.mark.parametrize("use_graphbolt", [False, True]) -def test_rpc_sampling_shuffle(num_server, use_graphbolt): +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_sampling_shuffle(num_server, use_graphbolt, return_eids): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_sampling_shuffle( - Path(tmpdirname), num_server, use_graphbolt=use_graphbolt + Path(tmpdirname), + num_server, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) From a2e1c79618bcc2e004f1b6ff04e5be7eb52b57c8 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 5 Feb 2024 20:01:31 +0300 Subject: [PATCH 25/45] [GraphBolt][CUDA] Pipelined sampling accuracy fix (#7088) --- python/dgl/graphbolt/impl/neighbor_sampler.py | 27 +++++++++++++------ .../graphbolt/impl/test_neighbor_sampler.py | 8 ++++-- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 887b1864d83f..737b475a94e6 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -48,22 +48,33 @@ def _fetch_per_layer_impl(self, minibatch, stream): with torch.cuda.stream(self.stream): index = minibatch._seed_nodes if isinstance(index, dict): + for idx in index.values(): + idx.record_stream(torch.cuda.current_stream()) index = self.graph._convert_to_homogeneous_nodes(index) + else: + index.record_stream(torch.cuda.current_stream()) + + def record_stream(tensor): + if stream is not None and tensor.is_cuda: + tensor.record_stream(stream) + return tensor - index, original_positions = index.sort() - if (original_positions.diff() == 1).all().item(): # is_sorted + if self.graph.node_type_offset is None: + # sorting not needed. minibatch._subgraph_seed_nodes = None else: - minibatch._subgraph_seed_nodes = original_positions - index.record_stream(torch.cuda.current_stream()) + index, original_positions = index.sort() + if (original_positions.diff() == 1).all().item(): + # already sorted. 
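+                    # (no inverse permutation has to be stored in that case.)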
+ minibatch._subgraph_seed_nodes = None + else: + minibatch._subgraph_seed_nodes = record_stream( + original_positions.sort()[1] + ) index_select_csc_with_indptr = partial( torch.ops.graphbolt.index_select_csc, self.graph.csc_indptr ) - def record_stream(tensor): - if stream is not None and tensor.is_cuda: - tensor.record_stream(stream) - indptr, indices = index_select_csc_with_indptr( self.graph.indices, index, None ) diff --git a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py index f4f1084c8e5c..09528d98899d 100644 --- a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py @@ -41,8 +41,12 @@ def get_hetero_graph(): @unittest.skipIf(F._default_context_str != "gpu", reason="Enabled only on GPU.") @pytest.mark.parametrize("hetero", [False, True]) @pytest.mark.parametrize("prob_name", [None, "weight", "mask"]) -def test_NeighborSampler_GraphFetch(hetero, prob_name): - items = torch.arange(3) +@pytest.mark.parametrize("sorted", [False, True]) +def test_NeighborSampler_GraphFetch(hetero, prob_name, sorted): + if sorted: + items = torch.arange(3) + else: + items = torch.tensor([2, 0, 1]) names = "seed_nodes" itemset = gb.ItemSet(items, names=names) graph = get_hetero_graph().to(F.ctx()) From ee8b7b39ce19d6d6e0e97c48d2973a4ec586dd8d Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:22:38 +0800 Subject: [PATCH 26/45] [DistGB] enable GB sampling on heterograph (#7087) --- python/dgl/distributed/graph_services.py | 36 +++- .../distributed/test_distributed_sampling.py | 200 +++++++++++++++--- 2 files changed, 196 insertions(+), 40 deletions(-) diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 58eeb6de1f89..9188a38675a9 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -1,4 +1,5 @@ """A set of graph services of getting subgraphs from DistGraph""" +import os from collections import namedtuple import numpy as np @@ -708,24 +709,47 @@ def _frontier_to_heterogeneous_graph(g, frontier, gpb): idtype=g.idtype, ) - etype_ids, frontier.edata[EID] = gpb.map_to_per_etype(frontier.edata[EID]) - src, dst = frontier.edges() + # For DGL partitions, the global edge IDs are always stored in the edata. + # For GraphBolt partitions, the edge type IDs are always stored in the + # edata. As for the edge IDs, they are stored in the edata if the graph is + # partitioned with `store_eids=True`. Otherwise, the edge IDs are not + # stored. + etype_ids, type_wise_eids = ( + gpb.map_to_per_etype(frontier.edata[EID]) + if EID in frontier.edata + else (frontier.edata[ETYPE], None) + ) etype_ids, idx = F.sort_1d(etype_ids) + if type_wise_eids is not None: + type_wise_eids = F.gather_row(type_wise_eids, idx) + + # Sort the edges by their edge types. 
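+    # `idx` carries the same per-type ordering over to src, dst and the
+    # type-wise edge IDs below.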
+    src, dst = frontier.edges()
     src, dst = F.gather_row(src, idx), F.gather_row(dst, idx)
-    eid = F.gather_row(frontier.edata[EID], idx)
-    _, src = gpb.map_to_per_ntype(src)
-    _, dst = gpb.map_to_per_ntype(dst)
+    src_ntype_ids, src = gpb.map_to_per_ntype(src)
+    dst_ntype_ids, dst = gpb.map_to_per_ntype(dst)
     data_dict = dict()
     edge_ids = {}
     for etid, etype in enumerate(g.canonical_etypes):
+        src_ntype, _, dst_ntype = etype
+        src_ntype_id = g.get_ntype_id(src_ntype)
+        dst_ntype_id = g.get_ntype_id(dst_ntype)
         type_idx = etype_ids == etid
         if F.sum(type_idx, 0) > 0:
             data_dict[etype] = (
                 F.boolean_mask(src, type_idx),
                 F.boolean_mask(dst, type_idx),
             )
-            edge_ids[etype] = F.boolean_mask(eid, type_idx)
+            if "DGL_DIST_DEBUG" in os.environ:
+                assert torch.all(
+                    src_ntype_id == src_ntype_ids[type_idx]
+                ), "source ntype is not as expected."
+                assert torch.all(
+                    dst_ntype_id == dst_ntype_ids[type_idx]
+                ), "destination ntype is not as expected."
+            if type_wise_eids is not None:
+                edge_ids[etype] = F.boolean_mask(type_wise_eids, type_idx)
     hg = heterograph(
         data_dict,
         {ntype: g.num_nodes(ntype) for ntype in g.ntypes},
diff --git a/tests/distributed/test_distributed_sampling.py b/tests/distributed/test_distributed_sampling.py
index 0795d4a03d25..eec8f51dbaa4 100644
--- a/tests/distributed/test_distributed_sampling.py
+++ b/tests/distributed/test_distributed_sampling.py
@@ -91,6 +91,9 @@ def start_sample_client_shuffle(
         dist_graph, [0, 10, 99, 66, 1024, 2008], 3, use_graphbolt=use_graphbolt
     )
 
+    assert (
+        dgl.ETYPE not in sampled_graph.edata
+    ), "Etype should not be in homogeneous sampled graph."
     src, dst = sampled_graph.edges()
     src = orig_nid[src]
     dst = orig_nid[dst]
@@ -460,23 +463,37 @@ def check_rpc_sampling_shuffle(
     assert p.exitcode == 0
 
 
-def start_hetero_sample_client(rank, tmpdir, disable_shared_mem, nodes):
+def start_hetero_sample_client(
+    rank,
+    tmpdir,
+    disable_shared_mem,
+    nodes,
+    use_graphbolt=False,
+    return_eids=False,
+):
     gpb = None
     if disable_shared_mem:
         _, _, _, gpb, _, _, _ = load_partition(
             tmpdir / "test_sampling.json", rank
         )
     dgl.distributed.initialize("rpc_ip_config.txt")
-    dist_graph = DistGraph("test_sampling", gpb=gpb)
+    dist_graph = DistGraph(
+        "test_sampling", gpb=gpb, use_graphbolt=use_graphbolt
+    )
     assert "feat" in dist_graph.nodes["n1"].data
     assert "feat" not in dist_graph.nodes["n2"].data
     assert "feat" not in dist_graph.nodes["n3"].data
     if gpb is None:
         gpb = dist_graph.get_partition_book()
     try:
+        # Enable sanity check in distributed sampling.
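+        # DGL_DIST_DEBUG switches on the ntype assertions in
+        # graph_services._frontier_to_heterogeneous_graph above.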
+ os.environ["DGL_DIST_DEBUG"] = "1" + sampled_graph = sample_neighbors( + dist_graph, nodes, 3, use_graphbolt=use_graphbolt + ) block = dgl.to_block(sampled_graph, nodes) - block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] + if not use_graphbolt or return_eids: + block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] except Exception as e: print(traceback.format_exc()) block = None @@ -528,7 +545,9 @@ def start_hetero_etype_sample_client( return block, gpb -def check_rpc_hetero_sampling_shuffle(tmpdir, num_server): +def check_rpc_hetero_sampling_shuffle( + tmpdir, num_server, use_graphbolt=False, return_eids=False +): generate_ip_config("rpc_ip_config.txt", num_server, num_server) g = create_random_hetero() @@ -543,6 +562,8 @@ def check_rpc_hetero_sampling_shuffle(tmpdir, num_server): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -550,16 +571,27 @@ def check_rpc_hetero_sampling_shuffle(tmpdir, num_server): for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling"), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) pserver_list.append(p) block, gpb = start_hetero_sample_client( - 0, tmpdir, num_server > 1, nodes={"n3": [0, 10, 99, 66, 124, 208]} + 0, + tmpdir, + num_server > 1, + nodes={"n3": [0, 10, 99, 66, 124, 208]}, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) - print("Done sampling") for p in pserver_list: p.join() assert p.exitcode == 0 @@ -570,10 +602,17 @@ def check_rpc_hetero_sampling_shuffle(tmpdir, num_server): # These are global Ids after shuffling. shuffled_src = F.gather_row(block.srcnodes[src_type].data[dgl.NID], src) shuffled_dst = F.gather_row(block.dstnodes[dst_type].data[dgl.NID], dst) - shuffled_eid = block.edges[etype].data[dgl.EID] - orig_src = F.asnumpy(F.gather_row(orig_nid_map[src_type], shuffled_src)) orig_dst = F.asnumpy(F.gather_row(orig_nid_map[dst_type], shuffled_dst)) + + assert np.all( + F.asnumpy(g.has_edges_between(orig_src, orig_dst, etype=etype)) + ) + + if use_graphbolt and not return_eids: + continue + + shuffled_eid = block.edges[etype].data[dgl.EID] orig_eid = F.asnumpy(F.gather_row(orig_eid_map[c_etype], shuffled_eid)) # Check the node Ids and edge Ids. 
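
The same shuffled-to-original verification recurs in all shuffle tests below.
A condensed sketch, assuming `block` is the sampled block, `g` the input
graph, and `orig_nid_map` the node-ID mapping returned by
`partition_graph(..., return_mapping=True)`:

    for etype in block.canonical_etypes:
        src_type, _, dst_type = etype
        src, dst = block.edges(etype=etype)
        # Block-local IDs -> shuffled (partition) IDs -> input-graph IDs.
        shuffled_src = block.srcnodes[src_type].data[dgl.NID][src]
        shuffled_dst = block.dstnodes[dst_type].data[dgl.NID][dst]
        orig_src = orig_nid_map[src_type][shuffled_src]
        orig_dst = orig_nid_map[dst_type][shuffled_dst]
        # Every sampled edge must exist in the input graph.
        assert g.has_edges_between(orig_src, orig_dst, etype=etype).all()
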
@@ -592,7 +631,9 @@ def get_degrees(g, nids, ntype): return deg -def check_rpc_hetero_sampling_empty_shuffle(tmpdir, num_server): +def check_rpc_hetero_sampling_empty_shuffle( + tmpdir, num_server, use_graphbolt=False, return_eids=False +): generate_ip_config("rpc_ip_config.txt", num_server, num_server) g = create_random_hetero(empty=True) @@ -607,6 +648,8 @@ def check_rpc_hetero_sampling_empty_shuffle(tmpdir, num_server): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -614,7 +657,14 @@ def check_rpc_hetero_sampling_empty_shuffle(tmpdir, num_server): for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling"), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) @@ -623,9 +673,13 @@ def check_rpc_hetero_sampling_empty_shuffle(tmpdir, num_server): deg = get_degrees(g, orig_nids["n3"], "n3") empty_nids = F.nonzero_1d(deg == 0) block, gpb = start_hetero_sample_client( - 0, tmpdir, num_server > 1, nodes={"n3": empty_nids} + 0, + tmpdir, + num_server > 1, + nodes={"n3": empty_nids}, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) - print("Done sampling") for p in pserver_list: p.join() assert p.exitcode == 0 @@ -759,22 +813,36 @@ def create_random_bipartite(): return g -def start_bipartite_sample_client(rank, tmpdir, disable_shared_mem, nodes): +def start_bipartite_sample_client( + rank, + tmpdir, + disable_shared_mem, + nodes, + use_graphbolt=False, + return_eids=False, +): gpb = None if disable_shared_mem: _, _, _, gpb, _, _, _ = load_partition( tmpdir / "test_sampling.json", rank ) dgl.distributed.initialize("rpc_ip_config.txt") - dist_graph = DistGraph("test_sampling", gpb=gpb) + dist_graph = DistGraph( + "test_sampling", gpb=gpb, use_graphbolt=use_graphbolt + ) assert "feat" in dist_graph.nodes["user"].data assert "feat" in dist_graph.nodes["game"].data if gpb is None: gpb = dist_graph.get_partition_book() - sampled_graph = sample_neighbors(dist_graph, nodes, 3) + # Enable santity check in distributed sampling. 
+ os.environ["DGL_DIST_DEBUG"] = "1" + sampled_graph = sample_neighbors( + dist_graph, nodes, 3, use_graphbolt=use_graphbolt + ) block = dgl.to_block(sampled_graph, nodes) if sampled_graph.num_edges() > 0: - block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] + if not use_graphbolt or return_eids: + block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] dgl.distributed.exit_client() return block, gpb @@ -812,7 +880,9 @@ def start_bipartite_etype_sample_client( return block, gpb -def check_rpc_bipartite_sampling_empty(tmpdir, num_server): +def check_rpc_bipartite_sampling_empty( + tmpdir, num_server, use_graphbolt=False, return_eids=False +): """sample on bipartite via sample_neighbors() which yields empty sample results""" generate_ip_config("rpc_ip_config.txt", num_server, num_server) @@ -828,6 +898,8 @@ def check_rpc_bipartite_sampling_empty(tmpdir, num_server): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -835,7 +907,14 @@ def check_rpc_bipartite_sampling_empty(tmpdir, num_server): for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling"), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) @@ -844,7 +923,12 @@ def check_rpc_bipartite_sampling_empty(tmpdir, num_server): deg = get_degrees(g, orig_nids["game"], "game") empty_nids = F.nonzero_1d(deg == 0) block, _ = start_bipartite_sample_client( - 0, tmpdir, num_server > 1, nodes={"game": empty_nids, "user": [1]} + 0, + tmpdir, + num_server > 1, + nodes={"game": empty_nids, "user": [1]}, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) print("Done sampling") @@ -856,7 +940,9 @@ def check_rpc_bipartite_sampling_empty(tmpdir, num_server): assert len(block.etypes) == len(g.etypes) -def check_rpc_bipartite_sampling_shuffle(tmpdir, num_server): +def check_rpc_bipartite_sampling_shuffle( + tmpdir, num_server, use_graphbolt=False, return_eids=False +): """sample on bipartite via sample_neighbors() which yields non-empty sample results""" generate_ip_config("rpc_ip_config.txt", num_server, num_server) @@ -872,6 +958,8 @@ def check_rpc_bipartite_sampling_shuffle(tmpdir, num_server): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -879,7 +967,14 @@ def check_rpc_bipartite_sampling_shuffle(tmpdir, num_server): for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling"), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) @@ -888,7 +983,12 @@ def check_rpc_bipartite_sampling_shuffle(tmpdir, num_server): deg = get_degrees(g, orig_nid_map["game"], "game") nids = F.nonzero_1d(deg > 0) block, gpb = start_bipartite_sample_client( - 0, tmpdir, num_server > 1, nodes={"game": nids, "user": [0]} + 0, + tmpdir, + num_server > 1, + nodes={"game": nids, "user": [0]}, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) print("Done sampling") for p in pserver_list: @@ -901,10 +1001,16 @@ def check_rpc_bipartite_sampling_shuffle(tmpdir, num_server): # These are global Ids after shuffling. 
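         # (gathering with orig_nid_map / orig_eid_map below maps them back
         # to IDs in the input graph.)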
shuffled_src = F.gather_row(block.srcnodes[src_type].data[dgl.NID], src) shuffled_dst = F.gather_row(block.dstnodes[dst_type].data[dgl.NID], dst) - shuffled_eid = block.edges[etype].data[dgl.EID] - orig_src = F.asnumpy(F.gather_row(orig_nid_map[src_type], shuffled_src)) orig_dst = F.asnumpy(F.gather_row(orig_nid_map[dst_type], shuffled_dst)) + assert np.all( + F.asnumpy(g.has_edges_between(orig_src, orig_dst, etype=etype)) + ) + + if use_graphbolt and not return_eids: + continue + + shuffled_eid = block.edges[etype].data[dgl.EID] orig_eid = F.asnumpy(F.gather_row(orig_eid_map[c_etype], shuffled_eid)) # Check the node Ids and edge Ids. @@ -1032,19 +1138,35 @@ def test_rpc_sampling_shuffle(num_server, use_graphbolt, return_eids): @pytest.mark.parametrize("num_server", [1]) -def test_rpc_hetero_sampling_shuffle(num_server): +@pytest.mark.parametrize("use_graphbolt,", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_hetero_sampling_shuffle(num_server, use_graphbolt, return_eids): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: - check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server) + check_rpc_hetero_sampling_shuffle( + Path(tmpdirname), + num_server, + use_graphbolt=use_graphbolt, + return_eids=return_eids, + ) @pytest.mark.parametrize("num_server", [1]) -def test_rpc_hetero_sampling_empty_shuffle(num_server): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_hetero_sampling_empty_shuffle( + num_server, use_graphbolt, return_eids +): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: - check_rpc_hetero_sampling_empty_shuffle(Path(tmpdirname), num_server) + check_rpc_hetero_sampling_empty_shuffle( + Path(tmpdirname), + num_server, + use_graphbolt=use_graphbolt, + return_eids=return_eids, + ) @pytest.mark.parametrize("num_server", [1]) @@ -1071,19 +1193,29 @@ def test_rpc_hetero_etype_sampling_empty_shuffle(num_server): @pytest.mark.parametrize("num_server", [1]) -def test_rpc_bipartite_sampling_empty_shuffle(num_server): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_bipartite_sampling_empty_shuffle( + num_server, use_graphbolt, return_eids +): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: - check_rpc_bipartite_sampling_empty(Path(tmpdirname), num_server) + check_rpc_bipartite_sampling_empty( + Path(tmpdirname), num_server, use_graphbolt, return_eids + ) @pytest.mark.parametrize("num_server", [1]) -def test_rpc_bipartite_sampling_shuffle(num_server): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_bipartite_sampling_shuffle(num_server, use_graphbolt, return_eids): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: - check_rpc_bipartite_sampling_shuffle(Path(tmpdirname), num_server) + check_rpc_bipartite_sampling_shuffle( + Path(tmpdirname), num_server, use_graphbolt, return_eids + ) @pytest.mark.parametrize("num_server", [1]) From 845864d2a0ccd1b1cc6c2ae385980a697b28c349 Mon Sep 17 00:00:00 2001 From: yxy235 <77922129+yxy235@users.noreply.github.com> Date: Tue, 6 Feb 2024 14:26:06 +0800 Subject: [PATCH 27/45] [GraphBolt] Modify `SubgraphSampler` to support `seeds`. 
(#7049) Co-authored-by: Ubuntu --- python/dgl/graphbolt/subgraph_sampler.py | 121 +- .../graphbolt/test_subgraph_sampler.py | 1294 ++++++++++++++++- 2 files changed, 1395 insertions(+), 20 deletions(-) diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index ab7c969063c9..9425b4542e7c 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Dict +import torch from torch.utils.data import functional_datapipe from .base import etype_str_to_tuple @@ -69,10 +70,16 @@ def _preprocess(minibatch): seeds_timestamp = ( minibatch.timestamp if hasattr(minibatch, "timestamp") else None ) + elif minibatch.seeds is not None: + ( + seeds, + seeds_timestamp, + minibatch.compacted_seeds, + ) = SubgraphSampler._seeds_preprocess(minibatch) else: raise ValueError( - f"Invalid minibatch {minibatch}: Either `node_pairs` or " - "`seed_nodes` should have a value." + f"Invalid minibatch {minibatch}: One of `node_pairs`, " + "`seed_nodes` and `seeds` should have a value." ) minibatch._seed_nodes = seeds minibatch._seeds_timestamp = seeds_timestamp @@ -226,6 +233,116 @@ def sampling_stages(self, datapipe): """ return datapipe.transform(self._sample) + @staticmethod + def _seeds_preprocess(minibatch): + """Preprocess `seeds` in a minibatch to construct `unique_seeds`, + `node_timestamp` and `compacted_seeds` for further sampling. It + optionally incorporates timestamps for temporal graphs, organizing and + compacting seeds based on their types and timestamps. + + Parameters + ---------- + minibatch: MiniBatch + The minibatch. + + Returns + ------- + unique_seeds: torch.Tensor or Dict[str, torch.Tensor] + A tensor or a dictionary of tensors representing the unique seeds. + In heterogeneous graphs, seeds are returned for each node type. + nodes_timestamp: None or a torch.Tensor or Dict[str, torch.Tensor] + Containing timestamps for each seed. This is only returned if + `minibatch` includes timestamps and the graph is temporal. + compacted_seeds: torch.tensor or a Dict[str, torch.Tensor] + Representation of compacted seeds corresponding to 'seeds', where + all node ids inside are compacted. + """ + use_timestamp = hasattr(minibatch, "timestamp") + seeds = minibatch.seeds + is_heterogeneous = isinstance(seeds, Dict) + if is_heterogeneous: + # Collect nodes from all types of input. + nodes = defaultdict(list) + nodes_timestamp = None + if use_timestamp: + nodes_timestamp = defaultdict(list) + for etype, pair in seeds.items(): + assert pair.ndim == 1 or ( + pair.ndim == 2 and pair.shape[1] == 2 + ), ( + "Only tensor with shape 1*N and N*2 is " + + f"supported now, but got {pair.shape}." + ) + ntypes = etype[:].split(":")[::2] + pair = pair.view(pair.shape[0], -1) + if use_timestamp: + negative_ratio = ( + pair.shape[0] // minibatch.timestamp[etype].shape[0] - 1 + ) + neg_timestamp = minibatch.timestamp[ + etype + ].repeat_interleave(negative_ratio) + for i, ntype in enumerate(ntypes): + nodes[ntype].append(pair[:, i]) + if use_timestamp: + nodes_timestamp[ntype].append( + minibatch.timestamp[etype] + ) + nodes_timestamp[ntype].append(neg_timestamp) + # Unique and compact the collected nodes. + if use_timestamp: + ( + unique_seeds, + nodes_timestamp, + compacted, + ) = compact_temporal_nodes(nodes, nodes_timestamp) + else: + unique_seeds, compacted = unique_and_compact(nodes) + nodes_timestamp = None + compacted_seeds = {} + # Map back in same order as collect. 
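+            # (each compacted[...] list is consumed with pop(0), matching the
+            # append order used while collecting the nodes above.)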
+ for etype, pair in seeds.items(): + if pair.ndim == 1: + compacted_seeds[etype] = compacted[etype].pop(0) + else: + src_type, _, dst_type = etype_str_to_tuple(etype) + src = compacted[src_type].pop(0) + dst = compacted[dst_type].pop(0) + compacted_seeds[etype] = torch.cat((src, dst)).view(2, -1).T + else: + # Collect nodes from all types of input. + nodes = [seeds.view(-1)] + nodes_timestamp = None + if use_timestamp: + # Timestamp for source and destination nodes are the same. + negative_ratio = ( + seeds.shape[0] // minibatch.timestamp.shape[0] - 1 + ) + neg_timestamp = minibatch.timestamp.repeat_interleave( + negative_ratio + ) + seeds_timestamp = torch.cat( + (minibatch.timestamp, neg_timestamp) + ) + nodes_timestamp = [seeds_timestamp for _ in range(seeds.ndim)] + # Unique and compact the collected nodes. + if use_timestamp: + ( + unique_seeds, + nodes_timestamp, + compacted, + ) = compact_temporal_nodes(nodes, nodes_timestamp) + else: + unique_seeds, compacted = unique_and_compact(nodes) + nodes_timestamp = None + # Map back in same order as collect. + compacted_seeds = compacted[0].view(seeds.shape) + return ( + unique_seeds, + nodes_timestamp, + compacted_seeds, + ) + def sample_subgraphs(self, seeds, seeds_timestamp): """Sample subgraphs from the given seeds, possibly with temporal constraints. diff --git a/tests/python/pytorch/graphbolt/test_subgraph_sampler.py b/tests/python/pytorch/graphbolt/test_subgraph_sampler.py index ec7cd7f4afae..a5c8ef53c305 100644 --- a/tests/python/pytorch/graphbolt/test_subgraph_sampler.py +++ b/tests/python/pytorch/graphbolt/test_subgraph_sampler.py @@ -40,7 +40,7 @@ def _get_sampler(sampler_type): ) -def test_SubgraphSampler_invoke(): +def test_SubgraphSampler_invoke_nodes(): itemset = gb.ItemSet(torch.arange(10), names="seed_nodes") item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) @@ -56,7 +56,7 @@ def test_SubgraphSampler_invoke(): @pytest.mark.parametrize("labor", [False, True]) -def test_NeighborSampler_invoke(labor): +def test_NeighborSampler_invoke_nodes(labor): graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() ) @@ -79,7 +79,7 @@ def test_NeighborSampler_invoke(labor): @pytest.mark.parametrize("labor", [False, True]) -def test_NeighborSampler_fanouts(labor): +def test_NeighborSampler_fanouts_nodes(labor): graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() ) @@ -108,7 +108,7 @@ def test_NeighborSampler_fanouts(labor): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Node(sampler_type): +def test_SubgraphSampler_Node_seed_nodes(sampler_type): _check_sampler_type(sampler_type) graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() @@ -140,7 +140,7 @@ def to_link_batch(data): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Link(sampler_type): +def test_SubgraphSampler_Link_node_pairs(sampler_type): _check_sampler_type(sampler_type) graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() @@ -168,7 +168,7 @@ def test_SubgraphSampler_Link(sampler_type): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Link_With_Negative(sampler_type): +def test_SubgraphSampler_Link_With_Negative_node_pairs(sampler_type): _check_sampler_type(sampler_type) graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() @@ -219,7 +219,7 @@ def 
get_hetero_graph(): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Node_Hetero(sampler_type): +def test_SubgraphSampler_Node_seed_nodes_Hetero(sampler_type): _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items = torch.arange(3) @@ -248,7 +248,7 @@ def test_SubgraphSampler_Node_Hetero(sampler_type): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Link_Hetero(sampler_type): +def test_SubgraphSampler_Link_Hetero_node_pairs(sampler_type): _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T @@ -292,7 +292,7 @@ def test_SubgraphSampler_Link_Hetero(sampler_type): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Link_Hetero_With_Negative(sampler_type): +def test_SubgraphSampler_Link_Hetero_With_Negative_node_pairs(sampler_type): _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T @@ -337,7 +337,7 @@ def test_SubgraphSampler_Link_Hetero_With_Negative(sampler_type): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Link_Hetero_Unknown_Etype(sampler_type): +def test_SubgraphSampler_Link_Hetero_Unknown_Etype_node_pairs(sampler_type): _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T @@ -382,7 +382,9 @@ def test_SubgraphSampler_Link_Hetero_Unknown_Etype(sampler_type): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Link_Hetero_With_Negative_Unknown_Etype(sampler_type): +def test_SubgraphSampler_Link_Hetero_With_Negative_Unknown_Etype_node_pairs( + sampler_type, +): _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T @@ -432,7 +434,7 @@ def test_SubgraphSampler_Link_Hetero_With_Negative_Unknown_Etype(sampler_type): "replace", [False, True], ) -def test_SubgraphSampler_Random_Hetero_Graph(sampler_type, replace): +def test_SubgraphSampler_Random_Hetero_Graph_seed_ndoes(sampler_type, replace): _check_sampler_type(sampler_type) if F._default_context_str == "gpu" and replace == True: pytest.skip("Sampling with replacement not yet supported on GPU.") @@ -523,7 +525,7 @@ def test_SubgraphSampler_Random_Hetero_Graph(sampler_type, replace): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_without_dedpulication_Homo(sampler_type): +def test_SubgraphSampler_without_dedpulication_Homo_seed_nodes(sampler_type): _check_sampler_type(sampler_type) graph = dgl.graph( ([5, 0, 1, 5, 6, 7, 2, 2, 4], [0, 1, 2, 2, 2, 2, 3, 4, 4]) @@ -587,7 +589,7 @@ def test_SubgraphSampler_without_dedpulication_Homo(sampler_type): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_without_dedpulication_Hetero(sampler_type): +def test_SubgraphSampler_without_dedpulication_Hetero_seed_nodes(sampler_type): _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items = torch.arange(2) @@ -680,7 +682,7 @@ def test_SubgraphSampler_without_dedpulication_Hetero(sampler_type): reason="Fails due to different result on the GPU.", ) @pytest.mark.parametrize("labor", [False, 
True]) -def test_SubgraphSampler_unique_csc_format_Homo_cpu(labor): +def test_SubgraphSampler_unique_csc_format_Homo_cpu_seed_nodes(labor): torch.manual_seed(1205) graph = dgl.graph(([5, 0, 6, 7, 2, 2, 4], [0, 1, 2, 2, 3, 4, 4])) graph = gb.from_dglgraph(graph, True).to(F.ctx()) @@ -739,7 +741,7 @@ def test_SubgraphSampler_unique_csc_format_Homo_cpu(labor): reason="Fails due to different result on the CPU.", ) @pytest.mark.parametrize("labor", [False, True]) -def test_SubgraphSampler_unique_csc_format_Homo_gpu(labor): +def test_SubgraphSampler_unique_csc_format_Homo_gpu_seed_nodes(labor): torch.manual_seed(1205) graph = dgl.graph(([5, 0, 7, 7, 2, 4], [0, 1, 2, 2, 3, 4])) graph = gb.from_dglgraph(graph, is_homogeneous=True).to(F.ctx()) @@ -794,7 +796,7 @@ def test_SubgraphSampler_unique_csc_format_Homo_gpu(labor): @pytest.mark.parametrize("labor", [False, True]) -def test_SubgraphSampler_unique_csc_format_Hetero(labor): +def test_SubgraphSampler_unique_csc_format_Hetero_seed_nodes(labor): graph = get_hetero_graph().to(F.ctx()) itemset = gb.ItemSetDict( {"n2": gb.ItemSet(torch.arange(2), names="seed_nodes")} @@ -878,7 +880,7 @@ def test_SubgraphSampler_unique_csc_format_Hetero(labor): "sampler_type", [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], ) -def test_SubgraphSampler_Hetero_multifanout_per_layer(sampler_type): +def test_SubgraphSampler_Hetero_multifanout_per_layer_seed_nodes(sampler_type): _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items_n1 = torch.tensor([0]) @@ -939,3 +941,1259 @@ def test_SubgraphSampler_Hetero_multifanout_per_layer(sampler_type): len(sampled_subgraph.sampled_csc["n2:e2:n1"].indices) == indices_len[step]["n2:e2:n1"] ) + + +def test_SubgraphSampler_invoke(): + itemset = gb.ItemSet(torch.arange(10), names="seeds") + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + + # Invoke via class constructor. + datapipe = gb.SubgraphSampler(item_sampler) + with pytest.raises(NotImplementedError): + next(iter(datapipe)) + + # Invokde via functional form. + datapipe = item_sampler.sample_subgraph() + with pytest.raises(NotImplementedError): + next(iter(datapipe)) + + +@pytest.mark.parametrize("labor", [False, True]) +def test_NeighborSampler_invoke(labor): + graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( + F.ctx() + ) + itemset = gb.ItemSet(torch.arange(10), names="seeds") + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + + # Invoke via class constructor. + Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler + datapipe = Sampler(item_sampler, graph, fanouts) + assert len(list(datapipe)) == 5 + + # Invokde via functional form. + if labor: + datapipe = item_sampler.sample_layer_neighbor(graph, fanouts) + else: + datapipe = item_sampler.sample_neighbor(graph, fanouts) + assert len(list(datapipe)) == 5 + + +@pytest.mark.parametrize("labor", [False, True]) +def test_NeighborSampler_fanouts(labor): + graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( + F.ctx() + ) + itemset = gb.ItemSet(torch.arange(10), names="seeds") + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + + # `fanouts` is a list of tensors. 
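+    # (one entry per layer; the plain-integer form is exercised just below.)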
+ fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + if labor: + datapipe = item_sampler.sample_layer_neighbor(graph, fanouts) + else: + datapipe = item_sampler.sample_neighbor(graph, fanouts) + assert len(list(datapipe)) == 5 + + # `fanouts` is a list of integers. + fanouts = [2 for _ in range(num_layer)] + if labor: + datapipe = item_sampler.sample_layer_neighbor(graph, fanouts) + else: + datapipe = item_sampler.sample_neighbor(graph, fanouts) + assert len(list(datapipe)) == 5 + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Node(sampler_type): + _check_sampler_type(sampler_type) + graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( + F.ctx() + ) + items = torch.arange(10) + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = {"timestamp": torch.arange(20).to(F.ctx())} + graph.edge_attributes = { + "timestamp": torch.arange(len(graph.indices)).to(F.ctx()) + } + items = (items, torch.arange(10)) + names = (names, "timestamp") + itemset = gb.ItemSet(items, names=names) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + sampler_dp = sampler(item_sampler, graph, fanouts) + assert len(list(sampler_dp)) == 5 + for data in sampler_dp: + assert torch.equal( + data.compacted_seeds, torch.tensor([0, 1]).to(F.ctx()) + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Link(sampler_type): + _check_sampler_type(sampler_type) + graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( + F.ctx() + ) + items = torch.arange(20).reshape(-1, 2) + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = {"timestamp": torch.arange(20).to(F.ctx())} + graph.edge_attributes = { + "timestamp": torch.arange(len(graph.indices)).to(F.ctx()) + } + items = (items, torch.arange(10)) + names = (names, "timestamp") + itemset = gb.ItemSet(items, names=names) + datapipe = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + datapipe = sampler(datapipe, graph, fanouts) + # TODO: `exclude_seed_edges` doesn't support `seeds` now. 
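+    # Once it does, the exclusion below can be re-enabled: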
+ # datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + assert len(list(datapipe)) == 5 + for data in datapipe: + assert torch.equal( + data.compacted_seeds, torch.tensor([[0, 1], [2, 3]]).to(F.ctx()) + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Link_With_Negative(sampler_type): + _check_sampler_type(sampler_type) + graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( + F.ctx() + ) + items = torch.arange(20).reshape(-1, 2) + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = {"timestamp": torch.arange(20).to(F.ctx())} + graph.edge_attributes = { + "timestamp": torch.arange(len(graph.indices)).to(F.ctx()) + } + items = (items, torch.arange(10)) + names = (names, "timestamp") + itemset = gb.ItemSet(items, names=names) + datapipe = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + datapipe = gb.UniformNegativeSampler(datapipe, graph, 1) + sampler = _get_sampler(sampler_type) + datapipe = sampler(datapipe, graph, fanouts) + # TODO: `exclude_seed_edges` doesn't support `seeds` now. + # datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + assert len(list(datapipe)) == 5 + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Node_Hetero(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + items = torch.arange(3) + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.arange(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.arange(graph.indices.numel()).to(F.ctx()) + } + items = (items, torch.randint(0, 10, (3,))) + names = (names, "timestamp") + itemset = gb.ItemSetDict({"n2": gb.ItemSet(items, names=names)}) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + sampler_dp = sampler(item_sampler, graph, fanouts) + assert len(list(sampler_dp)) == 2 + expected_compacted_seeds = {"n2": [torch.tensor([0, 1]), torch.tensor([0])]} + for step, minibatch in enumerate(sampler_dp): + assert len(minibatch.sampled_subgraphs) == num_layer + for etype, compacted_seeds in minibatch.compacted_seeds.items(): + assert torch.equal( + compacted_seeds, + expected_compacted_seeds[etype][step].to(F.ctx()), + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Link_Hetero(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T + first_names = "seeds" + second_items = torch.LongTensor([[0, 0, 1, 1, 2, 2], [0, 1, 1, 0, 0, 1]]).T + second_names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.arange(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.arange(graph.indices.numel()).to(F.ctx()) + } + first_items = (first_items, torch.randint(0, 10, (4,))) + first_names = (first_names, "timestamp") + second_items = (second_items, torch.randint(0, 10, (6,))) + second_names = (second_names, "timestamp") + itemset = 
gb.ItemSetDict( + { + "n1:e1:n2": gb.ItemSet( + first_items, + names=first_names, + ), + "n2:e2:n1": gb.ItemSet( + second_items, + names=second_names, + ), + } + ) + + datapipe = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + datapipe = sampler(datapipe, graph, fanouts) + # TODO: `exclude_seed_edges` doesn't support `seeds` now. + # datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + assert len(list(datapipe)) == 5 + for data in datapipe: + for compacted_seeds in data.compacted_seeds.values(): + if sampler_type == SamplerType.Temporal: + assert torch.equal( + compacted_seeds, torch.tensor([[0, 0], [1, 1]]).to(F.ctx()) + ) + else: + assert torch.equal( + torch.sort(compacted_seeds.T, dim=1)[0].T, + torch.tensor([[0, 0], [0, 1]]).to(F.ctx()), + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Link_Hetero_With_Negative(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T + first_names = "seeds" + second_items = torch.LongTensor([[0, 0, 1, 1, 2, 2], [0, 1, 1, 0, 0, 1]]).T + second_names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.arange(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.arange(graph.indices.numel()).to(F.ctx()) + } + first_items = (first_items, torch.randint(0, 10, (4,))) + first_names = (first_names, "timestamp") + second_items = (second_items, torch.randint(0, 10, (6,))) + second_names = (second_names, "timestamp") + itemset = gb.ItemSetDict( + { + "n1:e1:n2": gb.ItemSet( + first_items, + names=first_names, + ), + "n2:e2:n1": gb.ItemSet( + second_items, + names=second_names, + ), + } + ) + + datapipe = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + datapipe = gb.UniformNegativeSampler(datapipe, graph, 1) + sampler = _get_sampler(sampler_type) + datapipe = sampler(datapipe, graph, fanouts) + # TODO: `exclude_seed_edges` doesn't support `seeds` now. + # datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + assert len(list(datapipe)) == 5 + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Link_Hetero_Unknown_Etype(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T + first_names = "seeds" + second_items = torch.LongTensor([[0, 0, 1, 1, 2, 2], [0, 1, 1, 0, 0, 1]]).T + second_names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.arange(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.arange(graph.indices.numel()).to(F.ctx()) + } + first_items = (first_items, torch.randint(0, 10, (4,))) + first_names = (first_names, "timestamp") + second_items = (second_items, torch.randint(0, 10, (6,))) + second_names = (second_names, "timestamp") + # "e11" and "e22" are not valid edge types. 
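+    # The sampler is still expected to consume these seeds without raising.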
+ itemset = gb.ItemSetDict( + { + "n1:e11:n2": gb.ItemSet( + first_items, + names=first_names, + ), + "n2:e22:n1": gb.ItemSet( + second_items, + names=second_names, + ), + } + ) + + datapipe = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + datapipe = sampler(datapipe, graph, fanouts) + # TODO: `exclude_seed_edges` doesn't support `seeds` now. + # datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + assert len(list(datapipe)) == 5 + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Link_Hetero_With_Negative_Unknown_Etype(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T + first_names = "seeds" + second_items = torch.LongTensor([[0, 0, 1, 1, 2, 2], [0, 1, 1, 0, 0, 1]]).T + second_names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.arange(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.arange(graph.indices.numel()).to(F.ctx()) + } + first_items = (first_items, torch.randint(0, 10, (4,))) + first_names = (first_names, "timestamp") + second_items = (second_items, torch.randint(0, 10, (6,))) + second_names = (second_names, "timestamp") + # "e11" and "e22" are not valid edge types. + itemset = gb.ItemSetDict( + { + "n1:e11:n2": gb.ItemSet( + first_items, + names=first_names, + ), + "n2:e22:n1": gb.ItemSet( + second_items, + names=second_names, + ), + } + ) + + datapipe = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + datapipe = gb.UniformNegativeSampler(datapipe, graph, 1) + sampler = _get_sampler(sampler_type) + datapipe = sampler(datapipe, graph, fanouts) + # TODO: `exclude_seed_edges` doesn't support `seeds` now. 
+ # datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + assert len(list(datapipe)) == 5 + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +@pytest.mark.parametrize( + "replace", + [False, True], +) +def test_SubgraphSampler_Random_Hetero_Graph(sampler_type, replace): + _check_sampler_type(sampler_type) + if F._default_context_str == "gpu" and replace == True: + pytest.skip("Sampling with replacement not yet supported on GPU.") + num_nodes = 5 + num_edges = 9 + num_ntypes = 3 + num_etypes = 3 + ( + csc_indptr, + indices, + node_type_offset, + type_per_edge, + node_type_to_id, + edge_type_to_id, + ) = gb_test_utils.random_hetero_graph( + num_nodes, num_edges, num_ntypes, num_etypes + ) + node_attributes = {} + edge_attributes = { + "A1": torch.randn(num_edges), + "A2": torch.randn(num_edges), + } + if sampler_type == SamplerType.Temporal: + node_attributes["timestamp"] = torch.randint(0, 10, (num_nodes,)) + edge_attributes["timestamp"] = torch.randint(0, 10, (num_edges,)) + graph = gb.fused_csc_sampling_graph( + csc_indptr, + indices, + node_type_offset=node_type_offset, + type_per_edge=type_per_edge, + node_type_to_id=node_type_to_id, + edge_type_to_id=edge_type_to_id, + node_attributes=node_attributes, + edge_attributes=edge_attributes, + ).to(F.ctx()) + first_items = torch.tensor([0]) + first_names = "seeds" + second_items = torch.tensor([0]) + second_names = "seeds" + if sampler_type == SamplerType.Temporal: + first_items = (first_items, torch.randint(0, 10, (1,))) + first_names = (first_names, "timestamp") + second_items = (second_items, torch.randint(0, 10, (1,))) + second_names = (second_names, "timestamp") + itemset = gb.ItemSetDict( + { + "n2": gb.ItemSet(first_items, names=first_names), + "n1": gb.ItemSet(second_items, names=second_names), + } + ) + + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + + sampler_dp = sampler(item_sampler, graph, fanouts, replace=replace) + + for data in sampler_dp: + for sampledsubgraph in data.sampled_subgraphs: + for _, value in sampledsubgraph.sampled_csc.items(): + assert torch.equal( + torch.ge( + value.indices, + torch.zeros(len(value.indices)).to(F.ctx()), + ), + torch.ones(len(value.indices)).to(F.ctx()), + ) + assert torch.equal( + torch.ge( + value.indptr, torch.zeros(len(value.indptr)).to(F.ctx()) + ), + torch.ones(len(value.indptr)).to(F.ctx()), + ) + for _, value in sampledsubgraph.original_column_node_ids.items(): + assert torch.equal( + torch.ge(value, torch.zeros(len(value)).to(F.ctx())), + torch.ones(len(value)).to(F.ctx()), + ) + for _, value in sampledsubgraph.original_row_node_ids.items(): + assert torch.equal( + torch.ge(value, torch.zeros(len(value)).to(F.ctx())), + torch.ones(len(value)).to(F.ctx()), + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_without_dedpulication_Homo_Node(sampler_type): + _check_sampler_type(sampler_type) + graph = dgl.graph( + ([5, 0, 1, 5, 6, 7, 2, 2, 4], [0, 1, 2, 2, 2, 2, 3, 4, 4]) + ) + graph = gb.from_dglgraph(graph, True).to(F.ctx()) + seed_nodes = torch.LongTensor([0, 3, 4]) + items = seed_nodes + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + 
"timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + } + items = (items, torch.randint(1, 10, (3,))) + names = (names, "timestamp") + + itemset = gb.ItemSet(items, names=names) + item_sampler = gb.ItemSampler(itemset, batch_size=len(seed_nodes)).copy_to( + F.ctx() + ) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + + sampler = _get_sampler(sampler_type) + if sampler_type == SamplerType.Temporal: + datapipe = sampler(item_sampler, graph, fanouts) + else: + datapipe = sampler(item_sampler, graph, fanouts, deduplicate=False) + + length = [17, 7] + compacted_indices = [ + (torch.arange(0, 10) + 7).to(F.ctx()), + (torch.arange(0, 4) + 3).to(F.ctx()), + ] + indptr = [ + torch.tensor([0, 1, 2, 4, 4, 6, 8, 10]).to(F.ctx()), + torch.tensor([0, 1, 2, 4]).to(F.ctx()), + ] + seeds = [ + torch.tensor([0, 2, 2, 3, 4, 4, 5]).to(F.ctx()), + torch.tensor([0, 3, 4]).to(F.ctx()), + ] + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert len(sampled_subgraph.original_row_node_ids) == length[step] + assert torch.equal( + sampled_subgraph.sampled_csc.indices, compacted_indices[step] + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indptr, indptr[step] + ) + assert torch.equal( + torch.sort(sampled_subgraph.original_column_node_ids)[0], + seeds[step], + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_without_dedpulication_Hetero_Node(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + items = torch.arange(2) + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + } + items = (items, torch.randint(1, 10, (2,))) + names = (names, "timestamp") + itemset = gb.ItemSetDict({"n2": gb.ItemSet(items, names=names)}) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + if sampler_type == SamplerType.Temporal: + datapipe = sampler(item_sampler, graph, fanouts) + else: + datapipe = sampler(item_sampler, graph, fanouts, deduplicate=False) + csc_formats = [ + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4]), + indices=torch.tensor([4, 5, 6, 7]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4, 6, 8]), + indices=torch.tensor([2, 3, 4, 5, 6, 7, 8, 9]), + ), + }, + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4]), + indices=torch.tensor([0, 1, 2, 3]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0]), + indices=torch.tensor([], dtype=torch.int64), + ), + }, + ] + original_column_node_ids = [ + { + "n1": torch.tensor([0, 1, 1, 0]), + "n2": torch.tensor([0, 1]), + }, + { + "n1": torch.tensor([], dtype=torch.int64), + "n2": torch.tensor([0, 1]), + }, + ] + original_row_node_ids = [ + { + "n1": torch.tensor([0, 1, 1, 0, 0, 1, 1, 0]), + "n2": torch.tensor([0, 1, 0, 2, 0, 1, 0, 1, 0, 2]), + }, + { + "n1": torch.tensor([0, 1, 1, 0]), + "n2": torch.tensor([0, 1]), + }, + ] + + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + for ntype in ["n1", "n2"]: + assert torch.equal( + sampled_subgraph.original_row_node_ids[ntype], + 
original_row_node_ids[step][ntype].to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids[ntype], + original_column_node_ids[step][ntype].to(F.ctx()), + ) + for etype in ["n1:e1:n2", "n2:e2:n1"]: + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indices, + csc_formats[step][etype].indices.to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indptr, + csc_formats[step][etype].indptr.to(F.ctx()), + ) + + +@unittest.skipIf( + F._default_context_str == "gpu", + reason="Fails due to different result on the GPU.", +) +@pytest.mark.parametrize("labor", [False, True]) +def test_SubgraphSampler_unique_csc_format_Homo_Node_cpu(labor): + torch.manual_seed(1205) + graph = dgl.graph(([5, 0, 6, 7, 2, 2, 4], [0, 1, 2, 2, 3, 4, 4])) + graph = gb.from_dglgraph(graph, True).to(F.ctx()) + seed_nodes = torch.LongTensor([0, 3, 4]) + + itemset = gb.ItemSet(seed_nodes, names="seeds") + item_sampler = gb.ItemSampler(itemset, batch_size=len(seed_nodes)).copy_to( + F.ctx() + ) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + + Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler + datapipe = Sampler( + item_sampler, + graph, + fanouts, + deduplicate=True, + ) + + original_row_node_ids = [ + torch.tensor([0, 3, 4, 5, 2, 6, 7]).to(F.ctx()), + torch.tensor([0, 3, 4, 5, 2]).to(F.ctx()), + ] + compacted_indices = [ + torch.tensor([3, 4, 4, 2, 5, 6]).to(F.ctx()), + torch.tensor([3, 4, 4, 2]).to(F.ctx()), + ] + indptr = [ + torch.tensor([0, 1, 2, 4, 4, 6]).to(F.ctx()), + torch.tensor([0, 1, 2, 4]).to(F.ctx()), + ] + seeds = [ + torch.tensor([0, 3, 4, 5, 2]).to(F.ctx()), + torch.tensor([0, 3, 4]).to(F.ctx()), + ] + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert torch.equal( + sampled_subgraph.original_row_node_ids, + original_row_node_ids[step], + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indices, compacted_indices[step] + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indptr, indptr[step] + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids, seeds[step] + ) + + +@unittest.skipIf( + F._default_context_str == "cpu", + reason="Fails due to different result on the CPU.", +) +@pytest.mark.parametrize("labor", [False, True]) +def test_SubgraphSampler_unique_csc_format_Homo_Node_gpu(labor): + torch.manual_seed(1205) + graph = dgl.graph(([5, 0, 7, 7, 2, 4], [0, 1, 2, 2, 3, 4])) + graph = gb.from_dglgraph(graph, is_homogeneous=True).to(F.ctx()) + seed_nodes = torch.LongTensor([0, 3, 4]) + + itemset = gb.ItemSet(seed_nodes, names="seeds") + item_sampler = gb.ItemSampler(itemset, batch_size=len(seed_nodes)).copy_to( + F.ctx() + ) + num_layer = 2 + fanouts = [torch.LongTensor([-1]) for _ in range(num_layer)] + + Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler + datapipe = Sampler( + item_sampler, + graph, + fanouts, + deduplicate=True, + ) + + original_row_node_ids = [ + torch.tensor([0, 3, 4, 2, 5, 7]).to(F.ctx()), + torch.tensor([0, 3, 4, 2, 5]).to(F.ctx()), + ] + compacted_indices = [ + torch.tensor([4, 3, 2, 5, 5]).to(F.ctx()), + torch.tensor([4, 3, 2]).to(F.ctx()), + ] + indptr = [ + torch.tensor([0, 1, 2, 3, 5, 5]).to(F.ctx()), + torch.tensor([0, 1, 2, 3]).to(F.ctx()), + ] + seeds = [ + torch.tensor([0, 3, 4, 2, 5]).to(F.ctx()), + torch.tensor([0, 3, 4]).to(F.ctx()), + ] + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert torch.equal( + 
sampled_subgraph.original_row_node_ids, + original_row_node_ids[step], + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indices, compacted_indices[step] + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indptr, indptr[step] + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids, seeds[step] + ) + + +@pytest.mark.parametrize("labor", [False, True]) +def test_SubgraphSampler_unique_csc_format_Hetero_Node(labor): + graph = get_hetero_graph().to(F.ctx()) + itemset = gb.ItemSetDict({"n2": gb.ItemSet(torch.arange(2), names="seeds")}) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler + datapipe = Sampler( + item_sampler, + graph, + fanouts, + deduplicate=True, + ) + csc_formats = [ + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4]), + indices=torch.tensor([0, 1, 1, 0]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4]), + indices=torch.tensor([0, 2, 0, 1]), + ), + }, + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4]), + indices=torch.tensor([0, 1, 1, 0]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0]), + indices=torch.tensor([], dtype=torch.int64), + ), + }, + ] + original_column_node_ids = [ + { + "n1": torch.tensor([0, 1]), + "n2": torch.tensor([0, 1]), + }, + { + "n1": torch.tensor([], dtype=torch.int64), + "n2": torch.tensor([0, 1]), + }, + ] + original_row_node_ids = [ + { + "n1": torch.tensor([0, 1]), + "n2": torch.tensor([0, 1, 2]), + }, + { + "n1": torch.tensor([0, 1]), + "n2": torch.tensor([0, 1]), + }, + ] + + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + for ntype in ["n1", "n2"]: + assert torch.equal( + sampled_subgraph.original_row_node_ids[ntype], + original_row_node_ids[step][ntype].to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids[ntype], + original_column_node_ids[step][ntype].to(F.ctx()), + ) + for etype in ["n1:e1:n2", "n2:e2:n1"]: + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indices, + csc_formats[step][etype].indices.to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indptr, + csc_formats[step][etype].indptr.to(F.ctx()), + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_Hetero_multifanout_per_layer(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + items_n1 = torch.tensor([0]) + items_n2 = torch.tensor([1]) + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.arange(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.arange(graph.indices.numel()).to(F.ctx()) + } + # All edges can be sampled. + items_n1 = (items_n1, torch.tensor([10])) + items_n2 = (items_n2, torch.tensor([10])) + names = (names, "timestamp") + itemset = gb.ItemSetDict( + { + "n1": gb.ItemSet(items=items_n1, names=names), + "n2": gb.ItemSet(items=items_n2, names=names), + } + ) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + # The number of edges to be sampled for each edge types of each node. 
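+ # Each per-layer fanout tensor holds one entry per canonical edge type,
+ # in edge-type-ID order: 2 edges of "n1:e1:n2" and 1 edge of "n2:e2:n1"
+ # per destination node, matching `indices_len` checked below.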
+ fanouts = [torch.LongTensor([2, 1]) for _ in range(num_layer)] + sampler = _get_sampler(sampler_type) + sampler_dp = sampler(item_sampler, graph, fanouts) + if sampler_type == SamplerType.Temporal: + indices_len = [ + { + "n1:e1:n2": 4, + "n2:e2:n1": 3, + }, + { + "n1:e1:n2": 2, + "n2:e2:n1": 1, + }, + ] + else: + indices_len = [ + { + "n1:e1:n2": 4, + "n2:e2:n1": 2, + }, + { + "n1:e1:n2": 2, + "n2:e2:n1": 1, + }, + ] + for minibatch in sampler_dp: + for step, sampled_subgraph in enumerate(minibatch.sampled_subgraphs): + assert ( + len(sampled_subgraph.sampled_csc["n1:e1:n2"].indices) + == indices_len[step]["n1:e1:n2"] + ) + assert ( + len(sampled_subgraph.sampled_csc["n2:e2:n1"].indices) + == indices_len[step]["n2:e2:n1"] + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_without_dedpulication_Homo_Link(sampler_type): + _check_sampler_type(sampler_type) + graph = dgl.graph( + ([5, 0, 1, 5, 6, 7, 2, 2, 4], [0, 1, 2, 2, 2, 2, 3, 4, 4]) + ) + graph = gb.from_dglgraph(graph, True).to(F.ctx()) + seed_nodes = torch.LongTensor([[0, 1], [3, 5]]) + items = seed_nodes + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + } + items = (items, torch.randint(1, 10, (3,))) + names = (names, "timestamp") + + itemset = gb.ItemSet(items, names=names) + item_sampler = gb.ItemSampler(itemset, batch_size=4).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + + sampler = _get_sampler(sampler_type) + if sampler_type == SamplerType.Temporal: + datapipe = sampler(item_sampler, graph, fanouts) + else: + datapipe = sampler(item_sampler, graph, fanouts, deduplicate=False) + + length = [13, 7] + compacted_indices = [ + (torch.arange(0, 6) + 7).to(F.ctx()), + (torch.arange(0, 3) + 4).to(F.ctx()), + ] + indptr = [ + torch.tensor([0, 1, 2, 3, 3, 3, 4, 6]).to(F.ctx()), + torch.tensor([0, 1, 2, 3, 3]).to(F.ctx()), + ] + seeds = [ + torch.tensor([0, 0, 1, 2, 3, 5, 5]).to(F.ctx()), + torch.tensor([0, 1, 3, 5]).to(F.ctx()), + ] + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert len(sampled_subgraph.original_row_node_ids) == length[step] + assert torch.equal( + sampled_subgraph.sampled_csc.indices, compacted_indices[step] + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indptr, indptr[step] + ) + assert torch.equal( + torch.sort(sampled_subgraph.original_column_node_ids)[0], + seeds[step], + ) + + +@pytest.mark.parametrize( + "sampler_type", + [SamplerType.Normal, SamplerType.Layer, SamplerType.Temporal], +) +def test_SubgraphSampler_without_dedpulication_Hetero_Link(sampler_type): + _check_sampler_type(sampler_type) + graph = get_hetero_graph().to(F.ctx()) + items = torch.arange(2).view(1, 2) + names = "seeds" + if sampler_type == SamplerType.Temporal: + graph.node_attributes = { + "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + } + graph.edge_attributes = { + "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + } + items = (items, torch.randint(1, 10, (2,))) + names = (names, "timestamp") + itemset = gb.ItemSetDict({"n1:e1:n2": gb.ItemSet(items, names=names)}) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in 
range(num_layer)] + sampler = _get_sampler(sampler_type) + if sampler_type == SamplerType.Temporal: + datapipe = sampler(item_sampler, graph, fanouts) + else: + datapipe = sampler(item_sampler, graph, fanouts, deduplicate=False) + csc_formats = [ + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4, 6]), + indices=torch.tensor([3, 4, 5, 6, 7, 8]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4, 6]), + indices=torch.tensor([3, 4, 5, 6, 7, 8]), + ), + }, + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2]), + indices=torch.tensor([1, 2]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0, 2]), + indices=torch.tensor([1, 2], dtype=torch.int64), + ), + }, + ] + original_column_node_ids = [ + { + "n1": torch.tensor([0, 1, 0]), + "n2": torch.tensor([1, 0, 2]), + }, + { + "n1": torch.tensor([0]), + "n2": torch.tensor([1]), + }, + ] + original_row_node_ids = [ + { + "n1": torch.tensor([0, 1, 0, 1, 0, 0, 1, 0, 1]), + "n2": torch.tensor([1, 0, 2, 0, 2, 0, 1, 0, 2]), + }, + { + "n1": torch.tensor([0, 1, 0]), + "n2": torch.tensor([1, 0, 2]), + }, + ] + + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + for ntype in ["n1", "n2"]: + assert torch.equal( + sampled_subgraph.original_row_node_ids[ntype], + original_row_node_ids[step][ntype].to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids[ntype], + original_column_node_ids[step][ntype].to(F.ctx()), + ) + for etype in ["n1:e1:n2", "n2:e2:n1"]: + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indices, + csc_formats[step][etype].indices.to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indptr, + csc_formats[step][etype].indptr.to(F.ctx()), + ) + + +@unittest.skipIf( + F._default_context_str == "gpu", + reason="Fails due to different result on the GPU.", +) +@pytest.mark.parametrize("labor", [False, True]) +def test_SubgraphSampler_unique_csc_format_Homo_Link_cpu(labor): + torch.manual_seed(1205) + graph = dgl.graph(([5, 0, 6, 7, 2, 2, 4], [0, 1, 2, 2, 3, 4, 4])) + graph = gb.from_dglgraph(graph, True).to(F.ctx()) + seed_nodes = torch.LongTensor([[0, 3], [4, 4]]) + + itemset = gb.ItemSet(seed_nodes, names="seeds") + item_sampler = gb.ItemSampler(itemset, batch_size=4).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + + Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler + datapipe = Sampler( + item_sampler, + graph, + fanouts, + deduplicate=True, + ) + + original_row_node_ids = [ + torch.tensor([0, 3, 4, 5, 2, 6, 7]).to(F.ctx()), + torch.tensor([0, 3, 4, 5, 2]).to(F.ctx()), + ] + compacted_indices = [ + torch.tensor([3, 4, 4, 2, 5, 6]).to(F.ctx()), + torch.tensor([3, 4, 4, 2]).to(F.ctx()), + ] + indptr = [ + torch.tensor([0, 1, 2, 4, 4, 6]).to(F.ctx()), + torch.tensor([0, 1, 2, 4]).to(F.ctx()), + ] + seeds = [ + torch.tensor([0, 3, 4, 5, 2]).to(F.ctx()), + torch.tensor([0, 3, 4]).to(F.ctx()), + ] + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert torch.equal( + sampled_subgraph.original_row_node_ids, + original_row_node_ids[step], + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indices, compacted_indices[step] + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indptr, indptr[step] + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids, seeds[step] + ) + + +@unittest.skipIf( + F._default_context_str == "cpu", + reason="Fails due to different result on 
the CPU.", +) +@pytest.mark.parametrize("labor", [False, True]) +def test_SubgraphSampler_unique_csc_format_Homo_Link_gpu(labor): + torch.manual_seed(1205) + graph = dgl.graph(([5, 0, 7, 7, 2, 4], [0, 1, 2, 2, 3, 4])) + graph = gb.from_dglgraph(graph, is_homogeneous=True).to(F.ctx()) + seed_nodes = torch.LongTensor([[0, 3], [4, 4]]) + + itemset = gb.ItemSet(seed_nodes, names="seeds") + item_sampler = gb.ItemSampler(itemset, batch_size=4).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([-1]) for _ in range(num_layer)] + + Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler + datapipe = Sampler( + item_sampler, + graph, + fanouts, + deduplicate=True, + ) + + original_row_node_ids = [ + torch.tensor([0, 3, 4, 2, 5, 7]).to(F.ctx()), + torch.tensor([0, 3, 4, 2, 5]).to(F.ctx()), + ] + compacted_indices = [ + torch.tensor([4, 3, 2, 5, 5]).to(F.ctx()), + torch.tensor([4, 3, 2]).to(F.ctx()), + ] + indptr = [ + torch.tensor([0, 1, 2, 3, 5, 5]).to(F.ctx()), + torch.tensor([0, 1, 2, 3]).to(F.ctx()), + ] + seeds = [ + torch.tensor([0, 3, 4, 2, 5]).to(F.ctx()), + torch.tensor([0, 3, 4]).to(F.ctx()), + ] + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert torch.equal( + sampled_subgraph.original_row_node_ids, + original_row_node_ids[step], + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indices, compacted_indices[step] + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indptr, indptr[step] + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids, seeds[step] + ) + + +@pytest.mark.parametrize("labor", [False, True]) +def test_SubgraphSampler_unique_csc_format_Hetero_Link(labor): + graph = get_hetero_graph().to(F.ctx()) + itemset = gb.ItemSetDict( + {"n1:e1:n2": gb.ItemSet(torch.tensor([[0, 1]]), names="seeds")} + ) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler + datapipe = Sampler( + item_sampler, + graph, + fanouts, + deduplicate=True, + ) + csc_formats = [ + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4, 6]), + indices=torch.tensor([1, 0, 0, 1, 0, 1]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0, 2, 4]), + indices=torch.tensor([1, 2, 1, 0]), + ), + }, + { + "n1:e1:n2": gb.CSCFormatBase( + indptr=torch.tensor([0, 2]), + indices=torch.tensor([1, 0]), + ), + "n2:e2:n1": gb.CSCFormatBase( + indptr=torch.tensor([0, 2]), + indices=torch.tensor([1, 2], dtype=torch.int64), + ), + }, + ] + original_column_node_ids = [ + { + "n1": torch.tensor([0, 1]), + "n2": torch.tensor([0, 1, 2]), + }, + { + "n1": torch.tensor([0]), + "n2": torch.tensor([1]), + }, + ] + original_row_node_ids = [ + { + "n1": torch.tensor([0, 1]), + "n2": torch.tensor([0, 1, 2]), + }, + { + "n1": torch.tensor([0, 1]), + "n2": torch.tensor([0, 1, 2]), + }, + ] + + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + for ntype in ["n1", "n2"]: + assert torch.equal( + torch.sort(sampled_subgraph.original_row_node_ids[ntype])[ + 0 + ], + original_row_node_ids[step][ntype].to(F.ctx()), + ) + assert torch.equal( + torch.sort( + sampled_subgraph.original_column_node_ids[ntype] + )[0], + original_column_node_ids[step][ntype].to(F.ctx()), + ) + for etype in ["n1:e1:n2", "n2:e2:n1"]: + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indices, + csc_formats[step][etype].indices.to(F.ctx()), + ) + 
assert torch.equal( + sampled_subgraph.sampled_csc[etype].indptr, + csc_formats[step][etype].indptr.to(F.ctx()), + ) From 43912418d8575a13eb54028e3f7f78f794b81c4b Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 6 Feb 2024 09:50:12 +0300 Subject: [PATCH 28/45] [GraphBolt][CUDA] puregpu option for the multiGPU example. (#7089) --- .../multigpu/graphbolt/node_classification.py | 37 ++++++++++--------- .../impl/fused_csc_sampling_graph.py | 5 ++- .../impl/torch_based_feature_store.py | 10 ++++- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index b144fdb1d5cc..952b5b355c48 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -151,9 +151,7 @@ def evaluate(rank, model, dataloader, num_classes, device): y = [] y_hats = [] - for step, data in ( - tqdm.tqdm(enumerate(dataloader)) if rank == 0 else enumerate(dataloader) - ): + for data in tqdm.tqdm(dataloader) if rank == 0 else dataloader: blocks = data.blocks x = data.node_features["feat"] y.append(data.labels) @@ -271,8 +269,11 @@ def run(rank, world_size, args, devices, dataset): # Pin the graph and features to enable GPU access. if args.storage_device == "pinned": - dataset.graph.pin_memory_() - dataset.feature.pin_memory_() + graph = dataset.graph.pin_memory_() + feature = dataset.feature.pin_memory_() + else: + graph = dataset.graph.to(args.storage_device) + feature = dataset.feature.to(args.storage_device) train_set = dataset.tasks[0].train_set valid_set = dataset.tasks[0].validation_set @@ -280,13 +281,13 @@ def run(rank, world_size, args, devices, dataset): args.fanout = list(map(int, args.fanout.split(","))) num_classes = dataset.tasks[0].metadata["num_classes"] - in_size = dataset.feature.size("node", None, "feat")[0] + in_size = feature.size("node", None, "feat")[0] hidden_size = 256 out_size = num_classes - if args.gpu_cache_size > 0: - dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature( - dataset.feature._features[("node", None, "feat")], + if args.gpu_cache_size > 0 and args.storage_device != "cuda": + feature._features[("node", None, "feat")] = gb.GPUCachedFeature( + feature._features[("node", None, "feat")], args.gpu_cache_size, ) @@ -297,24 +298,24 @@ def run(rank, world_size, args, devices, dataset): # Create data loaders. 
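+ # The loaders below consume the rebound `graph` and `feature`, which
+ # are either pinned in place or device copies, instead of `dataset.*`.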
train_dataloader = create_dataloader( args, - dataset.graph, - dataset.feature, + graph, + feature, train_set, device, is_train=True, ) valid_dataloader = create_dataloader( args, - dataset.graph, - dataset.feature, + graph, + feature, valid_set, device, is_train=False, ) test_dataloader = create_dataloader( args, - dataset.graph, - dataset.feature, + graph, + feature, test_set, device, is_train=False, @@ -396,9 +397,9 @@ def parse_args(): parser.add_argument( "--mode", default="pinned-cuda", - choices=["cpu-cuda", "pinned-cuda"], - help="Dataset storage placement and Train device: 'cpu' for CPU and RAM," - " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", + choices=["cpu-cuda", "pinned-cuda", "cuda-cuda"], + help="Dataset storage placement and Train device: 'cpu' for CPU and RAM" + ", 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", ) return parser.parse_args() diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index de81c137833b..d30a3fbdfa83 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -1092,7 +1092,8 @@ def _pin(x): return self2._apply_to_members(_pin if device == "pinned" else _to) def pin_memory_(self): - """Copy `FusedCSCSamplingGraph` to the pinned memory in-place.""" + """Copy `FusedCSCSamplingGraph` to the pinned memory in-place. Returns + the same object modified in-place.""" # torch.Tensor.pin_memory() is not an inplace operation. To make it # truly in-place, we need to use cudaHostRegister. Then, we need to use # cudaHostUnregister to unpin the tensor in the destructor. @@ -1123,7 +1124,7 @@ def _pin(x): return x - self._apply_to_members(_pin) + return self._apply_to_members(_pin) def fused_csc_sampling_graph( diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 577e29b7325b..9fd3c5f45f04 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -175,7 +175,8 @@ def metadata(self): ) def pin_memory_(self): - """In-place operation to copy the feature to pinned memory.""" + """In-place operation to copy the feature to pinned memory. Returns the + same object modified in-place.""" # torch.Tensor.pin_memory() is not an inplace operation. To make it # truly in-place, we need to use cudaHostRegister. Then, we need to use # cudaHostUnregister to unpin the tensor in the destructor. @@ -194,6 +195,8 @@ def pin_memory_(self): self._is_inplace_pinned.add(x) + return self + def is_pinned(self): """Returns True if the stored feature is pinned.""" return self._tensor.is_pinned() @@ -289,10 +292,13 @@ def __init__(self, feat_data: List[OnDiskFeatureData]): super().__init__(features) def pin_memory_(self): - """In-place operation to copy the feature store to pinned memory.""" + """In-place operation to copy the feature store to pinned memory. 
+ Returns the same object modified in-place.""" for feature in self._features.values(): feature.pin_memory_() + return self + def is_pinned(self): """Returns True if all the stored features are pinned.""" return all(feature.is_pinned() for feature in self._features.values()) From 0504bc2c1154f2b6e1b3450cc560f8418cb74012 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Wed, 7 Feb 2024 10:08:56 +0800 Subject: [PATCH 29/45] [DistGB] use expand_indptr for csc to coo (#7090) --- python/dgl/distributed/graph_services.py | 7 +++++-- python/dgl/distributed/partition.py | 5 ++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 9188a38675a9..0c0a87ebaa85 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -151,8 +151,11 @@ def _sample_neighbors_graphbolt( # 3. Map local node IDs to global node IDs. local_src = subgraph.indices - local_dst = torch.repeat_interleave( - subgraph.original_column_node_ids, torch.diff(subgraph.indptr) + local_dst = gb.expand_indptr( + subgraph.indptr, + dtype=local_src.dtype, + node_ids=subgraph.original_column_node_ids, + output_size=local_src.shape[0], ) global_nid_mapping = g.node_attributes[NID] global_src = global_nid_mapping[local_src] diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 6928d24da534..83f83b97c9d7 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -199,11 +199,10 @@ def _verify_graphbolt_partition(graph, part_id, gpb, ntypes, etypes): field in graph.edge_attributes for field in required_edata_fields ), "the partition graph should contain edge mapping to global edge ID." - num_nodes = graph.total_num_nodes num_edges = graph.total_num_edges local_src_ids = graph.indices - local_dst_ids = torch.repeat_interleave( - torch.arange(num_nodes), torch.diff(graph.csc_indptr) + local_dst_ids = gb.expand_indptr( + graph.csc_indptr, dtype=local_src_ids.dtype, output_size=num_edges ) global_src_ids = graph.node_attributes[NID][local_src_ids] global_dst_ids = graph.node_attributes[NID][local_dst_ids] From 8cf5ad84b622c754bcc0b71c0f4e0f29072d687f Mon Sep 17 00:00:00 2001 From: Ramon Zhou Date: Wed, 7 Feb 2024 11:02:28 +0800 Subject: [PATCH 30/45] [GraphBolt] Add `to_pyg_data` for MiniBatch (#7076) --- python/dgl/graphbolt/minibatch.py | 46 +++++++++- .../pytorch/graphbolt/test_minibatch.py | 83 +++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/minibatch.py b/python/dgl/graphbolt/minibatch.py index 7e528bcb8ea2..aef6b31988a2 100644 --- a/python/dgl/graphbolt/minibatch.py +++ b/python/dgl/graphbolt/minibatch.py @@ -8,7 +8,7 @@ import dgl from dgl.utils import recursive_apply -from .base import etype_str_to_tuple +from .base import etype_str_to_tuple, expand_indptr from .internal import get_attributes from .sampled_subgraph import SampledSubgraph @@ -474,6 +474,50 @@ def node_pairs_with_labels(self): else: return None + def to_pyg_data(self): + """Construct a PyG Data from `MiniBatch`. This function only supports + node classification task on a homogeneous graph and the number of + features cannot be more than one. 
+ """ + from torch_geometric.data import Data + + if self.sampled_subgraphs is None: + edge_index = None + else: + col_nodes = [] + row_nodes = [] + for subgraph in self.sampled_subgraphs: + if subgraph is None: + continue + sampled_csc = subgraph.sampled_csc + indptr = sampled_csc.indptr + indices = sampled_csc.indices + expanded_indptr = expand_indptr( + indptr, dtype=indices.dtype, output_size=len(indices) + ) + col_nodes.append(expanded_indptr) + row_nodes.append(indices) + col_nodes = torch.cat(col_nodes) + row_nodes = torch.cat(row_nodes) + edge_index = torch.unique( + torch.stack((col_nodes, row_nodes)), dim=1 + ) + + if self.node_features is None: + node_features = None + else: + assert ( + len(self.node_features) == 1 + ), "`to_pyg_data` only supports single feature homogeneous graph." + node_features = next(iter(self.node_features.values())) + + pyg_data = Data( + x=node_features, + edge_index=edge_index, + y=self.labels, + ) + return pyg_data + def to(self, device: torch.device): # pylint: disable=invalid-name """Copy `MiniBatch` to the specified device using reflection.""" diff --git a/tests/python/pytorch/graphbolt/test_minibatch.py b/tests/python/pytorch/graphbolt/test_minibatch.py index 96bce1289d50..1f708c84c664 100644 --- a/tests/python/pytorch/graphbolt/test_minibatch.py +++ b/tests/python/pytorch/graphbolt/test_minibatch.py @@ -859,3 +859,86 @@ def test_dgl_link_predication_hetero(mode): minibatch.negative_node_pairs[etype][1], minibatch.compacted_negative_dsts[etype], ) + + +def test_to_pyg_data(): + test_subgraph_a = gb.SampledSubgraphImpl( + sampled_csc=gb.CSCFormatBase( + indptr=torch.tensor([0, 1, 3, 5, 6]), + indices=torch.tensor([0, 1, 2, 2, 1, 2]), + ), + original_column_node_ids=torch.tensor([10, 11, 12, 13]), + original_row_node_ids=torch.tensor([19, 20, 21, 22, 25, 30]), + original_edge_ids=torch.tensor([10, 11, 12, 13]), + ) + test_subgraph_b = gb.SampledSubgraphImpl( + sampled_csc=gb.CSCFormatBase( + indptr=torch.tensor([0, 1, 3]), + indices=torch.tensor([1, 2, 0]), + ), + original_row_node_ids=torch.tensor([10, 11, 12]), + original_edge_ids=torch.tensor([10, 15, 17]), + original_column_node_ids=torch.tensor([10, 11]), + ) + expected_edge_index = torch.tensor( + [[0, 0, 1, 1, 1, 2, 2, 3], [0, 1, 0, 1, 2, 1, 2, 2]] + ) + expected_node_features = torch.tensor([[1], [2], [3], [4]]) + expected_labels = torch.tensor([0, 1]) + test_minibatch = gb.MiniBatch( + sampled_subgraphs=[test_subgraph_a, test_subgraph_b], + node_features={"feat": expected_node_features}, + labels=expected_labels, + ) + pyg_data = test_minibatch.to_pyg_data() + pyg_data.validate() + assert torch.equal(pyg_data.edge_index, expected_edge_index) + assert torch.equal(pyg_data.x, expected_node_features) + assert torch.equal(pyg_data.y, expected_labels) + + # Test with sampled_csc as None. + test_minibatch = gb.MiniBatch( + sampled_subgraphs=None, + node_features={"feat": expected_node_features}, + labels=expected_labels, + ) + pyg_data = test_minibatch.to_pyg_data() + assert pyg_data.edge_index is None, "Edge index should be none." + + # Test with node_features as None. + test_minibatch = gb.MiniBatch( + sampled_subgraphs=[test_subgraph_a], + node_features=None, + labels=expected_labels, + ) + pyg_data = test_minibatch.to_pyg_data() + assert pyg_data.x is None, "Node features should be None." + + # Test with labels as None. 
+ test_minibatch = gb.MiniBatch(
+ sampled_subgraphs=[test_subgraph_a],
+ node_features={"feat": expected_node_features},
+ labels=None,
+ )
+ pyg_data = test_minibatch.to_pyg_data()
+ assert pyg_data.y is None, "Labels should be None."
+
+ # Test with multiple features.
+ test_minibatch = gb.MiniBatch(
+ sampled_subgraphs=[test_subgraph_a],
+ node_features={
+ "feat": expected_node_features,
+ "extra_feat": torch.tensor([[3], [4]]),
+ },
+ labels=expected_labels,
+ )
+ try:
+ test_minibatch.to_pyg_data()
+ # Reaching this line means no error was raised, which is a failure.
+ raise RuntimeError("Multiple features case should raise an error.")
+ except AssertionError as e:
+ assert (
+ str(e)
+ == "`to_pyg_data` only supports single feature homogeneous graph."
+ )

From 870d8d025bae7742aaab94eba150ba0d7d444800 Mon Sep 17 00:00:00 2001
From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com>
Date: Thu, 8 Feb 2024 09:56:31 +0800
Subject: [PATCH 31/45] [doc] fix undefined variable in code snippet (#7107)

---
 docs/source/guide/minibatch-node.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/guide/minibatch-node.rst b/docs/source/guide/minibatch-node.rst
index af83463248d4..6e81895b8026 100644
--- a/docs/source/guide/minibatch-node.rst
+++ b/docs/source/guide/minibatch-node.rst
@@ -44,6 +44,7 @@ putting the list of generated MFGs onto GPU.

 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 dataset = gb.BuiltinDataset("ogbn-arxiv").load()
+ g = dataset.graph
 train_set = dataset.tasks[0].train_set
 datapipe = gb.ItemSampler(train_set, batch_size=1024, shuffle=True)
 datapipe = datapipe.sample_neighbor(g, [10, 10]) # 2 layers.
@@ -205,6 +206,7 @@ of node types to node IDs.

 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 dataset = gb.BuiltinDataset("ogbn-mag").load()
+ g = dataset.graph
 train_set = dataset.tasks[0].train_set
 datapipe = gb.ItemSampler(train_set, batch_size=1024, shuffle=True)
 datapipe = datapipe.sample_neighbor(g, [10, 10]) # 2 layers.

From 763bd39ff56a6e21473b077ae096d6acb79b4cba Mon Sep 17 00:00:00 2001
From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com>
Date: Thu, 8 Feb 2024 10:37:01 +0800
Subject: [PATCH 32/45] [DistGB] sample with graphbolt on homograph via
 DistDataLoader (#7098)

---
 tests/distributed/test_mp_dataloader.py | 118 +++++++++++++++---------
 1 file changed, 73 insertions(+), 45 deletions(-)

diff --git a/tests/distributed/test_mp_dataloader.py b/tests/distributed/test_mp_dataloader.py
index 4f8d80c9ddbf..cdefb28b93ad 100644
--- a/tests/distributed/test_mp_dataloader.py
+++ b/tests/distributed/test_mp_dataloader.py
@@ -22,10 +22,19 @@

 class NeighborSampler(object):
- def __init__(self, g, fanouts, sample_neighbors):
+ def __init__(
+ self,
+ g,
+ fanouts,
+ sample_neighbors,
+ use_graphbolt=False,
+ return_eids=False,
+ ):
 self.g = g
 self.fanouts = fanouts
 self.sample_neighbors = sample_neighbors
+ self.use_graphbolt = use_graphbolt
+ self.return_eids = return_eids

 def sample_blocks(self, seeds):
 import torch as th
@@ -35,13 +44,16 @@ def sample_blocks(self, seeds):
 blocks = []
 for fanout in self.fanouts:
 # For each seed node, sample ``fanout`` neighbors.
 frontier = self.sample_neighbors(
- self.g, seeds, fanout, replace=True
+ self.g, seeds, fanout, use_graphbolt=self.use_graphbolt
 )
 # Then we compact the frontier into a bipartite graph for
 # message passing.
 block = dgl.to_block(frontier, seeds)
 # Obtain the seed nodes for next layer. 
seeds = block.srcdata[dgl.NID] + if frontier.num_edges() > 0: + if not self.use_graphbolt or self.return_eids: + block.edata[dgl.EID] = frontier.edata[dgl.EID] blocks.insert(0, block) return blocks @@ -53,6 +65,7 @@ def start_server( part_config, disable_shared_mem, num_clients, + use_graphbolt=False, ): print("server: #clients=" + str(num_clients)) g = DistGraphServer( @@ -63,6 +76,7 @@ def start_server( part_config, disable_shared_mem=disable_shared_mem, graph_format=["csc", "coo"], + use_graphbolt=use_graphbolt, ) g.start() @@ -75,30 +89,36 @@ def start_dist_dataloader( drop_last, orig_nid, orig_eid, - group_id=0, + use_graphbolt=False, + return_eids=False, ): - import dgl - import torch as th - - os.environ["DGL_GROUP_ID"] = str(group_id) dgl.distributed.initialize(ip_config) gpb = None - disable_shared_mem = num_server > 0 + disable_shared_mem = num_server > 1 if disable_shared_mem: _, _, _, gpb, _, _, _ = load_partition(part_config, rank) num_nodes_to_sample = 202 batch_size = 32 train_nid = th.arange(num_nodes_to_sample) - dist_graph = DistGraph("test_mp", gpb=gpb, part_config=part_config) - - for i in range(num_server): - part, _, _, _, _, _, _ = load_partition(part_config, i) + dist_graph = DistGraph( + "test_sampling", + gpb=gpb, + part_config=part_config, + use_graphbolt=use_graphbolt, + ) # Create sampler sampler = NeighborSampler( - dist_graph, [5, 10], dgl.distributed.sample_neighbors + dist_graph, + [5, 10], + dgl.distributed.sample_neighbors, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) + # Enable santity check in distributed sampling. + os.environ["DGL_DIST_DEBUG"] = "1" + # We need to test creating DistDataLoader multiple times. for i in range(2): # Create DataLoader for constructing blocks @@ -113,7 +133,7 @@ def start_dist_dataloader( groundtruth_g = CitationGraphDataset("cora")[0] max_nid = [] - for epoch in range(2): + for _ in range(2): for idx, blocks in zip( range(0, num_nodes_to_sample, batch_size), dataloader ): @@ -129,6 +149,16 @@ def start_dist_dataloader( src_nodes_id, dst_nodes_id ) assert np.all(F.asnumpy(has_edges)) + + if use_graphbolt and not return_eids: + continue + eids = orig_eid[block.edata[dgl.EID]] + expected_eids = groundtruth_g.edge_ids( + src_nodes_id, dst_nodes_id + ) + assert th.equal( + eids, expected_eids + ), f"{eids} != {expected_eids}" if drop_last: assert ( np.max(max_nid) @@ -311,23 +341,22 @@ def check_neg_dataloader(g, num_server, num_workers): assert p.exitcode == 0 -@unittest.skip(reason="Skip due to glitch in CI") -@pytest.mark.parametrize("num_server", [3]) +@pytest.mark.parametrize("num_server", [1]) @pytest.mark.parametrize("num_workers", [0, 4]) -@pytest.mark.parametrize("drop_last", [True, False]) -@pytest.mark.parametrize("num_groups", [1]) -def test_dist_dataloader(num_server, num_workers, drop_last, num_groups): +@pytest.mark.parametrize("drop_last", [False, True]) +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_dist_dataloader( + num_server, num_workers, drop_last, use_graphbolt, return_eids +): reset_envs() - # No multiple partitions on single machine for - # multiple client groups in case of race condition. 
- if num_groups > 1: - num_server = 1 + os.environ["DGL_DIST_MODE"] = "distributed" + os.environ["DGL_NUM_SAMPLER"] = str(num_workers) with tempfile.TemporaryDirectory() as test_dir: ip_config = "ip_config.txt" generate_ip_config(ip_config, num_server, num_server) g = CitationGraphDataset("cora")[0] - print(g.idtype) num_parts = num_server num_hops = 1 @@ -339,6 +368,8 @@ def test_dist_dataloader(num_server, num_workers, drop_last, num_groups): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) part_config = os.path.join(test_dir, "test_sampling.json") @@ -353,36 +384,33 @@ def test_dist_dataloader(num_server, num_workers, drop_last, num_groups): part_config, num_server > 1, num_workers + 1, + use_graphbolt, ), ) p.start() time.sleep(1) pserver_list.append(p) - os.environ["DGL_DIST_MODE"] = "distributed" - os.environ["DGL_NUM_SAMPLER"] = str(num_workers) ptrainer_list = [] num_trainers = 1 for trainer_id in range(num_trainers): - for group_id in range(num_groups): - p = ctx.Process( - target=start_dist_dataloader, - args=( - trainer_id, - ip_config, - part_config, - num_server, - drop_last, - orig_nid, - orig_eid, - group_id, - ), - ) - p.start() - time.sleep( - 1 - ) # avoid race condition when instantiating DistGraph - ptrainer_list.append(p) + p = ctx.Process( + target=start_dist_dataloader, + args=( + trainer_id, + ip_config, + part_config, + num_server, + drop_last, + orig_nid, + orig_eid, + use_graphbolt, + return_eids, + ), + ) + p.start() + time.sleep(1) # avoid race condition when instantiating DistGraph + ptrainer_list.append(p) for p in ptrainer_list: p.join() From 7f7967b384578999f2e2e40d146b516c90649505 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:01:03 +0800 Subject: [PATCH 33/45] [doc] fix undefined variable in example --- docs/source/guide/minibatch-node.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/guide/minibatch-node.rst b/docs/source/guide/minibatch-node.rst index 6e81895b8026..4fa695694a6d 100644 --- a/docs/source/guide/minibatch-node.rst +++ b/docs/source/guide/minibatch-node.rst @@ -45,6 +45,7 @@ putting the list of generated MFGs onto GPU. device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') dataset = gb.BuiltinDataset("ogbn-arxiv").load() g = dataset.graph + feature = dataset.feature train_set = dataset.tasks[0].train_set datapipe = gb.ItemSampler(train_set, batch_size=1024, shuffle=True) datapipe = datapipe.sample_neighbor(g, [10, 10]) # 2 layers. @@ -207,6 +208,7 @@ of node types to node IDs. device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') dataset = gb.BuiltinDataset("ogbn-mag").load() g = dataset.graph + feature = dataset.feature train_set = dataset.tasks[0].train_set datapipe = gb.ItemSampler(train_set, batch_size=1024, shuffle=True) datapipe = datapipe.sample_neighbor(g, [10, 10]) # 2 layers. 
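The two documentation fixes above first bind `g = dataset.graph` and then `feature = dataset.feature`, so the guide's snippet no longer references undefined names. For context, a minimal sketch of how the corrected snippet typically continues; the `fetch_feature`, `copy_to`, and `gb.DataLoader` stages are the guide's usual continuation and are shown as an illustration, not as part of either patch:

import torch
import dgl.graphbolt as gb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = gb.BuiltinDataset("ogbn-arxiv").load()
g = dataset.graph          # was previously referenced before being bound
feature = dataset.feature  # likewise now bound explicitly
train_set = dataset.tasks[0].train_set

datapipe = gb.ItemSampler(train_set, batch_size=1024, shuffle=True)
datapipe = datapipe.sample_neighbor(g, [10, 10])  # 2 layers.
datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"])
datapipe = datapipe.copy_to(device)
dataloader = gb.DataLoader(datapipe)

for minibatch in dataloader:
    blocks = minibatch.blocks  # MFGs ready for a model forward pass

The later stages can be reordered or dropped depending on where sampling and feature fetching should run; the corrected bindings are the only part the two patches actually touch.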
From 3ebdee7768561dccb7bb878d573123f8808df9ca Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 9 Feb 2024 09:58:32 +0800 Subject: [PATCH 34/45] [DistGB] sample with graphbolt on homograph via DistNodeDataLoader (#7108) --- python/dgl/dataloading/neighbor_sampler.py | 6 +- python/dgl/distributed/dist_graph.py | 7 ++- tests/distributed/test_mp_dataloader.py | 73 +++++++++++++++++++--- 3 files changed, 73 insertions(+), 13 deletions(-) diff --git a/python/dgl/dataloading/neighbor_sampler.py b/python/dgl/dataloading/neighbor_sampler.py index 603d39107cf1..13b5639b2582 100644 --- a/python/dgl/dataloading/neighbor_sampler.py +++ b/python/dgl/dataloading/neighbor_sampler.py @@ -192,9 +192,11 @@ def sample_blocks(self, g, seed_nodes, exclude_eids=None): output_device=self.output_device, exclude_edges=exclude_eids, ) - eid = frontier.edata[EID] block = to_block(frontier, seed_nodes) - block.edata[EID] = eid + # If sampled from graphbolt-backed DistGraph, `EID` may not be in + # the block. + if EID in frontier.edata.keys(): + block.edata[EID] = frontier.edata[EID] seed_nodes = block.srcdata[NID] blocks.insert(0, block) diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index 5bf76498ec97..b1af7b7dbd44 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -1406,7 +1406,12 @@ def sample_neighbors( ) else: frontier = graph_services.sample_neighbors( - self, seed_nodes, fanout, replace=replace, prob=prob + self, + seed_nodes, + fanout, + replace=replace, + prob=prob, + use_graphbolt=self._use_graphbolt, ) return frontier diff --git a/tests/distributed/test_mp_dataloader.py b/tests/distributed/test_mp_dataloader.py index cdefb28b93ad..4d6e8cf25834 100644 --- a/tests/distributed/test_mp_dataloader.py +++ b/tests/distributed/test_mp_dataloader.py @@ -342,7 +342,7 @@ def check_neg_dataloader(g, num_server, num_workers): @pytest.mark.parametrize("num_server", [1]) -@pytest.mark.parametrize("num_workers", [0, 4]) +@pytest.mark.parametrize("num_workers", [0, 1]) @pytest.mark.parametrize("drop_last", [False, True]) @pytest.mark.parametrize("use_graphbolt", [False, True]) @pytest.mark.parametrize("return_eids", [False, True]) @@ -429,6 +429,8 @@ def start_node_dataloader( orig_nid, orig_eid, groundtruth_g, + use_graphbolt=False, + return_eids=False, ): dgl.distributed.initialize(ip_config) gpb = None @@ -437,7 +439,12 @@ def start_node_dataloader( _, _, _, gpb, _, _, _ = load_partition(part_config, rank) num_nodes_to_sample = 202 batch_size = 32 - dist_graph = DistGraph("test_mp", gpb=gpb, part_config=part_config) + dist_graph = DistGraph( + "test_sampling", + gpb=gpb, + part_config=part_config, + use_graphbolt=use_graphbolt, + ) assert len(dist_graph.ntypes) == len(groundtruth_g.ntypes) assert len(dist_graph.etypes) == len(groundtruth_g.etypes) if len(dist_graph.etypes) == 1: @@ -459,6 +466,9 @@ def start_node_dataloader( ] ) # test int for hetero + # Enable santity check in distributed sampling. + os.environ["DGL_DIST_DEBUG"] = "1" + # We need to test creating DistDataLoader multiple times. 
for i in range(2): # Create DataLoader for constructing blocks @@ -472,7 +482,7 @@ def start_node_dataloader( num_workers=num_workers, ) - for epoch in range(2): + for _ in range(2): for idx, (_, _, blocks) in zip( range(0, num_nodes_to_sample, batch_size), dataloader ): @@ -487,6 +497,16 @@ def start_node_dataloader( src_nodes_id, dst_nodes_id, etype=etype ) assert np.all(F.asnumpy(has_edges)) + + if use_graphbolt and not return_eids: + continue + eids = orig_eid[etype][block.edata[dgl.EID]] + expected_eids = groundtruth_g.edge_ids( + src_nodes_id, dst_nodes_id + ) + assert th.equal( + eids, expected_eids + ), f"{eids} != {expected_eids}" del dataloader # this is needed since there's two test here in one process dgl.distributed.exit_client() @@ -509,7 +529,7 @@ def start_edge_dataloader( _, _, _, gpb, _, _, _ = load_partition(part_config, rank) num_edges_to_sample = 202 batch_size = 32 - dist_graph = DistGraph("test_mp", gpb=gpb, part_config=part_config) + dist_graph = DistGraph("test_sampling", gpb=gpb, part_config=part_config) assert len(dist_graph.ntypes) == len(groundtruth_g.ntypes) assert len(dist_graph.etypes) == len(groundtruth_g.etypes) if len(dist_graph.etypes) == 1: @@ -561,7 +581,14 @@ def start_edge_dataloader( dgl.distributed.exit_client() -def check_dataloader(g, num_server, num_workers, dataloader_type): +def check_dataloader( + g, + num_server, + num_workers, + dataloader_type, + use_graphbolt=False, + return_eids=False, +): with tempfile.TemporaryDirectory() as test_dir: ip_config = "ip_config.txt" generate_ip_config(ip_config, num_server, num_server) @@ -576,6 +603,8 @@ def check_dataloader(g, num_server, num_workers, dataloader_type): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) part_config = os.path.join(test_dir, "test_sampling.json") if not isinstance(orig_nid, dict): @@ -594,6 +623,7 @@ def check_dataloader(g, num_server, num_workers, dataloader_type): part_config, num_server > 1, num_workers + 1, + use_graphbolt, ), ) p.start() @@ -615,6 +645,8 @@ def check_dataloader(g, num_server, num_workers, dataloader_type): orig_nid, orig_eid, g, + use_graphbolt, + return_eids, ), ) p.start() @@ -663,14 +695,35 @@ def create_random_hetero(): return g -@unittest.skip(reason="Skip due to glitch in CI") -@pytest.mark.parametrize("num_server", [3]) -@pytest.mark.parametrize("num_workers", [0, 4]) +@pytest.mark.parametrize("num_server", [1]) +@pytest.mark.parametrize("num_workers", [0, 1]) @pytest.mark.parametrize("dataloader_type", ["node", "edge"]) -def test_dataloader(num_server, num_workers, dataloader_type): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_dataloader_homograph( + num_server, num_workers, dataloader_type, use_graphbolt, return_eids +): + if dataloader_type == "edge" and use_graphbolt: + # GraphBolt does not support edge dataloader. 
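+ # Returning early keeps the parametrized combination green without
+ # exercising any distributed setup.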
+ return reset_envs() g = CitationGraphDataset("cora")[0] - check_dataloader(g, num_server, num_workers, dataloader_type) + check_dataloader( + g, + num_server, + num_workers, + dataloader_type, + use_graphbolt=use_graphbolt, + return_eids=return_eids, + ) + + +@unittest.skip(reason="Skip due to glitch in CI") +@pytest.mark.parametrize("num_server", [1]) +@pytest.mark.parametrize("num_workers", [0, 1]) +@pytest.mark.parametrize("dataloader_type", ["node", "edge"]) +def test_dataloader_heterograph(num_server, num_workers, dataloader_type): + reset_envs() g = create_random_hetero() check_dataloader(g, num_server, num_workers, dataloader_type) From 6735a3ae3df75b2eadf774360567e5a144e359f0 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 9 Feb 2024 10:09:16 +0800 Subject: [PATCH 35/45] [DistGB] enable sample etype neighbors on heterograph (#7095) --- python/dgl/distributed/graph_services.py | 79 +++++-- .../distributed/test_distributed_sampling.py | 205 +++++++++++++++--- 2 files changed, 234 insertions(+), 50 deletions(-) diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 0c0a87ebaa85..d0d743999c65 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -143,8 +143,6 @@ def _sample_neighbors_graphbolt( if isinstance(fanout, int): fanout = torch.LongTensor([fanout]) assert isinstance(fanout, torch.Tensor), "Expect a tensor of fanout." - # [Rui][TODO] Support multiple fanouts. - assert fanout.numel() == 1, "Expect a single fanout." return_eids = g.edge_attributes is not None and EID in g.edge_attributes subgraph = g._sample_neighbors(nodes, fanout, return_eids=return_eids) @@ -237,15 +235,15 @@ def _sample_neighbors(use_graphbolt, *args, **kwargs): return func(*args, **kwargs) -def _sample_etype_neighbors( +def _sample_etype_neighbors_dgl( local_g, partition_book, seed_nodes, - etype_offset, fan_out, - edge_dir, - prob, - replace, + edge_dir="in", + prob=None, + replace=False, + etype_offset=None, etype_sorted=False, ): """Sample from local partition. @@ -255,6 +253,8 @@ def _sample_etype_neighbors( The sampled results are stored in three vectors that store source nodes, destination nodes and edge IDs. """ + assert etype_offset is not None, "The etype offset is not provided." + local_ids = partition_book.nid2localnid(seed_nodes, partition_book.partid) local_ids = F.astype(local_ids, local_g.idtype) @@ -278,6 +278,43 @@ def _sample_etype_neighbors( return LocalSampledGraph(global_src, global_dst, global_eids) +def _sample_etype_neighbors(use_graphbolt, *args, **kwargs): + """Wrapper for sampling etype neighbors. + + The actual sampling function depends on whether to use GraphBolt. + + Parameters + ---------- + use_graphbolt : bool + Whether to use GraphBolt for sampling. + args : list + The arguments for the sampling function. + kwargs : dict + The keyword arguments for the sampling function. + + Returns + ------- + tensor + The source node ID array. + tensor + The destination node ID array. + tensor + The edge ID array. + tensor + The edge type ID array. + """ + func = ( + _sample_neighbors_graphbolt + if use_graphbolt + else _sample_etype_neighbors_dgl + ) + if use_graphbolt: + # GraphBolt does not require `etype_offset` and `etype_sorted`. 
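+ # Both kwargs matter only on the DGL path: `_sample_etype_neighbors_dgl`
+ # asserts that `etype_offset` is provided, while the GraphBolt sampler
+ # operates on a fused graph that carries edge-type information itself.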
+ kwargs.pop("etype_offset", None) + kwargs.pop("etype_sorted", None) + return func(*args, **kwargs) + + def _find_edges(local_g, partition_book, seed_edges): """Given an edge ID array, return the source and destination node ID array ``s`` and ``d`` in the local partition. @@ -426,6 +463,7 @@ def __init__( prob=None, replace=False, etype_sorted=True, + use_graphbolt=False, ): self.seed_nodes = nodes self.edge_dir = edge_dir @@ -433,6 +471,7 @@ def __init__( self.replace = replace self.fan_out = fan_out self.etype_sorted = etype_sorted + self.use_graphbolt = use_graphbolt def __setstate__(self, state): ( @@ -442,6 +481,7 @@ def __setstate__(self, state): self.replace, self.fan_out, self.etype_sorted, + self.use_graphbolt, ) = state def __getstate__(self): @@ -452,6 +492,7 @@ def __getstate__(self): self.replace, self.fan_out, self.etype_sorted, + self.use_graphbolt, ) def process_request(self, server_state): @@ -468,15 +509,16 @@ def process_request(self, server_state): else: probs = None res = _sample_etype_neighbors( + self.use_graphbolt, local_g, partition_book, self.seed_nodes, - etype_offset, self.fan_out, - self.edge_dir, - probs, - self.replace, - self.etype_sorted, + edge_dir=self.edge_dir, + prob=probs, + replace=self.replace, + etype_offset=etype_offset, + etype_sorted=self.etype_sorted, ) return SubgraphResponse( res.global_src, @@ -772,6 +814,7 @@ def sample_etype_neighbors( prob=None, replace=False, etype_sorted=True, + use_graphbolt=False, ): """Sample from the neighbors of the given nodes from a distributed graph. @@ -825,6 +868,8 @@ def sample_etype_neighbors( neighbors are sampled. If fanout == -1, all neighbors are collected. etype_sorted : bool, optional Indicates whether etypes are sorted. + use_graphbolt : bool, optional + Whether to use GraphBolt for sampling. 
 Returns
     -------
@@ -882,6 +927,7 @@ def issue_remote_req(node_ids):
             prob=_prob,
             replace=replace,
             etype_sorted=etype_sorted,
+            use_graphbolt=use_graphbolt,
         )
 
     def local_access(local_g, partition_book, local_nids):
@@ -897,14 +943,15 @@ def local_access(local_g, partition_book, local_nids):
             for etype in g.canonical_etypes
         ]
         return _sample_etype_neighbors(
+            use_graphbolt,
             local_g,
             partition_book,
             local_nids,
-            etype_offset,
             fanout,
-            edge_dir,
-            _prob,
-            replace,
+            edge_dir=edge_dir,
+            prob=_prob,
+            replace=replace,
+            etype_offset=etype_offset,
             etype_sorted=etype_sorted,
         )
 
diff --git a/tests/distributed/test_distributed_sampling.py b/tests/distributed/test_distributed_sampling.py
index eec8f51dbaa4..3d22fd5cf2d8 100644
--- a/tests/distributed/test_distributed_sampling.py
+++ b/tests/distributed/test_distributed_sampling.py
@@ -508,6 +508,8 @@ def start_hetero_etype_sample_client(
     fanout=3,
     nodes={"n3": [0, 10, 99, 66, 124, 208]},
     etype_sorted=False,
+    use_graphbolt=False,
+    return_eids=False,
 ):
     gpb = None
     if disable_shared_mem:
@@ -515,12 +517,14 @@ def start_hetero_etype_sample_client(
             tmpdir / "test_sampling.json", rank
         )
     dgl.distributed.initialize("rpc_ip_config.txt")
-    dist_graph = DistGraph("test_sampling", gpb=gpb)
+    dist_graph = DistGraph(
+        "test_sampling", gpb=gpb, use_graphbolt=use_graphbolt
+    )
     assert "feat" in dist_graph.nodes["n1"].data
     assert "feat" not in dist_graph.nodes["n2"].data
    assert "feat" not in dist_graph.nodes["n3"].data
 
-    if dist_graph.local_partition is not None:
+    if (not use_graphbolt) and dist_graph.local_partition is not None:
         # Check whether etypes are sorted in dist_graph
         local_g = dist_graph.local_partition
         local_nids = np.arange(local_g.num_nodes())
@@ -533,11 +537,19 @@ def start_hetero_etype_sample_client(
     if gpb is None:
         gpb = dist_graph.get_partition_book()
     try:
+        # Enable sanity check in distributed sampling.
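For orientation, the end-to-end client path this test drives looks roughly as follows. This is a hedged sketch, not code from the patch: the graph name, seed nodes and fanout are illustrative, it assumes servers are already running, and it assumes `sample_etype_neighbors` is importable from `dgl.distributed`, as the tests here do.

    import dgl
    from dgl.distributed import DistGraph, sample_etype_neighbors

    dgl.distributed.initialize("rpc_ip_config.txt")
    g = DistGraph("test_sampling", use_graphbolt=True)
    # Sample up to 3 neighbors per edge type around a few "n3" seeds.
    frontier = sample_etype_neighbors(
        g, {"n3": [0, 10, 99]}, 3, use_graphbolt=True
    )
    dgl.distributed.exit_client()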
+ os.environ["DGL_DIST_DEBUG"] = "1" sampled_graph = sample_etype_neighbors( - dist_graph, nodes, fanout, etype_sorted=etype_sorted + dist_graph, + nodes, + fanout, + etype_sorted=etype_sorted, + use_graphbolt=use_graphbolt, ) block = dgl.to_block(sampled_graph, nodes) - block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] + if sampled_graph.num_edges() > 0: + if not use_graphbolt or return_eids: + block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] except Exception as e: print(traceback.format_exc()) block = None @@ -689,7 +701,11 @@ def check_rpc_hetero_sampling_empty_shuffle( def check_rpc_hetero_etype_sampling_shuffle( - tmpdir, num_server, graph_formats=None + tmpdir, + num_server, + graph_formats=None, + use_graphbolt=False, + return_eids=False, ): generate_ip_config("rpc_ip_config.txt", num_server, num_server) @@ -706,6 +722,8 @@ def check_rpc_hetero_etype_sampling_shuffle( part_method="metis", return_mapping=True, graph_formats=graph_formats, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -713,7 +731,14 @@ def check_rpc_hetero_etype_sampling_shuffle( for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling", ["csc", "coo"]), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) @@ -730,6 +755,8 @@ def check_rpc_hetero_etype_sampling_shuffle( fanout, nodes={"n3": [0, 10, 99, 66, 124, 208]}, etype_sorted=etype_sorted, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) print("Done sampling") for p in pserver_list: @@ -747,19 +774,26 @@ def check_rpc_hetero_etype_sampling_shuffle( # These are global Ids after shuffling. shuffled_src = F.gather_row(block.srcnodes[src_type].data[dgl.NID], src) shuffled_dst = F.gather_row(block.dstnodes[dst_type].data[dgl.NID], dst) - shuffled_eid = block.edges[etype].data[dgl.EID] - orig_src = F.asnumpy(F.gather_row(orig_nid_map[src_type], shuffled_src)) orig_dst = F.asnumpy(F.gather_row(orig_nid_map[dst_type], shuffled_dst)) - orig_eid = F.asnumpy(F.gather_row(orig_eid_map[c_etype], shuffled_eid)) + assert np.all( + F.asnumpy(g.has_edges_between(orig_src, orig_dst, etype=etype)) + ) + + if use_graphbolt and not return_eids: + continue # Check the node Ids and edge Ids. 
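The eid verification below leans on the fact that the `orig_nid_map`/`orig_eid_map` returned by `partition_graph(..., return_mapping=True)` are gather tables from shuffled IDs back to IDs in the original graph. The round-trip idiom in isolation (a sketch with made-up tensors):

    import torch

    # Position = shuffled ID, value = original ID (values here are made up).
    orig_nid_map = torch.tensor([3, 0, 2, 1])
    shuffled_src = torch.tensor([1, 3, 3])
    orig_src = torch.gather(orig_nid_map, 0, shuffled_src)
    assert orig_src.tolist() == [0, 1, 1]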
+ shuffled_eid = block.edges[etype].data[dgl.EID] + orig_eid = F.asnumpy(F.gather_row(orig_eid_map[c_etype], shuffled_eid)) orig_src1, orig_dst1 = g.find_edges(orig_eid, etype=etype) assert np.all(F.asnumpy(orig_src1) == orig_src) assert np.all(F.asnumpy(orig_dst1) == orig_dst) -def check_rpc_hetero_etype_sampling_empty_shuffle(tmpdir, num_server): +def check_rpc_hetero_etype_sampling_empty_shuffle( + tmpdir, num_server, use_graphbolt=False, return_eids=False +): generate_ip_config("rpc_ip_config.txt", num_server, num_server) g = create_random_hetero(dense=True, empty=True) @@ -774,6 +808,8 @@ def check_rpc_hetero_etype_sampling_empty_shuffle(tmpdir, num_server): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -781,7 +817,14 @@ def check_rpc_hetero_etype_sampling_empty_shuffle(tmpdir, num_server): for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling"), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) @@ -791,7 +834,13 @@ def check_rpc_hetero_etype_sampling_empty_shuffle(tmpdir, num_server): deg = get_degrees(g, orig_nids["n3"], "n3") empty_nids = F.nonzero_1d(deg == 0) block, gpb = start_hetero_etype_sample_client( - 0, tmpdir, num_server > 1, fanout, nodes={"n3": empty_nids} + 0, + tmpdir, + num_server > 1, + fanout, + nodes={"n3": empty_nids}, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) print("Done sampling") for p in pserver_list: @@ -848,7 +897,13 @@ def start_bipartite_sample_client( def start_bipartite_etype_sample_client( - rank, tmpdir, disable_shared_mem, fanout=3, nodes={} + rank, + tmpdir, + disable_shared_mem, + fanout=3, + nodes={}, + use_graphbolt=False, + return_eids=False, ): gpb = None if disable_shared_mem: @@ -856,11 +911,13 @@ def start_bipartite_etype_sample_client( tmpdir / "test_sampling.json", rank ) dgl.distributed.initialize("rpc_ip_config.txt") - dist_graph = DistGraph("test_sampling", gpb=gpb) + dist_graph = DistGraph( + "test_sampling", gpb=gpb, use_graphbolt=use_graphbolt + ) assert "feat" in dist_graph.nodes["user"].data assert "feat" in dist_graph.nodes["game"].data - if dist_graph.local_partition is not None: + if not use_graphbolt and dist_graph.local_partition is not None: # Check whether etypes are sorted in dist_graph local_g = dist_graph.local_partition local_nids = np.arange(local_g.num_nodes()) @@ -872,10 +929,13 @@ def start_bipartite_etype_sample_client( if gpb is None: gpb = dist_graph.get_partition_book() - sampled_graph = sample_etype_neighbors(dist_graph, nodes, fanout) + sampled_graph = sample_etype_neighbors( + dist_graph, nodes, fanout, use_graphbolt=use_graphbolt + ) block = dgl.to_block(sampled_graph, nodes) if sampled_graph.num_edges() > 0: - block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] + if not use_graphbolt or return_eids: + block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] dgl.distributed.exit_client() return block, gpb @@ -1019,7 +1079,9 @@ def check_rpc_bipartite_sampling_shuffle( assert np.all(F.asnumpy(orig_dst1) == orig_dst) -def check_rpc_bipartite_etype_sampling_empty(tmpdir, num_server): +def check_rpc_bipartite_etype_sampling_empty( + tmpdir, num_server, use_graphbolt=False, return_eids=False +): """sample on bipartite via sample_etype_neighbors() which yields empty sample results""" generate_ip_config("rpc_ip_config.txt", num_server, num_server) @@ -1035,6 +1097,8 @@ def 
check_rpc_bipartite_etype_sampling_empty(tmpdir, num_server): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -1042,7 +1106,14 @@ def check_rpc_bipartite_etype_sampling_empty(tmpdir, num_server): for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling"), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) @@ -1050,8 +1121,13 @@ def check_rpc_bipartite_etype_sampling_empty(tmpdir, num_server): deg = get_degrees(g, orig_nids["game"], "game") empty_nids = F.nonzero_1d(deg == 0) - block, gpb = start_bipartite_etype_sample_client( - 0, tmpdir, num_server > 1, nodes={"game": empty_nids, "user": [1]} + block, _ = start_bipartite_etype_sample_client( + 0, + tmpdir, + num_server > 1, + nodes={"game": empty_nids, "user": [1]}, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) print("Done sampling") @@ -1064,7 +1140,9 @@ def check_rpc_bipartite_etype_sampling_empty(tmpdir, num_server): assert len(block.etypes) == len(g.etypes) -def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): +def check_rpc_bipartite_etype_sampling_shuffle( + tmpdir, num_server, use_graphbolt=False, return_eids=False +): """sample on bipartite via sample_etype_neighbors() which yields non-empty sample results""" generate_ip_config("rpc_ip_config.txt", num_server, num_server) @@ -1080,6 +1158,8 @@ def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): num_hops=num_hops, part_method="metis", return_mapping=True, + use_graphbolt=use_graphbolt, + store_eids=return_eids, ) pserver_list = [] @@ -1087,7 +1167,14 @@ def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): for i in range(num_server): p = ctx.Process( target=start_server, - args=(i, tmpdir, num_server > 1, "test_sampling"), + args=( + i, + tmpdir, + num_server > 1, + "test_sampling", + ["csc", "coo"], + use_graphbolt, + ), ) p.start() time.sleep(1) @@ -1097,7 +1184,13 @@ def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): deg = get_degrees(g, orig_nid_map["game"], "game") nids = F.nonzero_1d(deg > 0) block, gpb = start_bipartite_etype_sample_client( - 0, tmpdir, num_server > 1, fanout, nodes={"game": nids, "user": [0]} + 0, + tmpdir, + num_server > 1, + fanout, + nodes={"game": nids, "user": [0]}, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) print("Done sampling") for p in pserver_list: @@ -1110,13 +1203,18 @@ def check_rpc_bipartite_etype_sampling_shuffle(tmpdir, num_server): # These are global Ids after shuffling. shuffled_src = F.gather_row(block.srcnodes[src_type].data[dgl.NID], src) shuffled_dst = F.gather_row(block.dstnodes[dst_type].data[dgl.NID], dst) - shuffled_eid = block.edges[etype].data[dgl.EID] - orig_src = F.asnumpy(F.gather_row(orig_nid_map[src_type], shuffled_src)) orig_dst = F.asnumpy(F.gather_row(orig_nid_map[dst_type], shuffled_dst)) - orig_eid = F.asnumpy(F.gather_row(orig_eid_map[c_etype], shuffled_eid)) + assert np.all( + F.asnumpy(g.has_edges_between(orig_src, orig_dst, etype=etype)) + ) + + if use_graphbolt and not return_eids: + continue # Check the node Ids and edge Ids. 
+ shuffled_eid = block.edges[etype].data[dgl.EID] + orig_eid = F.asnumpy(F.gather_row(orig_eid_map[c_etype], shuffled_eid)) orig_src1, orig_dst1 = g.find_edges(orig_eid, etype=etype) assert np.all(F.asnumpy(orig_src1) == orig_src) assert np.all(F.asnumpy(orig_dst1) == orig_dst) @@ -1173,7 +1271,7 @@ def test_rpc_hetero_sampling_empty_shuffle( @pytest.mark.parametrize( "graph_formats", [None, ["csc"], ["csr"], ["csc", "coo"]] ) -def test_rpc_hetero_etype_sampling_shuffle(num_server, graph_formats): +def test_rpc_hetero_etype_sampling_shuffle_dgl(num_server, graph_formats): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: @@ -1183,12 +1281,33 @@ def test_rpc_hetero_etype_sampling_shuffle(num_server, graph_formats): @pytest.mark.parametrize("num_server", [1]) -def test_rpc_hetero_etype_sampling_empty_shuffle(num_server): +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_hetero_etype_sampling_shuffle_graphbolt(num_server, return_eids): + reset_envs() + os.environ["DGL_DIST_MODE"] = "distributed" + with tempfile.TemporaryDirectory() as tmpdirname: + check_rpc_hetero_etype_sampling_shuffle( + Path(tmpdirname), + num_server, + use_graphbolt=True, + return_eids=return_eids, + ) + + +@pytest.mark.parametrize("num_server", [1]) +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_hetero_etype_sampling_empty_shuffle( + num_server, use_graphbolt, return_eids +): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: check_rpc_hetero_etype_sampling_empty_shuffle( - Path(tmpdirname), num_server + Path(tmpdirname), + num_server, + use_graphbolt=use_graphbolt, + return_eids=return_eids, ) @@ -1219,19 +1338,37 @@ def test_rpc_bipartite_sampling_shuffle(num_server, use_graphbolt, return_eids): @pytest.mark.parametrize("num_server", [1]) -def test_rpc_bipartite_etype_sampling_empty_shuffle(num_server): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_bipartite_etype_sampling_empty_shuffle( + num_server, use_graphbolt, return_eids +): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: - check_rpc_bipartite_etype_sampling_empty(Path(tmpdirname), num_server) + check_rpc_bipartite_etype_sampling_empty( + Path(tmpdirname), + num_server, + use_graphbolt=use_graphbolt, + return_eids=return_eids, + ) @pytest.mark.parametrize("num_server", [1]) -def test_rpc_bipartite_etype_sampling_shuffle(num_server): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_rpc_bipartite_etype_sampling_shuffle( + num_server, use_graphbolt, return_eids +): reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" with tempfile.TemporaryDirectory() as tmpdirname: - check_rpc_bipartite_etype_sampling_shuffle(Path(tmpdirname), num_server) + check_rpc_bipartite_etype_sampling_shuffle( + Path(tmpdirname), + num_server, + use_graphbolt=use_graphbolt, + return_eids=return_eids, + ) def check_standalone_sampling(tmpdir): From 924c5669e6cd80a52283e1d9fa551b96c740062b Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Thu, 8 Feb 2024 18:54:26 -0800 Subject: [PATCH 36/45] Fixing problem with complex numbers appearing in the `lap_pe` function. 
(#6925) --- python/dgl/transforms/functional.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/dgl/transforms/functional.py b/python/dgl/transforms/functional.py index 92b554a95ce1..964f665be5c3 100644 --- a/python/dgl/transforms/functional.py +++ b/python/dgl/transforms/functional.py @@ -3688,10 +3688,6 @@ def lap_pe(g, k, padding=False, return_eigval=False): ) max_freqs = k topk_indices = EigVal.argsort()[1:] - # Since scipy may return complex value, to avoid crashing in NN code, - # convert them to real number. - topk_EigVal = EigVal[topk_indices].real - topk_EigVec = EigVec[:, topk_indices].real else: # Fallback to numpy since scipy.sparse do not support this case. EigVal, EigVec = np.linalg.eig(L.toarray()) @@ -3699,8 +3695,11 @@ def lap_pe(g, k, padding=False, return_eigval=False): kpartition_indices = np.argpartition(EigVal, max_freqs)[: max_freqs + 1] topk_eigvals = EigVal[kpartition_indices] topk_indices = kpartition_indices[topk_eigvals.argsort()][1:] - topk_EigVec = EigVec[:, topk_indices] - topk_EigVal = EigVal[topk_indices] + + # Since scipy may return complex value, to avoid crashing in NN code, + # convert them to real number. + topk_EigVal = EigVal[topk_indices].real + topk_EigVec = EigVec[:, topk_indices].real eigvals = F.tensor(topk_EigVal, dtype=F.float32) # get random flip signs From 8e6cbd621f34c0892ab5adac5259cd61401a6d2b Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 9 Feb 2024 15:51:39 +0800 Subject: [PATCH 37/45] [DistGB] sample with graphbolt on heterograph via DistNodeDataLoader (#7112) --- python/dgl/distributed/dist_graph.py | 33 ++++++------ tests/distributed/test_mp_dataloader.py | 67 +++++++++++++++++++------ 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index b1af7b7dbd44..ecda7de8f547 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -622,18 +622,7 @@ def __init__( self._init_ndata_store() self._init_edata_store() - - self._num_nodes = 0 - self._num_edges = 0 - for part_md in self._gpb.metadata(): - self._num_nodes += int(part_md["num_nodes"]) - self._num_edges += int(part_md["num_edges"]) - - # When we store node/edge types in a list, they are stored in the order of type IDs. - self._ntype_map = {ntype: i for i, ntype in enumerate(self.ntypes)} - self._etype_map = { - etype: i for i, etype in enumerate(self.canonical_etypes) - } + self._init_metadata() def _init(self, gpb): self._client = get_kvstore() @@ -698,6 +687,19 @@ def _init_edata_store(self): else: self._edata_store[etype] = data + def _init_metadata(self): + self._num_nodes = 0 + self._num_edges = 0 + for part_md in self._gpb.metadata(): + self._num_nodes += int(part_md["num_nodes"]) + self._num_edges += int(part_md["num_edges"]) + + # When we store node/edge types in a list, they are stored in the order of type IDs. 
+ self._ntype_map = {ntype: i for i, ntype in enumerate(self.ntypes)} + self._etype_map = { + etype: i for i, etype in enumerate(self.canonical_etypes) + } + def __getstate__(self): return self.graph_name, self._gpb, self._use_graphbolt @@ -707,11 +709,7 @@ def __setstate__(self, state): self._init_ndata_store() self._init_edata_store() - self._num_nodes = 0 - self._num_edges = 0 - for part_md in self._gpb.metadata(): - self._num_nodes += int(part_md["num_nodes"]) - self._num_edges += int(part_md["num_edges"]) + self._init_metadata() @property def local_partition(self): @@ -1403,6 +1401,7 @@ def sample_neighbors( replace=replace, etype_sorted=etype_sorted, prob=prob, + use_graphbolt=self._use_graphbolt, ) else: frontier = graph_services.sample_neighbors( diff --git a/tests/distributed/test_mp_dataloader.py b/tests/distributed/test_mp_dataloader.py index 4d6e8cf25834..cdb6f27b5aed 100644 --- a/tests/distributed/test_mp_dataloader.py +++ b/tests/distributed/test_mp_dataloader.py @@ -487,22 +487,23 @@ def start_node_dataloader( range(0, num_nodes_to_sample, batch_size), dataloader ): block = blocks[-1] - for src_type, etype, dst_type in block.canonical_etypes: - o_src, o_dst = block.edges(etype=etype) + for c_etype in block.canonical_etypes: + src_type, _, dst_type = c_etype + o_src, o_dst = block.edges(etype=c_etype) src_nodes_id = block.srcnodes[src_type].data[dgl.NID][o_src] dst_nodes_id = block.dstnodes[dst_type].data[dgl.NID][o_dst] src_nodes_id = orig_nid[src_type][src_nodes_id] dst_nodes_id = orig_nid[dst_type][dst_nodes_id] has_edges = groundtruth_g.has_edges_between( - src_nodes_id, dst_nodes_id, etype=etype + src_nodes_id, dst_nodes_id, etype=c_etype ) assert np.all(F.asnumpy(has_edges)) if use_graphbolt and not return_eids: continue - eids = orig_eid[etype][block.edata[dgl.EID]] + eids = orig_eid[c_etype][block.edges[c_etype].data[dgl.EID]] expected_eids = groundtruth_g.edge_ids( - src_nodes_id, dst_nodes_id + src_nodes_id, dst_nodes_id, etype=c_etype ) assert th.equal( eids, expected_eids @@ -610,7 +611,7 @@ def check_dataloader( if not isinstance(orig_nid, dict): orig_nid = {g.ntypes[0]: orig_nid} if not isinstance(orig_eid, dict): - orig_eid = {g.etypes[0]: orig_eid} + orig_eid = {g.canonical_etypes[0]: orig_eid} pserver_list = [] ctx = mp.get_context("spawn") @@ -718,14 +719,27 @@ def test_dataloader_homograph( ) -@unittest.skip(reason="Skip due to glitch in CI") @pytest.mark.parametrize("num_server", [1]) @pytest.mark.parametrize("num_workers", [0, 1]) @pytest.mark.parametrize("dataloader_type", ["node", "edge"]) -def test_dataloader_heterograph(num_server, num_workers, dataloader_type): +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) +def test_dataloader_heterograph( + num_server, num_workers, dataloader_type, use_graphbolt, return_eids +): + if dataloader_type == "edge" and use_graphbolt: + # GraphBolt does not support edge dataloader. 
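A small nit on the early exits guarding the GraphBolt/edge-dataloader combination here and in `test_multiple_dist_dataloaders` below: a bare `return` reports the parametrization as passed. If the intent is to surface it as skipped in the test report, pytest's explicit skip does that. A sketch of the drop-in form (the tests' behavior is otherwise unchanged):

    import pytest

    if dataloader_type == "edge" and use_graphbolt:
        # GraphBolt does not support edge dataloader.
        pytest.skip("GraphBolt does not support edge dataloader.")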
+ return reset_envs() g = create_random_hetero() - check_dataloader(g, num_server, num_workers, dataloader_type) + check_dataloader( + g, + num_server, + num_workers, + dataloader_type, + use_graphbolt=use_graphbolt, + return_eids=return_eids, + ) @unittest.skip(reason="Skip due to glitch in CI") @@ -740,10 +754,18 @@ def test_neg_dataloader(num_server, num_workers): def start_multiple_dataloaders( - ip_config, part_config, graph_name, orig_g, num_dataloaders, dataloader_type + ip_config, + part_config, + graph_name, + orig_g, + num_dataloaders, + dataloader_type, + use_graphbolt, ): dgl.distributed.initialize(ip_config) - dist_g = dgl.distributed.DistGraph(graph_name, part_config=part_config) + dist_g = dgl.distributed.DistGraph( + graph_name, part_config=part_config, use_graphbolt=use_graphbolt + ) if dataloader_type == "node": train_ids = th.arange(orig_g.num_nodes()) batch_size = orig_g.num_nodes() // 100 @@ -777,13 +799,17 @@ def start_multiple_dataloaders( dgl.distributed.exit_client() -@unittest.skip(reason="Skip due to glitch in CI") @pytest.mark.parametrize("num_dataloaders", [1, 4]) -@pytest.mark.parametrize("num_workers", [0, 1, 4]) +@pytest.mark.parametrize("num_workers", [0, 1]) @pytest.mark.parametrize("dataloader_type", ["node", "edge"]) +@pytest.mark.parametrize("use_graphbolt", [False, True]) +@pytest.mark.parametrize("return_eids", [False, True]) def test_multiple_dist_dataloaders( - num_dataloaders, num_workers, dataloader_type + num_dataloaders, num_workers, dataloader_type, use_graphbolt, return_eids ): + if dataloader_type == "edge" and use_graphbolt: + # GraphBolt does not support edge dataloader. + return reset_envs() os.environ["DGL_DIST_MODE"] = "distributed" os.environ["DGL_NUM_SAMPLER"] = str(num_workers) @@ -794,8 +820,15 @@ def test_multiple_dist_dataloaders( generate_ip_config(ip_config, num_parts, num_servers) orig_g = dgl.rand_graph(1000, 10000) - graph_name = "test" - partition_graph(orig_g, graph_name, num_parts, test_dir) + graph_name = "test_multiple_dataloaders" + partition_graph( + orig_g, + graph_name, + num_parts, + test_dir, + use_graphbolt=use_graphbolt, + store_eids=return_eids, + ) part_config = os.path.join(test_dir, f"{graph_name}.json") p_servers = [] @@ -809,6 +842,7 @@ def test_multiple_dist_dataloaders( part_config, num_servers > 1, num_workers + 1, + use_graphbolt, ), ) p.start() @@ -824,6 +858,7 @@ def test_multiple_dist_dataloaders( orig_g, num_dataloaders, dataloader_type, + use_graphbolt, ), ) p_client.start() From 8204fe1912d95bac865797af98f01dafc2ba2b65 Mon Sep 17 00:00:00 2001 From: Lourens Touwen Date: Wed, 14 Feb 2024 10:35:15 -0500 Subject: [PATCH 38/45] [Example] Super small fix for import in HGL_SP example (#6984) Co-authored-by: Hongzhi (Steve), Chen --- examples/pytorch/hgp_sl/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/hgp_sl/functions.py b/examples/pytorch/hgp_sl/functions.py index 3c22261f2e6b..c12d72d25a8f 100644 --- a/examples/pytorch/hgp_sl/functions.py +++ b/examples/pytorch/hgp_sl/functions.py @@ -9,10 +9,10 @@ """ import dgl import torch +from dgl._sparse_ops import _gsddmm, _gspmm from dgl.backend import astype from dgl.base import ALL, is_all from dgl.heterograph_index import HeteroGraphIndex -from dgl.sparse import _gsddmm, _gspmm from torch import Tensor from torch.autograd import Function From 13e7c2faeb008017256eef3cb7d2d5824c353c14 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Wed, 14 Feb 2024 07:36:00 -0800 
Subject: [PATCH 39/45] [GraphBolt] Improving `subgraph_sampler` tests. (#7047) --- .../graphbolt/test_subgraph_sampler.py | 281 +++++++++--------- 1 file changed, 138 insertions(+), 143 deletions(-) diff --git a/tests/python/pytorch/graphbolt/test_subgraph_sampler.py b/tests/python/pytorch/graphbolt/test_subgraph_sampler.py index a5c8ef53c305..d8b80381cc0b 100644 --- a/tests/python/pytorch/graphbolt/test_subgraph_sampler.py +++ b/tests/python/pytorch/graphbolt/test_subgraph_sampler.py @@ -1,4 +1,5 @@ import unittest +import warnings from enum import Enum from functools import partial @@ -9,7 +10,6 @@ import dgl.graphbolt as gb import pytest import torch -from torchdata.datapipes.iter import Mapper from . import gb_test_utils @@ -22,6 +22,12 @@ def _check_sampler_type(sampler_type): ) +def _check_sampler_len(sampler, lenExp): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + assert len(list(sampler)) == lenExp + + class SamplerType(Enum): Normal = 0 Layer = 1 @@ -128,7 +134,7 @@ def test_SubgraphSampler_Node_seed_nodes(sampler_type): fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] sampler = _get_sampler(sampler_type) sampler_dp = sampler(item_sampler, graph, fanouts) - assert len(list(sampler_dp)) == 5 + _check_sampler_len(sampler_dp, 5) def to_link_batch(data): @@ -161,7 +167,7 @@ def test_SubgraphSampler_Link_node_pairs(sampler_type): sampler = _get_sampler(sampler_type) datapipe = sampler(datapipe, graph, fanouts) datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) - assert len(list(datapipe)) == 5 + _check_sampler_len(datapipe, 5) @pytest.mark.parametrize( @@ -190,7 +196,7 @@ def test_SubgraphSampler_Link_With_Negative_node_pairs(sampler_type): sampler = _get_sampler(sampler_type) datapipe = sampler(datapipe, graph, fanouts) datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) - assert len(list(datapipe)) == 5 + _check_sampler_len(datapipe, 5) def get_hetero_graph(): @@ -239,9 +245,11 @@ def test_SubgraphSampler_Node_seed_nodes_Hetero(sampler_type): fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] sampler = _get_sampler(sampler_type) sampler_dp = sampler(item_sampler, graph, fanouts) - assert len(list(sampler_dp)) == 2 - for minibatch in sampler_dp: - assert len(minibatch.sampled_subgraphs) == num_layer + _check_sampler_len(sampler_dp, 2) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + for minibatch in sampler_dp: + assert len(minibatch.sampled_subgraphs) == num_layer @pytest.mark.parametrize( @@ -285,7 +293,7 @@ def test_SubgraphSampler_Link_Hetero_node_pairs(sampler_type): sampler = _get_sampler(sampler_type) datapipe = sampler(datapipe, graph, fanouts) datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) - assert len(list(datapipe)) == 5 + _check_sampler_len(datapipe, 5) @pytest.mark.parametrize( @@ -330,7 +338,7 @@ def test_SubgraphSampler_Link_Hetero_With_Negative_node_pairs(sampler_type): sampler = _get_sampler(sampler_type) datapipe = sampler(datapipe, graph, fanouts) datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) - assert len(list(datapipe)) == 5 + _check_sampler_len(datapipe, 5) @pytest.mark.parametrize( @@ -375,7 +383,7 @@ def test_SubgraphSampler_Link_Hetero_Unknown_Etype_node_pairs(sampler_type): sampler = _get_sampler(sampler_type) datapipe = sampler(datapipe, graph, fanouts) datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) - assert len(list(datapipe)) == 5 + _check_sampler_len(datapipe, 5) 
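The `_check_sampler_len` helper above bundles a pattern this patch repeats throughout the file: iterate a datapipe while muting the `UserWarning`s it emits, then check the count. The same idiom works for any noisy iterable; a generic, self-contained sketch:

    import warnings

    def quiet_len(iterable):
        # Count items, suppressing UserWarnings raised during iteration.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            return sum(1 for _ in iterable)

    def noisy():
        warnings.warn("sampler detail", UserWarning)
        yield from range(5)

    assert quiet_len(noisy()) == 5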
@pytest.mark.parametrize( @@ -423,7 +431,7 @@ def test_SubgraphSampler_Link_Hetero_With_Negative_Unknown_Etype_node_pairs( sampler = _get_sampler(sampler_type) datapipe = sampler(datapipe, graph, fanouts) datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) - assert len(list(datapipe)) == 5 + _check_sampler_len(datapipe, 5) @pytest.mark.parametrize( @@ -493,32 +501,28 @@ def test_SubgraphSampler_Random_Hetero_Graph_seed_ndoes(sampler_type, replace): sampler_dp = sampler(item_sampler, graph, fanouts, replace=replace) - for data in sampler_dp: - for sampledsubgraph in data.sampled_subgraphs: - for _, value in sampledsubgraph.sampled_csc.items(): - assert torch.equal( - torch.ge( - value.indices, - torch.zeros(len(value.indices)).to(F.ctx()), - ), - torch.ones(len(value.indices)).to(F.ctx()), - ) - assert torch.equal( - torch.ge( - value.indptr, torch.zeros(len(value.indptr)).to(F.ctx()) - ), - torch.ones(len(value.indptr)).to(F.ctx()), - ) - for _, value in sampledsubgraph.original_column_node_ids.items(): - assert torch.equal( - torch.ge(value, torch.zeros(len(value)).to(F.ctx())), - torch.ones(len(value)).to(F.ctx()), - ) - for _, value in sampledsubgraph.original_row_node_ids.items(): - assert torch.equal( - torch.ge(value, torch.zeros(len(value)).to(F.ctx())), - torch.ones(len(value)).to(F.ctx()), - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + for data in sampler_dp: + for sampledsubgraph in data.sampled_subgraphs: + for _, value in sampledsubgraph.sampled_csc.items(): + for idx in [value.indices, value.indptr]: + assert torch.equal( + torch.ge(idx, torch.zeros(len(idx)).to(F.ctx())), + torch.ones(len(idx)).to(F.ctx()), + ) + node_ids = [ + sampledsubgraph.original_column_node_ids, + sampledsubgraph.original_row_node_ids, + ] + for ids in node_ids: + for _, value in ids.items(): + assert torch.equal( + torch.ge( + value, torch.zeros(len(value)).to(F.ctx()) + ), + torch.ones(len(value)).to(F.ctx()), + ) @pytest.mark.parametrize( @@ -570,9 +574,60 @@ def test_SubgraphSampler_without_dedpulication_Homo_seed_nodes(sampler_type): torch.tensor([0, 2, 2, 3, 4, 4, 5]).to(F.ctx()), torch.tensor([0, 3, 4]).to(F.ctx()), ] + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert ( + len(sampled_subgraph.original_row_node_ids) == length[step] + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indices, + compacted_indices[step], + ) + assert torch.equal( + sampled_subgraph.sampled_csc.indptr, indptr[step] + ) + assert torch.equal( + torch.sort(sampled_subgraph.original_column_node_ids)[0], + seeds[step], + ) + + +def _assert_hetero_values( + datapipe, original_row_node_ids, original_column_node_ids, csc_formats +): for data in datapipe: for step, sampled_subgraph in enumerate(data.sampled_subgraphs): - assert len(sampled_subgraph.original_row_node_ids) == length[step] + for ntype in ["n1", "n2"]: + assert torch.equal( + sampled_subgraph.original_row_node_ids[ntype], + original_row_node_ids[step][ntype].to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.original_column_node_ids[ntype], + original_column_node_ids[step][ntype].to(F.ctx()), + ) + for etype in ["n1:e1:n2", "n2:e2:n1"]: + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indices, + csc_formats[step][etype].indices.to(F.ctx()), + ) + assert torch.equal( + sampled_subgraph.sampled_csc[etype].indptr, + 
csc_formats[step][etype].indptr.to(F.ctx()), + ) + + +def _assert_homo_values( + datapipe, original_row_node_ids, compacted_indices, indptr, seeds +): + for data in datapipe: + for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + assert torch.equal( + sampled_subgraph.original_row_node_ids, + original_row_node_ids[step], + ) assert torch.equal( sampled_subgraph.sampled_csc.indices, compacted_indices[step] ) @@ -580,8 +635,7 @@ def test_SubgraphSampler_without_dedpulication_Homo_seed_nodes(sampler_type): sampled_subgraph.sampled_csc.indptr, indptr[step] ) assert torch.equal( - torch.sort(sampled_subgraph.original_column_node_ids)[0], - seeds[step], + sampled_subgraph.original_column_node_ids, seeds[step] ) @@ -655,26 +709,14 @@ def test_SubgraphSampler_without_dedpulication_Hetero_seed_nodes(sampler_type): }, ] - for data in datapipe: - for step, sampled_subgraph in enumerate(data.sampled_subgraphs): - for ntype in ["n1", "n2"]: - assert torch.equal( - sampled_subgraph.original_row_node_ids[ntype], - original_row_node_ids[step][ntype].to(F.ctx()), - ) - assert torch.equal( - sampled_subgraph.original_column_node_ids[ntype], - original_column_node_ids[step][ntype].to(F.ctx()), - ) - for etype in ["n1:e1:n2", "n2:e2:n1"]: - assert torch.equal( - sampled_subgraph.sampled_csc[etype].indices, - csc_formats[step][etype].indices.to(F.ctx()), - ) - assert torch.equal( - sampled_subgraph.sampled_csc[etype].indptr, - csc_formats[step][etype].indptr.to(F.ctx()), - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + _assert_hetero_values( + datapipe, + original_row_node_ids, + original_column_node_ids, + csc_formats, + ) @unittest.skipIf( @@ -719,21 +761,9 @@ def test_SubgraphSampler_unique_csc_format_Homo_cpu_seed_nodes(labor): torch.tensor([0, 3, 4, 5, 2]).to(F.ctx()), torch.tensor([0, 3, 4]).to(F.ctx()), ] - for data in datapipe: - for step, sampled_subgraph in enumerate(data.sampled_subgraphs): - assert torch.equal( - sampled_subgraph.original_row_node_ids, - original_row_node_ids[step], - ) - assert torch.equal( - sampled_subgraph.sampled_csc.indices, compacted_indices[step] - ) - assert torch.equal( - sampled_subgraph.sampled_csc.indptr, indptr[step] - ) - assert torch.equal( - sampled_subgraph.original_column_node_ids, seeds[step] - ) + _assert_homo_values( + datapipe, original_row_node_ids, compacted_indices, indptr, seeds + ) @unittest.skipIf( @@ -778,21 +808,9 @@ def test_SubgraphSampler_unique_csc_format_Homo_gpu_seed_nodes(labor): torch.tensor([0, 3, 4, 2, 5]).to(F.ctx()), torch.tensor([0, 3, 4]).to(F.ctx()), ] - for data in datapipe: - for step, sampled_subgraph in enumerate(data.sampled_subgraphs): - assert torch.equal( - sampled_subgraph.original_row_node_ids, - original_row_node_ids[step], - ) - assert torch.equal( - sampled_subgraph.sampled_csc.indices, compacted_indices[step] - ) - assert torch.equal( - sampled_subgraph.sampled_csc.indptr, indptr[step] - ) - assert torch.equal( - sampled_subgraph.original_column_node_ids, seeds[step] - ) + _assert_homo_values( + datapipe, original_row_node_ids, compacted_indices, indptr, seeds + ) @pytest.mark.parametrize("labor", [False, True]) @@ -853,27 +871,9 @@ def test_SubgraphSampler_unique_csc_format_Hetero_seed_nodes(labor): "n2": torch.tensor([0, 1]), }, ] - - for data in datapipe: - for step, sampled_subgraph in enumerate(data.sampled_subgraphs): - for ntype in ["n1", "n2"]: - assert torch.equal( - sampled_subgraph.original_row_node_ids[ntype], - 
original_row_node_ids[step][ntype].to(F.ctx()), - ) - assert torch.equal( - sampled_subgraph.original_column_node_ids[ntype], - original_column_node_ids[step][ntype].to(F.ctx()), - ) - for etype in ["n1:e1:n2", "n2:e2:n1"]: - assert torch.equal( - sampled_subgraph.sampled_csc[etype].indices, - csc_formats[step][etype].indices.to(F.ctx()), - ) - assert torch.equal( - sampled_subgraph.sampled_csc[etype].indptr, - csc_formats[step][etype].indptr.to(F.ctx()), - ) + _assert_hetero_values( + datapipe, original_row_node_ids, original_column_node_ids, csc_formats + ) @pytest.mark.parametrize( @@ -886,7 +886,9 @@ def test_SubgraphSampler_Hetero_multifanout_per_layer_seed_nodes(sampler_type): items_n1 = torch.tensor([0]) items_n2 = torch.tensor([1]) names = "seed_nodes" + item_length = 2 if sampler_type == SamplerType.Temporal: + item_length = 3 graph.node_attributes = { "timestamp": torch.arange(graph.csc_indptr.numel() - 1).to(F.ctx()) } @@ -909,38 +911,31 @@ def test_SubgraphSampler_Hetero_multifanout_per_layer_seed_nodes(sampler_type): fanouts = [torch.LongTensor([2, 1]) for _ in range(num_layer)] sampler = _get_sampler(sampler_type) sampler_dp = sampler(item_sampler, graph, fanouts) - if sampler_type == SamplerType.Temporal: - indices_len = [ - { - "n1:e1:n2": 4, - "n2:e2:n1": 3, - }, - { - "n1:e1:n2": 2, - "n2:e2:n1": 1, - }, - ] - else: - indices_len = [ - { - "n1:e1:n2": 4, - "n2:e2:n1": 2, - }, - { - "n1:e1:n2": 2, - "n2:e2:n1": 1, - }, - ] - for minibatch in sampler_dp: - for step, sampled_subgraph in enumerate(minibatch.sampled_subgraphs): - assert ( - len(sampled_subgraph.sampled_csc["n1:e1:n2"].indices) - == indices_len[step]["n1:e1:n2"] - ) - assert ( - len(sampled_subgraph.sampled_csc["n2:e2:n1"].indices) - == indices_len[step]["n2:e2:n1"] - ) + indices_len = [ + { + "n1:e1:n2": 4, + "n2:e2:n1": item_length, + }, + { + "n1:e1:n2": 2, + "n2:e2:n1": 1, + }, + ] + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + for minibatch in sampler_dp: + for step, sampled_subgraph in enumerate( + minibatch.sampled_subgraphs + ): + assert ( + len(sampled_subgraph.sampled_csc["n1:e1:n2"].indices) + == indices_len[step]["n1:e1:n2"] + ) + assert ( + len(sampled_subgraph.sampled_csc["n2:e2:n1"].indices) + == indices_len[step]["n2:e2:n1"] + ) def test_SubgraphSampler_invoke(): From 3ded35872f09df5f00ce47b5c761265d3fe05050 Mon Sep 17 00:00:00 2001 From: SuperYY <45654909+wkmyws@users.noreply.github.com> Date: Fri, 16 Feb 2024 22:34:57 +0800 Subject: [PATCH 40/45] fix epoch loss (#6319) --- examples/pytorch/mvgrl/graph/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/mvgrl/graph/main.py b/examples/pytorch/mvgrl/graph/main.py index bebadf319928..38f5aaf8f7b1 100644 --- a/examples/pytorch/mvgrl/graph/main.py +++ b/examples/pytorch/mvgrl/graph/main.py @@ -130,8 +130,8 @@ def collate(samples): print("Epoch {}, Loss {:.4f}".format(epoch, loss_all)) - if loss < best: - best = loss + if loss_all < best: + best = loss_all best_t = epoch cnt_wait = 0 th.save(model.state_dict(), f"{args.dataname}.pkl") From 458b938c423743243640d2fb237581daec5da81e Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Sun, 18 Feb 2024 01:04:39 +0800 Subject: [PATCH 41/45] [GraphBolt] modify `preprocess_ondisk_dataset()` (#6986) --- python/dgl/graphbolt/impl/ondisk_dataset.py | 359 +++++++++++++----- .../python/pytorch/graphbolt/gb_test_utils.py | 16 +- .../graphbolt/impl/test_ondisk_dataset.py | 36 +- 
3 files changed, 286 insertions(+), 125 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index e636e17e8f31..ca95bcf8f3f1 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -7,14 +7,14 @@ from copy import deepcopy from typing import Dict, List, Union +import numpy as np + import torch import yaml -import dgl - from ...base import dgl_warning from ...data.utils import download, extract_archive -from ..base import etype_str_to_tuple +from ..base import etype_str_to_tuple, ORIGINAL_EDGE_ID from ..dataset import Dataset, Task from ..internal import ( calculate_dir_hash, @@ -26,7 +26,10 @@ ) from ..itemset import ItemSet, ItemSetDict from ..sampling_graph import SamplingGraph -from .fused_csc_sampling_graph import from_dglgraph, FusedCSCSamplingGraph +from .fused_csc_sampling_graph import ( + fused_csc_sampling_graph, + FusedCSCSamplingGraph, +) from .ondisk_metadata import ( OnDiskGraphTopology, OnDiskMetaData, @@ -38,6 +41,240 @@ __all__ = ["OnDiskDataset", "preprocess_ondisk_dataset", "BuiltinDataset"] +def _graph_data_to_fused_csc_sampling_graph( + dataset_dir: str, + graph_data: Dict, + include_original_edge_id: bool, +) -> FusedCSCSamplingGraph: + """Convert the raw graph data into FusedCSCSamplingGraph. + + Parameters + ---------- + dataset_dir : str + The path to the dataset directory. + graph_data : Dict + The raw data read from yaml file. + include_original_edge_id : bool + Whether to include the original edge id in the FusedCSCSamplingGraph. + + Returns + ------- + sampling_graph : FusedCSCSamplingGraph + The FusedCSCSamplingGraph constructed from the raw data. + """ + from ...sparse import spmatrix + + is_homogeneous = ( + len(graph_data["nodes"]) == 1 + and len(graph_data["edges"]) == 1 + and "type" not in graph_data["nodes"][0] + and "type" not in graph_data["edges"][0] + ) + + if is_homogeneous: + # Homogeneous graph. + edge_fmt = graph_data["edges"][0]["format"] + edge_path = graph_data["edges"][0]["path"] + src, dst = read_edges(dataset_dir, edge_fmt, edge_path) + num_nodes = graph_data["nodes"][0]["num"] + num_edges = len(src) + coo_tensor = torch.tensor(np.array([src, dst])) + sparse_matrix = spmatrix(coo_tensor, shape=(num_nodes, num_nodes)) + del coo_tensor + indptr, indices, edge_ids = sparse_matrix.csc() + del sparse_matrix + node_type_offset = None + type_per_edge = None + node_type_to_id = None + edge_type_to_id = None + node_attributes = {} + edge_attributes = {} + if include_original_edge_id: + edge_attributes[ORIGINAL_EDGE_ID] = edge_ids + else: + # Heterogeneous graph. + # Sort graph_data by ntype/etype lexicographically to ensure ordering. + graph_data["nodes"].sort(key=lambda x: x["type"]) + graph_data["edges"].sort(key=lambda x: x["type"]) + # Construct node_type_offset and node_type_to_id. + node_type_offset = [0] + node_type_to_id = {} + for ntype_id, node_info in enumerate(graph_data["nodes"]): + node_type_to_id[node_info["type"]] = ntype_id + node_type_offset.append(node_type_offset[-1] + node_info["num"]) + total_num_nodes = node_type_offset[-1] + # Construct edge_type_offset, edge_type_to_id and coo_tensor. 
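The heterogeneous branch below flattens per-type edge lists into one global COO by shifting each endpoint with its node-type offset; after the CSC conversion, per-edge-type offsets are used in reverse to turn global edge positions back into per-type local IDs. On a toy graph the arithmetic looks like this (a sketch with made-up sizes):

    import torch

    # Two node types: "a" has 3 nodes (global IDs 0..2), "b" has 4 (3..6).
    node_type_offset = [0, 3, 7]
    ntype_id = {"a": 0, "b": 1}
    # The typed edge (a:1) -> (b:2) becomes global (1) -> (5).
    src = 1 + node_type_offset[ntype_id["a"]]
    dst = 2 + node_type_offset[ntype_id["b"]]
    assert (src, dst) == (1, 5)

    # Reverse direction: global edge positions back to per-type local IDs.
    edge_type_offset = torch.tensor([0, 3, 7])
    edge_ids = torch.tensor([1, 4, 6])
    type_per_edge = torch.tensor([0, 1, 1])
    assert (edge_ids - edge_type_offset[type_per_edge]).tolist() == [1, 1, 3]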
+    edge_type_offset = [0]
+    edge_type_to_id = {}
+    coo_src_list = []
+    coo_dst_list = []
+    coo_etype_list = []
+    for etype_id, edge_info in enumerate(graph_data["edges"]):
+        edge_type_to_id[edge_info["type"]] = etype_id
+        edge_fmt = edge_info["format"]
+        edge_path = edge_info["path"]
+        src, dst = read_edges(dataset_dir, edge_fmt, edge_path)
+        edge_type_offset.append(edge_type_offset[-1] + len(src))
+        src_type, _, dst_type = etype_str_to_tuple(edge_info["type"])
+        src += node_type_offset[node_type_to_id[src_type]]
+        dst += node_type_offset[node_type_to_id[dst_type]]
+        coo_src_list.append(torch.tensor(src))
+        coo_dst_list.append(torch.tensor(dst))
+        coo_etype_list.append(torch.full((len(src),), etype_id))
+    total_num_edges = edge_type_offset[-1]
+
+    coo_src = torch.cat(coo_src_list)
+    del coo_src_list
+    coo_dst = torch.cat(coo_dst_list)
+    del coo_dst_list
+    coo_etype = torch.cat(coo_etype_list)
+    del coo_etype_list
+
+    sparse_matrix = spmatrix(
+        indices=torch.stack((coo_src, coo_dst), dim=0),
+        shape=(total_num_nodes, total_num_nodes),
+    )
+    del coo_src, coo_dst
+    indptr, indices, edge_ids = sparse_matrix.csc()
+    del sparse_matrix
+    node_type_offset = torch.tensor(node_type_offset)
+    type_per_edge = torch.index_select(coo_etype, dim=0, index=edge_ids)
+    del coo_etype
+    node_attributes = {}
+    edge_attributes = {}
+    if include_original_edge_id:
+        edge_ids -= torch.gather(
+            input=torch.tensor(edge_type_offset),
+            dim=0,
+            index=type_per_edge,
+        )
+        edge_attributes[ORIGINAL_EDGE_ID] = edge_ids
+
+    # Load the sampling related node/edge features and add them to
+    # the sampling-graph.
+    if graph_data.get("feature_data", None):
+        if is_homogeneous:
+            # Homogeneous graph.
+            for graph_feature in graph_data["feature_data"]:
+                in_memory = (
+                    True
+                    if "in_memory" not in graph_feature
+                    else graph_feature["in_memory"]
+                )
+                if graph_feature["domain"] == "node":
+                    node_data = read_data(
+                        os.path.join(dataset_dir, graph_feature["path"]),
+                        graph_feature["format"],
+                        in_memory=in_memory,
+                    )
+                    assert node_data.shape[0] == num_nodes
+                    node_attributes[graph_feature["name"]] = node_data
+                elif graph_feature["domain"] == "edge":
+                    edge_data = read_data(
+                        os.path.join(dataset_dir, graph_feature["path"]),
+                        graph_feature["format"],
+                        in_memory=in_memory,
+                    )
+                    assert edge_data.shape[0] == num_edges
+                    edge_attributes[graph_feature["name"]] = edge_data
+        else:
+            # Heterogeneous graph.
+            node_feature_collector = {}
+            edge_feature_collector = {}
+            for graph_feature in graph_data["feature_data"]:
+                in_memory = (
+                    True
+                    if "in_memory" not in graph_feature
+                    else graph_feature["in_memory"]
+                )
+                if graph_feature["domain"] == "node":
+                    node_data = read_data(
+                        os.path.join(dataset_dir, graph_feature["path"]),
+                        graph_feature["format"],
+                        in_memory=in_memory,
+                    )
+                    if graph_feature["name"] not in node_feature_collector:
+                        node_feature_collector[graph_feature["name"]] = {}
+                    node_feature_collector[graph_feature["name"]][
+                        graph_feature["type"]
+                    ] = node_data
+                elif graph_feature["domain"] == "edge":
+                    edge_data = read_data(
+                        os.path.join(dataset_dir, graph_feature["path"]),
+                        graph_feature["format"],
+                        in_memory=in_memory,
+                    )
+                    if graph_feature["name"] not in edge_feature_collector:
+                        edge_feature_collector[graph_feature["name"]] = {}
+                    edge_feature_collector[graph_feature["name"]][
+                        graph_feature["type"]
+                    ] = edge_data
+
+            # For heterogeneous graphs, a node/edge feature must cover all node/edge types.
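Because the per-type feature tensors are later stitched into one `torch.empty` buffer indexed by these offsets, partial coverage would silently leave uninitialized rows; hence the fail-fast assertions below. Reduced to its core, the check is a set comparison (sketch):

    def check_coverage(all_types, collected):
        for name, per_type in collected.items():
            missing = all_types - set(per_type)
            assert not missing, f"Feature {name} misses types: {missing}"

    check_coverage({"user", "item"}, {"feat": {"user": 0, "item": 1}})
    # check_coverage({"user", "item"}, {"feat": {"user": 0}})  # would raise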
+ all_node_types = set(node_type_to_id.keys()) + for feat_name, feat_data in node_feature_collector.items(): + existing_node_type = set(feat_data.keys()) + assert all_node_types == existing_node_type, ( + f"Node feature {feat_name} does not cover all node types. " + f"Existing types: {existing_node_type}. " + f"Expected types: {all_node_types}." + ) + all_edge_types = set(edge_type_to_id.keys()) + for feat_name, feat_data in edge_feature_collector.items(): + existing_edge_type = set(feat_data.keys()) + assert all_edge_types == existing_edge_type, ( + f"Edge feature {feat_name} does not cover all edge types. " + f"Existing types: {existing_edge_type}. " + f"Expected types: {all_edge_types}." + ) + + for feat_name, feat_data in node_feature_collector.items(): + _feat = next(iter(feat_data.values())) + feat_tensor = torch.empty( + ([total_num_nodes] + list(_feat.shape[1:])), + dtype=_feat.dtype, + ) + for ntype, feat in feat_data.items(): + feat_tensor[ + node_type_offset[ + node_type_to_id[ntype] + ] : node_type_offset[node_type_to_id[ntype] + 1] + ] = feat + node_attributes[feat_name] = feat_tensor + del node_feature_collector + for feat_name, feat_data in edge_feature_collector.items(): + _feat = next(iter(feat_data.values())) + feat_tensor = torch.empty( + ([total_num_edges] + list(_feat.shape[1:])), + dtype=_feat.dtype, + ) + for etype, feat in feat_data.items(): + feat_tensor[ + edge_type_offset[ + edge_type_to_id[etype] + ] : edge_type_offset[edge_type_to_id[etype] + 1] + ] = feat + edge_attributes[feat_name] = feat_tensor + del edge_feature_collector + + if not bool(node_attributes): + node_attributes = None + if not bool(edge_attributes): + edge_attributes = None + + # Construct the FusedCSCSamplingGraph. + return fused_csc_sampling_graph( + csc_indptr=indptr, + indices=indices, + node_type_offset=node_type_offset, + type_per_edge=type_per_edge, + node_type_to_id=node_type_to_id, + edge_type_to_id=edge_type_to_id, + node_attributes=node_attributes, + edge_attributes=edge_attributes, + ) + + def preprocess_ondisk_dataset( dataset_dir: str, include_original_edge_id: bool = False, @@ -115,108 +352,20 @@ def preprocess_ondisk_dataset( os.makedirs(os.path.join(dataset_dir, processed_dir_prefix), exist_ok=True) output_config = deepcopy(input_config) - # 2. Load the edge data and create a DGLGraph. + # 2. Load the data and create a FusedCSCSamplingGraph. if "graph" not in input_config: raise RuntimeError("Invalid config: does not contain graph field.") - # For any graph that node/edge types are specified, we construct DGLGraph - # with `dgl.heterograph()` even there's only one node/edge type. This is - # because we want to save the node/edge types in the graph. So the logic of - # checking whether the graph is homogeneous is different from the logic in - # `DGLGraph.is_homogeneous()`. Otherwise, we construct DGLGraph with - # `dgl.graph()`. - is_homogeneous = ( - len(input_config["graph"]["nodes"]) == 1 - and len(input_config["graph"]["edges"]) == 1 - and "type" not in input_config["graph"]["nodes"][0] - and "type" not in input_config["graph"]["edges"][0] - ) - if is_homogeneous: - # Homogeneous graph. - num_nodes = input_config["graph"]["nodes"][0]["num"] - edge_fmt = input_config["graph"]["edges"][0]["format"] - edge_path = input_config["graph"]["edges"][0]["path"] - src, dst = read_edges(dataset_dir, edge_fmt, edge_path) - g = dgl.graph((src, dst), num_nodes=num_nodes) - else: - # Heterogeneous graph. - # Construct the num nodes dict. 
- num_nodes_dict = {} - for node_info in input_config["graph"]["nodes"]: - num_nodes_dict[node_info["type"]] = node_info["num"] - # Construct the data dict. - data_dict = {} - for edge_info in input_config["graph"]["edges"]: - edge_fmt = edge_info["format"] - edge_path = edge_info["path"] - src, dst = read_edges(dataset_dir, edge_fmt, edge_path) - data_dict[etype_str_to_tuple(edge_info["type"])] = (src, dst) - # Construct the heterograph. - g = dgl.heterograph(data_dict, num_nodes_dict) - - # 3. Load the sampling related node/edge features and add them to - # the sampling-graph. - if input_config["graph"].get("feature_data", None): - for graph_feature in input_config["graph"]["feature_data"]: - in_memory = ( - True - if "in_memory" not in graph_feature - else graph_feature["in_memory"] - ) - if graph_feature["domain"] == "node": - node_data = read_data( - os.path.join(dataset_dir, graph_feature["path"]), - graph_feature["format"], - in_memory=in_memory, - ) - if is_homogeneous: - g.ndata[graph_feature["name"]] = node_data - else: - g.nodes[graph_feature["type"]].data[ - graph_feature["name"] - ] = node_data - if graph_feature["domain"] == "edge": - edge_data = read_data( - os.path.join(dataset_dir, graph_feature["path"]), - graph_feature["format"], - in_memory=in_memory, - ) - if is_homogeneous: - g.edata[graph_feature["name"]] = edge_data - else: - g.edges[etype_str_to_tuple(graph_feature["type"])].data[ - graph_feature["name"] - ] = edge_data - if not is_homogeneous: - # For heterogenous graph, a node/edge feature must cover all - # node/edge types. - ntypes = g.ntypes - assert all( - set(g.nodes[ntypes[0]].data.keys()) - == set(g.nodes[ntype].data.keys()) - for ntype in ntypes - ), ( - "Node feature does not cover all node types: " - + f"{set(g.nodes[ntype].data.keys() for ntype in ntypes)}." - ) - etypes = g.canonical_etypes - assert all( - set(g.edges[etypes[0]].data.keys()) - == set(g.edges[etype].data.keys()) - for etype in etypes - ), ( - "Edge feature does not cover all edge types: " - + f"{set(g.edges[etype].data.keys() for etype in etypes)}." - ) - # 4. Convert the DGLGraph to a FusedCSCSamplingGraph. - fused_csc_sampling_graph = from_dglgraph( - g, is_homogeneous, include_original_edge_id + sampling_graph = _graph_data_to_fused_csc_sampling_graph( + dataset_dir, + input_config["graph"], + include_original_edge_id, ) - # 5. Record value of include_original_edge_id. + # 3. Record value of include_original_edge_id. output_config["include_original_edge_id"] = include_original_edge_id - # 6. Save the FusedCSCSamplingGraph and modify the output_config. + # 4. Save the FusedCSCSamplingGraph and modify the output_config. output_config["graph_topology"] = {} output_config["graph_topology"]["type"] = "FusedCSCSamplingGraph" output_config["graph_topology"]["path"] = os.path.join( @@ -224,7 +373,7 @@ def preprocess_ondisk_dataset( ) torch.save( - fused_csc_sampling_graph, + sampling_graph, os.path.join( dataset_dir, output_config["graph_topology"]["path"], @@ -232,7 +381,7 @@ def preprocess_ondisk_dataset( ) del output_config["graph"] - # 7. Load the node/edge features and do necessary conversion. + # 5. Load the node/edge features and do necessary conversion. if input_config.get("feature_data", None): has_edge_feature_data = False for feature, out_feature in zip( @@ -259,7 +408,7 @@ def preprocess_ondisk_dataset( if has_edge_feature_data and not include_original_edge_id: dgl_warning("Edge feature is stored, but edge IDs are not saved.") - # 8. 
Save tasks and train/val/test split according to the output_config. + # 6. Save tasks and train/val/test split according to the output_config. if input_config.get("tasks", None): for input_task, output_task in zip( input_config["tasks"], output_config["tasks"] @@ -286,13 +435,13 @@ def preprocess_ondisk_dataset( output_data["format"], ) - # 9. Save the output_config. + # 7. Save the output_config. output_config_path = os.path.join(dataset_dir, preprocess_metadata_path) with open(output_config_path, "w") as f: yaml.dump(output_config, f) print("Finish preprocessing the on-disk dataset.") - # 10. Calculate and save the hash value of the dataset directory. + # 8. Calculate and save the hash value of the dataset directory. hash_value_file = "dataset_hash_value.txt" hash_value_file_path = os.path.join( dataset_dir, processed_dir_prefix, hash_value_file @@ -303,7 +452,7 @@ def preprocess_ondisk_dataset( with open(hash_value_file_path, "w") as f: f.write(json.dumps(dir_hash, indent=4)) - # 11. Return the absolute path of the preprocessing yaml file. + # 9. Return the absolute path of the preprocessing yaml file. return output_config_path diff --git a/tests/python/pytorch/graphbolt/gb_test_utils.py b/tests/python/pytorch/graphbolt/gb_test_utils.py index 59c4c3a90276..005d99b2cba3 100644 --- a/tests/python/pytorch/graphbolt/gb_test_utils.py +++ b/tests/python/pytorch/graphbolt/gb_test_utils.py @@ -92,8 +92,10 @@ def random_homo_graphbolt_graph( ): """Generate random graphbolt version homograph""" # Generate random edges. - nodes = np.repeat(np.arange(num_nodes), 5) - neighbors = np.random.randint(0, num_nodes, size=(num_edges)) + nodes = np.repeat(np.arange(num_nodes, dtype=np.int64), 5) + neighbors = np.random.randint( + 0, num_nodes, size=(num_edges), dtype=np.int64 + ) edges = np.stack([nodes, neighbors], axis=1) os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True) assert edge_fmt in ["numpy", "csv"], print( @@ -101,9 +103,9 @@ def random_homo_graphbolt_graph( ) if edge_fmt == "csv": # Wrtie into edges/edge.csv - edges = pd.DataFrame(edges, columns=["src", "dst"]) + edges_DataFrame = pd.DataFrame(edges, columns=["src", "dst"]) edge_path = os.path.join("edges", "edge.csv") - edges.to_csv( + edges_DataFrame.to_csv( os.path.join(test_dir, edge_path), index=False, header=False, @@ -136,7 +138,7 @@ def random_homo_graphbolt_graph( np.arange(each_set_size), np.arange(each_set_size, 2 * each_set_size), ) - train_data = np.vstack(train_pairs).T.astype(np.int64) + train_data = np.vstack(train_pairs).T.astype(edges.dtype) train_path = os.path.join("set", "train.npy") np.save(os.path.join(test_dir, train_path), train_data) @@ -144,7 +146,7 @@ def random_homo_graphbolt_graph( np.arange(each_set_size, 2 * each_set_size), np.arange(2 * each_set_size, 3 * each_set_size), ) - validation_data = np.vstack(validation_pairs).T.astype(np.int64) + validation_data = np.vstack(validation_pairs).T.astype(edges.dtype) validation_path = os.path.join("set", "validation.npy") np.save(os.path.join(test_dir, validation_path), validation_data) @@ -152,7 +154,7 @@ def random_homo_graphbolt_graph( np.arange(2 * each_set_size, 3 * each_set_size), np.arange(3 * each_set_size, 4 * each_set_size), ) - test_data = np.vstack(test_pairs).T.astype(np.int64) + test_data = np.vstack(test_pairs).T.astype(edges.dtype) test_path = os.path.join("set", "test.npy") np.save(os.path.join(test_dir, test_path), test_data) diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py 
b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index c1e02b0efca3..f5459b316517 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -1211,7 +1211,8 @@ def test_OnDiskDataset_preprocess_homogeneous_hardcode(edge_fmt="numpy"): # Generate edges. edges = np.array( - [[0, 0, 1, 1, 2, 2, 3, 3, 4, 4], [1, 2, 2, 3, 3, 4, 4, 0, 0, 1]] + [[0, 0, 1, 1, 2, 2, 3, 3, 4, 4], [1, 2, 2, 3, 3, 4, 4, 0, 0, 1]], + dtype=np.int64, ).T os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True) edges = edges.T @@ -1220,14 +1221,18 @@ def test_OnDiskDataset_preprocess_homogeneous_hardcode(edge_fmt="numpy"): # Generate graph edge-feats. edge_feats = np.array( - [0.0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9] + [0.0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9], + dtype=np.float64, ) os.makedirs(os.path.join(test_dir, "data"), exist_ok=True) edge_feat_path = os.path.join("data", "edge-feat.npy") np.save(os.path.join(test_dir, edge_feat_path), edge_feats) # Generate node-feats. - node_feats = np.array([0.0, 1.9, 2.8, 3.7, 4.6]) + node_feats = np.array( + [0.0, 1.9, 2.8, 3.7, 4.6], + dtype=np.float64, + ) node_feat_path = os.path.join("data", "node-feat.npy") np.save(os.path.join(test_dir, node_feat_path), node_feats) @@ -1391,45 +1396,50 @@ def test_OnDiskDataset_preprocess_heterogeneous_hardcode(edge_fmt="numpy"): # Generate edges. os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True) np.save( - os.path.join(test_dir, "edges", "a_a.npy"), np.array([[0], [1]]) + os.path.join(test_dir, "edges", "a_a.npy"), + np.array([[0], [1]], dtype=np.int64), ) np.save( os.path.join(test_dir, "edges", "a_b.npy"), - np.array([[0, 1, 1], [0, 0, 1]]), + np.array([[0, 1, 1], [0, 0, 1]], dtype=np.int64), ) np.save( os.path.join(test_dir, "edges", "b_b.npy"), - np.array([[0, 0, 1], [1, 2, 2]]), + np.array([[0, 0, 1], [1, 2, 2]], dtype=np.int64), ) np.save( os.path.join(test_dir, "edges", "b_a.npy"), - np.array([[1, 2, 2], [0, 0, 1]]), + np.array([[1, 2, 2], [0, 0, 1]], dtype=np.int64), ) # Generate node features. os.makedirs(os.path.join(test_dir, "data"), exist_ok=True) np.save( - os.path.join(test_dir, "data", "A-feat.npy"), np.array([0.0, 1.9]) + os.path.join(test_dir, "data", "A-feat.npy"), + np.array([0.0, 1.9], dtype=np.float64), ) np.save( os.path.join(test_dir, "data", "B-feat.npy"), - np.array([2.8, 3.7, 4.6]), + np.array([2.8, 3.7, 4.6], dtype=np.float64), ) # Generate edge features. 
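The `dtype=` pins added throughout these fixtures matter because `np.array` infers a platform-dependent integer width for Python ints, which can clash with the int64 node/edge IDs the preprocessing pipeline expects. A quick illustration (hedged; the exact inferred dtype depends on platform and NumPy version):

    import numpy as np

    inferred = np.array([[0, 1], [1, 2]])  # width depends on the platform
    pinned = np.array([[0, 1], [1, 2]], dtype=np.int64)
    assert pinned.dtype == np.int64
    print(inferred.dtype)  # int64 on Linux/macOS, int32 on NumPy 1.x Windows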
os.makedirs(os.path.join(test_dir, "data"), exist_ok=True) - np.save(os.path.join(test_dir, "data", "a_a-feat.npy"), np.array([0.0])) + np.save( + os.path.join(test_dir, "data", "a_a-feat.npy"), + np.array([0.0], dtype=np.float64), + ) np.save( os.path.join(test_dir, "data", "a_b-feat.npy"), - np.array([1.1, 2.2, 3.3]), + np.array([1.1, 2.2, 3.3], dtype=np.float64), ) np.save( os.path.join(test_dir, "data", "b_b-feat.npy"), - np.array([4.4, 5.5, 6.6]), + np.array([4.4, 5.5, 6.6], dtype=np.float64), ) np.save( os.path.join(test_dir, "data", "b_a-feat.npy"), - np.array([7.7, 8.8, 9.9]), + np.array([7.7, 8.8, 9.9], dtype=np.float64), ) yaml_content = ( From b569e4beafb7b7153d1761e1aef807b6e45c291b Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Sun, 18 Feb 2024 13:05:42 +0800 Subject: [PATCH 42/45] [Misc] Correct some typos (#7120) --- .../python/pytorch/graphbolt/gb_test_utils.py | 24 ++++++++++--------- .../graphbolt/impl/test_ondisk_dataset.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/python/pytorch/graphbolt/gb_test_utils.py b/tests/python/pytorch/graphbolt/gb_test_utils.py index 005d99b2cba3..4084c6035f93 100644 --- a/tests/python/pytorch/graphbolt/gb_test_utils.py +++ b/tests/python/pytorch/graphbolt/gb_test_utils.py @@ -98,11 +98,12 @@ def random_homo_graphbolt_graph( ) edges = np.stack([nodes, neighbors], axis=1) os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True) - assert edge_fmt in ["numpy", "csv"], print( - "only numpy and csv are supported for edges." - ) + assert edge_fmt in [ + "numpy", + "csv", + ], "Only numpy and csv are supported for edges." if edge_fmt == "csv": - # Wrtie into edges/edge.csv + # Write into edges/edge.csv edges_DataFrame = pd.DataFrame(edges, columns=["src", "dst"]) edge_path = os.path.join("edges", "edge.csv") edges_DataFrame.to_csv( @@ -111,7 +112,7 @@ def random_homo_graphbolt_graph( header=False, ) else: - # Wrtie into edges/edge.npy + # Write into edges/edge.npy edges = edges.T edge_path = os.path.join("edges", "edge.npy") np.save(os.path.join(test_dir, edge_path), edges) @@ -160,7 +161,7 @@ def random_homo_graphbolt_graph( yaml_content = f""" dataset_name: {dataset_name} - graph: # graph structure and required attributes. + graph: # Graph structure and required attributes. nodes: - num: {num_nodes} edges: @@ -219,7 +220,7 @@ def random_homo_graphbolt_graph( return yaml_content -def genereate_raw_data_for_hetero_dataset( +def generate_raw_data_for_hetero_dataset( test_dir, dataset_name, num_nodes, num_edges, num_classes, edge_fmt="csv" ): # Generate edges. @@ -229,9 +230,10 @@ def genereate_raw_data_for_hetero_dataset( src = torch.randint(0, num_nodes[src_ntype], (num_edge,)) dst = torch.randint(0, num_nodes[dst_ntype], (num_edge,)) os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True) - assert edge_fmt in ["numpy", "csv"], print( - "only numpy and csv are supported for edges." - ) + assert edge_fmt in [ + "numpy", + "csv", + ], "Only numpy and csv are supported for edges." if edge_fmt == "csv": # Write into edges/edge.csv edges = pd.DataFrame( @@ -290,7 +292,7 @@ def genereate_raw_data_for_hetero_dataset( yaml_content = f""" dataset_name: {dataset_name} - graph: # graph structure and required attributes. + graph: # Graph structure and required attributes. 
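# A quick sketch of the assert anti-pattern that the typo patch above
# removes. In `assert cond, print("...")` the message expression is only
# evaluated when the assertion fails; `print` then returns None, so the
# AssertionError that propagates into a pytest report carries no message,
# while the useful text goes to stdout instead. `edge_fmt` is hypothetical.
edge_fmt = "parquet"
try:
    assert edge_fmt in ["numpy", "csv"], print("only numpy and csv ...")
except AssertionError as err:
    assert err.args == (None,)  # message lost
try:
    assert edge_fmt in ["numpy", "csv"], "Only numpy and csv are supported."
except AssertionError as err:
    assert err.args == ("Only numpy and csv are supported.",)  # message kept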
nodes: - type: user num: {num_nodes["user"]} diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index f5459b316517..285c52c7a1b7 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -2713,7 +2713,7 @@ def test_OnDiskDataset_heterogeneous(include_original_edge_id, edge_fmt): ("user", "click", "item"): 20000, } num_classes = 10 - gbt.genereate_raw_data_for_hetero_dataset( + gbt.generate_raw_data_for_hetero_dataset( test_dir, dataset_name, num_nodes, From 13cbad32ce165343acda401c9c2a44553b04ebd4 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Sun, 18 Feb 2024 18:09:15 -0800 Subject: [PATCH 43/45] Suppressing warnings generated by the `test_prop_nodes_topo` test. (#6651) Co-authored-by: Hongzhi (Steve), Chen --- tests/python/common/test_propagate.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/python/common/test_propagate.py b/tests/python/common/test_propagate.py index e3d3ed21426a..1801dfb63d80 100644 --- a/tests/python/common/test_propagate.py +++ b/tests/python/common/test_propagate.py @@ -100,9 +100,17 @@ def test_prop_nodes_topo(idtype): tree.ndata["x"] = F.zeros((5, 2)) # set all leaf nodes to be ones tree.nodes[[1, 3, 4]].data["x"] = F.ones((3, 2)) - dgl.prop_nodes_topo( - tree, message_func=mfunc, reduce_func=rfunc, apply_node_func=None - ) + + # Filtering DGLWarning: + # The input graph for the user-defined edge + # function does not contain valid edges + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + dgl.prop_nodes_topo( + tree, message_func=mfunc, reduce_func=rfunc, apply_node_func=None + ) # root node get the sum assert F.allclose(tree.nodes[0].data["x"], F.tensor([[3.0, 3.0]])) From 2df85862a51711b49abd95b8797900a6511599d3 Mon Sep 17 00:00:00 2001 From: Ramon Zhou Date: Tue, 20 Feb 2024 11:11:05 +0800 Subject: [PATCH 44/45] [GraphBolt] Correct `to_pyg_data` (#7124) --- python/dgl/graphbolt/minibatch.py | 2 +- tests/python/pytorch/graphbolt/test_minibatch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dgl/graphbolt/minibatch.py b/python/dgl/graphbolt/minibatch.py index aef6b31988a2..f75c72ab005c 100644 --- a/python/dgl/graphbolt/minibatch.py +++ b/python/dgl/graphbolt/minibatch.py @@ -500,7 +500,7 @@ def to_pyg_data(self): col_nodes = torch.cat(col_nodes) row_nodes = torch.cat(row_nodes) edge_index = torch.unique( - torch.stack((col_nodes, row_nodes)), dim=1 + torch.stack((row_nodes, col_nodes)), dim=1 ) if self.node_features is None: diff --git a/tests/python/pytorch/graphbolt/test_minibatch.py b/tests/python/pytorch/graphbolt/test_minibatch.py index 1f708c84c664..7a428fd828d9 100644 --- a/tests/python/pytorch/graphbolt/test_minibatch.py +++ b/tests/python/pytorch/graphbolt/test_minibatch.py @@ -881,7 +881,7 @@ def test_to_pyg_data(): original_column_node_ids=torch.tensor([10, 11]), ) expected_edge_index = torch.tensor( - [[0, 0, 1, 1, 1, 2, 2, 3], [0, 1, 0, 1, 2, 1, 2, 2]] + [[0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 0, 1, 2, 1, 2, 3]] ) expected_node_features = torch.tensor([[1], [2], [3], [4]]) expected_labels = torch.tensor([0, 1]) From 3ced3411e55bca803ed5ec5e1de6f62e1f21478f Mon Sep 17 00:00:00 2001 From: caojy1998 <84027205+caojy1998@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:35:00 +0800 Subject: [PATCH 45/45] [Dataset] Add 
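# Two sketches for the patches above; names and values are hypothetical.
# Patch 43 wraps the ignore filter in `warnings.catch_warnings()`, which
# snapshots and restores the global filter list, so suppressing UserWarning
# (which DGL's warning class derives from) stays local to the test:
import warnings

import torch

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    warnings.warn("suppressed inside the block")  # not reported
warnings.warn("reported again; the default filters are restored")

# Patch 44 swaps the stack order in `to_pyg_data` because PyG's
# `edge_index` is a 2 x E COO tensor with sources in row 0 and destinations
# in row 1, whereas the CSC-sampled blocks pair `row_nodes` (in-neighbors,
# i.e. sources) with `col_nodes` (seeds, i.e. destinations):
row_nodes = torch.tensor([0, 1, 2])  # sources
col_nodes = torch.tensor([1, 2, 0])  # destinations
edge_index = torch.unique(torch.stack((row_nodes, col_nodes)), dim=1)
# Stacking (col_nodes, row_nodes) instead would silently flip the direction
# of every edge handed to torch_geometric.data.Data.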
dataset ogbn-papers100M (#7096) Co-authored-by: Ubuntu --- python/dgl/graphbolt/impl/ondisk_dataset.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index ca95bcf8f3f1..b1494ef37914 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -877,6 +877,16 @@ class BuiltinDataset(OnDiskDataset): Reverse edges are added to the original graph and duplicated edges are removed. + **ogbn-papers100M** + The ogbn-papers100M dataset is a directed graph, representing the citation + network between all Computer Science (CS) arXiv papers indexed by MAG. + See more details in `ogbn-papers100M + `_. + + .. note:: + Reverse edges are added to the original graph and duplicated + edges are removed. + **ogbn-products** The ogbn-products dataset is an undirected and unweighted graph, representing an Amazon product co-purchasing network. See more details @@ -916,7 +926,7 @@ class BuiltinDataset(OnDiskDataset): "ogbn-products", "ogbn-arxiv", ] - _large_datasets = ["ogb-lsc-mag240m"] + _large_datasets = ["ogb-lsc-mag240m", "ogbn-papers100M"] _all_datasets = _datasets + _large_datasets def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset:
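With ogbn-papers100M registered in `_large_datasets`, it is loaded through the same `BuiltinDataset` path as the other builtin graphs. A minimal usage sketch, assuming the usual OnDiskDataset workflow (the attribute names below follow the public GraphBolt API; note that the download for this dataset is large):

import dgl.graphbolt as gb

# Fetches the prebuilt dataset into `root` on first use, then loads it.
dataset = gb.BuiltinDataset("ogbn-papers100M", root="datasets").load()
graph = dataset.graph                   # FusedCSCSamplingGraph
feature = dataset.feature               # feature store for node features
train_set = dataset.tasks[0].train_set  # node-classification split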