Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] K-hop subgraph sampling with support for heterogeneous directed graphs #6750

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
154 changes: 154 additions & 0 deletions python/dgl/subgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"out_subgraph",
"khop_in_subgraph",
"khop_out_subgraph",
"khop_subgraph",
]


Expand Down Expand Up @@ -984,6 +985,159 @@ def khop_out_subgraph(

# Also bind the function as an attribute of DGLGraph (wrapped by
# utils.alias_func) so it is reachable from graph objects.
DGLGraph.khop_out_subgraph = utils.alias_func(khop_out_subgraph)

def _subsample_neighbors(nbrs, fanout):
    """Randomly keep at most ``fanout`` entries of a 1-D neighbor ID tensor.

    Parameters
    ----------
    nbrs : Tensor
        Neighbor node IDs gathered for one relation and one direction.
    fanout : int or None
        Maximum number of entries to keep. ``None`` keeps everything.

    Returns
    -------
    Tensor
        ``nbrs`` itself when no sampling is needed, otherwise ``fanout``
        entries drawn without replacement.
    """
    # Imported locally because only the sampling path needs NumPy.
    import numpy as np

    num_nbrs = len(nbrs)
    if fanout is None or fanout >= num_nbrs:
        return nbrs
    picked = np.random.choice(num_nbrs, fanout, replace=False)
    # Build the index through the backend shim, on the same device as the
    # data, instead of hard-coding a CPU torch.LongTensor.
    index = F.copy_to(F.tensor(picked, dtype=F.int64), F.context(nbrs))
    return F.gather_row(nbrs, index)


def khop_subgraph(
    graph,
    nodes,
    k,
    *,
    fanout=None,
    relabel_nodes=True,
    store_ids=True,
    output_device=None
):
    """Return the subgraph induced by the k-hop neighborhood of the specified
    node(s), treating directed edges as undirected while hopping.

    We can expand a set of nodes by including both their successors and their
    predecessors. From a specified node set, a k-hop subgraph is obtained by
    first repeating the node set expansion for k times and then creating a node
    induced subgraph. In addition to extracting the subgraph, DGL also copies
    the features of the extracted nodes and edges to the resulting graph. The
    copy is *lazy* and incurs data movement only when needed. The number of
    nodes added per expansion can be limited with :attr:`fanout`.

    If the graph is heterogeneous, DGL extracts a subgraph per relation and
    composes them as the resulting graph. Thus the resulting graph has the same
    set of relations as the input one.

    Parameters
    ----------
    graph : DGLGraph
        The input graph.
    nodes : nodes or dict[str, nodes]
        The starting node(s) to expand, which cannot have any duplicate value.
        The result will be undefined otherwise. The allowed formats are:

        * Int: ID of a single node.
        * Int Tensor: Each element is a node ID. The tensor must have the same
          device type and ID data type as the graph's.
        * iterable[int]: Each element is a node ID.

        If the graph is homogeneous, one can directly pass the above formats.
        Otherwise, the argument must be a dictionary with keys being node types
        and values being the node IDs in the above formats. The passed
        dictionary is not modified.
    k : int
        The number of hops.
    fanout : int, optional
        Upper bound on the number of neighbors kept at each hop, applied
        separately to every edge type and to each direction (successors and
        predecessors). The bound applies to the neighbors of the whole
        frontier collectively, not to each frontier node individually. When
        the bound is exceeded, neighbors are drawn uniformly at random without
        replacement using NumPy's global random state. If None, all neighbors
        are included.
    relabel_nodes : bool, optional
        If True, it will remove the isolated nodes and relabel the rest nodes
        in the extracted subgraph.
    store_ids : bool, optional
        If True, it will store the raw IDs of the extracted edges in the
        ``edata`` of the resulting graph under name ``dgl.EID``; if
        ``relabel_nodes`` is ``True``, it will also store the raw IDs of the
        extracted nodes in the ``ndata`` of the resulting graph under name
        ``dgl.NID``.
    output_device : Framework-specific device context object, optional
        The output device. Default is the same as the input graph.

    Returns
    -------
    DGLGraph
        The subgraph.
    Tensor or dict[str, Tensor], optional
        The new IDs of the input :attr:`nodes` after node relabeling. This is
        returned only when :attr:`relabel_nodes` is True. It is in the same
        form as :attr:`nodes`.

    Raises
    ------
    DGLError
        If :attr:`graph` is a block.
    """
    if graph.is_block:
        raise DGLError("Extracting subgraph of a block graph is not allowed.")

    is_mapping = isinstance(nodes, Mapping)
    if not is_mapping:
        assert (
            len(graph.ntypes) == 1
        ), "need a dict of node type and IDs for graph with multiple node types"
        nodes = {graph.ntypes[0]: nodes}

    # Build a fresh dict so that a mapping supplied by the caller is never
    # overwritten with the converted tensors.
    nodes = {
        nty: utils.prepare_tensor(graph, nty_nodes, 'nodes["{}"]'.format(nty))
        for nty, nty_nodes in nodes.items()
    }

    last_hop_nodes = nodes
    k_hop_nodes_ = [last_hop_nodes]
    device = context_of(nodes)
    # Empty ID tensor standing in for node types with no frontier nodes.
    place_holder = F.copy_to(F.tensor([], dtype=graph.idtype), device)
    for _ in range(k):
        current_hop_nodes = {nty: [] for nty in graph.ntypes}
        # Follow edges forward: successors of the frontier.
        for cetype in graph.canonical_etypes:
            srctype, _, dsttype = cetype
            _, out_nbrs = graph.out_edges(
                last_hop_nodes.get(srctype, place_holder), etype=cetype
            )
            current_hop_nodes[dsttype].append(
                _subsample_neighbors(out_nbrs, fanout)
            )
        # Follow edges backward: predecessors of the frontier.
        for cetype in graph.canonical_etypes:
            srctype, _, dsttype = cetype
            in_nbrs, _ = graph.in_edges(
                last_hop_nodes.get(dsttype, place_holder), etype=cetype
            )
            current_hop_nodes[srctype].append(
                _subsample_neighbors(in_nbrs, fanout)
            )
        for nty in graph.ntypes:
            if len(current_hop_nodes[nty]) == 0:
                current_hop_nodes[nty] = place_holder
                continue
            current_hop_nodes[nty] = F.unique(
                F.cat(current_hop_nodes[nty], dim=0)
            )
        k_hop_nodes_.append(current_hop_nodes)
        last_hop_nodes = current_hop_nodes

    # Merge the seeds and every hop per node type. The seeds come first in the
    # concatenation, so the leading entries of the inverse index give the
    # positions of the seeds among the unique node IDs.
    k_hop_nodes = dict()
    inverse_indices = dict()
    for nty in graph.ntypes:
        k_hop_nodes[nty], inverse_indices[nty] = F.unique(
            F.cat(
                [
                    hop_nodes.get(nty, place_holder)
                    for hop_nodes in k_hop_nodes_
                ],
                dim=0,
            ),
            return_inverse=True,
        )

    sub_g = node_subgraph(
        graph, k_hop_nodes, relabel_nodes=relabel_nodes, store_ids=store_ids
    )
    if output_device is not None:
        sub_g = sub_g.to(output_device)
    if not relabel_nodes:
        return sub_g

    seed_inverse_indices = {
        nty: F.slice_axis(
            inverse_indices[nty], axis=0, begin=0, end=len(nty_nodes)
        )
        for nty, nty_nodes in nodes.items()
    }
    if not is_mapping:
        # Homogeneous input: hand back a bare tensor, mirroring how the seeds
        # were passed in. Index by the single node type explicitly rather than
        # relying on a variable left over from an earlier loop.
        seed_inverse_indices = seed_inverse_indices[graph.ntypes[0]]
    if output_device is not None:
        seed_inverse_indices = recursive_apply(
            seed_inverse_indices, lambda x: F.copy_to(x, output_device)
        )
    return sub_g, seed_inverse_indices


# Also bind the function as an attribute of DGLGraph (wrapped by
# utils.alias_func) so it is reachable from graph objects.
DGLGraph.khop_subgraph = utils.alias_func(khop_subgraph)


def node_type_subgraph(graph, ntypes, output_device=None):
"""Return the subgraph induced on given node types.
Expand Down
61 changes: 61 additions & 0 deletions tests/python/common/test_subgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,67 @@ def test_khop_out_subgraph(idtype):
assert F.array_equal(F.astype(inv["user"], idtype), F.tensor([0], idtype))
assert F.array_equal(F.astype(inv["game"], idtype), F.tensor([0], idtype))

@parametrize_idtype
def test_khop_subgraph(idtype):
    """Check ``dgl.khop_subgraph`` on a homogeneous and a heterogeneous graph."""

    def edge_pairs(subgraph):
        # Gather (src, dst) pairs into a set so comparisons ignore edge order.
        src, dst = subgraph.edges()
        return set(zip(list(F.asnumpy(src)), list(F.asnumpy(dst))))

    # ---- homogeneous graph, single seed ----
    homo_src = [0, 2, 0, 4, 2, 5]
    homo_dst = [1, 1, 2, 3, 4, 2]
    g = dgl.graph((homo_src, homo_dst), idtype=idtype, device=F.ctx())
    g.edata["w"] = F.tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]])
    sg, inv = dgl.khop_subgraph(g, 0, k=2)

    assert sg.idtype == g.idtype
    assert edge_pairs(sg) == {(0, 1), (2, 1), (4, 2), (2, 3), (0, 2)}
    expected_eids = F.tensor([0, 2, 1, 4, 5], dtype=idtype)
    assert F.array_equal(sg.edata[dgl.EID], expected_eids)
    expected_w = F.tensor([[0, 1], [4, 5], [2, 3], [8, 9], [10, 11]])
    assert F.array_equal(sg.edata["w"], expected_w)
    assert F.array_equal(F.astype(inv, idtype), F.tensor([0], idtype))

    # ---- homogeneous graph, several seeds (list, then tensor) ----
    sg, inv = dgl.khop_subgraph(g, [0, 2], k=1)
    assert sg.num_edges() == 5

    sg, inv = dgl.khop_subgraph(g, F.tensor([0, 2], idtype), k=1)
    assert sg.num_edges() == 5

    # ---- heterogeneous graph, single seed ----
    relations = {
        ("user", "plays", "game"): ([0, 1, 1, 2, 4, 5], [0, 0, 2, 1, 0, 2]),
        ("user", "follows", "user"): ([0, 1], [1, 3]),
    }
    g = dgl.heterograph(relations, idtype=idtype, device=F.ctx())
    sg, inv = dgl.khop_subgraph(g, {"user": 0}, k=2)
    assert sg.idtype == idtype
    assert sg.num_nodes("game") == 2
    assert sg.num_nodes("user") == 4
    assert len(sg.ntypes) == 2
    assert len(sg.etypes) == 2
    assert edge_pairs(sg["follows"]) == {(0, 1), (1, 2)}
    assert edge_pairs(sg["plays"]) == {(0, 0), (1, 0), (1, 1), (3, 0)}
    assert F.array_equal(F.astype(inv["user"], idtype), F.tensor([0], idtype))

    # ---- heterogeneous graph, seeds of two node types ----
    seeds = {"user": F.tensor([2], idtype), "game": 0}
    sg, inv = dgl.khop_subgraph(g, seeds, k=1)
    assert sg.num_edges("follows") == 1
    assert edge_pairs(sg["plays"]) == {(0, 0), (1, 0), (2, 1), (3, 0)}
    assert F.array_equal(F.astype(inv["user"], idtype), F.tensor([2], idtype))
    assert F.array_equal(F.astype(inv["game"], idtype), F.tensor([0], idtype))


@unittest.skipIf(not F.gpu_ctx(), "only necessary with GPU")
@pytest.mark.parametrize(
Expand Down