From 80203e34cd21516d2a503c671b0708f0b343ff45 Mon Sep 17 00:00:00 2001
From: hutiechuan <47250730+hutiechuan@users.noreply.github.com>
Date: Thu, 11 Jan 2024 07:50:32 +0800
Subject: [PATCH 01/22] [GraphBolt] add node classification example for pyg
 model (#6872)

Co-authored-by: Muhammed Fatih BALIN
---
 examples/sampling/pyg/README.md              |  57 +++++
 examples/sampling/pyg/node_classification.py | 235 +++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 examples/sampling/pyg/README.md
 create mode 100644 examples/sampling/pyg/node_classification.py

diff --git a/examples/sampling/pyg/README.md b/examples/sampling/pyg/README.md
new file mode 100644
index 000000000000..c75aa9b90dfe
--- /dev/null
+++ b/examples/sampling/pyg/README.md
@@ -0,0 +1,57 @@
+## Overview
+
+This project demonstrates the training and evaluation of a GraphSAGE model for node classification on large graphs. The example uses GraphBolt for efficient data handling and PyG for GNN training.
+
+
+## Node classification on graphs
+
+This example demonstrates how to run a node classification task on large graphs with **GraphBolt**.
+
+## Model
+
+The model is a three-layer GraphSAGE network implemented using PyTorch Geometric's SAGEConv layers.
+
+
+## Default run on the `ogbn-arxiv` dataset
+
+```
+python node_classification.py
+```
+
+## Accuracies
+```
+Final performance (for ogbn-arxiv):
+All runs:
+Highest Train: 62.26
+Highest Valid: 59.89
+Final Train: 62.26
+Final Test: 52.78
+```
+
+
+## Run on the `ogbn-products` dataset
+
+### Sample on CPU and train/infer on CPU
+
+```
+python node_classification.py --dataset ogbn-products
+```
+
+## Accuracies
+```
+Final performance (for ogbn-products):
+All runs:
+Highest Train: 90.79
+Highest Valid: 89.86
+Final Train: 90.79
+Final Test: 75.24
+```
+
+
diff --git a/examples/sampling/pyg/node_classification.py b/examples/sampling/pyg/node_classification.py
new file mode 100644
index 000000000000..923025298c83
--- /dev/null
+++ b/examples/sampling/pyg/node_classification.py
@@ -0,0 +1,235 @@
+"""
+This script demonstrates node classification with GraphSAGE on large graphs,
+combining GraphBolt (GB) and PyTorch Geometric (PyG). GraphBolt efficiently
+manages data loading for large datasets, which is crucial for mini-batch
+processing. After data loading, PyG's user-friendly framework takes over for
+training, showcasing seamless integration with GraphBolt. This combination
+offers an efficient alternative to traditional Deep Graph Library (DGL)
+methods, highlighting adaptability and scalability in handling large-scale
+graph data for diverse real-world applications.
+
+
+
+Key Features:
+- Implements the GraphSAGE model, a scalable GNN, for node classification on large graphs.
+- Utilizes GraphBolt, an efficient framework for large-scale graph data processing.
+- Integrates with PyTorch Geometric for building and training the GraphSAGE model.
+- The script is well-documented, providing clear explanations at each step.
+
+The flowchart below describes the main functional sequence of the example.
+main:
+
+main
+│
+├───> Load and preprocess dataset (GraphBolt)
+│     │
+│     └───> Utilize GraphBolt's BuiltinDataset for dataset handling
+│
+├───> Instantiate the SAGE model (PyTorch Geometric)
+│     │
+│     └───> Define the GraphSAGE model architecture
+│
+├───> Train the model
+│     │
+│     ├───> Mini-Batch Processing with GraphBolt
+│     │     │
+│     │     └───> Efficient handling of mini-batches using GraphBolt's utilities
+│     │
+│     └───> Training Loop
+│           │
+│           ├───> Forward and backward passes
+│           │
+│           └───> Parameter optimization
+│
+└───> Evaluate the model
+      │
+      └───> Performance assessment on validation and test datasets
+            │
+            └───> Calculation of accuracy and other relevant metrics
+
+
+"""
+
+import argparse
+
+import dgl.graphbolt as gb
+import torch
+import torch.nn.functional as F
+import torchmetrics.functional as MF
+from torch_geometric.nn import SAGEConv
+
+
+class GraphSAGE(torch.nn.Module):
+    #####################################################################
+    # (HIGHLIGHT) Define the GraphSAGE model architecture.
+    #
+    # - This class inherits from `torch.nn.Module`.
+    # - Three convolutional layers are created using the SAGEConv class from PyG.
+    # - 'in_size', 'hidden_size', 'out_size' are the sizes of
+    #   the input, hidden, and output features, respectively.
+    # - The forward method defines the computation performed at every call.
+    #####################################################################
+    def __init__(self, in_size, hidden_size, out_size):
+        super(GraphSAGE, self).__init__()
+        self.layers = torch.nn.ModuleList()
+        self.layers.append(SAGEConv(in_size, hidden_size))
+        self.layers.append(SAGEConv(hidden_size, hidden_size))
+        self.layers.append(SAGEConv(hidden_size, out_size))
+
+    def forward(self, blocks, x, device):
+        h = x
+        for i, (layer, block) in enumerate(zip(self.layers, blocks)):
+            src, dst = block.edges()
+            edge_index = torch.stack([src, dst], dim=0)
+            h_src, h_dst = h, h[: block.number_of_dst_nodes()]
+            h = layer((h_src, h_dst), edge_index)
+            if i != len(blocks) - 1:
+                h = F.relu(h)
+        return h
+
+
+def create_dataloader(dataset_set, graph, feature, device, is_train):
+    #####################################################################
+    # (HIGHLIGHT) Create a data loader for efficiently loading graph data.
+    #
+    # - 'ItemSampler' samples mini-batches of node IDs from the dataset.
+    # - 'sample_neighbor' performs neighbor sampling on the graph.
+    # - 'FeatureFetcher' fetches node features based on the sampled subgraph.
+    # - 'CopyTo' copies the fetched data to the specified device.
+
+    #####################################################################
+    # Create a datapipe for mini-batch sampling with a specific neighbor fanout.
+    # Here, [10, 10, 10] specifies the number of neighbors sampled for each node at each layer.
+    # We're using `sample_neighbor` for consistency with DGL's sampling API.
+    # Note: GraphBolt offers additional sampling methods, such as `sample_layer_neighbor`,
+    # which could provide further optimization and efficiency for GNN training.
+    # Users are encouraged to explore these advanced features for potentially improved performance.
+
+    # Initialize an ItemSampler to sample mini-batches from the dataset.
+    datapipe = gb.ItemSampler(
+        dataset_set, batch_size=1024, shuffle=is_train, drop_last=is_train
+    )
+    # Sample neighbors for each node in the mini-batch.
+    datapipe = datapipe.sample_neighbor(graph, [10, 10, 10])
+    # Fetch node features for the sampled subgraph.
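+    # The key "feat" below must match a feature name registered in the
+    # feature store; the fetched tensors appear on each mini-batch as
+    # `minibatch.node_features["feat"]` (used in `train` and `evaluate`).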
+ datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"]) + # Copy the data to the specified device. + datapipe = datapipe.copy_to(device=device) + # Create and return a DataLoader to handle data loading. + dataloader = gb.DataLoader(datapipe, num_workers=0) + + return dataloader + + +def train(model, dataloader, optimizer, criterion, device, num_classes): + ##################################################################### + # (HIGHLIGHT) Train the model for one epoch. + # + # - Iterates over the data loader, fetching mini-batches of graph data. + # - For each mini-batch, it performs a forward pass, computes loss, and + # updates the model parameters. + # - The function returns the average loss and accuracy for the epoch. + # + # Parameters: + # model: The GraphSAGE model. + # dataloader: DataLoader that provides mini-batches of graph data. + # optimizer: Optimizer used for updating model parameters. + # criterion: Loss function used for training. + # device: The device (CPU/GPU) to run the training on. + ##################################################################### + + model.train() # Set the model to training mode + total_loss = 0 # Accumulator for the total loss + total_correct = 0 # Accumulator for the total number of correct predictions + total_samples = 0 # Accumulator for the total number of samples processed + num_batches = 0 # Counter for the number of mini-batches processed + + for minibatch in dataloader: + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + optimizer.zero_grad() + out = model(minibatch.blocks, node_features, device) + loss = criterion(out, labels) + total_loss += loss.item() + total_correct += MF.accuracy( + out, labels, task="multiclass", num_classes=num_classes + ) * labels.size(0) + total_samples += labels.size(0) + loss.backward() + optimizer.step() + num_batches += 1 + avg_loss = total_loss / num_batches + avg_accuracy = total_correct / total_samples + return avg_loss, avg_accuracy + + +@torch.no_grad() +def evaluate(model, dataloader, device, num_classes): + model.eval() + y_hats = [] + ys = [] + for minibatch in dataloader: + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + out = model(minibatch.blocks, node_features, device) + y_hats.append(out) + ys.append(labels) + + return MF.accuracy( + torch.cat(y_hats), + torch.cat(ys), + task="multiclass", + num_classes=num_classes, + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Which dataset are you going to use?" 
+ ) + parser.add_argument( + "--dataset", + type=str, + default="ogbn-arxiv", + help='Name of the dataset to use (e.g., "ogbn-products", "ogbn-arxiv")', + ) + args = parser.parse_args() + dataset_name = args.dataset + dataset = gb.BuiltinDataset(dataset_name).load() + graph = dataset.graph + feature = dataset.feature + train_set = dataset.tasks[0].train_set + valid_set = dataset.tasks[0].validation_set + test_set = dataset.tasks[0].test_set + num_classes = dataset.tasks[0].metadata["num_classes"] + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + train_dataloader = create_dataloader( + train_set, graph, feature, device, is_train=True + ) + valid_dataloader = create_dataloader( + valid_set, graph, feature, device, is_train=False + ) + test_dataloader = create_dataloader( + test_set, graph, feature, device, is_train=False + ) + in_channels = feature.size("node", None, "feat")[0] + hidden_channels = 128 + model = GraphSAGE(in_channels, hidden_channels, num_classes).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) + criterion = torch.nn.CrossEntropyLoss() + for epoch in range(10): + train_loss, train_accuracy = train( + model, train_dataloader, optimizer, criterion, device, num_classes + ) + + valid_accuracy = evaluate(model, valid_dataloader, device, num_classes) + print( + f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {valid_accuracy:.4f}, " + f"Valid Accuracy: {valid_accuracy:.4f}" + ) + test_accuracy = evaluate(model, test_dataloader, device, num_classes) + print(f"Test Accuracy: {test_accuracy:.4f}") + + +if __name__ == "__main__": + main() From b6087efd7f59a71ed691f07370939f5cdc3706bc Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 10 Jan 2024 22:04:22 -0500 Subject: [PATCH 02/22] [GraphBolt] PyG example typo fix (#6931) --- examples/sampling/pyg/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sampling/pyg/node_classification.py b/examples/sampling/pyg/node_classification.py index 923025298c83..a34fbf4abecc 100644 --- a/examples/sampling/pyg/node_classification.py +++ b/examples/sampling/pyg/node_classification.py @@ -224,7 +224,7 @@ def main(): valid_accuracy = evaluate(model, valid_dataloader, device, num_classes) print( - f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {valid_accuracy:.4f}, " + f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, " f"Valid Accuracy: {valid_accuracy:.4f}" ) test_accuracy = evaluate(model, test_dataloader, device, num_classes) From 3ff7ad9d2de4b520138b2a7fccfaa00801712d05 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:34:21 +0800 Subject: [PATCH 03/22] [GraphBolt] update notebooks about numpy edges (#6936) --- .../stochastic_training/ondisk-dataset-specification.rst | 6 ++++-- .../stochastic_training/ondisk_dataset_heterograph.ipynb | 7 ++++--- .../stochastic_training/ondisk_dataset_homograph.ipynb | 5 +++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/source/stochastic_training/ondisk-dataset-specification.rst b/docs/source/stochastic_training/ondisk-dataset-specification.rst index b567247ef2d3..0587b26a8806 100644 --- a/docs/source/stochastic_training/ondisk-dataset-specification.rst +++ b/docs/source/stochastic_training/ondisk-dataset-specification.rst @@ -122,8 +122,10 @@ The ``graph`` field is used to specify the graph structure. It has two fields: homogeneous graphs. 
   For heterogeneous graphs, it is the edge type.
 - ``format``: ``string``
-  The ``format`` field is used to specify the format of the edge data. It can
-  only be ``csv`` for now.
+  The ``format`` field is used to specify the format of the edge data. It
+  can be ``csv`` or ``numpy``. If it is ``csv``, no ``index`` and ``header``
+  fields are needed. If it is ``numpy``, the array is required to be of shape
+  ``(2, num_edges)``. The ``numpy`` format is recommended for large graphs.
 - ``path``: ``string``
   The ``path`` field is used to specify the path of the edge data. It is
diff --git a/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb b/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb
index 0c28e8c402e2..8b2cb3ead2d5 100644
--- a/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb
+++ b/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb
@@ -103,10 +103,11 @@
   "cell_type": "markdown",
   "source": [
     "### Generate graph structure data\n",
-    "For heterogeneous graph, we need to save different edge edges(namely node pairs) into separate **CSV** files.\n",
+    "For heterogeneous graphs, we need to save the edges (namely node pairs) of each edge type into separate **Numpy** or **CSV** files.\n",
     "\n",
-    "**Note**:\n",
-    "when saving to file, do not save index and header.\n"
+    "Note:\n",
+    "- when saving to **Numpy**, the array is required to be of shape `(2, N)`. This format is recommended, as constructing a graph from it is much faster than from a **CSV** file.\n",
+    "- when saving to a **CSV** file, do not save the index and header.\n"
   ],
   "metadata": {
     "id": "qhNtIn_xhlnl"
diff --git a/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb b/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb
index 0f2e99b4442e..5ce4c8f168a6 100644
--- a/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb
+++ b/notebooks/stochastic_training/ondisk_dataset_homograph.ipynb
@@ -103,10 +103,11 @@
   "cell_type": "markdown",
   "source": [
     "### Generate graph structure data\n",
-    "For homogeneous graph, we just need to save edges(namely node pairs) into **CSV** file.\n",
+    "For homogeneous graphs, we just need to save the edges (namely node pairs) into a **Numpy** or **CSV** file.\n",
     "\n",
     "Note:\n",
-    "when saving to file, do not save index and header.\n"
+    "- when saving to **Numpy**, the array is required to be of shape `(2, N)`. This format is recommended, as constructing a graph from it is much faster than from a **CSV** file.\n",
+    "- when saving to a **CSV** file, do not save the index and header.\n"
   ],
   "metadata": {
     "id": "qhNtIn_xhlnl"

From afca11142771a0d99c3fbdbe1c17ad4dc775b934 Mon Sep 17 00:00:00 2001
From: peizhou001 <110809584+peizhou001@users.noreply.github.com>
Date: Thu, 11 Jan 2024 16:29:50 +0800
Subject: [PATCH 04/22] [Graphbolt]Fix negative sampler (#6933)

Co-authored-by: Ubuntu
---
 .../graphbolt/fused_csc_sampling_graph.h      |  26 ---
 graphbolt/src/fused_csc_sampling_graph.cc     |  12 --
 graphbolt/src/python_binding.cc               |   3 -
 .../impl/fused_csc_sampling_graph.py          |  32 ++--
 .../impl/uniform_negative_sampler.py          |  15 +-
 .../graphbolt/impl/test_negative_sampler.py   |   8 +-
 .../pytorch/graphbolt/test_integration.py     | 153 ++++++++----------
 7 files changed, 98 insertions(+), 151 deletions(-)

diff --git a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h
index 6c60df2e4a33..2423ed982c5d 100644
--- a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h
+++ b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h
@@ -359,32 +359,6 @@ class FusedCSCSamplingGraph : public torch::CustomClassHolder {
       torch::optional<std::string> node_timestamp_attr_name,
       torch::optional<std::string> edge_timestamp_attr_name) const;
 
-  /**
-   * @brief Sample negative edges by randomly choosing negative
-   * source-destination pairs according to a uniform distribution. For each edge
-   * ``(u, v)``, it is supposed to generate `negative_ratio` pairs of negative
-   * edges ``(u, v')``, where ``v'`` is chosen uniformly from all the nodes in
-   * the graph.
-   *
-   * @param node_pairs A tuple of two 1D tensors that represent the source and
-   * destination of positive edges, with 'positive' indicating that these edges
-   * are present in the graph. It's important to note that within the context of
-   * a heterogeneous graph, the ids in these tensors signify heterogeneous ids.
-   * @param negative_ratio The ratio of the number of negative samples to
-   * positive samples.
-   * @param max_node_id The maximum ID of the node to be selected. It
-   * should correspond to the number of nodes of a specific type.
-   *
-   * @return A tuple consisting of two 1D tensors represents the source and
-   * destination of negative edges. In the context of a heterogeneous
-   * graph, both the input nodes and the selected nodes are represented
-   * by heterogeneous IDs. Note that negative refers to false negatives,
-   * which means the edge could be present or not present in the graph.
-   */
-  std::tuple<torch::Tensor, torch::Tensor> SampleNegativeEdgesUniform(
-      const std::tuple<torch::Tensor, torch::Tensor>& node_pairs,
-      int64_t negative_ratio, int64_t max_node_id) const;
-
   /**
    * @brief Copy the graph to shared memory.
    * @param shared_memory_name The name of the shared memory.
diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index 8432d8a2da01..431ee2f752b0 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -712,18 +712,6 @@ FusedCSCSamplingGraph::TemporalSampleNeighbors( } } -std::tuple -FusedCSCSamplingGraph::SampleNegativeEdgesUniform( - const std::tuple& node_pairs, - int64_t negative_ratio, int64_t max_node_id) const { - torch::Tensor pos_src; - std::tie(pos_src, std::ignore) = node_pairs; - auto neg_len = pos_src.size(0) * negative_ratio; - auto neg_src = pos_src.repeat(negative_ratio); - auto neg_dst = torch::randint(0, max_node_id, {neg_len}, pos_src.options()); - return std::make_tuple(neg_src, neg_dst); -} - static c10::intrusive_ptr BuildGraphFromSharedMemoryHelper(SharedMemoryHelper&& helper) { helper.InitializeRead(); diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index c60ad4b91180..44b6306d890d 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -52,9 +52,6 @@ TORCH_LIBRARY(graphbolt, m) { .def( "temporal_sample_neighbors", &FusedCSCSamplingGraph::TemporalSampleNeighbors) - .def( - "sample_negative_edges_uniform", - &FusedCSCSamplingGraph::SampleNegativeEdgesUniform) .def("copy_to_shared_memory", &FusedCSCSamplingGraph::CopyToSharedMemory) .def_pickle( // __getstate__ diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 80ff42433934..486bd59caccb 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -876,7 +876,8 @@ def sample_negative_edges_uniform( pairs according to a uniform distribution. For each edge ``(u, v)``, it is supposed to generate `negative_ratio` pairs of negative edges ``(u, v')``, where ``v'`` is chosen uniformly from all the nodes in - the graph. + the graph. As ``u`` is exactly same as the corresponding positive edges, + it returns None for negative sources. Parameters ---------- @@ -903,23 +904,22 @@ def sample_negative_edges_uniform( `edge_type`. Note that negative refers to false negatives, which means the edge could be present or not present in the graph. """ - if edge_type is not None: - assert ( - self.node_type_offset is not None - ), "The 'node_type_offset' array is necessary for performing \ - negative sampling by edge type." 
- _, _, dst_node_type = etype_str_to_tuple(edge_type) - dst_node_type_id = self.node_type_to_id[dst_node_type] - offset = self._node_type_offset_list - max_node_id = ( - offset[dst_node_type_id + 1] - offset[dst_node_type_id] - ) + if edge_type: + _, _, dst_ntype = etype_str_to_tuple(edge_type) + max_node_id = self.num_nodes[dst_ntype] else: max_node_id = self.total_num_nodes - return self._c_csc_graph.sample_negative_edges_uniform( - node_pairs, - negative_ratio, - max_node_id, + pos_src, _ = node_pairs + num_negative = pos_src.size(0) * negative_ratio + return ( + None, + torch.randint( + 0, + max_node_id, + (num_negative,), + dtype=pos_src.dtype, + device=pos_src.device, + ), ) def copy_to_shared_memory(self, shared_memory_name: str): diff --git a/python/dgl/graphbolt/impl/uniform_negative_sampler.py b/python/dgl/graphbolt/impl/uniform_negative_sampler.py index 512bd7ab5bc9..f979fd603249 100644 --- a/python/dgl/graphbolt/impl/uniform_negative_sampler.py +++ b/python/dgl/graphbolt/impl/uniform_negative_sampler.py @@ -32,20 +32,23 @@ class UniformNegativeSampler(NegativeSampler): Examples -------- >>> from dgl import graphbolt as gb - >>> indptr = torch.LongTensor([0, 2, 4, 5]) - >>> indices = torch.LongTensor([1, 2, 0, 2, 0]) + >>> indptr = torch.LongTensor([0, 1, 2, 3, 4]) + >>> indices = torch.LongTensor([1, 2, 3, 0]) >>> graph = gb.fused_csc_sampling_graph(indptr, indices) - >>> node_pairs = (torch.tensor([0, 1]), torch.tensor([1, 2])) + >>> node_pairs = torch.tensor([[0, 1], [1, 2], [2, 3], [3, 0]]) >>> item_set = gb.ItemSet(node_pairs, names="node_pairs") >>> item_sampler = gb.ItemSampler( - ... item_set, batch_size=1,) + ... item_set, batch_size=4,) >>> neg_sampler = gb.UniformNegativeSampler( ... item_sampler, graph, 2) >>> for minibatch in neg_sampler: ... print(minibatch.negative_srcs) ... 
print(minibatch.negative_dsts) - (tensor([0, 0, 0]), tensor([1, 1, 2]), tensor([1, 0, 0])) - (tensor([1, 1, 1]), tensor([2, 1, 2]), tensor([1, 0, 0])) + None + tensor([[2, 1], + [2, 1], + [3, 2], + [1, 3]]) """ def __init__( diff --git a/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py b/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py index 7905f32798f1..577ade0e6f3f 100644 --- a/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_negative_sampler.py @@ -46,8 +46,7 @@ def test_UniformNegativeSampler_invoke(): def _verify(negative_sampler): for data in negative_sampler: # Assertation - assert data.negative_srcs.size(0) == batch_size - assert data.negative_srcs.size(1) == negative_ratio + assert data.negative_srcs is None assert data.negative_dsts.size(0) == batch_size assert data.negative_dsts.size(1) == negative_ratio @@ -90,12 +89,9 @@ def test_Uniform_NegativeSampler(negative_ratio): # Assertation assert len(pos_src) == batch_size assert len(pos_dst) == batch_size - assert len(neg_src) == batch_size assert len(neg_dst) == batch_size - assert neg_src.numel() == batch_size * negative_ratio + assert neg_src is None assert neg_dst.numel() == batch_size * negative_ratio - expected_src = pos_src.repeat(negative_ratio).view(-1, negative_ratio) - assert torch.equal(expected_src, neg_src) def get_hetero_graph(): diff --git a/tests/python/pytorch/graphbolt/test_integration.py b/tests/python/pytorch/graphbolt/test_integration.py index e6c16567b8ba..fe3b5c0a2daa 100644 --- a/tests/python/pytorch/graphbolt/test_integration.py +++ b/tests/python/pytorch/graphbolt/test_integration.py @@ -48,7 +48,7 @@ def test_integration_link_prediction(): } feature_store = gb.BasicFeatureStore(features) datapipe = gb.ItemSampler(item_set, batch_size=4) - datapipe = datapipe.sample_uniform_negative(graph, 1) + datapipe = datapipe.sample_uniform_negative(graph, 2) fanouts = torch.LongTensor([1]) datapipe = datapipe.sample_neighbor(graph, [fanouts, fanouts], replace=True) datapipe = datapipe.transform(gb.exclude_seed_edges) @@ -62,23 +62,23 @@ def test_integration_link_prediction(): str( """MiniBatch(seed_nodes=None, sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 1, 2]), - indices=tensor([5, 4]), + indices=tensor([0, 4]), ), original_row_node_ids=tensor([5, 3, 1, 2, 0, 4]), original_edge_ids=None, original_column_node_ids=tensor([5, 3, 1, 2, 0, 4]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 1]), - indices=tensor([5]), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 1, 2]), + indices=tensor([5, 4]), ), original_row_node_ids=tensor([5, 3, 1, 2, 0, 4]), original_edge_ids=None, - original_column_node_ids=tensor([5, 3, 1, 2, 0]), + original_column_node_ids=tensor([5, 3, 1, 2, 0, 4]), )], positive_node_pairs=(tensor([0, 1, 1, 1]), tensor([2, 3, 3, 1])), - node_pairs_with_labels=((tensor([0, 1, 1, 1, 0, 1, 1, 1]), tensor([2, 3, 3, 1, 4, 4, 1, 4])), - tensor([1., 1., 1., 1., 0., 0., 0., 0.])), + node_pairs_with_labels=((tensor([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1]), tensor([2, 3, 3, 1, 4, 4, 1, 4, 0, 1, 1, 5])), + tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.])), node_pairs=(tensor([5, 3, 3, 3]), tensor([1, 2, 2, 3])), node_features={'feat': tensor([[0.5160, 0.2486], @@ -87,131 +87,120 @@ def test_integration_link_prediction(): [0.2109, 0.1089], [0.9634, 0.2294], [0.5503, 0.8223]])}, - negative_srcs=tensor([[5], - [3], - [3], 
- [3]]), - negative_node_pairs=(tensor([0, 1, 1, 1]), - tensor([4, 4, 1, 4])), - negative_dsts=tensor([[0], - [0], - [3], - [0]]), + negative_srcs=None, + negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 1, 1]), + tensor([4, 4, 1, 4, 0, 1, 1, 5])), + negative_dsts=tensor([[0, 0], + [3, 0], + [5, 3], + [3, 4]]), labels=None, input_nodes=tensor([5, 3, 1, 2, 0, 4]), edge_features=[{}, {}], compacted_node_pairs=(tensor([0, 1, 1, 1]), tensor([2, 3, 3, 1])), - compacted_negative_srcs=tensor([[0], - [1], - [1], - [1]]), - compacted_negative_dsts=tensor([[4], - [4], - [1], - [4]]), + compacted_negative_srcs=None, + compacted_negative_dsts=tensor([[4, 4], + [1, 4], + [0, 1], + [1, 5]]), blocks=[Block(num_src_nodes=6, num_dst_nodes=6, num_edges=2), - Block(num_src_nodes=6, num_dst_nodes=5, num_edges=1)], + Block(num_src_nodes=6, num_dst_nodes=6, num_edges=2)], )""" ), str( """MiniBatch(seed_nodes=None, - sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2]), - indices=tensor([1, 3]), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2, 3]), + indices=tensor([4, 1, 0]), ), - original_row_node_ids=tensor([3, 4, 0, 5, 1]), + original_row_node_ids=tensor([3, 4, 0, 1, 5, 2]), original_edge_ids=None, - original_column_node_ids=tensor([3, 4, 0, 5, 1]), + original_column_node_ids=tensor([3, 4, 0, 1, 5, 2]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2]), - indices=tensor([1, 3]), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2, 3]), + indices=tensor([4, 4, 0]), ), - original_row_node_ids=tensor([3, 4, 0, 5, 1]), + original_row_node_ids=tensor([3, 4, 0, 1, 5, 2]), original_edge_ids=None, - original_column_node_ids=tensor([3, 4, 0, 5, 1]), + original_column_node_ids=tensor([3, 4, 0, 1, 5, 2]), )], positive_node_pairs=(tensor([0, 1, 1, 2]), tensor([0, 0, 1, 1])), - node_pairs_with_labels=((tensor([0, 1, 1, 2, 0, 1, 1, 2]), tensor([0, 0, 1, 1, 1, 1, 3, 4])), - tensor([1., 1., 1., 1., 0., 0., 0., 0.])), + node_pairs_with_labels=((tensor([0, 1, 1, 2, 0, 0, 1, 1, 1, 1, 2, 2]), tensor([0, 0, 1, 1, 3, 4, 5, 4, 1, 0, 3, 4])), + tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.])), node_pairs=(tensor([3, 4, 4, 0]), tensor([3, 3, 4, 4])), node_features={'feat': tensor([[0.8672, 0.2276], [0.5503, 0.8223], [0.9634, 0.2294], + [0.6172, 0.7865], [0.5160, 0.2486], - [0.6172, 0.7865]])}, - negative_srcs=tensor([[3], - [4], - [4], - [0]]), - negative_node_pairs=(tensor([0, 1, 1, 2]), - tensor([1, 1, 3, 4])), - negative_dsts=tensor([[4], - [4], - [5], - [1]]), + [0.2109, 0.1089]])}, + negative_srcs=None, + negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 2, 2]), + tensor([3, 4, 5, 4, 1, 0, 3, 4])), + negative_dsts=tensor([[1, 5], + [2, 5], + [4, 3], + [1, 5]]), labels=None, - input_nodes=tensor([3, 4, 0, 5, 1]), + input_nodes=tensor([3, 4, 0, 1, 5, 2]), edge_features=[{}, {}], compacted_node_pairs=(tensor([0, 1, 1, 2]), tensor([0, 0, 1, 1])), - compacted_negative_srcs=tensor([[0], - [1], - [1], - [2]]), - compacted_negative_dsts=tensor([[1], - [1], - [3], - [4]]), - blocks=[Block(num_src_nodes=5, num_dst_nodes=5, num_edges=2), - Block(num_src_nodes=5, num_dst_nodes=5, num_edges=2)], + compacted_negative_srcs=None, + compacted_negative_dsts=tensor([[3, 4], + [5, 4], + [1, 0], + [3, 4]]), + blocks=[Block(num_src_nodes=6, num_dst_nodes=6, num_edges=3), + Block(num_src_nodes=6, num_dst_nodes=6, num_edges=3)], )""" ), str( """MiniBatch(seed_nodes=None, - 
sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1]), - indices=tensor([1]), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1, 2]), + indices=tensor([1, 0]), ), - original_row_node_ids=tensor([5, 4]), + original_row_node_ids=tensor([5, 4, 0, 1]), original_edge_ids=None, - original_column_node_ids=tensor([5, 4]), + original_column_node_ids=tensor([5, 4, 0, 1]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1]), - indices=tensor([1]), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1, 2]), + indices=tensor([1, 0]), ), - original_row_node_ids=tensor([5, 4]), + original_row_node_ids=tensor([5, 4, 0, 1]), original_edge_ids=None, - original_column_node_ids=tensor([5, 4]), + original_column_node_ids=tensor([5, 4, 0, 1]), )], positive_node_pairs=(tensor([0, 1]), tensor([0, 0])), - node_pairs_with_labels=((tensor([0, 1, 0, 1]), tensor([0, 0, 0, 0])), - tensor([1., 1., 0., 0.])), + node_pairs_with_labels=((tensor([0, 1, 0, 0, 1, 1]), tensor([0, 0, 2, 1, 2, 3])), + tensor([1., 1., 0., 0., 0., 0.])), node_pairs=(tensor([5, 4]), tensor([5, 5])), node_features={'feat': tensor([[0.5160, 0.2486], - [0.5503, 0.8223]])}, - negative_srcs=tensor([[5], - [4]]), - negative_node_pairs=(tensor([0, 1]), - tensor([0, 0])), - negative_dsts=tensor([[5], - [5]]), + [0.5503, 0.8223], + [0.9634, 0.2294], + [0.6172, 0.7865]])}, + negative_srcs=None, + negative_node_pairs=(tensor([0, 0, 1, 1]), + tensor([2, 1, 2, 3])), + negative_dsts=tensor([[0, 4], + [0, 1]]), labels=None, - input_nodes=tensor([5, 4]), + input_nodes=tensor([5, 4, 0, 1]), edge_features=[{}, {}], compacted_node_pairs=(tensor([0, 1]), tensor([0, 0])), - compacted_negative_srcs=tensor([[0], - [1]]), - compacted_negative_dsts=tensor([[0], - [0]]), - blocks=[Block(num_src_nodes=2, num_dst_nodes=2, num_edges=1), - Block(num_src_nodes=2, num_dst_nodes=2, num_edges=1)], + compacted_negative_srcs=None, + compacted_negative_dsts=tensor([[2, 1], + [2, 3]]), + blocks=[Block(num_src_nodes=4, num_dst_nodes=4, num_edges=2), + Block(num_src_nodes=4, num_dst_nodes=4, num_edges=2)], )""" ), ] From c40f54e643e952dab4c37600d110eede882a64ac Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 12 Jan 2024 08:54:11 +0800 Subject: [PATCH 05/22] [release] upgrade version to next one --- python/update_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/update_version.py b/python/update_version.py index e5dd2b3d8f32..00b4ef0c5f77 100644 --- a/python/update_version.py +++ b/python/update_version.py @@ -16,7 +16,7 @@ # (usually "aYYMMDD") # The environment variable DGL_VERSION_SUFFIX is the local version label # suffix for indicating CPU and CUDA versions as in PEP 440 (e.g. 
"+cu102") -__version__ = "2.0" + os.getenv("DGL_PRERELEASE", "") +__version__ = "2.1" + os.getenv("DGL_PRERELEASE", "") __version__ += os.getenv("DGL_VERSION_SUFFIX", "") print(__version__) From 47a1d6a85df1602833eedeb8cd3fb618c5e30cea Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Fri, 12 Jan 2024 08:58:00 +0800 Subject: [PATCH 06/22] [release] upgrade version to 2.1 on master (#6940) --- conda/dgl/meta.yaml | 2 +- include/dgl/runtime/c_runtime_api.h | 2 +- python/dgl/_ffi/libinfo.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conda/dgl/meta.yaml b/conda/dgl/meta.yaml index fdf912349d03..f6acfa9a8c1a 100644 --- a/conda/dgl/meta.yaml +++ b/conda/dgl/meta.yaml @@ -1,6 +1,6 @@ package: name: dgl{{ environ.get('DGL_PACKAGE_SUFFIX', '') }} - version: 2.0{{ environ.get('DGL_VERSION_SUFFIX', '') }} + version: 2.1{{ environ.get('DGL_VERSION_SUFFIX', '') }} source: git_rev: {{ environ.get('DGL_RELEASE_BRANCH', 'master') }} diff --git a/include/dgl/runtime/c_runtime_api.h b/include/dgl/runtime/c_runtime_api.h index d9e9d4edc054..6dc7e5d2d299 100644 --- a/include/dgl/runtime/c_runtime_api.h +++ b/include/dgl/runtime/c_runtime_api.h @@ -33,7 +33,7 @@ #endif // DGL version -#define DGL_VERSION "2.0" +#define DGL_VERSION "2.1" #ifdef __cplusplus extern "C" { diff --git a/python/dgl/_ffi/libinfo.py b/python/dgl/_ffi/libinfo.py index f82a6f700033..6f841e246bd5 100644 --- a/python/dgl/_ffi/libinfo.py +++ b/python/dgl/_ffi/libinfo.py @@ -105,4 +105,4 @@ def find_lib_path(name=None, search_path=None, optional=False): # We use the version of the incoming release for code # that is under development. # The following line is set by dgl/python/update_version.py -__version__ = "2.0" +__version__ = "2.1" From f86212edb5012f36f30ed8f79513b2b3f54cf1ea Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 12 Jan 2024 03:51:53 -0500 Subject: [PATCH 07/22] [GraphBolt][CUDA] Enable tests for weighted sampling (#6919) --- graphbolt/src/cuda/neighbor_sampler.cu | 58 ++++++++++++++++--- .../impl/test_fused_csc_sampling_graph.py | 30 ++++------ 2 files changed, 60 insertions(+), 28 deletions(-) diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu index 7cede6b5a471..96070303c1ce 100644 --- a/graphbolt/src/cuda/neighbor_sampler.cu +++ b/graphbolt/src/cuda/neighbor_sampler.cu @@ -43,7 +43,7 @@ template < __global__ void _ComputeRandoms( const int64_t num_edges, const indptr_t* const sliced_indptr, const indptr_t* const sub_indptr, const indices_t* const csr_rows, - const weights_t* const weights, const indices_t* const indices, + const weights_t* const sliced_weights, const indices_t* const indices, const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) { int64_t i = blockIdx.x * blockDim.x + threadIdx.x; const int stride = gridDim.x * blockDim.x; @@ -65,7 +65,8 @@ __global__ void _ComputeRandoms( } const auto rnd = curand_uniform(&rng); - const auto prob = weights ? weights[in_idx] : static_cast(1); + const auto prob = + sliced_weights ? sliced_weights[i] : static_cast(1); const auto exp_rnd = -__logf(rnd); const float_t adjusted_rnd = prob > 0 ? 
static_cast<float_t>(exp_rnd / prob)
                                      : std::numeric_limits<float_t>::infinity();
   }
 }
 
+struct IsPositive {
+  template <typename probs_t>
+  __host__ __device__ auto operator()(probs_t x) {
+    return x > 0;
+  }
+};
+
 template <typename indptr_t>
 struct MinInDegreeFanout {
   const indptr_t* in_degree;
@@ -152,7 +160,18 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
   auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
   auto sliced_indptr = std::get<1>(in_degree_and_sliced_indptr);
-  auto sub_indptr = ExclusiveCumSum(in_degree);
+  torch::Tensor sub_indptr;
+  // @todo mfbalin, refactor IndexSelectCSCImpl so that it does not have to take
+  // nodes as input
+  torch::optional<torch::Tensor> sliced_probs_or_mask;
+  if (probs_or_mask.has_value()) {
+    torch::Tensor sliced_probs_or_mask_tensor;
+    std::tie(sub_indptr, sliced_probs_or_mask_tensor) =
+        IndexSelectCSCImpl(indptr, probs_or_mask.value(), nodes);
+    sliced_probs_or_mask = sliced_probs_or_mask_tensor;
+  } else {
+    sub_indptr = ExclusiveCumSum(in_degree);
+  }
   if (fanouts.size() > 1) {
     torch::Tensor sliced_type_per_edge;
     std::tie(sub_indptr, sliced_type_per_edge) =
@@ -187,6 +206,29 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   AT_DISPATCH_INDEX_TYPES(
       indptr.scalar_type(), "SampleNeighborsIndptr", ([&] {
         using indptr_t = index_t;
+        if (probs_or_mask.has_value()) {  // Count nonzero probs into in_degree.
+          GRAPHBOLT_DISPATCH_ALL_TYPES(
+              probs_or_mask.value().scalar_type(),
+              "SampleNeighborsPositiveProbs", ([&] {
+                using probs_t = scalar_t;
+                auto is_nonzero = thrust::make_transform_iterator(
+                    sliced_probs_or_mask.value().data_ptr<probs_t>(),
+                    IsPositive{});
+                size_t tmp_storage_size = 0;
+                cub::DeviceSegmentedReduce::Sum(
+                    nullptr, tmp_storage_size, is_nonzero,
+                    in_degree.data_ptr<indptr_t>(), num_rows,
+                    sub_indptr.data_ptr<indptr_t>(),
+                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
+                auto tmp_storage =
+                    allocator.AllocateStorage<char>(tmp_storage_size);
+                cub::DeviceSegmentedReduce::Sum(
+                    tmp_storage.get(), tmp_storage_size, is_nonzero,
+                    in_degree.data_ptr<indptr_t>(), num_rows,
+                    sub_indptr.data_ptr<indptr_t>(),
+                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
+              }));
+        }
         thrust::counting_iterator<int64_t> iota(0);
         auto sampled_degree = thrust::make_transform_iterator(
             iota, MinInDegreeFanout{
@@ -246,10 +288,10 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                       probs_or_mask_scalar_type, "SampleNeighborsProbs",
                       ([&] {
                         using probs_t = scalar_t;
-                        probs_t* probs_ptr = nullptr;
-                        if (probs_or_mask.has_value()) {
-                          probs_ptr =
-                              probs_or_mask.value().data_ptr<probs_t>();
+                        probs_t* sliced_probs_ptr = nullptr;
+                        if (sliced_probs_or_mask.has_value()) {
+                          sliced_probs_ptr = sliced_probs_or_mask.value()
+                                                 .data_ptr<probs_t>();
                         }
                         const indices_t* indices_ptr =
                             layer ? indices.data_ptr<indices_t>() : nullptr;
@@ -261,7 +303,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                             _ComputeRandoms, grid, block, 0, stream, num_edges,
                             sliced_indptr.data_ptr<indptr_t>(),
                             sub_indptr.data_ptr<indptr_t>(),
-                            coo_rows.data_ptr<indices_t>(), probs_ptr,
+                            coo_rows.data_ptr<indices_t>(), sliced_probs_ptr,
                             indices_ptr, random_seed, randoms.get(),
                             edge_id_segments.get());
                       }));
diff --git a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py
index 34a508deb728..3e13a1ce2f5a 100644
--- a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py
+++ b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py
@@ -1797,10 +1797,6 @@ def test_sample_neighbors_fanouts(
     assert subgraph.sampled_csc["n2:e2:n1"].indptr.size(0) == 2
 
 
-@unittest.skipIf(
-    F._default_context_str == "gpu",
-    reason="Sampling with replacement not yet supported on GPU.",
-)
 @pytest.mark.parametrize(
     "replace, expected_sampled_num1, expected_sampled_num2",
     [(False, 2, 2), (True, 4, 4)],
@@ -1808,6 +1804,8 @@ def test_sample_neighbors_fanouts(
 def test_sample_neighbors_replace(
     replace, expected_sampled_num1, expected_sampled_num2
 ):
+    if F._default_context_str == "gpu" and replace == True:
+        pytest.skip("Sampling with replacement not yet supported on GPU.")
     """Original graph in COO:
     "n1:e1:n2":[0, 0, 1, 1, 1], [0, 2, 0, 1, 2]
     "n2:e2:n1":[0, 0, 1, 2], [0, 1, 1 ,0]
@@ -1966,14 +1964,12 @@ def test_sample_neighbors_return_eids_hetero(labor):
     )
 
 
-@unittest.skipIf(
-    F._default_context_str == "gpu",
-    reason="Sampling with replacement not yet supported on GPU.",
-)
 @pytest.mark.parametrize("replace", [True, False])
 @pytest.mark.parametrize("labor", [False, True])
 @pytest.mark.parametrize("probs_name", ["weight", "mask"])
 def test_sample_neighbors_probs(replace, labor, probs_name):
+    if F._default_context_str == "gpu" and replace == True:
+        pytest.skip("Sampling with replacement not yet supported on GPU.")
     """Original graph in COO:
     1   0   1   0   1
     1   0   1   1   0
@@ -2020,10 +2016,6 @@ def test_sample_neighbors_probs(replace, labor, probs_name):
             assert sampled_num == 4
 
 
-@unittest.skipIf(
-    F._default_context_str == "gpu",
-    reason="Sampling with replacement not yet supported on GPU.",
-)
 @pytest.mark.parametrize("replace", [True, False])
 @pytest.mark.parametrize("labor", [False, True])
 @pytest.mark.parametrize(
     "probs_or_mask",
    [
@@ -2034,6 +2026,8 @@ def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask):
     ],
 )
 def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask):
+    if F._default_context_str == "gpu" and replace == True:
+        pytest.skip("Sampling with replacement not yet supported on GPU.")
     # Initialize data.
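     # All edge probabilities/mask values in this test are zero, so every
     # sampled subgraph must be empty (asserted via `sampled_num == 0` below).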
total_num_nodes = 5 total_num_edges = 12 @@ -2065,10 +2059,6 @@ def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask): assert sampled_num == 0 -@unittest.skipIf( - F._default_context_str == "gpu", - reason="Sampling with replacement not yet supported on GPU.", -) @pytest.mark.parametrize("replace", [False, True]) @pytest.mark.parametrize("labor", [False, True]) @pytest.mark.parametrize( @@ -2089,6 +2079,8 @@ def test_sample_neighbors_zero_probs(replace, labor, probs_or_mask): ], ) def test_sample_neighbors_homo_pick_number(fanouts, replace, labor, probs_name): + if F._default_context_str == "gpu" and replace == True: + pytest.skip("Sampling with replacement not yet supported on GPU.") """Original graph in COO: 1 1 1 1 1 1 0 0 0 0 0 0 @@ -2150,10 +2142,6 @@ def test_sample_neighbors_homo_pick_number(fanouts, replace, labor, probs_name): assert sampled_num == min(fanouts[0], 6) -@unittest.skipIf( - F._default_context_str == "gpu", - reason="Sampling with replacement not yet supported on GPU.", -) @pytest.mark.parametrize("replace", [False, True]) @pytest.mark.parametrize("labor", [False, True]) @pytest.mark.parametrize( @@ -2171,6 +2159,8 @@ def test_sample_neighbors_homo_pick_number(fanouts, replace, labor, probs_name): def test_sample_neighbors_hetero_pick_number( fanouts, replace, labor, probs_name ): + if F._default_context_str == "gpu" and replace == True: + pytest.skip("Sampling with replacement not yet supported on GPU.") # Initialize data. total_num_nodes = 10 total_num_edges = 9 From 3795a006b91c94291b911f0daa261c0598d7ffd8 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 12 Jan 2024 05:00:24 -0500 Subject: [PATCH 08/22] [GraphBolt][CUDA] Refactor codebase with `CUB_CALL` macro (#6870) --- graphbolt/src/cuda/common.h | 34 ++++-- graphbolt/src/cuda/csr_to_coo.cu | 18 +-- graphbolt/src/cuda/cumsum.cu | 19 +--- graphbolt/src/cuda/index_select_csc_impl.cu | 56 +++------- graphbolt/src/cuda/index_select_impl.cu | 17 +-- graphbolt/src/cuda/insubgraph.cu | 2 - graphbolt/src/cuda/isin.cu | 9 +- graphbolt/src/cuda/neighbor_sampler.cu | 104 +++++------------- graphbolt/src/cuda/sampling_utils.cu | 29 ++--- graphbolt/src/cuda/sort_impl.cu | 25 +---- graphbolt/src/cuda/unique_and_compact_impl.cu | 78 +++++-------- 11 files changed, 128 insertions(+), 263 deletions(-) diff --git a/graphbolt/src/cuda/common.h b/graphbolt/src/cuda/common.h index b2d5991b3f50..cc9980dccda5 100644 --- a/graphbolt/src/cuda/common.h +++ b/graphbolt/src/cuda/common.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -82,15 +83,34 @@ inline bool is_zero(dim3 size) { #define CUDA_CALL(func) C10_CUDA_CHECK((func)) -#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \ - { \ - if (!graphbolt::cuda::is_zero((nblks)) && \ - !graphbolt::cuda::is_zero((nthrs))) { \ - (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); \ - } \ +#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \ + { \ + if (!graphbolt::cuda::is_zero((nblks)) && \ + !graphbolt::cuda::is_zero((nthrs))) { \ + auto stream = graphbolt::cuda::GetCurrentStream(); \ + (kernel)<<<(nblks), (nthrs), (shmem), stream>>>(__VA_ARGS__); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + } \ } +#define CUB_CALL(fn, ...) 
                                                                          \
  {                                                                          \
    auto allocator = graphbolt::cuda::GetAllocator();                        \
    auto stream = graphbolt::cuda::GetCurrentStream();                       \
    size_t workspace_size = 0;                                               \
    CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream));        \
    auto workspace = allocator.AllocateStorage<char>(workspace_size);        \
    CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
  }
+
+#define THRUST_CALL(fn, ...)                                                 \
+  [&] {                                                                      \
+    auto allocator = graphbolt::cuda::GetAllocator();                        \
+    auto stream = graphbolt::cuda::GetCurrentStream();                       \
+    const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); \
+    return thrust::fn(exec_policy, __VA_ARGS__);                             \
+  }()
+
 /**
  * @brief This class is designed to handle the copy operation of a single
  * scalar_t item from a given CUDA device pointer. Later, if the object is cast
diff --git a/graphbolt/src/cuda/csr_to_coo.cu b/graphbolt/src/cuda/csr_to_coo.cu
index 524f56eb058e..42765986bb25 100644
--- a/graphbolt/src/cuda/csr_to_coo.cu
+++ b/graphbolt/src/cuda/csr_to_coo.cu
@@ -39,8 +39,6 @@ struct AdjacentDifference {
 };
 
 torch::Tensor CSRToCOO(torch::Tensor indptr, torch::ScalarType output_dtype) {
-  auto allocator = cuda::GetAllocator();
-  auto stream = cuda::GetCurrentStream();
   const auto num_rows = indptr.size(0) - 1;
   thrust::counting_iterator<int64_t> iota(0);
@@ -69,19 +67,9 @@ torch::Tensor CSRToCOO(torch::Tensor indptr, torch::ScalarType output_dtype) {
           constexpr int64_t max_copy_at_once =
               std::numeric_limits<int32_t>::max();
 
           for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
-            std::size_t tmp_storage_size = 0;
-            CUDA_CALL(cub::DeviceCopy::Batched(
-                nullptr, tmp_storage_size, input_buffer + i,
-                output_buffer + i, buffer_sizes + i,
-                std::min(num_rows - i, max_copy_at_once), stream));
-
-            auto tmp_storage =
-                allocator.AllocateStorage<char>(tmp_storage_size);
-
-            CUDA_CALL(cub::DeviceCopy::Batched(
-                tmp_storage.get(), tmp_storage_size, input_buffer + i,
-                output_buffer + i, buffer_sizes + i,
-                std::min(num_rows - i, max_copy_at_once), stream));
+            CUB_CALL(
+                DeviceCopy::Batched, input_buffer + i, output_buffer + i,
+                buffer_sizes + i, std::min(num_rows - i, max_copy_at_once));
           }
         }));
   return csr_rows;
diff --git a/graphbolt/src/cuda/cumsum.cu b/graphbolt/src/cuda/cumsum.cu
index ed50e6914e84..3537697ac4c7 100644
--- a/graphbolt/src/cuda/cumsum.cu
+++ b/graphbolt/src/cuda/cumsum.cu
@@ -12,21 +12,14 @@ namespace graphbolt {
 namespace ops {
 
 torch::Tensor ExclusiveCumSum(torch::Tensor input) {
-  auto allocator = cuda::GetAllocator();
-  auto stream = cuda::GetCurrentStream();
   auto result = torch::empty_like(input);
 
-  AT_DISPATCH_INTEGRAL_TYPES(
-      input.scalar_type(), "ExclusiveCumSum", ([&] {
-        size_t tmp_storage_size = 0;
-        cub::DeviceScan::ExclusiveSum(
-            nullptr, tmp_storage_size, input.data_ptr<scalar_t>(),
-            result.data_ptr<scalar_t>(), input.size(0), stream);
-        auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-        cub::DeviceScan::ExclusiveSum(
-            tmp_storage.get(), tmp_storage_size, input.data_ptr<scalar_t>(),
-            result.data_ptr<scalar_t>(), input.size(0), stream);
-      }));
+  AT_DISPATCH_INTEGRAL_TYPES(input.scalar_type(), "ExclusiveCumSum", ([&] {
+                               CUB_CALL(
+                                   DeviceScan::ExclusiveSum,
+                                   input.data_ptr<scalar_t>(),
+                                   result.data_ptr<scalar_t>(), input.size(0));
+                             }));
 
   return result;
 }
diff --git a/graphbolt/src/cuda/index_select_csc_impl.cu b/graphbolt/src/cuda/index_select_csc_impl.cu
index c24f2a7f0d07..da0b80584482 100644
--- a/graphbolt/src/cuda/index_select_csc_impl.cu
+++ b/graphbolt/src/cuda/index_select_csc_impl.cu
@@ -5,11 +5,10 @@
  * @brief Index select csc operator implementation on CUDA.
  */
 #include 
-#include 
 #include 
-#include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -88,7 +87,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
     torch::Tensor indices, const int64_t num_nodes,
     const indptr_t* const in_degree, const indptr_t* const sliced_indptr,
     const int64_t* const perm, torch::TensorOptions nodes_options,
-    torch::ScalarType indptr_scalar_type, cudaStream_t stream) {
+    torch::ScalarType indptr_scalar_type) {
   auto allocator = cuda::GetAllocator();
   thrust::counting_iterator<int64_t> iota(0);
@@ -109,14 +108,9 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
         output_indptr.data_ptr<indptr_t>(), output_indptr_aligned.get());
     thrust::tuple<indptr_t, indptr_t> zero_value{};
     // Compute the prefix sum over actual and modified indegrees.
-    size_t tmp_storage_size = 0;
-    CUDA_CALL(cub::DeviceScan::ExclusiveScan(
-        nullptr, tmp_storage_size, modified_in_degree, output_indptr_pair,
-        PairSum{}, zero_value, num_nodes + 1, stream));
-    auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-    CUDA_CALL(cub::DeviceScan::ExclusiveScan(
-        tmp_storage.get(), tmp_storage_size, modified_in_degree,
-        output_indptr_pair, PairSum{}, zero_value, num_nodes + 1, stream));
+    CUB_CALL(
+        DeviceScan::ExclusiveScan, modified_in_degree, output_indptr_pair,
+        PairSum{}, zero_value, num_nodes + 1);
   }
 
   // Copy the actual total number of edges.
@@ -138,7 +132,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
   // Perform the actual copying, of the indices array into
   // output_indices in an aligned manner.
   CUDA_KERNEL_CALL(
-      _CopyIndicesAlignedKernel, grid, block, 0, stream,
+      _CopyIndicesAlignedKernel, grid, block, 0,
       static_cast<indptr_t>(edge_count_aligned), num_nodes, sliced_indptr,
       output_indptr.data_ptr<indptr_t>(), output_indptr_aligned.get(),
      reinterpret_cast<indices_t*>(indices.data_ptr()),
@@ -151,7 +145,6 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCImpl(
   // Sorting nodes so that accesses over PCI-e are more regular.
   const auto sorted_idx =
      Sort(nodes, cuda::NumberOfBits(indptr.size(0) - 1)).second;
-  auto stream = cuda::GetCurrentStream();
 
   const int64_t num_nodes = nodes.size(0);
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
@@ -167,7 +160,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCImpl(
           return UVAIndexSelectCSCCopyIndices<indptr_t, indices_t>(
               indices, num_nodes, in_degree, sliced_indptr,
               sorted_idx.data_ptr<int64_t>(), nodes.options(),
-              indptr.scalar_type(), stream);
+              indptr.scalar_type());
         }));
   }));
 }
@@ -191,9 +184,7 @@ template <typename indptr_t, typename indices_t>
 void IndexSelectCSCCopyIndices(
     const int64_t num_nodes, indices_t* const indices,
     indptr_t* const sliced_indptr, const indptr_t* const in_degree,
-    indptr_t* const output_indptr, indices_t* const output_indices,
-    cudaStream_t stream) {
-  auto allocator = cuda::GetAllocator();
+    indptr_t* const output_indptr, indices_t* const output_indices) {
   thrust::counting_iterator<int64_t> iota(0);
 
   auto input_buffer_it = thrust::make_transform_iterator(
@@ -206,21 +197,14 @@ void IndexSelectCSCCopyIndices(
 
   // Performs the copy from indices into output_indices.
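   // Issue the batched copies in bounded chunks (at most max_copy_at_once
   // buffers per launch) so each cub::DeviceMemcpy::Batched call stays within
   // the range its batch-count parameter can represent.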
   for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
-    size_t tmp_storage_size = 0;
-    CUDA_CALL(cub::DeviceMemcpy::Batched(
-        nullptr, tmp_storage_size, input_buffer_it + i, output_buffer_it + i,
-        buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once), stream));
-    auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-    CUDA_CALL(cub::DeviceMemcpy::Batched(
-        tmp_storage.get(), tmp_storage_size, input_buffer_it + i,
-        output_buffer_it + i, buffer_sizes + i,
-        std::min(num_nodes - i, max_copy_at_once), stream));
+    CUB_CALL(
+        DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
+        buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once));
   }
 }
 
 std::tuple<torch::Tensor, torch::Tensor> DeviceIndexSelectCSCImpl(
     torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) {
-  auto stream = cuda::GetCurrentStream();
   const int64_t num_nodes = nodes.size(0);
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
   return AT_DISPATCH_INTEGRAL_TYPES(
@@ -234,17 +218,10 @@ std::tuple<torch::Tensor, torch::Tensor> DeviceIndexSelectCSCImpl(
         torch::Tensor output_indptr = torch::empty(
             num_nodes + 1, nodes.options().dtype(indptr.scalar_type()));
 
-        {  // Compute the output indptr, output_indptr.
-          size_t tmp_storage_size = 0;
-          CUDA_CALL(cub::DeviceScan::ExclusiveSum(
-              nullptr, tmp_storage_size, in_degree,
-              output_indptr.data_ptr<indptr_t>(), num_nodes + 1, stream));
-          auto allocator = cuda::GetAllocator();
-          auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-          CUDA_CALL(cub::DeviceScan::ExclusiveSum(
-              tmp_storage.get(), tmp_storage_size, in_degree,
-              output_indptr.data_ptr<indptr_t>(), num_nodes + 1, stream));
-        }
+        // Compute the output indptr, output_indptr.
+        CUB_CALL(
+            DeviceScan::ExclusiveSum, in_degree,
+            output_indptr.data_ptr<indptr_t>(), num_nodes + 1);
 
         // Number of edges being copied.
         auto edge_count =
@@ -259,8 +236,7 @@ std::tuple<torch::Tensor, torch::Tensor> DeviceIndexSelectCSCImpl(
           IndexSelectCSCCopyIndices(
               num_nodes, reinterpret_cast<indices_t*>(indices.data_ptr()),
               sliced_indptr, in_degree, output_indptr.data_ptr<indptr_t>(),
-              reinterpret_cast<indices_t*>(output_indices.data_ptr()),
-              stream);
+              reinterpret_cast<indices_t*>(output_indices.data_ptr()));
         }));
         return std::make_tuple(output_indptr, output_indices);
       }));
diff --git a/graphbolt/src/cuda/index_select_impl.cu b/graphbolt/src/cuda/index_select_impl.cu
index 0c937e9030f0..af2c9fe96a24 100644
--- a/graphbolt/src/cuda/index_select_impl.cu
+++ b/graphbolt/src/cuda/index_select_impl.cu
@@ -5,13 +5,8 @@
  * @brief Index select operator implementation on CUDA.
  */
 #include 
-#include 
 #include 
 
-#include 
-#include 
-#include 
-#include 
 #include 
 
 #include "./common.h"
@@ -124,14 +119,12 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
   const IdType* index_sorted_ptr = sorted_index.data_ptr<IdType>();
   const int64_t* permutation_ptr = permutation.data_ptr<int64_t>();
 
-  auto stream = cuda::GetCurrentStream();
-
   if (aligned_feature_size == 1) {
     // Use a single thread to process each output row to avoid wasting threads.
     const int num_threads = cuda::FindNumThreads(return_len);
     const int num_blocks = (return_len + num_threads - 1) / num_threads;
     CUDA_KERNEL_CALL(
-        IndexSelectSingleKernel, num_blocks, num_threads, 0, stream, input_ptr,
+        IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr,
         input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr);
   } else {
     dim3 block(512, 1);
@@ -144,15 +137,15 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
       // When feature size is smaller than GPU cache line size, use unaligned
      // version for less SM usage, which is more resource efficient.
       CUDA_KERNEL_CALL(
-          IndexSelectMultiKernel, grid, block, 0, stream, input_ptr, input_len,
+          IndexSelectMultiKernel, grid, block, 0, input_ptr, input_len,
           aligned_feature_size, index_sorted_ptr, return_len, ret_ptr,
           permutation_ptr);
     } else {
       // Use aligned version to improve the memory access pattern.
       CUDA_KERNEL_CALL(
-          IndexSelectMultiKernelAligned, grid, block, 0, stream, input_ptr,
-          input_len, aligned_feature_size, index_sorted_ptr, return_len,
-          ret_ptr, permutation_ptr);
+          IndexSelectMultiKernelAligned, grid, block, 0, input_ptr, input_len,
+          aligned_feature_size, index_sorted_ptr, return_len, ret_ptr,
+          permutation_ptr);
     }
   }
diff --git a/graphbolt/src/cuda/insubgraph.cu b/graphbolt/src/cuda/insubgraph.cu
index 33473aad8bbc..d3526486e6dd 100644
--- a/graphbolt/src/cuda/insubgraph.cu
+++ b/graphbolt/src/cuda/insubgraph.cu
@@ -8,8 +8,6 @@
 #include 
 #include 
 
-#include 
-
 #include "./common.h"
 
 namespace graphbolt {
diff --git a/graphbolt/src/cuda/isin.cu b/graphbolt/src/cuda/isin.cu
index dca4a594afaa..d6b36696f5f5 100644
--- a/graphbolt/src/cuda/isin.cu
+++ b/graphbolt/src/cuda/isin.cu
@@ -7,8 +7,6 @@
 #include 
 #include 
 
-#include 
-
 #include "./common.h"
 
 namespace graphbolt {
@@ -16,15 +14,12 @@ namespace ops {
 
 torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) {
   auto sorted_test_elements = Sort(test_elements);
-  auto allocator = cuda::GetAllocator();
-  auto stream = cuda::GetCurrentStream();
-  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
   auto result = torch::empty_like(elements, torch::kBool);
 
   AT_DISPATCH_INTEGRAL_TYPES(
       elements.scalar_type(), "IsInOperation", ([&] {
-        thrust::binary_search(
-            exec_policy, sorted_test_elements.data_ptr<scalar_t>(),
+        THRUST_CALL(
+            binary_search, sorted_test_elements.data_ptr<scalar_t>(),
             sorted_test_elements.data_ptr<scalar_t>() +
                 sorted_test_elements.size(0),
             elements.data_ptr<scalar_t>(),
diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu
index 96070303c1ce..e40a6c909d45 100644
--- a/graphbolt/src/cuda/neighbor_sampler.cu
+++ b/graphbolt/src/cuda/neighbor_sampler.cu
@@ -5,12 +5,10 @@
  * @brief Index select operator implementation on CUDA.
  */
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -18,7 +16,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -142,7 +139,6 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   // are all resident on the GPU. If not, it is better to first extract them
  // before calling this function.
   auto allocator = cuda::GetAllocator();
-  const auto stream = cuda::GetCurrentStream();
   auto num_rows = nodes.size(0);
   auto fanouts_pinned = torch::empty(
       fanouts.size(),
@@ -156,7 +152,8 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
   CUDA_CALL(cudaMemcpyAsync(
       fanouts_device.get(), fanouts_pinned_ptr,
-      sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice, stream));
+      sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice,
+      cuda::GetCurrentStream()));
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
   auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
   auto sliced_indptr = std::get<1>(in_degree_and_sliced_indptr);
@@ -185,14 +182,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
       c10::TensorOptions().dtype(in_degree.scalar_type()).pinned_memory(true));
   AT_DISPATCH_INDEX_TYPES(
       indptr.scalar_type(), "SampleNeighborsInDegree", ([&] {
-        size_t tmp_storage_size = 0;
-        cub::DeviceReduce::Max(
-            nullptr, tmp_storage_size, in_degree.data_ptr<index_t>(),
-            max_in_degree.data_ptr<index_t>(), num_rows, stream);
-        auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-        cub::DeviceReduce::Max(
-            tmp_storage.get(), tmp_storage_size, in_degree.data_ptr<index_t>(),
-            max_in_degree.data_ptr<index_t>(), num_rows, stream);
+        CUB_CALL(
+            DeviceReduce::Max, in_degree.data_ptr<index_t>(),
+            max_in_degree.data_ptr<index_t>(), num_rows);
       }));
   auto coo_rows = CSRToCOO(sub_indptr, indices.scalar_type());
   const auto num_edges = coo_rows.size(0);
@@ -214,19 +206,11 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                 auto is_nonzero = thrust::make_transform_iterator(
                     sliced_probs_or_mask.value().data_ptr<probs_t>(),
                     IsPositive{});
-                size_t tmp_storage_size = 0;
-                cub::DeviceSegmentedReduce::Sum(
-                    nullptr, tmp_storage_size, is_nonzero,
+                CUB_CALL(
+                    DeviceSegmentedReduce::Sum, is_nonzero,
                     in_degree.data_ptr<indptr_t>(), num_rows,
                     sub_indptr.data_ptr<indptr_t>(),
-                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
-                auto tmp_storage =
-                    allocator.AllocateStorage<char>(tmp_storage_size);
-                cub::DeviceSegmentedReduce::Sum(
-                    tmp_storage.get(), tmp_storage_size, is_nonzero,
-                    in_degree.data_ptr<indptr_t>(), num_rows,
-                    sub_indptr.data_ptr<indptr_t>(),
-                    sub_indptr.data_ptr<indptr_t>() + 1, stream);
+                    sub_indptr.data_ptr<indptr_t>() + 1);
               }));
         }
         thrust::counting_iterator<int64_t> iota(0);
         auto sampled_degree = thrust::make_transform_iterator(
             iota, MinInDegreeFanout{
                 in_degree.data_ptr<indptr_t>(), fanouts_device.get(),
                 fanouts.size()});
-        {  // Compute output_indptr.
-          size_t tmp_storage_size = 0;
-          cub::DeviceScan::ExclusiveSum(
-              nullptr, tmp_storage_size, sampled_degree,
-              output_indptr.data_ptr<indptr_t>(), num_rows + 1, stream);
-          auto tmp_storage = allocator.AllocateStorage<char>(tmp_storage_size);
-          cub::DeviceScan::ExclusiveSum(
-              tmp_storage.get(), tmp_storage_size, sampled_degree,
-              output_indptr.data_ptr<indptr_t>(), num_rows + 1, stream);
-        }
+        // Compute output_indptr.
+        CUB_CALL(
+            DeviceScan::ExclusiveSum, sampled_degree,
+            output_indptr.data_ptr<indptr_t>(), num_rows + 1);
 
         auto num_sampled_edges =
             cuda::CopyScalar{output_indptr.data_ptr<indptr_t>() + num_rows};
@@ -300,8 +278,8 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
                             (num_edges + BLOCK_SIZE - 1) / BLOCK_SIZE);
                         // Compute row and random number pairs.
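                        // Each edge draws the key exp_rnd / prob, i.e. an
                        // Exp(prob) variate; keeping the smallest keys per row
                        // in the segmented sort below realizes weighted
                        // sampling without replacement.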
CUDA_KERNEL_CALL( - _ComputeRandoms, grid, block, 0, stream, - num_edges, sliced_indptr.data_ptr(), + _ComputeRandoms, grid, block, 0, num_edges, + sliced_indptr.data_ptr(), sub_indptr.data_ptr(), coo_rows.data_ptr(), sliced_probs_ptr, indices_ptr, random_seed, randoms.get(), @@ -312,21 +290,12 @@ c10::intrusive_ptr SampleNeighbors( // Sort the random numbers along with edge ids, after // sorting the first fanout elements of each row will // give us the sampled edges. - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceSegmentedSort::SortPairs( - nullptr, tmp_storage_size, randoms.get(), + CUB_CALL( + DeviceSegmentedSort::SortPairs, randoms.get(), randoms_sorted.get(), edge_id_segments.get(), sorted_edge_id_segments.get(), num_edges, num_rows, sub_indptr.data_ptr(), - sub_indptr.data_ptr() + 1, stream)); - auto tmp_storage = - allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceSegmentedSort::SortPairs( - tmp_storage.get(), tmp_storage_size, randoms.get(), - randoms_sorted.get(), edge_id_segments.get(), - sorted_edge_id_segments.get(), num_edges, num_rows, - sub_indptr.data_ptr(), - sub_indptr.data_ptr() + 1, stream)); + sub_indptr.data_ptr() + 1); picked_eids = torch::empty( static_cast(num_sampled_edges), @@ -341,19 +310,11 @@ c10::intrusive_ptr SampleNeighbors( auto sampled_segment_end_it = thrust::make_transform_iterator( iota, SegmentEndFunc{ sub_indptr.data_ptr(), sampled_degree}); - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceSegmentedSort::SortKeys( - nullptr, tmp_storage_size, edge_id_segments.get(), - sorted_edge_id_segments.get(), picked_eids.size(0), - num_rows, sub_indptr.data_ptr(), - sampled_segment_end_it, stream)); - auto tmp_storage = - allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceSegmentedSort::SortKeys( - tmp_storage.get(), tmp_storage_size, edge_id_segments.get(), + CUB_CALL( + DeviceSegmentedSort::SortKeys, edge_id_segments.get(), sorted_edge_id_segments.get(), picked_eids.size(0), num_rows, sub_indptr.data_ptr(), - sampled_segment_end_it, stream)); + sampled_segment_end_it); } auto input_buffer_it = thrust::make_transform_iterator( @@ -370,17 +331,10 @@ c10::intrusive_ptr SampleNeighbors( // Copy the sampled edge ids into picked_eids tensor. 
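The segmented sort of random keys above realizes per-node sampling without replacement: after sorting each node's segment, the first `fanout` entries are the sampled edges. A CPU sketch of the same idea (not the CUDA implementation); the batched copy loop below then gathers the picked ids in bounded chunks.

```
import torch

torch.manual_seed(0)
sub_indptr = torch.tensor([0, 5, 5, 8])  # 3 seed nodes, 8 candidate edges
fanout = 2
picked = []
for i in range(sub_indptr.numel() - 1):
    begin, end = sub_indptr[i].item(), sub_indptr[i + 1].item()
    keys = torch.rand(end - begin)      # one random key per candidate edge
    order = torch.argsort(keys)         # sort keys within the segment
    picked.append(torch.arange(begin, end)[order][:fanout])
picked_eids = torch.cat(picked)
```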
for (int64_t i = 0; i < num_rows; i += max_copy_at_once) { - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceCopy::Batched( - nullptr, tmp_storage_size, input_buffer_it + i, - output_buffer_it + i, sampled_degree + i, - std::min(num_rows - i, max_copy_at_once), stream)); - auto tmp_storage = - allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceCopy::Batched( - tmp_storage.get(), tmp_storage_size, input_buffer_it + i, + CUB_CALL( + DeviceCopy::Batched, input_buffer_it + i, output_buffer_it + i, sampled_degree + i, - std::min(num_rows - i, max_copy_at_once), stream)); + std::min(num_rows - i, max_copy_at_once)); } })); @@ -392,10 +346,8 @@ c10::intrusive_ptr SampleNeighbors( AT_DISPATCH_INDEX_TYPES( indices.scalar_type(), "SampleNeighborsOutputIndices", ([&] { using indices_t = index_t; - const auto exec_policy = - thrust::cuda::par_nosync(allocator).on(stream); - thrust::gather( - exec_policy, picked_eids.data_ptr(), + THRUST_CALL( + gather, picked_eids.data_ptr(), picked_eids.data_ptr() + picked_eids.size(0), indices.data_ptr(), output_indices.data_ptr()); @@ -412,10 +364,8 @@ c10::intrusive_ptr SampleNeighbors( picked_eids.options().dtype(types.scalar_type())); AT_DISPATCH_INTEGRAL_TYPES( types.scalar_type(), "SampleNeighborsOutputTypePerEdge", ([&] { - const auto exec_policy = - thrust::cuda::par_nosync(allocator).on(stream); - thrust::gather( - exec_policy, picked_eids.data_ptr(), + THRUST_CALL( + gather, picked_eids.data_ptr(), picked_eids.data_ptr() + picked_eids.size(0), types.data_ptr(), output_type_per_edge.value().data_ptr()); diff --git a/graphbolt/src/cuda/sampling_utils.cu b/graphbolt/src/cuda/sampling_utils.cu index 0589bd47e643..4db17a5e4436 100644 --- a/graphbolt/src/cuda/sampling_utils.cu +++ b/graphbolt/src/cuda/sampling_utils.cu @@ -4,7 +4,7 @@ * @file cuda/sampling_utils.cu * @brief Sampling utility function implementations on CUDA. */ -#include +#include #include #include @@ -36,9 +36,6 @@ struct SliceFunc { // Returns (indptr[nodes + 1] - indptr[nodes], indptr[nodes]) std::tuple SliceCSCIndptr( torch::Tensor indptr, torch::Tensor nodes) { - auto allocator = cuda::GetAllocator(); - const auto exec_policy = - thrust::cuda::par_nosync(allocator).on(cuda::GetCurrentStream()); const int64_t num_nodes = nodes.size(0); // Read indptr only once in case it is pinned and access is slow. 
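As the comment in `SliceCSCIndptr` above documents, the function returns per-node degrees together with the nodes' start offsets into `indptr`. In PyTorch indexing terms, the returned pair is simply:

```
import torch

indptr = torch.tensor([0, 2, 2, 5, 9])
nodes = torch.tensor([2, 0])
in_degree = indptr[nodes + 1] - indptr[nodes]  # tensor([3, 2])
sliced_indptr = indptr[nodes]                  # tensor([2, 0])
```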
auto sliced_indptr = @@ -53,8 +50,8 @@ std::tuple SliceCSCIndptr( AT_DISPATCH_INDEX_TYPES( nodes.scalar_type(), "IndexSelectCSCNodes", ([&] { using nodes_t = index_t; - thrust::for_each( - exec_policy, iota, iota + num_nodes, + THRUST_CALL( + for_each, iota, iota + num_nodes, SliceFunc{ nodes.data_ptr(), indptr.data_ptr(), in_degree.data_ptr(), @@ -92,9 +89,6 @@ std::tuple SliceCSCIndptrHetero( auto new_sub_indptr = torch::empty(num_rows + 1, sub_indptr.options()); auto new_indegree = torch::empty(num_rows + 2, sub_indptr.options()); auto new_sliced_indptr = torch::empty(num_rows, sliced_indptr.options()); - auto allocator = cuda::GetAllocator(); - auto stream = cuda::GetCurrentStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); thrust::counting_iterator iota(0); AT_DISPATCH_INTEGRAL_TYPES( sub_indptr.scalar_type(), "SliceCSCIndptrHeteroIndptr", ([&] { @@ -102,8 +96,8 @@ std::tuple SliceCSCIndptrHetero( AT_DISPATCH_INTEGRAL_TYPES( etypes.scalar_type(), "SliceCSCIndptrHeteroTypePerEdge", ([&] { using etype_t = scalar_t; - thrust::for_each( - exec_policy, iota, iota + num_rows, + THRUST_CALL( + for_each, iota, iota + num_rows, EdgeTypeSearch{ sub_indptr.data_ptr(), sliced_indptr.data_ptr(), @@ -111,17 +105,10 @@ std::tuple SliceCSCIndptrHetero( new_sub_indptr.data_ptr(), new_sliced_indptr.data_ptr()}); })); - size_t tmp_storage_size = 0; - cub::DeviceAdjacentDifference::SubtractLeftCopy( - nullptr, tmp_storage_size, new_sub_indptr.data_ptr(), - new_indegree.data_ptr(), num_rows + 1, cub::Difference{}, - stream); - auto tmp_storage = allocator.AllocateStorage(tmp_storage_size); - cub::DeviceAdjacentDifference::SubtractLeftCopy( - tmp_storage.get(), tmp_storage_size, + CUB_CALL( + DeviceAdjacentDifference::SubtractLeftCopy, new_sub_indptr.data_ptr(), - new_indegree.data_ptr(), num_rows + 1, cub::Difference{}, - stream); + new_indegree.data_ptr(), num_rows + 1, cub::Difference{}); })); // Discard the first element of the SubtractLeftCopy result and ensure that // new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is diff --git a/graphbolt/src/cuda/sort_impl.cu b/graphbolt/src/cuda/sort_impl.cu index a8eb10879726..c097e14b52b8 100644 --- a/graphbolt/src/cuda/sort_impl.cu +++ b/graphbolt/src/cuda/sort_impl.cu @@ -5,7 +5,6 @@ * @brief Sort implementation on CUDA. 
*/ #include -#include #include @@ -21,8 +20,6 @@ std::conditional_t< torch::Tensor> Sort(const scalar_t* input_keys, int64_t num_items, int num_bits) { const auto options = torch::TensorOptions().device(c10::DeviceType::CUDA); - auto allocator = cuda::GetAllocator(); - auto stream = cuda::GetCurrentStream(); constexpr c10::ScalarType dtype = c10::CppTypeToScalarType::value; auto sorted_array = torch::empty(num_items, options.dtype(dtype)); auto sorted_keys = sorted_array.data_ptr(); @@ -36,24 +33,14 @@ Sort(const scalar_t* input_keys, int64_t num_items, int num_bits) { auto sorted_idx = torch::empty_like(original_idx); const int64_t* input_values = original_idx.data_ptr(); int64_t* sorted_values = sorted_idx.data_ptr(); - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( - nullptr, tmp_storage_size, input_keys, sorted_keys, input_values, - sorted_values, num_items, 0, num_bits, stream)); - auto tmp_storage = allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( - tmp_storage.get(), tmp_storage_size, input_keys, sorted_keys, - input_values, sorted_values, num_items, 0, num_bits, stream)); + CUB_CALL( + DeviceRadixSort::SortPairs, input_keys, sorted_keys, input_values, + sorted_values, num_items, 0, num_bits); return std::make_pair(sorted_array, sorted_idx); } else { - size_t tmp_storage_size = 0; - CUDA_CALL(cub::DeviceRadixSort::SortKeys( - nullptr, tmp_storage_size, input_keys, sorted_keys, num_items, 0, - num_bits, stream)); - auto tmp_storage = allocator.AllocateStorage(tmp_storage_size); - CUDA_CALL(cub::DeviceRadixSort::SortKeys( - tmp_storage.get(), tmp_storage_size, input_keys, sorted_keys, num_items, - 0, num_bits, stream)); + CUB_CALL( + DeviceRadixSort::SortKeys, input_keys, sorted_keys, num_items, 0, + num_bits); return sorted_array; } } diff --git a/graphbolt/src/cuda/unique_and_compact_impl.cu b/graphbolt/src/cuda/unique_and_compact_impl.cu index 013f6d752f43..7a92d8173cf2 100644 --- a/graphbolt/src/cuda/unique_and_compact_impl.cu +++ b/graphbolt/src/cuda/unique_and_compact_impl.cu @@ -4,15 +4,11 @@ * @file cuda/unique_and_compact_impl.cu * @brief Unique and compact operator implementation on CUDA. 
*/ -#include #include #include #include #include -#include #include -#include -#include #include #include @@ -33,23 +29,17 @@ struct EqualityFunc { } }; -#define DefineReductionFunction(reduce_fn, name) \ - template \ - auto name(const scalar_iterator_t input, int64_t size) { \ - auto allocator = cuda::GetAllocator(); \ - auto stream = cuda::GetCurrentStream(); \ - using scalar_t = std::remove_reference_t; \ - cuda::CopyScalar result; \ - size_t workspace_size = 0; \ - reduce_fn(nullptr, workspace_size, input, result.get(), size, stream); \ - auto tmp_storage = allocator.AllocateStorage(workspace_size); \ - reduce_fn( \ - tmp_storage.get(), workspace_size, input, result.get(), size, stream); \ - return result; \ +#define DefineCubReductionFunction(cub_reduce_fn, name) \ + template \ + auto name(const scalar_iterator_t input, int64_t size) { \ + using scalar_t = std::remove_reference_t; \ + cuda::CopyScalar result; \ + CUB_CALL(cub_reduce_fn, input, result.get(), size); \ + return result; \ } -DefineReductionFunction(cub::DeviceReduce::Max, Max); -DefineReductionFunction(cub::DeviceReduce::Min, Min); +DefineCubReductionFunction(DeviceReduce::Max, Max); +DefineCubReductionFunction(DeviceReduce::Min, Min); std::tuple UniqueAndCompact( const torch::Tensor src_ids, const torch::Tensor dst_ids, @@ -60,7 +50,6 @@ std::tuple UniqueAndCompact( "Dtypes of tensors passed to UniqueAndCompact need to be identical."); auto allocator = cuda::GetAllocator(); auto stream = cuda::GetCurrentStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); return AT_DISPATCH_INTEGRAL_TYPES( src_ids.scalar_type(), "unique_and_compact", ([&] { auto src_ids_ptr = src_ids.data_ptr(); @@ -84,8 +73,8 @@ std::tuple UniqueAndCompact( // Mark dst nodes in the src_ids tensor. auto is_dst = allocator.AllocateStorage(src_ids.size(0)); - thrust::binary_search( - exec_policy, sorted_unique_dst_ids_ptr, + THRUST_CALL( + binary_search, sorted_unique_dst_ids_ptr, sorted_unique_dst_ids_ptr + unique_dst_ids.size(0), src_ids_ptr, src_ids_ptr + src_ids.size(0), is_dst.get()); @@ -96,16 +85,10 @@ std::tuple UniqueAndCompact( auto is_src = thrust::make_transform_iterator( is_dst.get(), thrust::logical_not{}); cuda::CopyScalar only_src_size; - size_t workspace_size = 0; - cub::DeviceSelect::Flagged( - nullptr, workspace_size, src_ids_ptr, is_src, + CUB_CALL( + DeviceSelect::Flagged, src_ids_ptr, is_src, only_src.data_ptr(), only_src_size.get(), - src_ids.size(0), stream); - auto tmp_storage = allocator.AllocateStorage(workspace_size); - cub::DeviceSelect::Flagged( - tmp_storage.get(), workspace_size, src_ids_ptr, is_src, - only_src.data_ptr(), only_src_size.get(), - src_ids.size(0), stream); + src_ids.size(0)); stream.synchronize(); only_src = only_src.slice(0, 0, static_cast(only_src_size)); } @@ -129,16 +112,10 @@ std::tuple UniqueAndCompact( { // Compute the unique operation on the only_src tensor. 
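`DeviceSelect::Flagged` with the negated `is_dst` flags keeps exactly those src ids that are not also dst ids; in torch terms it is boolean masking (the values here are made up). The `DeviceSelect::Unique` call below then deduplicates the sorted remainder.

```
import torch

src_ids = torch.tensor([5, 7, 5, 9])
is_dst = torch.tensor([False, True, False, False])
only_src = src_ids[~is_dst]  # tensor([5, 5, 9])
```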
cuda::CopyScalar unique_only_src_size; - size_t workspace_size = 0; - CUDA_CALL(cub::DeviceSelect::Unique( - nullptr, workspace_size, sorted_only_src.data_ptr(), - unique_only_src_ptr, unique_only_src_size.get(), only_src.size(0), - stream)); - auto tmp_storage = allocator.AllocateStorage(workspace_size); - CUDA_CALL(cub::DeviceSelect::Unique( - tmp_storage.get(), workspace_size, - sorted_only_src.data_ptr(), unique_only_src_ptr, - unique_only_src_size.get(), only_src.size(0), stream)); + CUB_CALL( + DeviceSelect::Unique, sorted_only_src.data_ptr(), + unique_only_src_ptr, unique_only_src_size.get(), + only_src.size(0)); stream.synchronize(); unique_only_src = unique_only_src.slice( 0, 0, static_cast(unique_only_src_size)); @@ -146,7 +123,8 @@ std::tuple UniqueAndCompact( auto real_order = torch::cat({unique_dst_ids, unique_only_src}); // Sort here so that binary search can be used to lookup new_ids. - auto [sorted_order, new_ids] = Sort(real_order, num_bits); + torch::Tensor sorted_order, new_ids; + std::tie(sorted_order, new_ids) = Sort(real_order, num_bits); auto sorted_order_ptr = sorted_order.data_ptr(); auto new_ids_ptr = new_ids.data_ptr(); // Holds the found locations of the src and dst ids in the sorted_order. @@ -154,8 +132,8 @@ std::tuple UniqueAndCompact( // tensors. auto new_dst_ids_loc = allocator.AllocateStorage(dst_ids.size(0)); - thrust::lower_bound( - exec_policy, sorted_order_ptr, + THRUST_CALL( + lower_bound, sorted_order_ptr, sorted_order_ptr + sorted_order.size(0), dst_ids_ptr, dst_ids_ptr + dst_ids.size(0), new_dst_ids_loc.get()); @@ -172,16 +150,16 @@ std::tuple UniqueAndCompact( auto new_src_ids_loc = allocator.AllocateStorage(src_ids.size(0)); - thrust::lower_bound( - exec_policy, sorted_order_ptr, + THRUST_CALL( + lower_bound, sorted_order_ptr, sorted_order_ptr + sorted_order.size(0), src_ids_ptr, src_ids_ptr + src_ids.size(0), new_src_ids_loc.get()); // Finally, lookup the new compact ids of the src and dst tensors via // gather operations. auto new_src_ids = torch::empty_like(src_ids); - thrust::gather( - exec_policy, new_src_ids_loc.get(), + THRUST_CALL( + gather, new_src_ids_loc.get(), new_src_ids_loc.get() + src_ids.size(0), new_ids.data_ptr(), new_src_ids.data_ptr()); // Perform check before we gather for the dst indices. 
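The sort, `lower_bound`, and `gather` sequence above maps every original id to its position in `real_order`, which serves as its compact id. A CPU reference of the same mapping; `compact` is a hypothetical helper, not part of the patch.

```
import torch

def compact(real_order, ids):
    sorted_order, perm = torch.sort(real_order)
    loc = torch.searchsorted(sorted_order, ids)  # the lower_bound step
    return perm[loc]                             # the gather step

real_order = torch.tensor([7, 3, 9])  # unique dst ids, then src-only ids
print(compact(real_order, torch.tensor([9, 7, 7])))  # tensor([2, 0, 0])
```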
@@ -189,8 +167,8 @@ std::tuple UniqueAndCompact( throw std::out_of_range("Some ids not found."); } auto new_dst_ids = torch::empty_like(dst_ids); - thrust::gather( - exec_policy, new_dst_ids_loc.get(), + THRUST_CALL( + gather, new_dst_ids_loc.get(), new_dst_ids_loc.get() + dst_ids.size(0), new_ids.data_ptr(), new_dst_ids.data_ptr()); return std::make_tuple(real_order, new_src_ids, new_dst_ids); From 0f3bfd7ecbd359f222e835c208ea8dce44570ad6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 12 Jan 2024 06:10:26 -0500 Subject: [PATCH 09/22] [GraphBolt][CUDA] Refactor `IndexSelectCSC` and add `output_size` argument (#6927) --- graphbolt/include/graphbolt/cuda_ops.h | 25 +++++- graphbolt/src/cuda/index_select_csc_impl.cu | 88 +++++++++++-------- graphbolt/src/cuda/insubgraph.cu | 12 +-- graphbolt/src/cuda/neighbor_sampler.cu | 21 +++-- graphbolt/src/index_select.cc | 5 +- graphbolt/src/index_select.h | 4 +- .../impl/test_in_subgraph_sampler.py | 13 ++- 7 files changed, 110 insertions(+), 58 deletions(-) diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h index a045a933d1df..f0f48f75e603 100644 --- a/graphbolt/include/graphbolt/cuda_ops.h +++ b/graphbolt/include/graphbolt/cuda_ops.h @@ -68,6 +68,27 @@ Sort(torch::Tensor input, int num_bits = 0); */ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements); +/** + * @brief Select columns for a sparse matrix in a CSC format according to nodes + * tensor. + * + * NOTE: The shape of all tensors must be 1-D. + * + * @param in_degree Indegree tensor containing degrees of nodes being copied. + * @param sliced_indptr Sliced_indptr tensor containing indptr values of nodes + * being copied. + * @param indices Indices tensor with edge information of shape (indptr[N],). + * @param nodes Nodes tensor with shape (M,). + * @param nodes_max An upperbound on `nodes.max()`. + * @param output_size The total number of edges being copied. + * @return (torch::Tensor, torch::Tensor) Output indptr and indices tensors of + * shapes (M + 1,) and ((indptr[nodes + 1] - indptr[nodes]).sum(),). + */ +std::tuple IndexSelectCSCImpl( + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::Tensor nodes, int64_t nodes_max, + torch::optional output_size = torch::nullopt); + /** * @brief Select columns for a sparse matrix in a CSC format according to nodes * tensor. @@ -77,11 +98,13 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements); * @param indptr Indptr tensor containing offsets with shape (N,). * @param indices Indices tensor with edge information of shape (indptr[N],). * @param nodes Nodes tensor with shape (M,). + * @param output_size The total number of edges being copied. * @return (torch::Tensor, torch::Tensor) Output indptr and indices tensors of * shapes (M + 1,) and ((indptr[nodes + 1] - indptr[nodes]).sum(),). 
*/ std::tuple IndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes); + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size = torch::nullopt); /** * @brief Slices the indptr tensor with nodes and returns the indegrees of the diff --git a/graphbolt/src/cuda/index_select_csc_impl.cu b/graphbolt/src/cuda/index_select_csc_impl.cu index da0b80584482..cd42bb987da9 100644 --- a/graphbolt/src/cuda/index_select_csc_impl.cu +++ b/graphbolt/src/cuda/index_select_csc_impl.cu @@ -86,14 +86,15 @@ template std::tuple UVAIndexSelectCSCCopyIndices( torch::Tensor indices, const int64_t num_nodes, const indptr_t* const in_degree, const indptr_t* const sliced_indptr, - const int64_t* const perm, torch::TensorOptions nodes_options, - torch::ScalarType indptr_scalar_type) { + const int64_t* const perm, torch::TensorOptions options, + torch::ScalarType indptr_scalar_type, + torch::optional output_size) { auto allocator = cuda::GetAllocator(); thrust::counting_iterator iota(0); // Output indptr for the slice indexed by nodes. auto output_indptr = - torch::empty(num_nodes + 1, nodes_options.dtype(indptr_scalar_type)); + torch::empty(num_nodes + 1, options.dtype(indptr_scalar_type)); auto output_indptr_aligned = allocator.AllocateStorage(num_nodes + 1); @@ -114,16 +115,18 @@ std::tuple UVAIndexSelectCSCCopyIndices( } // Copy the actual total number of edges. - auto edge_count = - cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + if (!output_size.has_value()) { + auto edge_count = + cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + output_size = static_cast(edge_count); + } // Copy the modified number of edges. auto edge_count_aligned = cuda::CopyScalar{output_indptr_aligned.get() + num_nodes}; // Allocate output array with actual number of edges. - torch::Tensor output_indices = torch::empty( - static_cast(edge_count), - nodes_options.dtype(indices.scalar_type())); + torch::Tensor output_indices = + torch::empty(output_size.value(), options.dtype(indices.scalar_type())); const dim3 block(BLOCK_SIZE); const dim3 grid( (static_cast(edge_count_aligned) + BLOCK_SIZE - 1) / @@ -141,26 +144,22 @@ std::tuple UVAIndexSelectCSCCopyIndices( } std::tuple UVAIndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::Tensor nodes, int num_bits, torch::optional output_size) { // Sorting nodes so that accesses over PCI-e are more regular. 
- const auto sorted_idx = - Sort(nodes, cuda::NumberOfBits(indptr.size(0) - 1)).second; + const auto sorted_idx = Sort(nodes, num_bits).second; const int64_t num_nodes = nodes.size(0); - auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes); return AT_DISPATCH_INTEGRAL_TYPES( - indptr.scalar_type(), "UVAIndexSelectCSCIndptr", ([&] { + sliced_indptr.scalar_type(), "UVAIndexSelectCSCIndptr", ([&] { using indptr_t = scalar_t; - auto in_degree = - std::get<0>(in_degree_and_sliced_indptr).data_ptr(); - auto sliced_indptr = - std::get<1>(in_degree_and_sliced_indptr).data_ptr(); return GRAPHBOLT_DISPATCH_ELEMENT_SIZES( indices.element_size(), "UVAIndexSelectCSCCopyIndices", ([&] { return UVAIndexSelectCSCCopyIndices( - indices, num_nodes, in_degree, sliced_indptr, + indices, num_nodes, in_degree.data_ptr(), + sliced_indptr.data_ptr(), sorted_idx.data_ptr(), nodes.options(), - indptr.scalar_type()); + sliced_indptr.scalar_type(), output_size); })); })); } @@ -204,38 +203,39 @@ void IndexSelectCSCCopyIndices( } std::tuple DeviceIndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { - const int64_t num_nodes = nodes.size(0); - auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes); + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::TensorOptions options, torch::optional output_size) { + const int64_t num_nodes = sliced_indptr.size(0); return AT_DISPATCH_INTEGRAL_TYPES( - indptr.scalar_type(), "IndexSelectCSCIndptr", ([&] { + sliced_indptr.scalar_type(), "IndexSelectCSCIndptr", ([&] { using indptr_t = scalar_t; - auto in_degree = - std::get<0>(in_degree_and_sliced_indptr).data_ptr(); - auto sliced_indptr = - std::get<1>(in_degree_and_sliced_indptr).data_ptr(); + auto in_degree_ptr = in_degree.data_ptr(); + auto sliced_indptr_ptr = sliced_indptr.data_ptr(); // Output indptr for the slice indexed by nodes. torch::Tensor output_indptr = torch::empty( - num_nodes + 1, nodes.options().dtype(indptr.scalar_type())); + num_nodes + 1, options.dtype(sliced_indptr.scalar_type())); // Compute the output indptr, output_indptr. CUB_CALL( - DeviceScan::ExclusiveSum, in_degree, + DeviceScan::ExclusiveSum, in_degree_ptr, output_indptr.data_ptr(), num_nodes + 1); // Number of edges being copied. - auto edge_count = - cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + if (!output_size.has_value()) { + auto edge_count = + cuda::CopyScalar{output_indptr.data_ptr() + num_nodes}; + output_size = static_cast(edge_count); + } // Allocate output array of size number of copied edges. 
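The `cuda::CopyScalar` reads of the edge count above are device-to-host copies, which is precisely what the new `output_size` argument lets callers skip when they already know how many edges will be copied. In torch terms, the step being avoided is roughly:

```
import torch

output_indptr = torch.tensor([0, 4, 4, 7, 11])
# On a CUDA tensor, this .item() synchronizes with the device; a known
# output_size allows the allocation that follows to proceed without it.
num_edges = output_indptr[-1].item()
output_indices = torch.empty(num_edges, dtype=torch.long)
```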
torch::Tensor output_indices = torch::empty( - static_cast(edge_count), - nodes.options().dtype(indices.scalar_type())); + output_size.value(), options.dtype(indices.scalar_type())); GRAPHBOLT_DISPATCH_ELEMENT_SIZES( indices.element_size(), "IndexSelectCSCCopyIndices", ([&] { using indices_t = element_size_t; IndexSelectCSCCopyIndices( num_nodes, reinterpret_cast(indices.data_ptr()), - sliced_indptr, in_degree, output_indptr.data_ptr(), + sliced_indptr_ptr, in_degree_ptr, + output_indptr.data_ptr(), reinterpret_cast(output_indices.data_ptr())); })); return std::make_tuple(output_indptr, output_indices); @@ -243,13 +243,27 @@ std::tuple DeviceIndexSelectCSCImpl( } std::tuple IndexSelectCSCImpl( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { + torch::Tensor in_degree, torch::Tensor sliced_indptr, torch::Tensor indices, + torch::Tensor nodes, int64_t nodes_max, + torch::optional output_size) { if (indices.is_pinned()) { - return UVAIndexSelectCSCImpl(indptr, indices, nodes); + int num_bits = cuda::NumberOfBits(nodes_max + 1); + return UVAIndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes, num_bits, output_size); } else { - return DeviceIndexSelectCSCImpl(indptr, indices, nodes); + return DeviceIndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes.options(), output_size); } } +std::tuple IndexSelectCSCImpl( + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size) { + auto [in_degree, sliced_indptr] = SliceCSCIndptr(indptr, nodes); + return IndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes, indptr.size(0) - 2, + output_size); +} + } // namespace ops } // namespace graphbolt diff --git a/graphbolt/src/cuda/insubgraph.cu b/graphbolt/src/cuda/insubgraph.cu index d3526486e6dd..bd72ba9a93c9 100644 --- a/graphbolt/src/cuda/insubgraph.cu +++ b/graphbolt/src/cuda/insubgraph.cu @@ -16,15 +16,17 @@ namespace ops { c10::intrusive_ptr InSubgraph( torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, torch::optional type_per_edge) { - auto [output_indptr, output_indices] = - IndexSelectCSCImpl(indptr, indices, nodes); + auto [in_degree, sliced_indptr] = SliceCSCIndptr(indptr, nodes); + auto [output_indptr, output_indices] = IndexSelectCSCImpl( + in_degree, sliced_indptr, indices, nodes, indptr.size(0) - 2); + const int64_t num_edges = output_indices.size(0); torch::optional output_type_per_edge; if (type_per_edge) { - output_type_per_edge = - std::get<1>(IndexSelectCSCImpl(indptr, type_per_edge.value(), nodes)); + output_type_per_edge = std::get<1>(IndexSelectCSCImpl( + in_degree, sliced_indptr, type_per_edge.value(), nodes, + indptr.size(0) - 2, num_edges)); } auto rows = CSRToCOO(output_indptr, indices.scalar_type()); - auto [in_degree, sliced_indptr] = SliceCSCIndptr(indptr, nodes); auto i = torch::arange(output_indices.size(0), output_indptr.options()); auto edge_ids = i - output_indptr.gather(0, rows) + sliced_indptr.gather(0, rows); diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu index e40a6c909d45..a31a0f673ed7 100644 --- a/graphbolt/src/cuda/neighbor_sampler.cu +++ b/graphbolt/src/cuda/neighbor_sampler.cu @@ -157,25 +157,30 @@ c10::intrusive_ptr SampleNeighbors( auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes); auto in_degree = std::get<0>(in_degree_and_sliced_indptr); auto sliced_indptr = std::get<1>(in_degree_and_sliced_indptr); + torch::optional num_edges_; torch::Tensor sub_indptr; - // @todo mfbalin, refactor 
IndexSelectCSCImpl so that it does not have to take - // nodes as input torch::optional sliced_probs_or_mask; if (probs_or_mask.has_value()) { torch::Tensor sliced_probs_or_mask_tensor; - std::tie(sub_indptr, sliced_probs_or_mask_tensor) = - IndexSelectCSCImpl(indptr, probs_or_mask.value(), nodes); + std::tie(sub_indptr, sliced_probs_or_mask_tensor) = IndexSelectCSCImpl( + in_degree, sliced_indptr, probs_or_mask.value(), nodes, + indptr.size(0) - 2, num_edges_); sliced_probs_or_mask = sliced_probs_or_mask_tensor; - } else { - sub_indptr = ExclusiveCumSum(in_degree); + num_edges_ = sliced_probs_or_mask_tensor.size(0); } if (fanouts.size() > 1) { torch::Tensor sliced_type_per_edge; - std::tie(sub_indptr, sliced_type_per_edge) = - IndexSelectCSCImpl(indptr, type_per_edge.value(), nodes); + std::tie(sub_indptr, sliced_type_per_edge) = IndexSelectCSCImpl( + in_degree, sliced_indptr, type_per_edge.value(), nodes, + indptr.size(0) - 2, num_edges_); std::tie(sub_indptr, in_degree, sliced_indptr) = SliceCSCIndptrHetero( sub_indptr, sliced_type_per_edge, sliced_indptr, fanouts.size()); num_rows = sliced_indptr.size(0); + num_edges_ = sliced_type_per_edge.size(0); + } + // If sub_indptr was not computed in the two code blocks above: + if (!probs_or_mask.has_value() && fanouts.size() <= 1) { + sub_indptr = ExclusiveCumSum(in_degree); } auto max_in_degree = torch::empty( 1, diff --git a/graphbolt/src/index_select.cc b/graphbolt/src/index_select.cc index 00257061c675..598bd043b165 100644 --- a/graphbolt/src/index_select.cc +++ b/graphbolt/src/index_select.cc @@ -22,14 +22,15 @@ torch::Tensor IndexSelect(torch::Tensor input, torch::Tensor index) { } std::tuple IndexSelectCSC( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes) { + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size) { TORCH_CHECK( indices.sizes().size() == 1, "IndexSelectCSC only supports 1d tensors"); if (utils::is_on_gpu(nodes) && utils::is_accessible_from_gpu(indptr) && utils::is_accessible_from_gpu(indices)) { GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE( c10::DeviceType::CUDA, "IndexSelectCSCImpl", - { return IndexSelectCSCImpl(indptr, indices, nodes); }); + { return IndexSelectCSCImpl(indptr, indices, nodes, output_size); }); } // @todo: The CPU supports only integer dtypes for indices tensor. TORCH_CHECK( diff --git a/graphbolt/src/index_select.h b/graphbolt/src/index_select.h index 9f95e051ab53..29fa6db2e751 100644 --- a/graphbolt/src/index_select.h +++ b/graphbolt/src/index_select.h @@ -25,11 +25,13 @@ namespace ops { * @param indptr Indptr tensor containing offsets with shape (N,). * @param indices Indices tensor with edge information of shape (indptr[N],). * @param nodes Nodes tensor with shape (M,). + * @param output_size The total number of edges being copied. * @return (torch::Tensor, torch::Tensor) Output indptr and indices tensors of * shapes (M + 1,) and ((indptr[nodes + 1] - indptr[nodes]).sum(),). */ std::tuple IndexSelectCSC( - torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes); + torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes, + torch::optional output_size = torch::nullopt); /** * @brief Select rows from input tensor according to index tensor. 
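Before the updated test below, a reference for what `IndexSelectCSC` computes, matching the documented output shapes; `index_select_csc_reference` is an illustrative CPU stand-in, not the library implementation.

```
import torch

def index_select_csc_reference(indptr, indices, nodes):
    in_degree = indptr[nodes + 1] - indptr[nodes]
    output_indptr = torch.zeros(nodes.numel() + 1, dtype=indptr.dtype)
    output_indptr[1:] = in_degree.cumsum(0)
    output_indices = torch.cat(
        [indices[indptr[n] : indptr[n + 1]] for n in nodes.tolist()]
    )
    return output_indptr, output_indices

indptr = torch.tensor([0, 2, 2, 5])
indices = torch.tensor([1, 2, 0, 1, 2])
print(index_select_csc_reference(indptr, indices, torch.tensor([2, 0])))
# (tensor([0, 3, 5]), tensor([0, 1, 2, 1, 2]))
```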
diff --git a/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py b/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py index 6ae5c47ffc97..9f4c021b8928 100644 --- a/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_in_subgraph_sampler.py @@ -22,7 +22,10 @@ ) @pytest.mark.parametrize("idtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("is_pinned", [False, True]) -def test_index_select_csc(indptr_dtype, indices_dtype, idtype, is_pinned): +@pytest.mark.parametrize("output_size", [None, True]) +def test_index_select_csc( + indptr_dtype, indices_dtype, idtype, is_pinned, output_size +): """Original graph in COO: 1 0 1 0 1 0 1 0 0 1 0 1 @@ -38,7 +41,7 @@ def test_index_select_csc(indptr_dtype, indices_dtype, idtype, is_pinned): index = torch.tensor([0, 5, 3], dtype=idtype) cpu_indptr, cpu_indices = torch.ops.graphbolt.index_select_csc( - indptr, indices, index + indptr, indices, index, None ) if is_pinned: indptr = indptr.pin_memory() @@ -48,10 +51,12 @@ def test_index_select_csc(indptr_dtype, indices_dtype, idtype, is_pinned): indices = indices.cuda() index = index.cuda() + if output_size: + output_size = len(cpu_indices) + gpu_indptr, gpu_indices = torch.ops.graphbolt.index_select_csc( - indptr, indices, index + indptr, indices, index, output_size ) - assert not cpu_indptr.is_cuda assert not cpu_indices.is_cuda From dfff53bc24ecc07e67fb396120308da34add72da Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Mon, 15 Jan 2024 12:11:40 +0800 Subject: [PATCH 10/22] [GraphBolt] add test for PR#6873 (#6923) --- python/dgl/graphbolt/impl/ondisk_dataset.py | 2 +- .../python/pytorch/graphbolt/gb_test_utils.py | 41 +++++++++++++++++++ .../graphbolt/impl/test_ondisk_dataset.py | 27 ++++++++++-- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index fde717a6b6bb..8fb198c4875a 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -158,7 +158,7 @@ def preprocess_ondisk_dataset( graph_feature["name"] ] = edge_data if not is_homogeneous: - # For homogeneous graph, a node/edge feature must cover all + # For heterogeneous graph, a node/edge feature must cover all # node/edge types. for feat_name, feat_data in g.ndata.items(): existing_types = set(feat_data.keys()) diff --git a/tests/python/pytorch/graphbolt/gb_test_utils.py b/tests/python/pytorch/graphbolt/gb_test_utils.py index 14661ac7bff7..dd7abc74da0c 100644 --- a/tests/python/pytorch/graphbolt/gb_test_utils.py +++ b/tests/python/pytorch/graphbolt/gb_test_utils.py @@ -165,6 +165,12 @@ def random_homo_graphbolt_graph( - format: {edge_fmt} path: {edge_path} feature_data: + - domain: node + type: null + name: feat + format: numpy + in_memory: true + path: {node_feat_path} - domain: edge type: null name: feat @@ -250,6 +256,16 @@ def genereate_raw_data_for_hetero_dataset( np.save(os.path.join(test_dir, node_feat_path), node_feats) node_feats_path[ntype] = node_feat_path + # Generate edge features.
+ edge_feats_path = {} + os.makedirs(os.path.join(test_dir, "data"), exist_ok=True) + for etype, num_edge in num_edges.items(): + src_ntype, etype_str, dst_ntype = etype + edge_feat_path = os.path.join("data", f"{etype_str}-feat.npy") + edge_feats = np.random.rand(num_edge, num_classes) + np.save(os.path.join(test_dir, edge_feat_path), edge_feats) + edge_feats_path[etype_str] = edge_feat_path + # Generate train/test/valid set. os.makedirs(os.path.join(test_dir, "set"), exist_ok=True) user_ids = torch.arange(num_nodes["user"]) @@ -285,6 +301,31 @@ def genereate_raw_data_for_hetero_dataset( - type: "user:click:item" format: {edge_fmt} path: {edges_path["click"]} + feature_data: + - domain: node + type: user + name: feat + format: numpy + in_memory: true + path: {node_feats_path["user"]} + - domain: node + type: item + name: feat + format: numpy + in_memory: true + path: {node_feats_path["item"]} + - domain: edge + type: "user:follow:user" + name: feat + format: numpy + in_memory: true + path: {edge_feats_path["follow"]} + - domain: edge + type: "user:click:item" + name: feat + format: numpy + in_memory: true + path: {edge_feats_path["click"]} feature_data: - domain: node type: user diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 8f261652c2bd..1eb70ef1bfe3 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -1136,9 +1136,14 @@ def test_OnDiskDataset_preprocess_homogeneous(edge_fmt): assert fused_csc_sampling_graph.total_num_nodes == num_nodes assert fused_csc_sampling_graph.total_num_edges == num_edges assert ( - fused_csc_sampling_graph.edge_attributes is None - or gb.ORIGINAL_EDGE_ID + fused_csc_sampling_graph.node_attributes is not None + and "feat" in fused_csc_sampling_graph.node_attributes + ) + assert ( + fused_csc_sampling_graph.edge_attributes is not None + and gb.ORIGINAL_EDGE_ID not in fused_csc_sampling_graph.edge_attributes + and "feat" in fused_csc_sampling_graph.edge_attributes ) num_samples = 100 @@ -2147,7 +2152,14 @@ def test_OnDiskDataset_homogeneous(include_original_edge_id, edge_fmt): assert isinstance(graph, gb.FusedCSCSamplingGraph) assert graph.total_num_nodes == num_nodes assert graph.total_num_edges == num_edges - assert graph.edge_attributes is not None + assert ( + graph.node_attributes is not None + and "feat" in graph.node_attributes + ) + assert ( + graph.edge_attributes is not None + and "feat" in graph.edge_attributes + ) assert ( not include_original_edge_id ) or gb.ORIGINAL_EDGE_ID in graph.edge_attributes @@ -2220,7 +2232,14 @@ def test_OnDiskDataset_heterogeneous(include_original_edge_id, edge_fmt): assert graph.total_num_edges == sum( num_edge for num_edge in num_edges.values() ) - assert graph.edge_attributes is not None + assert ( + graph.node_attributes is not None + and "feat" in graph.node_attributes + ) + assert ( + graph.edge_attributes is not None + and "feat" in graph.edge_attributes + ) assert ( not include_original_edge_id ) or gb.ORIGINAL_EDGE_ID in graph.edge_attributes From 982f20284ba55f20f4c65d0968fe2b4da3e50dd9 Mon Sep 17 00:00:00 2001 From: rudongyu Date: Mon, 15 Jan 2024 13:50:02 +0800 Subject: [PATCH 11/22] [Doc Fix] fix the format of gt doc (#6949) --- docs/source/graphtransformer/data.rst | 1 + docs/source/graphtransformer/index.rst | 6 +++--- docs/source/graphtransformer/model.rst | 7 ++++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git 
a/docs/source/graphtransformer/data.rst b/docs/source/graphtransformer/data.rst index a8d09dd89a71..e0123e103632 100644 --- a/docs/source/graphtransformer/data.rst +++ b/docs/source/graphtransformer/data.rst @@ -5,6 +5,7 @@ In this section, we will prepare the data for the Graphormer model introduced be .. code:: python + def collate(graphs): # compute shortest path features, can be done in advance for g in graphs: diff --git a/docs/source/graphtransformer/index.rst b/docs/source/graphtransformer/index.rst index 691403d13fdb..f8e6363f8eb6 100644 --- a/docs/source/graphtransformer/index.rst +++ b/docs/source/graphtransformer/index.rst @@ -1,8 +1,8 @@ -🆕 Tutorial: GraphTransformer +🆕 Tutorial: Graph Transformer ========== -This tutorial introduces the **graphtransformer** module, which is a set of -utility modules for building and training graph transformer models. +This tutorial introduces the **graph transformer** (:mod:`~dgl.nn.gt`) module, +which is a set of utility modules for building and training graph transformer models. .. toctree:: :maxdepth: 2 diff --git a/docs/source/graphtransformer/model.rst b/docs/source/graphtransformer/model.rst index eb5d74a0e2ad..78919d48f089 100644 --- a/docs/source/graphtransformer/model.rst +++ b/docs/source/graphtransformer/model.rst @@ -12,6 +12,7 @@ Degree Encoding The degree encoder is a learnable embedding layer that encodes the degree of each node into a vector. It takes as input the batched input and output degrees of graph nodes, and outputs the degree embeddings of the nodes. .. code:: python + degree_encoder = dgl.nn.DegreeEncoder( max_degree=8, # the maximum degree to cut off embedding_dim=512 # the dimension of the degree embedding @@ -22,6 +23,7 @@ Path Encoding The path encoder encodes the edge features on the shortest path between two nodes to get attention bias for the self-attention module. It takes as input the batched edge features in shape and outputs the attention bias based on path encoding. .. code:: python + path_encoder = PathEncoder( max_len=5, # the maximum length of the shortest path feat_dim=512, # the dimension of the edge feature @@ -33,6 +35,7 @@ Spatial Encoding The spatial encoder encodes the shortest distance between two nodes to get attention bias for the self-attention module. It takes as input the shortest distance between two nodes and outputs the attention bias based on spatial encoding. .. code:: python + spatial_encoder = SpatialEncoder( max_dist=5, # the maximum distance between two nodes num_heads=8, # the number of attention heads @@ -46,6 +49,7 @@ The Graphormer layer is like a Transformer encoder layer with the Multi-head Att We can stack multiple Graphormer layers as a list just like implementing a Transformer encoder in PyTorch. .. code:: python + layers = th.nn.ModuleList([ GraphormerLayer( feat_size=512, # the dimension of the input node features @@ -63,6 +67,7 @@ Model Forward Grouping the modules above defines the primary components of the Graphormer model. We then can define the forward process as follows: .. code:: python + node_feat, in_degree, out_degree, attn_mask, path_data, dist = \ next(iter(dataloader)) # we will use the first batch as an example num_graphs, max_num_nodes, _ = node_feat.shape @@ -84,6 +89,6 @@ Grouping the modules above defines the primary components of the Graphormer mode attn_bias=attn_bias, ) -For simplicity, we omit some details in the forward process. For the complete implementation, please refer to the `Graphormer example `_. 
You can also explore other `utility modules `_ to customize your own graph transformer model. In the next section, we will show how to prepare the data for training. From b003732d318e3c43996d79018eccb41bc90588fc Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:16:58 +0800 Subject: [PATCH 12/22] [GraphBolt] Update `__repr__` of `TorchBasedFeature` and `TorchBasedFeatureStore` (#6945) --- .../impl/torch_based_feature_store.py | 85 ++++++++++--------- .../impl/test_torch_based_feature_store.py | 65 +++++++------- 2 files changed, 83 insertions(+), 67 deletions(-) diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index c33b562dd849..292b0b1d4e59 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -1,4 +1,6 @@ """Torch-based feature store for GraphBolt.""" + +import textwrap from typing import Dict, List import numpy as np @@ -169,7 +171,37 @@ def pin_memory_(self): self._tensor = self._tensor.pin_memory() def __repr__(self) -> str: - return _torch_based_feature_str(self) + ret = ( + "TorchBasedFeature(\n" + " feature={feature},\n" + " metadata={metadata},\n" + ")" + ) + + feature_str = str(self._tensor) + feature_str_lines = feature_str.splitlines() + if len(feature_str_lines) > 1: + feature_str = ( + feature_str_lines[0] + + "\n" + + textwrap.indent( + "\n".join(feature_str_lines[1:]), " " * len(" feature=") + ) + ) + + metadata_str = str(self.metadata()) + metadata_str_lines = metadata_str.splitlines() + if len(metadata_str_lines) > 1: + metadata_str = ( + metadata_str_lines[0] + + "\n" + + textwrap.indent( + "\n".join(metadata_str_lines[1:]), + " " * len(" metadata="), + ) + ) + + return ret.format(feature=feature_str, metadata=metadata_str) class TorchBasedFeatureStore(BasicFeatureStore): @@ -236,40 +268,17 @@ def pin_memory_(self): feature.pin_memory_() def __repr__(self) -> str: - return _torch_based_feature_store_str(self._features) - - -def _torch_based_feature_str(feature: TorchBasedFeature) -> str: - final_str = "TorchBasedFeature(" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - feature_str = "feature=" + _add_indent( - str(feature._tensor), indent_len + len("feature=") - ) - final_str += feature_str + ",\n" + " " * indent_len - metadata_str = "metadata=" + _add_indent( - str(feature.metadata()), indent_len + len("metadata=") - ) - final_str += metadata_str + ",\n)" - return final_str - - -def _torch_based_feature_store_str( - features: Dict[str, TorchBasedFeature] -) -> str: - final_str = "TorchBasedFeatureStore" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - features_str = _add_indent(str(features), indent_len) - final_str += features_str - return final_str + ret = "TorchBasedFeatureStore(\n" + " {features}\n" + ")" + + features_str = str(self._features) + features_str_lines = features_str.splitlines() + if len(features_str_lines) > 1: + features_str = ( + features_str_lines[0] + + "\n" + + textwrap.indent( + "\n".join(features_str_lines[1:]), " " * len(" ") + ) + ) + + return ret.format(features=features_str) diff --git 
a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py index d8ce37a5580d..3de5fe3a0a82 100644 --- a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py +++ b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py @@ -296,23 +296,27 @@ def test_torch_based_feature_repr(in_memory): feature_a = gb.TorchBasedFeature(a, metadata=metadata) feature_b = gb.TorchBasedFeature(b) - expected_str_feature_a = str( - """TorchBasedFeature(feature=tensor([[1, 2, 3], - [4, 5, 6]]), - metadata={'max_value': 3}, -)""" + expected_str_feature_a = ( + "TorchBasedFeature(\n" + " feature=tensor([[1, 2, 3],\n" + " [4, 5, 6]]),\n" + " metadata={'max_value': 3},\n" + ")" ) - expected_str_feature_b = str( - """TorchBasedFeature(feature=tensor([[[1, 2], - [3, 4]], - - [[4, 5], - [6, 7]]]), - metadata={}, -)""" + expected_str_feature_b = ( + "TorchBasedFeature(\n" + " feature=tensor([[[1, 2],\n" + " [3, 4]],\n" + "\n" + " [[4, 5],\n" + " [6, 7]]]),\n" + " metadata={},\n" + ")" ) - assert str(feature_a) == expected_str_feature_a - assert str(feature_b) == expected_str_feature_b + + assert repr(feature_a) == expected_str_feature_a, feature_a + assert repr(feature_b) == expected_str_feature_b, feature_b + a = b = metadata = None feature_a = feature_b = None expected_str_feature_a = expected_str_feature_b = None @@ -345,21 +349,24 @@ def test_torch_based_feature_store_repr(in_memory): ] feature_store = gb.TorchBasedFeatureStore(feature_data) - expected_feature_store_str = str( - """TorchBasedFeatureStore{(, 'paper', 'a'): TorchBasedFeature(feature=tensor([[1, 2, 4], - [2, 5, 3]]), - metadata={}, - ), (, 'paper:cites:paper', 'b'): TorchBasedFeature(feature=tensor([[[1, 2], - [3, 4]], - - [[2, 5], - [3, 4]]]), - metadata={}, - )}""" - ) - assert str(feature_store) == expected_feature_store_str, print( - feature_store + expected_feature_store_str = ( + "TorchBasedFeatureStore(\n" + " {(, 'paper', 'a'): TorchBasedFeature(\n" + " feature=tensor([[1, 2, 4],\n" + " [2, 5, 3]]),\n" + " metadata={},\n" + " ), (, 'paper:cites:paper', 'b'): TorchBasedFeature(\n" + " feature=tensor([[[1, 2],\n" + " [3, 4]],\n" + "\n" + " [[2, 5],\n" + " [3, 4]]]),\n" + " metadata={},\n" + " )}\n" + ")" ) + assert repr(feature_store) == expected_feature_store_str, feature_store + a = b = feature_data = None feature_store = expected_feature_store_str = None From 90e57e741f249d43d2643ab327d07d4749ed847a Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:17:27 +0800 Subject: [PATCH 13/22] [GraphBolt] Update `__repr__` of `ItemSet` and `ItemSetDict` (#6944) --- python/dgl/graphbolt/itemset.py | 54 ++++++------ .../graphbolt/impl/test_ondisk_dataset.py | 84 +++++++++++-------- .../python/pytorch/graphbolt/test_itemset.py | 65 +++++++------- 3 files changed, 110 insertions(+), 93 deletions(-) diff --git a/python/dgl/graphbolt/itemset.py b/python/dgl/graphbolt/itemset.py index 5976fe7798b0..8697ac9c59e4 100644 --- a/python/dgl/graphbolt/itemset.py +++ b/python/dgl/graphbolt/itemset.py @@ -1,5 +1,6 @@ """GraphBolt Itemset.""" +import textwrap from typing import Dict, Iterable, Iterator, Sized, Tuple, Union import torch @@ -175,7 +176,14 @@ def names(self) -> Tuple[str]: return self._names def __repr__(self) -> str: - return _itemset_str(self, "ItemSet") + ret = ( + f"ItemSet(\n" + f" items={self._items},\n" + f" names={self._names},\n" + f")" + ) + + return ret class 
ItemSetDict: @@ -330,31 +338,19 @@ def names(self) -> Tuple[str]: return self._names def __repr__(self) -> str: - return _itemset_str(self, "ItemSetDict") - - -def _itemset_str(itemset: Union[ItemSet, ItemSetDict], name) -> str: - final_str = f"{name}(" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - items = ( - itemset._items if isinstance(itemset, ItemSet) else itemset._itemsets - ) - item_str = ( - "items=" - + _add_indent(str(items), indent_len + len("items=")) - + ",\n" - + " " * indent_len - ) - name_str = ( - "names=" - + _add_indent(str(itemset._names), indent_len + len("items=")) - + ",\n)" - ) - final_str += item_str + name_str - return final_str + ret = ( + "ItemSetDict(\n" + " itemsets={itemsets},\n" + " names={names},\n" + ")" + ) + + itemsets_str = repr(self._itemsets) + lines = itemsets_str.splitlines() + itemsets_str = ( + lines[0] + + "\n" + + textwrap.indent("\n".join(lines[1:]), " " * len(" itemsets=")) + ) + + return ret.format(itemsets=itemsets_str, names=self._names) diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 1eb70ef1bfe3..c3a9beab9749 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -2348,18 +2348,21 @@ def test_OnDiskTask_repr_homogeneous(): ) metadata = {"name": "node_classification"} task = gb.OnDiskTask(metadata, item_set, item_set, item_set) - expected_str = str( - """OnDiskTask(validation_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), - train_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), - test_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), - metadata={'name': 'node_classification'}, -)""" + expected_str = ( + "OnDiskTask(validation_set=ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " train_set=ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " test_set=ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " metadata={'name': 'node_classification'},\n" + ")" ) assert str(task) == expected_str, print(task) @@ -2373,30 +2376,39 @@ def test_OnDiskTask_repr_heterogeneous(): ) metadata = {"name": "node_classification"} task = gb.OnDiskTask(metadata, item_set, item_set, item_set) - expected_str = str( - """OnDiskTask(validation_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), - ), - train_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), - ), - test_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), - ), - metadata={'name': 'node_classification'}, -)""" + 
expected_str = ( + "OnDiskTask(validation_set=ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " train_set=ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " test_set=ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " metadata={'name': 'node_classification'},\n" + ")" ) assert str(task) == expected_str, print(task) diff --git a/tests/python/pytorch/graphbolt/test_itemset.py b/tests/python/pytorch/graphbolt/test_itemset.py index 3174c2d42910..58238aeec0f4 100644 --- a/tests/python/pytorch/graphbolt/test_itemset.py +++ b/tests/python/pytorch/graphbolt/test_itemset.py @@ -529,24 +529,27 @@ def test_ItemSetDict_iteration_node_pairs_neg_dsts(): def test_ItemSet_repr(): # ItemSet with single name. item_set = gb.ItemSet(torch.arange(0, 5), names="seed_nodes") - expected_str = str( - """ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), -)""" + expected_str = ( + "ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + + assert str(item_set) == expected_str, item_set # ItemSet with multiple names. item_set = gb.ItemSet( (torch.arange(0, 5), torch.arange(5, 10)), names=("seed_nodes", "labels"), ) - expected_str = str( - """ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), -)""" + expected_str = ( + "ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + assert str(item_set) == expected_str, item_set def test_ItemSetDict_repr(): @@ -557,16 +560,19 @@ def test_ItemSetDict_repr(): "item": gb.ItemSet(torch.arange(5, 10), names="seed_nodes"), } ) - expected_str = str( - """ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),), - names=('seed_nodes',), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),), - names=('seed_nodes',), - )}, - names=('seed_nodes',), -)""" + expected_str = ( + "ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + assert str(item_set) == expected_str, item_set # ItemSetDict with multiple names. 
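These expected strings encode the alignment convention of the new `__repr__` implementations: the first line of a nested value stays inline, and its continuation lines are indented under the field name via `textwrap.indent`. A standalone sketch of the pattern; the multiple-name case below follows it as well.

```
import textwrap

import torch

value = str(torch.tensor([[1, 2], [3, 4]]))
lines = value.splitlines()
# Indent continuation lines so they line up under the "items=" field.
aligned = lines[0] + "\n" + textwrap.indent(
    "\n".join(lines[1:]), " " * len("    items=")
)
print("    items=" + aligned)
```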
item_set = gb.ItemSetDict( @@ -581,13 +587,16 @@ def test_ItemSetDict_repr(): ), } ) - expected_str = str( - """ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])), - names=('seed_nodes', 'labels'), - ), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]), tensor([10, 11, 12, 13, 14])), - names=('seed_nodes', 'labels'), - )}, - names=('seed_nodes', 'labels'), -)""" + expected_str = ( + "ItemSetDict(\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]), tensor([10, 11, 12, 13, 14])),\n" + " names=('seed_nodes', 'labels'),\n" + " )},\n" + " names=('seed_nodes', 'labels'),\n" + ")" ) - assert str(item_set) == expected_str, print(item_set) + assert str(item_set) == expected_str, item_set From 9a8aa8fa0f42d4f75512178d47b3537044216046 Mon Sep 17 00:00:00 2001 From: peizhou001 <110809584+peizhou001@users.noreply.github.com> Date: Mon, 15 Jan 2024 17:50:47 +0800 Subject: [PATCH 14/22] [GraphBolt] Negative node pairs should be 2D (#6951) Co-authored-by: Ubuntu --- python/dgl/graphbolt/minibatch.py | 42 ++++++++++--------- python/dgl/graphbolt/subgraph_sampler.py | 6 +++ .../pytorch/graphbolt/impl/test_minibatch.py | 22 ++++++---- .../pytorch/graphbolt/test_integration.py | 26 +++++++++--- 4 files changed, 63 insertions(+), 33 deletions(-) diff --git a/python/dgl/graphbolt/minibatch.py b/python/dgl/graphbolt/minibatch.py index f21095418dc0..ec7ead0c36a8 100644 --- a/python/dgl/graphbolt/minibatch.py +++ b/python/dgl/graphbolt/minibatch.py @@ -299,15 +299,15 @@ def negative_node_pairs(self): # For homogeneous graph. if isinstance(self.compacted_negative_srcs, torch.Tensor): negative_node_pairs = ( - self.compacted_negative_srcs.view(-1), - self.compacted_negative_dsts.view(-1), + self.compacted_negative_srcs, + self.compacted_negative_dsts, ) # For heterogeneous graph. else: negative_node_pairs = { etype: ( - neg_src.view(-1), - self.compacted_negative_dsts[etype].view(-1), + neg_src, + self.compacted_negative_dsts[etype], ) for etype, neg_src in self.compacted_negative_srcs.items() } @@ -319,10 +319,10 @@ if isinstance(self.compacted_negative_srcs, torch.Tensor): negative_ratio = self.compacted_negative_srcs.size(1) negative_node_pairs = ( - self.compacted_negative_srcs.view(-1), - self.compacted_node_pairs[1].repeat_interleave( - negative_ratio - ), + self.compacted_negative_srcs, + self.compacted_node_pairs[1] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), ) # For heterogeneous graph. else: negative_ratio = self.compacted_negative_srcs[ ].size(1) negative_node_pairs = { etype: ( - neg_src.view(-1), - self.compacted_node_pairs[etype][1].repeat_interleave( - negative_ratio - ), + neg_src, + self.compacted_node_pairs[etype][1] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), ) for etype, neg_src in self.compacted_negative_srcs.items() } @@ -346,10 +346,10 @@ def negative_node_pairs(self): if isinstance(self.compacted_negative_dsts, torch.Tensor): negative_ratio = self.compacted_negative_dsts.size(1) negative_node_pairs = ( - self.compacted_node_pairs[0].repeat_interleave( - negative_ratio - ), - self.compacted_negative_dsts.view(-1), + self.compacted_node_pairs[0] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), + self.compacted_negative_dsts, ) # For heterogeneous graph.
else: @@ -358,10 +358,10 @@ def negative_node_pairs(self): ].size(1) negative_node_pairs = { etype: ( - self.compacted_node_pairs[etype][0].repeat_interleave( - negative_ratio - ), - neg_dst.view(-1), + self.compacted_node_pairs[etype][0] + .repeat_interleave(negative_ratio) + .view(-1, negative_ratio), + neg_dst, ) for etype, neg_dst in self.compacted_negative_dsts.items() } @@ -396,6 +396,7 @@ def node_pairs_with_labels(self): for etype in positive_node_pairs: pos_src, pos_dst = positive_node_pairs[etype] neg_src, neg_dst = negative_node_pairs[etype] + neg_src, neg_dst = neg_src.view(-1), neg_dst.view(-1) node_pairs_by_etype[etype] = ( torch.cat((pos_src, neg_src), dim=0), torch.cat((pos_dst, neg_dst), dim=0), @@ -410,6 +411,7 @@ def node_pairs_with_labels(self): # Homogeneous graph. pos_src, pos_dst = positive_node_pairs neg_src, neg_dst = negative_node_pairs + neg_src, neg_dst = neg_src.view(-1), neg_dst.view(-1) node_pairs = ( torch.cat((pos_src, neg_src), dim=0), torch.cat((pos_dst, neg_dst), dim=0), diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index ce22f607c663..3e3c3d9b507c 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -130,10 +130,16 @@ def _node_pairs_preprocess(self, minibatch): for etype, _ in neg_src.items(): src_type, _, _ = etype_str_to_tuple(etype) compacted_negative_srcs[etype] = compacted[src_type].pop(0) + compacted_negative_srcs[etype] = compacted_negative_srcs[ + etype + ].view(neg_src[etype].shape) if has_neg_dst: for etype, _ in neg_dst.items(): _, _, dst_type = etype_str_to_tuple(etype) compacted_negative_dsts[etype] = compacted[dst_type].pop(0) + compacted_negative_dsts[etype] = compacted_negative_dsts[ + etype + ].view(neg_dst[etype].shape) else: # Collect nodes from all types of input. 
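The reshapes above establish the new convention that negative node pairs are 2-D with shape `(num_positive_edges, negative_ratio)`; the repeated positive side is expanded to match. A small sketch of that expansion:

```
import torch

pos_dst = torch.tensor([10, 11, 12])
negative_ratio = 2
expanded = pos_dst.repeat_interleave(negative_ratio).view(-1, negative_ratio)
print(expanded)
# tensor([[10, 10],
#         [11, 11],
#         [12, 12]])
```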
nodes = list(node_pairs) diff --git a/tests/python/pytorch/graphbolt/impl/test_minibatch.py b/tests/python/pytorch/graphbolt/impl/test_minibatch.py index a32e3e4a6e53..79e75df6bb56 100644 --- a/tests/python/pytorch/graphbolt/impl/test_minibatch.py +++ b/tests/python/pytorch/graphbolt/impl/test_minibatch.py @@ -125,8 +125,12 @@ def test_minibatch_representation_homo(): negative_srcs=tensor([[8], [1], [6]]), - negative_node_pairs=(tensor([0, 1, 2]), - tensor([6, 0, 0])), + negative_node_pairs=(tensor([[0], + [1], + [2]]), + tensor([[6], + [0], + [0]])), negative_dsts=tensor([[2], [8], [8]]), @@ -278,7 +282,11 @@ def test_minibatch_representation_hetero(): negative_srcs={'B': tensor([[8], [1], [6]])}, - negative_node_pairs={'A:r:B': (tensor([0, 1, 2]), tensor([6, 0, 0]))}, + negative_node_pairs={'A:r:B': (tensor([[0], + [1], + [2]]), tensor([[6], + [0], + [0]]))}, negative_dsts={'B': tensor([[2], [8], [8]])}, @@ -773,12 +781,12 @@ def test_dgl_link_predication_homo(mode): if mode == "neg_graph" or mode == "neg_src": assert torch.equal( minibatch.negative_node_pairs[0], - minibatch.compacted_negative_srcs.view(-1), + minibatch.compacted_negative_srcs, ) if mode == "neg_graph" or mode == "neg_dst": assert torch.equal( minibatch.negative_node_pairs[1], - minibatch.compacted_negative_dsts.view(-1), + minibatch.compacted_negative_dsts, ) ( node_pairs, @@ -834,11 +842,11 @@ def test_dgl_link_predication_hetero(mode): for etype, src in minibatch.compacted_negative_srcs.items(): assert torch.equal( minibatch.negative_node_pairs[etype][0], - src.view(-1), + src, ) if mode == "neg_graph" or mode == "neg_dst": for etype, dst in minibatch.compacted_negative_dsts.items(): assert torch.equal( minibatch.negative_node_pairs[etype][1], - minibatch.compacted_negative_dsts[etype].view(-1), + minibatch.compacted_negative_dsts[etype], ) diff --git a/tests/python/pytorch/graphbolt/test_integration.py b/tests/python/pytorch/graphbolt/test_integration.py index fe3b5c0a2daa..bea5f234869c 100644 --- a/tests/python/pytorch/graphbolt/test_integration.py +++ b/tests/python/pytorch/graphbolt/test_integration.py @@ -88,8 +88,14 @@ def test_integration_link_prediction(): [0.9634, 0.2294], [0.5503, 0.8223]])}, negative_srcs=None, - negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 1, 1]), - tensor([4, 4, 1, 4, 0, 1, 1, 5])), + negative_node_pairs=(tensor([[0, 0], + [1, 1], + [1, 1], + [1, 1]]), + tensor([[4, 4], + [1, 4], + [0, 1], + [1, 5]])), negative_dsts=tensor([[0, 0], [3, 0], [5, 3], @@ -138,8 +144,14 @@ def test_integration_link_prediction(): [0.5160, 0.2486], [0.2109, 0.1089]])}, negative_srcs=None, - negative_node_pairs=(tensor([0, 0, 1, 1, 1, 1, 2, 2]), - tensor([3, 4, 5, 4, 1, 0, 3, 4])), + negative_node_pairs=(tensor([[0, 0], + [1, 1], + [1, 1], + [2, 2]]), + tensor([[3, 4], + [5, 4], + [1, 0], + [3, 4]])), negative_dsts=tensor([[1, 5], [2, 5], [4, 3], @@ -186,8 +198,10 @@ def test_integration_link_prediction(): [0.9634, 0.2294], [0.6172, 0.7865]])}, negative_srcs=None, - negative_node_pairs=(tensor([0, 0, 1, 1]), - tensor([2, 1, 2, 3])), + negative_node_pairs=(tensor([[0, 0], + [1, 1]]), + tensor([[2, 1], + [2, 3]])), negative_dsts=tensor([[0, 4], [0, 1]]), labels=None, From 40816f6e4069b353b25cbed5e0956257843f5b65 Mon Sep 17 00:00:00 2001 From: yxy235 <77922129+yxy235@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:04:19 +0800 Subject: [PATCH 15/22] [GraphBolt] Automatically force preprocess on-disk dataset. 
(#6937) Co-authored-by: Ubuntu --- python/dgl/graphbolt/impl/ondisk_dataset.py | 47 +++- python/dgl/graphbolt/internal/utils.py | 51 ++++ .../graphbolt/impl/test_ondisk_dataset.py | 230 +++++++++++++++++- .../pytorch/graphbolt/utils/test_internal.py | 66 +++++ 4 files changed, 381 insertions(+), 13 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 8fb198c4875a..11245c458061 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -1,5 +1,6 @@ """GraphBolt OnDiskDataset.""" +import json import os import shutil from copy import deepcopy @@ -15,6 +16,8 @@ from ..base import etype_str_to_tuple from ..dataset import Dataset, Task from ..internal import ( + calculate_dir_hash, + check_dataset_change, copy_or_convert_data, get_attributes, read_data, @@ -37,7 +40,7 @@ def preprocess_ondisk_dataset( dataset_dir: str, include_original_edge_id: bool = False, - force_preprocess: bool = False, + force_preprocess: bool = None, ) -> str: """Preprocess the on-disk dataset. Parse the input config file, load the data, and save the data in the format that GraphBolt supports. @@ -72,6 +75,20 @@ def preprocess_ondisk_dataset( processed_dir_prefix, "metadata.yaml" ) if os.path.exists(os.path.join(dataset_dir, preprocess_metadata_path)): + if force_preprocess is None: + with open( + os.path.join(dataset_dir, preprocess_metadata_path), "r" + ) as f: + preprocess_config = yaml.safe_load(f) + if ( + preprocess_config.get("include_original_edge_id", None) + == include_original_edge_id + ): + force_preprocess = check_dataset_change( + dataset_dir, processed_dir_prefix + ) + else: + force_preprocess = True if force_preprocess: shutil.rmtree(os.path.join(dataset_dir, processed_dir_prefix)) print( @@ -180,7 +197,10 @@ def preprocess_ondisk_dataset( g, is_homogeneous, include_original_edge_id ) - # 5. Save the FusedCSCSamplingGraph and modify the output_config. + # 5. Record value of include_original_edge_id. + output_config["include_original_edge_id"] = include_original_edge_id + + # 6. Save the FusedCSCSamplingGraph and modify the output_config. output_config["graph_topology"] = {} output_config["graph_topology"]["type"] = "FusedCSCSamplingGraph" output_config["graph_topology"]["path"] = os.path.join( @@ -196,7 +216,7 @@ def preprocess_ondisk_dataset( ) del output_config["graph"] - # 6. Load the node/edge features and do necessary conversion. + # 7. Load the node/edge features and do necessary conversion. if input_config.get("feature_data", None): for feature, out_feature in zip( input_config["feature_data"], output_config["feature_data"] @@ -218,7 +238,7 @@ def preprocess_ondisk_dataset( is_feature=True, ) - # 7. Save tasks and train/val/test split according to the output_config. + # 8. Save tasks and train/val/test split according to the output_config. if input_config.get("tasks", None): for input_task, output_task in zip( input_config["tasks"], output_config["tasks"] @@ -245,13 +265,24 @@ def preprocess_ondisk_dataset( output_data["format"], ) - # 8. Save the output_config. + # 9. Save the output_config. output_config_path = os.path.join(dataset_dir, preprocess_metadata_path) with open(output_config_path, "w") as f: yaml.dump(output_config, f) print("Finish preprocessing the on-disk dataset.") - # 9. Return the absolute path of the preprocessing yaml file. + # 10. Calculate and save the hash value of the dataset directory. 
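+ # Any hash file left over from a previous run is removed first so it
+ # cannot contaminate the new directory fingerprint.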
+ hash_value_file = "dataset_hash_value.txt" + hash_value_file_path = os.path.join( + dataset_dir, processed_dir_prefix, hash_value_file + ) + if os.path.exists(hash_value_file_path): + os.remove(hash_value_file_path) + dir_hash = calculate_dir_hash(dataset_dir) + with open(hash_value_file_path, "w") as f: + f.write(json.dumps(dir_hash, indent=4)) + + # 11. Return the absolute path of the preprocessing yaml file. return output_config_path @@ -398,7 +429,7 @@ def __init__( self, path: str, include_original_edge_id: bool = False, - force_preprocess: bool = False, + force_preprocess: bool = None, ) -> None: # Always call the preprocess function first. If already preprocessed, # the function will return the original path directly. @@ -720,7 +751,7 @@ def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset: download(url, path=zip_file_path) extract_archive(zip_file_path, root, overwrite=True) os.remove(zip_file_path) - super().__init__(dataset_dir) + super().__init__(dataset_dir, force_preprocess=False) def _ondisk_task_str(task: OnDiskTask) -> str: diff --git a/python/dgl/graphbolt/internal/utils.py b/python/dgl/graphbolt/internal/utils.py index ee61037ffd7b..af5bcdb6db93 100644 --- a/python/dgl/graphbolt/internal/utils.py +++ b/python/dgl/graphbolt/internal/utils.py @@ -1,7 +1,10 @@ """Utility functions for GraphBolt.""" +import hashlib +import json import os import shutil +from typing import List, Union import numpy as np import pandas as pd @@ -145,3 +148,51 @@ def read_edges(dataset_dir, edge_fmt, edge_path): ) src, dst = edge_data["src"].to_numpy(), edge_data["dst"].to_numpy() return (src, dst) + + +def calculate_file_hash(file_path, hash_algo="md5"): + """Calculate the hash value of a file.""" + hash_algos = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] + if hash_algo in hash_algos: + hash_obj = getattr(hashlib, hash_algo)() + else: + raise ValueError( + f"Hash algorithm must be one of: {hash_algos}, but got `{hash_algo}`." 
+ )
+ with open(file_path, "rb") as file:
+ for chunk in iter(lambda: file.read(4096), b""):
+ hash_obj.update(chunk)
+ return hash_obj.hexdigest()
+
+
+def calculate_dir_hash(
+ dir_path, hash_algo="md5", ignore: Union[str, List[str]] = None
+):
+ """Calculate the hash values of all files under the directory."""
+ hashes = {}
+ for dirpath, _, filenames in os.walk(dir_path):
+ for filename in filenames:
+ if ignore and filename in ignore:
+ continue
+ filepath = os.path.join(dirpath, filename)
+ file_hash = calculate_file_hash(filepath, hash_algo=hash_algo)
+ hashes[filepath] = file_hash
+ return hashes
+
+
+def check_dataset_change(dataset_dir, processed_dir):
+ """Check whether dataset has been changed by checking its hash value."""
+ hash_value_file = "dataset_hash_value.txt"
+ hash_value_file_path = os.path.join(
+ dataset_dir, processed_dir, hash_value_file
+ )
+ if not os.path.exists(hash_value_file_path):
+ return True
+ with open(hash_value_file_path, "r") as f:
+ original_hash_value = json.load(f)
+ present_hash_value = calculate_dir_hash(dataset_dir, ignore=hash_value_file)
+ if original_hash_value == present_hash_value:
+ force_preprocess = False
+ else:
+ force_preprocess = True
+ return force_preprocess
diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
index c3a9beab9749..1e5fdd4fbabe 100644
--- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
+++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
@@ -32,9 +32,11 @@ def load_dataset(dataset):
 return dataset.load()


-def write_yaml_and_load_dataset(yaml_content, dir):
+def write_yaml_and_load_dataset(yaml_content, dir, force_preprocess=False):
 write_yaml_file(yaml_content, dir)
- return load_dataset(gb.OnDiskDataset(dir))
+ return load_dataset(
+ gb.OnDiskDataset(dir, force_preprocess=force_preprocess)
+ )


 def test_OnDiskDataset_TVTSet_exceptions():
@@ -52,7 +54,7 @@ def test_OnDiskDataset_TVTSet_exceptions():
 """
 write_yaml_file(yaml_content, test_dir)
 with pytest.raises(pydantic.ValidationError):
- _ = gb.OnDiskDataset(test_dir).load()
+ _ = gb.OnDiskDataset(test_dir, force_preprocess=False).load()

 # Case 2: ``type`` is not specified while multiple TVT sets are
 # specified.
@@ -74,7 +76,7 @@ def test_OnDiskDataset_TVTSet_exceptions():
 AssertionError,
 match=r"Only one TVT set is allowed if type is not specified.",
 ):
- _ = gb.OnDiskDataset(test_dir).load()
+ _ = gb.OnDiskDataset(test_dir, force_preprocess=False).load()


 def test_OnDiskDataset_multiple_tasks():
@@ -1001,7 +1003,7 @@ def test_OnDiskDataset_Graph_Exceptions():
 pydantic.ValidationError,
 match="1 validation error for OnDiskMetaData",
 ):
- _ = gb.OnDiskDataset(test_dir).load()
+ _ = gb.OnDiskDataset(test_dir, force_preprocess=False).load()


 def test_OnDiskDataset_Graph_homogeneous():
@@ -1359,6 +1361,7 @@ def test_OnDiskDataset_preprocess_yaml_content_unix():
 data:
 - format: numpy
 path: preprocessed/set/test.npy
+ include_original_edge_id: False
 """
 target_yaml_data = yaml.safe_load(target_yaml_content)
 # Check yaml content.
@@ -1513,6 +1516,7 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
 data:
 - format: numpy
 path: preprocessed\\set\\test.npy
+ include_original_edge_id: False
 """
 target_yaml_data = yaml.safe_load(target_yaml_content)
 # Check yaml content.
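The helpers above fingerprint a dataset directory by streaming every file through the chosen digest in 4 KiB chunks, so change detection never loads a whole feature file into memory. A self-contained sketch of the same pattern (the file name and payloads here are illustrative only):

```python
import hashlib
import os
import tempfile

def file_md5(path, chunk_size=4096):
    # Stream the file so arbitrarily large files hash in constant memory.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

with tempfile.TemporaryDirectory() as root:
    path = os.path.join(root, "edge-feat.npy")
    with open(path, "wb") as f:
        f.write(b"raw feature bytes")
    before = file_md5(path)
    with open(path, "ab") as f:
        f.write(b" plus an edit")
    # Any byte-level change flips the fingerprint, which is what triggers
    # the automatic re-preprocess above.
    assert file_md5(path) != before
```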
@@ -1609,6 +1613,119 @@ def test_OnDiskDataset_preprocess_force_preprocess(capsys): assert target_yaml_data["tasks"][0]["name"] == "fake_name" +def test_OnDiskDataset_preprocess_auto_force_preprocess(capsys): + """Test force preprocess of OnDiskDataset.""" + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. + yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + # First preprocess on-disk dataset. + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["tasks"][0]["name"] == "link_prediction" + + # 1. Change yaml_data. + with open(yaml_file, "r") as f: + yaml_data = yaml.safe_load(f) + yaml_data["tasks"][0]["name"] = "fake_name" + with open(yaml_file, "w") as f: + yaml.dump(yaml_data, f) + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["tasks"][0]["name"] == "fake_name" + + # 2. Change edge feature. + edge_feats = np.random.rand(num_edges, num_classes) + edge_feat_path = os.path.join("data", "edge-feat.npy") + np.save(os.path.join(test_dir, edge_feat_path), edge_feats) + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + preprocessed_edge_feat = np.load( + os.path.join(test_dir, "preprocessed", edge_feat_path) + ) + assert preprocessed_edge_feat.all() == edge_feats.all() + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["include_original_edge_id"] == False + + # 3. Change include_original_edge_id. + preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=True + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + with open(preprocessed_metadata_path, "r") as f: + target_yaml_data = yaml.safe_load(f) + assert target_yaml_data["include_original_edge_id"] == True + + # 4. Change nothing. 
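+ # Same inputs and options as the previous run: the stored hash still
+ # matches, so the preprocessed result is reused as-is.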
+ preprocessed_metadata_path = ( + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=True + ) + ) + captured = capsys.readouterr().out.split("\n") + assert captured == ["The dataset is already preprocessed.", ""] + + @pytest.mark.parametrize("edge_fmt", ["csv", "numpy"]) def test_OnDiskDataset_load_name(edge_fmt): """Test preprocess of OnDiskDataset.""" @@ -2341,6 +2458,109 @@ def test_OnDiskDataset_force_preprocess(capsys): dataset = None +def test_OnDiskDataset_auto_force_preprocess(capsys): + """Test force preprocess of OnDiskDataset.""" + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. + yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + # First preprocess on-disk dataset. + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=False + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + tasks = dataset.tasks + assert tasks[0].metadata["name"] == "link_prediction" + + # 1. Change yaml_data. + with open(yaml_file, "r") as f: + yaml_data = yaml.safe_load(f) + yaml_data["tasks"][0]["name"] = "fake_name" + with open(yaml_file, "w") as f: + yaml.dump(yaml_data, f) + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=False + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + tasks = dataset.tasks + assert tasks[0].metadata["name"] == "fake_name" + + # 2. Change edge feature. + edge_feats = np.random.rand(num_edges, num_classes) + edge_feat_path = os.path.join("data", "edge-feat.npy") + np.save(os.path.join(test_dir, edge_feat_path), edge_feats) + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=False + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + assert torch.equal( + dataset.feature.read("edge", None, "feat"), + torch.from_numpy(edge_feats), + ) + graph = dataset.graph + assert gb.ORIGINAL_EDGE_ID not in graph.edge_attributes + + # 3. Change include_original_edge_id. + dataset = gb.OnDiskDataset( + test_dir, include_original_edge_id=True + ).load() + captured = capsys.readouterr().out.split("\n") + assert captured == [ + "The on-disk dataset is re-preprocessing, so the existing " + + "preprocessed dataset has been removed.", + "Start to preprocess the on-disk dataset.", + "Finish preprocessing the on-disk dataset.", + "", + ] + graph = dataset.graph + assert gb.ORIGINAL_EDGE_ID in graph.edge_attributes + + # 4. Change Nothing. 
+ dataset = gb.OnDiskDataset(
+ test_dir, include_original_edge_id=True
+ ).load()
+ captured = capsys.readouterr().out.split("\n")
+ assert captured == ["The dataset is already preprocessed.", ""]
+
+ graph = None
+ tasks = None
+ dataset = None
+
+
 def test_OnDiskTask_repr_homogeneous():
 item_set = gb.ItemSet(
 (torch.arange(0, 5), torch.arange(5, 10)),
diff --git a/tests/python/pytorch/graphbolt/utils/test_internal.py b/tests/python/pytorch/graphbolt/utils/test_internal.py
index 939c10a2a438..c1274f1a9adf 100644
--- a/tests/python/pytorch/graphbolt/utils/test_internal.py
+++ b/tests/python/pytorch/graphbolt/utils/test_internal.py
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import tempfile
@@ -200,3 +201,68 @@ def test_read_edges_error():
 ),
 ):
 internal.read_edges(test_dir, "numpy", edge_path)
+
+
+def test_calculate_file_hash():
+ with tempfile.TemporaryDirectory() as test_dir:
+ test_file_path = os.path.join(test_dir, "test.txt")
+ with open(test_file_path, "w") as file:
+ file.write("test content")
+ hash_value = internal.calculate_file_hash(
+ test_file_path, hash_algo="md5"
+ )
+ expected_hash_value = "9473fdd0d880a43c21b7778d34872157"
+ assert expected_hash_value == hash_value
+ with pytest.raises(
+ ValueError,
+ match=re.escape(
+ "Hash algorithm must be one of: ['md5', 'sha1', 'sha224', "
+ + "'sha256', 'sha384', 'sha512'], but got `fake`."
+ ),
+ ):
+ hash_value = internal.calculate_file_hash(
+ test_file_path, hash_algo="fake"
+ )
+
+
+def test_calculate_dir_hash():
+ with tempfile.TemporaryDirectory() as test_dir:
+ test_file_path_1 = os.path.join(test_dir, "test_1.txt")
+ test_file_path_2 = os.path.join(test_dir, "test_2.txt")
+ with open(test_file_path_1, "w") as file:
+ file.write("test content")
+ with open(test_file_path_2, "w") as file:
+ file.write("test contents of directory")
+ hash_value = internal.calculate_dir_hash(test_dir, hash_algo="md5")
+ expected_hash_value = [
+ "56e708a2bdf92887d4a7f25cbc13c555",
+ "9473fdd0d880a43c21b7778d34872157",
+ ]
+ assert len(hash_value) == 2
+ for val in hash_value.values():
+ assert val in expected_hash_value
+
+
+def test_check_dataset_change():
+ with tempfile.TemporaryDirectory() as test_dir:
+ # Generate directory and record its hash value.
+ test_file_path_1 = os.path.join(test_dir, "test_1.txt")
+ test_file_path_2 = os.path.join(test_dir, "test_2.txt")
+ with open(test_file_path_1, "w") as file:
+ file.write("test content")
+ with open(test_file_path_2, "w") as file:
+ file.write("test contents of directory")
+ hash_value = internal.calculate_dir_hash(test_dir, hash_algo="md5")
+ hash_value_file = "dataset_hash_value.txt"
+ hash_value_file_path = os.path.join(
+ test_dir, "preprocessed", hash_value_file
+ )
+ os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
+ with open(hash_value_file_path, "w") as file:
+ file.write(json.dumps(hash_value, indent=4))
+
+ # Modify the content of a file.
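+ # The recomputed directory hash will no longer match the stored one,
+ # so a change must be reported.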
+ with open(test_file_path_2, "w") as file: + file.write("test contents of directory changed") + + assert internal.check_dataset_change(test_dir, "preprocessed") From 80f36134c15464d7ad92350519e9c8c0d124b2ac Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Tue, 16 Jan 2024 11:14:04 +0800 Subject: [PATCH 16/22] [GraphBolt] Update the docstring of `ItemSet` (#6943) --- python/dgl/graphbolt/itemset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/itemset.py b/python/dgl/graphbolt/itemset.py index 8697ac9c59e4..8f93e9fed414 100644 --- a/python/dgl/graphbolt/itemset.py +++ b/python/dgl/graphbolt/itemset.py @@ -25,7 +25,10 @@ class requires each input itemset to be iterable. items. names: Union[str, Tuple[str]], optional The names of the items. If it is a tuple, each name corresponds to an - item in the tuple. + item in the tuple. The naming is arbitrary, but in general practice, + the names should be chosen from ['seed_nodes', 'node_pairs', 'labels', + 'negative_srcs', 'negative_dsts'] to align with the attributes of + class `dgl.graphbolt.MiniBatch`. Examples -------- From 905321f855268b7ff65df056ba7f5d4ae5b60aae Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:38:58 +0800 Subject: [PATCH 17/22] [GraphBolt] Modify `__repr__` (#6953) --- python/dgl/graphbolt/impl/ondisk_dataset.py | 42 +++++----- .../impl/torch_based_feature_store.py | 57 ++++--------- python/dgl/graphbolt/itemset.py | 20 ++--- .../graphbolt/impl/test_ondisk_dataset.py | 82 +++++++++---------- 4 files changed, 87 insertions(+), 114 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 11245c458061..c86476a41961 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -3,6 +3,7 @@ import json import os import shutil +import textwrap from copy import deepcopy from typing import Dict, List, Union @@ -339,7 +340,24 @@ def test_set(self) -> Union[ItemSet, ItemSetDict]: return self._test_set def __repr__(self) -> str: - return _ondisk_task_str(self) + ret = "{Classname}({attributes})" + + attributes_str = "" + + attributes = get_attributes(self) + attributes.reverse() + for attribute in attributes: + if attribute[0] == "_": + continue + value = getattr(self, attribute) + attributes_str += f"{attribute}={value},\n" + attributes_str = textwrap.indent( + attributes_str, " " * len("OnDiskTask(") + ).strip() + + return ret.format( + Classname=self.__class__.__name__, attributes=attributes_str + ) class OnDiskDataset(Dataset): @@ -752,25 +770,3 @@ def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset: extract_archive(zip_file_path, root, overwrite=True) os.remove(zip_file_path) super().__init__(dataset_dir, force_preprocess=False) - - -def _ondisk_task_str(task: OnDiskTask) -> str: - final_str = "OnDiskTask(" - indent_len = len(final_str) - - def _add_indent(_str, indent): - lines = _str.split("\n") - lines = [lines[0]] + [" " * indent + line for line in lines[1:]] - return "\n".join(lines) - - attributes = get_attributes(task) - attributes.reverse() - for name in attributes: - if name[0] == "_": - continue - val = getattr(task, name) - final_str += ( - f"{name}={_add_indent(str(val), indent_len + len(name) + 1)},\n" - + " " * indent_len - ) - return final_str[:-indent_len] + ")" diff --git 
a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 292b0b1d4e59..9e0718a1d9fe 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -172,36 +172,24 @@ def pin_memory_(self): def __repr__(self) -> str: ret = ( - "TorchBasedFeature(\n" + "{Classname}(\n" " feature={feature},\n" " metadata={metadata},\n" ")" ) - feature_str = str(self._tensor) - feature_str_lines = feature_str.splitlines() - if len(feature_str_lines) > 1: - feature_str = ( - feature_str_lines[0] - + "\n" - + textwrap.indent( - "\n".join(feature_str_lines[1:]), " " * len(" feature=") - ) - ) - - metadata_str = str(self.metadata()) - metadata_str_lines = metadata_str.splitlines() - if len(metadata_str_lines) > 1: - metadata_str = ( - metadata_str_lines[0] - + "\n" - + textwrap.indent( - "\n".join(metadata_str_lines[1:]), - " " * len(" metadata="), - ) - ) - - return ret.format(feature=feature_str, metadata=metadata_str) + feature_str = textwrap.indent( + str(self._tensor), " " * len(" feature=") + ).strip() + metadata_str = textwrap.indent( + str(self.metadata()), " " * len(" metadata=") + ).strip() + + return ret.format( + Classname=self.__class__.__name__, + feature=feature_str, + metadata=metadata_str, + ) class TorchBasedFeatureStore(BasicFeatureStore): @@ -268,17 +256,8 @@ def pin_memory_(self): feature.pin_memory_() def __repr__(self) -> str: - ret = "TorchBasedFeatureStore(\n" + " {features}\n" + ")" - - features_str = str(self._features) - features_str_lines = features_str.splitlines() - if len(features_str_lines) > 1: - features_str = ( - features_str_lines[0] - + "\n" - + textwrap.indent( - "\n".join(features_str_lines[1:]), " " * len(" ") - ) - ) - - return ret.format(features=features_str) + ret = "{Classname}(\n" + " {features}\n" + ")" + features_str = textwrap.indent(str(self._features), " ").strip() + return ret.format( + Classname=self.__class__.__name__, features=features_str + ) diff --git a/python/dgl/graphbolt/itemset.py b/python/dgl/graphbolt/itemset.py index 8f93e9fed414..14dc49b523cb 100644 --- a/python/dgl/graphbolt/itemset.py +++ b/python/dgl/graphbolt/itemset.py @@ -180,7 +180,7 @@ def names(self) -> Tuple[str]: def __repr__(self) -> str: ret = ( - f"ItemSet(\n" + f"{self.__class__.__name__}(\n" f" items={self._items},\n" f" names={self._names},\n" f")" @@ -342,18 +342,18 @@ def names(self) -> Tuple[str]: def __repr__(self) -> str: ret = ( - "ItemSetDict(\n" + "{Classname}(\n" " itemsets={itemsets},\n" " names={names},\n" ")" ) - itemsets_str = repr(self._itemsets) - lines = itemsets_str.splitlines() - itemsets_str = ( - lines[0] - + "\n" - + textwrap.indent("\n".join(lines[1:]), " " * len(" itemsets=")) - ) + itemsets_str = textwrap.indent( + repr(self._itemsets), " " * len(" itemsets=") + ).strip() - return ret.format(itemsets=itemsets_str, names=self._names) + return ret.format( + Classname=self.__class__.__name__, + itemsets=itemsets_str, + names=self._names, + ) diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 1e5fdd4fbabe..b669f2e7d1f4 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -2570,21 +2570,20 @@ def test_OnDiskTask_repr_homogeneous(): task = gb.OnDiskTask(metadata, item_set, item_set, item_set) expected_str = ( "OnDiskTask(validation_set=ItemSet(\n" - " 
items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" - " names=('seed_nodes', 'labels'),\n" - " ),\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" " train_set=ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" - " names=('seed_nodes', 'labels'),\n" - " ),\n" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" " test_set=ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" - " names=('seed_nodes', 'labels'),\n" - " ),\n" - " metadata={'name': 'node_classification'},\n" - ")" + " items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),\n" + " names=('seed_nodes', 'labels'),\n" + " ),\n" + " metadata={'name': 'node_classification'},)" ) - assert str(task) == expected_str, print(task) + assert repr(task) == expected_str, task def test_OnDiskTask_repr_heterogeneous(): @@ -2598,39 +2597,38 @@ def test_OnDiskTask_repr_heterogeneous(): task = gb.OnDiskTask(metadata, item_set, item_set, item_set) expected_str = ( "OnDiskTask(validation_set=ItemSetDict(\n" - " itemsets={'user': ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]),),\n" - " names=('seed_nodes',),\n" - " ), 'item': ItemSet(\n" - " items=(tensor([5, 6, 7, 8, 9]),),\n" - " names=('seed_nodes',),\n" - " )},\n" - " names=('seed_nodes',),\n" - " ),\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" " train_set=ItemSetDict(\n" - " itemsets={'user': ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]),),\n" - " names=('seed_nodes',),\n" - " ), 'item': ItemSet(\n" - " items=(tensor([5, 6, 7, 8, 9]),),\n" - " names=('seed_nodes',),\n" - " )},\n" - " names=('seed_nodes',),\n" - " ),\n" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" " test_set=ItemSetDict(\n" - " itemsets={'user': ItemSet(\n" - " items=(tensor([0, 1, 2, 3, 4]),),\n" - " names=('seed_nodes',),\n" - " ), 'item': ItemSet(\n" - " items=(tensor([5, 6, 7, 8, 9]),),\n" - " names=('seed_nodes',),\n" - " )},\n" - " names=('seed_nodes',),\n" - " ),\n" - " metadata={'name': 'node_classification'},\n" - ")" + " itemsets={'user': ItemSet(\n" + " items=(tensor([0, 1, 2, 3, 4]),),\n" + " names=('seed_nodes',),\n" + " ), 'item': ItemSet(\n" + " items=(tensor([5, 6, 7, 8, 9]),),\n" + " names=('seed_nodes',),\n" + " )},\n" + " names=('seed_nodes',),\n" + " ),\n" + " metadata={'name': 'node_classification'},)" ) - assert str(task) == expected_str, print(task) + assert repr(task) == expected_str, task def test_OnDiskDataset_load_tasks_selectively(): From de1eedc6e85b2726b2ae2adc6f79b9c8343907f3 Mon Sep 17 00:00:00 2001 From: yxy235 <77922129+yxy235@users.noreply.github.com> Date: Tue, 16 Jan 2024 15:05:58 +0800 Subject: [PATCH 18/22] [GraphBolt] Add check about whether edge IDs are saved when edge feature is stored. 
(#6948) Co-authored-by: Ubuntu --- python/dgl/graphbolt/impl/ondisk_dataset.py | 5 ++ .../graphbolt/impl/test_ondisk_dataset.py | 56 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index c86476a41961..5da99fe74f23 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -219,6 +219,7 @@ def preprocess_ondisk_dataset( # 7. Load the node/edge features and do necessary conversion. if input_config.get("feature_data", None): + has_edge_feature_data = False for feature, out_feature in zip( input_config["feature_data"], output_config["feature_data"] ): @@ -230,6 +231,8 @@ def preprocess_ondisk_dataset( in_memory = ( True if "in_memory" not in feature else feature["in_memory"] ) + if not has_edge_feature_data and feature["domain"] == "edge": + has_edge_feature_data = True copy_or_convert_data( os.path.join(dataset_dir, feature["path"]), os.path.join(dataset_dir, out_feature["path"]), @@ -238,6 +241,8 @@ def preprocess_ondisk_dataset( in_memory=in_memory, is_feature=True, ) + if has_edge_feature_data and not include_original_edge_id: + dgl_warning("Edge feature is stored, but edge IDs are not saved.") # 8. Save tasks and train/val/test split according to the output_config. if input_config.get("tasks", None): diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index b669f2e7d1f4..237ab0c590d1 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -1726,6 +1726,35 @@ def test_OnDiskDataset_preprocess_auto_force_preprocess(capsys): assert captured == ["The dataset is already preprocessed.", ""] +def test_OnDiskDataset_preprocess_not_include_eids(): + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. + yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + with pytest.warns( + DGLWarning, + match="Edge feature is stored, but edge IDs are not saved.", + ): + gb.ondisk_dataset.preprocess_ondisk_dataset( + test_dir, include_original_edge_id=False + ) + + @pytest.mark.parametrize("edge_fmt", ["csv", "numpy"]) def test_OnDiskDataset_load_name(edge_fmt): """Test preprocess of OnDiskDataset.""" @@ -2586,6 +2615,33 @@ def test_OnDiskTask_repr_homogeneous(): assert repr(task) == expected_str, task +def test_OnDiskDataset_not_include_eids(): + with tempfile.TemporaryDirectory() as test_dir: + # All metadata fields are specified. + dataset_name = "graphbolt_test" + num_nodes = 4000 + num_edges = 20000 + num_classes = 10 + + # Generate random graph. 
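+ # The helper writes the raw node/edge files under test_dir and returns
+ # the matching metadata.yaml content for them.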
+ yaml_content = gbt.random_homo_graphbolt_graph( + test_dir, + dataset_name, + num_nodes, + num_edges, + num_classes, + ) + yaml_file = os.path.join(test_dir, "metadata.yaml") + with open(yaml_file, "w") as f: + f.write(yaml_content) + + with pytest.warns( + DGLWarning, + match="Edge feature is stored, but edge IDs are not saved.", + ): + gb.OnDiskDataset(test_dir, include_original_edge_id=False) + + def test_OnDiskTask_repr_heterogeneous(): item_set = gb.ItemSetDict( { From 6f9db8134567ce24def548f4de3cac5a63a92865 Mon Sep 17 00:00:00 2001 From: Mingbang Wang <100203018+Skeleton003@users.noreply.github.com> Date: Tue, 16 Jan 2024 15:43:57 +0800 Subject: [PATCH 19/22] [Misc] Correct `test_OnDiskDataset_preprocess_homogeneous` (#6922) --- tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index 237ab0c590d1..ef15453463e7 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -1175,9 +1175,9 @@ def test_OnDiskDataset_preprocess_homogeneous(edge_fmt): yaml_file = os.path.join(test_dir, "metadata.yaml") with open(yaml_file, "w") as f: f.write(yaml_content) - # Test do not generate original_edge_id. + # Test generating original_edge_id. output_file = gb.ondisk_dataset.preprocess_ondisk_dataset( - test_dir, include_original_edge_id=False + test_dir, include_original_edge_id=True ) with open(output_file, "rb") as f: processed_dataset = yaml.load(f, Loader=yaml.Loader) @@ -1186,8 +1186,7 @@ def test_OnDiskDataset_preprocess_homogeneous(edge_fmt): ) assert ( fused_csc_sampling_graph.edge_attributes is not None - and gb.ORIGINAL_EDGE_ID - not in fused_csc_sampling_graph.edge_attributes + and gb.ORIGINAL_EDGE_ID in fused_csc_sampling_graph.edge_attributes ) fused_csc_sampling_graph = None From bd74c44c39e037e645dfd3e7e580d4395b211a14 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 16 Jan 2024 03:27:48 -0500 Subject: [PATCH 20/22] [GraphBolt][CUDA] Add `.to()` method to Graph and FeatureStore. (#6957) --- .../impl/fused_csc_sampling_graph.py | 16 +++++- .../impl/torch_based_feature_store.py | 18 +++++++ .../impl/test_fused_csc_sampling_graph.py | 53 ++++++++++++------- .../impl/test_torch_based_feature_store.py | 53 +++++++++++++++++++ 4 files changed, 120 insertions(+), 20 deletions(-) diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 486bd59caccb..8f026b3c5095 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -956,7 +956,21 @@ def to(self, device: torch.device) -> None: # pylint: disable=invalid-name def _to(x): return x.to(device) if hasattr(x, "to") else x - return self._apply_to_members(_to) + def _pin(x): + return x.pin_memory() if hasattr(x, "pin_memory") else x + + # Create a copy of self. 
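+ # The copy shares the original tensors; _apply_to_members below then
+ # moves (or pins) only the copy's members, so `self` is left untouched.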
+ self2 = fused_csc_sampling_graph( + self.csc_indptr, + self.indices, + self.node_type_offset, + self.type_per_edge, + self.node_type_to_id, + self.edge_type_to_id, + self.node_attributes, + self.edge_attributes, + ) + return self2._apply_to_members(_pin if device == "pinned" else _to) def pin_memory_(self): """Copy `FusedCSCSamplingGraph` to the pinned memory in-place.""" diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 9e0718a1d9fe..3952eb0a84b4 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -1,5 +1,6 @@ """Torch-based feature store for GraphBolt.""" +import copy import textwrap from typing import Dict, List @@ -170,6 +171,16 @@ def pin_memory_(self): """In-place operation to copy the feature to pinned memory.""" self._tensor = self._tensor.pin_memory() + def to(self, device): # pylint: disable=invalid-name + """Copy `TorchBasedFeature` to the specified device.""" + # copy.copy is a shallow copy so it does not copy tensor memory. + self2 = copy.copy(self) + if device == "pinned": + self2.pin_memory_() + else: + self2._tensor = self2._tensor.to(device) + return self2 + def __repr__(self) -> str: ret = ( "{Classname}(\n" @@ -255,6 +266,13 @@ def pin_memory_(self): for feature in self._features.values(): feature.pin_memory_() + def to(self, device): # pylint: disable=invalid-name + """Copy `TorchBasedFeatureStore` to the specified device.""" + # copy.copy is a shallow copy so it does not copy tensor memory. + self2 = copy.copy(self) + self2._features = {k: v.to(device) for k, v in self2._features.items()} + return self2 + def __repr__(self) -> str: ret = "{Classname}(\n" + " {features}\n" + ")" features_str = textwrap.indent(str(self._features), " ").strip() diff --git a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py index 3e13a1ce2f5a..b2f240e6279b 100644 --- a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py +++ b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py @@ -1552,25 +1552,46 @@ def create_fused_csc_sampling_graph(): ) +def is_graph_on_device_type(graph, device_type): + assert graph.csc_indptr.device.type == device_type + assert graph.indices.device.type == device_type + assert graph.node_type_offset.device.type == device_type + assert graph.type_per_edge.device.type == device_type + assert graph.csc_indptr.device.type == device_type + for key in graph.edge_attributes: + assert graph.edge_attributes[key].device.type == device_type + + +def is_graph_pinned(graph): + assert graph.csc_indptr.is_pinned() + assert graph.indices.is_pinned() + assert graph.node_type_offset.is_pinned() + assert graph.type_per_edge.is_pinned() + assert graph.csc_indptr.is_pinned() + for key in graph.edge_attributes: + assert graph.edge_attributes[key].is_pinned() + + @unittest.skipIf( F._default_context_str == "cpu", reason="`to` function needs GPU to test.", ) -def test_csc_sampling_graph_to_device(): +@pytest.mark.parametrize("device", ["pinned", "cuda"]) +def test_csc_sampling_graph_to_device(device): # Construct FusedCSCSamplingGraph. graph = create_fused_csc_sampling_graph() # Copy to device. - graph = graph.to("cuda") - - # Check. 
- assert graph.csc_indptr.device.type == "cuda" - assert graph.indices.device.type == "cuda" - assert graph.node_type_offset.device.type == "cuda" - assert graph.type_per_edge.device.type == "cuda" - assert graph.csc_indptr.device.type == "cuda" - for key in graph.edge_attributes: - assert graph.edge_attributes[key].device.type == "cuda" + graph2 = graph.to(device) + + if device == "cuda": + is_graph_on_device_type(graph2, "cuda") + elif device == "pinned": + is_graph_on_device_type(graph2, "cpu") + is_graph_pinned(graph2) + + # The original variable should be untouched. + is_graph_on_device_type(graph, "cpu") @unittest.skipIf( @@ -1584,14 +1605,8 @@ def test_csc_sampling_graph_to_pinned_memory(): # Copy to pinned_memory in-place. graph.pin_memory_() - # Check. - assert graph.csc_indptr.is_pinned() - assert graph.indices.is_pinned() - assert graph.node_type_offset.is_pinned() - assert graph.type_per_edge.is_pinned() - assert graph.csc_indptr.is_pinned() - for key in graph.edge_attributes: - assert graph.edge_attributes[key].is_pinned() + is_graph_on_device_type(graph, "cpu") + is_graph_pinned(graph) @pytest.mark.parametrize("labor", [False, True]) diff --git a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py index 3de5fe3a0a82..be4b43b79461 100644 --- a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py +++ b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py @@ -136,6 +136,59 @@ def test_torch_based_feature(in_memory): feature_a = feature_b = None +def is_feature_store_pinned(store): + for feature in store._features.values(): + assert feature._tensor.is_pinned() + + +def is_feature_store_on_cuda(store): + for feature in store._features.values(): + assert feature._tensor.is_cuda + + +def is_feature_store_on_cpu(store): + for feature in store._features.values(): + assert not feature._tensor.is_cuda + + +@unittest.skipIf( + F._default_context_str == "cpu", + reason="Tests for pinned memory are only meaningful on GPU.", +) +@pytest.mark.parametrize("device", ["pinned", "cuda"]) +def test_feature_store_to_device(device): + with tempfile.TemporaryDirectory() as test_dir: + a = torch.tensor([[1, 2, 4], [2, 5, 3]]) + b = torch.tensor([[[1, 2], [3, 4]], [[2, 5], [3, 4]]]) + write_tensor_to_disk(test_dir, "a", a, fmt="torch") + write_tensor_to_disk(test_dir, "b", b, fmt="numpy") + feature_data = [ + gb.OnDiskFeatureData( + domain="node", + type="paper", + name="a", + format="torch", + path=os.path.join(test_dir, "a.pt"), + ), + gb.OnDiskFeatureData( + domain="edge", + type="paper:cites:paper", + name="b", + format="numpy", + path=os.path.join(test_dir, "b.npy"), + ), + ] + feature_store = gb.TorchBasedFeatureStore(feature_data) + feature_store2 = feature_store.to(device) + if device == "pinned": + is_feature_store_pinned(feature_store2) + elif device == "cuda": + is_feature_store_on_cuda(feature_store2) + + # The original variable should be untouched. 
+ is_feature_store_on_cpu(feature_store) + + @unittest.skipIf( F._default_context_str == "cpu", reason="Tests for pinned memory are only meaningful on GPU.", From c81ff6ad2da04fbf3c4f819744d3906108fe073c Mon Sep 17 00:00:00 2001 From: Ramon Zhou Date: Tue, 16 Jan 2024 18:20:37 +0800 Subject: [PATCH 21/22] [GraphBolt] Fix fanouts setting in rgcn example (#6959) --- examples/sampling/graphbolt/rgcn/hetero_rgcn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/sampling/graphbolt/rgcn/hetero_rgcn.py b/examples/sampling/graphbolt/rgcn/hetero_rgcn.py index 9b1b4bebcf4a..5a664a73a7d6 100644 --- a/examples/sampling/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/sampling/graphbolt/rgcn/hetero_rgcn.py @@ -430,6 +430,7 @@ def evaluate( else: evaluator = MAG240MEvaluator() + num_etype = len(g.num_edges) data_loader = create_dataloader( name, g, @@ -437,7 +438,7 @@ def evaluate( item_set, device, batch_size=4096, - fanouts=[25, 10], + fanouts=[torch.full((num_etype,), 25), torch.full((num_etype,), 10)], shuffle=False, num_workers=num_workers, ) @@ -491,6 +492,7 @@ def train( print("Start to train...") category = "paper" + num_etype = len(g.num_edges) data_loader = create_dataloader( name, g, @@ -498,7 +500,7 @@ def train( train_set, device, batch_size=1024, - fanouts=[25, 10], + fanouts=[torch.full((num_etype,), 25), torch.full((num_etype,), 10)], shuffle=True, num_workers=num_workers, ) From 053c822175b70f7202dd793713476789ec02f8c6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 16 Jan 2024 11:16:13 -0500 Subject: [PATCH 22/22] [GraphBolt][CUDA] Enable GPU sampling in examples (#6861) --- .../sampling/graphbolt/link_prediction.py | 39 ++++++++++----- .../sampling/graphbolt/node_classification.py | 48 ++++++++++++------- .../graphbolt/quickstart/link_prediction.py | 14 ++++-- .../quickstart/node_classification.py | 14 ++++-- 4 files changed, 80 insertions(+), 35 deletions(-) diff --git a/examples/sampling/graphbolt/link_prediction.py b/examples/sampling/graphbolt/link_prediction.py index 45d5e525b28d..20e169b570a5 100644 --- a/examples/sampling/graphbolt/link_prediction.py +++ b/examples/sampling/graphbolt/link_prediction.py @@ -144,6 +144,16 @@ def create_dataloader(args, graph, features, itemset, is_train=True): shuffle=is_train, ) + ############################################################################ + # [Input]: + # 'device': The device to copy the data to. + # [Output]: + # A CopyTo object to copy the data to the specified device. Copying here + # ensures that the rest of the operations run on the GPU. + ############################################################################ + if args.storage_device != "cpu": + datapipe = datapipe.copy_to(device=args.device) + ############################################################################ # [Input]: # 'args.neg_ratio': Specify the ratio of negative to positive samples. @@ -216,7 +226,8 @@ def create_dataloader(args, graph, features, itemset, is_train=True): # [Output]: # A CopyTo object to copy the data to the specified device. 
############################################################################ - datapipe = datapipe.copy_to(device=args.device) + if args.storage_device == "cpu": + datapipe = datapipe.copy_to(device=args.device) ############################################################################ # [Input]: @@ -304,11 +315,11 @@ def train(args, model, graph, features, train_set): optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) dataloader = create_dataloader(args, graph, features, train_set) - for epoch in tqdm.trange(args.epochs): + for epoch in range(args.epochs): model.train() total_loss = 0 start_epoch_time = time.time() - for step, data in enumerate(dataloader): + for step, data in tqdm.tqdm(enumerate(dataloader)): # Get node pairs with labels for loss calculation. compacted_pairs, labels = data.node_pairs_with_labels @@ -366,24 +377,30 @@ def parse_args(): help="Whether to exclude reverse edges during sampling. Default: 1", ) parser.add_argument( - "--device", - default="cpu", - choices=["cpu", "cuda"], - help="Train device: 'cpu' for CPU, 'cuda' for GPU.", + "--mode", + default="pinned-cuda", + choices=["cpu-cpu", "cpu-cuda", "pinned-cuda", "cuda-cuda"], + help="Dataset storage placement and Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", ) return parser.parse_args() def main(args): if not torch.cuda.is_available(): - args.device = "cpu" - print(f"Training in {args.device} mode.") + args.mode = "cpu-cpu" + print(f"Training in {args.mode} mode.") + args.storage_device, args.device = args.mode.split("-") + args.device = torch.device(args.device) # Load and preprocess dataset. print("Loading data") dataset = gb.BuiltinDataset("ogbl-citation2").load() - graph = dataset.graph - features = dataset.feature + + # Move the dataset to the selected storage. + graph = dataset.graph.to(args.storage_device) + features = dataset.feature.to(args.storage_device) + train_set = dataset.tasks[0].train_set args.fanout = list(map(int, args.fanout.split(","))) diff --git a/examples/sampling/graphbolt/node_classification.py b/examples/sampling/graphbolt/node_classification.py index 56c9b9ddbc08..f589e667b455 100644 --- a/examples/sampling/graphbolt/node_classification.py +++ b/examples/sampling/graphbolt/node_classification.py @@ -92,6 +92,19 @@ def create_dataloader( ############################################################################ # [Step-2]: + # self.copy_to() + # [Input]: + # 'device': The device to copy the data to. + # 'extra_attrs': The extra attributes to copy. + # [Output]: + # A CopyTo object to copy the data to the specified device. Copying here + # ensures that the rest of the operations run on the GPU. + ############################################################################ + if args.storage_device != "cpu": + datapipe = datapipe.copy_to(device=device, extra_attrs=["seed_nodes"]) + + ############################################################################ + # [Step-3]: # self.sample_neighbor() # [Input]: # 'graph': The network topology for sampling. @@ -109,7 +122,7 @@ def create_dataloader( ) ############################################################################ - # [Step-3]: + # [Step-4]: # self.fetch_feature() # [Input]: # 'features': The node features. 
@@ -125,17 +138,18 @@ def create_dataloader( datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) ############################################################################ - # [Step-4]: + # [Step-5]: # self.copy_to() # [Input]: # 'device': The device to copy the data to. # [Output]: # A CopyTo object to copy the data to the specified device. ############################################################################ - datapipe = datapipe.copy_to(device=device) + if args.storage_device == "cpu": + datapipe = datapipe.copy_to(device=device) ############################################################################ - # [Step-5]: + # [Step-6]: # gb.DataLoader() # [Input]: # 'datapipe': The datapipe object to be used for data loading. @@ -259,7 +273,7 @@ def evaluate(args, model, graph, features, itemset, num_classes): job="evaluate", ) - for step, data in tqdm(enumerate(dataloader)): + for step, data in tqdm(enumerate(dataloader), "Evaluating"): x = data.node_features["feat"] y.append(data.labels) y_hats.append(model(data.blocks, x)) @@ -289,7 +303,7 @@ def train(args, graph, features, train_set, valid_set, num_classes, model): t0 = time.time() model.train() total_loss = 0 - for step, data in enumerate(dataloader): + for step, data in tqdm(enumerate(dataloader), "Training"): # The input features from the source nodes in the first layer's # computation graph. x = data.node_features["feat"] @@ -349,28 +363,30 @@ def parse_args(): " identical with the number of layers in your model. Default: 10,10,10", ) parser.add_argument( - "--device", - default="cpu", - choices=["cpu", "cuda"], - help="Train device: 'cpu' for CPU, 'cuda' for GPU.", + "--mode", + default="pinned-cuda", + choices=["cpu-cpu", "cpu-cuda", "pinned-cuda", "cuda-cuda"], + help="Dataset storage placement and Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", ) return parser.parse_args() def main(args): if not torch.cuda.is_available(): - args.device = "cpu" - print(f"Training in {args.device} mode.") + args.mode = "cpu-cpu" + print(f"Training in {args.mode} mode.") + args.storage_device, args.device = args.mode.split("-") args.device = torch.device(args.device) # Load and preprocess dataset. print("Loading data...") dataset = gb.BuiltinDataset("ogbn-products").load() - graph = dataset.graph - # Currently the neighbor-sampling process can only be done on the CPU, - # therefore there is no need to copy the graph to the GPU. - features = dataset.feature + # Move the dataset to the selected storage. + graph = dataset.graph.to(args.storage_device) + features = dataset.feature.to(args.storage_device) + train_set = dataset.tasks[0].train_set valid_set = dataset.tasks[0].validation_set test_set = dataset.tasks[0].test_set diff --git a/examples/sampling/graphbolt/quickstart/link_prediction.py b/examples/sampling/graphbolt/quickstart/link_prediction.py index 03d15446fed3..350c541ca92d 100644 --- a/examples/sampling/graphbolt/quickstart/link_prediction.py +++ b/examples/sampling/graphbolt/quickstart/link_prediction.py @@ -18,7 +18,7 @@ ############################################################################ # (HIGHLIGHT) Create a single process dataloader with dgl graphbolt package. ############################################################################ -def create_dataloader(dateset, device, is_train=True): +def create_dataloader(dataset, device, is_train=True): # The second of two tasks in the dataset is link prediction. 
task = dataset.tasks[1] itemset = task.train_set if is_train else task.test_set @@ -26,6 +26,9 @@ def create_dataloader(dateset, device, is_train=True): # Sample seed edges from the itemset. datapipe = gb.ItemSampler(itemset, batch_size=256) + # Copy the mini-batch to the designated device for sampling and training. + datapipe = datapipe.copy_to(device) + if is_train: # Sample negative edges for the seed edges. datapipe = datapipe.sample_uniform_negative( @@ -47,9 +50,6 @@ def create_dataloader(dateset, device, is_train=True): dataset.feature, node_feature_keys=["feat"] ) - # Copy the mini-batch to the designated device for training. - datapipe = datapipe.copy_to(device) - # Initiate the dataloader for the datapipe. return gb.DataLoader(datapipe) @@ -158,6 +158,12 @@ def train(model, dataset, device): print("Loading data...") dataset = gb.BuiltinDataset("cora").load() + # If a CUDA device is selected, we pin the graph and the features so that + # the GPU can access them. + if device == torch.device("cuda:0"): + dataset.graph.pin_memory_() + dataset.feature.pin_memory_() + in_size = dataset.feature.size("node", None, "feat")[0] model = GraphSAGE(in_size).to(device) diff --git a/examples/sampling/graphbolt/quickstart/node_classification.py b/examples/sampling/graphbolt/quickstart/node_classification.py index 823d08d5b447..0c824e880111 100644 --- a/examples/sampling/graphbolt/quickstart/node_classification.py +++ b/examples/sampling/graphbolt/quickstart/node_classification.py @@ -13,10 +13,13 @@ ############################################################################ # (HIGHLIGHT) Create a single process dataloader with dgl graphbolt package. ############################################################################ -def create_dataloader(dateset, itemset, device): +def create_dataloader(dataset, itemset, device): # Sample seed nodes from the itemset. datapipe = gb.ItemSampler(itemset, batch_size=16) + # Copy the mini-batch to the designated device for sampling and training. + datapipe = datapipe.copy_to(device, extra_attrs=["seed_nodes"]) + # Sample neighbors for the seed nodes. datapipe = datapipe.sample_neighbor(dataset.graph, fanouts=[4, 2]) @@ -25,9 +28,6 @@ def create_dataloader(dateset, itemset, device): dataset.feature, node_feature_keys=["feat"] ) - # Copy the mini-batch to the designated device for training. - datapipe = datapipe.copy_to(device) - # Initiate the dataloader for the datapipe. return gb.DataLoader(datapipe) @@ -119,6 +119,12 @@ def train(model, dataset, device): print("Loading data...") dataset = gb.BuiltinDataset("cora").load() + # If a CUDA device is selected, we pin the graph and the features so that + # the GPU can access them. + if device == torch.device("cuda:0"): + dataset.graph.pin_memory_() + dataset.feature.pin_memory_() + in_size = dataset.feature.size("node", None, "feat")[0] out_size = dataset.tasks[0].metadata["num_classes"] model = GCN(in_size, out_size).to(device)
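Taken together, the example updates in this last patch follow a single pattern: place the graph and features in GPU-accessible memory once (`.to("pinned")`, `.to("cuda")`, or the in-place `pin_memory_()`), then move `copy_to` in front of neighbor sampling whenever storage is not plain CPU, so that sampling and feature fetching also run on the device. A condensed sketch of that branching (batch size, fanouts, and feature keys are placeholders):

```python
import dgl.graphbolt as gb

def build_dataloader(itemset, graph, features, storage_device, device):
    datapipe = gb.ItemSampler(itemset, batch_size=1024, shuffle=True)
    if storage_device != "cpu":
        # Graph/features are already GPU-accessible: move the seeds first so
        # neighbor sampling and feature fetching both execute on `device`.
        datapipe = datapipe.copy_to(device=device, extra_attrs=["seed_nodes"])
    datapipe = datapipe.sample_neighbor(graph, [10, 10])
    datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
    if storage_device == "cpu":
        # CPU sampling path: only the finished mini-batch goes to `device`.
        datapipe = datapipe.copy_to(device=device)
    return gb.DataLoader(datapipe)
```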