In [None]:
import dgl
import numpy as np
import torch as th

from load_graph import load_reddit, load_ogb

# Copy and load the graph

Copy the OGB product graph downloaded in Section 4 to the local folder.

In [None]:
!mkdir -p dataset/ogbn_products_dgl
!cp ../4-large_graph/ogbn-products.zip dataset/ogbn_products_dgl
!unzip dataset/ogbn_products_dgl/ogbn-products.zip -d dataset/ogbn_products_dgl

Load the OGB product graph into DGL.

You will see the prompt `Will you update the dataset now? (y/N)`. Please answer `N` to avoid downloading the dataset again from the official website.

In [None]:
# Name of the OGB dataset to load; the partitioning cells below reuse it as
# the graph name of the partitioned output.
graph_name = 'ogbn-products'
# Alternative dataset (kept for reference, not loaded):
# graph_name = 'ogbn-papers100M'
# load_ogb returns a pair; only the first element (the graph) is used here.
g, _ = load_ogb(graph_name)

In [None]:
# Report the size of the loaded graph, one labelled count per line.
for label, count in (('#nodes:', g.number_of_nodes()),
                     ('#edges:', g.number_of_edges())):
    print(label, count)

The graph has node features and labels. In addition, its node data have three mask arrays to indicate whether a node belongs to a training, validation or testing set.

In [None]:
# Show the names of all per-node data fields stored on the graph.
print([field for field in g.ndata.keys()])

This dataset doesn't have any edge data.

In [None]:
# Show the names of all per-edge data fields stored on the graph.
print([field for field in g.edata.keys()])

# Partition a graph with one partition

This converts the OGB product graph into the DGL format for distributed training. It is used for the standalone mode. 

In [None]:
# Write the whole graph out as a single partition under 'standalone_data'.
# No balancing arguments are passed to this call, so the train mask is not
# looked up here; the previous unused `balance_ntypes` assignment was dropped.
dgl.distributed.partition_graph(g, graph_name=graph_name, num_parts=1, out_path='standalone_data')

# Partition a graph with 4 partitions

This partitions the graph into 4 parts and balances the number of training-set nodes, as well as the number of edges, across the partitions.

In [None]:
# The training mask is handed to the partitioner as `balance_ntypes`, and
# `balance_edges=True` is set alongside it; output goes to '4part_data'.
balance_ntypes = g.ndata['train_mask']
dgl.distributed.partition_graph(
    g,
    graph_name=graph_name,
    num_parts=4,
    out_path='4part_data',
    balance_ntypes=balance_ntypes,
    balance_edges=True,
)