# Stochastic Training of GNN for Node Classification on Large Heterogeneous Graphs using Multiple GPUs

*Note: this tutorial requires a machine with multiple GPUs.*

This tutorial shows how to train a multi-layer R-GCN for node classification on the `ogbn-mag` dataset provided by OGB using multiple GPUs on a single machine.

At the end of this tutorial you will be able to

* Use multiple GPUs on a single machine to train a GNN model for a large heterogeneous graph.

In [1]:
import numpy as np
import dgl
import torch
import dgl.nn as dglnn
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel
import torch.nn.functional as F
import torch.multiprocessing as mp
import sklearn.metrics
import tqdm

from utils import thread_wrapped_func

Using backend: pytorch


## Load Dataset

We are re-using the same dataset as in the single-GPU heterogeneous mini-batch tutorial.

In [2]:
from ogb.nodeproppred import DglNodePropPredDataset

# Load the `ogbn-mag` dataset through OGB's DGL loader.
dataset = DglNodePropPredDataset(name='ogbn-mag')

# The first (index 0) entry is a (graph, label) pair.
# graph: dgl graph object; label: indexed by node type further down
# (label['paper']).
graph, label = dataset[0]

# Train / validation / test node ids as provided by the dataset's own split.
split_idx = dataset.get_idx_split()
train_nids = split_idx["train"]
valid_nids = split_idx["valid"]
test_nids = split_idx["test"]

In [3]:
print(graph)

# Labels of the 'paper' nodes, flattened to a 1-D tensor.
print('Node labels')
node_labels = label['paper'].flatten()
print('Shape of target node labels:', node_labels.shape)
# Largest label id plus one, converted to a plain Python number.
num_classes = node_labels.max().item() + 1
print('Number of classes:', num_classes)

# Input features stored on the 'paper' nodes; dimension 1 is the feature size.
print('Node features')
node_features = graph.nodes['paper'].data['feat']
num_features = node_features.size(1)
print('Shape of features of paper node type: {}'.format(num_features))

Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'affiliated_with', 'institution'): 1043998, ('author', 'writes', 'paper'): 7145660, ('paper', 'cites', 'paper'): 5416271, ('paper', 'has_topic', 'field_of_study'): 7505078},
      metagraph=[('author', 'institution', 'affiliated_with'), ('author', 'paper', 'writes'), ('paper', 'paper', 'cites'), ('paper', 'field_of_study', 'has_topic')])
Node labels
Shape of target node labels: torch.Size([736389])
Number of classes: 349
Node features
Shape of features of paper node type: 128


In [4]:
# Rebuild the heterogeneous graph so that every kept relation also has an
# explicit reversed copy ("<name>-rev") with source and destination swapped.
# NOTE(review): the ('paper', 'cites', 'paper') relation of the loaded graph is
# not carried over into the rebuilt graph -- confirm that this is intentional.
src_writes, dst_writes = graph.all_edges(etype="writes")
src_topic, dst_topic = graph.all_edges(etype="has_topic")
src_aff, dst_aff = graph.all_edges(etype="affiliated_with")

# Pin the node counts to those of the loaded graph. Without this,
# dgl.heterograph infers each count from the largest node id found in the edge
# lists, so trailing nodes without edges would be dropped and ids taken from
# the dataset split or the feature tensor could fall out of range.
num_nodes_dict = {ntype: graph.number_of_nodes(ntype) for ntype in graph.ntypes}

graph = dgl.heterograph({
    ("author", "writes", "paper"): (src_writes, dst_writes),
    ("paper", "has_topic", "field_of_study"): (src_topic, dst_topic),
    ("author", "affiliated_with", "institution"): (src_aff, dst_aff),
    ("paper", "writes-rev", "author"): (dst_writes, src_writes),
    ("field_of_study", "has_topic-rev", "paper"): (dst_topic, src_topic),
    ("institution", "affiliated_with-rev", "author"): (dst_aff, src_aff),
}, num_nodes_dict=num_nodes_dict)

### Defining data loader for DDP

We need to partition the dataset so that each GPU has its own part.

`nids` is a dictionary mapping node types to node IDs, because the graph is heterogeneous.

In [5]:
def create_dataloader(rank, world_size, graph, nids, fanout, batch_size=1024):
    """Build a neighbor-sampling data loader over this rank's share of the seeds.

    Args:
        rank: Index of the calling worker, in ``[0, world_size)``.
        world_size: Number of workers the seed nodes are split across.
        graph: Graph to sample neighbors from.
        nids: Dictionary mapping node type to the seed node ids of that type.
        fanout: Per-layer fan-out handed to ``MultiLayerNeighborSampler``.
        batch_size: Number of seed nodes per mini-batch (default 1024).

    Returns:
        A ``dgl.dataloading.NodeDataLoader`` over the rank's partition.
    """
    part_nids = {}
    for ntype, ids in nids.items():
        # Equal-sized contiguous slices: every rank receives exactly
        # ``len(ids) // world_size`` seeds, so the trailing
        # ``len(ids) % world_size`` ids are not assigned to any rank.
        # Presumably this keeps the batch count identical on all ranks,
        # which lock-step DDP training needs -- confirm before changing.
        partition_size = len(ids) // world_size
        partition_offset = partition_size * rank
        part_nids[ntype] = ids[partition_offset:partition_offset + partition_size]

    sampler = dgl.dataloading.MultiLayerNeighborSampler(fanout)
    dataloader = dgl.dataloading.NodeDataLoader(
        graph, part_nids, sampler,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=0
    )

    return dataloader


## Defining Model

We reuse the same RGCN model from the previous tutorial.

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.nn as dglnn


In [7]:
class RGCN(nn.Module):
    """Relational GCN made of stacked ``HeteroGraphConv`` layers.

    Each layer holds one ``GraphConv`` per relation in ``rel_names`` and is
    configured with ``aggregate='sum'``.
    """

    def __init__(self, in_feats, n_hidden, n_classes, n_layers, rel_names):
        """Create the layer stack.

        Args:
            in_feats: Input feature size of the first layer.
            n_hidden: Feature size between layers.
            n_classes: Output size of the last layer.
            n_layers: Requested number of layers. An input and an output
                layer are always created, so values below 2 still yield
                two layers.
            rel_names: Relation names; each gets its own ``GraphConv``.
        """
        super().__init__()

        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_classes = n_classes

        def hetero_conv(in_dim, out_dim):
            # One GraphConv per relation, combined with aggregate='sum'.
            per_relation = {
                rel: dglnn.GraphConv(in_dim, out_dim) for rel in rel_names}
            return dglnn.HeteroGraphConv(per_relation, aggregate='sum')

        self.layers = nn.ModuleList()
        # Input layer.
        self.layers.append(hetero_conv(in_feats, n_hidden))
        # Hidden layers: everything between the input and output layers.
        for _ in range(n_layers - 2):
            self.layers.append(hetero_conv(n_hidden, n_hidden))
        # Output layer.
        self.layers.append(hetero_conv(n_hidden, n_classes))

    def forward(self, bipartites, x):
        """Apply the layers in order, pairing each with one sampled block.

        Args:
            bipartites: One block per layer, consumed in layer order.
            x: Dictionary of input node features keyed by node type.

        Returns:
            Dictionary of output tensors keyed by node type.
        """
        last_index = self.n_layers - 1
        for index, (conv, block) in enumerate(zip(self.layers, bipartites)):
            x = conv(block, x)
            if index == last_index:
                # No non-linearity after the final layer.
                continue
            x = {ntype: F.relu(h) for ntype, h in x.items()}
        return x
    
    

class NodeEmbed(nn.Module):
    """Learnable embedding table for each node type listed in ``num_nodes``."""

    def __init__(self, num_nodes, embed_size, rank=0):
        """Create one ``Embedding`` per node type.

        Args:
            num_nodes: Mapping from node type to its number of nodes.
            embed_size: Width of every embedding vector.
            rank: Device that looked-up embeddings are moved to in ``forward``.
        """
        super().__init__()
        self.rank = rank
        self.embed_size = embed_size
        self.node_embeds = nn.ModuleDict()
        for ntype, count in num_nodes.items():
            table = nn.Embedding(count, self.embed_size)
            # Overwrite the default initialisation with uniform values in [-1, 1].
            nn.init.uniform_(table.weight, -1.0, 1.0)
            self.node_embeds[ntype] = table

    def forward(self, node_ids):
        """Look up embeddings for the given ids and move them to ``self.rank``.

        Args:
            node_ids: Mapping from node type to a tensor of node ids.

        Returns:
            Dictionary with the same keys holding the embedding rows.
        """
        return {
            ntype: self.node_embeds[ntype](ids).to(self.rank)
            for ntype, ids in node_ids.items()
        }

## Wrapping model with `DistributedDataParallel`

In [None]:
def init_model(rank, in_feats, n_hidden, n_classes, n_layers, rel_names):
    """Create an RGCN on device ``rank`` and wrap it in DistributedDataParallel."""
    rgcn = RGCN(in_feats, n_hidden, n_classes, n_layers, rel_names)
    rgcn = rgcn.to(rank)
    # find_unused_parameters=True: presumably needed because a mini-batch may
    # not exercise every relation-specific weight -- confirm.
    ddp_options = dict(
        device_ids=[rank],
        output_device=rank,
        find_unused_parameters=True,
    )
    return DistributedDataParallel(rgcn, **ddp_options)

## Defining Training Loop

In [None]:
@thread_wrapped_func
def train(rank, world_size, data):
    """Run one DDP worker: train the RGCN on this rank's share of the seeds.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of worker processes.
        data: Tuple ``(graph, node_features, node_labels, train_nids,
            valid_nids, test_nids, num_features, num_classes)``.

    Side effects:
        Rank 0 prints the training loss every 10 steps and the validation
        accuracy after each epoch, and saves the wrapped model's weights to
        ``model.pt`` whenever the validation accuracy improves.
    """
    torch.distributed.init_process_group(
        backend='gloo',
        init_method='tcp://127.0.0.1:12345',
        world_size=world_size,
        rank=rank)
    torch.cuda.set_device(rank)

    graph, node_features, node_labels, train_nids, valid_nids, test_nids, num_features, num_classes = data
    fanout = [15, 15]
    train_dataloader = create_dataloader(rank, world_size, graph, train_nids, fanout)
    # Validation is not partitioned: the loader spans all validation nodes and
    # only rank 0 iterates over it.
    valid_dataloader = create_dataloader(0, 1, graph, valid_nids, fanout)

    # Every node type except 'paper' gets a learnable embedding as its input.
    num_nodes = {ntype: graph.number_of_nodes(ntype) for ntype in graph.ntypes if ntype != 'paper'}
    num_layers = 2
    hidden_dim = 128
    # NOTE(review): `embed` is created per worker and is not wrapped in DDP, so
    # presumably each process trains its own copy and validation sees only
    # rank 0's -- confirm whether that is intended.
    embed = NodeEmbed(num_nodes, hidden_dim, rank)

    model = init_model(rank, num_features, hidden_dim, num_classes, num_layers, graph.etypes)
    opt = torch.optim.Adam(list(model.parameters()) + list(embed.parameters()))
    torch.distributed.barrier()

    best_accuracy = 0
    best_model_path = 'model.pt'
    for epoch in range(10):
        model.train()

        for step, (input_nodes, output_nodes, bipartites) in enumerate(train_dataloader):
            bipartites = [b.to(rank) for b in bipartites]

            featureless_nodes = {ntype: node_ids for ntype, node_ids in input_nodes.items() if ntype != 'paper'}
            embeddings = embed(featureless_nodes)

            inputs = {'paper': node_features[input_nodes['paper']].to(rank)}
            inputs.update(embeddings)

            labels = node_labels[output_nodes['paper']].to(rank)
            predictions = model(bipartites, inputs)['paper']

            loss = F.cross_entropy(predictions, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

            # No per-step training accuracy: it was never reported and cost a
            # device-to-host copy on every step.
            if rank == 0 and step % 10 == 0:
                print('Epoch {:05d} Step {:05d} Loss {:.04f}'.format(epoch, step, loss.item()))

        # Wait until every rank has finished the epoch before validating.
        torch.distributed.barrier()

        if rank == 0:
            model.eval()
            predictions = []
            labels = []
            with torch.no_grad():
                for input_nodes, output_nodes, bipartites in valid_dataloader:
                    bipartites = [b.to(rank) for b in bipartites]

                    featureless_nodes = {ntype: node_ids for ntype, node_ids in input_nodes.items() if ntype != "paper"}
                    # NodeEmbed already moves its output to `rank`.
                    inputs = {'paper': node_features[input_nodes['paper']].to(rank)}
                    inputs.update(embed(featureless_nodes))

                    labels.append(node_labels[output_nodes['paper']].numpy())
                    predictions.append(model(bipartites, inputs)['paper'].argmax(1).cpu().numpy())
                predictions = np.concatenate(predictions)
                labels = np.concatenate(labels)
                accuracy = sklearn.metrics.accuracy_score(labels, predictions)
                print('Epoch {} Validation Accuracy {}'.format(epoch, accuracy))
                if best_accuracy < accuracy:
                    best_accuracy = accuracy
                    # Save the underlying module so the checkpoint loads without DDP.
                    torch.save(model.module.state_dict(), best_model_path)

        # Keep the other ranks waiting until rank 0 has finished validating.
        torch.distributed.barrier()

## Spawning multiple processes for the Multi GPU training

if __name__ == '__main__':
    # Everything a worker needs, bundled into a single positional argument.
    data = (graph, node_features, node_labels, train_nids, valid_nids,
            test_nids, num_features, num_classes)
    # One training process per GPU; rank == GPU index, world size 4 (4 gpus).
    procs = [
        mp.Process(target=train, args=(proc_id, 4, data))
        for proc_id in range(4)
    ]
    for p in procs:
        p.start()
    # Block until every worker has finished.
    for p in procs:
        p.join()

In [8]:
def init_model(rank, in_feats, n_hidden, n_classes, n_layers, rel_names):
    """Create an RGCN on device ``rank`` and wrap it in DistributedDataParallel."""
    rgcn = RGCN(in_feats, n_hidden, n_classes, n_layers, rel_names)
    rgcn = rgcn.to(rank)
    # find_unused_parameters=True: presumably needed because a mini-batch may
    # not exercise every relation-specific weight -- confirm.
    ddp_options = dict(
        device_ids=[rank],
        output_device=rank,
        find_unused_parameters=True,
    )
    return DistributedDataParallel(rgcn, **ddp_options)

## Defining Training Loop

In [9]:
@thread_wrapped_func
def train(rank, world_size, data):
    """Run one DDP worker: train the RGCN on this rank's share of the seeds.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of worker processes.
        data: Tuple ``(graph, node_features, node_labels, train_nids,
            valid_nids, test_nids, num_features, num_classes)``.

    Side effects:
        Rank 0 prints the training loss every 10 steps and the validation
        accuracy after each epoch, and saves the wrapped model's weights to
        ``model.pt`` whenever the validation accuracy improves.
    """
    torch.distributed.init_process_group(
        backend='gloo',
        init_method='tcp://127.0.0.1:12345',
        world_size=world_size,
        rank=rank)
    torch.cuda.set_device(rank)

    graph, node_features, node_labels, train_nids, valid_nids, test_nids, num_features, num_classes = data
    fanout = [15, 15]
    train_dataloader = create_dataloader(rank, world_size, graph, train_nids, fanout)
    # Validation is not partitioned: the loader spans all validation nodes and
    # only rank 0 iterates over it.
    valid_dataloader = create_dataloader(0, 1, graph, valid_nids, fanout)

    # Every node type except 'paper' gets a learnable embedding as its input.
    num_nodes = {ntype: graph.number_of_nodes(ntype) for ntype in graph.ntypes if ntype != 'paper'}
    num_layers = 2
    hidden_dim = 128
    # NOTE(review): `embed` is created per worker and is not wrapped in DDP, so
    # presumably each process trains its own copy and validation sees only
    # rank 0's -- confirm whether that is intended.
    embed = NodeEmbed(num_nodes, hidden_dim, rank)

    model = init_model(rank, num_features, hidden_dim, num_classes, num_layers, graph.etypes)
    opt = torch.optim.Adam(list(model.parameters()) + list(embed.parameters()))
    torch.distributed.barrier()

    best_accuracy = 0
    best_model_path = 'model.pt'
    for epoch in range(10):
        model.train()

        for step, (input_nodes, output_nodes, bipartites) in enumerate(train_dataloader):
            bipartites = [b.to(rank) for b in bipartites]

            featureless_nodes = {ntype: node_ids for ntype, node_ids in input_nodes.items() if ntype != 'paper'}
            embeddings = embed(featureless_nodes)

            inputs = {'paper': node_features[input_nodes['paper']].to(rank)}
            inputs.update(embeddings)

            labels = node_labels[output_nodes['paper']].to(rank)
            predictions = model(bipartites, inputs)['paper']

            loss = F.cross_entropy(predictions, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

            # No per-step training accuracy: it was never reported and cost a
            # device-to-host copy on every step.
            if rank == 0 and step % 10 == 0:
                print('Epoch {:05d} Step {:05d} Loss {:.04f}'.format(epoch, step, loss.item()))

        # Wait until every rank has finished the epoch before validating.
        torch.distributed.barrier()

        if rank == 0:
            model.eval()
            predictions = []
            labels = []
            with torch.no_grad():
                for input_nodes, output_nodes, bipartites in valid_dataloader:
                    bipartites = [b.to(rank) for b in bipartites]

                    featureless_nodes = {ntype: node_ids for ntype, node_ids in input_nodes.items() if ntype != "paper"}
                    # NodeEmbed already moves its output to `rank`.
                    inputs = {'paper': node_features[input_nodes['paper']].to(rank)}
                    inputs.update(embed(featureless_nodes))

                    labels.append(node_labels[output_nodes['paper']].numpy())
                    predictions.append(model(bipartites, inputs)['paper'].argmax(1).cpu().numpy())
                predictions = np.concatenate(predictions)
                labels = np.concatenate(labels)
                accuracy = sklearn.metrics.accuracy_score(labels, predictions)
                print('Epoch {} Validation Accuracy {}'.format(epoch, accuracy))
                if best_accuracy < accuracy:
                    best_accuracy = accuracy
                    # Save the underlying module so the checkpoint loads without DDP.
                    torch.save(model.module.state_dict(), best_model_path)

        # Keep the other ranks waiting until rank 0 has finished validating.
        torch.distributed.barrier()

## Spawning multiple processes for the Multi GPU training

In [10]:
if __name__ == '__main__':
    # Everything a worker needs, bundled into a single positional argument.
    data = (graph, node_features, node_labels, train_nids, valid_nids,
            test_nids, num_features, num_classes)
    # One training process per GPU; rank == GPU index, world size 4 (4 gpus).
    procs = [
        mp.Process(target=train, args=(proc_id, 4, data))
        for proc_id in range(4)
    ]
    for p in procs:
        p.start()
    # Block until every worker has finished.
    for p in procs:
        p.join()

Epoch 00000 Step 00000 Loss 6.2387
Epoch 00000 Step 00010 Loss 4.9378
Epoch 00000 Step 00020 Loss 4.6241
Epoch 00000 Step 00030 Loss 4.3468
Epoch 00000 Step 00040 Loss 4.1040
Epoch 00000 Step 00050 Loss 3.9855
Epoch 00000 Step 00060 Loss 3.7055
Epoch 00000 Step 00070 Loss 3.5657
Epoch 00000 Step 00080 Loss 3.4811
Epoch 00000 Step 00090 Loss 3.3586
Epoch 00000 Step 00100 Loss 3.2200
Epoch 00000 Step 00110 Loss 3.2090
Epoch 00000 Step 00120 Loss 3.1430
Epoch 00000 Step 00130 Loss 3.1262
Epoch 00000 Step 00140 Loss 3.0292
Epoch 00000 Step 00150 Loss 3.0444
Epoch 0 Validation Accuracy 0.2701336333790595
Epoch 00001 Step 00000 Loss 3.0211
Epoch 00001 Step 00010 Loss 2.9341
Epoch 00001 Step 00020 Loss 2.9294
Epoch 00001 Step 00030 Loss 3.0074
Epoch 00001 Step 00040 Loss 2.8715
Epoch 00001 Step 00050 Loss 2.8772
Epoch 00001 Step 00060 Loss 2.8220
Epoch 00001 Step 00070 Loss 2.8864
Epoch 00001 Step 00080 Loss 2.7738
Epoch 00001 Step 00090 Loss 2.7418
Epoch 00001 Step 00100 Loss 2.6536
Epoch 00

## Conclusion

In this tutorial, you have learned how to train a multi-layer RGCN with neighbor sampling on a large heterogeneous dataset that cannot fit into a single GPU. The method you have learned can scale to a graph of any size, and works on a single machine with multiple GPUs.