# Data Handling of Graphs

A single graph is described by an instance of `torch_geometric.data.Data`, which holds the following attributes by default:
- `data.x` (`[num_nodes, num_node_features]`): Node feature matrix
- `data.edge_index` (`[2, num_edges]`): Graph connectivity in [COO format](https://pytorch.org/docs/stable/sparse.html#sparse-coo-docs) with type `torch.long`
- `data.edge_attr` (`[num_edges, num_edge_features]`): Edge feature matrix
- `data.y` (Node-level targets of shape `[num_nodes, *]` or graph-level targets of shape  `[1, *]`): Target to train against
- `data.pos` (`[num_nodes, num_dimensions]`): Node position matrix

In [1]:
import os
import torch
import torch_geometric
from torch_geometric.data import Data

In [2]:
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch_geometric.__version__)

2.3.1+cu121
12.1
True
2.4.0


In [3]:
# Define source-target nodes of all edges
edge_index  = torch.tensor([[0, 1, 1, 2],            
                            [1, 0, 2, 1]], dtype=torch.long)

x = torch.tensor([[-1],[0],[1]], dtype=torch.float)

data = Data(x=x, edge_index = edge_index)
data

Data(x=[3, 1], edge_index=[2, 4])

If you prefer to write `edge_index` as a list of `(source, target)` index pairs (shape `[num_edges, 2]`), as in the next cell, you should transpose it and call `contiguous()` on it before passing it to the `Data` constructor:

In [4]:
x = torch.tensor([[-1],[0],[1]], dtype=torch.float) # [3,1]

edge_index = torch.tensor([[0,1], # node 0->1
                           [1,0], # node 1->0
                           [1,2],
                           [2,1],
                        ],dtype=torch.long)

data = Data(x=x, edge_index = edge_index.t().contiguous())
data

Data(x=[3, 1], edge_index=[2, 4])

In [5]:
# Check your final Data object
data.validate(raise_on_error=True)

True

Besides holding a number of node-level, edge-level or graph-level attributes, Data provides a number of useful utility functions, e.g.

In [6]:
print(data.keys())

['edge_index', 'x']


In [7]:
print(data['x'])

tensor([[-1.],
        [ 0.],
        [ 1.]])


In [8]:
for key, item in data:
    print(f'{key}: found in data')

x: found in data
edge_index: found in data


In [9]:
'edge_attr' in data

False

Analyzing the graph structure

In [10]:
data.num_nodes

3

In [11]:
data.num_edges

4

In [12]:
data.num_node_features

1

In [13]:
data.has_isolated_nodes()

False

In [14]:
data.has_self_loops()

False

In [15]:
# Transfer the data object to the GPU when one is available, otherwise keep it on the CPU.
# Always build a torch.device so `device` has the same type on both branches.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Transfer to:',device)

data = data.to(device)

Transfer to: cuda


# Common Benchmark Datasets
- [Homogeneous Datasets](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#id19)
- [Heterogeneous Datasets](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#id20)
- [Hypergraph Datasets](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#id21)
- [Synthetic Datasets](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#id22)
- [Graph Generators](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#graph-generators)
- [Motif Generators](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#id24)

In [17]:
from torch_geometric.datasets import TUDataset

In [18]:
# https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.TUDataset.html#torch_geometric.datasets.TUDataset
dataset = TUDataset(name='ENZYMES',root='./tmp/ENZYMES')

In [19]:
len(dataset)

600

In [20]:
dataset.num_classes

6

In [21]:
dataset.num_node_features

3

In [22]:
dataset = dataset.shuffle()

In [23]:
data = dataset[0]
data

Data(edge_index=[2, 74], x=[20, 3], y=[1])

In [24]:
data.is_undirected()

True

In [25]:
train_dataset = dataset[:540]

In [26]:
test_dataset = dataset[540:]

Let’s try another one! Let’s download Cora, the standard benchmark dataset for semi-supervised graph node classification:

In [27]:
from torch_geometric.datasets import Planetoid

In [28]:
dataset = Planetoid(name='Cora',root='./tmp/Cora')

In [29]:
len(dataset) # Number of graphs in the dataset

1

In [30]:
# Query the Cora graph itself: `data` still refers to the ENZYMES sample from In [23] at this point.
dataset[0].num_nodes

2708

In [31]:
dataset.num_classes # Number of classes

7

In [32]:
dataset.num_node_features

1433

In [33]:
# The dataset contains only a single, undirected citation graph
# This time, the Data objects holds a label for each node, and additional node-level attributes
data = dataset[0]
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [34]:
data.is_undirected()

True

In [35]:
data.train_mask.sum().item() # number of nodes to train against (140 nodes)

140

In [36]:
data.val_mask.sum().item() # denotes which nodes to use for validation, e.g., to perform early stopping (500 nodes)

500

In [37]:
data.test_mask.sum().item() # denotes against which nodes to test (1000 nodes).

1000

# Mini-batches

In [38]:
from torch_geometric.loader import DataLoader

In [39]:
dataset = TUDataset(root='./tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

print(len(dataset))
# Number of batches before rounding up, derived from the actual dataset and
# loader instead of hard-coded numbers.
print(len(dataset) / loader.batch_size)

for i, batch in enumerate(loader):
    # The last mini-batch is smaller when the dataset size is not a multiple of batch_size.
    print(i, len(batch))

600
18.75
0 32
1 32
2 32
3 32
4 32
5 32
6 32
7 32
8 32
9 32
10 32
11 32
12 32
13 32
14 32
15 32
16 32
17 32
18 24


In [40]:
batch

DataBatch(edge_index=[2, 2544], x=[658, 21], y=[24], batch=[658], ptr=[25])

In [41]:
batch

DataBatch(edge_index=[2, 2544], x=[658, 21], y=[24], batch=[658], ptr=[25])

In [42]:
batch.num_graphs

24

The `batch` attribute of a mini-batch is a vector that maps each node to the graph it belongs to. You can use it to, e.g., average node features in the node dimension for each graph individually:

In [43]:
from torch_geometric.utils import scatter

In [44]:
dataset = TUDataset(root='./tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# NOTE: this loop rebinds the notebook-level names `data` and `x`.
for data in loader:
    # Group the rows of data.x by the graph assignment in data.batch and average
    # them, giving one feature vector per graph in the mini-batch.
    x = scatter(data.x, data.batch, dim=0, reduce='mean')
    print(x.size())

torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([24, 21])


# Data Transforms

Transforms are a common way in `torchvision` to transform images and perform augmentation. PyG comes with its own transforms, which expect a Data object as input and return a new transformed `Data` object. Transforms can be chained together using `torch_geometric.transforms.Compose` and are applied before saving a processed dataset on disk (`pre_transform`) or before accessing a graph in a dataset (`transform`).

In [45]:
from torch_geometric.datasets import ShapeNet

In [46]:
dataset = ShapeNet(root='./tmp/ShapeNet', categories=['Airplane'])
print(len(dataset))
dataset[0]

2349


Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

In [47]:
import torch_geometric.transforms as T

In [48]:
# Convert the point cloud dataset into a graph dataset by generating a
# nearest-neighbor graph (k=6) from the point clouds.
# pre_transform is applied once, before the processed data is saved to disk.
# T.KNNGraph needs the optional 'torch-cluster' package; if it raises
# "ImportError: 'knn_graph' requires 'torch-cluster'", install the extensions:
#   pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html
# e.g. for a CPU-only install of torch 2.3.0:
#   pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cpu.html
# NOTE(review): root is '/tmp/ShapeNet' here but './tmp/ShapeNet' in the other
# ShapeNet cells - presumably so the dataset is re-processed with pre_transform; confirm.
dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'],
                    pre_transform=T.KNNGraph(k=6))

dataset[0]

Processing...
Done!


Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1], edge_index=[2, 15108])

In addition, we can use the transform argument to randomly augment a Data object, e.g., translating each node position by a small number

In [49]:
# transform is applied on every access and randomly jitters the node positions.
# Reuse the root that was processed with the KNN pre_transform in the previous cell:
# a root that was already processed without it keeps its old processed files, so the
# samples would come back without edge_index.
dataset = ShapeNet(root='/tmp/ShapeNet',
                   categories=['Airplane'],
                   pre_transform=T.KNNGraph(k=6),
                   transform=T.RandomJitter(0.01)
                  )
dataset[0]



Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1], edge_index=[2, 15108])

# Learning Methods on Graphs

In [56]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid

In [57]:
dataset = Planetoid(root='./tmp/Cora',name='Cora')

In [58]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Size of each input node feature vector. Defaults to
            ``dataset.num_node_features`` of the notebook-level ``dataset``.
        hidden_channels: Width of the hidden layer.
        out_channels: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # Fall back to the notebook-level dataset so that a bare ``GCN()`` keeps working.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities over the classes for ``data``."""
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout only takes effect while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [59]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

device: cuda


In [64]:
model = GCN().to(device)

In [65]:
data = dataset[0].to(device)

In [66]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [67]:
# Put the model into training mode; the model's dropout is only active in this mode.
model.train()

for epoch in range(200):
    # Reset the gradients left over from the previous step.
    optimizer.zero_grad()
    out = model(data)
    # Only the nodes selected by train_mask contribute to the loss.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    print(f'epoch={epoch},loss={loss}')
    loss.backward()
    optimizer.step()
    

epoch=0,loss=1.9455114603042603
epoch=1,loss=1.8429951667785645
epoch=2,loss=1.7121249437332153
epoch=3,loss=1.5596548318862915
epoch=4,loss=1.4205514192581177
epoch=5,loss=1.2978447675704956
epoch=6,loss=1.1286779642105103
epoch=7,loss=1.0362197160720825
epoch=8,loss=0.9037310481071472
epoch=9,loss=0.8179407715797424
epoch=10,loss=0.7078883647918701
epoch=11,loss=0.6095179319381714
epoch=12,loss=0.5704272389411926
epoch=13,loss=0.4828247129917145
epoch=14,loss=0.4128057062625885
epoch=15,loss=0.38286420702934265
epoch=16,loss=0.3569647967815399
epoch=17,loss=0.26963409781455994
epoch=18,loss=0.2606673240661621
epoch=19,loss=0.25787153840065
epoch=20,loss=0.21835796535015106
epoch=21,loss=0.22407560050487518
epoch=22,loss=0.16507311165332794
epoch=23,loss=0.17343175411224365
epoch=24,loss=0.15525883436203003
epoch=25,loss=0.13886618614196777
epoch=26,loss=0.13131652772426605
epoch=27,loss=0.11579317599534988
epoch=28,loss=0.14060644805431366
epoch=29,loss=0.11509215831756592
epoch=30,l

In [71]:
# Switch to evaluation mode so that dropout is disabled.
model.eval()
# The model returns log-probabilities (log_softmax output) for every node.
pred = model(data)
pred[0]

tensor([-9.0530e+00, -8.4368e+00, -7.5589e+00, -2.0623e-03, -8.6588e+00,
        -8.4020e+00, -7.1221e+00], device='cuda:0', grad_fn=<SelectBackward0>)

In [73]:
# The predicted class of each node is the index of its largest log-probability.
pred = model(data).argmax(dim=1)
pred[0]

tensor(3, device='cuda:0')

In [74]:
# Count the correctly classified nodes among the nodes selected by test_mask.
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
correct

tensor(808, device='cuda:0')

In [75]:
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.8080
