# PyTorch Geometric tutorials

## Data handling of Graphs
PyG 中使用Data实例来存储一个图，以下实例介绍图的初始化及其包含的属性！

In [1]:
import torch
from torch_geometric.data import Data

# Graph connectivity in COO format, shape [2, num_edges]: every column is one
# edge given as a pair of node indices. The two undirected edges 0-1 and 1-2
# are stored in both directions, hence four columns.
# NOTE(review): node indices are expected to index rows of `x`, i.e. lie in
# [0, num_nodes - 1] -- confirm against the PyG documentation.
edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)
# Node feature matrix: 3 nodes with 1 feature each.
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

# Bundle node features and connectivity into one graph object.
data = Data(x=x, edge_index=edge_index)

注意：如果边不是以COO形式给出的而是以节点对的形式，需要先转置`t()`再利用函数`contiguous()`，例如以上示例：

In [2]:
edge_index = torch.tensor([[0,1], [1,0], [1,2], [2,1]], dtype=torch.long)
edge_index.t().contiguous()

tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])

In [3]:
data # 2, 4

Data(edge_index=[2, 4], x=[3, 1])

In [4]:
data.x

tensor([[-1.],
        [ 0.],
        [ 1.]])

In [5]:
data.y

In [6]:
data.edge_index

tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])

In [7]:
data.edge_attr

In [8]:
data.pos

In [9]:
for key, item in data:
    print(key, item)

edge_index tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])
x tensor([[-1.],
        [ 0.],
        [ 1.]])


In [10]:
data.num_nodes

3

In [11]:
data.num_edges

4

In [12]:
data.num_node_features

1

In [13]:
data.contains_isolated_nodes()

False

In [14]:
data.contains_self_loops()

False

In [15]:
data.is_directed()

False

In [16]:
# transfer data to GPU
if torch.cuda.is_available():
    data = data.to(torch.device('cuda'))

## Common Benchmark Datasets
- Planetoid datasets
    - Cora
    - Citeseer
    - Pubmed
- Graph classification datasets.
- QM7 and QM9 datasets.
- 3D mesh/point

### TUDataset

In [17]:
from torch_geometric.datasets import TUDataset

dataset = TUDataset(root='./data', name='ENZYMES')

In [18]:
len(dataset) # 600 graphs for graph classification.

600

In [19]:
dataset.num_classes # 6 classes

6

In [20]:
dataset.num_node_features

3

In [21]:
dataset.num_edge_attributes

0

In [22]:
# First graph of the dataset: 37 nodes with 3 features each and 168 columns in
# edge_index (168 / 2 = 84 undirected edges). y has shape [1], i.e. the graph
# carries exactly one class label.
data = dataset[0]
data

Data(edge_index=[2, 168], x=[37, 3], y=[1])

In [23]:
data.is_directed()

False

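Split the dataset: 90/10 train/test (note: the split below is taken before any shuffling, so it simply follows the stored order of the graphs)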

In [24]:
train_dataset = dataset[:540]
test_dataset = dataset[540:]
train_dataset

ENZYMES(540)

Dataset permutation

In [25]:
dataset = dataset.shuffle() # dataset = dataset[torch.randperm(len(dataset))]
dataset[0]

Data(edge_index=[2, 104], x=[30, 3], y=[1])

### Cora dataset

In [26]:
from torch_geometric.datasets import Planetoid 
dataset = Planetoid(root='./data', name='Cora')

In [27]:
len(dataset)

1

In [28]:
data = dataset[0]
data

Data(edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])

In [29]:
data.is_undirected()

True

In [30]:
data.train_mask

tensor([ True,  True,  True,  ..., False, False, False])

In [31]:
data.train_mask.sum().item()

140

In [32]:
data.val_mask.sum().item()

500

In [33]:
data.test_mask.sum().item()

1000

In [34]:
data.y

tensor([3, 4, 4,  ..., 3, 3, 3])

- `train_mask` 训练的节点
- `val_mask` 验证节点
- `test_mask` 测试节点

## Mini-batches

神经网络通常会按照`batch`方式进行训练，PyG 通过构建稀疏化的分块对角阵实现`mini-batch`的并行化，构建方式按照每一个Data实例的`edge_index`构建一个图的邻接矩阵，然后将所有节点的特征向量按行拼接。使得不同数量的顶点数和边数的图可以一起训练。可以使用PyG内部的 `torch_geometric.data.DataLoader`进行图拼接的过程！
$$\mathbf{A}=\left[\begin{array}{ccc}
\mathbf{A}_{1} & & \\
& \ddots & \\
& & \mathbf{A}_{n}
\end{array}\right], \quad \mathbf{X}=\left[\begin{array}{c}
\mathbf{X}_{1} \\
\vdots \\
\mathbf{X}_{n}
\end{array}\right], \quad \mathbf{Y}=\left[\begin{array}{c}
\mathbf{Y}_{1} \\
\vdots \\
\mathbf{Y}_{n}
\end{array}\right]$$

In [35]:
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader

dataset = TUDataset(root='./data', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
len(loader)

19

In [36]:
for batch in loader:
    print(batch)

Batch(batch=[1034], edge_index=[2, 3968], x=[1034, 21], y=[32])
Batch(batch=[1057], edge_index=[2, 4196], x=[1057, 21], y=[32])
Batch(batch=[1034], edge_index=[2, 4078], x=[1034, 21], y=[32])
Batch(batch=[1167], edge_index=[2, 4186], x=[1167, 21], y=[32])
Batch(batch=[1096], edge_index=[2, 4132], x=[1096, 21], y=[32])
Batch(batch=[1176], edge_index=[2, 4286], x=[1176, 21], y=[32])
Batch(batch=[1070], edge_index=[2, 4226], x=[1070, 21], y=[32])
Batch(batch=[1077], edge_index=[2, 4114], x=[1077, 21], y=[32])
Batch(batch=[1085], edge_index=[2, 4276], x=[1085, 21], y=[32])
Batch(batch=[949], edge_index=[2, 3546], x=[949, 21], y=[32])
Batch(batch=[1049], edge_index=[2, 3944], x=[1049, 21], y=[32])
Batch(batch=[940], edge_index=[2, 3690], x=[940, 21], y=[32])
Batch(batch=[1298], edge_index=[2, 4444], x=[1298, 21], y=[32])
Batch(batch=[935], edge_index=[2, 3650], x=[935, 21], y=[32])
Batch(batch=[982], edge_index=[2, 3866], x=[982, 21], y=[32])
Batch(batch=[1022], edge_index=[2, 3908], x=[102

`batch`大小为32个图，但是每一个图的规模是不一样的，上例中第一个batch内的32个图共1034个节点，`edge_index`含有3968列（即3968条有向边）。由于`shuffle=True`，每次运行得到的数值会不同。

`torch_geometric.data.Batch`继承`torch_geometric.data.Data`，并且添加了一个额外的属性`batch`。`batch`是一个列向量，代表了每一个节点对应到哪一个图。
$$\text { batch }=\left[\begin{array}{ccccccccc}
0 & \cdots & 0 & 1 & \cdots & n-2 & n-1 & \cdots & n-1
\end{array}\right]^{\top}$$

可以根据`Batch`对每个图中的节点特征进行平均化，其中使用到 `scatter` 库，以每一个图为单位，将各个图中的所有节点的特征向量计算了一个平均值，所以维度为`[32, 21]`。

In [37]:
from torch_scatter import scatter_mean
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader

# Load ENZYMES with use_node_attr=True; in the outputs shown, x then has 21
# columns per node instead of 3.
dataset = TUDataset(root='./data', name='ENZYMES', use_node_attr=True)
# Iterate over the dataset in shuffled mini-batches of 32 graphs.
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for data in loader:
    print(data)
    print(data.num_graphs)
    # Average the node features of each graph: data.batch is used as the group
    # index for the rows of data.x (dim=0), giving one row per graph.
    x = scatter_mean(data.x, data.batch, dim=0)
    print(x.size())

Batch(batch=[971], edge_index=[2, 3762], x=[971, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1214], edge_index=[2, 4626], x=[1214, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1102], edge_index=[2, 4184], x=[1102, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1076], edge_index=[2, 3630], x=[1076, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1044], edge_index=[2, 3978], x=[1044, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1152], edge_index=[2, 4216], x=[1152, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1053], edge_index=[2, 4152], x=[1053, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1018], edge_index=[2, 3876], x=[1018, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1076], edge_index=[2, 4082], x=[1076, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[941], edge_index=[2, 3586], x=[941, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1119], edge_index=[2, 4292], x=[1119, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1026], edge_index=[2, 3

## Data Transforms
PyG 中的数据变换与 `torchvision` 中图片变换与扩充类似。多个变换操作可以使用`torch_geometric.transforms.Compose`组合在一起！

下面以ShapeNet（17000 3D点云，16种形状） 数据集为例，下载数据时可以根据 kNN 对点云进行图构造：

In [38]:
from torch_geometric.datasets import ShapeNet
dataset = ShapeNet(root='./data/ShapeNet', categories=['Airplane'])
dataset

ShapeNet(2349, categories=['Airplane'])

In [39]:
dataset[0]

Data(category=[1], pos=[2518, 3], x=[2518, 3], y=[2518])

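可以根据KNN构造点云图：`pre_transform`只在首次处理数据集时执行并把结果保存到磁盘，下次使用时不需要再进行图构造！注意：如果数据集此前已经用不同的`pre_transform`处理过，需要先删除已处理数据所在的目录才会重新构图（见下方警告，此时输出的`Data`中没有`edge_index`）。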

In [40]:
import torch_geometric.transforms as T
dataset = ShapeNet(root='./data/ShapeNet', categories=['Airplane'],
                    pre_transform=T.KNNGraph(k=6),
                    transform=T.RandomTranslate(0.01))
dataset[0]

  'sure to delete `{}` first.'.format(self.processed_dir))


Data(category=[1], pos=[2518, 3], x=[2518, 3], y=[2518])

## Learning Methods on Graphs

在学习PyG的数据处理与图变换方法后，接下来就依赖 PyG 实现 GNN了！

In [41]:
# 加载数据集
from torch_geometric.datasets import Planetoid
dataset = Planetoid(root='./data', name='Cora')
dataset

Cora()

In [42]:
# 2-layers GCN
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Size of each input node feature vector. When ``None``
            (the default), ``dataset.num_node_features`` of the module-level
            ``dataset`` is used, which matches the original behaviour.
        hidden_channels: Width of the hidden representation produced by the
            first convolution. Defaults to 16.
        out_channels: Number of output classes. When ``None`` (the default),
            ``dataset.num_classes`` of the module-level ``dataset`` is used.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # Fall back to the notebook-level dataset so that ``Net()`` keeps
        # working exactly as before.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes

        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Args:
            data: Graph object providing ``x`` (node features) and
                ``edge_index`` (connectivity).

        Returns:
            Tensor with one row per node holding log-softmax scores
            (normalised over dim 1).
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        # Log-probabilities, to be paired with F.nll_loss during training.
        return F.log_softmax(x, dim=1)

In [43]:
# Training
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
# Cora consists of a single graph, so dataset[0] is the whole dataset.
data = dataset[0].to(device)
# Adam optimizer with weight decay as regularisation.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training mode: the dropout inside Net.forward is active.
model.train()
for epoch in range(200):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()
    # Full-batch forward pass over all nodes of the graph.
    out = model(data)
    # The loss is computed on the training nodes only; nll_loss matches the
    # log_softmax output of the model.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [44]:
# Evaluation
# Switch to evaluation mode so that dropout is disabled.
model.eval()
# Gradients are not needed for inference; no_grad avoids building the
# autograd graph and saves memory.
with torch.no_grad():
    # Predicted class = index of the largest log-probability per node.
    pred = model(data).argmax(dim=1)
# Count correct predictions on the held-out test nodes only.
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.7970
