### Graph classification

In [1]:
import os
import torch
from torch_geometric.datasets import TUDataset

dataset = TUDataset(root='../datasets/TUDataset', name='MUTAG')

print()
print(f'Dataset: {dataset}')
print("===========")
print(f'# graphs: {len(dataset)}')
print(f'# features: {dataset.num_features}')
print(f'# classes: {dataset.num_classes}')


Dataset: MUTAG(188)
# graphs: 188
# features: 7
# classes: 2


In [2]:
data = dataset[0]

print()
print(data)
print('=============')

# gather some info
print(f'# nodes:{data.num_nodes}')
print(f'# edges:{data.num_edges}')
print(f'avg node degree: {data.num_edges/data.num_nodes:.2f}')
print(f'has isolated nodes: {data.has_isolated_nodes()}')
print(f'has self-loops:{data.has_self_loops()}')
print(f'is undirected:{data.is_undirected()}')


Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
# nodes:17
# edges:38
avg node degree: 2.24
has isolated nodes: False
has self-loops:False
is undirected:True


In [3]:
torch.manual_seed(12345)
dataset = dataset.shuffle()

train_dataset = dataset[:150]
test_dataset = dataset[150:]

print(f'# training graphs: {len(train_dataset)}')
print(f'# testing graphs: {len(test_dataset)}')

# training graphs: 150
# testing graphs: 38


### mini-batching of graphs

PyG parallelization across multiple graphs: stack adjacency matrices in a diagonal fashion to create a giant graph that holds multiple isolated subgraphs. Node and target features are simply concatenated in the node dimension. 

- PyG batches multiple graphs into a single giant graph

- The "ptr" attribute in the DataBatch object stores the cumulative number of nodes in the graphs at each batch index.

- The "batch" vector/atttribute in the DataBatch object stores the graph number among the total # graphs


In [4]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

for step, data in enumerate(train_loader):
    print(f'# graphs in current batch: {data.num_graphs} and step:{step+1}')
    print(data)
    # print(data.ptr)
    # print(type(data.ptr))
    # print(type(data.batch))

    # slice the batchgraph into individual graphs using data.ptr 
    # print(data.batch[:data.ptr[1]])
    # print(data.batch[data.ptr[1]:data.ptr[2]])

# graphs in current batch: 64 and step:1
DataBatch(edge_index=[2, 2636], x=[1188, 7], edge_attr=[2636, 4], y=[64], batch=[1188], ptr=[65])
# graphs in current batch: 64 and step:2
DataBatch(edge_index=[2, 2506], x=[1139, 7], edge_attr=[2506, 4], y=[64], batch=[1139], ptr=[65])
# graphs in current batch: 22 and step:3
DataBatch(edge_index=[2, 852], x=[387, 7], edge_attr=[852, 4], y=[22], batch=[387], ptr=[23])


### GNN model for graph classification

In [5]:
dataset.num_features, dataset.num_classes

(7, 2)

In [11]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__() # python3: calls parent class w/o explicitly specifying the parent class name [best practice]
        # super(GCN,self).__init__() # python2 & early python3
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)
    
    def forward(self,x,edge_index,batch):
        # obtain node embeddings
        x = self.conv1(x,edge_index)
        x = x.relu()
        x = self.conv2(x,edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        
        # readout layer 
        x = global_mean_pool(x,batch) # [batch_size, hidden_channels]

        # apply final classifier 
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        # print(x.shape)

        return x #[batch_size, num_classes] ==> [64,2]

model1 = GCN(hidden_channels=64)
print(model1) # shows only the structure defined in the __init__ method. 

GCN(
  (conv1): GCNConv(7, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [7]:
len(train_loader), len(train_loader.dataset)

(3, 150)

In [12]:
model = GCN(hidden_channels=64)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss() # classification problems

def train():
    model.train()

    for data in train_loader:
        out = model(data.x, data.edge_index,data.batch)
        loss = criterion(out, data.y)
        loss.backward() # derive gradients
        optimizer.step() # update parameters based on gradients
        optimizer.zero_grad() # clear gradients

def test(loader):
    model.eval()

    correct = 0
    for data in loader:
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct/len(loader.dataset)

max_test_acc = 0
for epoch in range(1,171):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    max_test_acc = test_acc if test_acc > max_test_acc else max_test_acc
    print(f'Epoch: {epoch:03d}, train acc: {train_acc:.4f}, test acc: {test_acc:.4f}')
print(max_test_acc)


GCN(
  (conv1): GCNConv(7, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)
Epoch: 001, train acc: 0.6467, test acc: 0.7368
Epoch: 002, train acc: 0.6467, test acc: 0.7368
Epoch: 003, train acc: 0.6467, test acc: 0.7368
Epoch: 004, train acc: 0.6467, test acc: 0.7368
Epoch: 005, train acc: 0.6467, test acc: 0.7368
Epoch: 006, train acc: 0.6533, test acc: 0.7105
Epoch: 007, train acc: 0.6467, test acc: 0.7368
Epoch: 008, train acc: 0.6467, test acc: 0.7368
Epoch: 009, train acc: 0.6467, test acc: 0.7368
Epoch: 010, train acc: 0.6467, test acc: 0.7368
Epoch: 011, train acc: 0.6467, test acc: 0.7368
Epoch: 012, train acc: 0.6467, test acc: 0.7368
Epoch: 013, train acc: 0.6400, test acc: 0.7368
Epoch: 014, train acc: 0.6400, test acc: 0.7105
Epoch: 015, train acc: 0.6400, test acc: 0.7105
Epoch: 016, train acc: 0.6267, test acc: 0.7895
Epoch: 017, train acc: 0.6467, test acc: 0.6842
Epoch: 018, train acc: 0.6467, test ac

- neighborhood normalization decreases the expressivity of GNNs in distinguishing certain graph structures
- use GraphConv instead of GCNConv

- Note: Using GraphConv instead of GCNConv improved test accuracy from 81% to 84%

In [13]:
from torch_geometric.nn import GraphConv

class GNN(torch.nn.Module):
    def __init__(self,hidden_channels):
        super(GNN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GraphConv(dataset.num_features,hidden_channels)
        self.conv2 = GraphConv(hidden_channels,hidden_channels)
        self.conv3 = GraphConv(hidden_channels,hidden_channels)
        self.lin = Linear(hidden_channels,dataset.num_classes)
    
    def forward(self,x,edge_index,batch):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        x = global_mean_pool(x, batch)

        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
    
        return x

model2 = GNN(hidden_channels=64)
print(model2)

GNN(
  (conv1): GraphConv(7, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [14]:
model = GNN(hidden_channels=64)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

max_test_acc = 0 
for epoch in range(1,201):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    max_test_acc = test_acc if test_acc > max_test_acc else max_test_acc
    print(f'Epoch: {epoch:03d}, train_acc: {train_acc:.4f}, test_acc: {test_acc:.4f}')
print(max_test_acc)

GNN(
  (conv1): GraphConv(7, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)
Epoch: 001, train_acc: 0.6467, test_acc: 0.7368
Epoch: 002, train_acc: 0.6467, test_acc: 0.7105
Epoch: 003, train_acc: 0.5000, test_acc: 0.3158
Epoch: 004, train_acc: 0.6533, test_acc: 0.7368
Epoch: 005, train_acc: 0.6467, test_acc: 0.7368
Epoch: 006, train_acc: 0.6467, test_acc: 0.7368
Epoch: 007, train_acc: 0.6467, test_acc: 0.7368
Epoch: 008, train_acc: 0.6467, test_acc: 0.7368
Epoch: 009, train_acc: 0.6467, test_acc: 0.7368
Epoch: 010, train_acc: 0.6467, test_acc: 0.7368
Epoch: 011, train_acc: 0.6467, test_acc: 0.7368
Epoch: 012, train_acc: 0.6600, test_acc: 0.7895
Epoch: 013, train_acc: 0.6400, test_acc: 0.8158
Epoch: 014, train_acc: 0.6933, test_acc: 0.7368
Epoch: 015, train_acc: 0.6800, test_acc: 0.7368
Epoch: 016, train_acc: 0.6733, test_acc: 0.7105
Epoch: 017, train_acc: 0.6733, test_acc: 0.7895
Epoch: 018, train_acc: 0.6667, t

### comparing global max and mean pool operations

In [None]:
from torch_geometric.nn import global_max_pool, global_mean_pool
import torch

In [None]:
a = torch.randint(1,10,size=(4,3))
# batch = torch.tensor([0,0,1,1])
batch =  torch.zeros(4).long()
print(a,batch)

In [None]:
global_max_pool(a,batch), global_mean_pool(a,batch)