In [145]:
import torch
import torch_geometric
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

In [146]:
import torch

In [147]:
dataset = TUDataset(root="../data/TUDataset", name='COLLAB')

### About Collab Dataset
COLLAB is a scientific collaboration dataset. A graph corresponds to a researcher’s ego network, i.e., the researcher and its collaborators are nodes and an edge indicates collaboration between two researchers. A researcher’s ego network has three possible labels, i.e., High Energy Physics, Condensed Matter Physics, and Astro Physics, which are the fields that the researcher belongs to. The dataset has 5,000 graphs and each graph has label 0, 1, or 2.
https://paperswithcode.com/dataset/collab 
https://networkrepository.com/COLLAB.php

In [148]:
print(f'Dataset: {dataset}')
print(f'Num Graphs: {len(dataset)}')
print(f'Num Nodes: {dataset.num_nodes}')
print(f'Num classes: {dataset.num_classes}')

Dataset: COLLAB(5000)
Num Graphs: 5000
Num Nodes: 372474
Num classes: 3


In [149]:
torch_geometric.utils.degree(dataset[4].edge_index[0], dataset[4].num_nodes)

# dataset[0].edge_index

tensor([42., 46., 39., 45.,  8., 45., 39., 45., 42., 45.,  7., 45., 45., 45.,
        42., 47., 45., 45., 45., 45., 47., 42., 45., 45., 47., 45., 39., 45.,
        42., 45., 45., 45., 45., 45., 45., 45., 45., 45., 45., 47., 47., 45.,
        45., 45., 45., 45., 45., 42.])

In [150]:
# set CUDA
device = "cuda:0" if (torch.cuda.is_available()) else "cpu"
device

'cuda:0'

In [151]:
max_degree = 0
for data in dataset:
    deg = torch_geometric.utils.degree(data.edge_index[1], num_nodes=data.num_nodes)
    max_degree = max(max_degree, max(deg).item())
# assign to one hot degree for each data (OneHotDegree receive maximum degree parameter)
dataset.transform = torch_geometric.transforms.OneHotDegree(int(max_degree))

In [152]:
cuda_dataset = []
for g in dataset:
    cuda_dataset.append(g.to(torch.device(device), non_blocking=True))

In [154]:
cuda_dataset[0].x.device

device(type='cuda', index=0)

In [155]:
split = 0.8
seed = 123

num_split = round(len(dataset) * split)
torch.manual_seed(seed)
dataset.shuffle()

COLLAB(5000)

In [156]:
train_dataset = cuda_dataset[:num_split]
test_dataset = cuda_dataset[num_split:]
print('Train: ', len(train_dataset))
print('Test: ', len(test_dataset))

Train:  4000
Test:  1000


In [157]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

### Model Construction

In [158]:
from torch.nn import Linear
import torch.nn.functional as F

# Graph neural network models
from torch_geometric.nn import GCNConv

# pooling method (for readout layer)
from torch_geometric.nn import global_mean_pool

In [159]:
class GCN(torch.nn.Module):
    def __init__(self, data, hidden_channels):
        super(GCN, self).__init__()
        # seed
        torch.manual_seed(42)
        self.conv1 = GCNConv(data.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, data.num_classes)
        
    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)        
        
        x = global_mean_pool(x, batch)
        
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

In [160]:
for data in train_loader:
    print(data.x.to(torch.device(device)))
    break

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [161]:
def train(model, loader):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    model.train()
    
    for data in loader:
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return out, loss

def test(model, loader):
    model.eval()
    correct = 0
    for data in loader:
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct/len(loader.dataset)

In [162]:
model = GCN(dataset, 42)
model.to(torch.device(device))

GCN(
  (conv1): GCNConv(492, 42)
  (conv2): GCNConv(42, 42)
  (conv3): GCNConv(42, 42)
  (lin): Linear(in_features=42, out_features=3, bias=True)
)

In [163]:
list_loss = []
list_train_acc = []
list_test_acc = []

for epoch in range(0, 25):
    out, loss = train(model, train_loader)
    train_acc = test(model, train_loader)
    test_acc = test(model, test_loader)
    
    list_train_acc.append(round(train_acc, 4))
    list_test_acc.append(round(test_acc, 4))
    list_loss.append(round(loss.item(), 4))

    print(f"epoch: {epoch+1} train_acc: {train_acc:.4f} loss: {loss:.4f} test_acc: {test_acc:.4f}")

epoch: 1 train_acc: 0.7937 loss: 0.3320 test_acc: 0.4620
epoch: 2 train_acc: 0.7993 loss: 0.6185 test_acc: 0.6540
epoch: 3 train_acc: 0.8043 loss: 0.4405 test_acc: 0.5270
epoch: 4 train_acc: 0.8070 loss: 0.4485 test_acc: 0.6410
epoch: 5 train_acc: 0.8087 loss: 0.3878 test_acc: 0.6620
epoch: 6 train_acc: 0.8340 loss: 0.3902 test_acc: 0.6420
epoch: 7 train_acc: 0.8297 loss: 0.3311 test_acc: 0.5400
epoch: 8 train_acc: 0.8498 loss: 0.4273 test_acc: 0.5910
epoch: 9 train_acc: 0.8417 loss: 0.6867 test_acc: 0.6750
epoch: 10 train_acc: 0.8365 loss: 0.2806 test_acc: 0.5830
epoch: 11 train_acc: 0.8605 loss: 0.2475 test_acc: 0.5760
epoch: 12 train_acc: 0.8670 loss: 0.4175 test_acc: 0.5640
epoch: 13 train_acc: 0.8622 loss: 0.3740 test_acc: 0.6680
epoch: 14 train_acc: 0.8698 loss: 0.3088 test_acc: 0.5400
epoch: 15 train_acc: 0.8710 loss: 0.1967 test_acc: 0.4970
epoch: 16 train_acc: 0.8792 loss: 0.1939 test_acc: 0.6420
epoch: 17 train_acc: 0.8852 loss: 0.4456 test_acc: 0.6760
epoch: 18 train_acc: 0.

Free cuda memory

In [168]:
import gc

model.cpu()
del model
gc.collect()
torch.cuda.empty_cache()