In [6]:
import torch
import torch_geometric
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

In [7]:
import torch

In [8]:
dataset = TUDataset(root="../data/TUDataset", name='COLLAB')

### About Collab Dataset
COLLAB is a scientific collaboration dataset. A graph corresponds to a researcher’s ego network, i.e., the researcher and its collaborators are nodes and an edge indicates collaboration between two researchers. A researcher’s ego network has three possible labels, i.e., High Energy Physics, Condensed Matter Physics, and Astro Physics, which are the fields that the researcher belongs to. The dataset has 5,000 graphs and each graph has label 0, 1, or 2.
https://paperswithcode.com/dataset/collab 
https://networkrepository.com/COLLAB.php

In [9]:
print(f'Dataset: {dataset}')
print(f'Num Graphs: {len(dataset)}')
print(f'Num Nodes: {dataset.num_nodes}')
print(f'Num classes: {dataset.num_classes}')

Dataset: COLLAB(5000)
Num Graphs: 5000
Num Nodes: 372474
Num classes: 3


In [10]:
torch_geometric.utils.degree(dataset[4].edge_index[0], dataset[4].num_nodes)

# dataset[0].edge_index

tensor([42., 46., 39., 45.,  8., 45., 39., 45., 42., 45.,  7., 45., 45., 45.,
        42., 47., 45., 45., 45., 45., 47., 42., 45., 45., 47., 45., 39., 45.,
        42., 45., 45., 45., 45., 45., 45., 45., 45., 45., 45., 47., 47., 45.,
        45., 45., 45., 45., 45., 42.])

In [11]:
# set CUDA
device = "cuda:0" if (torch.cuda.is_available()) else "cpu"
device

'cuda:0'

In [12]:
max_degree = 0
for data in dataset:
    deg = torch_geometric.utils.degree(data.edge_index[1], num_nodes=data.num_nodes)
    max_degree = max(max_degree, max(deg).item())
# assign to one hot degree for each data (OneHotDegree receive maximum degree parameter)
dataset.transform = torch_geometric.transforms.OneHotDegree(int(max_degree))

In [16]:
cuda_dataset = []
for g in dataset:
    cuda_dataset.append(g.to(torch.device(device)))

In [17]:
cuda_dataset[0].x.device

device(type='cuda', index=0)

In [18]:
split = 0.8
seed = 123

num_split = round(len(dataset) * split)
torch.manual_seed(seed)
dataset.shuffle()

COLLAB(5000)

In [19]:
train_dataset = cuda_dataset[:num_split]
test_dataset = cuda_dataset[num_split:]
print('Train: ', len(train_dataset))
print('Test: ', len(test_dataset))

Train:  4000
Test:  1000


In [20]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

### Model Construction

In [49]:
from torch.nn import Linear
import torch.nn.functional as F

# Graph neural network models
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GraphConv

# pooling method (for readout layer)
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import global_max_pool
from torch_geometric.nn import SAGPooling

In [90]:
class GCN(torch.nn.Module):
    def __init__(self, data, hidden_channels):
        super(GCN, self).__init__()
        # seed
        torch.manual_seed(42)
        self.conv1 = GCNConv(data.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, data.num_classes)
        
    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        # x = x.relu()
        # x = self.conv3(x, edge_index)        
        
        x = global_mean_pool(x, batch)
        
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

In [80]:
# https://github.com/inyeoplee77/SAGPool/blob/master/networks.py
# class GCN(torch.nn.Module):
#     def __init__(self, data, hidden_channels):
#         super(GCN, self).__init__()
#         # seed
#         torch.manual_seed(42)
#         self.conv1 = GCNConv(data.num_node_features, hidden_channels)
#         self.pool1 = SAGPooling(hidden_channels, ratio=0.5)
#         self.conv2 = GCNConv(hidden_channels, hidden_channels)
#         self.pool2 = SAGPooling(hidden_channels, ratio=0.5)
#         self.conv3 = GCNConv(hidden_channels, hidden_channels)
#         self.pool3 = SAGPooling(hidden_channels, ratio=0.5)
        
#         self.lin1 = Linear(hidden_channels*2, hidden_channels)
#         self.lin2 = Linear(hidden_channels, hidden_channels//2)
#         self.lin3 = Linear(hidden_channels//2, data.num_classes)

#     def forward(self, x, edge_index, batch):
#         x = self.conv1(x, edge_index)
#         x = x.relu()
#         x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, None, batch)
#         x1 = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)
        
#         x = self.conv2(x, edge_index)
#         x = x.relu()
#         x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, None, batch)
#         x2 = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)
        
#         x = self.conv3(x, edge_index)
#         x, edge_index, _, batch, _, _ = self.pool3(x, edge_index, None, batch)
#         x3 = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)

#         # x = global_max_pool(x, batch)
#         x = x1 + x2 + x3
        
#         x = F.relu(self.lin1(x))
#         x = F.dropout(x, p=0.5, training=self.training)
#         x = F.relu(self.lin2(x))
#         x = self.lin3(x)

#         return x

In [54]:
for data in train_loader:
    print(data.x.to(torch.device(device)))
    break

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [55]:
def train(model, loader):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    model.train()
    
    for data in loader:
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return out, loss

@torch.no_grad()
def test(model, loader):
    model.eval()
    correct = 0
    for data in loader:
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct/len(loader.dataset)

In [83]:
model = GCN(dataset, 64)
model.to(torch.device(device))

GCN(
  (conv1): GCNConv(492, 64)
  (pool1): SAGPooling(GraphConv, 64, ratio=0.5, multiplier=1.0)
  (conv2): GCNConv(64, 64)
  (pool2): SAGPooling(GraphConv, 64, ratio=0.5, multiplier=1.0)
  (conv3): GCNConv(64, 64)
  (pool3): SAGPooling(GraphConv, 64, ratio=0.5, multiplier=1.0)
  (lin1): Linear(in_features=128, out_features=64, bias=True)
  (lin2): Linear(in_features=64, out_features=32, bias=True)
  (lin3): Linear(in_features=32, out_features=3, bias=True)
)

In [85]:
list_loss = []
list_train_acc = []
list_test_acc = []

for epoch in range(0, 10):
    out, loss = train(model, train_loader)
    train_acc = test(model, train_loader)
    test_acc = test(model, test_loader)
    
    list_train_acc.append(round(train_acc, 4))
    list_test_acc.append(round(test_acc, 4))
    list_loss.append(round(loss.item(), 4))

    print(f"epoch: {epoch+1} train_acc: {train_acc:.4f} loss: {loss:.4f} test_acc: {test_acc:.4f}")



epoch: 1 train_acc: 0.7662 loss: 0.5372 test_acc: 0.3930
epoch: 2 train_acc: 0.7740 loss: 0.6605 test_acc: 0.6050
epoch: 3 train_acc: 0.8085 loss: 0.5898 test_acc: 0.4880
epoch: 4 train_acc: 0.8053 loss: 0.6668 test_acc: 0.6180
epoch: 5 train_acc: 0.8033 loss: 0.4897 test_acc: 0.4470
epoch: 6 train_acc: 0.8023 loss: 0.3882 test_acc: 0.4740
epoch: 7 train_acc: 0.8115 loss: 0.3308 test_acc: 0.4630
epoch: 8 train_acc: 0.8080 loss: 0.3953 test_acc: 0.4960
epoch: 9 train_acc: 0.8230 loss: 0.3040 test_acc: 0.4570
epoch: 10 train_acc: 0.8317 loss: 0.2975 test_acc: 0.5230


Free cuda memory

In [86]:
import gc

model.cpu()
del model
gc.collect()
torch.cuda.empty_cache()

In [89]:
for train in train_loader:
    print(train)
    print(train.x)
    
    break

DataBatch(edge_index=[2, 190668], y=[64], num_nodes=4960, x=[4960, 492], batch=[4960], ptr=[65])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
