### Dataset ###

In [1]:
from torch_geometric.datasets import TUDataset
dataset = TUDataset(root="TUDataset", name="MUTAG")


Data(edge_index=[2, 32], x=[16, 14], edge_attr=[32, 3], y=[1])

In [2]:
from sklearn.model_selection import train_test_split
from torch_geometric.data import DataLoader
labels = [data.y.item() for data in dataset]

train_idx, temp_idx = train_test_split(
    list(range(len(dataset))),
    test_size=0.2,              
    stratify=labels,
    random_state=42
)


val_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.5,              
    stratify=[labels[i] for i in temp_idx],
    random_state=42
)

train_dataset = [dataset[i] for i in train_idx]
val_dataset = [dataset[i] for i in val_idx]
test_dataset = [dataset[i] for i in test_idx]

print(f"Training set   = {len(train_dataset)} graphs")
print(f"Validation set = {len(val_dataset)} graphs")
print(f"Test set       = {len(test_dataset)} graphs")

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Training set   = 150 graphs
Validation set = 19 graphs
Test set       = 19 graphs




In [12]:
import torch
from torch import nn
from torch_geometric.nn import GCNConv, global_add_pool, GraphConv, Linear
import torch.nn.functional as F

class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes, dim):
        super(GCN, self).__init__()

        self.dim = dim
        self.num_features = num_features
        self.num_classes = num_classes

        self.conv1 = GraphConv(self.num_features, dim)
        self.conv2 = GraphConv(dim, dim)
        self.conv3 = GraphConv(dim, dim)
        # self.conv4 = GraphConv(dim, dim)
        # self.conv5 = GraphConv(dim, dim)

        self.lin1 = Linear(dim, dim)
        self.lin2 = Linear(dim, self.num_classes)

    def forward(self, x, edge_index, batch = None, edge_weight=None):
        x = self.conv1(x, edge_index, edge_weight).relu()
        x = self.conv2(x, edge_index, edge_weight).relu()
        x = self.conv3(x, edge_index, edge_weight).relu()
        # x = self.conv4(x, edge_index, edge_weight).relu()
        # x = self.conv5(x, edge_index, edge_weight).relu()
        embedding = global_add_pool(x, batch)
        x = self.lin1(embedding).relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        return x, embedding

    
    def predict(self, x, edge_index, batch=None):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        x = x.to(device)
        edge_index = edge_index.to(device)
        if batch is not None:
          batch = batch.to(device)
        out, graph_embedding = self.forward(x, edge_index, batch)
        return out, graph_embedding

In [13]:
gcn = GCN(
    num_features=dataset.num_features,
    dim=32,
    num_classes=dataset.num_classes
)

optimizer = torch.optim.Adam(gcn.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train(train_loader):
    gcn.train()
    for data in train_loader:
        optimizer.zero_grad()
        output, embedding = gcn(data.x, data.edge_index, data.batch)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
    return loss

def test(loader):
    gcn.eval()
    correct = 0
    for data in loader:
        out, embedding = gcn(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)  # Predict the most probable class
        correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)  # Return the accuracy

acc_plt = []
TestAcc_lst = []
loss_lst = []
epoch_lst = []
for epoch in range(500):
    train_loss = train(train_loader)
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")
    acc_plt.append(train_acc)
    TestAcc_lst.append(test_acc)
    epoch_lst.append(epoch)
    loss_lst.append(train_loss)

Epoch 0: Train Loss: 0.6739, Train Acc: 0.7067, Test Acc: 0.7368
Epoch 1: Train Loss: 0.4866, Train Acc: 0.6667, Test Acc: 0.6316
Epoch 2: Train Loss: 0.9794, Train Acc: 0.6667, Test Acc: 0.6316
Epoch 3: Train Loss: 0.5704, Train Acc: 0.6667, Test Acc: 0.6316
Epoch 4: Train Loss: 0.6529, Train Acc: 0.6667, Test Acc: 0.6316
Epoch 5: Train Loss: 0.6232, Train Acc: 0.6667, Test Acc: 0.6316
Epoch 6: Train Loss: 0.5301, Train Acc: 0.7067, Test Acc: 0.6842
Epoch 7: Train Loss: 0.5007, Train Acc: 0.7667, Test Acc: 0.7368
Epoch 8: Train Loss: 0.4349, Train Acc: 0.7933, Test Acc: 0.6842
Epoch 9: Train Loss: 0.5728, Train Acc: 0.8000, Test Acc: 0.6842
Epoch 10: Train Loss: 0.7465, Train Acc: 0.8067, Test Acc: 0.6842
Epoch 11: Train Loss: 0.4588, Train Acc: 0.8067, Test Acc: 0.6842
Epoch 12: Train Loss: 0.5105, Train Acc: 0.8067, Test Acc: 0.6316
Epoch 13: Train Loss: 0.2839, Train Acc: 0.8067, Test Acc: 0.6316
Epoch 14: Train Loss: 0.2646, Train Acc: 0.8067, Test Acc: 0.6842
Epoch 15: Train Loss

In [14]:
def evaluate_accuracy(loader, model):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data in loader:
            out, embedding = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item()
            total += data.num_graphs

    return correct / total
train_acc = evaluate_accuracy(train_loader, gcn)
val_acc = evaluate_accuracy(val_loader, gcn)
test_acc = evaluate_accuracy(test_loader, gcn)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

Train Accuracy: 1.0000
Validation Accuracy: 0.7895
Test Accuracy: 0.8421


In [17]:
torch.save(gcn.state_dict(), "../model/GCN_mutag.pth")

In [16]:
i = 77
pred = gcn.predict(dataset[i].x, dataset[i].edge_index, None)
print(pred[0].argmax(dim=1))
print(dataset[i].y)

tensor([0])
tensor([0])
