In [1]:
import torch
import torch_geometric

from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

In [None]:
import matplotlib as plt
import numpy as np
import networkx
from torch_geometric.data import Data
from sklearn.manifold import TSNE


In [None]:
from torch_geometric.utils import to_dense_adj

### Data Preprocessing

Dataset import

In [None]:
dataset = TUDataset(root="../dataset", name='PROTEINS')

In [None]:
insp = 3
print(dataset[insp+851])
print(dataset[insp+213])
print(dataset[insp+2]) 

Create complementary graph

In [None]:
adj_o = to_dense_adj(dataset[2].edge_index)
adj_c = abs(to_dense_adj(dataset[2].edge_index) - 1) - torch.eye(len(dataset[2].x))

print("Original:", adj_o)
print("Complementary:", (adj_c))

In [None]:
print(adj_o[0].nonzero().t().contiguous() == dataset[2].edge_index)

In [None]:
adj_c[0].nonzero().t().contiguous()

In [None]:
def toComplementary(g):
    c = abs(to_dense_adj(g.edge_index) - 1) - torch.eye(len(g.x))
    c = c[0].nonzero().t().contiguous()
    return c

In [None]:
dataset_c = []
for graph in dataset:
    edge_c = toComplementary(graph)
    dataset_c.append(Data(edge_index=edge_c, x=graph.x, y=graph.y))

In [None]:
print(dataset_c[5])
print(dataset[5])

In [None]:
torch.manual_seed(42)
len((dataset.shuffle()).y)

In [293]:
# dataset = TUDataset(root="../dataset", name='MUTAG')

Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Extracting ..\dataset\MUTAG\MUTAG.zip
Processing...
Done!


Train test split

In [294]:
split = 0.8
seed = 123

num_split = round(len(dataset) * split)
# dataset.shuffle()

In [295]:
train_dataset = dataset[:num_split]
test_dataset = dataset[num_split:]
print('Train: ', len(train_dataset))
print('Test: ', len(test_dataset))

Train:  150
Test:  38


In [296]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [297]:
for d in train_loader:
    print(d.y)
    break

tensor([0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 0])


### Base Model (GCN)

In [298]:
from torch_geometric.nn import GCNConv
from torch.nn import Linear
from torch.nn import Linear
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import global_max_pool
from torch_geometric.nn import global_add_pool
import torch.nn.functional as F

In [299]:
class GCN(torch.nn.Module):
    def __init__(self, dataset, hidden_channels):
        super(GCN, self).__init__()
        
        # weight seed
        torch.manual_seed(42)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes) # for final classification

    def forward(self, x, edge_index, batch):
        # step 1. get node embedding using GCNConv layer
        x = self.conv1(x, edge_index)
        x = x.relu() # apply relu activation after conv
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # step 2. add readout layer to aggregate all node features of graph
        e = global_add_pool(x, batch)

        # apply classifier (using linear)
        x = F.dropout(e, p=0.5, training=self.training)
        x = self.lin(x)

        return x, e

In [300]:
def train(model, loader):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    model.train()
    
    for data in loader:
        out, z = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return out, loss

@torch.no_grad()
def test(model, loader):
    model.eval()
    correct = 0
    for data in loader:
        out, z = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct/len(loader.dataset), z

In [301]:
model = GCN(dataset, 64)
# dataset.num_node_features

In [302]:
list_loss = []
list_train_acc = []
list_test_acc = []
z = None

for epoch in range(0, 100):
    out, loss = train(model, train_loader)
    train_acc, z = test(model, train_loader)
    test_acc, z = test(model, test_loader)
    
    list_train_acc.append(round(train_acc, 4))
    list_test_acc.append(round(test_acc, 4))
    list_loss.append(round(loss.item(), 4))

    print(f"epoch: {epoch+1} train_acc: {train_acc:.4f} loss: {loss:.4f} test_acc: {test_acc:.4f}")

epoch: 1 train_acc: 0.6600 loss: 0.9123 test_acc: 0.6842
epoch: 2 train_acc: 0.6600 loss: 0.5043 test_acc: 0.6842
epoch: 3 train_acc: 0.6600 loss: 0.5662 test_acc: 0.6842
epoch: 4 train_acc: 0.6600 loss: 0.6777 test_acc: 0.6842
epoch: 5 train_acc: 0.6600 loss: 0.7570 test_acc: 0.6842
epoch: 6 train_acc: 0.7067 loss: 0.6504 test_acc: 0.7105
epoch: 7 train_acc: 0.6600 loss: 0.6339 test_acc: 0.7105
epoch: 8 train_acc: 0.6933 loss: 0.5130 test_acc: 0.7105
epoch: 9 train_acc: 0.6600 loss: 0.8391 test_acc: 0.6842
epoch: 10 train_acc: 0.7000 loss: 0.6510 test_acc: 0.7105
epoch: 11 train_acc: 0.7467 loss: 0.4059 test_acc: 0.7105
epoch: 12 train_acc: 0.6600 loss: 0.4858 test_acc: 0.6842
epoch: 13 train_acc: 0.7000 loss: 0.5843 test_acc: 0.7105
epoch: 14 train_acc: 0.6600 loss: 0.5435 test_acc: 0.7105
epoch: 15 train_acc: 0.6867 loss: 0.6472 test_acc: 0.7105
epoch: 16 train_acc: 0.6800 loss: 0.7247 test_acc: 0.7105
epoch: 17 train_acc: 0.6800 loss: 0.4440 test_acc: 0.7105
epoch: 18 train_acc: 0.

In [419]:
embeddings = [] 
labels = [] 
color_list = ['red', 'blue']

for dt in train_loader:
    _ , z = model(dt.x, dt.edge_index, dt.batch)
    for emb in z:
        # print(emb.detach().numpy())|
        embeddings.append(emb.detach().numpy())
    labels += [color_list[y-1] for y in dt.y]

for dt in test_loader:
    _ , z = model(dt.x, dt.edge_index, dt.batch)
    for emb in z:
        # print(emb.detach().numpy())|
        embeddings.append(emb.detach().numpy())
    labels += [color_list[y-1] for y in dt.y]

In [420]:
print(len(embeddings))
print(len(labels))

188
188


In [422]:
embeddings = np.array(embeddings)

In [433]:
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(embeddings)
tsne.kl_divergence_

0.1634192019701004

In [434]:
import plotly.express as px

In [437]:
fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], color=labels)
fig.update_layout(
    title="GCN MUTAG Dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()