In [2]:
# Basic GNN coding from stanford 2019 hands-on tutorial

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms
import sklearn.metrics as metrics


In [None]:
# dependency for torch-geometric
# !pip install --verbose --no-cache-dir torch-scatter
# !pip install --verbose --no-cache-dir torch-sparse
# !pip install --verbose --no-cache-dir torch-cluster

In [4]:
import torch_geometric.nn as pyg_nn # graph neural net layers.
import torch_geometric.utils as pyg_utils 

import time
from datetime import datetime

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid
from torch_geometric.data import DataLoader # only dataloader not data object

import torch_geometric.transforms as T

from tensorboardX import SummaryWriter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


# Part 1 - Codes from the website

In [5]:
class GNNStack(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, task='node'):
        super(GNNStack, self).__init__()
        self.task = task
        self.convs = nn.ModuleList() # sw: need to use ModuleList so that the parameters can be recognized.
        self.convs.append(self.build_conv_model(input_dim, hidden_dim))
        self.lns = nn.ModuleList()
        self.lns.append(nn.LayerNorm(hidden_dim))
        self.lns.append(nn.LayerNorm(hidden_dim))
        for l in range(2):
            self.convs.append(self.build_conv_model(hidden_dim, hidden_dim))

        # post-message-passing
        # sw: this is added after all the conv layers.
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.Dropout(0.25), 
            nn.Linear(hidden_dim, output_dim))
        if not (self.task == 'node' or self.task == 'graph'):
            # wow. only node or graph tasks
            raise RuntimeError('Unknown task.')

        self.dropout = 0.25
        self.num_layers = 3 # sw: three layers only...

    def build_conv_model(self, input_dim, hidden_dim):
        # refer to pytorch geometric nn module for different implementation of GNNs.
        if self.task == 'node':
            return pyg_nn.GCNConv(input_dim, hidden_dim)
            # Q: What is exactly GCNConv module? A: It is the 2017 Kipf paper.
            # Q: input and hidden? A: input_dim is the same as number of nodes.
            
        else:
            return pyg_nn.GINConv(nn.Sequential(nn.Linear(input_dim, hidden_dim),
                                  nn.ReLU(), nn.Linear(hidden_dim, hidden_dim)))

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch # data provides: x, edge_index, and batch...?
        if data.num_node_features == 0:
            x = torch.ones(data.num_nodes, 1)

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            emb = x
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training) 
            # Q: What are the self.dropout and self.training?
            if not i == self.num_layers - 1:
                x = self.lns[i](x)

        if self.task == 'graph':
            x = pyg_nn.global_mean_pool(x, batch)

        x = self.post_mp(x)

        return emb, F.log_softmax(x, dim=1) 
        # sw: it is log softmax. Second, the returning of embedding might not be necessary in my coding. 

    def loss(self, pred, label):
        return F.nll_loss(pred, label)
    

In [6]:
def train(dataset, task, writer):
    if task == 'graph':
        data_size = len(dataset)
        loader = DataLoader(dataset[:int(data_size * 0.8)], batch_size=64, shuffle=True)
        test_loader = DataLoader(dataset[int(data_size * 0.8):], batch_size=64, shuffle=True)
        # sw: I think, dataloader is designed for the case of many small graphs (not a large single graph.)? No?
    else:
        test_loader = loader = DataLoader(dataset, batch_size=64, shuffle=True) 
        # sw: See? This dataloader function can also incorporate a single graph. 
        # sw: in this case, the batch_size seems to not matter at all. 

    # build model
    model = GNNStack(max(dataset.num_node_features, 1), 32, dataset.num_classes, task=task) 
    # input_dim, hidden_dim, output_dim, task='node'
    # input_dim: number of nodes! 
    # hidden_dim: 32. 
    # Q: What is the hidden dim? Where can we see it from the formula? A: the hidden layers' width
    # output_dim: prediction class.

    opt = optim.Adam(model.parameters(), lr=0.01)
    
    # train
    for epoch in range(200):
        total_loss = 0
        model.train()
        # sw: wait. model.train()? A: set the model in a training mode. 
        for batch in loader:
            # Q: What does the batch look like? This simple "batch" satisfies so many things. 
            # A: Yeah. Check the data preparation webpage.
            
            opt.zero_grad()
            embedding, pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward() # sw: compute the gradients.
            opt.step() # sw: Q - what is this? Oh. desent step.
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        writer.add_scalar("loss", total_loss, epoch) # sw: cool. Use writer for a real-time visualization.

        if epoch % 10 == 0:
            test_acc = test(test_loader, model)
            print("Epoch {}. Loss: {:.4f}. Test accuracy: {:.4f}".format(
                epoch, total_loss, test_acc))
            writer.add_scalar("test accuracy", test_acc, epoch) # sw: cool again.

    return model


In [7]:
def test(loader, model, is_validation=False):
    model.eval()

    correct = 0
    for data in loader:
        with torch.no_grad():
            emb, pred = model(data)
            pred = pred.argmax(dim=1)
            label = data.y

        if model.task == 'node':
            mask = data.val_mask if is_validation else data.test_mask
            # node classification: only evaluate on nodes in test set
            pred = pred[mask]
            label = data.y[mask]
            
        correct += pred.eq(label).sum().item() # sw: cool method - pred.eq(label).
    
    if model.task == 'graph':
        total = len(loader.dataset) 
    else:
        total = 0
        for data in loader.dataset:
            total += torch.sum(data.test_mask).item() # sw: count the total number of items.
    return correct / total


In [8]:
# 
writer = SummaryWriter("./log/" + datetime.now().strftime("%Y%m%d-%H%M%S"))
dataset = Planetoid(root='/tmp/cora', name='cora')
task = 'node'

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [9]:
model = train(dataset, task, writer)

Epoch 0. Loss: 2.0180. Test accuracy: 0.2880
Epoch 10. Loss: 0.4844. Test accuracy: 0.7680
Epoch 20. Loss: 0.0538. Test accuracy: 0.7650
Epoch 30. Loss: 0.0550. Test accuracy: 0.7380
Epoch 40. Loss: 0.0275. Test accuracy: 0.7150
Epoch 50. Loss: 0.0374. Test accuracy: 0.7860
Epoch 60. Loss: 0.0026. Test accuracy: 0.7540
Epoch 70. Loss: 0.0231. Test accuracy: 0.7530
Epoch 80. Loss: 0.0269. Test accuracy: 0.7730
Epoch 90. Loss: 0.0134. Test accuracy: 0.7700
Epoch 100. Loss: 0.0131. Test accuracy: 0.7720
Epoch 110. Loss: 0.0137. Test accuracy: 0.7600
Epoch 120. Loss: 0.0186. Test accuracy: 0.7590
Epoch 130. Loss: 0.0002. Test accuracy: 0.7490
Epoch 140. Loss: 0.0053. Test accuracy: 0.7730
Epoch 150. Loss: 0.0074. Test accuracy: 0.7770
Epoch 160. Loss: 0.0016. Test accuracy: 0.7610
Epoch 170. Loss: 0.0027. Test accuracy: 0.7630
Epoch 180. Loss: 0.0016. Test accuracy: 0.7620
Epoch 190. Loss: 0.0002. Test accuracy: 0.7600


In [10]:
# dataset[0].train_mask.sum().item()

140

In [14]:
test_loader = loader = DataLoader(dataset, batch_size=64, shuffle=True) 

In [17]:
for batch in loader:
    b = batch
    break
    
b.num_graphs

1

In [13]:
dataset[:int(data_size * 0.8)]

cora()

In [19]:
len(loader.dataset)

1

Epoch 0. Loss: 1.9562. Test accuracy: 0.2880
Epoch 10. Loss: 0.3495. Test accuracy: 0.7680
Epoch 20. Loss: 0.0306. Test accuracy: 0.7560
Epoch 30. Loss: 0.0075. Test accuracy: 0.7880
Epoch 40. Loss: 0.0061. Test accuracy: 0.7690
Epoch 50. Loss: 0.1484. Test accuracy: 0.7620
Epoch 60. Loss: 0.0824. Test accuracy: 0.7440
Epoch 70. Loss: 0.0113. Test accuracy: 0.7770
Epoch 80. Loss: 0.0377. Test accuracy: 0.7630
Epoch 90. Loss: 0.0069. Test accuracy: 0.7750
Epoch 100. Loss: 0.0034. Test accuracy: 0.7660
Epoch 110. Loss: 0.0013. Test accuracy: 0.7300
Epoch 120. Loss: 0.0036. Test accuracy: 0.7300
Epoch 130. Loss: 0.0008. Test accuracy: 0.7350
Epoch 140. Loss: 0.0089. Test accuracy: 0.7570
Epoch 150. Loss: 0.0031. Test accuracy: 0.7630
Epoch 160. Loss: 0.0060. Test accuracy: 0.7550
Epoch 170. Loss: 0.0026. Test accuracy: 0.7690
Epoch 180. Loss: 0.0046. Test accuracy: 0.7740
Epoch 190. Loss: 0.0042. Test accuracy: 0.7720


In [21]:
print(batch)
print(batch.batch)
print(batch.edge_index)
print(batch.test_mask)
print(batch.train_mask)
print(batch.val_mask)
print(batch.x)
print(batch.y)


Batch(batch=[2708], edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])
tensor([False, False, False,  ...,  True,  True,  True])
tensor([ True,  True,  True,  ..., False, False, False])
tensor([False, False, False,  ..., False, False, False])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([3, 4, 4,  ..., 3, 3, 3])


In [25]:
data_size = len(dataset)
# loader = DataLoader(dataset[:int(data_size * 0.8)], batch_size=64, shuffle=True)


1

In [34]:
print(np.sum(dataset[0].train_mask.numpy()))
print(np.sum(dataset[0].val_mask.numpy()))
print(np.sum(dataset[0].test_mask.numpy()))

140
500
1000


In [35]:
dataset[0].edge_index

tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])

In [37]:
dataset[0].x.numpy().sum(axis = 1)

array([ 9., 23., 19., ..., 18., 14., 13.], dtype=float32)

In [13]:
# visualizing node embeddings (copied from the website. but it does not work yet. Supposed to be TSNE embedding.)
color_list = ["red", "orange", "green", "blue", "purple", "brown"]

loader = DataLoader(dataset, batch_size=64, shuffle=True)
embs = []
colors = []
for batch in loader:
    emb, pred = model(batch)
    embs.append(emb)
    colors += [color_list[y] for y in batch.y]
# embs = torch.cat(embs, dim=0)

# xs, ys = zip(*TSNE().fit_transform(embs.detach().numpy()))
# plt.scatter(xs, ys, color=colors)


IndexError: list index out of range

In [15]:
model(batch)

(tensor([[ 3.0167, -1.4585, -1.7189,  ...,  3.6860,  1.5209, -2.5636],
         [ 1.0228,  0.3428, -2.9589,  ...,  1.4222,  4.7391,  4.5572],
         [ 1.5355,  0.3136, -2.2060,  ...,  1.4762,  4.0850,  2.4967],
         ...,
         [-2.6399,  1.1844,  0.8522,  ..., -0.7638, -1.0413, -1.2856],
         [ 5.1368, -1.9537, -2.1222,  ...,  6.0914,  2.9457, -3.4103],
         [ 4.5058, -1.8394, -1.9021,  ...,  5.0542,  2.6885, -2.9412]],
        grad_fn=<AddBackward0>),
 tensor([[-2.5246e+01, -1.5542e+01, -2.4860e+01,  ..., -1.6116e+01,
          -3.3639e+01, -2.7084e+01],
         [-2.7873e+01, -3.3524e+01, -2.9612e+01,  ...,  0.0000e+00,
          -4.3021e+01, -3.3519e+01],
         [-3.5554e+01, -4.5612e+01, -3.3197e+01,  ...,  0.0000e+00,
          -3.8650e+01, -3.3558e+01],
         ...,
         [-3.9406e-02, -3.2844e+00, -1.7830e+01,  ..., -1.6356e+01,
          -6.7437e+00, -1.8519e+01],
         [-5.4494e+01, -3.5872e+01, -3.3857e+01,  ..., -3.0090e+01,
          -6.4676e+01, -