In [1]:
from collections import defaultdict

import seaborn as sns
from tqdm import tqdm_notebook
import numpy as np

import torch
import torch.nn.functional as F

from data import PPRDataset, HeatDataset, set_train_val_test_split
from models import GCNConvNet
from utils import train_seeds, test_seeds

This notebook provides an example of enhancing GCN with GDC. It does not represent a perfect reproduction of the results in the paper.

### Load dataset

In [2]:
cora_ppr = PPRDataset(
    name='Cora',
    use_lcc=True,
    alpha=0.05,
    t_matrix='sym',
    self_loops=1.0,
    k=128,
    eps=None,
    sparse_normalization='col_one'
)
cora_heat = HeatDataset(
    name='Cora',
    use_lcc=True,
    t=5,
    t_matrix='sym',
    self_loops=1.0,
    k=None,
    eps=128,
    sparse_normalization='sym_one'
)

name=ppr_Cora_use_lcc=True_alpha=0.05_t_matrix=sym_self_loops=1.00_k=128_eps=None_sparse_normalization=col_one;num_nodes=2485;num_edges=318080
name=heat_Cora_use_lcc=True_t=5.00_t_matrix=sym_self_loops=1.00_k=None_eps=128.00000000_sparse_normalization=sym_one;num_nodes=2485;num_edges=318080


In [3]:
dataset = cora_ppr
dataset.data.to('cuda')

Data(edge_attr=[318080], edge_index=[2, 318080], test_mask=[2485], train_mask=[2485], val_mask=[2485], x=[2485, 1433], y=[2485])

### Create model

In [4]:
model = GCNConvNet(
    dataset,
    hidden=[64],
    dropout=0.5
)
model.to('cuda')

GCNConvNet(
  (layers): ModuleList(
    (0): GCNConv(1433, 64)
    (1): GCNConv(64, 7)
  )
  (dropout): Dropout(p=0.5)
  (act_fn): ReLU()
)


GCNConvNet(
  (layers): ModuleList(
    (0): GCNConv(1433, 64)
    (1): GCNConv(64, 7)
  )
  (dropout): Dropout(p=0.5)
  (act_fn): ReLU()
)

### Train model

In [5]:
test = False

In [6]:
seeds = test_seeds if test else train_seeds
patience = 100
max_epochs = 10000
optimizer = torch.optim.Adam(
    [
        {'params': model.non_reg_params, 'weight_decay': 0},
        {'params': model.reg_params, 'weight_decay': 0.10}
    ],
    lr=0.01
)

In [7]:
def train_semi_sup(model, optimizer, data):
    model.train()
    optimizer.zero_grad()
    logits = model(data)
    loss = F.nll_loss(
        logits[data.train_mask],
        data.y[data.train_mask]
    )
    loss.backward()
    optimizer.step()
    return loss.item()

In [8]:
def eval_semi_sup(model, data, test, num_classes):
    model.eval()
    with torch.no_grad():
        logits = model(data)
    eval_dict = {}
    keys = ['train', 'test'] if test else ['train', 'val']
    for key in keys:
        mask = data[f'{key}_mask']
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        eval_dict['{}_acc'.format(key)] = acc
    return eval_dict

In [9]:
best_dict = defaultdict(list)

for seed in tqdm_notebook(seeds):
    set_train_val_test_split(
        seed=seed,
        data=dataset.data
    )
    tmp_dict = {'val_acc': 0}
    patience_cnt = 0
    for epoch in range(1, max_epochs + 1):
        if patience_cnt == patience:
            break
        train_loss = train_semi_sup(
            model,
            optimizer,
            dataset.data
        )
        eval_dict = eval_semi_sup(
            model,
            dataset.data,
            False,
            dataset.num_classes
        )
        if eval_dict['val_acc'] <= tmp_dict['val_acc']:
            patience_cnt += 1
        else:
            patience_cnt = 0
            tmp_dict['epoch'] = epoch
            tmp_dict.update(eval_dict)
            if test:
                res_dict = eval_semi_sup(
                    model,
                    dataset.data,
                    True,
                    dataset.num_classes
                )
                res_dict['epoch'] = epoch
            else:
                res_dict = tmp_dict
    for k, v in res_dict.items():
        best_dict[k].append(v)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [10]:
mean_acc = np.mean(best_dict['val_acc'])

boots_series = sns.algorithms.bootstrap(
    best_dict['val_acc'],
    func=np.mean,
    n_boot=1000
)
best_dict['acc_ci'] = sns.utils.ci(boots_series, 95)
uncertainty = np.max(np.abs(best_dict['acc_ci'] - mean_acc))

In [11]:
print(f"Mean accuracy: {100 * mean_acc:.2f} +- {100 * uncertainty:.2f}%")

Mean accuracy: 83.09 +- 0.24%
