In [1]:
from collections import defaultdict

import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

from data import get_dataset, PPRDataset, HeatDataset, set_train_val_test_split
from models import GCNConvNet
from seeds import val_seeds, test_seeds

# GCN with GDC
This notebook shows how to enhance a graph convolutional network (GCN) with graph diffusion convolution (GDC), using the Cora dataset as an example.

## Load dataset and preprocess with GDC

In [2]:
# Plain Cora dataset without GDC preprocessing; it is also the initial
# value of `dataset` until a variant is selected further down.
cora_raw = get_dataset('Cora', use_lcc=True)
dataset = cora_raw

# Settings shared by both diffusion-based variants.
shared_gdc_kwargs = dict(
    name='Cora',
    use_lcc=True,
    t_matrix='sym',
    self_loops=1.0,
)

# GDC with personalized PageRank (PPR) diffusion.
cora_ppr = PPRDataset(
    alpha=0.05,
    k=128,
    eps=None,
    sparse_normalization='col_one',
    **shared_gdc_kwargs
)

# GDC with heat kernel diffusion.
# NOTE(review): `k=None, eps=128` is the mirror image of the PPR setting
# (`k=128, eps=None`); confirm against HeatDataset that 128 is really meant
# as `eps` here and not as `k`.
cora_heat = HeatDataset(
    t=5,
    k=None,
    eps=128,
    sparse_normalization='sym_one',
    **shared_gdc_kwargs
)

Choose one of three variants: no GDC, GDC with diffusion defined by personalized PageRank (PPR), or GDC with heat kernel diffusion. Each variant comes with its own weight decay strength `reg_lambda`.

In [3]:
# Every available variant together with the weight decay (`reg_lambda`) that
# belongs to it. Keeping both in one table guarantees they are switched
# together instead of relying on (un)commenting matching pairs of lines.
gdc_variants = {
    'none': (cora_raw, 0.06),
    'ppr': (cora_ppr, 0.10),
    'heat': (cora_heat, 0.09),
}

# Select the variant to train on: 'none', 'ppr' or 'heat'.
preprocessing = 'ppr'
dataset, reg_lambda = gdc_variants[preprocessing]

We will use the GPU in this notebook. To run on the CPU instead, set `device = 'cpu'` in the next cell.

In [4]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA instead of failing in this cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The result is not reassigned: later cells keep using `dataset.data`, so
# this relies on `.to` moving the data in place. As the last expression of
# the cell, the returned object is also displayed.
dataset.data.to(device)

Data(edge_attr=[318080], edge_index=[2, 318080], test_mask=[2485], train_mask=[2485], val_mask=[2485], x=[2485, 1433], y=[2485])

## Create GCN model

In [5]:
# Build the GCN for the selected dataset. `hidden` and `dropout` are passed
# straight to GCNConvNet (presumably one hidden layer of width 64 and a
# dropout probability of 0.5 -- confirm in models.py).
model = GCNConvNet(dataset, hidden=[64], dropout=0.5)

## Train model

In [6]:
# False: iterate over `val_seeds` and report validation accuracy.
# True: iterate over `test_seeds`; early stopping still uses validation
# accuracy, but the reported accuracy is measured on the test mask.
test = False

In [7]:
# One training run is performed per seed; which seed list is used depends on
# whether we are producing test or validation results.
if test:
    seeds = test_seeds
else:
    seeds = val_seeds

# Optimisation and early-stopping settings used by the training loop.
lr = 0.01            # learning rate handed to Adam
max_epochs = 10000   # upper bound on epochs per seed
patience = 100       # consecutive epochs without a better val accuracy before stopping

# Forwarded unchanged to set_train_val_test_split for every seed.
num_development = 1500
num_per_class = 20

In [8]:
def train_semi_sup(model, optimizer, data):
    """Perform a single optimisation step on the training mask.

    The model is put into training mode, run on ``data``, and the negative
    log-likelihood of the entries selected by ``data.train_mask`` is
    back-propagated before the optimizer takes one step.

    Since ``F.nll_loss`` is applied directly to the model output, the model
    is assumed to return log-probabilities (e.g. from ``log_softmax``).

    Args:
        model: Callable module taking ``data`` and returning per-entry scores.
        optimizer: Optimizer holding the model's parameters.
        data: Object exposing ``train_mask`` and the labels ``y``.

    Returns:
        The loss of this step (computed before the update) as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    output = model(data)
    train_mask = data.train_mask
    step_loss = F.nll_loss(output[train_mask], data.y[train_mask])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

In [9]:
def eval_semi_sup(model, data, test):
    """Compute accuracies of ``model`` on the masks stored in ``data``.

    The model is switched to evaluation mode and run once without gradient
    tracking. Accuracy is the fraction of entries selected by a mask whose
    highest-scoring class equals the label in ``data.y``.

    Args:
        model: Callable module taking ``data`` and returning per-entry scores.
        data: Object providing ``y`` and, via ``data['<split>_mask']``, the
            masks for the requested splits.
        test: If truthy, evaluate the train and test masks; otherwise the
            train and val masks.

    Returns:
        Dict with ``'train_acc'`` plus either ``'test_acc'`` or ``'val_acc'``.
    """
    model.eval()
    with torch.no_grad():
        scores = model(data)

    splits = ('train', 'test') if test else ('train', 'val')
    accuracies = {}
    for split in splits:
        split_mask = data[f'{split}_mask']
        # max over the class dimension returns (values, indices); the indices
        # are the predicted classes.
        _, predicted = scores[split_mask].max(1)
        num_correct = (predicted == data.y[split_mask]).sum().item()
        accuracies[f'{split}_acc'] = num_correct / split_mask.sum().item()
    return accuracies

In [10]:
# Collects, per metric name, one entry for every seed (the values at the
# epoch with the best validation accuracy of that run).
best_dict = defaultdict(list)

for seed in tqdm(seeds, desc='Seeds'):
    # New split for this seed (presumably written into the masks of
    # dataset.data by set_train_val_test_split -- confirm in data.py).
    set_train_val_test_split(
        seed=seed,
        data=dataset.data,
        num_development=num_development,
        num_per_class=num_per_class
    )

    # Seed torch as well so that weight initialisation and dropout are
    # repeatable for a given seed (some GPU kernels may still be
    # non-deterministic).
    torch.manual_seed(seed)

    model.to(device).reset_parameters()
    optimizer = torch.optim.Adam(
        [
            {'params': model.non_reg_params, 'weight_decay': 0},
            {'params': model.reg_params, 'weight_decay': reg_lambda}
        ],
        lr=lr)

    # Start below every attainable accuracy so the first epoch always counts
    # as an improvement. Together with resetting `res_dict` here, this makes
    # sure a result from a previous seed can never be recorded for this one.
    tmp_dict = {'val_acc': float('-inf')}
    res_dict = {}
    patience_cnt = 0

    # Training loop with early stopping on validation accuracy.
    for epoch in range(1, max_epochs + 1):
        if patience_cnt == patience:
            break
        train_loss = train_semi_sup(model, optimizer, dataset.data)
        eval_dict = eval_semi_sup(model, dataset.data, False)

        if eval_dict['val_acc'] <= tmp_dict['val_acc']:
            patience_cnt += 1
            continue

        # New best validation accuracy: remember it and reset the patience.
        patience_cnt = 0
        tmp_dict['epoch'] = epoch
        tmp_dict.update(eval_dict)
        if test:
            # Report test accuracy at the epoch selected on validation data.
            res_dict = eval_semi_sup(model, dataset.data, True)
            res_dict['epoch'] = epoch
        else:
            res_dict = tmp_dict

    # Strip the split prefix so validation and test runs both store their
    # main metric under 'acc'.
    for key, value in res_dict.items():
        best_dict[key.replace('val_', '').replace('test_', '')].append(value)

HBox(children=(IntProgress(value=0), HTML(value='')))




## Evaluate result

Compute the mean accuracy over all seeds and estimate a 95% confidence interval for it using bootstrapping.

In [11]:
# Mean of the per-seed accuracies collected by the training loop.
mean_acc = np.mean(best_dict['acc'])

# Bootstrap distribution of that mean (1000 resamples), and the 95% interval
# that seaborn derives from it.
boots_series = sns.algorithms.bootstrap(best_dict['acc'], func=np.mean, n_boot=1000)
best_dict['acc_ci'] = sns.utils.ci(boots_series, 95)

# Report a single symmetric number: the larger of the two distances between
# the mean and the interval bounds.
distance_to_bounds = np.abs(best_dict['acc_ci'] - mean_acc)
uncertainty = np.max(distance_to_bounds)

In [12]:
# Show the mean accuracy and its bootstrap uncertainty, both scaled to percent.
print(f"Mean accuracy: {100 * mean_acc:.2f} +- {100 * uncertainty:.2f}%")

Mean accuracy: 83.21 +- 0.22%
