# Setup

In [24]:
import torch
import argparse
import numpy as np

parser = argparse.ArgumentParser()

parser.add_argument('--seed', type=int, default=123)
parser.add_argument('--subgraph_size', type=int, default=64)
parser.add_argument('--num_subgraphs', type=int, default=64)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--file', type=str, default="./out.csv")
parser.add_argument('--dataset', type=str, default='cora', choices=[
    'cora', 'citeseer', 'polblogs', 'pubmed'
])
parser.add_argument('--attack', type=str, default='pgd', choices=[
    'meta', 'pgd', 'nettack'
])
parser.add_argument('--ptb_rate', type=float, default=0.25)

# device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
args = parser.parse_args("")

device = "cuda:1" if torch.cuda.is_available() else "cpu"
# device = 'cpu'

np.random.seed(args.seed)
torch.manual_seed(args.seed)

<torch._C.Generator at 0x7fc4d8278cd0>

# Attack

In [32]:
from deeprobust.graph.data import Dataset, PrePtbDataset, Dpr2Pyg
from deeprobust.graph.utils import preprocess
from numpy import ndarray

clean_dataset = Dataset(root='./tmp/', name=args.dataset, seed=args.seed)
adj, feat, labels = clean_dataset.adj, clean_dataset.features, clean_dataset.labels
adj, feat, labels = preprocess(adj, feat, labels, preprocess_adj=False) # conver to tensor
idx_train, idx_val, idx_test = clean_dataset.idx_train, clean_dataset.idx_val, clean_dataset.idx_test
# adj = torch.tensor(clean_dataset.adj.toarray(), dtype=torch.float).to(device)
# feat = torch.tensor(clean_dataset.features.toarray(), dtype=torch.float).to(device)
# label = torch.tensor(clean_dataset.labels, dtype=torch.long).to(device)

train_mask = torch.zeros([adj.shape[0]], dtype=torch.bool)  
train_mask[idx_train] = 1
test_mask = torch.zeros([adj.shape[0]], dtype=torch.bool)  
test_mask[idx_test] = 1

num_samples = 2560
subgraph_size = 64


Loading cora dataset...
Selecting 1 largest connected components


In [47]:
from deeprobust.graph.defense import GCN
from deeprobust.graph.global_attack import PGDAttack

# ptb_data = PrePtbDataset(root='./tmp/', name=args.dataset,
#                                         attack_method='meta',
#                                         ptb_rate=0.25) # here ptb_rate means number of perturbation per nodes

# ptb_adj = torch.tensor(ptb_data.adj.toarray(), dtype=torch.float).to(device)

victim_model = GCN(nfeat=feat.shape[1], nclass=labels.max().item()+1,
                    nhid=16, dropout=0.5, weight_decay=5e-4, device=device).to(device)
victim_model.fit(feat, adj, labels, idx_train)
model = PGDAttack(model=victim_model, nnodes=adj.shape[0], loss_type='CE', device=device).to(device)
model.attack(feat, adj, labels, idx_train, n_perturbations=adj.triu().sum() * 0.25)

100%|██████████| 200/200 [00:09<00:00, 21.71it/s]


In [48]:
gcn = GCN(nfeat=feat.shape[1], nclass=labels.max().item()+1,
                nhid=16, device=device).to(device)
gcn.fit(feat, model.modified_adj, labels, idx_train, idx_val, patience=30)
print('=== testing GCN on purified graph ===')
gcn.test(idx_test)

=== testing GCN on purified graph ===
Test set results: loss= 0.9869 accuracy= 0.6771


0.6770623742454729

# Eval

In [116]:
from torch_geometric.nn import Sequential, DenseGCNConv
from torch.nn import Linear, ReLU
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader

def get_gcn(in_size, out_size, hid: int=64):
    return Sequential('x, adj', [
        (DenseGCNConv(in_size, hid), 'x, adj -> x'),
        ReLU(inplace=True),
        (DenseGCNConv(hid, hid), 'x, adj -> x'),
        ReLU(inplace=True),
        Linear(hid, out_size),
        # Linear(hid, int(label.max()) + 1),
    ]).to(device)

def train(model, dataloader: DataLoader, epochs: int):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), 1e-4)
    t = tqdm(range(epochs), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
    t.set_description("Model training")
    loss = torch.tensor(0)
    for _ in t:
        for feats, adjs, labels, train_masks in dataloader:
            pred = model(feats, adjs)
            mask = train_masks.flatten()
            loss = F.cross_entropy(pred.flatten(end_dim=1)[mask], labels.flatten(end_dim=1)[mask])
            loss.backward()
            optimizer.step()
            t.set_postfix({"loss": round(loss.item(), 2)})

def train_adj(model, feat, adj, label, train_mask, epochs: int):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), 1e-4)
    t = tqdm(range(epochs), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
    t.set_description("Model training")
    loss = torch.tensor(0)
    for _ in t:
        pred = model(feat, adj)
        loss = F.cross_entropy(pred.squeeze()[train_mask], label[train_mask])
        loss.backward()
        optimizer.step()
        t.set_postfix({"loss": round(loss.item(), 2)})

def eval(model, test_mask, feat, adj, label, text=""):
    model.eval()
    pred = model(feat, adj)
    acc = ((pred.argmax(dim=2).squeeze() == label)[test_mask]).float().mean().item()
    print("")
    print(text + f"Accuracy: {acc: 0.2%}")

In [155]:
baseline_gcn = get_gcn(feat.shape[1], int(label.max()) + 1)
train_adj(baseline_gcn, feat, adj, label, train_mask, 1000)

Model training: 100%|██████████| 1000/1000 [00:02<00:00, 409.58it/s, loss=0]  


Accuracy:  82.49%
123





In [156]:
eval(baseline_gcn, test_mask, feat, adj, label)


Accuracy:  82.49%


In [151]:
ptb_gcn = get_gcn(feat.shape[1], int(label.max()) + 1)
train_adj(ptb_gcn, feat, ptb_adj, label, train_mask, 1000)

Model training: 100%|██████████| 1000/1000 [00:02<00:00, 418.76it/s, loss=0.45]


In [152]:
eval(ptb_gcn, test_mask, feat, adj, label)


Accuracy:  34.31%


In [157]:
import random

def get_neighbor_subgraphs(adj: torch.Tensor, size: int, n: int) -> torch.Tensor:
    """
    Returns a tensor ~ `(n * size)` of node indices for `n` supgraphs of size `size`
    """
    res = torch.zeros([n, size], dtype=torch.long)

    for i in range(n):
        out = []
        stack = [random.randint(0, adj.shape[0] - 1)]
        while len(out) < size:
            if len(stack) == 0:
                stack.append(random.randint(0, adj.shape[0] - 1))
            curNode = stack.pop()
            if curNode not in out:
                out.append(curNode)
                children = adj[curNode].nonzero().t()[0].cpu().tolist()
                stack = children + stack
        res[i] = torch.tensor(out)

    return res

subgraph_ids = get_neighbor_subgraphs(torch.tensor(ptb_data.adj.toarray()), 64, num_samples)

temp = adj[subgraph_ids]
subgraph_feats = feat[subgraph_ids]
subgraph_adjs = torch.zeros(num_samples, subgraph_size, subgraph_size).to(device)
# subgraph_adjs_pgd = torch.zeros(num_samples, subgraph_size, subgraph_size).to(device)
subgraph_labels = label[subgraph_ids]
subgraph_train_masks = train_mask[subgraph_ids]
for i in range(num_samples):
    subgraph_adjs[i] = temp[i][:,subgraph_ids[i]]

temp = ptb_adj[subgraph_ids]
subgraph_adjs_pgd = torch.zeros(num_samples, subgraph_size, subgraph_size).to(device)
for i in range(num_samples):
    subgraph_adjs_pgd[i] = temp[i][:,subgraph_ids[i]]

In [153]:
clean_dataloader = DataLoader(
    TensorDataset(subgraph_feats, subgraph_adjs, subgraph_labels, subgraph_train_masks), 
    batch_size=256, shuffle=True)

clean_subgraphs_gcn = get_gcn(feat.shape[1], int(label.max()) + 1)
train(clean_subgraphs_gcn, clean_dataloader, 100)

Model training: 100%|██████████| 100/100 [00:04<00:00, 23.12it/s, loss=0]  


Accuracy:  81.99%





In [160]:
eval(clean_subgraphs_gcn, test_mask, feat, adj, label)


Accuracy:  81.99%


In [173]:
pgd_dataloader = DataLoader(
    TensorDataset(subgraph_feats, subgraph_adjs_pgd, subgraph_labels, subgraph_train_masks),
    batch_size=1024, shuffle=True)

ptb_gcn_samples = get_gcn(feat.shape[1], int(label.max()) + 1)
train(ptb_gcn_samples, pgd_dataloader, 200)

Model training: 100%|██████████| 200/200 [00:24<00:00,  8.14it/s, loss=0.23]


In [175]:
eval(ptb_gcn_samples, test_mask, feat, adj, label)


Accuracy:  64.94%
