In [1]:
from __future__ import division
from __future__ import print_function

import time
import torch
import numpy as np
from numpy import argmax
import torch.nn.functional as F
from pygcn.gcnio.data import dataio
from pygcn.gcnio.util import utils
from pygcn.gcn2 import GCN
import scipy.sparse
import json
from sklearn.preprocessing import StandardScaler
import glog as log
import torch.optim as optim
print(torch.__version__)
#from torch.profiler import profile, record_function, ProfilerActivity

1.8.1+cu111


In [2]:
# Probe for CUDA once; the flag drives both the report and the device choice.
cuda = torch.cuda.is_available()
print('cuda: %s' % cuda)
if cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# To force CPU execution regardless of availability, override with:
#device = 'cpu'

cuda: True


In [3]:
def load_data(prefix, normalize=True):
    """
    Load a graph dataset stored under ``./<prefix>/``.

    INPUT:
        prefix      directory, relative to the current working directory, holding
                    adj_full.npz, adj_train.npz, role.json, feats.npy and
                    class_map.json
        normalize   if True (default), standardize feats with a StandardScaler
                    fitted only on the rows that have at least one nonzero entry
                    in adj_train; if False, feats is returned exactly as stored
    OUTPUT:
        adj_full    scipy sparse matrix loaded from adj_full.npz
        adj_train   scipy sparse matrix loaded from adj_train.npz
        feats       ndarray of features (standardized when normalize is True)
        class_map   dictionary {int vertex_id: label as stored in class_map.json}
        role        object parsed from role.json
    RAISES:
        AssertionError if class_map and feats disagree on the number of vertices
    """
    adj_full = scipy.sparse.load_npz('./{}/adj_full.npz'.format(prefix))
    adj_train = scipy.sparse.load_npz('./{}/adj_train.npz'.format(prefix))
    # Context managers close the JSON files even if parsing fails.
    with open('./{}/role.json'.format(prefix)) as role_file:
        role = json.load(role_file)
    feats = np.load('./{}/feats.npy'.format(prefix))
    with open('./{}/class_map.json'.format(prefix)) as class_map_file:
        class_map = json.load(class_map_file)
    # JSON object keys are always strings; convert them back to integer ids.
    class_map = {int(k): v for k, v in class_map.items()}
    assert len(class_map) == feats.shape[0]
    if normalize:
        # ---- normalize feats ----
        # Fit the scaler on training rows only, then apply it to every row.
        train_nodes = np.unique(adj_train.nonzero()[0])
        scaler = StandardScaler()
        scaler.fit(feats[train_nodes])
        feats = scaler.transform(feats)
        # -------------------------
    return adj_full, adj_train, feats, class_map, role


def process_graph_data(adj_full, adj_train, feats, class_map, role, name):
    """
    Convert class_map into a per-vertex label array; all other inputs pass through.

    INPUT:
        adj_full    adjacency matrix of the full graph; only shape[0] (|V|) is read
        adj_train   returned unchanged
        feats       returned unchanged
        class_map   dictionary {vertex_id: class_id}, or {vertex_id: list}; for
                    list labels the index of the largest entry becomes the class
        role        returned unchanged
        name        dataset name; for 'flickr', 'reddit', 'ppi', 'amazon' and
                    'yelp' the label array is cast to int and squeezed
    OUTPUT:
        adj_full    unchanged
        adj_train   unchanged
        feats       unchanged
        class_arr   labels indexed by vertex_id: float array of shape |V|x1, or
                    the int-cast, squeezed array for the dataset names above
        role        unchanged
    """
    num_vertices = adj_full.shape[0]
    class_arr = np.zeros((num_vertices, 1))
    # Inspect one label to decide the encoding; an empty map leaves all zeros.
    first_label = next(iter(class_map.values()), None)
    if isinstance(first_label, list):
        print("labels are list")
        for vertex_id, label_vec in class_map.items():
            # Index by vertex id (not by iteration position) so labels stay
            # aligned with feats/adjacency rows whatever the dict order is.
            class_arr[int(vertex_id)] = np.argmax(label_vec)
    else:
        for vertex_id, class_id in class_map.items():
            class_arr[int(vertex_id)] = class_id
    if name in ('flickr', 'reddit', 'ppi', 'amazon', 'yelp'):
        class_arr = np.squeeze(class_arr.astype(int))

    return adj_full, adj_train, feats, class_arr, role

In [4]:
# Keep this seed identical to the one used when generating the attacks so that
# the data splits match.
seed = 15
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)


def _log_type_and_shape(*arrays):
    """Log the type and then the shape of each argument, in the order given."""
    for arr in arrays:
        log.info(type(arr))
        log.info(arr.shape)


# load original dataset (to get clean features and labels)
SMALL = True
if SMALL:
    # Small benchmark handled by the pygcn dataio loader.
    dataset = 'polblogs'
    data = dataio.Dataset(root='/tmp/', name=dataset)
    adj, features, labels = data.adj, data.features, data.labels
    idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test

    _log_type_and_shape(adj, features, labels, idx_train, idx_val, idx_test)
else:
    # Large dataset read from the on-disk files via load_data.
    data_prefix = './dataset/amazon'
    temp_data = load_data(data_prefix)
    data_list = data_prefix.split('/')
    print(data_list[-1])
    # The last path component is the dataset name process_graph_data expects.
    train_data = process_graph_data(*temp_data, data_list[-1])
    adj, adj_train, features, labels, role = train_data
    features = scipy.sparse.csr_matrix(features)
    idx_train = np.array(role['tr'])
    idx_val = np.array(role['va'])
    idx_test = np.array(role['te'])

    _log_type_and_shape(adj, adj_train, features, labels)
    log.info(type(labels[0]))
    _log_type_and_shape(idx_train, idx_val, idx_test)

# Reference sizes per dataset (presumably (num_nodes, num_label_columns) --
# TODO confirm). Kept as comments so the cell does not echo a string as output.
#   flickr: (89250,1)
#   ppi:    (14755,121)
#   reddit: (232965,1)
#   amazon: (1569960,107)
#   yelp:   (716847,100)

I0407 20:59:05.436220 24747 <ipython-input-4-32039c1c17c2>:16] <class 'scipy.sparse.csr.csr_matrix'>
I0407 20:59:05.436852 24747 <ipython-input-4-32039c1c17c2>:17] (1222, 1222)
I0407 20:59:05.437446 24747 <ipython-input-4-32039c1c17c2>:18] <class 'scipy.sparse.csr.csr_matrix'>
I0407 20:59:05.438158 24747 <ipython-input-4-32039c1c17c2>:19] (1222, 1490)
I0407 20:59:05.438760 24747 <ipython-input-4-32039c1c17c2>:20] <class 'numpy.ndarray'>
I0407 20:59:05.439442 24747 <ipython-input-4-32039c1c17c2>:21] (1222,)
I0407 20:59:05.440007 24747 <ipython-input-4-32039c1c17c2>:22] <class 'numpy.ndarray'>
I0407 20:59:05.440676 24747 <ipython-input-4-32039c1c17c2>:23] (121,)
I0407 20:59:05.441258 24747 <ipython-input-4-32039c1c17c2>:24] <class 'numpy.ndarray'>
I0407 20:59:05.441856 24747 <ipython-input-4-32039c1c17c2>:25] (123,)
I0407 20:59:05.442409 24747 <ipython-input-4-32039c1c17c2>:26] <class 'numpy.ndarray'>
I0407 20:59:05.442972 24747 <ipython-input-4-32039c1c17c2>:27] (978,)


Loading polblogs dataset...
Selecting 1 largest connected components


'\nflickr: (89250,1)\nppi:    (14755,121)\nreddit: (232965,1)\namazon: (1569960,107)\nyelp:   (716847,100)\n'

In [5]:
# Peek at a few labels and the largest class id before sizing the classifier.
print(labels[0], labels[1], labels[8], labels.max(), sep='\n')

# Input width comes from the feature matrix; output width is max label id + 1.
model = GCN(
    nfeat=features.shape[1],
    nhid=32,
    nclass=labels.max() + 1,
    device=device,
)

optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

0
0
0
1


In [6]:
model = model.to(device)

# Run-mode switches: TRAIN fits the model on idx_train; TEST restores the saved
# checkpoint and evaluates it on idx_test.
TRAIN = 1
TEST = 0

if TRAIN:
    # Optional profiling of the training run (also needs the torch.profiler
    # import that is commented out in the import cell):
    #with profile(activities=[
    #    ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    #    with record_function("model_fit"):
    #        model.fit(features, adj, labels, idx_train, train_iters=200, verbose=True, name='ppi')
    #        torch.save(model.state_dict(),'./model/gcn.pt')
    #print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    #prof.export_chrome_trace("trace.json")
    model.fit(features, adj, labels, idx_train, train_iters=200, verbose=True, name='polblogs')
    # Uncomment to persist the weights that the TEST branch loads:
    #torch.save(model.state_dict(),'./model/gcn.pt')

if TEST:
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU can still be restored when only the CPU is available.
    model.load_state_dict(torch.load('./model/gcn.pt', map_location=device))
    model.eval()
    model.test(idx_test)

Transform data to GPU device ...
_train_without_val
Epoch 0, training loss: 0.7483662962913513
Epoch 10, training loss: 0.5556492805480957
Epoch 20, training loss: 0.3468811511993408
Epoch 30, training loss: 0.1872067153453827
Epoch 40, training loss: 0.12925803661346436
Epoch 50, training loss: 0.09555499255657196
Epoch 60, training loss: 0.09045597165822983
Epoch 70, training loss: 0.07575611025094986
Epoch 80, training loss: 0.06230434030294418
Epoch 90, training loss: 0.06129242479801178
Epoch 100, training loss: 0.05861958861351013
Epoch 110, training loss: 0.049979642033576965
Epoch 120, training loss: 0.05113193392753601
Epoch 130, training loss: 0.04146952927112579
Epoch 140, training loss: 0.05045653507113457
Epoch 150, training loss: 0.043416596949100494
Epoch 160, training loss: 0.03731120750308037
Epoch 170, training loss: 0.04108182340860367
Epoch 180, training loss: 0.04026451334357262
Epoch 190, training loss: 0.03945418819785118
Forward time: 0.8350s
Layer1 time: 0.2672