In [1]:
from __future__ import division
from __future__ import print_function

import time
import torch
import numpy as np
from numpy import argmax
import torch.nn.functional as F
from pygcn.gcnio.data import dataio
from pygcn.gcnio.util import utils
from pygcn.gcn import GCN
import scipy.sparse
import json
from sklearn.preprocessing import StandardScaler
import glog as log
import torch.optim as optim


In [2]:
# Probe for GPU support once and reuse the answer for both the banner and
# the device selection.
cuda = torch.cuda.is_available()
print('cuda: %s' % cuda)
device = torch.device("cuda:0") if cuda else torch.device("cpu")
# To force CPU execution regardless of hardware, override `device` here.

cuda: True


In [31]:
def load_data(prefix, normalize=True):
    """
    Load the dataset files stored under ``./<prefix>/``.

    The directory is expected to contain ``adj_full.npz``, ``adj_train.npz``,
    ``role.json``, ``feats.npy`` and ``class_map.json``.

    INPUT:
        prefix      directory of the dataset, relative to the working directory
        normalize   when True (default) the features are standardized with a
                    StandardScaler fitted on the training rows only; when
                    False the features are returned exactly as stored
    OUTPUT:
        adj_full    sparse matrix loaded from adj_full.npz
        adj_train   sparse matrix loaded from adj_train.npz
        feats       feature array (standardized if ``normalize`` is True)
        class_map   dictionary parsed from class_map.json, keys cast to int
        role        parsed contents of role.json
    RAISES:
        AssertionError if class_map and feats disagree on the number of rows.
    """
    adj_full = scipy.sparse.load_npz('./{}/adj_full.npz'.format(prefix))
    adj_train = scipy.sparse.load_npz('./{}/adj_train.npz'.format(prefix))
    # Context managers make sure the JSON file handles are closed promptly.
    with open('./{}/role.json'.format(prefix)) as role_file:
        role = json.load(role_file)
    feats = np.load('./{}/feats.npy'.format(prefix))
    with open('./{}/class_map.json'.format(prefix)) as class_map_file:
        class_map = json.load(class_map_file)
    # JSON object keys are always strings; turn them back into integer ids.
    class_map = {int(k): v for k, v in class_map.items()}
    assert len(class_map) == feats.shape[0]
    if normalize:
        # ---- normalize feats ----
        # Training rows are the rows of adj_train holding at least one nonzero
        # entry. The scaler is fitted on those rows only and then applied to
        # every row, so no statistics are taken from the remaining rows.
        train_nodes = np.unique(adj_train.nonzero()[0])
        scaler = StandardScaler()
        scaler.fit(feats[train_nodes])
        feats = scaler.transform(feats)
        # -------------------------
    return adj_full, adj_train, feats, class_map, role


def process_graph_data(adj_full, adj_train, feats, class_map, role, name):
    """
    Convert the class_map dictionary into a per-vertex label array.

    INPUT:
        adj_full    adjacency matrix of the full graph; only shape[0] (|V|)
                    is read here
        adj_train   adjacency matrix of the training graph, passed through
        feats       feature array, passed through
        class_map   dictionary {vertex_id: class_id} or
                    {vertex_id: list of per-class values}
        role        train/val/test description, passed through
        name        dataset name; for 'flickr', 'reddit', 'ppi', 'amazon' and
                    'yelp' the labels are cast to int and squeezed
    OUTPUT:
        adj_full    unchanged
        adj_train   unchanged
        feats       unchanged
        class_arr   label array indexed by vertex id: squeezed integer array
                    for the dataset names listed above, float array of shape
                    (|V|, 1) otherwise
        role        unchanged
    """
    num_vertices = adj_full.shape[0]
    class_arr = np.zeros((num_vertices, 1))

    # Inspect one value to learn the label format. An empty class_map leaves
    # class_arr as all zeros instead of raising.
    first_label = next(iter(class_map.values()), None)
    labels_are_lists = isinstance(first_label, list)
    if labels_are_lists:
        print("labels are list")

    for vertex_id, label in class_map.items():
        # Always index by vertex id so the result does not depend on the
        # iteration order of the dictionary.
        # NOTE(review): a list label is collapsed to the index of its first
        # maximal entry; for multi-hot labels this discards the other active
        # classes -- confirm this is intended.
        class_arr[vertex_id] = np.argmax(label) if labels_are_lists else label

    if name in ('flickr', 'reddit', 'ppi', 'amazon', 'yelp'):
        class_arr = np.squeeze(class_arr.astype(int))

    return adj_full, adj_train, feats, class_arr, role

In [36]:
# make sure you use the same data splits as you generated attacks
seed = 15
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)


def _log_type_and_shape(*arrays):
    """Log the type and then the shape of each array, in the order given."""
    for arr in arrays:
        log.info(type(arr))
        log.info(arr.shape)


# load original dataset (to get clean features and labels)
SMALL = False
if SMALL:
    # Small benchmark graph fetched through the pygcn dataio helper.
    dataset = 'polblogs'
    data = dataio.Dataset(root='/tmp/', name=dataset)
    adj, features, labels = data.adj, data.features, data.labels
    idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test

    _log_type_and_shape(adj, features, labels, idx_train, idx_val, idx_test)
else:
    # Large dataset read from local files; the last path component doubles as
    # the dataset name that process_graph_data switches on.
    data_prefix = './dataset/amazon'
    temp_data = load_data(data_prefix)
    data_list = data_prefix.split('/')
    print(data_list[-1])
    train_data = process_graph_data(*temp_data, data_list[-1])
    adj, adj_train, features, labels, role = train_data
    features = scipy.sparse.csr_matrix(features)
    idx_train = np.array(role['tr'])
    idx_val = np.array(role['va'])
    idx_test = np.array(role['te'])

    _log_type_and_shape(adj, adj_train, features, labels)
    log.info(type(labels[0]))
    _log_type_and_shape(idx_train, idx_val, idx_test)

# Reference shapes noted by the original author (presumably the raw label
# arrays of each dataset -- not verified here):
#   flickr: (89250,1)
#   ppi:    (14755,121)
#   reddit: (232965,1)
#   amazon: (1569960,107)
#   yelp:   (716847,100)

amazon
labels are list


I0228 22:34:35.626614 24971 <ipython-input-36-ed1d41fec3fb>:39] <class 'scipy.sparse.csr.csr_matrix'>
I0228 22:34:35.627515 24971 <ipython-input-36-ed1d41fec3fb>:40] (1569960, 1569960)
I0228 22:34:35.628129 24971 <ipython-input-36-ed1d41fec3fb>:41] <class 'scipy.sparse.csr.csr_matrix'>
I0228 22:34:35.628807 24971 <ipython-input-36-ed1d41fec3fb>:42] (1569960, 1569960)
I0228 22:34:35.629362 24971 <ipython-input-36-ed1d41fec3fb>:43] <class 'scipy.sparse.csr.csr_matrix'>
I0228 22:34:35.629944 24971 <ipython-input-36-ed1d41fec3fb>:44] (1569960, 200)
I0228 22:34:35.630615 24971 <ipython-input-36-ed1d41fec3fb>:45] <class 'numpy.ndarray'>
I0228 22:34:35.631145 24971 <ipython-input-36-ed1d41fec3fb>:46] (1569960,)
I0228 22:34:35.631695 24971 <ipython-input-36-ed1d41fec3fb>:47] <class 'numpy.int64'>
I0228 22:34:35.632252 24971 <ipython-input-36-ed1d41fec3fb>:48] <class 'numpy.ndarray'>
I0228 22:34:35.632793 24971 <ipython-input-36-ed1d41fec3fb>:49] (1255968,)
I0228 22:34:35.633334 24971 <ipython-

'\nflickr: (89250,1)\nppi:    (14755,121)\nreddit: (232965,1)\namazon: (1569960,107)\nyelp:   (716847,100)\n'

In [37]:
# Peek at a few individual labels and at the largest class id.
for sample_idx in (0, 1, 7):
    print(labels[sample_idx])
print(labels.max())

# Model and optimizer
# Vector labels (2-D array): one output per label column.
# Scalar labels (1-D array): class ids are counted from zero up to the maximum.
labels_are_vectors = len(labels.shape) > 1
nclass = len(labels[0]) if labels_are_vectors else labels.max() + 1
model = GCN(nfeat=features.shape[1], nhid=32, nclass=nclass, device=device)

optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

43
37
53
101


In [38]:
# Sanity check: squeezing a (3, 1) column yields a flat array of shape (3,).
a = np.arange(1, 4).reshape(3, 1)
b = a.squeeze()
print(b)
print(b.shape)

# Sanity check: argmax of a one-hot vector is the position of the 1.
a = np.array([0, 0, 1])
print(np.argmax(a))


[1 2 3]
(3,)
2


In [39]:
# Place the model on the device selected earlier before training.
model = model.to(device)

# Train on the training indices only, without validation-based selection.
# To pick the model using validation, pass idx_val as well, e.g.:
#   model.fit(features, perturbed_adj, labels, idx_train, idx_val, train_iters=200, verbose=True)
fit_options = dict(train_iters=200, verbose=True)
model.fit(features, adj, labels, idx_train, **fit_options)

# Switch to evaluation mode, then score the held-out test indices with the
# model's own test helper; as the last expression its result is displayed.
model.eval()
model.test(idx_test)

Epoch 0, training loss: 4.654569149017334
Epoch 10, training loss: 3.999567985534668
Epoch 20, training loss: 3.535738945007324
Epoch 30, training loss: 3.279048442840576
Epoch 40, training loss: 3.0889415740966797
Epoch 50, training loss: 2.967660903930664
Epoch 60, training loss: 2.90462589263916
Epoch 70, training loss: 2.8706634044647217
Epoch 80, training loss: 2.847438097000122
Epoch 90, training loss: 2.8307056427001953
Epoch 100, training loss: 2.8176076412200928
Epoch 110, training loss: 2.805354356765747
Epoch 120, training loss: 2.7962636947631836
Epoch 130, training loss: 2.7868523597717285
Epoch 140, training loss: 2.778909921646118
Epoch 150, training loss: 2.770613193511963
Epoch 160, training loss: 2.7654590606689453
Epoch 170, training loss: 2.759206533432007
Epoch 180, training loss: 2.7541463375091553
Epoch 190, training loss: 2.7500600814819336
Test set results: loss= 2.6280 accuracy= 0.3918


tensor(0.3918, device='cuda:0', dtype=torch.float64)