In [1]:
from os import listdir
import os
from os.path import join
import numpy as np
import time
import torch
import math
import argparse
import dgl
import dgl.data
from dgl.dataloading import GraphDataLoader
from progress.bar import Bar
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve,auc,accuracy_score
import pickle
import math
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
from torch.utils.data import Dataset
from torch.optim import SGD
from tqdm.notebook import tqdm as tqdm
# import dgl.data.TUDataset as TUDataset
# our package
from Modules.STGST_torch_s2 import STGSTModule
import Modules.graphScattering as np_GST

Using backend: pytorch


In [2]:
class GST_coef_dataset(Dataset):
    """Train/test view over precomputed scattering coefficients and labels.

    By default the samples are split by position: the first
    ``int(N * (1 - test_rate))`` samples form the training partition and the
    remainder the test partition. When the samples are stored sorted by label
    this yields partitions containing a single class, so an optional ``seed``
    can be supplied to split a reproducible random permutation instead. Build
    the train and test datasets with the same ``seed`` to get complementary,
    non-overlapping partitions.

    Args:
        GSTcoe_all: Coefficients for all samples; must support the 3-D
            indexing ``[index, :, :]`` used by ``__getitem__``.
        label_all: One label per sample, aligned with ``GSTcoe_all``.
        split: ``'train'`` or ``'test'``.
        test_rate: Fraction of the samples assigned to the test partition.
        seed: ``None`` (default) keeps the positional split. An integer
            enables the shuffled split; in that case both inputs must support
            integer-array indexing (e.g. NumPy arrays).

    Raises:
        ValueError: If ``GSTcoe_all`` and ``label_all`` differ in length.
        RuntimeError: If ``split`` is neither ``'train'`` nor ``'test'``.
    """

    def __init__(self, GSTcoe_all, label_all, split='train', test_rate=0.2, seed=None):
        if len(GSTcoe_all) != len(label_all):
            raise ValueError(
                'GSTcoe_all and label_all must have the same length, got '
                '{} and {}'.format(len(GSTcoe_all), len(label_all)))

        # Attribute name (sic) kept as-is for backward compatibility.
        self.lenth = len(label_all)

        if split not in ('train', 'test'):
            raise RuntimeError('Invalid split')

        # NOTE: no feature normalisation is applied; the coefficients are
        # served exactly as provided.
        train_idx = int(self.lenth*(1-test_rate))

        if seed is None:
            # Positional split (original behaviour).
            chosen = slice(0, train_idx) if split == 'train' else slice(train_idx, None)
        else:
            # Same seed -> same permutation, so 'train' and 'test' never overlap.
            order = np.random.RandomState(seed).permutation(self.lenth)
            chosen = order[:train_idx] if split == 'train' else order[train_idx:]

        self.GSTcoe = GSTcoe_all[chosen]
        self.labels = label_all[chosen]

    def __getitem__(self, index):
        """Return ``(coefficients, label)`` for the sample at ``index``."""
        return self.GSTcoe[index,:,:], self.labels[index]

    def __len__(self):
        """Number of samples in this partition."""
        return len(self.labels)

class MLPs(nn.Module):
    """Two-layer MLP classifier over a flattened coefficient vector.

    The network is ``Linear(nodeNum * coeNum, midnum) -> ReLU ->
    Linear(midnum, class_num)`` and returns raw (unnormalised) scores.

    Args:
        class_num: Number of output scores.
        midnum: Width of the hidden layer.
        nodeNum: Required. Multiplied by ``coeNum`` to obtain the input width.
        coeNum: Per-node feature count. Defaults to 63, the value that was
            previously hard-coded in the layer definition.

    Raises:
        TypeError: If ``nodeNum`` is left as ``None``.
    """

    def __init__(self, class_num=2, midnum = 128, nodeNum=None, coeNum=63):
        super(MLPs, self).__init__()
        if nodeNum is None:
            # Previously this surfaced as an opaque "NoneType * int" TypeError.
            raise TypeError("MLPs requires an integer `nodeNum` (got None)")
        self.nodeNum = nodeNum
        self.coeNum = coeNum
        self.mlp1 = nn.Linear(in_features=self.nodeNum*self.coeNum, out_features=midnum, bias=True)
        self.mlp2 = nn.Linear(in_features=midnum, out_features=class_num, bias=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map inputs whose last dimension is ``nodeNum * coeNum`` to class scores."""
        x = self.mlp1(x)
        x = self.relu(x)
        x = self.mlp2(x)
        return x

def computeNcoe(scale,layers):
    """Return ``scale**0 + scale**1 + ... + scale**(layers - 1)``.

    The result is 0 when ``layers`` is 0.
    """
    total = 0
    for exponent in range(layers):
        total += scale ** exponent
    return total

class Perceptron(nn.Module):
    """Single linear unit: maps ``input_dim`` features to one raw score."""

    def __init__(self, input_dim):
        super().__init__()
        # One output feature; bias is enabled (nn.Linear default).
        self.layer = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to the last dimension of ``x``."""
        return self.layer(x)

def sign(x):
    """Overwrite ``x`` in place with +1 where ``x >= 0`` and -1 where ``x < 0``.

    The same (mutated) object is returned. Entries that satisfy neither
    comparison (e.g. NaN) are left untouched.
    """
    # Both masks are taken up front; writing +1 never affects the negatives.
    is_negative = x < 0
    is_non_negative = x >= 0
    x[is_non_negative] = 1
    x[is_negative] = -1
    return x

def loss_func(scores, label, type="svm"):
    """Summed perceptron or SVM (hinge) loss for labels taking values in {-1, 1}.

    Args:
        scores: Raw model outputs.
        label: Targets in {-1, 1}, broadcastable against ``scores``.
        type: ``"perceptron"`` uses ``max(0, -label*scores)``; ``"svm"`` uses
            ``max(0, 1 - label*scores)``.

    Returns:
        Scalar tensor: the per-sample losses summed over all elements.
    """
    assert type=="perceptron" or type=="svm", "loss type error"

    # Perceptron loss has no margin; the SVM loss requires a margin of 1.
    margin = -label*scores if type == "perceptron" else 1-label*scores

    # Zero out every non-positive entry (out-of-place form of loss[loss<=0] = 0).
    clipped = margin.masked_fill(margin <= 0, 0)
    return torch.sum(clipped)

def pred(x):
    """Return the hard decision for raw scores ``x`` by delegating to ``sign``.

    Whatever ``sign`` does to ``x`` (including in-place modification) applies
    here unchanged.
    """
    decisions = sign(x)
    return decisions

def valid(test_loader, model, device=None):
    """Compute sign-based binary accuracy of ``model`` over ``test_loader``.

    Scores and labels are both binarised (``> 0`` becomes class 1,
    ``<= 0`` becomes class 0) and compared with ``accuracy_score``.

    Args:
        test_loader: Iterable yielding ``(inputs, target)`` tensor batches.
        model: Callable whose output has one score per sample after
            ``.squeeze(1).squeeze(1)``.
        device: Device the inputs are moved to. When ``None`` it is taken from
            the model's first parameter, falling back to CPU for callables
            without parameters. (The function used to read a notebook-global
            ``device``, which made it depend on hidden kernel state.)

    Returns:
        The accuracy reported by ``accuracy_score``.
    """
    if device is None:
        try:
            device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            device = torch.device('cpu')

    pred_scores = []
    labels = []
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        for inputs, target in test_loader:
            scores = model(inputs.to(device).float()).squeeze(1).squeeze(1)
            pred_scores.extend(scores.detach().cpu().reshape(-1).tolist())
            # Plain Python floats; `np.float` was removed in NumPy 1.24.
            labels.extend(float(v) for v in target.detach().cpu().reshape(-1).tolist())

    # Binarise labels: positive -> 1, everything else -> 0.
    labels = np.asarray(labels, dtype=float)
    labels[labels>0]=1
    labels[labels<=0]=0

    # Binarise predictions with the same rule.
    pred_scores = np.asarray(pred_scores, dtype=float)
    pred_scores[pred_scores>0]=1
    pred_scores[pred_scores<=0]=0

    acc = accuracy_score(labels, pred_scores)
    return acc

In [14]:
# ---- Experiment configuration ----
# Each flag is declared as (name, add_argument keyword options) and registered
# in order. parse_known_args() tolerates unrecognised command-line arguments
# (presumably the Jupyter kernel's own argv -- confirm).
parser = argparse.ArgumentParser(description="GST configuration")
_arg_specs = [
    ("--datadir", dict(type=str, default='/DATA7_DB7/data/gjliu/dataset', help="path of dataset")),
    ("--dataset", dict(type=str, default='DD', help="name of dataset")),
    ("--split", dict(type=str, default='train')),
    ("--epochs", dict(type=int, default= 10000)),
    ("--batchsize", dict(type=int, default= 1, help="batch size of dataset")),
    ('--workers', dict(default=1, type=int, metavar='N')),
    ("--numScales", dict(type=int, default= 5, help="size of filter bank")),
    ("--numLayers", dict(type=int, default= 5, help="layers of GST")),
]
for _flag, _options in _arg_specs:
    parser.add_argument(_flag, **_options)
args, _unknown_argv = parser.parse_known_args()

# Hard-coded GPU selection for this machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Number of coefficients implied by the scale/layer settings.
num_gst_coe = computeNcoe(args.numScales, args.numLayers)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# NOTE(review): the dataset name is fixed here; args.dataset is not consulted.
data = dgl.data.TUDataset('ENZYMES')


In [17]:
# Inspect the first sample to sanity-check the dataset's structure.
g, label = data[0]
for _caption, _value in (
    ('Number of categories:', data.num_labels),
    ('Number of num_nodes:', g.num_nodes()),
    ('Number of num_edges:', g.num_edges()),
    ('label:', label),
    ('node_attr', g.ndata['node_attr'].shape),
):
    print(_caption, _value)

Number of categories: 6
Number of num_nodes: 37
Number of num_edges: 168
label: tensor(5)
node_attr torch.Size([37, 18])


In [18]:
# Shape check: node attributes with their two axes swapped and a leading
# singleton axis added.
_attr_swapped = g.ndata['node_attr'].permute(1,0).numpy()
_attr_batched = np.expand_dims(_attr_swapped, axis=0)
print(np.shape(_attr_batched))

(1, 18, 37)


In [13]:
# Prepend two singleton axes to the node attributes and report the shape.
_raw_attr = g.ndata['node_attr'].numpy()
node_attr = _raw_attr[np.newaxis, np.newaxis, ...]
print(node_attr.shape)

(1, 1, 42)


In [None]:
# Compute diffusion-scattering coefficients for every graph in `data`.
# One label per graph; the coefficient buffer is allocated once the first
# transform reveals the number of coefficients.
label_all = np.zeros(len(data))
for k,(g, labels) in tqdm(enumerate(data), total=len(data)):

    # Dense adjacency: A[u, v] = 1 for every edge (u, v). The edge tensors are
    # fetched once per graph instead of being re-materialised for each edge.
    src, dst = g.edges()
    A = np.zeros((g.num_nodes(),g.num_nodes()))
    A[src.cpu().numpy(), dst.cpu().numpy()] = 1

    GSTmodel = np_GST.DiffusionScattering(args.numScales, args.numLayers, A)

    # The graph signal is an all-ones vector (one value per node) rather than
    # the dataset's node attributes, with two leading singleton axes added.
    fake_node_attr = np.ones(g.num_nodes())
    node_attr = np.expand_dims(np.expand_dims(fake_node_attr, axis=0), axis=0)
    co_GST = GSTmodel.computeTransform(node_attr)
    if k == 0:
        num_coe = np.shape(co_GST)[2]
        GSTcoe_all = np.zeros((len(data),1,num_coe))

    GSTcoe_all[k] = co_GST[0]
    label_all[k] = int(labels.item())

# NOTE(review): the output paths name COLLAB; confirm they match the dataset
# that was actually loaded into `data`.
np.save('/DATA7_DB7/data/gjliu/dataset/COLLAB/allphi_COLLAB.npy',GSTcoe_all)
np.save('/DATA7_DB7/data/gjliu/dataset/COLLAB/alllabel_COLLAB.npy',label_all)
print(label_all)
# Remap labels y -> 2*y - 1 (i.e. {0, 1} -> {-1, 1}); the saved file above
# keeps the original label values.
label_all = label_all*2-1
print(label_all)
print(num_coe)

0it [00:00, ?it/s]

In [79]:
# Peek at the label of the first sample.
_first_sample = data[0]
g, labels = _first_sample
print(labels)

tensor(0)


In [26]:
# Shape of the label vector.
_label_shape = np.shape(label_all)
print(_label_shape)

(1113,)


In [29]:
# Length of the label vector once its first two entries are skipped.
_label_tail = label_all[2:]
print(len(_label_tail))

1111


In [22]:
# Show the shape and values of the first sample's coefficients.
_first_coe = GSTcoe_all[0]
print('np.shape(GSTcoe_all[0])', np.shape(_first_coe))
print(_first_coe)

np.shape(GSTcoe_all[0]) (1, 781)
[[8.22222222e+00 2.27079080e+00 7.93636873e-01 7.64380080e-01
  6.67090487e-01 3.84449919e-01 7.16750186e-01 2.77906196e-01
  2.67350304e-01 2.45602695e-01 2.03523167e-01 2.79047787e-01
  7.71773504e-02 6.95870825e-02 6.46447170e-02 5.55899136e-02
  3.10825909e-01 9.06355558e-02 5.92137860e-02 5.09254776e-02
  4.73949939e-02 2.38204682e-01 7.00564701e-02 4.28111443e-02
  2.84209054e-02 1.84420264e-02 8.82163381e-02 4.02961963e-02
  3.51149572e-02 2.45288696e-02 1.95637844e-02 2.87021262e-01
  8.24744612e-02 7.31945365e-02 4.98342149e-02 3.24342970e-02
  1.36340936e-01 2.97025268e-02 2.73919194e-02 2.20415321e-02
  1.53706712e-02 8.93182020e-02 2.59239396e-02 2.27581883e-02
  1.84113636e-02 1.46757966e-02 5.54591849e-02 2.40960491e-02
  1.93493495e-02 1.30199606e-02 8.55968007e-03 3.40753147e-02
  1.83193474e-02 1.79125772e-02 1.43881176e-02 1.18634534e-02
  7.14946936e-02 3.45051481e-02 3.71366545e-02 3.40774451e-02
  2.65244057e-02 3.91907177e-02 1.016

In [36]:

# ---- Train a linear classifier on the scattering coefficients ----
# Debug-sized defaults: one epoch, stopping after the first batch. They are
# named here so the limits are visible instead of buried in the loop.
NUM_EPOCHS = 1               # set to args.epochs for a full training run
MAX_BATCHES_PER_EPOCH = 1    # set to None to iterate over the whole train_loader
EVAL_EVERY = 1               # evaluate on the test split every N epochs

GSTdataset_train = GST_coef_dataset(GSTcoe_all, label_all, 'train')
GSTdataset_test = GST_coef_dataset(GSTcoe_all, label_all, 'test')
train_loader = torch.utils.data.DataLoader(GSTdataset_train, batch_size=4,
               shuffle=True, pin_memory=True)
test_loader = torch.utils.data.DataLoader(GSTdataset_test, batch_size=4,
               shuffle=False, pin_memory=True)
model = Perceptron(input_dim = num_coe)
model = nn.DataParallel(model, device_ids = [i for i in range(torch.cuda.device_count())])
model.to(device)
optimizer = SGD(model.parameters(), lr=0.5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=int(100), gamma=0.3)

if MAX_BATCHES_PER_EPOCH is None:
    batches_per_epoch = len(train_loader)
else:
    batches_per_epoch = min(len(train_loader), MAX_BATCHES_PER_EPOCH)

st = time.time()
# ------------------------------------- Training phase -----------------------------------------------
for epoch in range(NUM_EPOCHS):
    st_epoch = time.time()
    # Report progress against the number of epochs that will actually run.
    print('Epoch {}/{}'.format(epoch, NUM_EPOCHS - 1))
    print('-' * 10)
    bar = Bar('>>>', fill='>', max=batches_per_epoch)
    loss_train = []
    # `inputs` rather than `input`: a notebook-level loop variable named
    # `input` would replace the builtin for every later cell.
    for i, (inputs, target) in enumerate(train_loader):
        input_var = inputs.to(device).float()
        target_var = target.to(device).int()

        # Forward pass
        scores = model(input_var).squeeze(1).squeeze(1)
        print('np.shape(scores)',np.shape(scores),scores)
        print('np.shape(target_var)',np.shape(target_var),target_var)
        loss = loss_func(scores, target_var, "svm")

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_train.append(loss.item())
        bar.next()
        # Stop early without fetching another batch once the cap is reached.
        if MAX_BATCHES_PER_EPOCH is not None and i + 1 >= MAX_BATCHES_PER_EPOCH:
            break
    bar.finish()
    scheduler.step()
    if epoch % EVAL_EVERY == 0:
        # Classification accuracy on the test split
        acc = valid(test_loader,model)
        print("loss=", np.mean(loss_train),"acc=", acc)

    bt_epoch = time.time()
    print('epoch time:',bt_epoch-st_epoch,'  total time:', bt_epoch - st)

Epoch 0/9999
----------
np.shape(scores) torch.Size([4]) tensor([0.3225, 0.2459, 0.2877, 0.3435], device='cuda:0',
       grad_fn=<SqueezeBackward1>)
np.shape(target_var) torch.Size([4]) tensor([-1, -1, -1, -1], device='cuda:0', dtype=torch.int32)
loss= 5.19954776763916 acc= 0.0
epoch time: 0.03913617134094238   total time: 0.03944993019104004


In [42]:
# Sanity check of accuracy_score on a toy multi-class example
# (two of the four positions agree).
_toy_truth = [2,3,0,0]
_toy_guess = [1,3,0,2]
acc = accuracy_score(_toy_truth, _toy_guess)
print(acc)

0.5
