In [297]:
from torch_geometric.nn import GATConv
from torch_geometric.nn import SAGEConv
from igraph import Graph
from torch_geometric.utils import dense_to_sparse
from torch.nn import ModuleList
import igraph as ig
import numpy as np
import json
import pandas as pd
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
import torch_geometric.transforms as T
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import (v_measure_score, homogeneity_score, completeness_score)
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score, f1_score
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torch.nn.parameter import Parameter
import scipy.sparse as sp
from torch.nn.modules.module import Module
from sklearn.metrics import classification_report
import torch.optim as optim
from torch.autograd import Variable
import time
import math
import neptune
import os
import random

# Single seed shared by every random number generator used in the notebook.
seed = 42

# Seed Python's, NumPy's and PyTorch's generators with the same value.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [298]:
# Re-seed at the top of the cell so it gives the same result when re-run alone.
seed=42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

graphml_file_path = r'Data/Network/ImGAGN_graph_tclin.graphml'  # Replace according to labels
g = ig.Graph.Read_GraphML(graphml_file_path)

# Extract node features and labels.
# Every vertex attribute except 'name' and 'label' becomes one feature column.
# NOTE(review): any other non-numeric attribute silently becomes an all-zero
# column through the fallback below - confirm that is intended.
features = []
labels = []
for vertex in g.vs:
    labels.append(vertex['label'])
    vertex_features = []
    for attribute, raw_value in vertex.attributes().items():
        if attribute not in ['name', 'label']:
            try:
                value = float(raw_value)  # Convert attribute to float
            except (TypeError, ValueError):
                # float(None) raises TypeError, float("abc") raises ValueError;
                # both mean "not convertible", so both use the default.
                value = 0.0
            vertex_features.append(value)
    features.append(vertex_features)

# Convert features and labels to tensors
features_tensor = torch.tensor(features, dtype=torch.float)
labels_tensor = torch.tensor(labels, dtype=torch.long)

# Convert the edge list to the [2, num_edges] layout used by PyTorch Geometric
edges = g.get_edgelist()
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

# Create the PyTorch Geometric Data object
data = Data(x=features_tensor, edge_index=edge_index, y=labels_tensor)

print(data)

random_seed = 42

Data(x=[6048, 20], edge_index=[2, 20697], y=[6048])


In [299]:
def normalize(mx):
    """Row-normalize a sparse matrix so that each non-empty row sums to 1.

    Args:
        mx: a scipy sparse matrix (anything supporting ``.sum(1)`` and
            left-multiplication by a sparse diagonal matrix).

    Returns:
        The product ``diag(1 / rowsum) @ mx``. Rows whose sum is zero are left
        as all-zero rows instead of producing inf/NaN.
    """
    rowsum = np.asarray(mx.sum(1)).flatten()
    # Integer inputs cannot be raised to a negative power by NumPy; promote
    # them to float. Floating inputs keep their original dtype.
    if not np.issubdtype(rowsum.dtype, np.floating):
        rowsum = rowsum.astype(np.float64)
    # Empty rows give 1/0 = inf, which is replaced on the next line, so the
    # divide-by-zero warning is expected and suppressed here.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix into a float32 torch sparse COO tensor.

    Args:
        sparse_mx: a scipy sparse matrix (any format; converted with ``tocoo``).

    Returns:
        A ``torch`` sparse COO tensor with the same shape and non-zero entries.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor(...) is a deprecated constructor; the factory
    # function below is its documented replacement.
    return torch.sparse_coo_tensor(indices, values, shape)

def add_edges(adj_real, adj_new):
    """Merge two sparse adjacency matrices into one normalized torch tensor.

    The two matrices are summed, symmetrized by keeping the larger of each
    (i, j) / (j, i) pair, given self-loops, row-normalized with ``normalize``
    and converted with ``sparse_mx_to_torch_sparse_tensor``.
    """
    combined = adj_real + adj_new
    # True where the transposed entry is larger than the entry itself.
    transpose_larger = combined.T > combined
    symmetric = (
        combined
        + combined.T.multiply(transpose_larger)
        - combined.multiply(transpose_larger)
    )
    with_self_loops = symmetric + sp.eye(symmetric.shape[0])
    return sparse_mx_to_torch_sparse_tensor(normalize(with_self_loops))

def accuracy(output, labels, output_AUC):
    """Compute binary classification metrics from model outputs.

    Args:
        output: 2-D tensor of per-class scores; the prediction is the arg-max
            over dimension 1.
        labels: tensor of ground-truth class indices for the same rows.
        output_AUC: 1-D tensor of scores passed to ``roc_auc_score``.

    Returns:
        Tuple ``(recall, f1, AUC, accuracy, precision, confusion_matrix)``.
    """
    preds = output.max(1)[1].type_as(labels)

    # Convert once to host NumPy arrays so every sklearn call (including
    # confusion_matrix) also works when the tensors live on the GPU.
    y_true = labels.detach().cpu().numpy()
    y_pred = preds.detach().cpu().numpy()
    y_score = output_AUC.detach().cpu().numpy()

    confusion_mat = confusion_matrix(y_true, y_pred)

    recall = recall_score(y_true, y_pred, zero_division=0)
    # zero_division=0 matches recall/precision: same value (0) when there are
    # no positive predictions or labels, without the UndefinedMetricWarning.
    f1_score_ = f1_score(y_true, y_pred, zero_division=0)
    AUC = roc_auc_score(y_true, y_score)
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    return recall, f1_score_, AUC, acc, precision, confusion_mat

def load_data(data, ratio_generated, train_idx=None, test_idx=None):
    """Build the augmented-graph inputs (real nodes plus placeholder generated nodes).

    Unlike the previous version, this function does not overwrite the
    notebook-level ``idx_train`` / ``idx_test``; it only reads them as defaults.

    Args:
        data: object exposing ``x`` (node features), ``y`` (node labels, 0 =
            majority, 1 = minority) and ``edge_index`` (2 x E tensor).
        ratio_generated: the number of generated minority nodes is
            ``int(ratio_generated * num_majority) - num_minority`` (never
            below zero), counted on the training split.
        train_idx: indices of training nodes. Defaults to the notebook-level
            ``idx_train``.
        test_idx: indices of test nodes. Defaults to the notebook-level
            ``idx_test``.

    Returns:
        Tuple ``(adj, adj_real, features, labels, idx_train, idx_test,
        generate_node, minority, majority, minority_all)`` where ``adj`` is the
        normalized symmetric adjacency with self-loops (torch sparse),
        ``adj_real`` the raw scipy COO adjacency sized for real + generated
        nodes, ``features`` the row-normalized real features, ``labels`` the
        real labels followed by 1 for every generated node, ``idx_train`` the
        training indices followed by the generated node ids, and the remaining
        entries are LongTensors of node ids.
    """
    print('Processing graph data...')

    # Fall back to the split defined at notebook level when none is passed.
    if train_idx is None:
        train_idx = idx_train
    if test_idx is None:
        test_idx = idx_test
    train_idx = np.asarray(train_idx, dtype=np.int64)
    test_idx = np.asarray(test_idx, dtype=np.int64)

    labels_real = np.asarray(data.y)
    num_real_nodes = labels_real.shape[0]

    # Split the training nodes by class (order of train_idx is preserved).
    train_labels = labels_real[train_idx]
    majority = train_idx[train_labels == 0]
    minority = train_idx[train_labels == 1]

    num_minority = minority.shape[0]
    num_majority = majority.shape[0]
    print("Number of majority: ", num_majority)
    print("Number of minority: ", num_minority)

    # Generated nodes get ids directly after the real nodes and label 1.
    num_generated = max(int(ratio_generated * num_majority) - num_minority, 0)
    generate_node = np.arange(num_real_nodes, num_real_nodes + num_generated, dtype=np.int64)
    generate_label = np.ones(num_generated, dtype=labels_real.dtype)
    train_idx_augmented = np.hstack((train_idx, generate_node))

    minority_test = test_idx[labels_real[test_idx] == 1]
    minority_all = np.hstack((minority, minority_test))

    labels_all = np.hstack((labels_real, generate_label))
    num_total_nodes = labels_all.shape[0]

    # Construct the adjacency matrix, sized to include the generated nodes.
    edge_index = data.edge_index.numpy()
    adj_real = sp.coo_matrix((np.ones(edge_index.shape[1]), (edge_index[0], edge_index[1])),
                             shape=(num_total_nodes, num_total_nodes), dtype=np.float32)

    # Symmetrize by keeping the larger of each (i, j) / (j, i) pair.
    transpose_larger = adj_real.T > adj_real
    adj = adj_real + adj_real.T.multiply(transpose_larger) - adj_real.multiply(transpose_larger)

    # Row-normalize the features and the adjacency (with self-loops added).
    features = normalize(sp.csr_matrix(np.asarray(data.x)))
    adj = normalize(adj + sp.eye(adj.shape[0]))

    # Convert to PyTorch tensors
    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(labels_all)
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    return (adj, adj_real, features, labels,
            torch.LongTensor(train_idx_augmented),
            torch.LongTensor(test_idx),
            torch.LongTensor(generate_node),
            torch.LongTensor(minority),
            torch.LongTensor(majority),
            torch.LongTensor(minority_all))

In [301]:
class GraphSAGE(nn.Module):
    """Stack of SAGEConv layers with two output heads.

    ``forward`` returns the log-softmax of an ``nclass``-way head, the
    log-softmax of a separate 2-way head, and the softmax probability of the
    last class of the ``nclass`` head.
    """

    def __init__(self, nfeat, layer_channels, nclass, dropout, generate_node, min_node):
        super().__init__()

        # Hidden stack: nfeat -> layer_channels[0] -> ... -> layer_channels[-1]
        widths = [nfeat] + list(layer_channels)
        self.convs = ModuleList(
            [SAGEConv(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )

        last_width = layer_channels[-1]
        self.gc_nclass = SAGEConv(last_width, nclass)
        self.gc_2 = SAGEConv(last_width, 2)
        # Not used by forward() or get_embedding() in this class.
        self.attention = Attention(nfeat*2, 1)
        self.generate_node = generate_node
        self.min_node = min_node
        self.dropout = dropout
        self.eps = 1e-10

    @staticmethod
    def _edge_index_of(adj):
        """Return the 2 x E index tensor for a sparse or dense adjacency."""
        if adj.is_sparse:
            # Sparse tensors already store their coordinates.
            return adj._indices()
        # Dense adjacency: convert, discarding the edge weights.
        edge_index, _ = dense_to_sparse(adj)
        return edge_index

    def forward(self, x, adj):
        edge_index = self._edge_index_of(adj)

        hidden = x
        for conv in self.convs:
            hidden = F.relu(conv(hidden, edge_index))
            hidden = F.dropout(hidden, self.dropout, training=self.training)

        class_logits = self.gc_nclass(hidden, edge_index)
        binary_logits = self.gc_2(hidden, edge_index)
        return (
            F.log_softmax(class_logits, dim=1),
            F.log_softmax(binary_logits, dim=1),
            F.softmax(class_logits, dim=1)[:,-1],
        )

    def get_embedding(self, x, adj):
        """Return the activations of the last hidden layer (no dropout, no heads)."""
        edge_index = self._edge_index_of(adj)
        hidden = x
        for conv in self.convs:
            hidden = F.relu(conv(hidden, edge_index))
        return hidden

In [304]:
class Attention(nn.Module):
    """Two-layer MLP: Linear(input_dim -> input_dim // 2), ReLU, Linear(-> output_dim)."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_dim = input_dim // 2
        stages = [
            nn.Linear(input_dim, hidden_dim, bias=True),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim, bias=True),
        ]
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        scores = self.mlp(x)
        return scores

In [306]:
class Generator(nn.Module):
    """MLP generator mapping latent noise to ``dim`` values in the range [0, 1].

    Args:
        dim: size of the output vector per sample.
        latent_dim: size of the input noise vector (default 100, the value
            that used to be hard-coded).
        hidden_dim: width of both hidden layers (default 200, the value that
            used to be hard-coded).
    """

    def __init__(self, dim, latent_dim=100, hidden_dim=200):
        super().__init__()
        self.latent_dim = latent_dim

        self.fc1 = nn.Linear(latent_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, dim)
        self.fc4 = nn.Tanh()

    def forward(self, x):
        """Map a batch of noise vectors of shape (batch, latent_dim) to (batch, dim)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.fc4(x)
        # Rescale tanh's [-1, 1] output to [0, 1].
        x = (x+1)/2
        return x

In [309]:
def train(features, adj):
    """Run one optimisation step of the classifier, then evaluate it.

    Relies on notebook-level state: ``model``, ``optimizer``, ``labels``,
    ``idx_train``, ``idx_val``, ``idx_test``, ``minority``, ``majority``,
    ``num_real``, ``num_false``, ``cuda`` and ``fastmode``.

    Args:
        features: node feature matrix passed to ``model``.
        adj: adjacency passed to ``model``.

    Returns:
        Tuple ``(test_recall, test_pre, test_f1, test_AUC, test_acc, confusion,
        loss_train)``. The same test metrics, plus ``max_recall``, are also
        written to the notebook-level globals.
    """
    global max_recall, test_recall, test_f1, test_AUC, test_acc, test_pre, confusion
    model.train()
    optimizer.zero_grad()
    output, output_gen, output_AUC = model(features, adj)

    # Target for the 2-way head: 0 for the real training nodes, 1 for the
    # generated nodes (idx_train lists real nodes first, generated ones last).
    labels_true = torch.cat((torch.zeros(num_real, dtype=torch.long),
                             torch.ones(num_false, dtype=torch.long)))

    if cuda:
        labels_true=labels_true.cuda()

    # Negative mean pairwise distance between minority and majority features.
    loss_dis = - euclidean_dist(features[minority], features[majority]).mean()
    loss_train =  F.nll_loss(output[idx_train], labels[idx_train]) + F.nll_loss(output_gen[idx_train], labels_true) + loss_dis

    loss_train.backward()
    optimizer.step()

    if not fastmode:
        # Re-evaluate once with dropout disabled and without autograd; the
        # result is reused for both validation and test metrics. In fastmode
        # the training-pass output is reused and no extra forward pass is run.
        model.eval()
        with torch.no_grad():
            output, output_gen, output_AUC = model(features, adj)

    # Only recall and accuracy of the validation fold are used below.
    recall_val, _, _, acc_val, _, _ = accuracy(output[idx_val], labels[idx_val], output_AUC[idx_val])

    test_recall, test_f1, test_AUC, test_acc, test_pre, confusion = accuracy(
        output[idx_test], labels[idx_test], output_AUC[idx_test])
    max_recall = (recall_val + acc_val)/2

    return test_recall, test_pre, test_f1, test_AUC, test_acc, confusion, loss_train


def euclidean_dist(x, y):
    """Pairwise Euclidean distance between the rows of ``x`` and the rows of ``y``.

    Args:
        x: 2-D tensor with m rows.
        y: 2-D tensor with n rows and the same number of columns as ``x``.

    Returns:
        An m x n tensor whose (i, j) entry is ``||x[i] - y[j]||``. Squared
        distances are clamped to at least 1e-12 before the square root, so the
        smallest value returned is 1e-6.
    """
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(y, 2).sum(1, keepdim=True).expand(n, m).t()
    # ||x||^2 + ||y||^2 - 2 x.y, using the keyword form of addmm instead of the
    # deprecated positional addmm_(beta, alpha, mat1, mat2) overload.
    dist = torch.addmm(xx + yy, x, y.t(), beta=1, alpha=-2)
    dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability
    return dist


In [310]:
# Re-seed so the split below is the same every time this cell is run.
seed=42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


# Node ids per class, taken from the data itself rather than a hard-coded node count.
node_labels = data.y.numpy()
targets = np.flatnonzero(node_labels == 1)
ntargets = np.flatnonzero(node_labels == 0)

# 70/30 split done separately per class. No random_state is passed, so the
# split is driven by the NumPy global RNG seeded above.
targets_train, targets_test, train_label_1, test_label_1 = train_test_split(targets, np.ones(len(targets)), test_size=0.3)
ntargets_train, ntargets_test, train_label_0,test_label_0 = train_test_split(ntargets, np.zeros(len(ntargets)), test_size=0.3)

idx_train = np.concatenate((np.array(targets_train),np.array(ntargets_train)))
idx_test = np.concatenate((ntargets_test, targets_test))

idx_train_origin = idx_train
print(idx_train.shape, idx_test.shape)



(4233,) (1815,)


In [311]:


# ---------------------------------------------------------------------------
# Hyper-parameters
# ---------------------------------------------------------------------------
# One active set (Tclin) followed by alternative sets kept inside an inert
# string literal for reference.

# Tclin hyper-parameters
ratio = 1  # passed to load_data as ratio_generated
layer_channels = [40, 80, 80]  # output widths of the hidden SAGEConv layers
dropout = 0.3 # other values tried: 0.2, 0.25, 0.3, 0.4, 0.5
lr = 0.005  # learning rate shared by both Adam optimizers below
weight_decay = 0.0004   # other values tried: 0.0003, 0.0004, 0.0005, 0.0007, 0.0009, 0.001, 0.002, 0.003
fastmode = False  # read by train(): when False the model is re-evaluated in eval mode
no_cuda = False  # set True to force CPU even when CUDA is available
num= 10  # number of folds the training nodes are rotated through in the training loop
seed= 42
epochs_gen = 13  # generator (outer) epochs
epochs = 20 # classifier (inner) epochs per generator epoch; other values tried: 25, 30, 20
alpha = 0.2  # NOTE(review): not referenced anywhere in the visible code - confirm it is still needed


'''
#Pancreatic hype
ratio = 1
layer_channels = [40, 80, 80]
dropout = 0.7 #0.2, 0.25, 0.3, 0.4, 0.5
lr = 0.0007
weight_decay = 0.001   #0.0003, 0.0004, 0.0005, 0.0007, 0.0009, 0.001, 0.002, 0.003
fastmode = False
no_cuda = False
num= 10
seed= 42
epochs_gen = 5
epochs = 25 #25, 30, 20
alpha = 0.2



#Leukemia
ratio = 1
layer_channels = [40, 80, 80]
dropout = 0.7 #0.2, 0.25, 0.3, 0.4, 0.5
lr = 0.002
weight_decay = 0.001   #0.0003, 0.0004, 0.0005, 0.0007, 0.0009, 0.001, 0.002, 0.003
fastmode = False
no_cuda = False
num= 10
seed= 42
epochs_gen = 27
epochs = 25 #25, 30, 20
alpha = 0.2
'''
# ---------------------------------------------------------------------------
# Setup: seeding, data, models, optimizers
# ---------------------------------------------------------------------------

# Re-seed before any model is built; the order of the statements below
# determines how the torch RNG is consumed by parameter initialisation.
np.random.seed(seed)
torch.manual_seed(seed)

cuda = not no_cuda and torch.cuda.is_available()

# load_data reads the notebook-level idx_train / idx_test produced by the split cell.
adj, adj_real, features, labels, idx_temp, idx_test, generate_node, minority, majority, minority_all = load_data(data, ratio)

print(adj.shape, adj_real.shape, features.shape, labels.shape, idx_temp.shape, idx_test.shape, generate_node.shape, minority.shape, majority.shape, minority_all.shape)


# Classifier / discriminator.
model = GraphSAGE(nfeat=features.shape[1],
    layer_channels = layer_channels,
    nclass=labels.max().item() + 1,
    dropout=dropout,
    generate_node= generate_node,
    min_node = minority)

optimizer = optim.Adam(model.parameters(),lr=lr, weight_decay=weight_decay)

# labels covers real + generated nodes while features covers only real nodes,
# so the difference is the number of generated nodes.
num_false = labels.shape[0]- features.shape[0]
# The generator emits one value per node in minority_all for each generated node.
model_generator = Generator(minority_all.shape[0])
optimizer_G = torch.optim.Adam(model_generator.parameters(),
                       lr=lr, weight_decay=weight_decay)

# Notebook-level metric holders; train() overwrites them through `global`.
max_recall = 0
test_recall = 0
test_f1 = 0
test_AUC = 0
test_acc=0
test_pre =0
confusion = 0 

# Move models and the main tensors to the GPU. minority, majority, minority_all
# and generate_node are not moved here and stay on the CPU.
if cuda:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()
    idx_temp = idx_temp.cuda()
    idx_test = idx_test.cuda()
    model_generator.cuda()



for epoch_gen in range(epochs_gen):
    # Rotate through `num` contiguous folds of the training nodes: fold `part`
    # is held out for validation, the rest (plus generated nodes) is trained on.
    part = epoch_gen % num
    maj_lo, maj_hi = int(part*len(majority)/num), int((part+1)*len(majority)/num)
    min_lo, min_hi = int(part*len(minority)/num), int((part+1)*len(minority)/num)

    range_val_maj = range(maj_lo, maj_hi)
    range_val_min = range(min_lo, min_hi)

    range_train_maj = list(range(0, maj_lo)) + list(range(maj_hi, len(majority)))
    range_train_min = list(range(0, min_lo)) + list(range(min_hi, len(minority)))

    idx_val = torch.cat((majority[range_val_maj], minority[range_val_min]))
    idx_train = torch.cat((majority[range_train_maj], minority[range_train_min]))
    idx_train = torch.cat((idx_train, generate_node))

    # Number of real (non-generated) nodes in idx_train; read by train().
    num_real = features.shape[0] - len(idx_test) -len(idx_val)

    # ---- Generator step ----
    model_generator.train()
    optimizer_G.zero_grad()
    # One 100-dimensional noise vector per generated node (100 matches the
    # generator's default input size). Plain tensors replace the no-op
    # `Variable` wrapper.
    z = torch.FloatTensor(np.random.normal(0, 1, (generate_node.shape[0], 100)))
    if cuda:
        z=z.cuda()

    adj_min = model_generator(z)
    # Features of generated nodes: softmax-weighted mixtures of minority features.
    gen_imgs1 = torch.mm(F.softmax(adj_min[:,0:minority.shape[0]], dim=1), features[minority])
    gen_imgs1_all = torch.mm(F.softmax(adj_min, dim=1), features[minority_all])

    # Link each generated node to the training-minority nodes whose softmax
    # weight exceeds the uniform weight 1 / (number of columns).
    matr = F.softmax(adj_min[:,0:minority.shape[0]], dim =1).data.cpu().numpy()
    pos=np.where(matr>1/matr.shape[1])

    adj_temp = sp.coo_matrix((np.ones(pos[0].shape[0]),(generate_node[pos[0]].numpy(), minority_all[pos[1]].numpy())),
                             shape=(labels.shape[0], labels.shape[0]),
                             dtype=np.float32)

    adj_new = add_edges(adj_real, adj_temp)
    if cuda:
        adj_new=adj_new.cuda()

    t_total = time.time()
    # The generated features are detached here, so only the distance term of
    # g_loss back-propagates into the generator.
    # NOTE(review): this pass uses `adj`, not `adj_new` - confirm that is intended.
    output, output_gen, output_AUC = model(torch.cat((features, gen_imgs1.detach()),0), adj)

    labels_true = torch.LongTensor(num_false).fill_(0)
    labels_min = torch.LongTensor(num_false).fill_(1)
    if cuda:
        labels_true = labels_true.cuda()
        labels_min = labels_min.cuda()
    g_loss = F.nll_loss(output_gen[generate_node], labels_true) \
            + F.nll_loss(output[generate_node], labels_min) \
            + euclidean_dist(features[minority], gen_imgs1).mean()
    g_loss.backward()
    optimizer_G.step()

    # ---- Classifier steps ----
    # The augmented feature matrix is the same for every inner epoch, so it is
    # built once instead of once per epoch.
    train_features = torch.cat((features, gen_imgs1.detach()),0)
    for epoch in range(epochs):
        test_recall, test_pre, test_f1, test_AUC, test_acc, confusion, t_loss = train(train_features, adj_new)

    # The old "g_loss=".format(g_loss) had no placeholder, so the loss value
    # was never printed.
    print("g_loss=", "{:.5f}".format(g_loss.item()), "Epoch:", '%04d' % (epoch_gen + 1),
        "recall=", "{:.5f}".format(test_recall), "precision=", "{:.5f}".format(test_pre),"f1=", "{:.5f}".format(test_f1), "AUC=", "{:.5f}".format(test_AUC), "ACC=", "{:.5f}".format(test_acc))



# Metrics from the last inner epoch of the last generator epoch.
print()
print('final_test')
print()
print("Test Recall: ", test_recall)
print("Test Accuracy: ", test_acc)
print("Test F1: ", test_f1)
print("Test precision: ", test_pre)
print("Test AUC: ", test_AUC)

Processing graph data...
Number of majority:  4167
Number of minority:  66
torch.Size([10149, 10149]) (10149, 10149) torch.Size([6048, 20]) torch.Size([10149]) torch.Size([8334]) torch.Size([1815]) torch.Size([4101]) torch.Size([66]) torch.Size([4167]) torch.Size([95])


  r_inv = np.power(rowsum, -1).flatten()


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/georgewang2008/ImGAGN/e/IM-118
g_loss= Epoch: 0001 recall= 1.00000 precision= 0.01728 f1= 0.03398 AUC= 0.67661 ACC= 0.09146
g_loss= Epoch: 0002 recall= 0.79310 precision= 0.02419 f1= 0.04694 AUC= 0.70168 ACC= 0.48540
g_loss= Epoch: 0003 recall= 0.27586 precision= 0.03008 f1= 0.05424 AUC= 0.73126 ACC= 0.84628
g_loss= Epoch: 0004 recall= 0.10345 precision= 0.01987 f1= 0.03333 AUC= 0.74984 ACC= 0.90413
g_loss= Epoch: 0005 recall= 0.06897 precision= 0.02273 f1= 0.03419 AUC= 0.75507 ACC= 0.93774

final_test

Test Recall:  0.06896551724137931
Test Accuracy:  0.9377410468319559
Test F1:  0.03418803418803419
Test precision:  0.022727272727272728
Test AUC:  0.7550681546125034
[[1700   86]
 [  27    2]]


In [None]:
# Export the last-hidden-layer embedding of every node (real + generated).
# The trained `model_generator` is deliberately left alone: re-creating it
# here would replace it with an untrained network.
# NOTE(review): the embedding is computed with `adj`, whereas training used
# `adj_new` (which also links generated nodes to minority nodes) - confirm
# which graph is intended.
with torch.no_grad():
    embedded = model.get_embedding(torch.cat((features, gen_imgs1.data),0), adj)



print(embedded.shape, embedded)

# .cpu() keeps the export working when the model ran on the GPU.
df = pd.DataFrame(embedded.detach().cpu().numpy())
csv_filename = 'Imgagn-Embedding.csv'
df.to_csv(csv_filename, index=False)
print()


torch.Size([10149, 80]) tensor([[0.3395, 0.0364, 0.2212,  ..., 0.2102, 0.2679, 0.2598],
        [0.1764, 0.0568, 0.3672,  ..., 0.1314, 0.1526, 0.1064],
        [1.8216, 0.0000, 0.0000,  ..., 1.0246, 1.3121, 1.4680],
        ...,
        [0.0000, 0.0781, 0.6537,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0783, 0.6546,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0784, 0.6545,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<ReluBackward0>)



array([2980,  699, 1795, ..., 2052, 5331, 5548])