In [144]:

from igraph import Graph
import igraph as ig
import numpy as np
import json
import pandas as pd
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
import torch_geometric.transforms as T
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from sklearn.metrics.cluster import (v_measure_score, homogeneity_score, completeness_score)
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score, f1_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from torch.nn.parameter import Parameter
import scipy.sparse as sp
from torch.nn.modules.module import Module
from sklearn.metrics import classification_report
import torch.optim as optim
from torch.autograd import Variable
import time
import math
import neptune
import os

In [145]:
node_list_path = r'C:\Users\George\Desktop\ISEF-2023\Datas\Node list\back up\current_protein_Signal+meta+targets.json'
with open(node_list_path, 'r') as file:
    node_list = json.load(file)
graph = r'C:\Users\George\Desktop\ISEF-2023\Network construction\PPI_homo_graph_features_loaded.graphml'
# Create an igraph object
PPI_graph = ig.Graph.Load(graph, format='graphml')

feature_keys = [
    "Indegree", "Outdegree", "Closeness", "Betweenness", "Pagerank", "Cluster_coefficients",
    "Nearest_Neighbor_Degree", "Similarity", "Subunit", "Transmembrane",
    "Catalytic_activity", "Interaction", "Tissue_Specificity", "Disease",
    "Sequence_conflict", "Modified_residue", "Function", "Binding_site",
    "Natural_variant", "Alternative_products", "Subcellular_location",
    "Active_site", "Disulfide_bond", "Mutagenesis", "PTM", "STP_involvement"
]

features = torch.tensor([
    PPI_graph.vs[key] for key in feature_keys
], dtype=torch.float).t()

edge_indices = torch.tensor(PPI_graph.get_edgelist(), dtype=torch.long).t()

# Assuming you have a label attribute in your graph
labels = torch.tensor(PPI_graph.vs["label"], dtype=torch.float)

# Create a PyTorch Geometric Data object
data_ = Data(x=features, edge_index=edge_indices, y=labels)

print(data_)

Data(x=[7392, 26], edge_index=[2, 49502], y=[7392])


  return reader(f, *args, **kwds)


In [146]:
data_ = train_test_split_edges(data_)
print(data_)



Data(x=[7392, 26], y=[7392], val_pos_edge_index=[2, 1237], test_pos_edge_index=[2, 2474], train_pos_edge_index=[2, 42062], train_neg_adj_mask=[7392, 7392], val_neg_edge_index=[2, 1237], test_neg_edge_index=[2, 2474])


In [147]:
def normalize(mx):
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

def add_edges(adj_real, adj_new):
    adj = adj_real+adj_new
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    adj = normalize(adj + sp.eye(adj.shape[0]))
    adj = sparse_mx_to_torch_sparse_tensor(adj)
    return adj

def accuracy(output, labels, output_AUC):
    preds = output.max(1)[1].type_as(labels)


    recall = recall_score(labels.cpu().numpy(), preds.cpu().numpy(), zero_division=0)
    f1_score_ = f1_score(labels.cpu().numpy(), preds.cpu().numpy())
    AUC = roc_auc_score(labels.cpu().numpy(), output_AUC.detach().cpu().numpy())
    acc = accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
    precision = precision_score(labels.cpu().numpy(), preds.cpu().numpy(), zero_division=0)
    return recall, f1_score_, AUC, acc, precision

def load_data(data, ratio_generated):
    print('Processing graph data...')

    # Extract features and labels from the PyTorch Geometric Data object
    features = data.x
    labels = data.y

    # Convert train and test edge indices to standard numpy arrays for processing
    idx_train = data.train_pos_edge_index[0].numpy()
    idx_test = data.test_pos_edge_index[0].numpy()

    # The rest of the code remains largely the same as in your original function
    majority = np.array([x for x in idx_train if labels[x] == 0])
    minority = np.array([x for x in idx_train if labels[x] == 1])

    num_minority = minority.shape[0]
    num_majority = majority.shape[0]
    print("Number of majority: ", num_majority)
    print("Number of minority: ", num_minority)

    generate_node = []
    generate_label = []
    for i in range(len(labels), len(labels) + int(ratio_generated * num_majority) - num_minority):
        generate_node.append(i)
        generate_label.append(1)
    idx_train = np.hstack((idx_train, np.array(generate_node)))

    minority_test = np.array([x for x in idx_test if labels[x] == 1])
    minority_all = np.hstack((minority, minority_test))

    labels = np.hstack((labels, np.array(generate_label)))

    # Construct adjacency matrix from PyTorch Geometric Data
    edge_index = data.train_pos_edge_index.numpy()
    adj_real = sp.coo_matrix((np.ones(edge_index.shape[1]), (edge_index[0], edge_index[1])), 
                             shape=(len(labels), len(labels)), dtype=np.float32)

    adj = adj_real + adj_real.T.multiply(adj_real.T > adj_real) - adj_real.multiply(adj_real.T > adj_real)

    # Normalizing features and adjacency matrix
    features = normalize(sp.csr_matrix(features))
    adj = normalize(adj + sp.eye(adj.shape[0]))

    # Convert to PyTorch tensors
    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(labels)
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_test = torch.LongTensor(idx_test)
    generate_node = torch.LongTensor(np.array(generate_node))
    minority = torch.LongTensor(minority)
    majority = torch.LongTensor(majority)
    minority_all = torch.LongTensor(minority_all)

    return adj, adj_real, features, labels, idx_train, idx_test, generate_node, minority, majority, minority_all

In [148]:
####
#  Orignal dataset
####
'''
def load_data(ratio_generated, path="../dataset/citeseer/", dataset="citeseer"):
    print('Loading {} dataset...'.format(dataset))

    idx_features_labels = np.genfromtxt("{}features.{}".format(path, dataset),
                                        dtype=np.float32)
    features = sp.csr_matrix(idx_features_labels[:, 0:-1], dtype=np.float32)
    labels = idx_features_labels[:, -1]

    idx_train = np.genfromtxt("{}train.{}".format(path, dataset),
                              dtype=np.int32).squeeze()

    idx_test = np.genfromtxt("{}test.{}".format(path, dataset),
                             dtype=np.int32).squeeze()

    majority = np.array([x for x in idx_train if labels[x] == 0])
    minority = np.array([x for x in idx_train if labels[x] == 1])

    num_minority = minority.shape[0]
    num_majority = majority.shape[0]
    print("Number of majority: ", num_majority)
    print("Number of minority: ", num_minority)

    generate_node = []
    generate_label=[]
    for i in range(labels.shape[0], labels.shape[0]+int(ratio_generated*num_majority)-num_minority):
        generate_node.append(i)
        generate_label.append(1)
    idx_train= np.hstack((idx_train, np.array(generate_node)))
    print(idx_train.shape)

    minority_test = np.array([x for x in idx_test if labels[x] == 1])
    minority_all = np.hstack((minority, minority_test))


    labels= np.hstack((labels, np.array(generate_label)))


    edges = np.genfromtxt("{}edges.{}".format(path, dataset),
                                    dtype=np.int32)

    adj_real = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    adj = adj_real + adj_real.T.multiply(adj_real.T > adj_real) - adj_real.multiply(adj_real.T > adj_real)

    features = normalize(features)
    adj = normalize(adj + sp.eye(adj.shape[0]))

    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(labels)
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_test = torch.LongTensor(idx_test)
    generate_node=torch.LongTensor(np.array(generate_node))
    minority = torch.LongTensor(minority)
    majority = torch.LongTensor(majority)
    minority_all = torch.LongTensor(minority_all)

    return adj, adj_real,features, labels, idx_train, idx_test, generate_node, minority, majority, minority_all

#######################################
#######################################
#######################################
#######################################
#######################################
#######################################'''

'\ndef load_data(ratio_generated, path="../dataset/citeseer/", dataset="citeseer"):\n    print(\'Loading {} dataset...\'.format(dataset))\n\n    idx_features_labels = np.genfromtxt("{}features.{}".format(path, dataset),\n                                        dtype=np.float32)\n    features = sp.csr_matrix(idx_features_labels[:, 0:-1], dtype=np.float32)\n    labels = idx_features_labels[:, -1]\n\n    idx_train = np.genfromtxt("{}train.{}".format(path, dataset),\n                              dtype=np.int32).squeeze()\n\n    idx_test = np.genfromtxt("{}test.{}".format(path, dataset),\n                             dtype=np.int32).squeeze()\n\n    majority = np.array([x for x in idx_train if labels[x] == 0])\n    minority = np.array([x for x in idx_train if labels[x] == 1])\n\n    num_minority = minority.shape[0]\n    num_majority = majority.shape[0]\n    print("Number of majority: ", num_majority)\n    print("Number of minority: ", num_minority)\n\n    generate_node = []\n    generate_l

In [149]:
class GraphConvolution(Module):

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

In [150]:
class Attention(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Attention, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, input_dim // 2, bias=True),
            nn.ReLU(),
            nn.Linear(input_dim // 2, output_dim, bias=True),
        )

    def forward(self, x):
        return self.mlp(x)


In [151]:
class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, generate_node, min_node):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.gc3 = GraphConvolution(nhid, 2)
        self.attention = Attention(nfeat*2, 1)
        self.generate_node = generate_node
        self.min_node = min_node
        self.dropout = dropout
        self.eps = 1e-10

    def forward(self, x, adj):

        x = F.relu(self.gc1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x1 = self.gc2(x, adj)
        x2 = self.gc3(x, adj)
        return F.log_softmax(x1, dim=1), F.log_softmax(x2, dim=1), F.softmax(x1, dim=1)[:,-1]

    def get_embedding(self,x , adj):
        x = F.relu(self.gc1(x, adj))
        x = torch.spmm(adj, x)
        return x

In [152]:
class Generator(nn.Module):
    def __init__(self,  dim):
        super(Generator, self).__init__( )

        self.fc1 = nn.Linear(100, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, dim)
        self.fc4 = nn.Tanh()

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.fc4(x)
        x = (x+1)/2
        return x

In [153]:
ratio = 0.2
hidden = 128
dropout = 0.5
lr = 0.0009
weight_decay = 0.003
fastmode = False
no_cuda = False
num= 10
seed= 42
epochs_gen = 10
epochs = 10

np.random.seed(seed)
torch.manual_seed(seed)

cuda = not no_cuda and torch.cuda.is_available()

adj, adj_real, features, labels, idx_temp, idx_test, generate_node, minority, majority, minority_all = load_data(data_, ratio)

print(adj.shape, adj_real.shape, features.shape, labels.shape, idx_temp.shape, idx_test.shape, generate_node.shape, minority.shape, majority.shape, minority_all.shape)

# Model and optimizer
model = GCN(nfeat=features.shape[1],
    nhid=hidden,
    nclass=labels.max().item() + 1,
    dropout=dropout,
    generate_node= generate_node,
    min_node = minority)
optimizer = optim.Adam(model.parameters(),lr=lr, weight_decay=weight_decay)

num_false = labels.shape[0]- features.shape[0]
model_generator = Generator(minority_all.shape[0])
optimizer_G = torch.optim.Adam(model_generator.parameters(),
                       lr=lr, weight_decay=weight_decay)

max_recall = 0
test_recall = 0
test_f1 = 0
test_AUC = 0
test_acc=0
test_pre =0

if cuda:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()
    idx_temp = idx_temp.cuda()
    idx_test = idx_test.cuda()
    model_generator.cuda()


Processing graph data...
Number of majority:  41319
Number of minority:  743
torch.Size([14912, 14912]) (14912, 14912) torch.Size([7392, 26]) torch.Size([14912]) torch.Size([49582]) torch.Size([2474]) torch.Size([7520]) torch.Size([743]) torch.Size([41319]) torch.Size([814])


In [156]:
def train(features, adj):
    global max_recall, test_recall, test_f1, test_AUC, test_acc, test_pre
    model.train()
    optimizer.zero_grad()
    output, output_gen, output_AUC = model(features, adj)

    print(f"Size of features: {features.shape}")
    print(f"Size of output: {output.shape}")
    print(f"Size of output_gen: {output_gen.shape}")
    print(f"Size of output_AUC: {output_AUC.shape}")
    print(f"Size of labels: {labels.shape}")
    print(f"Size of idx_train: {idx_train.shape}")
    print(f"Size of num_flase: {num_false}")

    labels_true = torch.cat((torch.LongTensor(num_real).fill_(0), torch.LongTensor(num_false).fill_(1)))

    print(f"Size of labels_true: {labels_true.size()}")

    if cuda:
        labels_true=labels_true.cuda()

    loss_dis = - euclidean_dist(features[minority], features[majority]).mean()
    loss_train = F.nll_loss(output[idx_train], labels[idx_train]) + F.nll_loss(output_gen[idx_train], labels_true) + loss_dis

    print(f"Size of output[idx_train]: {output[idx_train].shape}")
    print(f"Size of labels[idx_train]: {labels[idx_train].shape}")
    print(f"Size of output_gen[idx_train]: {output_gen[idx_train].shape}")

    loss_train.backward()
    optimizer.step()


    if not fastmode:
        model.eval()
        output, output_gen, output_AUC = model(features, adj)


    recall_val, f1_val, AUC_val, acc_val, pre_val = accuracy(output[idx_val], labels[idx_val], output_AUC[idx_val])
    recall_train, f1_train, AUC_train, acc_train, pre_train = accuracy(output[idx_val], labels[idx_val], output_AUC[idx_val])

    if max_recall < (recall_val + acc_val)/2:
        output, output_gen, output_AUC = model(features, adj)
        recall_tmp, f1_tmp, AUC_tmp, acc_tmp, pre_tmp = accuracy(output[idx_test], labels[idx_test], output_AUC[idx_test])
        test_recall = recall_tmp
        test_f1 = f1_tmp
        test_AUC = AUC_tmp
        test_acc = acc_tmp
        test_pre = pre_tmp
        max_recall = (recall_val + acc_val)/2

    return recall_val, f1_val, acc_val, recall_train, f1_train, acc_train


def euclidean_dist(x, y):
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(y, 2).sum(1, keepdim=True).expand(n, m).t()
    dist = xx + yy
    dist.addmm_(1, -2, x, y.t())
    dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability
    return dist


In [157]:
for epoch_gen in range(epochs_gen):
    part = epoch_gen % num
    range_val_maj = range(int(part*len(majority)/num), int((part+1)*len(majority)/num))
    range_val_min = range(int(part * len(minority) / num), int((part + 1) * len(minority) / num))

    range_train_maj = list(range(0,int(part*len(majority)/num)))+ list(range(int((part+1)*len(majority)/num),len(majority)))
    range_train_min = list(range(0,int(part*len(minority)/num)))+ list(range(int((part+1)*len(minority)/num),len(minority)))

    idx_val = torch.cat((majority[range_val_maj], minority[range_val_min]))
    idx_train = torch.cat((majority[range_train_maj], minority[range_train_min]))
    idx_train = torch.cat((idx_train, generate_node))
    num_real = features.shape[0] - len(idx_test) -len(idx_val)

    # Train model
    model_generator.train()
    optimizer_G.zero_grad()
    z = Variable(torch.FloatTensor(np.random.normal(0, 1, (generate_node.shape[0], 100))))
    if cuda:
        z=z.cuda()

    adj_min = model_generator(z)
    gen_imgs1 = torch.mm(F.softmax(adj_min[:,0:minority.shape[0]], dim=1), features[minority])
    gen_imgs1_all = torch.mm(F.softmax(adj_min, dim=1), features[minority_all])

    matr = F.softmax(adj_min[:,0:minority.shape[0]], dim =1).data.cpu().numpy()
    pos=np.where(matr>1/matr.shape[1])

    print("Max generate_node index:", generate_node.max())
    print("Max minority_all index:", minority_all.max())
    print("Shape of labels:", labels.shape)

    adj_temp = sp.coo_matrix((np.ones(pos[0].shape[0]),(generate_node[pos[0]].numpy(), minority_all[pos[1]].numpy())),
                             shape=(labels.shape[0], labels.shape[0]),
                             dtype=np.float32)

    adj_new = add_edges(adj_real, adj_temp)
    if cuda:
        adj_new=adj_new.cuda()

    t_total = time.time()
    # model.eval()
    output, output_gen, output_AUC = model(torch.cat((features, gen_imgs1.data),0), adj)

    labels_true = torch.LongTensor(num_false).fill_(0)
    labels_min = torch.LongTensor(num_false).fill_(1)
    if cuda:
        labels_true = labels_true.cuda()
        labels_min = labels_min.cuda()

    g_loss = F.nll_loss(output_gen[generate_node], labels_true) \
             + F.nll_loss(output[generate_node], labels_min) \
             + euclidean_dist(features[minority], gen_imgs1).mean()
    g_loss.backward()
    optimizer_G.step()

    for epoch in range(epochs):
        recall_val, f1_val, acc_val, recall_train, f1_train, acc_train = train(torch.cat((features, gen_imgs1.data.detach()),0), adj_new)
    print("Epoch:", '%04d' % (epoch_gen + 1),
        "train_recall=", "{:.5f}".format(recall_train), "train_f1=", "{:.5f}".format(f1_train),"train_acc=", "{:.5f}".format(acc_train),
        "val_recall=", "{:.5f}".format(recall_val), "val_f1=", "{:.5f}".format(f1_val),"val_acc=", "{:.5f}".format(acc_val))



print("Test Recall: ", test_recall)
print("Test Accuracy: ", test_acc)
print("Test F1: ", test_f1)
print("Test precision: ", test_pre)
print("Test AUC: ", test_AUC)

Max generate_node index: tensor(14911)
Max minority_all index: tensor(7287)
Shape of labels: torch.Size([14912])
Size of features: torch.Size([14912, 26])
Size of output: torch.Size([14912, 2])
Size of output_gen: torch.Size([14912, 2])
Size of output_AUC: torch.Size([14912])
Size of labels: torch.Size([14912])
Size of idx_train: torch.Size([45377])
Size of num_flase: 7520
Size of labels_true: torch.Size([8233])


ValueError: Expected input batch_size (45377) to match target batch_size (8233).