In [1]:
!sudo apt-get install libmetis-dev
!pip install metis
import metis
import random
import tensorflow as tf
import numpy as np
from scipy import sparse
import scipy.io as sio

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libmetis-dev is already the newest version (5.1.0.dfsg-5).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [2]:
def _sample_negative_edges(num_nodes, num_samples, excluded):
  """Draws distinct node pairs that are neither self-loops nor in `excluded`.

  Node indices are drawn with `np.random.randint`, two per attempt, so the
  result is reproducible under `np.random.seed`.

  Args:
    num_nodes: indices are drawn uniformly from [0, num_nodes).
    num_samples: number of pairs to return.
    excluded: set of (min_index, max_index) tuples that must not be returned.

  Returns:
    A tuple (pairs, keys). `pairs` is a list of [i, j] lists in the order and
    orientation in which they were drawn; `keys` is the set of the matching
    (min_index, max_index) tuples.

  Raises:
    ValueError: if fewer than `num_samples` admissible pairs exist (rejection
      sampling would otherwise never terminate).
  """
  num_free = num_nodes * (num_nodes - 1) // 2 - sum(1 for i, j in excluded if i != j)
  if num_samples > num_free:
    raise ValueError("Cannot sample %d negative edges: only %d node pairs are available."
                     % (num_samples, num_free))
  pairs = []
  keys = set()
  while len(pairs) < num_samples:
    ind_i = np.random.randint(0, num_nodes)
    ind_j = np.random.randint(0, num_nodes)
    if ind_i == ind_j: continue # self-loops are never negative examples
    # orientation-independent key: (i, j) and (j, i) are the same undirected pair
    key = (min(ind_i, ind_j), max(ind_i, ind_j))
    if key in excluded or key in keys: continue
    keys.add(key)
    pairs.append([ind_i, ind_j])
  return pairs, keys


def mask_test_edges(A):
  """Splits the edges of a graph into training, validation and test sets.

  The edges are read from the upper triangle of `A`. 5% of them are held out
  for validation and 10% for testing; the remaining ones form the training
  adjacency matrix. For each held-out set, an equally large set of node pairs
  that are not edges of `A` is sampled as negative examples. The validation
  and test negatives do not overlap.

  Args:
    A: square scipy sparse adjacency matrix.

  Returns:
    A tuple (A_train, val_edges, val_edges_false, test_edges, test_edges_false):
      A_train: symmetric float32 sparse matrix built from the training edges.
      val_edges, test_edges: integer arrays of shape (n, 2) with held-out edges.
      val_edges_false, test_edges_false: integer arrays of shape (n, 2) with
        the sampled non-edges.

  Raises:
    ValueError: if the graph is too dense to supply the requested negatives.
  """
  A_triu = sparse.triu(A)
  edges = np.stack(A_triu.nonzero()).T # all edges of the graph
  num_val = int(0.05 * edges.shape[0]) # 5% of the edges for validation
  num_test = int(0.1 * edges.shape[0]) # 10% of the edges for testing
  edge_ind = np.arange(edges.shape[0]) # indices of the edges
  np.random.shuffle(edge_ind) # shuffling the indices
  val_edges = edges[edge_ind[:num_val]] # first 5% of the shuffled edges
  test_edges = edges[edge_ind[num_val:(num_val + num_test)]] # next 10%
  train_edges = edges[edge_ind[(num_val + num_test):]] # the remaining 85%
  # the incomplete adjacency matrix for training
  arg1 = (np.ones(train_edges.shape[0]), (train_edges[:, 0], train_edges[:, 1]))
  A_train_triu = sparse.csr_matrix(arg1, shape=A.shape, dtype='float32')
  A_train = A_train_triu + A_train_triu.T
  # every edge of the full graph (training, validation and test) is off-limits
  known_edges = {(min(i, j), max(i, j)) for i, j in edges.tolist()}
  print("Selecting the negative test set!")
  # as many negative test pairs as there are positive test edges
  test_edges_false, test_keys = _sample_negative_edges(A.shape[0], len(test_edges), known_edges)
  print("Test set is ready!")
  print("Selecting the negative validation set!")
  # as many negative validation pairs as there are positive validation edges;
  # the negative test pairs are excluded as well
  val_edges_false, _ = _sample_negative_edges(A.shape[0], len(val_edges), known_edges | test_keys)
  print("Validation set is ready!")
  # reshape keeps the (n, 2) layout even when no pair was requested
  test_edges_false = np.array(test_edges_false, dtype=int).reshape(-1, 2)
  val_edges_false = np.array(val_edges_false, dtype=int).reshape(-1, 2)
  return A_train, val_edges, val_edges_false, test_edges, test_edges_false

In [3]:
# hyperparameters
hidden = 512 # number of hidden units in the encoder layer
latent = 256 # dimension of the latent variables
learning_rate = 0.001 # learning rate passed to the Adam optimizer
epochs = 200 # number of training iterations; Model.train processes one sampled batch per iteration
nparts = 50 # number of partitions
batch_size = 1 # number of clusters per batch
K = 3 # number of Lanczos iterations; read as a global by GC_layer, which stores and runs K + 1 steps

In [4]:
# Path of the .mat file that is loaded with scipy.io.loadmat in the next cell.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive mount);
# it has to be adjusted when the notebook runs in another environment.
filename = '/content/drive/MyDrive/GRAPH DATA/ppi.mat' # dataset

In [5]:
# Load the dataset dictionary from the .mat file.
mat_dict = sio.loadmat(filename)
# Adjacency matrix with every stored entry rounded up to the next integer.
# NOTE(review): presumably this binarizes fractional edge weights and assumes
# 'A' is stored as a scipy sparse matrix (it is later used with sparse.triu) - confirm.
A = mat_dict['A'].ceil()
X = mat_dict['X'] # node feature matrix; X.shape[1] is used as the encoder input dimension
# NOTE(review): Y and the three masks are loaded here but are not used in the visible cells.
Y = mat_dict['Y']
train_mask = mat_dict['train_mask'].squeeze().astype(bool)
val_mask = mat_dict['val_mask'].squeeze().astype(bool)
test_mask = mat_dict['test_mask'].squeeze().astype(bool)

# selecting the validation and test edges, and the incomplete adjacency matrix for training
A_train, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(A)

Selecting the negative test set!
Test set is ready!
Selecting the negative validation set!
Validation set is ready!


In [6]:
def cluster_graph(A, nparts):
  """Groups the node indices of a graph by partition.

  With `nparts == 1` all nodes are placed in partition 0 without calling
  METIS; otherwise `metis.part_graph` is run on the adjacency lists of `A`.
  The reported number of edge cuts is printed.

  Args:
    A: scipy sparse adjacency matrix.
    nparts: number of partitions.

  Returns:
    A dict that maps each partition id to the list of its node indices, in
    increasing node order.
  """
  if nparts != 1:
    adjacency_lists = list(A.tolil().rows)
    edge_cuts, parts = metis.part_graph(adjacency_lists, nparts=nparts)
  else:
    # trivial partitioning: every node belongs to part 0
    edge_cuts, parts = 0, [0] * A.shape[0]
  print('Number of edge cuts: %d.' % edge_cuts)
  clusters = {}
  for node, part in enumerate(parts):
    clusters.setdefault(part, []).append(node)
  return clusters

# the clustering algorithm (METIS): partitions the training graph into `nparts` clusters;
# the resulting dict (partition id -> node indices) is what Model.train samples its batches from
cluster_dict = cluster_graph(A_train, nparts)

Number of edge cuts: 252793.


In [7]:
def preprocess_support(A):
  """Builds the rescaled normalized Laplacian used as the graph support.

  Computes L = I - D^(-1/2) A^T D^(-1/2), where D holds the row sums of `A`,
  and returns 2 / lambda_max * L - I, with lambda_max obtained from
  `eigsh(L, k=1)`.

  Args:
    A: square scipy sparse adjacency matrix.

  Returns:
    The rescaled Laplacian as a scipy sparse matrix.
  """
  num_nodes = A.shape[1]
  identity = sparse.eye(num_nodes, dtype='float32')
  # kept sparse so that power() only touches the stored (non-zero) degrees
  degrees = sparse.csr_matrix(A.sum(axis=1))
  inv_sqrt_degrees = degrees.power(-0.5)
  # scale the rows, transpose, then scale the rows again
  normalized_adjacency = A.multiply(inv_sqrt_degrees).T.multiply(inv_sqrt_degrees)
  laplacian = identity - normalized_adjacency
  top_eigvals = sparse.linalg.eigsh(laplacian, k=1, return_eigenvectors=False)
  return 2.0 / top_eigvals[0] * laplacian - identity

def toTensorSparse(S):
  """Densifies the sparse matrix `S` and wraps it in a constant TensorFlow tensor."""
  dense = S.todense()
  return tf.constant(dense)

def toTensor(T):
  """Wraps the array-like `T` in a constant TensorFlow tensor."""
  tensor = tf.constant(T)
  return tensor

In [8]:
# layer classes

class bilinear_layer:
  """Parameter-free decoder layer that multiplies its input by its own transpose."""

  def __init__(self, indim, outdim):
    """Accepts the usual layer dimensions; both are ignored (no weights)."""

  def __call__(self, tensor):
    """Returns the matrix product of `tensor` and its transpose."""
    transposed = tf.transpose(tensor)
    return tf.linalg.matmul(tensor, transposed)

# unused
class FC_layer:
  """Fully connected layer without bias: a single trainable weight matrix."""

  def __init__(self, indim, outdim):
    """Creates an (indim, outdim) weight matrix with He-normal initialization."""
    initializer = tf.initializers.he_normal()
    self.weight = tf.Variable(initial_value=initializer((indim, outdim)), trainable=True)

  def __call__(self, tensor):
    """Returns `tensor` multiplied by the weight matrix."""
    projected = tf.linalg.matmul(tensor, self.weight)
    return projected

class GC_layer:
  """Graph-convolution layer: a dense projection followed by a trainable
  filter that is applied through a Lanczos approximation.

  Trainable variables:
    weight: (indim, outdim) projection matrix.
    filter: (outdim, K + 1, K + 1) tensor, one square matrix per output
      channel, where K is the module-level number of Lanczos iterations.
  """

  def __init__(self, indim, outdim):
    """Creates the projection weight and the per-channel filter.

    Reads the module-level `K`; the layer stores and later runs K + 1
    Lanczos steps.
    """
    global K
    self.K = K + 1
    # every channel starts with a filter that is zero except for a 1 at entry [0, 0]
    delta = np.zeros((outdim, K + 1, K + 1), dtype='float32')
    for o in range(outdim):
      delta[o, 0, 0] = 1.0
    self.filter = tf.Variable(initial_value=delta, trainable=True)
    initial_value = tf.initializers.he_normal()((indim, outdim,))
    self.weight = tf.Variable(initial_value=initial_value, trainable=True)

  # Lanczos algorithm implemented for multiple vectors
  def Lanczos_algorithm(self, tensor, support, K, embed=False):
    """Runs K Lanczos steps on every column of `tensor` at once.

    Args:
      tensor: rank-2 tensor; each column is treated as a separate start vector.
      support: operator applied from the left in every step; a scipy sparse
        matrix (or anything with `.dot`) when `embed` is True, otherwise a
        TensorFlow tensor.
      K: number of Lanczos steps.
      embed: if True, the product with `support` is computed outside of
        TensorFlow on the numpy value of the current vectors.

    Returns:
      (Vs, Hs): Vs has shape (columns, rows, K) and holds the Lanczos vectors;
      Hs has shape (columns, K, K) and holds one tridiagonal matrix per column.
    """
    # Q[0] is a zero placeholder so that the recursion can read Q[k-1] at k = 1;
    # Q[1] holds the column-normalized input.
    # NOTE(review): no guard against zero norms - an all-zero column here, or a
    # zero residual norm B[k] below, leads to a division by zero.
    Q = [tf.zeros(tensor.shape), tensor / tf.linalg.norm(tensor, axis=0)]
    C = [tf.zeros(tensor.shape[1])]
    B = [tf.zeros(tensor.shape[1])]
    for k in range(1, K + 1):
      if embed: # numpy pipeline
        z = tf.constant(support.dot(Q[k].numpy()))
      else: # tensorflow pipeline
        z = tf.linalg.matmul(support, Q[k])
      # column-wise inner product of the current vector with its image
      C.append(tf.einsum('ab, ab -> b', Q[k], z))
      # remove the components along the current and the previous Lanczos vector
      z = z - Q[k] * C[k] - Q[k-1] * B[k-1]
      B.append(tf.linalg.norm(z, axis=0))
      Q.append(z / B[k])
    # the zero placeholder Q[0] and the last computed vector/norm are dropped
    Vs = tf.transpose(tf.stack(Q[1:-1])) # Lanczos vectors
    Cs = tf.transpose(tf.stack(C[1:])) # diagonal Lanczos scalars
    Bs = tf.transpose(tf.stack(B[1:-1])) # off-diagonal Lanczos scalars
    Hs = tf.linalg.diag(Bs, k=-1) + tf.linalg.diag(Cs) + tf.linalg.diag(Bs, k=1) # tridiagonal matrix
    return Vs, Hs

  def __call__(self, tensor, support, embed=False):
    """Projects `tensor` with the weight matrix and filters the result.

    Args:
      tensor: rank-2 input that is multiplied by `self.weight` from the right.
      support: operator handed to `Lanczos_algorithm`.
      embed: forwarded to `Lanczos_algorithm` (numpy instead of TensorFlow
        product with `support`).

    Returns:
      Rank-2 tensor with one column per output channel.
    """
    # see the following reference:
    # "Susnjara, A., Perraudin, N., Kressner, D., & Vandergheynst, P. (2015).
    # Accelerated filtering on graphs using lanczos method. arXiv preprint arXiv:1509.04537."
    tensor = tf.linalg.matmul(tensor, self.weight)
    V, H = self.Lanczos_algorithm(tensor, support, self.K, embed)
    # one-hot vector with a 1 at position 0 for every channel
    delta = tf.one_hot(tf.zeros(tensor.shape[1], dtype=tf.uint8), self.K)
    # column norms restore the scale removed by the normalization in Lanczos_algorithm
    norm = tf.linalg.norm(tensor, axis=0)
    # only the eigenvectors are used below; eigvals is not read
    eigvals, eigvecs = tf.linalg.eigh(H)
    # per channel: eigvecs * filter * eigvecs (batched matrix products)
    T = tf.einsum('abc, acd, ade -> abe', eigvecs, self.filter, eigvecs)
    # per channel: V * T * delta, scaled by the column norm; output is indexed (row, channel)
    result = tf.einsum('abc, acd, ad, a -> ba', V, T, delta, norm)
    return result


In [9]:
# our model class (for the paper "Scalable Graph Variational Autoencoders")

class Model:
  """Graph variational autoencoder trained on sampled clusters of the graph.

  The encoder consists of one shared graph-convolution layer followed by two
  parallel graph-convolution layers (mean and scale of the latent variables);
  the decoder is an inner product of the sampled latent variables.

  Attributes set by `predict`/`test`:
    Z_mean: mean embeddings.
    Z_var: scale of the latent variables (multiplied with the noise).
    noise: the standard normal sample.
    sample: Z_mean + Z_var * noise.
    A_gamma: reconstruction logits.
  """

  def __init__(self, size_tuple, optimizer, nonlinear):
    """Builds the layers and stores the optimizer and the activation.

    Args:
      size_tuple: (input feature dimension, hidden units, latent dimension).
      optimizer: object with an `apply_gradients` method.
      nonlinear: activation applied after the first encoder layer.
    """
    self.sources = [] # variables to optimize
    self.build(size_tuple) # builds the model by stacking layers on each other
    self.optimizer = optimizer
    self.nonlinear = nonlinear
    self.Z_mean = None # mean embedding layer
    self.Z_var = None # variance embedding layer
    self.noise = None # the noise sample
    self.sample = None # self.Z_mean + self.Z_var * self.noise
    self.A_gamma = None # the reconstructions

  def build(self, size_tuple):
    """Creates the layers and collects their trainable variables in `sources`."""
    X_dim, hidden, latent = size_tuple
    self.enc_layer = GC_layer(X_dim, hidden)
    self.enc_mean_layer = GC_layer(hidden, latent)
    self.enc_var_layer = GC_layer(hidden, latent)
    self.A_dec_gamma_layer = bilinear_layer(latent, latent)
    # filling the source array with weights
    for layer in (self.enc_layer, self.enc_mean_layer, self.enc_var_layer):
      self.sources.append(layer.weight)
      self.sources.append(layer.filter)

  # forward propagation in the encoder
  def encode(self, X, S):
    """Returns the mean and the (exponentiated, hence positive) scale of the latent variables."""
    enc = self.nonlinear(self.enc_layer(X, S))
    enc_mean = self.enc_mean_layer(enc, S)
    enc_var = tf.math.exp(self.enc_var_layer(enc, S))
    return enc_mean, enc_var

  # returns only the node embeddings
  def embed(self, X, S):
    """Returns the mean embeddings; the layers are called with `embed=True`."""
    enc = self.nonlinear(self.enc_layer(X, S, embed=True))
    enc_mean = self.enc_mean_layer(enc, S, embed=True)
    return enc_mean

  # forward propagation in the decoder
  def decode(self, sample):
    """Returns the reconstruction logits for the sampled latent variables."""
    A_dec_gamma = self.A_dec_gamma_layer(sample)
    return A_dec_gamma

  def predict(self, X, S):
    """Runs a full forward pass and stores all intermediate results on `self`."""
    self.Z_mean, self.Z_var = self.encode(X, S)
    self.noise = tf.random.normal(self.Z_var.shape)
    self.sample = self.Z_mean + self.Z_var * self.noise # reparameterization trick
    self.A_gamma = self.decode(self.sample)

  def train(self, X, A, val_edges, val_edges_false, cluster_dict, batch_size, epochs):
    """Optimizes the model; every iteration uses one random batch of clusters.

    Args:
      X: node feature matrix, indexable by a list of node indices.
      A: scipy sparse training adjacency matrix.
      val_edges, val_edges_false: accepted for interface compatibility; not
        used by the training loop.
      cluster_dict: dict mapping cluster ids to lists of node indices.
      batch_size: number of clusters sampled per iteration.
      epochs: number of iterations.
    """
    # random.sample needs a sequence: passing the dict view directly is
    # deprecated since Python 3.9 and raises TypeError from Python 3.11 on
    cluster_ids = list(cluster_dict.keys())
    for epoch in range(epochs):
      # only a subgraph is used in the training process
      samples = random.sample(cluster_ids, batch_size)
      nodes = sum([cluster_dict[sample] for sample in samples], [])
      S_batch = toTensorSparse(preprocess_support(A[nodes].T[nodes]))
      A_batch = toTensor(A.T[nodes].T[nodes].todense())
      X_batch = tf.math.l2_normalize(toTensor(X[nodes]), axis=1)
      # optimization
      with tf.GradientTape() as tape:
        self.predict(X_batch, S_batch)
        losses = self.loss(A_batch, X_batch)
        loss_ = tf.reduce_sum(losses)
      print(epoch, [loss.numpy() for loss in losses], loss_.numpy())
      grads = tape.gradient(loss_, self.sources)
      self.optimizer.apply_gradients(zip(grads, self.sources))

  def test(self, X, A, test_edges, test_edges_false):
    """Embeds the whole graph and prints ROC-AUC and average precision on the test edges."""
    S_test = preprocess_support(A)
    X_test = tf.math.l2_normalize(toTensor(X), axis=1)
    self.Z_mean = self.embed(X_test, S_test)
    roc_auc, pr_auc = self.accuracy(test_edges, test_edges_false)
    print(roc_auc, pr_auc)

  # Kullback–Leibler divergence
  def KL_Divergence(self):
    """Returns the mean KL divergence term computed from `Z_mean` and `Z_var`."""
    loss = 0.5 * tf.reduce_mean(self.Z_mean**2.0 + self.Z_var**2.0 - 2.0 * tf.math.log(self.Z_var) - 1.0)
    return loss

  # reconstruction loss
  def re_A_loss(self, A):
    """Returns the class-weighted cross-entropy between `A` and the reconstruction logits.

    NOTE(review): a batch without any edge has density 0 and makes
    `pos_weight` divide by zero.
    """
    density = tf.reduce_sum(A) / tf.size(A, out_type=tf.float32)
    pos_weight = (1.0 - density) / density
    weighted_ce = tf.nn.weighted_cross_entropy_with_logits(labels=A, logits=self.A_gamma, pos_weight=pos_weight)
    # same value as the former "-(-0.5 * mean(...))"
    return 0.5 * tf.reduce_mean(1.0 / (1.0 - density) * weighted_ce)

  # list of all loss functions
  def loss(self, A, X):
    """Returns the tuple (KL divergence, reconstruction loss); `X` is not used."""
    return self.KL_Divergence(), self.re_A_loss(A)

  def _edge_scores(self, edges, ratio):
    """Returns sigmoid(<z_i, z_j>) for a random `ratio` share of the given edges."""
    order = np.random.permutation(len(edges))
    limit = round(ratio * len(edges))
    picked = edges[order][:limit]
    left = tf.stack([self.Z_mean[edge[0], :] for edge in picked])
    right = tf.stack([self.Z_mean[edge[1], :] for edge in picked])
    return tf.nn.sigmoid(tf.einsum('ij, ij -> i', left, right))

  # through the ratio parameter, the number of edges used for validation/testing can be adjusted
  def accuracy(self, edges_pos, edges_neg, ratio=1.0):
    """Scores positive and negative edges with the current `Z_mean`.

    Args:
      edges_pos: integer array of shape (n, 2) with existing edges.
      edges_neg: integer array of shape (m, 2) with non-edges.
      ratio: share of each edge set that is (randomly) evaluated.

    Returns:
      (ROC-AUC, average precision) over all evaluated edges.
    """
    from sklearn.metrics import roc_auc_score, average_precision_score
    re_pos = self._edge_scores(edges_pos, ratio)
    re_neg = self._edge_scores(edges_neg, ratio)
    # Scores and labels must be flat 1-D vectors. Stacking them into a (2, n)
    # array makes scikit-learn treat the task as multilabel, which yields a
    # different metric and fails when the two sets differ in size.
    scores = tf.concat([re_pos, re_neg], axis=0).numpy()
    labels = tf.concat([tf.ones(len(re_pos)), tf.zeros(len(re_neg))], axis=0).numpy()
    return roc_auc_score(labels, scores), average_precision_score(labels, scores)


In [10]:
# Model configuration: (input feature dimension, hidden units, latent dimension),
# the Adam optimizer and the ReLU activation used after the first encoder layer.
size_tuple = (X.shape[1], hidden, latent)
optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
nonlinear = tf.nn.relu

model = Model(size_tuple, optimizer, nonlinear)

print('Training...')
model.train(X, A_train, val_edges, val_edges_false, cluster_dict, batch_size, epochs)
print('Testing...')
# the support is built from A_train, i.e. without the held-out validation/test edges
model.test(X, A_train, test_edges, test_edges_false)

Training...
0 [0.008317752, 6.246009] 6.254327
1 [0.008069412, 5.9270678] 5.9351373
2 [0.014910133, 5.648883] 5.663793
3 [0.011988765, 5.3217406] 5.3337293
4 [0.011688336, 5.334153] 5.3458414
5 [0.017081242, 5.09459] 5.1116714
6 [0.02987887, 4.9757056] 5.0055847
7 [0.026919054, 4.6635675] 4.6904864
8 [0.030616263, 4.5900393] 4.6206555
9 [0.032911334, 4.4368744] 4.4697857
10 [0.038473215, 4.222215] 4.2606883
11 [0.03077813, 4.19717] 4.2279477
12 [0.043574877, 3.894365] 3.93794
13 [0.04895062, 3.9082577] 3.9572084
14 [0.051079202, 3.667451] 3.7185302
15 [0.0784937, 3.369637] 3.4481306
16 [0.058125816, 3.4906619] 3.5487876
17 [0.08811037, 3.10078] 3.1888905
18 [0.11530216, 2.941951] 3.0572531
19 [0.1110618, 2.8980174] 3.0090792
20 [0.121197455, 2.6761198] 2.7973173
21 [0.15065007, 2.4647005] 2.6153505
22 [0.16292194, 2.318737] 2.481659
23 [0.26418662, 2.0345817] 2.2987683
24 [0.20465362, 2.187996] 2.3926497
25 [0.24887456, 1.9449385] 2.193813
26 [0.2603096, 2.002591] 2.2629006
27 [0.21150