In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import negative_sampling
import sys
import os
import torch
import numpy as np
import pandas as pd
import random
import copy
from torch_geometric.utils.dropout import dropout_adj
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath("C:\\Data\\Code\\BioML_manuscript\\data"))
from utils.boolODE_data_to_pyg_data import make_adj_from_df, to_pyg_data

In [None]:
# User-configurable parameters:
#   datadir:      directory that holds the data sets in the BoolODE format
#   name:         sub-directory of datadir in which this data set is located
#   filenm:       name under which the results for this network are saved;
#                 note that output/<filenm>/<filenm>/ must exist before running!
#   num_features: number of cells available in the data (set to 758 below; the
#                 mCAD example network would use 2000)
datadir = 'data/'
name = 'hESC'
filenm = 'hESC'

# expression table and reference network, both indexed by their first column
df = pd.read_csv(datadir + name + '/ExpressionData.csv', index_col=0)
adj_df = pd.read_csv(datadir + name + '/refNetwork.csv', index_col=0)

# dense expression values and their (rows, columns) shape
mat = df.to_numpy()
sz = mat.shape

# build the edge list / adjacency and pack everything via to_pyg_data
edge_index, adj = make_adj_from_df(datadir, df, name)
true_data = to_pyg_data(mat, sz[0], sz[1], edge_index=edge_index)

# size of the first dimension of the node-feature matrix
ode_dim = true_data.x.shape[0]

num_features = 758

In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import degree
import csv

# method to obtain a subnetwork that includes only Transcription factors from the full ChIP-seq network
def get_tf_network(data, datadir='data', name='hESC'):
    """Return the subgraph of ``data`` induced by the transcription-factor (TF) nodes.

    A gene counts as a TF when it appears in the first column of
    ``<datadir>/<name>/refNetwork.csv``. Node indices are the (0-based) row
    positions of the genes in ``<datadir>/<name>/ExpressionData.csv`` (header
    and blank lines excluded).

    Args:
        data: graph object exposing ``num_nodes``, ``edge_index``, ``x``,
            ``keys`` and item access (a PyTorch Geometric ``Data`` object).
        datadir: directory containing the data sets (default ``'data'``).
        name: sub-directory of ``datadir`` with the CSV files (default ``'hESC'``).

    Returns:
        A new ``Data`` object holding only the TF nodes, the edges whose two
        endpoints are both TFs (re-indexed to ``0..n_tf-1``) and the remaining
        attributes of ``data`` (per-node tensors are sub-sampled, everything
        else is copied as is).
    """
    base_dir = os.path.join(datadir, name)

    # a set gives O(1) membership tests when scanning the expression table
    with open(os.path.join(base_dir, 'refNetwork.csv'), newline='') as f:
        reader = csv.reader(f, delimiter=",")
        next(reader, None)  # skip the header row
        tf_names = {ln[0] for ln in reader if ln}

    with open(os.path.join(base_dir, 'ExpressionData.csv'), newline='') as f:
        reader = csv.reader(f, delimiter=",")
        next(reader, None)  # skip the header row
        # blank lines are dropped before counting so that they do not shift the row index
        gene_rows = (ln for ln in reader if ln)
        tf_ind = [k for k, ln in enumerate(gene_rows) if ln[0] in tf_names]

    num_nodes = data.num_nodes
    device = data.edge_index.device
    tf_idx = torch.tensor(tf_ind, dtype=torch.long, device=device)

    node_mask = torch.zeros(num_nodes, dtype=torch.bool, device=device)
    node_mask[tf_idx] = True

    # map old indices to new indices (-1 marks nodes that are dropped)
    old_to_new = -torch.ones(num_nodes, dtype=torch.long, device=device)
    old_to_new[tf_idx] = torch.arange(tf_idx.numel(), device=device)

    # keep only edges between TF nodes
    src, dst = data.edge_index
    edge_mask = node_mask[src] & node_mask[dst]
    new_edge_index = old_to_new[data.edge_index[:, edge_mask]]

    # subsample node features and create new data object
    new_data = Data(
        x=data.x[tf_idx.to(data.x.device)] if data.x is not None else None,
        edge_index=new_edge_index
    )

    # `keys` is a property in older PyG releases and a method in newer ones
    keys = data.keys() if callable(data.keys) else data.keys

    # copy other fields; tensors with one entry per node are sub-sampled
    for key in keys:
        if key in ['x', 'edge_index']:
            continue
        attr = data[key]
        if torch.is_tensor(attr) and attr.dim() > 0 and attr.size(0) == num_nodes:
            new_data[key] = attr[tf_idx.to(attr.device)]
        else:
            new_data[key] = attr

    return new_data

# method to get the top N nodes from a Pytorch Geometric dataset, making a subgraph, and returning this subgraph in a new data object
def top_n_nodes_by_degree(data, N):
    """Return the subgraph induced by the ``N`` nodes with the highest degree.

    The degree is computed from the first row of ``data.edge_index`` only, i.e.
    it counts how often a node occurs as the source of an edge.

    Args:
        data: graph object exposing ``num_nodes``, ``edge_index``, ``x``,
            ``keys`` and item access (a PyTorch Geometric ``Data`` object).
        N: number of nodes to keep. Values larger than ``data.num_nodes`` are
            clamped to ``data.num_nodes`` (a warning is issued).

    Returns:
        A new ``Data`` object with the selected nodes (ordered by decreasing
        degree), the edges between them re-indexed to ``0..N-1`` and the
        remaining attributes of ``data`` (per-node tensors are sub-sampled,
        everything else is copied as is).
    """
    num_nodes = data.num_nodes

    # topk() raises when asked for more entries than exist, so clamp explicitly
    if N > num_nodes:
        import warnings
        warnings.warn(
            f"requested the top {N} nodes but the graph only has {num_nodes}; using all nodes",
            stacklevel=2,
        )
        N = num_nodes

    # compute node degree
    deg = degree(data.edge_index[0], num_nodes=num_nodes)
    device = deg.device

    # get indices of top N nodes
    top_n_indices = deg.topk(N).indices
    node_mask = torch.zeros(num_nodes, dtype=torch.bool, device=device)
    node_mask[top_n_indices] = True

    # map old indices to new indices (-1 marks nodes that are dropped)
    old_to_new = -torch.ones(num_nodes, dtype=torch.long, device=device)
    old_to_new[top_n_indices] = torch.arange(N, device=device)

    # keep only edges between the top-N nodes
    src, dst = data.edge_index
    edge_mask = node_mask[src] & node_mask[dst]
    new_edge_index = old_to_new[data.edge_index[:, edge_mask]]

    # subsample node features and create new data object
    new_data = Data(
        x=data.x[top_n_indices.to(data.x.device)] if data.x is not None else None,
        edge_index=new_edge_index
    )

    # `keys` is a property in older PyG releases and a method in newer ones
    keys = data.keys() if callable(data.keys) else data.keys

    # copy other fields; tensors with one entry per node are sub-sampled
    for key in keys:
        if key in ['x', 'edge_index']:
            continue
        attr = data[key]
        if torch.is_tensor(attr) and attr.dim() > 0 and attr.size(0) == num_nodes:
            new_data[key] = attr[top_n_indices.to(attr.device)]
        else:
            new_data[key] = attr

    return new_data

In [None]:
from torch_geometric.utils import train_test_split_edges
from scipy.io import savemat

# Build the TF-only network first; every subnetwork below keeps the N
# highest-degree nodes of that TF network and is immediately split into
# train / validation / test edges.
tfdata = get_tf_network(true_data)

data10 = train_test_split_edges(top_n_nodes_by_degree(tfdata, 10))

data15 = train_test_split_edges(top_n_nodes_by_degree(tfdata, 15))

data20 = train_test_split_edges(top_n_nodes_by_degree(tfdata, 20))

data50 = train_test_split_edges(top_n_nodes_by_degree(tfdata, 50))

data80 = train_test_split_edges(top_n_nodes_by_degree(tfdata, 80))

data100 = train_test_split_edges(top_n_nodes_by_degree(tfdata, 100))

# the full TF network is split last, because the subnetworks above are derived from its edge_index
tfdata = train_test_split_edges(tfdata)

In [76]:
# Define Graph Autoencoder (GAE) Model
class GAE(torch.nn.Module):
    """Graph autoencoder with a two-layer GraphSAGE encoder and a bilinear decoder.

    Args:
        input_dim: number of input features per node.
        hidden_dim: width of both encoder layers and of the embedding.
        dropout: feature dropout probability applied between the two encoder
            layers (training mode only).
        edge_dropout: probability with which a training edge is dropped from
            the message-passing graph (training mode only).
    """

    def __init__(self, input_dim, hidden_dim=16, dropout=0.3, edge_dropout=0.2):
        super().__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.edge_dropout = edge_dropout

        # one linear layer (only weights) for decoding
        self.lin1 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)

    # encode node features
    def encode(self, data):
        """Embed all nodes of ``data`` using ``data.x`` and ``data.train_pos_edge_index``.

        Returns a tensor with one ``hidden_dim``-sized row per node.
        """
        # dropout_adj drops edges unless told otherwise (its `training` flag
        # defaults to True), so the module's mode is forwarded explicitly;
        # in eval() mode the full training graph is used and the result is deterministic.
        edge_index = dropout_adj(
            data.train_pos_edge_index, p=self.edge_dropout, training=self.training
        )[0]
        x = self.conv1(data.x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        return self.conv2(x, edge_index)

    # decode specific edges
    def decode(self, z, edge_index):
        """Return one logit per column of ``edge_index``: ``z[src] . lin1(z[dst])``."""
        # bilinear score: the destination embedding goes through lin1 before the dot product
        return (z[edge_index[0]] * self.lin1(z[edge_index[1]])).sum(dim=-1)

    # decode all edges for full adjacency matrix inference
    def decode_all(self, z):
        """Return the logits of all ordered node pairs as a flat tensor.

        For ``n`` nodes the result has ``n * n`` entries in row-major order,
        i.e. entry ``i * n + j`` scores the edge ``i -> j`` (self-pairs included).
        """
        # one matrix product yields every pairwise score without materialising
        # an (n*n, hidden_dim) gather; it also stays on z's device
        return (z @ self.lin1(z).t()).reshape(-1)

In [None]:
# Train Model
def train(model, data, optimizer, criterion):
    """Perform one optimisation step on the training edges.

    The model embeds the nodes, scores the positive training edges together
    with an equal number of sampled negative edges, and is updated on the
    resulting loss.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(data)

    pos_edges = data.train_pos_edge_index
    num_pos = pos_edges.size(1)
    # request as many negative samples as there are positive training edges
    neg_edges = negative_sampling(pos_edges, data.x.shape[0], num_pos)

    # Targets: 1 for real edges, 0 for negative samples
    targets = torch.cat([torch.ones(num_pos), torch.zeros(neg_edges.size(1))]).to(data.x.device)
    logits = model.decode(embeddings, torch.cat([pos_edges, neg_edges], dim=1))

    step_loss = criterion(logits, targets)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def validate(model, data, criterion):
    """Compute the loss on the validation edges without updating the model.

    Uses ``data.val_pos_edge_index`` as positives (label 1) and
    ``data.val_neg_edge_index`` as negatives (label 0).

    Returns:
        The validation loss as a Python float.
    """
    model.eval()

    # no gradients are needed here; skipping the autograd graph saves memory and time
    with torch.no_grad():
        z = model.encode(data)

        pos_edges = data.val_pos_edge_index
        neg_edges = data.val_neg_edge_index

        edges = torch.cat([pos_edges, neg_edges], dim=1)
        labels = torch.cat(
            [torch.ones(pos_edges.size(1)), torch.zeros(neg_edges.size(1))]
        ).to(data.x.device)

        preds = model.decode(z, edges)
        val_loss = criterion(preds, labels)

    return val_loss.item()


In [None]:
from sklearn.metrics import roc_auc_score

def auroc(model, data, criterion):
    """Evaluate link prediction on the held-out test edges.

    Uses ``data.test_pos_edge_index`` as positives (label 1) and
    ``data.test_neg_edge_index`` as negatives (label 0).

    Args:
        model: model providing ``encode(data)`` and ``decode(z, edge_index)``.
        data: graph object with ``x`` and the test edge attributes named above.
        criterion: unused; kept so the signature matches ``validate``.

    Returns:
        A list ``[score, labels, preds]`` where ``score`` is the area under the
        ROC curve and ``labels`` / ``preds`` are NumPy arrays with the true
        labels and the predicted edge probabilities.
    """
    model.eval()

    # evaluation only: no autograd graph is needed
    with torch.no_grad():
        z = model.encode(data)

        # use test indices to evaluate performance of the GAE
        pos_edges = data.test_pos_edge_index
        neg_edges = data.test_neg_edge_index
        edges = torch.cat([pos_edges, neg_edges], dim=1)

        # labels are only consumed on the CPU, so they are never moved to the device
        labels = torch.cat([torch.ones(pos_edges.size(1)), torch.zeros(neg_edges.size(1))])

        # turn logits into probabilities
        preds = model.decode(z, edges).sigmoid()

    labels = labels.numpy()
    preds = preds.cpu().numpy()

    score = roc_auc_score(labels, preds)

    return [score, labels, preds]

In [None]:
from scipy.io import savemat

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# set data name + subnetwork here
nm_cur = "data10"
data_cur = data10

data_cur = data_cur.to(device)

# the encoder's input width has to equal the number of features per node, so
# it is read from the data instead of being hard-coded for one data set
num_features = data_cur.x.size(-1)

auroc_scores = []

# repeat GAE training 20 times to get an estimate of the variance between training runs
for k in range(0,20):

    print("training model " + str(k) + "...")
    model = GAE(input_dim=num_features,hidden_dim=200)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    criterion = torch.nn.BCEWithLogitsLoss()

    # training loss per epoch of the current run
    loss_vec = []

    for epoch in range(500):
        loss = train(model, data_cur, optimizer, criterion)
        loss_vec.append(loss)

        # validation loss of the current epoch (overwritten every epoch)
        val_loss = validate(model, data_cur, criterion)

    # score the fully trained model on the held-out test edges
    ans = auroc(model,data_cur,criterion)
    auroc_scores.append(ans[0])

# one AUROC value per training run
savemat(nm_cur+"_auroc_new.mat",{"auroc":auroc_scores})

training model 0...




training model 1...




training model 2...




training model 3...




training model 4...




training model 5...




training model 6...




training model 7...




training model 8...




training model 9...




training model 10...




training model 11...




training model 12...




training model 13...




training model 14...




training model 15...




training model 16...




training model 17...




training model 18...




training model 19...


