# Libraries

In [3]:
import sys
sys.path.append('..')

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import pandas as pd
import graph_tool as gt
from tqdm import tqdm
from operator import itemgetter
from functools import reduce
import itertools
import scipy.sparse as sps
import random
import os

import MuxVizPy as mxp


import warnings
warnings.filterwarnings("ignore")


# Root folder that holds every input and output file used by this notebook.
working_dir = "../DataMNT/"

# Multilayer settings.
networkOfLayersType = "categorical"  # i.e. all-to-all coupling between layers
layerCouplingStrength = 1

# Virus metadata table (semicolon-separated, first row is the header).
virus_metadata = pd.read_csv(working_dir+"/Files/viruses_metadata.csv", header=0, sep=";")

# Split the metadata by the oncogenic flag; reset_index() keeps the original
# row position in an "index" column.
is_onco = virus_metadata["isOncogenic"] == True
is_nonco = virus_metadata["isOncogenic"] == False
virus_metadata_onco = virus_metadata.loc[is_onco].reset_index()
virus_metadata_nonco = virus_metadata.loc[is_nonco].reset_index()

# Unique mapping between the name of a protein and its index; the value is
# the "Index" column of the file shifted down by one.
node_map_df = pd.read_csv(working_dir+"/Files/node_map.csv")
node_map_dict = dict(zip(node_map_df["Prot"], node_map_df["Index"] - 1))

# function that creates a list of n_iter combinations of non-oncogenic virus indexes, using a fixed random seed for repeatability
def SamplingForNoco(n_iter, start=0, group_dim=8, random_seed=1234):
    """Draw reproducible groups of non-oncogenic virus row positions.

    The global NumPy random state is re-seeded with ``random_seed``, then
    ``n_iter + start`` groups are drawn and the first ``start`` of them are
    discarded, so a later call with a larger ``start`` continues the same
    sequence of groups.

    Eligible rows of the module-level ``virus_metadata`` table are those that
    are not SARS-CoV-2, are not Lymphocytic choriomeningitis virus, have
    ``neigh_order`` equal to the module-level ``NEIGH_ORDER`` and are flagged
    as non-oncogenic.

    Parameters
    ----------
    n_iter : int
        Number of groups to return.
    start : int
        Number of leading groups to skip.
    group_dim : int
        Number of distinct row positions in each group.
    random_seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    numpy.ndarray
        Array holding ``n_iter`` groups of ``group_dim`` row positions each.
    """
    np.random.seed(random_seed)

    # Combine the four eligibility criteria into a single boolean mask.
    eligible_mask = (
        np.array(virus_metadata["virus"] != "Human_SARS_coronavirus_2")
        & np.array(virus_metadata["virus_short"] != "Lymphocytic_choriomeningitis_virus")
        & np.array(virus_metadata["neigh_order"] == NEIGH_ORDER)
        & np.array(virus_metadata["isOncogenic"] == False)
    )
    eligible_rows = np.flatnonzero(eligible_mask)

    # Draw every group (including the ones skipped below) so that the random
    # stream is consumed in the same order regardless of `start`.
    total_draws = n_iter + start
    draws = []
    for _ in range(total_draws):
        draws.append(np.random.choice(eligible_rows, group_dim, replace=False))

    return np.array(draws)[start:total_draws]

# Index Lists

In [4]:
# Number of samples drawn for every randomly sampled combination set.
n_iters = 256
np.random.seed(100)

# True for the metadata rows that belong to the neighbourhood order under study
order_mask = np.array(virus_metadata["neigh_order"] == NEIGH_ORDER)

#position of the sars cov-2 in the metadata file
Sars_pos = np.where(order_mask & np.array(virus_metadata["virus"] == "Human_SARS_coronavirus_2"))[0][0]
#positions of the oncogenic viruses in the metadata file
onco_virus_indexes = np.where(order_mask & np.array(virus_metadata["isOncogenic"] == True))[0]

### creating combinations of indexes referred to specific viruses to build each sample for each combination set ###
# Every sample is made of 4 viruses (4 layers). The oncogenic picks use the
# global random state left behind by the preceding SamplingForNoco call.

#N: 4 non-oncogenic viruses
n_virus_indexes = SamplingForNoco(n_iters, group_dim=4, random_seed=41252145)


#N1O: 3 non-oncogenic viruses + 1 oncogenic virus
n1o_sampling = SamplingForNoco(n_iters, group_dim=3, random_seed=456)
n1o_virus_indexes = [
    np.concatenate([nonco_part, [np.random.choice(onco_virus_indexes)]])
    for nonco_part in n1o_sampling
]

#N1S: 3 non-oncogenic viruses + SARS-CoV-2
Snonco_nonco_samples = SamplingForNoco(n_iters, group_dim=3, random_seed=4563)
n1s_virus_indexes = np.concatenate([Snonco_nonco_samples, np.full((n_iters, 1), Sars_pos)], axis=1)

#N2O: 2 non-oncogenic viruses + 2 oncogenic viruses
n2o_sampling = SamplingForNoco(n_iters, group_dim=2, random_seed=17521)
# replace=False: the two oncogenic layers of a sample must be different viruses
n2o_virus_indexes = [
    np.concatenate([nonco_part, np.random.choice(onco_virus_indexes, 2, replace=False)])
    for nonco_part in n2o_sampling
]

#N3O: 1 non-oncogenic virus + 3 oncogenic viruses
# A seed distinct from the N2O one keeps the non-oncogenic draws of the two
# sets independent of each other.
n3o_sampling = SamplingForNoco(n_iters, group_dim=1, random_seed=17523)
n3o_virus_indexes = [
    np.concatenate([nonco_part, np.random.choice(onco_virus_indexes, 3, replace=False)])
    for nonco_part in n3o_sampling
]

#O: every possible group of 4 oncogenic viruses (no sampling involved)
o_virus_indexes = np.array([list(group) for group in itertools.combinations(onco_virus_indexes, 4)])

In [17]:
# Pair every combination-set label with its collection of samples, then split
# the pairs into two parallel lists that the following cells walk in lockstep.
combination_sets = (
    ("n", n_virus_indexes),
    ("n1o", n1o_virus_indexes),
    ("n2o", n2o_virus_indexes),
    ("n3o", n3o_virus_indexes),
    ("o", o_virus_indexes),
)

names_lists = [label for label, _ in combination_sets]
index_lists = [samples for _, samples in combination_sets]

# Report how many samples each combination set contains.
print({label: len(samples) for label, samples in combination_sets})

{'n': 256, 'n1o': 256, 'n2o': 256, 'n3o': 256, 'o': 70}


In [16]:
#save the indexes combination for reproducibility
# One text file per combination set, one sample (row of metadata positions) per line.
indexes_dir = working_dir+"/topology/indexes/"
# np.savetxt does not create folders, so make sure the target exists first.
os.makedirs(indexes_dir, exist_ok=True)
for set_name, set_indexes in zip(names_lists, index_lists):
    np.savetxt(X=set_indexes, fname=indexes_dir+set_name+".txt", fmt="%d")

# Topological quantities extraction

## LVC

In [None]:
#computing LVC size for each sample of each combination set and print on file

def _save_lvc_sizes(network_dir, output_dir):
    """Write the LVC size of every sample of every combination set.

    For each combination set in ``names_lists`` / ``index_lists`` a multiplex
    is built for every sample from the virus folders found under
    ``network_dir``, and the number of nodes returned by
    ``mxp.topology.get_multi_LVC`` is stored in ``<output_dir><set>_lvc.txt``
    (one integer per sample).

    Parameters
    ----------
    network_dir : str
        Folder (with trailing slash) containing one sub-folder per virus.
    output_dir : str
        Folder (with trailing slash) receiving the result files; it is
        created, together with any missing parent, when absent.
    """
    os.makedirs(output_dir, exist_ok=True)
    for nam, lst in zip(names_lists, index_lists):
        print(nam)
        lvc_size = []
        for sample in tqdm(lst):
            layer_dirs = [network_dir+a for a in virus_metadata.iloc[sample]["virus"]]
            net = mxp.VirusMultiplex_from_dirlist(layer_dirs)

            lvc_curr = mxp.topology.get_multi_LVC(net.g_list, printt=False)

            # a bare integer result is wrapped so that len() counts it as one node
            if isinstance(lvc_curr, (int, np.integer)):
                lvc_curr = [lvc_curr]
            lvc_size.append(len(lvc_curr))

        np.savetxt(X=lvc_size, fname=output_dir+nam+"_lvc.txt", fmt="%d")


#for original (not randomized) human PPI network case
_save_lvc_sizes(working_dir+"/SynteticViruses/original/", working_dir+"/topology/LVC/original/")

#for each realization of the randomization of the human PPI network
# NOTE(review): this range starts at 1 while other cells also use
# realization 0 -- confirm that skipping realization 0 here is intended.
for k in range(1,500):
    _save_lvc_sizes(working_dir+"/SynteticViruses/"+str(k)+"/", working_dir+"/topology/LVC/"+str(k)+"/")
        

In [None]:
# getting the list of nodes in the LVCs
# the resulting file will be used for the GO enrichment analysis

def _as_node_indexes(lvc):
    """Wrap a bare integer LVC result in a list so it always selects a 1-D array of names."""
    return [lvc] if isinstance(lvc, (int, np.integer)) else lvc


# np.savetxt does not create folders: make sure both output folders exist
os.makedirs(working_dir+"/GOdata/Synt", exist_ok=True)

# original network: multiplex made of all the oncogenic viruses
net = mxp.VirusMultiplex_from_dirlist([working_dir+"/SynteticViruses/original/"+a for a in virus_metadata.iloc[onco_virus_indexes]["virus"]])
lvc_curr = _as_node_indexes(mxp.topology.get_multi_LVC(net.g_list, printt=False))
# translate the node positions returned for the LVC into node names
np.savetxt(X=np.array(list(net.node_map))[lvc_curr], fname=working_dir+"/GOdata/genes.list", fmt="%s")

# saving union of nodes in onco layers of the original net
# to be used as background gene set for the GO enrichment analysis
#np.savetxt(X=list(net.node_map.keys()), fname=working_dir+"/GOdata/gobackground.list", fmt="%s")

#randomized networks: same procedure, one gene list per realization
for k in tqdm(range(500)):
    net = mxp.VirusMultiplex_from_dirlist([working_dir+"/SynteticViruses/"+str(k)+"/"+a for a in virus_metadata.iloc[onco_virus_indexes]["virus"]])

    lvc_curr = _as_node_indexes(mxp.topology.get_multi_LVC(net.g_list, printt=False))
    lvc_synt = np.array(list(net.node_map))[lvc_curr]
    np.savetxt(X=lvc_synt, fname=working_dir+"/GOdata/Synt/genes_"+str(k)+".list", fmt="%s")

## Percolation

In [12]:
# computing multi pagerank node percolation critical point for samples from different combination sets

def _save_percolation_points(network_dir, output_dir):
    """Write the percolation critical point of every sample of every combination set.

    For each sample the nodes of the aggregate network are ordered by
    increasing multi-pagerank value ("vers" column) and handed to
    ``gt.topology.vertex_percolation``. The critical point is taken as the
    position of the maximum of the second-largest-component curve, divided by
    the curve length. Results go to ``<output_dir><set>.txt`` (one float per
    sample).

    Parameters
    ----------
    network_dir : str
        Folder (with trailing slash) containing one sub-folder per virus.
    output_dir : str
        Folder (with trailing slash) receiving the result files; it is
        created, together with any missing parent, when absent.
    """
    os.makedirs(output_dir, exist_ok=True)
    for nam, lst in zip(names_lists, index_lists):
        print(nam)
        perc_list = []
        for sample in tqdm(lst):
            layer_dirs = [network_dir+a for a in virus_metadata.iloc[sample]["virus"]]
            net = mxp.VirusMultiplex_from_dirlist(layer_dirs)

            tensor = mxp.build.get_node_tensor_from_network_list(net.g_list)

            # physical nodes sorted by increasing centrality
            order = mxp.versatility.get_multi_RW_centrality_edge_colored(tensor)
            order = order.sort_values("vers")["phy nodes"].to_numpy()
            g_agg = mxp.build.get_aggregate_network(tensor)

            # second=True: follow the second-largest component, whose size
            # peaks at the transition
            perc_agg_2 = gt.topology.vertex_percolation(g_agg, order, second=True)[0]
            max_perc = np.argmax(perc_agg_2)/len(perc_agg_2)
            perc_list.append(max_perc)

        np.savetxt(X=perc_list, fname=output_dir+nam+".txt", fmt="%.5f")


#in the case of original network
_save_percolation_points(working_dir+"/SynteticViruses/original/", working_dir+"/topology/perc/original/")

#for randomizations

for k in range(0,1): # in this case done for the first randomized system, enlarge the number to get more
    _save_percolation_points(working_dir+"/SynteticViruses/"+str(k)+"/", working_dir+"/topology/perc/"+str(k)+"/")
        

n


100%|█████████████████████████████████████████| 256/256 [02:33<00:00,  1.67it/s]


n1o


100%|█████████████████████████████████████████| 256/256 [04:14<00:00,  1.00it/s]


n2o


100%|█████████████████████████████████████████| 256/256 [05:36<00:00,  1.31s/it]


n3o


100%|█████████████████████████████████████████| 256/256 [04:00<00:00,  1.06it/s]


o


100%|███████████████████████████████████████████| 70/70 [02:33<00:00,  2.19s/it]


## Community partition

In [None]:
# get number of not-empty modules and the modularity of the multilayers samples from the combination sets
# using the degree corrected stochastic block model algorithm

def _save_community_stats(network_dir, mods_dir, mody_dir):
    """Write the two values returned by ``mxp.mesoscale.get_mod`` for every sample.

    For each combination set, the first element of the ``get_mod`` result is
    stored as an integer in ``<mods_dir><set>.txt`` and the second element as
    a float in ``<mody_dir><set>.txt`` (one value per sample).

    Parameters
    ----------
    network_dir : str
        Folder (with trailing slash) containing one sub-folder per virus.
    mods_dir : str
        Folder (with trailing slash) receiving the integer results.
    mody_dir : str
        Folder (with trailing slash) receiving the float results.

    Both output folders are created, together with any missing parent, when
    absent.
    """
    os.makedirs(mods_dir, exist_ok=True)
    os.makedirs(mody_dir, exist_ok=True)
    for nam, lst in zip(names_lists, index_lists):
        print(nam)
        mods_list = []
        mody_list = []
        for sample in tqdm(lst):
            layer_dirs = [network_dir+a for a in virus_metadata.iloc[sample]["virus"]]
            net = mxp.VirusMultiplex_from_dirlist(layer_dirs)

            mod_res = mxp.mesoscale.get_mod(g_multi=net.g_multi, n_iter=1)
            mods_list.append(mod_res[0])
            mody_list.append(mod_res[1])

        np.savetxt(X=mods_list, fname=mods_dir+nam+".txt", fmt="%d")
        np.savetxt(X=mody_list, fname=mody_dir+nam+".txt", fmt="%.5f")


#original network
_save_community_stats(working_dir+"/SynteticViruses/original/",
                      working_dir+"/topology/mods/original/",
                      working_dir+"/topology/mody/original/")

#randomizations

for k in range(0,1): # in this case done for the first randomized system, enlarge the number to get more
    _save_community_stats(working_dir+"/SynteticViruses/"+str(k)+"/",
                          working_dir+"/topology/mods/"+str(k)+"/",
                          working_dir+"/topology/mody/"+str(k)+"/")
        

## Centrality

In [None]:
# producing vectors of floats containing the values of the multi-pagerank centrality measure calculated
# for samples belonging to the n and n1o combination sets, to be used for the machine learning phase

# the final vectors are of the size of the number of nodes in the entire human PPI net, and each position
# corresponds to a specific human protein, in order to create a common framework for all the samples

#building a ID, protein names map to construct the final centrality vectors in a uniform way
biostr_df = pd.read_csv(working_dir+"/data_BIOGRID/BIOGRID_homo_sapiens.nodes", sep=" ")
biostr_map = dict(zip(biostr_df["nodeSymbol"], biostr_df["nodeID"]))


for j in [0,1]:          ## considering only samples from the n and n1o combination sets
    nam = names_lists[j]
    lst = index_lists[j]

    # create the output folder once per combination set
    centrality_dir = working_dir+"/topology/centrality/"+nam
    os.makedirs(centrality_dir, exist_ok=True)

    # iterate over the samples that actually exist in the set: a fixed range
    # larger than len(lst) would run past the end of the index list
    for i, sample in enumerate(tqdm(lst)):
        # NOTE(review): `target_folder` is not defined in the visible cells --
        # confirm where it is set before running this cell.
        net = mxp.VirusMultiplex(sample, target_folder=target_folder, virus_metadata=virus_metadata)
        tensor = mxp.build.get_node_tensor_from_network_list(net.g_list)

        res_df = mxp.versatility.get_multi_RW_centrality_edge_colored(node_tensor=tensor, cval=0.15)

        # place every node's centrality at the position given by its BIOGRID
        # nodeID; proteins absent from this sample keep the value 0
        # (assumes the rows of res_df follow the order of net.node_map -- TODO confirm)
        positions = np.array([biostr_map[name] for name in net.node_map.keys()])
        centr_norm = np.zeros(len(biostr_map))
        centr_norm[positions] = res_df["vers"].to_numpy()

        # rescale to a maximum of 1; an all-zero vector is left untouched
        # instead of being turned into NaN
        peak = centr_norm.max()
        if peak > 0:
            centr_norm = centr_norm/peak

        np.savetxt(X=centr_norm, fname=centrality_dir+"/"+str(i)+".txt", fmt="%.4e")

  1%|▏                                      | 11/2000 [00:37<1:33:09,  2.81s/it]