In [1]:
import sys
sys.path.append('..')

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import pandas as pd
import graph_tool as gt
from tqdm import tqdm
from operator import itemgetter
from functools import reduce
import itertools
import os
#from pymnet import *

import MuxVizPy as mxp

import warnings
warnings.filterwarnings("ignore")

Here we import some metadata useful for organizing the PPI data of each virus.
To simplify the analysis, a dictionary was created that associates each protein symbol appearing in the interaction data with a unique number (_node_map.csv_).

In [2]:
# Unique mapping between the name of each protein and its index.
# The "Index" column is shifted down by one when building the dictionary.
node_map_df = pd.read_csv("../DataMNT/Files/node_map.csv")
node_map_dict = dict(zip(node_map_df["Prot"], node_map_df["Index"] - 1))

# Per-virus metadata table (semicolon-separated, first row is the header)
virus_metadata = pd.read_csv(
    "../DataMNT/Files/viruses_metadata.csv",
    sep=";",
    header=0,
)

# Preview the first rows of the metadata
virus_metadata.head()

Unnamed: 0,virus,viral.genes,human.targets,human.targets.frac,go.terms,go.terms.frac,vhg.links,neigh_order,virus_short,family,nodes_virus,nodes_human,ppis,isOncogenic
0,African_swine_fever_virus__strain_Badajoz_1971...,2,4,0.000201,406,0.000803,461,0,African_swine_fever_virus,Asfarviridae,2,4,4,False
1,African_swine_fever_virus__strain_Badajoz_1971...,2,1703,0.085385,406,0.000803,80723,1,African_swine_fever_virus,Asfarviridae,2,4,4,False
2,Avian_infectious_bronchitis_virus__strain_Beau...,5,2794,0.140085,463,0.000916,167036,1,Avian_infectious_bronchitis_virus,Coronaviridae,5,9,9,False
3,Avian_infectious_bronchitis_virus__strain_Beau...,5,9,0.000451,463,0.000916,567,0,Avian_infectious_bronchitis_virus,Coronaviridae,5,9,9,False
4,Avian_leukosis_virus_RSA,1,1,5e-05,10,2e-05,12,0,Avian_leukosis_virus_RSA,Retroviridae,1,1,1,False


## Human network

We start by building the entire human interactome with data from BIOSTRING.

In [5]:
# Edge list of the human PPI network: one row per interaction (source, target, weight)
human_ppi = pd.read_csv(
    "../DataMNT/data_BIOGRID/BIOSTR_homo_sapiens.edges", sep=" ", header=None
).set_axis(["source", "target", "weight"], axis=1)

# Node table of the human PPI network; its first column is used as the row index
human_nodes = pd.read_csv(
    "../DataMNT/data_BIOGRID//BIOSTR_homo_sapiens.nodes", sep=" ", index_col=0
)

# Protein symbol -> row position in the node table.
# The positions are numpy integers produced by np.arange.
human_map = {
    symbol: position
    for symbol, position in zip(human_nodes["nodeSymbol"], np.arange(len(human_nodes)))
}

# Build the undirected human PPI network from the edge table
human_g = gt.Graph(directed=False)
human_g.add_edge_list(human_ppi.values)

The following function extracts a subnetwork of the entire human interactome: starting from a list of directly targeted proteins, it selects those proteins together with their first neighbors, and keeps the edges among the selected nodes.

In [5]:
def create_syntetic_virus(human_g, virus_targeted_proteins):
    """
    Return the subnetwork of the entire human PPI network induced by the proteins that are
    directly targeted by the virus together with their first (nearest) neighbors.

    The function relies on two notebook-level globals: `human_map` (protein symbol -> vertex
    index) and `human_nodes` (node table, whose length is used to size the vertex mask and
    the vertex labels).

    Parameters:
                human_g: graph-tool network corresponding to the entire human PPI network.
                         As a side effect, an internal vertex property named "labels" is
                         attached to (or overwritten on) this graph.
                virus_targeted_proteins: iterable of strings corresponding to symbols of proteins
                                         directly targeted by the virus. A single symbol passed as
                                         a plain string is treated as a one-element list; an empty
                                         iterable yields an empty subnetwork.
    Outputs:
                gf: pruned graph containing only the targeted proteins and their first neighbors
                original_index_nodes: array whose i-th entry is the index, in the entire original
                                      network, of vertex i of the subnetwork
    Raises:
                KeyError: if a symbol is not present in `human_map`

    """
    # A bare string would otherwise be iterated character by character
    if isinstance(virus_targeted_proteins, str):
        virus_targeted_proteins = [virus_targeted_proteins]

    # Always work with a 1-D integer array, so that zero, one or many targeted proteins
    # follow the same code path regardless of the integer type stored in human_map
    vtp = np.array([human_map[symbol] for symbol in virus_targeted_proteins], dtype=np.int64)

    # find all the first nearest neighbors of the targeted proteins
    first_neigh_nodes_rep = [human_g.get_all_neighbors(vi) for vi in vtp]
    first_neigh_nodes = np.unique(np.concatenate([vtp] + first_neigh_nodes_rep))

    # mask which tells which proteins in the human interactome are the nodes found above
    neighbors_mask = np.isin(np.arange(len(human_nodes)), first_neigh_nodes)

    # create a vertex property map to remember which node corresponds to which protein;
    # it is stored as an internal property so that it can be read back from the pruned copy below
    labels = human_g.new_vertex_property("int", np.arange(len(human_nodes)))
    human_g.vertex_properties["labels"] = labels

    # create a view of the graph that hides all the vertices not in the mask
    gw = gt.GraphView(human_g, vfilt=neighbors_mask)
    # then create a new graph in which the hidden nodes are pruned
    gf = gt.Graph(gw, prune=True)
    # the labels of the pruned graph give back the original vertex indexes
    original_index_nodes = gf.vp["labels"].get_array()
    return gf, original_index_nodes

In [6]:
# Containers for the per-virus node lists (only the "1" containers are filled in this cell)
virus_nodes0_list, virus_nodes1_list = [], []
virus_nodes0_dict, virus_nodes1_dict = {}, {}
target_folder = "../DataMNT/Virus_data_Enriched_0.7_Neigh_0/"

# Row labels of the metadata entries whose neigh_order equals 1
neigh_ord_1_indexes = virus_metadata.index[virus_metadata["neigh_order"] == 1].values

for meta_idx in neigh_ord_1_indexes:
    virus_name = virus_metadata.loc[meta_idx, "virus"]
    nodes_table = pd.read_csv(target_folder + "/" + virus_name + "/nodes.csv")
    # keep only the nodes whose "type" column equals 1
    type1_nodes = nodes_table.loc[nodes_table["type"] == 1, "node"].tolist()

    # store the list both positionally and keyed by the short virus name
    virus_nodes1_list.append(type1_nodes)
    virus_nodes1_dict[virus_metadata.loc[meta_idx, "virus_short"]] = type1_nodes

## Create Viruses

In [7]:
# Node table of the human PPI network, read with all columns kept as data
biostr_df = pd.read_csv("../DataMNT/data_BIOGRID/BIOSTR_homo_sapiens.nodes", sep=" ")

# nodeID -> protein symbol lookup
biostr_map = {
    node_id: symbol
    for node_id, symbol in zip(biostr_df["nodeID"], biostr_df["nodeSymbol"])
}

### Original

In [22]:
# Rebuild the human PPI graph (same construction as in the "Human network" section)
human_g = gt.Graph(directed=False)
human_g.add_edge_list(human_ppi.values)

# Every output of this section is written under a single root folder. The directory that is
# checked/created must be the same one the files are written to, so the root is defined once.
out_root = "../DataMNT/SynteticViruses/original"
os.makedirs(out_root, exist_ok=True)

for i in tqdm(range(len(virus_nodes1_list))):
    # subnetwork of the targeted proteins plus their first neighbors, and the original
    # vertex index of each node of the subnetwork
    g0, n0 = create_syntetic_virus(human_g, virus_nodes1_list[i])
    name = virus_metadata.loc[neigh_ord_1_indexes[i], "virus"]
    # subnetwork vertex index -> vertex index in the entire human network
    provv_dict = dict(zip(np.arange(len(n0)), n0))

    # translate the edge endpoints: subnetwork index -> original index -> protein symbol
    # NOTE(review): this assumes the nodeID keys of biostr_map coincide with the vertex
    # indexes of human_g - confirm against the .nodes file
    g_df = pd.DataFrame(g0.get_edges())
    g_df["source"] = g_df[0].map(provv_dict).map(biostr_map)
    g_df["target"] = g_df[1].map(provv_dict).map(biostr_map)

    # one folder per virus, created on demand
    out_dir = out_root + "/" + name
    os.makedirs(out_dir, exist_ok=True)

    np.savetxt(X=np.array(n0), fname=out_dir + "/nodes.txt", fmt="%d")
    g_df[["source", "target"]].to_csv(out_dir + "/edges.csv", index=False)

100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.27s/it]
