In [1]:
import numpy as np 
import pandas as pd
import os
from tqdm import tqdm
import graph_tool as gt
import scipy as sp

In BIOGRID_data/VIRUS_from_BIOGRID I have all the viruses interaction data downloaded from BIOGRID. With this script I wanto to create a list of file in which from each of this files, I exctract the human proteins directly targeted by the viruses and unifrom the nomenclature with the NCBI one.
I will save all this file in the BIOGRID_to_VIRPROT.

# From BIOGRID interaction file for human viruses, exctract list of human proteins directly targeted by the viruses

In [8]:
dir_list = os.listdir("data_BIOGRID/VIRUS_from_BIOGRID")
dir_list

['BIOGRID-ORGANISM-Human_papillomavirus_6b-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_papillomavirus_9-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_Herpesvirus_6B-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_Herpesvirus_5-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_papillomavirus_32-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Severe_acute_respiratory_syndrome_coronavirus_2-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_Immunodeficiency_Virus_2-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Vaccinia_Virus-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_Herpesvirus_4-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Severe_acute_respiratory_syndrome_coronavirus-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_Herpesvirus_2-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_Immunodeficiency_Virus_1-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_Herpesvirus_3-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_papillomavirus_10-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_papillomavirus_7-4.4.220.mitab.txt',
 'BIOGRID-ORGANISM-Human_

In [9]:

for vn in tqdm(dir_list):
    #load all the annotations on interactions between virus and human
    if not os.path.isdir("data_BIOGRID/BIOGRID_to_VIRPROT/"+vn[17:-18]):
        os.mkdir("data_BIOGRID/BIOGRID_to_VIRPROT/"+vn[17:-18])
    
    virus_ppi_all = pd.read_csv("data_BIOGRID/VIRUS_from_BIOGRID/"+vn, sep="\t")

    #create processed edge list
    virus_ppi = pd.DataFrame()
    virus_ppi["source"] = [virus_ppi_all.iloc[i,2].split("|")[1][22:] for i in range(len(virus_ppi_all))]
    virus_ppi["target"] = [virus_ppi_all.iloc[i,3].split("|")[1][22:] for i in range(len(virus_ppi_all))]
    virus_ppi["source ishum"] = virus_ppi_all["Taxid Interactor A"]=="taxid:9606"
    virus_ppi["target ishum"] = virus_ppi_all["Taxid Interactor B"]=="taxid:9606"
    virus_ppi["type"] = virus_ppi_all["Interaction Types"]

    #select human proteins only which appear at leat once
    human_prot_in_vir = np.unique(np.concatenate([virus_ppi[virus_ppi["source ishum"]]["source"], virus_ppi[virus_ppi["target ishum"]]["target"]]))

    #load ncbi database
    ncbi = pd.read_csv("data_BIOGRID/Homo_sapiens.gene_info", sep="\t")
    symbols = ncbi["Symbol"]
    synonyms = [ncbi["Synonyms"][i].split("|") for i in range(len(ncbi))]
    unique_synonims = np.unique(np.concatenate(synonyms))

    #select proteins that are in ncbi index, and check if the others are related to some synonim, if true, pick the standard name
    ncbi_prots = human_prot_in_vir[np.isin(human_prot_in_vir, symbols)]
    not_ncbi_prots = human_prot_in_vir[np.logical_not(np.isin(human_prot_in_vir, symbols))]
    but_synon = np.isin(human_prot_in_vir[np.logical_not(np.isin(human_prot_in_vir, symbols))], unique_synonims)

    new_names = []
    for i in range(len(not_ncbi_prots)):
        if but_synon[i]:
            new_names.append(symbols[([not_ncbi_prots[i] in s for s in synonyms])].values[0])

    def_names = np.concatenate([ncbi_prots, new_names])
    np.savetxt(X=def_names, fname = "data_BIOGRID/BIOGRID_to_VIRPROT/"+vn[17:-18]+"/nodes.txt", fmt="%s")

100%|███████████████████████████████████████████| 23/23 [03:51<00:00, 10.07s/it]


# Take data from the Covid19 project of BIOGRID, in which I have PPI for Sars-Cov, Sars-Cov2, MERS

In [196]:
nodes_covid = pd.read_csv("data_BIOGRID/Covid_project/BIOGRID-PROJECT-covid19_coronavirus_project-GENES-4.4.220.projectindex.txt", sep="\t")
nodes_covid.columns

Index(['#BIOGRID ID', 'ENTREZ GENE ID', 'SYSTEMATIC NAME', 'OFFICIAL SYMBOL',
       'SYNONYMS', 'ORGANISM ID', 'ORGANISM NAME', 'INTERACTION COUNT',
       'PTM COUNT', 'CHEMICAL INTERACTION COUNT', 'SOURCE', 'VIRUS VALUES',
       'VIRUS IDS', 'VIRUS TAGS', 'VIRUS EVIDENCE VALUES',
       'VIRUS EVIDENCE IDS', 'VIRUS EVIDENCE CLASSES',
       'VIRUS EVIDENCE METHODS'],
      dtype='object')

In [197]:
interactions_covid = pd.read_csv("data_BIOGRID/Covid_project/BIOGRID-PROJECT-covid19_coronavirus_project-INTERACTIONS-4.4.220.tab3.txt", sep="\t")
interactions_covid.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Publication Source',
       'Organism ID Interactor A', 'Organism ID Interactor B', 'Throughput',
       'Score', 'Modification', 'Qualifications', 'Tags', 'Source Database',
       'SWISS-PROT Accessions Interactor A', 'TREMBL Accessions Interactor A',
       'REFSEQ Accessions Interactor A', 'SWISS-PROT Accessions Interactor B',
       'TREMBL Accessions Interactor B', 'REFSEQ Accessions Interactor B',
       'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories',
       'Ontology Term Qualifier IDs', 'Ontology Term Qualifier Names',
       'Ontology

In [32]:
interactions_sarscov2 = interactions_covid[(interactions_covid["Organism ID Interactor A"]==2697049)|(interactions_covid["Organism ID Interactor B"]==2697049)]
interactions_sarscov = interactions_covid[(interactions_covid["Organism ID Interactor A"]==694009)|(interactions_covid["Organism ID Interactor B"]==694009)]
interactions_mers = interactions_covid[(interactions_covid["Organism ID Interactor A"]==1335626)|(interactions_covid["Organism ID Interactor B"]==1335626)]

In [200]:
virus_ppi.to_csv("data_BIOGRID/Covid_project/sarscov2net.txt", sep=" ", index=False)

In [198]:
#do the same things for sarscov2, sarscov and mers

virus_ppi = pd.DataFrame()
virus_ppi["source"] = interactions_sarscov2["Official Symbol Interactor A"]
virus_ppi["target"] = interactions_sarscov2["Official Symbol Interactor B"]
virus_ppi["source ishum"] = interactions_sarscov2["Organism ID Interactor A"]==9606
virus_ppi["target ishum"] = interactions_sarscov2["Organism ID Interactor B"]==9606

#select human proteins only which appear at leat once
human_prot_in_vir = np.unique(np.concatenate([virus_ppi[virus_ppi["source ishum"]]["source"], virus_ppi[virus_ppi["target ishum"]]["target"]]))

#load ncbi database
ncbi = pd.read_csv("data_BIOGRID/Homo_sapiens.gene_info", sep="\t")
symbols = ncbi["Symbol"]
synonyms = [ncbi["Synonyms"][i].split("|") for i in range(len(ncbi))]
unique_synonims = np.unique(np.concatenate(synonyms))

#select proteins that are in ncbi index, and check if the others are related to some synonim, if true, pick the standard name
ncbi_prots = human_prot_in_vir[np.isin(human_prot_in_vir, symbols)]
not_ncbi_prots = human_prot_in_vir[np.logical_not(np.isin(human_prot_in_vir, symbols))]
but_synon = np.isin(human_prot_in_vir[np.logical_not(np.isin(human_prot_in_vir, symbols))], unique_synonims)

new_names = []
for i in range(len(not_ncbi_prots)):
    if but_synon[i]:
        new_names.append(symbols[([not_ncbi_prots[i] in s for s in synonyms])].values[0])

def_names = np.concatenate([ncbi_prots, new_names])
#np.savetxt(X=def_names, fname="BIOGRID_data/Covid_project/mers_humanprots.txt", fmt="%s")

# Create human PPI from data in BIOGRID

In [57]:
human_ppi_all = pd.read_csv("Files/BIOGRID-ORGANISM-Homo_sapiens-4.4.220.mitab.txt", sep="\t")

In [65]:
human_ppi_all["Interaction Detection Method"].value_counts()

psi-mi:"MI:0004"(affinity chromatography technology)       719963
psi-mi:"MI:1313"(bioid)                                    144978
psi-mi:"MI:0018"(two hybrid)                               117747
psi-mi:"MI:0401"(biochemical)                               72858
psi-mi:"MI:0096"(pull down)                                 42338
psi-mi:"MI:0254"(genetic interference)                      16776
psi-mi:"MI:0415"(enzymatic study)                           14266
psi-mi:"MI:0686"(unspecified method)                        10737
psi-mi:"MI:0428"(imaging technique)                          4648
psi-mi:"MI:0055"(fluorescent resonance energy transfer)      2316
psi-mi:"MI:0114"(x-ray crystallography)                      2119
psi-mi:"MI:0090"(protein complementation assay)              1751
psi-mi:"MI:0047"(far western blotting)                        872
Name: Interaction Detection Method, dtype: int64

In [110]:
human_ppi = pd.DataFrame()
#take gene IDs
human_ppi["source"] = [human_ppi_all.iloc[i,0][22:] for i in range(len(human_ppi_all))]
human_ppi["target"] = [human_ppi_all.iloc[i,1][22:] for i in range(len(human_ppi_all))]

#take only values for which the gene ID is a number
human_ppi = human_ppi[human_ppi["source"].map(lambda x: x.isnumeric()) & human_ppi["target"].map(lambda x: x.isnumeric())]
human_ppi = human_ppi.astype(int)

In [141]:
hnodes = np.unique(np.concatenate([human_ppi["source"], human_ppi["target"]]))

#check if there are the same connections but with source and target switched to see if links are directed
hm = np.isin([str(h[0])+str(h[1]) for h in human_ppi.to_numpy()[:,[1,0]]], [h[0]+h[1] for h in human_ppi.to_numpy()])
hm.sum() < len(hm)

In [154]:
#map gene ID to the index which will be used in the graph
nodeMap = dict(zip(hnodes, np.arange(len(hnodes))))

In [83]:
human_g = gt.Graph(directed=True)
human_net = human_ppi
human_net["target"] =human_net["target"].map(lambda x: nodeMap[x])
human_net["source"] =human_net["source"].map(lambda x: nodeMap[x])
human_g.add_edge_list(human_net.values)
nm = human_g.new_vertex_property("int", np.array(list(nodeMap.keys())))
human_g.vp["nodemap"] = nm

In [227]:
human_g

<Graph object, directed, with 27887 vertices and 1148242 edges, 1 internal vertex property, at 0x7f3a4d292860>

# Form the node and edges files I created, create new one with only proteins in NCBI

In [4]:
ncbi = pd.read_csv("data_BIOGRID/Homo_sapiens.gene_info", sep="\t")
symbols = ncbi["Symbol"]
synonyms = [ncbi["Synonyms"][i].split("|") for i in range(len(ncbi))]
unique_synonims = np.unique(np.concatenate(synonyms))

In [15]:
not_ncbi = np.where(np.logical_not(np.isin(human_nodes["nodeSymbol"], symbols)))[0]

In [22]:
isnotin_syn = np.where(np.logical_not(np.isin(human_nodes["nodeSymbol"][not_ncbi], unique_synonims)))[0]
isin_syn = np.where(np.isin(human_nodes["nodeSymbol"][not_ncbi], unique_synonims))[0]

In [24]:
new_names = []
for n in tqdm(human_nodes["nodeSymbol"][not_ncbi[isin_syn]]):
    new_names.append(symbols[([n in s for s in synonyms])].values[0])

100%|███████████████████████████████████████| 1201/1201 [00:22<00:00, 52.59it/s]


In [39]:
human_nodes_symb = human_nodes["nodeSymbol"].to_numpy()
human_nodes_label = human_nodes["nodeLabel"].to_numpy()

In [40]:
human_nodes_symb[not_ncbi[isin_syn]] = new_names

In [41]:
human_nodes_symb = np.delete(human_nodes_symb, not_ncbi[isnotin_syn])
human_nodes_label = np.delete(human_nodes_label, not_ncbi[isnotin_syn])

In [43]:
pd.DataFrame({"nodeSymbol": human_nodes_symb, "nodeCode": human_nodes_label}).to_csv("data_BIOGRID/BIOGRID_homo_sapiens_ncbi.nodes")

In [78]:
map_node_dict = dict(zip(np.delete(np.arange(len(human_nodes)), not_ncbi[isnotin_syn]), np.arange(len(human_nodes_symb))))

In [112]:
pul_ppi = human_ppi[np.logical_not(human_ppi["source"].isin(not_ncbi[isnotin_syn]))]
pul_ppi = pul_ppi[np.logical_not(pul_ppi["target"].isin(not_ncbi[isnotin_syn]))]

In [125]:
pul_ppi_new = pd.DataFrame()
pul_ppi_new["source"]= pul_ppi["source"].map(map_node_dict)
pul_ppi_new["target"]= pul_ppi["target"].map(map_node_dict)

In [127]:
pul_ppi_new.to_csv("data_BIOGRID/BIOGRID_homo_sapiens_ncbi.edges")