In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import MuxVizPy as mxp
import graph_tool as gt
from operator import itemgetter
from tqdm import tqdm
import os

import optuna
from optuna.samplers import TPESampler, RandomSampler

# From STRING.viruses dataset exctract human proteins directly targeted by viruses and uniform names to NCBI

In [3]:
#dataset with all links from Virus STRING
full_links = pd.read_csv("data_STRING/9606.protein.links.full.v10.5.txt", sep=" ")
#only interactions between virus and human
links = full_links[(full_links["protein2"].map(lambda x: not("9606"in x)))]
links = links.reset_index(drop=True)
viruses_code = np.unique(links["protein2"].map(lambda x: x.split(".")[0]).to_numpy())

In [13]:
links

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,10258.Q6TVW1_ORFSA,0,0,0,0,0,0,0,0,0,0,0,0,188,188
1,9606.ENSP00000000233,10279.Q98187_MCV1,0,0,0,0,0,0,0,0,0,0,0,0,188,188
2,9606.ENSP00000000233,10298.GM_HHV11,0,0,0,0,0,0,0,0,0,0,0,17,336,336
3,9606.ENSP00000000233,37296.ORF4_HHV8P,0,0,0,0,0,0,0,0,0,0,0,91,109,155
4,9606.ENSP00000000233,195054.PRO_0000039759,0,0,0,0,0,0,0,0,0,0,0,0,329,329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84004,9606.ENSP00000473243,10310.GB_HHV2H,0,0,0,0,0,0,0,0,0,0,0,0,227,227
84005,9606.ENSP00000473243,10298.GB_HHV11,0,0,0,0,0,0,0,0,0,0,0,0,227,227
84006,9606.ENSP00000473243,10335.GB_VZVD,0,0,0,0,0,0,0,0,0,0,0,0,227,227
84007,9606.ENSP00000473243,10359.GB_HCMVM,0,0,0,0,0,0,0,0,0,0,0,0,227,227


In [11]:
np.savetxt(X=links["protein1"].unique(), fname="string_prot.txt", fmt="%s")

In [15]:
#correspondance between name of virus and taxonomic code
codevir = pd.read_csv("data_STRING/viruses_codes.txt", sep="\t")

virus_name = []
virus_name_len = []
for s in viruses_code:
    if len(codevir[codevir["## taxon_id"]==int(s)]["official_name_NCBI"].to_list())==0:
        virus_name.append("")
        virus_name_len.append(0)
    else:
        virus_name.append(codevir[codevir["## taxon_id"]==int(s)]["official_name_NCBI"].to_list()[0])
        virus_name_len.append(1)

virus_names_df = pd.DataFrame({"name": virus_name, "ID": viruses_code})
virus_names_dict = dict(zip(viruses_code, virus_name))
ID_nonzeronames = np.array(virus_names_df[virus_names_df["name"]!=""].ID)

In [21]:
#filter only links with protein2 corresponding to known virus
links_pul = links[(links["protein2"].map(lambda x: np.any(x.split(".")[0] in ID_nonzeronames)))].reset_index(drop=True)

#links_pul["protein1"]=links_pul["protein1"].map(lambda x: x.split(".")[1])
links_pul["virus"]=links_pul["protein2"].map(lambda x: x.split(".")[0])
#links_pul["protein2"]=links_pul["protein2"].map(lambda x: x.split(".")[1])
links_pul["virus_name"]=itemgetter(*links_pul["virus"].to_list())(virus_names_dict)

In [22]:
links_pul

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score,virus,virus_name
0,9606.ENSP00000000233,10258.Q6TVW1_ORFSA,0,0,0,0,0,0,0,0,0,0,0,0,188,188,10258,Orf virus (strain Goat/Texas/SA00/2000)
1,9606.ENSP00000000233,10279.Q98187_MCV1,0,0,0,0,0,0,0,0,0,0,0,0,188,188,10279,Molluscum contagiosum virus subtype 1
2,9606.ENSP00000000233,10298.GM_HHV11,0,0,0,0,0,0,0,0,0,0,0,17,336,336,10298,Human herpesvirus 1 (strain 17)
3,9606.ENSP00000000233,37296.ORF4_HHV8P,0,0,0,0,0,0,0,0,0,0,0,91,109,155,37296,Human herpesvirus 8 type P (isolate GK18)
4,9606.ENSP00000000233,195054.PRO_0000039759,0,0,0,0,0,0,0,0,0,0,0,0,329,329,195054,Human parechovirus 2 (strain Williamson)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80770,9606.ENSP00000473243,10310.GB_HHV2H,0,0,0,0,0,0,0,0,0,0,0,0,227,227,10310,Human herpesvirus 2 (strain HG52)
80771,9606.ENSP00000473243,10298.GB_HHV11,0,0,0,0,0,0,0,0,0,0,0,0,227,227,10298,Human herpesvirus 1 (strain 17)
80772,9606.ENSP00000473243,10335.GB_VZVD,0,0,0,0,0,0,0,0,0,0,0,0,227,227,10335,Varicella-zoster virus (strain Dumas)
80773,9606.ENSP00000473243,10359.GB_HCMVM,0,0,0,0,0,0,0,0,0,0,0,0,227,227,10359,Human cytomegalovirus (strain Merlin)


In [19]:
#mapping between node ID and name
mapper = pd.read_csv("data_STRING/9606.protein.info.v11.5.txt", sep="\t")
mapper["ID"] = mapper["#string_protein_id"].map(lambda x: x.split(".")[1])
mapper_dict = dict(zip(mapper["ID"], mapper["preferred_name"]))

In [29]:
mapper

Unnamed: 0,#string_protein_id,preferred_name,protein_size,annotation,ID
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...,ENSP00000000233
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...,ENSP00000000412
2,9606.ENSP00000001008,FKBP4,459,Peptidyl-prolyl cis-trans isomerase FKBP4; Imm...,ENSP00000001008
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...,ENSP00000001146
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi...",ENSP00000002125
...,...,...,...,...,...
19561,9606.ENSP00000485671,ENSG00000280273,120,HCG1991042,ENSP00000485671
19562,9606.ENSP00000485672,ENSG00000279458,86,annotation not available,ENSP00000485672
19563,9606.ENSP00000485673,ENSG00000279988,243,annotation not available,ENSP00000485673
19564,9606.ENSP00000485675,ENSG00000280116,84,annotation not available,ENSP00000485675


In [25]:
uga=np.isin(links_pul["protein1"].unique(),mapper["#string_protein_id"])

In [28]:
links_pul["protein1"].unique().shape

(8575,)

In [66]:
#filter again with proteins1 which code is mapped to a name
links_res = links_pul[links_pul["protein1"].isin(mapper["ID"])]
links_res["protein1_name"]=itemgetter(*links_res["protein1"].to_list())(mapper_dict)

In [74]:
#list of unique viruses names
unique_viruses = links_res["virus_name"].unique()

In [251]:
#proteins for each virus which are in a link with text mining of high confidence 
##r experiment with mean confidence
type0_list = []
type1_list = []

for un in unique_viruses:
    provv_link = links_res[links_res["virus_name"]==un]
    type0_list.append(provv_link["protein2"].unique())
    type1_list.append(provv_link[(provv_link["textmining"]>700) | (provv_link["experiments"]>400)]["protein1_name"].unique())

res_df = pd.DataFrame({"name":unique_viruses, 
                       "len_vir":[len(l) for l in type0_list],
                       "len_hum":[len(l) for l in type1_list], 
                       "virus": type0_list, 
                       "human": type1_list})
res_df["name"] = res_df["name"].map(lambda x: x.replace("(", ""))
res_df["name"] = res_df["name"].map(lambda x: x.replace(")", ""))
res_df["name"] = res_df["name"].map(lambda x: x.replace("/", "_"))
res_df["name"] = res_df["name"].map(lambda x: x.replace(" ", "_"))

In [121]:
#load ncbi database
ncbi = pd.read_csv("BIOGRID_data/Homo_sapiens.gene_info", sep="\t")
symbols = ncbi["Symbol"]
synonyms = [ncbi["Synonyms"][i].split("|") for i in range(len(ncbi))]
unique_synonims = np.unique(np.concatenate(synonyms))



In [252]:
resss = res_df[res_df["len_hum"]>0].sort_values("name").reset_index(drop=True)

In [255]:
np.any(np.isin(resss.loc[18,"human"], symbols))

True

In [257]:
new_list_human = []
for j in range(len(resss)):
    print(f"{j}/{len(resss)}")
    if not np.all(np.isin(resss.loc[j,"human"], symbols)):
        chg_idx = np.where(np.logical_not(np.isin(resss.loc[j,"human"], symbols)))[0]
        but_synon = np.isin(resss.loc[j,"human"][chg_idx], unique_synonims)

        new_names = []
        for i in tqdm(range(len(chg_idx))):
            if but_synon[i]:
                new_names.append(symbols[([resss.loc[j,"human"][chg_idx[i]] in s for s in synonyms])].values[0])
            else:
                new_names.append(resss.loc[j,"human"][chg_idx[i]])

        new_list_human.append(np.delete(resss.loc[j,"human"], chg_idx[np.where(np.logical_not(but_synon))[0]]))
    else:
        new_list_human.append(resss.loc[j,"human"])

0/83
1/83
2/83
3/83
4/83
5/83
6/83
7/83
8/83
9/83
10/83
11/83
12/83


100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.16s/it]


13/83
14/83
15/83
16/83
17/83
18/83


100%|███████████████████████████████████████████| 48/48 [01:37<00:00,  2.04s/it]


19/83
20/83
21/83
22/83
23/83
24/83
25/83


100%|███████████████████████████████████████████| 15/15 [00:30<00:00,  2.04s/it]


26/83


100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.09s/it]


27/83


100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.06s/it]


28/83
29/83
30/83
31/83


100%|█████████████████████████████████████████████| 2/2 [00:04<00:00,  2.13s/it]


32/83


100%|█████████████████████████████████████████████| 6/6 [00:12<00:00,  2.12s/it]


33/83


100%|█████████████████████████████████████████████| 3/3 [00:06<00:00,  2.12s/it]


34/83


100%|███████████████████████████████████████████| 26/26 [00:54<00:00,  2.10s/it]


35/83
36/83
37/83


100%|█████████████████████████████████████████████| 5/5 [00:10<00:00,  2.11s/it]


38/83


100%|███████████████████████████████████████████| 48/48 [01:33<00:00,  1.96s/it]


39/83


100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  2.00s/it]


40/83
41/83


100%|███████████████████████████████████████████| 16/16 [00:29<00:00,  1.85s/it]


42/83


100%|███████████████████████████████████████████| 20/20 [00:40<00:00,  2.00s/it]


43/83


100%|█████████████████████████████████████████████| 9/9 [00:19<00:00,  2.12s/it]


44/83


100%|███████████████████████████████████████████| 11/11 [00:23<00:00,  2.13s/it]


45/83


100%|███████████████████████████████████████████| 26/26 [00:52<00:00,  2.03s/it]


46/83
47/83
48/83


100%|█████████████████████████████████████████████| 4/4 [00:07<00:00,  1.96s/it]


49/83


100%|███████████████████████████████████████████| 29/29 [00:54<00:00,  1.89s/it]


50/83
51/83
52/83
53/83
54/83
55/83
56/83


100%|█████████████████████████████████████████████| 2/2 [00:04<00:00,  2.01s/it]


57/83
58/83
59/83
60/83
61/83
62/83
63/83
64/83
65/83
66/83


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 6721.64it/s]


67/83
68/83


100%|█████████████████████████████████████████████| 8/8 [00:16<00:00,  2.06s/it]


69/83
70/83
71/83


100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.08s/it]


72/83


100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.12s/it]


73/83
74/83
75/83
76/83


100%|███████████████████████████████████████████| 21/21 [00:43<00:00,  2.08s/it]


77/83
78/83
79/83
80/83
81/83
82/83


100%|███████████████████████████████████████████| 25/25 [00:52<00:00,  2.11s/it]


In [264]:
resss["human_new"] = new_list_human

In [285]:
for i in range(len(resss)):
    if not os.path.isdir(os.path.isdir("STRING_data/STRING_to_VIRPROT/"+resss.loc[i,"name"])):
        os.mkdir("STRING_data/STRING_to_VIRPROT/"+resss.loc[i,"name"])
    np.savetxt(X=resss.loc[i,"human_new"], fname="STRING_data/STRING_to_VIRPROT/"+resss.loc[i,"name"]+"/type1.txt", fmt="%s")
    np.savetxt(X=resss.loc[i,"virus"], fname="STRING_data/STRING_to_VIRPROT/"+resss.loc[i,"name"]+"/type0.txt", fmt="%s")