In [1]:
import sys, os, re
import pandas as pd
import numpy as np
import json
import requests

In [2]:
string_name = pd.read_csv('data/9606.protein.info.v11.5.txt', sep="\t")

In [3]:
string_name.head()

Unnamed: 0,#string_protein_id,preferred_name,protein_size,annotation
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...
2,9606.ENSP00000001008,FKBP4,459,Peptidyl-prolyl cis-trans isomerase FKBP4; Imm...
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi..."


In [4]:
name_dict = {}

string_name_dict = string_name.to_dict('split')

for i in range(len(string_name_dict['data'])):
    name_dict[string_name_dict['data'][i][0]] = {"name": string_name_dict['data'][i][1], "description": string_name_dict['data'][i][3].split(";")[0].strip()}

del string_name, string_name_dict

In [5]:
string_alias = pd.read_csv('data/9606.protein.aliases.v11.5.txt', sep="\t")


In [6]:
string_alias_dict = string_alias.to_dict('split')

for i in range(len(string_alias_dict['data'])):
    if string_alias_dict['data'][i][2] == "Ensembl_HGNC_Entrez_Gene_ID":
        name = string_alias_dict['data'][i][0]
        if name in name_dict:
            name_dict[name]["identifier"] = int(string_alias_dict['data'][i][1])

del string_alias_dict, string_alias

In [7]:
len(name_dict)

19566

In [8]:
no_id_proteins = {}
for gene in name_dict:
    if "identifier" not in name_dict[gene]:
        no_id_proteins[name_dict[gene]["name"]] = gene

In [9]:
len(no_id_proteins)

1116

In [10]:
no_id_proteins

{'STARD3NL': '9606.ENSP00000009041',
 'SYPL1': '9606.ENSP00000011473',
 'RTFDC1': '9606.ENSP00000023939',
 'KIAA2022': '9606.ENSP00000055682',
 'C12orf5': '9606.ENSP00000179259',
 'C4orf6': '9606.ENSP00000195455',
 'KIAA1467': '9606.ENSP00000197268',
 'ATP5D': '9606.ENSP00000215375',
 'USP18': '9606.ENSP00000215794',
 'RTDR1': '9606.ENSP00000216036',
 'GNPNAT1': '9606.ENSP00000216410',
 'PYCRL': '9606.ENSP00000220966',
 'OBFC1': '9606.ENSP00000224950',
 'C11orf63': '9606.ENSP00000227349',
 'USP5': '9606.ENSP00000229268',
 'SKIV2L2': '9606.ENSP00000230640',
 'LEPRE1': '9606.ENSP00000236040',
 'RARRES1': '9606.ENSP00000237696',
 'C2orf43': '9606.ENSP00000237822',
 'TMEM180': '9606.ENSP00000238936',
 'CCDC53': '9606.ENSP00000240079',
 'TNFSF10': '9606.ENSP00000241261',
 'FTSJ2': '9606.ENSP00000242257',
 'KIAA1045': '9606.ENSP00000242315',
 'C19orf43': '9606.ENSP00000242784',
 'CD97': '9606.ENSP00000242786',
 'ATP5E': '9606.ENSP00000243997',
 'GLTSCR2': '9606.ENSP00000246802',
 'HN1L': '96

In [11]:
with open('data/hetionet-v1.0.json', 'r') as f:
    json_data = json.load(f)

    for node in json_data["nodes"]:
        if node["name"] in no_id_proteins:
            string_id = no_id_proteins[node["name"]]
            name_dict[string_id]["identifier"] = int(node["identifier"])

In [12]:
name_dict_with_id = {}

for k in name_dict:
    if "identifier" in name_dict[k]:
        name_dict_with_id[k] = name_dict[k].copy()

In [13]:
with open("data/string_name_dict.json", "w") as f:
    json.dump(name_dict_with_id, f, indent=2)

In [14]:
len(name_dict_with_id)

19041

In [15]:
##string

string_link = pd.read_csv('data/9606.protein.links.detailed.v11.5.txt', sep=" ")

string_link.head()

Unnamed: 0,protein1,protein2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,0,0,0,54,0,0,144,155
1,9606.ENSP00000000233,9606.ENSP00000314067,0,0,0,0,180,0,61,197
2,9606.ENSP00000000233,9606.ENSP00000263116,0,0,0,62,152,0,101,222
3,9606.ENSP00000000233,9606.ENSP00000361263,0,0,0,0,161,0,64,181
4,9606.ENSP00000000233,9606.ENSP00000409666,0,0,0,82,213,0,72,270


In [16]:
cutoff_score = 900

string_link_dict = string_link[string_link["combined_score"] >= cutoff_score].to_dict('split')

link_set = set()

for i in range(len(string_link_dict['data'])):
    protein_1 = string_link_dict['data'][i][0]
    protein_2 = string_link_dict['data'][i][1]

    if protein_1 in name_dict_with_id and protein_2 in name_dict_with_id:
        link_set.add(tuple(sorted([name_dict_with_id[protein_1]["identifier"], name_dict_with_id[protein_2]["identifier"]])))
del string_link_dict, string_link

In [17]:
len(link_set)

122535

In [18]:
link_set

{(7475, 51384),
 (5424, 5932),
 (8649, 153129),
 (3105, 3133),
 (8693, 10071),
 (716, 9364),
 (3297, 3303),
 (4508, 9114),
 (10474, 57325),
 (7153, 23225),
 (3630, 9370),
 (5710, 122706),
 (6124, 10480),
 (23560, 51187),
 (117583, 387755),
 (4998, 5716),
 (2290, 4088),
 (9046, 131450),
 (4939, 8519),
 (990, 157777),
 (816, 84254),
 (3932, 5921),
 (86, 9318),
 (4809, 6194),
 (10164, 140453),
 (1975, 1983),
 (207, 23608),
 (11224, 100529239),
 (2534, 3667),
 (4586, 27090),
 (2033, 5371),
 (7111, 29765),
 (1453, 2810),
 (1718, 4047),
 (10884, 64928),
 (4173, 5983),
 (7158, 554313),
 (1464, 6382),
 (841, 7157),
 (55131, 64794),
 (5825, 8504),
 (6714, 59341),
 (1854, 29922),
 (27247, 57128),
 (6129, 9669),
 (55127, 55226),
 (91782, 100526767),
 (5578, 5594),
 (3558, 3586),
 (11224, 64969),
 (9401, 11073),
 (22978, 50808),
 (5977, 9275),
 (7157, 10111),
 (1528, 29937),
 (2735, 3549),
 (2244, 3689),
 (10179, 11340),
 (1374, 23054),
 (3978, 5111),
 (2038, 51327),
 (8497, 8573),
 (1398, 1948),


In [19]:
json_connections = set()
with open('data/hetionet-v1.0.json', 'r') as f:
    json_data = json.load(f)

    for edge in json_data["edges"]:
        if edge["source_id"][0] == "Gene" and edge["target_id"][0] == "Gene":
            id_1 = edge["source_id"][1]
            id_2 = edge["target_id"][1]

            json_connections.add(tuple(sorted([id_1, id_2])))


In [20]:
len(link_set.difference(json_connections))

97103

In [21]:
len(json_connections.difference(link_set))

445614

In [22]:
len(json_connections)

471046

In [23]:
len(link_set)

122535

In [24]:
len(link_set.intersection(json_connections))

25432

In [25]:
new_links = []
for new_link in link_set.difference(json_connections):
    edge = {"source_id": ["Gene", new_link[0]], "target_id": ["Gene", new_link[1]], "kind": "STRING"}
    new_links.append(edge)

with open("data/string_new_link_900.json", "w") as f:
    json.dump(new_links, f, indent=2)

In [15]:

#<oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">ICD10CM:N46.0</oboInOwl:hasDbXref>
#<oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">MESH:D006509</oboInOwl:hasDbXref>

rx_icd10 = re.compile(r'<oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">ICD10CM:(\S+)</oboInOwl:hasDbXref>')
rx_mesh = re.compile(r'<oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">MESH:(\S+)</oboInOwl:hasDbXref>')
rx_kegg = re.compile(r'<oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">KEGG:(\S+)</oboInOwl:hasDbXref>')
rx_omid = re.compile(r'<oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">OMIM:(\S+)</oboInOwl:hasDbXref>')

disease_id = {}

hetionet_disease_id = set()

with open('data/hetionet-v1.0.json', 'r') as f:
    json_data = json.load(f)

    for node in json_data["nodes"]:
        if node["kind"] == "Disease" and "data" in node and "url" in node["data"]:
            url = node["data"]["url"]
            r = requests.get(url)
            #print (url)

            #print (r.text)

            s_icd10 = rx_icd10.search(r.text)

            if s_icd10:
                if node["identifier"] not in disease_id:
                    disease_id[node["identifier"]] = {}
                disease_id[node["identifier"]]["icd10"] = s_icd10.group(1)
            else:
                #print (node["identifier"], node["name"])
                hetionet_disease_id.add(node["identifier"])


            s_mesh = rx_mesh.search(r.text)

            if s_mesh:
                if node["identifier"] not in disease_id:
                    disease_id[node["identifier"]] = {}
                disease_id[node["identifier"]]["mesh"] = s_mesh.group(1)
                
                hetionet_disease_id.discard(node["identifier"])

            s_kegg = rx_kegg.search(r.text)

            if s_kegg:
                if node["identifier"] not in disease_id:
                    disease_id[node["identifier"]] = {}
                disease_id[node["identifier"]]["kegg"] = s_kegg.group(1)
                
                hetionet_disease_id.discard(node["identifier"])

            s_omid = rx_omid.search(r.text)

            if s_omid:
                if node["identifier"] not in disease_id:
                    disease_id[node["identifier"]] = {}
                disease_id[node["identifier"]]["kegg"] = s_omid.group(1)
                
                hetionet_disease_id.discard(node["identifier"])

In [17]:
len(hetionet_disease_id)

with open("data/no_kegg_disease.json", "w") as f:
    json.dump(list(hetionet_disease_id), f, indent=2)

In [18]:
with open("data/disease_kegg.json", "w") as f:
    json.dump(disease_id, f, indent=2)

In [11]:
hetionet_disease_id

{'DOID:0060073', 'DOID:5099', 'DOID:8398', 'DOID:90', 'DOID:9917'}

In [8]:
url = "http://www.ontobee.org/ontology/DOID?iri=http://purl.obolibrary.org/obo/DOID_4481"
r = requests.get(url)
print (url)

print (r.text)

http://www.ontobee.org/ontology/DOID?iri=http://purl.obolibrary.org/obo/DOID_4481
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="http://www.ontobee.org/ontology/view/DOID?iri=http://purl.obolibrary.org/obo/DOID_4481"?>
<rdf:RDF xmlns="http://www.w3.org/2002/07/owl#"
     xml:base="http://www.w3.org/2002/07/owl"
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:owl="http://www.w3.org/2002/07/owl#"
     xmlns:oboInOwl="http://www.geneontology.org/formats/oboInOwl#"
     xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
     xmlns:skos="http://www.w3.org/2004/02/skos/core#"
     xmlns:doid="http://purl.obolibrary.org/obo/doid#"
     xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
     xmlns:ns4="http://purl.obolibrary.org/obo/"
     xmlns:foaf="http://xmlns.com/foaf/0.1/"
     xmlns:dc="http://purl.org/dc/elements/1.1/">
    


    <!-- 
    ///////////////////////////////////////////////////////////////////////////////////////
    //
    // Annotation p

In [14]:
disease_id

{'DOID:14227': {'icd10': 'N46.0', 'mesh': 'D053713'},
 'DOID:9352': {'icd10': 'E11', 'mesh': 'D003924', 'kegg': '125853'},
 'DOID:8778': {'icd10': 'K50.1', 'mesh': 'D003424'},
 'DOID:5612': {'icd10': 'C72.0', 'mesh': 'D013120'},
 'DOID:363': {'icd10': 'C55', 'mesh': 'D014594'},
 'DOID:11054': {'icd10': 'C67', 'mesh': 'D001749', 'kegg': '109800'},
 'DOID:1793': {'icd10': 'C25.0', 'mesh': 'D010190', 'kegg': '05212'},
 'DOID:8850': {'icd10': 'C08'},
 'DOID:0060119': {'mesh': 'D010610'},
 'DOID:10763': {'icd10': 'I10', 'mesh': 'D006973'},
 'DOID:3312': {'icd10': 'F31', 'mesh': 'D001714'},
 'DOID:418': {'icd10': 'M34.0', 'mesh': 'D012595', 'kegg': '181750'},
 'DOID:2377': {'icd10': 'G35', 'mesh': 'D009103', 'kegg': '612594'},
 'DOID:4606': {'icd10': 'C24.0', 'mesh': 'D001650'},
 'DOID:12995': {'icd10': 'F91', 'mesh': 'D019955'},
 'DOID:4481': {'kegg': '607154'},
 'DOID:11819': {'icd10': 'C66', 'mesh': 'D014516', 'kegg': '191600'},
 'DOID:1909': {'mesh': 'D008545', 'kegg': '05218'},
 'DOID:1