In [1]:
import json
import gzip
import copy
import re
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.stats import fisher_exact
from wordcloud import WordCloud

In [12]:
# Input files used by the Disease Ontology (DO) cells of this notebook.
DO_ID_PATH_JSON = "data/do/do_id.json"    # not referenced by the cells shown here
DO_ID_PATH_OBO = "data/do/doid.obo.txt"   # ontology file, parsed line by line as "key: value" records
PUBMED_PATH = "data/do/human_pubmed.tab"  # tab-separated table loaded with pandas
UNI2DO_PATH = "data/do/uniprot_do.json"   # JSON mapping, flattened into (entry_ac, do_id) rows

In [37]:
# Parse the disease ontology
#
# Every term of the OBO file becomes one dict with (at most) these keys:
#   id          -> DOID number as a string (the "DOID:" prefix is dropped)
#   name        -> term name
#   descr       -> quoted text of the "def" tag
#   omim        -> identifier of the last OMIM xref seen for the term
#   is_a        -> list of parent DOID numbers
#   is_obsolete -> True when the tag says so (such terms are not stored)
do = {}  # { do_id : do_object }
obj = {}  # term currently being assembled
with open(DO_ID_PATH_OBO) as f:
    for line in f:
        # Split on the FIRST ": " only. Values may contain ": " themselves
        # (definitions, synonyms, parent names after "!"); such lines must
        # neither be dropped nor mistaken for the end of the term.
        k, sep, v = line.strip().partition(": ")

        if sep:
            if k == "id" and v.startswith("DOID:"):
                obj["id"] = v.split(":")[1]
            elif k == 'def':
                # The definition text is the first double-quoted part of the value;
                # fall back to the raw value when it is not quoted.
                quoted = v.split('"')
                obj['descr'] = quoted[1] if len(quoted) > 1 else v
            elif k == "xref" and "OMIM" in v:
                obj["omim"] = v.split(":")[1]
            elif k == "name":
                obj["name"] = v
            elif k == "is_a":
                # "DOID:175 ! vascular cancer" -> "175"
                obj.setdefault("is_a", []).append(v.split()[0].split(":")[1])
            elif k == "is_obsolete" and v.lower().startswith("true"):
                obj["is_obsolete"] = True
        else:
            # Blank line or stanza header such as "[Term]": close the current term.
            if obj.get("id") and not obj.get("is_obsolete"):
                do[obj["id"]] = obj
            obj = {}

# The file may end without a trailing blank line; keep the last term as well.
if obj.get("id") and not obj.get("is_obsolete"):
    do[obj["id"]] = obj
obj = {}

name_do = {}  # { term name : do_id }; the first term wins when a name repeats
for k, v in do.items():
    if "name" in v:
        name_do.setdefault(v['name'], k)

In [70]:
# Tabular view of the ontology: the dict of terms is loaded with one column per
# term and then flipped, so each row is a DO term and each column a parsed field.
do_df = pd.DataFrame(do).T
do_df.head()

Unnamed: 0,id,name,descr,is_a,omim
1816,1816,angiosarcoma,A vascular cancer that derives_from the cells ...,[175],
2116,2116,pterygium,A corneal disease that is characterized by a t...,[10124],
14667,14667,disease of metabolism,A disease that involving errors in metabolic p...,[4],
40001,40001,shrimp allergy,A crustacean allergy that has_allergic_trigger...,[0060524],
40002,40002,aspirin allergy,A drug allergy that has_allergic_trigger acety...,[0060500],


In [38]:
# Table with the columns 'Entry' and 'PubMed ID' used below; every column is kept as text.
pubmed = pd.read_csv(PUBMED_PATH, sep='\t', dtype=str)
pubmed.head()

Unnamed: 0,Entry,Entry name,PubMed ID
0,Q96IY4,CBPB2_HUMAN,1939207; 14702039; 15057823; 15489334; 1057498...
1,P22362,CCL1_HUMAN,2809212; 2212659; 14702039; 15489334; 15340161...
2,Q8NCR9,CLRN3_HUMAN,17974005; 15164054; 15489334; 23033978
3,Q8IUK8,CBLN2_HUMAN,14702039; 12975309; 15489334
4,Q9BX69,CARD6_HUMAN,15489334; 22814378; 23186163


In [16]:
# Invert the table: { pubmed_id : list of UniProt entries that cite it }
uniprot_pmid = {}
# Pair every PubMed string with the entry of the SAME row. Counting through the
# NaN-filtered values and looking the entry up by that counter would shift every
# row after the first missing 'PubMed ID' onto the wrong protein.
cited = pubmed.dropna(subset=['PubMed ID'])
for entry, pmid_field in zip(cited['Entry'], cited['PubMed ID']):
    for key in pmid_field.strip().split('; '):
        key = key.strip(' ;')  # tolerate a stray trailing separator
        if key:
            uniprot_pmid.setdefault(key, []).append(entry)

In [50]:
# Map proteins to diseases: fetch the disease annotations that Europe PMC reports
# for the abstracts of the cited articles and keep those whose exact text is the
# name of a DO term.
pmids = list(uniprot_pmid.keys())
diseases = {}  # { uniprot_id : list_of_diseases }
URL = "https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds"
# One session for all batches so the HTTP connection is reused between requests.
with requests.Session() as session:
    for i in tqdm(range(0, len(pmids), 8)):
        batch = pmids[i:i + 8]  # slicing already stops at the end of the list
        params = {"articleIds": ",".join("MED:{}".format(pmid) for pmid in batch), "type": "Diseases",
                  "section"   : "Abstract", "format": "JSON"}
        # Without a timeout a stalled connection blocks the loop forever.
        r = session.get(URL, params=params, timeout=60)
        # Fail loudly on an HTTP error instead of parsing an error page.
        r.raise_for_status()
        for ele in r.json():
            # Proteins citing this article; empty if the id is not one we asked for.
            proteins = uniprot_pmid.get(ele.get("extId"), [])
            for annotation in ele.get("annotations") or []:
                # The DO lookup does not depend on the protein, so do it once.
                doid = name_do.get(annotation.get("exact"))
                if doid is None:
                    continue
                for uniprot_id in proteins:
                    diseases.setdefault(uniprot_id, set()).add(doid)
diseases = {k: list(v) for k, v in diseases.items()}
diseases




  0%|                                                                                         | 0/9592 [00:00<?, ?it/s][A[A[A


  0%|                                                                                 | 1/9592 [00:00<38:09,  4.19it/s][A[A[A


  0%|                                                                                 | 2/9592 [00:00<37:25,  4.27it/s][A[A[A


  0%|                                                                                 | 3/9592 [00:00<36:37,  4.36it/s][A[A[A


  0%|                                                                                 | 4/9592 [00:00<36:26,  4.38it/s][A[A[A


  0%|                                                                                 | 5/9592 [00:01<35:50,  4.46it/s][A[A[A


  0%|                                                                                 | 6/9592 [00:01<34:24,  4.64it/s][A[A[A


  0%|                                                                           

KeyboardInterrupt: 

In [45]:
# Tab-separated table loaded with every column kept as text.
go_human = pd.read_csv('data/go/go.csv', sep='\t', dtype=str)
go_human.head()

Unnamed: 0,entry_ac,go_id,go_descr
0,Q9Y263,5623,cell
1,Q9Y263,30054,cell junction
2,Q9Y263,5737,cytoplasm
3,Q9Y263,70062,extracellular exosome
4,Q9Y263,5634,nucleus


In [83]:
# Attach the DO term name and description to every (protein, disease) pair.
# NOTE(review): `do_human` is created by the cell that loads UNI2DO_PATH, which
# appears further down in this notebook; that cell has to run before this one.
do_human_df = pd.merge(left=do_human, right=do_df, how='inner', left_on='do_id', right_on='id')
# Select the columns by name rather than by position, so a different column
# order in `do_df` cannot silently change what is written to disk.
do_human_df = do_human_df[['entry_ac', 'do_id', 'name', 'descr']]
# Equal row counts mean the inner join dropped no pair.
print(do_human_df.shape)
print(do_human.shape)
do_human_df.to_csv('data/do/do_human.csv', sep='\t')
# Preview last: only the final expression of a cell is displayed.
do_human_df.sort_values(by=['entry_ac', 'do_id']).head()

(49311, 4)
(49311, 2)


In [78]:
# Load the { accession : [do_id, ...] } mapping and flatten it into one row per
# (accession, do_id) pair.
with open(UNI2DO_PATH) as fh:
    do_human_dict = json.load(fh)

pairs = [
    [accession, doid]
    for accession, doids in do_human_dict.items()
    for doid in doids
]
do_human = pd.DataFrame(pairs, columns=['entry_ac', 'do_id'])
do_human.sort_values(by=['entry_ac', 'do_id']).head()

Unnamed: 0,entry_ac,do_id
21824,A0A075B6H7,50460
21822,A0A075B6H7,60058
21825,A0A075B6H7,80322
21821,A0A075B6H7,1240
21823,A0A075B6H7,12858
