In [2]:
%load_ext autoreload
%autoreload 2

import obonet
import ujson
import sys
import logging
# import urllib
import pandas as pd
# import pronto
# import owlready2



from tqdm.auto import tqdm
from collections import defaultdict, Counter
# from bigbio.dataloader import BigBioConfigHelpers
# from pronto import Ontology
# from nxontology import NXOntology
# from nxontology.imports import pronto_to_multidigraph, multidigraph_to_digraph, from_file



sys.path.insert(0, '..')
from umls_utils import UmlsMappings
from bigbio_utils import load_dataset_df
# from bigbio_utils import dataset_to_df



logger = logging.getLogger()
logger.setLevel(logging.INFO)

tqdm.pandas()
pd.set_option('display.max_rows', 200)

# conhelps = BigBioConfigHelpers()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
def extract_synonyms(data):
    if 'synonym' in data:
        return [syn.split('"')[1]for syn in data['synonym']]
    else:
        return []

def extract_definition(data):
    if 'def' in data:
        return data['def'].strip('"').split('"')[0]
        
    else:
        return ''
        
def term_to_synonyms(graph, filter_prefix=None):
    node_dict = {}
    for curie, data in tqdm(graph.nodes(data=True)):
        if filter_prefix is not None:
            if not curie.startswith(filter_prefix):
                continue
        if 'name' not in data:
            # print(f'Missing name.  CURIE: {curie}, data: {data}')
        # if 'synonym' not in data:
            # print(f'Missing synonym.  CURIE: {curie}, data: {data}')
            synonyms = extract_synonyms(data)
        else:
            synonyms = [data['name']] + extract_synonyms(data)
            node_dict[curie] = synonyms

    return node_dict

def term_to_definitions(graph, filter_prefix=None):
    node_dict = {}
    for curie, data in tqdm(graph.nodes(data=True)):
        if filter_prefix is not None:
            if not curie.startswith(filter_prefix):
                continue
        else:
            definition = extract_definition(data)
            node_dict[curie] = definition

    return node_dict

In [29]:
# url = 'http://purl.obolibrary.org/obo/cl.obo'

urls = {
    'doid':'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo',
    'cl': 'http://purl.obolibrary.org/obo/cl.obo',
    'uberon': 'http://purl.obolibrary.org/obo/uberon.obo',
    'chebi':'http://purl.obolibrary.org/obo/chebi.obo',
    'obi':'http://purl.obolibrary.org/obo/obi.obo',
    'pr':'http://purl.obolibrary.org/obo/pr.obo',
    'cvcl':'https://ftp.expasy.org/databases/cellosaurus/cellosaurus.obo',
    'go':'http://purl.obolibrary.org/obo/go.obo',
}

types = {
    'doid':'disease',
    'cl': 'cell line & type',
    'uberon': 'tissue',
    'chebi':'chemical',
    'obi':'assay',
    'pr':'protein',
    'cvcl': 'cell line & type',
    'go': 'subcellular'
}

names = {
    'doid':'disease ontology',
    'cl': 'cell ontology',
    'uberon': 'uberon',
    'chebi':'chemicals of biological interest',
    'obi':'ontology of biological investigations',
    'pr':'protein ontology',
    'cvcl': 'cellosaurus',
    'go': 'gene ontology'
}

filter_prefixes = {
    'doid': None,
    'cl': 'CL',
    'uberon': 'UBERON',
    'chebi': None,
    'obi':'OBI',
    'pr':'PR',
    'cvcl': None,
    'go': None,
}


to_add = [
    'mesh','pubchem','uniprot','ncbigene','ncbi_taxon','rfam','bao'
]

In [35]:
all_synonyms = {}
all_definitions = {}
# url = 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo'
for ontology_id, url in urls.items():
    print(ontology_id)
    ontology = obonet.read_obo(url)
    # ontology = from_file(url)
    # cl = obonet.read_obo('../data/soda/doid.owl')
    synonyms = term_to_synonyms(ontology, filter_prefix=filter_prefixes[ontology_id])
    definitions = term_to_definitions(ontology, filter_prefixes[ontology_id])
    print(Counter(key.split(':')[0] for key in synonyms))

    all_synonyms[ontology_id] = synonyms
    all_definitions[ontology_id] = definitions

with open('../../soda_ontologies/obo_synonyms.json', 'w') as f:
    f.write(ujson.dumps(all_synonyms))

with open('../../soda_ontologies/obo_definitions.json', 'w') as f:
    f.write(ujson.dumps(all_definitions))

MultiDiGraph named 'doid' with 11367 nodes and 15777 edges


  0%|          | 0/11367 [00:00<?, ?it/s]

  0%|          | 0/11367 [00:00<?, ?it/s]

Counter({'DOID': 11367})
MultiDiGraph named 'doid' with 11367 nodes and 15777 edges


  0%|          | 0/15979 [00:00<?, ?it/s]

  0%|          | 0/15979 [00:00<?, ?it/s]

Counter({'CL': 2721})
MultiDiGraph named 'cl' with 15979 nodes and 46853 edges


  0%|          | 0/25018 [00:00<?, ?it/s]

  0%|          | 0/25018 [00:00<?, ?it/s]

Counter({'UBERON': 14457})
MultiDiGraph named 'uberon' with 25018 nodes and 77050 edges


  0%|          | 0/164954 [00:00<?, ?it/s]

  0%|          | 0/164954 [00:00<?, ?it/s]

Counter({'CHEBI': 164954})
MultiDiGraph named 'chebi' with 164954 nodes and 328942 edges


  0%|          | 0/4799 [00:00<?, ?it/s]

  0%|          | 0/4799 [00:00<?, ?it/s]

Counter({'OBI': 3896})
MultiDiGraph named 'obi' with 4799 nodes and 5464 edges


  0%|          | 0/329448 [00:00<?, ?it/s]

  0%|          | 0/329448 [00:00<?, ?it/s]

Counter({'PR': 226975})
MultiDiGraph named 'pr' with 329448 nodes and 843091 edges




  0%|          | 0/146062 [00:00<?, ?it/s]

  0%|          | 0/146062 [00:00<?, ?it/s]

Counter({'CVCL_B0T9': 1, 'CVCL_B0T8': 1, 'CVCL_E548': 1, 'CVCL_KA96': 1, 'CVCL_IW91': 1, 'CVCL_B375': 1, 'CVCL_X345': 1, 'CVCL_C4ZN': 1, 'CVCL_E549': 1, 'CVCL_G217': 1, 'CVCL_VG99': 1, 'CVCL_6479': 1, 'CVCL_6480': 1, 'CVCL_S361': 1, 'CVCL_4977': 1, 'CVCL_3627': 1, 'CVCL_3628': 1, 'CVCL_WM09': 1, 'CVCL_B5B3': 1, 'CVCL_E557': 1, 'CVCL_ZB51': 1, 'CVCL_VG31': 1, 'CVCL_VG32': 1, 'CVCL_ZW87': 1, 'CVCL_ZW88': 1, 'CVCL_ZW89': 1, 'CVCL_XB90': 1, 'CVCL_6494': 1, 'CVCL_ZB52': 1, 'CVCL_ZB53': 1, 'CVCL_A2TJ': 1, 'CVCL_A2UI': 1, 'CVCL_A2UJ': 1, 'CVCL_A2UK': 1, 'CVCL_A2UL': 1, 'CVCL_A2TK': 1, 'CVCL_A2UM': 1, 'CVCL_A2UN': 1, 'CVCL_A2UP': 1, 'CVCL_A2UQ': 1, 'CVCL_A2TL': 1, 'CVCL_A2UR': 1, 'CVCL_A2US': 1, 'CVCL_A2UT': 1, 'CVCL_A2TM': 1, 'CVCL_A2UU': 1, 'CVCL_A2UV': 1, 'CVCL_A2UW': 1, 'CVCL_A2UX': 1, 'CVCL_IP58': 1, 'CVCL_IJ15': 1, 'CVCL_C7W5': 1, 'CVCL_YZ96': 1, 'CVCL_YZ97': 1, 'CVCL_A6HV': 1, 'CVCL_WU68': 1, 'CVCL_V362': 1, 'CVCL_C4MD': 1, 'CVCL_C4ME': 1, 'CVCL_C4MF': 1, 'CVCL_C4MG': 1, 'CVCL_C4MH': 1,

  0%|          | 0/42887 [00:00<?, ?it/s]

  0%|          | 0/42887 [00:00<?, ?it/s]

Counter({'GO': 42887})


# Load BioID Dataset

In [45]:
df = load_dataset_df('bioid')



  0%|          | 0/1 [00:00<?, ?it/s]

In [49]:
Counter([y.split(':')[0] for x in df.db_ids for y in x])

Counter({'Cellosaurus': 777,
         'PubChem': 95,
         'CHEBI': 1456,
         'Uniprot': 3698,
         'NCBI gene': 2397,
         'GO': 931,
         'NCBI taxon': 1058,
         'CL': 612,
         'Uberon': 701,
         'Rfam': 19,
         'Corum': 3,
         'BAO': 1})

In [52]:
df[df.db_ids.map(lambda x: len(x) > 1)]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
6,1083858,"[[441, 444]]",TfR,[protein],"[Uniprot:Q9UP52, Uniprot:P02786]",train,1083858.7
34,1183526,"[[133, 136]]",PCs,"[cell, protein]","[CL:0000121, Uniprot:Q61315]",train,1183526.10
63,1369248,"[[73, 76]]",RAR,[protein],"[Uniprot:P10826, Uniprot:P10276, Uniprot:P13631]",train,1369248.5
65,1369248,"[[85, 88]]",DR5,[protein],"[Uniprot:P28702, Uniprot:P19793, Uniprot:P48443]",train,1369248.7
59,1369248,"[[329, 332]]",RAR,[protein],"[Uniprot:P10826, Uniprot:P10276, Uniprot:P13631]",train,1369248.12
...,...,...,...,...,...,...,...
10275,5293153,"[[37, 40]]",AKT,[protein],"[Uniprot:Q60823, Uniprot:P31750]",train,5293153.2
10355,5341517,"[[3, 7]]",U2OS,"[gene, cellline]","[Cellosaurus:CVCL_0042, NCBI gene:11011]",train,5341517.1
10389,5341523,"[[49, 54]]",SYCP3,[protein],"[Uniprot:Q62209, Uniprot:P70281]",train,5341523.3
10395,5341524,"[[21, 24]]",Eng,[protein],"[Uniprot:P31533, Uniprot:Q04896, Uniprot:P09015]",train,5341524.2


In [53]:
df.shape

(10425, 7)

# Get list of species in BioID XML files

In [3]:
import re
import glob
import xml.etree.ElementTree as ET

In [4]:
files = glob.glob('../../soda_xml/xml/*')
all_tax_ids = []
for f in files:
    text = open(f).read()
    pattern = re.compile(r'ext_tax_ids="(\d{1,11})"')
    tax_ids = re.findall(pattern, text)
    # print(tax_ids)
    all_tax_ids.extend(tax_ids)
    
len(set(all_tax_ids))

443

In [5]:
all_tax_ids = []
for f in tqdm(files):
    text = open(f).read()
    # pattern = re.compile(r'ext_tax_ids="(\d{1,11})"')
    # tax_ids = re.findall(pattern, text)
    # # print(tax_ids)
    # all_tax_ids.extend(tax_ids)

    xml_content = text

    # Parse the XML content
    root = ET.fromstring(xml_content)

    # Find all sd-tag elements with entity_type attribute set to "gene"
    gene_tags = [tag for tag in root.findall('.//sd-tag[@entity_type="gene"]')]

    # Extract the ext_tax_ids values from these tags
    # try:
    tax_ids = [int(y) for tag in gene_tags if tag.get('ext_tax_ids') for y in tag.get('ext_tax_ids').split('///')]

        # ext_tax_ids_values = [int(tag.get('ext_tax_ids')) for tag in gene_tags if tag.get('ext_tax_ids')]

    # except:
    # print([tag.get('ext_tax_ids') for tag in gene_tags if tag.get('ext_tax_ids')])

    # Add to list
    all_tax_ids.extend(tax_ids)

# Counter(all_tax_ids)
len(set(all_tax_ids))

  0%|          | 0/3213 [00:00<?, ?it/s]

173

In [6]:
with open('../data/soda_taxa.json', 'w') as f:
    f.write(ujson.dumps(list(set(all_tax_ids)), indent=2))