# Making a Custom Grounder for Gilda

This tutorial presents several ways of generating custom groundings that can be used with Gilda.

In [1]:
import gilda
import gilda.term
from tabulate import tabulate
from gilda.process import normalize
import pandas as pd
import time
import sys
from tqdm.auto import tqdm

In [2]:
# Record the Python interpreter version that produced the saved outputs
print(sys.version)

3.10.2 (main, Feb  2 2022, 06:19:27) [Clang 13.0.0 (clang-1300.0.29.3)]


In [3]:
# Record the local date and time at which the notebook was executed
print(time.asctime())

Tue Apr 26 17:48:13 2022


In [4]:
def matches_df(scored_matches) -> pd.DataFrame:
    """Tabulate grounding results as a data frame.

    Each element of ``scored_matches`` contributes one row made of the
    key/value pairs returned by its ``term.to_json()`` plus a ``score``
    column holding its ``score`` attribute.
    """
    records = []
    for match in scored_matches:
        # Copy the term's JSON fields so the score can be added next to them
        record = dict(match.term.to_json())
        record['score'] = match.score
        records.append(record)
    return pd.DataFrame(records)

# Custom Terms from an Ontology via *OBO Graph JSON*

Many ontologies are pre-parsed into the [OBO Graph JSON](https://github.com/geneontology/obographs) format, which is readily usable without ontology-specific software. In this example, we get the URL for an OBO Graph JSON for the [Monarch Disease Ontology (MONDO)](https://obofoundry.org/ontology/mondo) then generate Gilda terms for its entries based on their names and synonyms.

In [5]:
import requests
    
def terms_from_obograph_url(url, prefix, uri_prefix):
    """Download an OBO Graph JSON document and build Gilda terms from it.

    Parameters
    ----------
    url : str
        Location of the OBO Graph JSON document.
    prefix : str
        Namespace written to the ``db`` and ``source`` fields of every term.
    uri_prefix : str
        Only nodes whose ``id`` starts with this string are kept; the
        remainder of the ``id`` becomes the term identifier.

    Returns
    -------
    list of gilda.term.Term
        One term with status ``"name"`` per labelled node, followed by one
        term with status ``"synonym"`` for each of its non-empty synonyms.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    response = requests.get(url)
    # Surface HTTP failures directly instead of as a JSON decoding error
    response.raise_for_status()
    res = response.json()

    custom_terms = []
    missing_label = 0

    # Only the first graph in the document is read
    for node in tqdm(res['graphs'][0]['nodes'], unit_scale=True, unit="node"):
        uri = node['id']
        if not uri.startswith(uri_prefix):
            continue  # skip imported terms

        identifier = uri[len(uri_prefix):]

        name = node.get('lbl')
        if not name:
            # An absent or empty label cannot be grounded against
            missing_label += 1
            continue

        custom_terms.append(gilda.term.Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status="name",
            source=prefix,
        ))
        for synonym_data in node.get('meta', {}).get('synonyms', []):
            synonym = synonym_data.get('val')
            if not synonym:
                continue  # skip synonym records without usable text
            custom_terms.append(gilda.term.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="synonym",
                source=prefix,
            ))

    print(f"{missing_label:,} nodes were missing labels")
    return custom_terms

In [6]:
# Location of the OBO Graph JSON build of MONDO
mondo_url = 'http://purl.obolibrary.org/obo/mondo.json'
# Namespace stored in the `db` and `source` fields of every generated term
mondo_prefix = "MONDO"
# Nodes whose URI starts with this prefix are kept; the rest of the URI is used as the identifier
mondo_uri_prefix = "http://purl.obolibrary.org/obo/MONDO_"

# Download the graph and build one term per label and per synonym
mondo_terms = terms_from_obograph_url(mondo_url, mondo_prefix, mondo_uri_prefix)
# Build a grounder from the custom terms and print its summary
mondo_grounder = gilda.make_grounder(mondo_terms)
mondo_grounder.print_summary()

  0%|          | 0.00/44.3k [00:00<?, ?node/s]

74 nodes were missing labels
Lookups: 112,921
Terms: 128,873
Term Namespaces: {'MONDO'}
Term Statuses: {'name': 24907, 'synonym': 103966}
Adeft Disambiguators: 202
Gilda Disambiguators: 1,008



In [7]:
# Ground a disease name with the MONDO grounder and tabulate the scored matches
matches_df(mondo_grounder.ground("alzheimer disease"))

Unnamed: 0,norm_text,text,db,id,entry_name,status,source,score
0,alzheimer disease,Alzheimer disease,MONDO,4975,Alzheimer disease,name,MONDO,0.771593
1,alzheimer disease,Alzheimer disease,MONDO,7088,Alzheimer disease type 1,synonym,MONDO,0.549371


In [8]:
# Same query in the possessive spelling; in the recorded output it matches via a synonym only
matches_df(mondo_grounder.ground("alzheimer's disease"))

Unnamed: 0,norm_text,text,db,id,entry_name,status,source,score
0,alzheimer's disease,Alzheimer's disease,MONDO,4975,Alzheimer disease,synonym,MONDO,0.511647


# Custom Terms from an Ontology via `obonet`

Many ontologies are available in the text-based OBO format which can be parsed with tools like [`obonet`](https://github.com/dhimmel/obonet) into a [`networkx`](https://networkx.org/) graph object. In this example, we get the OBO document for the [Uber Anatomy Ontology (UBERON)](https://obofoundry.org/ontology/uberon) then generate Gilda terms for its entries based on their names and synonyms by traversing the nodes corresponding to UBERON terms in the graph object.

In [9]:
import obonet

def terms_from_obo_url(url, prefix):
    """Read an OBO document with ``obonet`` and build Gilda terms from it.

    Parameters
    ----------
    url : str
        Location of the OBO document, passed to ``obonet.read_obo``.
    prefix : str
        Only nodes named ``"<prefix>:<identifier>"`` are kept. The prefix is
        also written to the ``db`` and ``source`` fields of every term.

    Returns
    -------
    list of gilda.term.Term
        One term with status ``"name"`` per named node, followed by one
        term with status ``"synonym"`` for each synonym that can be parsed.
    """
    g = obonet.read_obo(url)
    custom_obo_terms = []
    for node, data in tqdm(g.nodes(data=True), unit_scale=True, unit="node"):
        # Skip entries imported from other ontologies
        if not node.startswith(f"{prefix}:"):
            continue

        identifier = node.removeprefix(f"{prefix}:")

        # Use .get() so nodes carrying no "name" attribute are skipped
        # instead of raising KeyError
        name = data.get("name")
        if not name:
            continue
        custom_obo_terms.append(gilda.term.Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status="name",
            source=prefix,
        ))

        # Add terms for all synonyms
        for synonym_raw in data.get("synonym", []):
            try:
                # Take the text between the first pair of double quotes
                synonym = synonym_raw.split('"')[1].strip()
            except IndexError:
                continue  # the synonym was malformed
            if not synonym:
                continue
            custom_obo_terms.append(gilda.term.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="synonym",
                source=prefix,
            ))
    return custom_obo_terms

In [10]:
# Namespace used to select UBERON's own nodes and stored on every generated term
uberon_prefix = "UBERON"
# Location of the OBO document to parse
uberon_url = "http://purl.obolibrary.org/obo/uberon/basic.obo"

# Parse the OBO document and build one term per name and per synonym
uberon_terms = terms_from_obo_url(uberon_url, uberon_prefix)

# Build a grounder from the custom terms and print its summary
uberon_grounder = gilda.make_grounder(uberon_terms)
uberon_grounder.print_summary()

  0%|          | 0.00/14.5k [00:00<?, ?node/s]

Lookups: 49,677
Terms: 53,613
Term Namespaces: {'UBERON'}
Term Statuses: {'name': 14224, 'synonym': 39389}
Adeft Disambiguators: 202
Gilda Disambiguators: 1,008



In [11]:
# Ground an anatomical term with the UBERON grounder and tabulate the scored matches
matches_df(uberon_grounder.ground("neck"))

Unnamed: 0,norm_text,text,db,id,entry_name,status,source,score
0,neck,neck,UBERON,974,neck,name,UBERON,0.777778


## Example with NCBITaxon

While NCBITaxon itself isn't curated as an ontology, the OBO Foundry community maintains an export into the OWL, OBO, and OBO Graph JSON formats. 

In [12]:
# Location of the OBO export of NCBITaxon
ncbitaxon_url = "http://purl.obolibrary.org/obo/ncbitaxon.obo"
# Namespace used to select NCBITaxon's own nodes and stored on every generated term
ncbitaxon_prefix = "NCBITaxon"

# NOTE: the recorded run iterated over roughly 2.4M nodes, so this step is presumably slow
ncbitaxon_terms = terms_from_obo_url(ncbitaxon_url, ncbitaxon_prefix)

# Build a grounder from the custom terms and print its summary
ncbitaxon_grounder = gilda.make_grounder(ncbitaxon_terms)
ncbitaxon_grounder.print_summary()

  0%|          | 0.00/2.40M [00:00<?, ?node/s]

Lookups: 2,709,497
Terms: 2,721,934
Term Namespaces: {'NCBITaxon'}
Term Statuses: {'name': 2401826, 'synonym': 320108}
Adeft Disambiguators: 202
Gilda Disambiguators: 1,008



In [19]:
# Query written without the period.
# NOTE(review): no output is recorded for this cell in the saved notebook - presumably
# no match was found, since the synonym's normalized text keeps the period ("e. coli");
# confirm by re-running.
matches_df(ncbitaxon_grounder.ground("e coli"))

In [18]:
# Query written with the period; in the recorded output it matches via a synonym
matches_df(ncbitaxon_grounder.ground("e. coli"))

Unnamed: 0,norm_text,text,db,id,entry_name,status,source,score
0,e. coli,E. coli,NCBITaxon,562,Escherichia coli,synonym,NCBITaxon,0.511647


# Custom Terms from PyOBO

Many biomedical resources curate terms with labels and synonyms that would be useful for generating a custom grounder with Gilda, but they don't live in ontologies in the OWL, OBO, or OBO Graph JSON formats. [PyOBO](https://github.com/pyobo/pyobo) provides a unified way to access and process many resources in an ontology-like way. In this example, several pathway databases are loaded for grounding including Reactome, WikiPathways, PathBank, and the Pathway Ontology (which itself actually is an ontology).

In [14]:
import pyobo
import pyobo.api.utils

# Record the installed PyOBO version that produced the saved outputs
print(pyobo.get_version())

0.7.0


In [15]:
# Accumulates the Gilda terms from every pathway resource listed below
custom_pathway_terms = []

# Resource prefixes to load through PyOBO
prefixes = [
    "reactome", 
    "wikipathways", 
    "pw",  # Pathway ontology
    "pathbank",
]

# Repeat the steps for several pathway resources
for prefix in prefixes:
    # The version is only used in the progress message printed below
    version = pyobo.api.utils.get_version(prefix)
    # Mapping from identifier to standard label
    names = pyobo.get_id_name_mapping(prefix)
    # Mapping from identifier to a collection of synonyms
    synonyms = pyobo.get_id_synonyms_mapping(prefix)
    print(
        f"{prefix} v{version}, {len(names):,} names, {sum(len(v) for v in synonyms.values()):,} synonyms"
    )

    for identifier, name in names.items():
        # Create a Gilda term for the standard label
        custom_pathway_terms.append(gilda.Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status="name",
            source=prefix,
        ))
        
        # Create a Gilda term for each synonym
        # (identifiers without an entry in `synonyms` contribute none)
        for synonym in synonyms.get(identifier, []):
            custom_pathway_terms.append(gilda.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="synonym",
                source=prefix,
            ))



reactome v80, 21,423 names, 0 synonyms
wikipathways v20220410, 1,718 names, 0 synonyms
pw v2019-10-23, 2,600 names, 1,957 synonyms
pathbank v2.0, 110,242 names, 0 synonyms


In [16]:
# Generate a grounder using a list of Gilda terms
custom_pathway_grounder = gilda.make_grounder(custom_pathway_terms)
custom_pathway_grounder.print_summary()

Lookups: 76,499
Terms: 137,940
Term Namespaces: {'pw', 'reactome', 'wikipathways', 'pathbank'}
Term Statuses: {'name': 135983, 'synonym': 1957}
Adeft Disambiguators: 202
Gilda Disambiguators: 1,008



In [17]:
# Ground a pathway name; in the recorded output several Reactome identifiers share this exact name and score
matches_df(custom_pathway_grounder.ground("apoptosis"))

Unnamed: 0,norm_text,text,db,id,entry_name,status,source,score
0,apoptosis,Apoptosis,reactome,R-BTA-109581,Apoptosis,name,reactome,0.762317
1,apoptosis,Apoptosis,reactome,R-CEL-109581,Apoptosis,name,reactome,0.762317
2,apoptosis,Apoptosis,reactome,R-CFA-109581,Apoptosis,name,reactome,0.762317
3,apoptosis,Apoptosis,reactome,R-DDI-109581,Apoptosis,name,reactome,0.762317
4,apoptosis,Apoptosis,reactome,R-DME-109581,Apoptosis,name,reactome,0.762317
5,apoptosis,Apoptosis,reactome,R-DRE-109581,Apoptosis,name,reactome,0.762317
6,apoptosis,Apoptosis,reactome,R-GGA-109581,Apoptosis,name,reactome,0.762317
7,apoptosis,Apoptosis,reactome,R-HSA-109581,Apoptosis,name,reactome,0.762317
8,apoptosis,Apoptosis,reactome,R-MMU-109581,Apoptosis,name,reactome,0.762317
9,apoptosis,Apoptosis,reactome,R-PFA-109581,Apoptosis,name,reactome,0.762317
