## This notebook pulls mappings from NCBO Bioportal

mappings endpoint documentation: https://data.bioontology.org/documentation#Mapping

sample code: https://gist.github.com/callahantiff/a28fb3160782f42f104e9ec41553af0d

NCBO sample code: https://github.com/ncbo/ncbo_rest_sample_code


In [1]:
import requests
import json
import os
import pandas as pd
import time

In [2]:
script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
result_path = os.path.join(script_path,'result')

In [3]:
import urllib.request, urllib.error, urllib.parse

## Load the API key
with open(os.path.join(script_path,'config.json'),'rb') as keyfile:
    keyinfo = json.load(keyfile)
    apikey = keyinfo['apikey']

## Format the apikey for the header
def get_json(url, apikey):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + apikey)]
    return json.loads(opener.open(url).read())

## Provide the list of ontologies to map
onto_list = ["BAO","OBI","EFO","NCIT","EDAM","MMO","CHMO"]

## Pull mapped pairs out of a paginated dictionary
def get_mappings(onto_source,page_dict):
    mappinglist = []
    for eachcollection in page_dict['collection']:
        tmpdict = {'source_ontology': onto_source,
                   'source_id': eachcollection['classes'][0]['@id'],
                   'map_method': eachcollection['source'],
                   'target_id': eachcollection['classes'][1]['@id']}
        mappinglist.append(tmpdict)
    return mappinglist

In [1]:
## filter the results to only mappings within ontologies of interest
def filter_for_ontos(onto_list,mappingdf):
    relevant_df = pd.DataFrame(columns=mappingdf.columns.tolist())
    source_ont = mappingdf.iloc[0]['source_ontology']
    target_list = [x for x in onto_list if x!=source_ont]
    for eachtarget in target_list:
        if eachtarget == "EDAM":
            target_subset = mappingdf.loc[mappingdf['target_id'].str.contains("edamontology.org/topic")]
        else:
            target_subset = mappingdf.loc[mappingdf['target_id'].str.contains(eachtarget)]
            if eachtarget == "MMO":
                tempsubset = target_subset.copy()
                target_subset = tempsubset.loc[~tempsubset['target_id'].str.contains("EMMO")]
        relevant_df = pd.concat(([relevant_df,target_subset]),ignore_index=True)
    return relevant_df

def download_mappings(apikey,onto_list,onto_subset,starting_page):
    for each_onto in onto_subset:
        print("now downloading mappings from: ",each_onto, "page: ",starting_page)
        allmappinglist = []
        ontologymap = f"https://data.bioontology.org/ontologies/{each_onto}/mappings"
        r = get_json(ontologymap,apikey)
        tmpmapping = get_mappings(each_onto, r)
        allmappinglist.extend(tmpmapping)
        if starting_page == 0:
            print(r["links"]["nextPage"])
            page = get_json(r['links']["nextPage"],apikey)
            print(page["links"]["nextPage"])
            tmpmapping = get_mappings(each_onto, page)
            allmappinglist.extend(tmpmapping)
        else:
            page = starting_page
            print(page["links"]["nextPage"])
            tmpmapping = get_mappings(each_onto, page)
            allmappinglist.extend(tmpmapping)
        next_page = page
        while next_page:
            next_page = page["links"]["nextPage"]
            tmpmapping = get_mappings(each_onto, page)
            allmappinglist.extend(tmpmapping)
            allmappingcopy = [x for x in allmappinglist]
            if next_page:
                page = get_json(next_page,apikey)
                print(page["links"]["nextPage"])
        mappingdf = pd.DataFrame(allmappingcopy)
        print(len(mappingdf))
        relevant_df = filter_for_ontos(onto_list,mappingdf)
        relevant_df.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_mappings.tsv"), sep='\t', header=True)


In [5]:
## get mapping statistics
ontology = onto_list[4]
print(ontology)
r = get_json(f"https://data.bioontology.org/mappings/statistics/ontologies/{ontology}",apikey)
print(r['EFO'])
print(r['OBI'])
print(r['CHMO'])
print(r['MMO'])
print(r['BAO'])
#print(r['EDAM'])

EDAM
30
36
11
3
41


In [None]:
## Confirm the reverse mappings
ontology = onto_list[6]
print(ontology)
r = get_json(f"https://data.bioontology.org/mappings/statistics/ontologies/{ontology}",apikey)
print(r['EDAM'])

## Trouble-shooting
The mappings between EDAM "electron microscopy" and CHMO "electron microscopy" exists, but isn't being pulled by the current script. Investigate what's wrong
* The filter was not including EDAM properly
* The fixes to the downloader appeared to have worked

In [None]:
onto_subset = ["MMO"]
starting_page = 0
for each_onto in onto_subset:
    print("now downloading mappings from: ",each_onto, "page: ",starting_page)
    allmappinglist = []
    ontologymap = f"https://data.bioontology.org/ontologies/{each_onto}/mappings"
    r = get_json(ontologymap,apikey)
    print(r['links']['nextPage'])
    if starting_page == 0:
        page = get_json(r['links']["nextPage"],apikey)
        #print(page['links']['nextPage'])
    else:
        break

In [None]:
%%time

## Download all mappings from all ontologies of interest
onto_list = ["MMO","CHMO","BAO","OBI","EDAM","EFO","NCIT"]
onto_subset = ["MMO","CHMO","BAO","OBI","EDAM"]
download_mappings(apikey,onto_list,onto_subset,0)


### Download mappings from specific ontologies of interest

In [None]:
%%time
## Download extensive mappings
each_onto = "EFO"#onto_list[4]
i = 0
allmappinglist = []
relevant_df = pd.DataFrame(columns=["source_ontology","source_id","map_method","target_id"])
ontologymap = f"https://data.bioontology.org/ontologies/{each_onto}/mappings"
r = get_json(ontologymap,apikey)
starting_page = 0
tmpmapping = get_mappings(each_onto, r)
allmappinglist.extend(tmpmapping)
if starting_page == 0:
    print(r["links"]["nextPage"])
    page = get_json(r['links']["nextPage"],apikey)
    print(page["links"]["nextPage"])
    tmpmapping = get_mappings(each_onto, page)
    allmappinglist.extend(tmpmapping)
else:
    page = starting_page
    print(page["links"]["nextPage"])
    tmpmapping = get_mappings(each_onto, page)
    allmappinglist.extend(tmpmapping)

In [None]:
## Save current progress to resume later
from datetime import datetime
import json
current_progress = {"page":page,
                     "next_page": next_page,
                     "i":i,
                     "each_onto":each_onto,
                     "allmappinglist": allmappinglist}

progress_date = datetime.strftime(datetime.now(),"%Y-%m-%d")

with open(os.path.join(result_path,"mappings",f"{each_onto}_progress({progress_date}).json"), "w") as outwrite:
    outwrite.write(json.dumps(current_progress))

In [None]:
## Resume progress
from datetime import datetime
import json
each_onto = "EFO" #onto_list[4]
progress_date = "2024-10-19"
with open(os.path.join(result_path,"mappings",f"{each_onto}_progress({progress_date}).json"), "r") as infile:
    parameter_dict = json.load(infile)

print(parameter_dict)
page = parameter_dict["page"]
next_page = parameter_dict["next_page"]
i = parameter_dict["i"]
each_onto = parameter_dict["each_onto"]
allmappinglist = parameter_dict["allmappinglist"]

In [None]:
print(next_page)

In [None]:
%%time
while next_page:
    next_page = page["links"]["nextPage"]
    tmpmapping = get_mappings(each_onto, page)
    allmappinglist.extend(tmpmapping)
    allmappingcopy = [x for x in allmappinglist]
    if next_page:
        page = get_json(next_page,apikey)
        #print(page["links"]["nextPage"])
    if len(allmappinglist) >= 5000: ## Export when the number of mappings exceeds 1000
        i=i+1
        mappingdf = pd.DataFrame(allmappinglist)
        mappingdf.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_all_mappings_{str(i)}.tsv"), sep='\t', header=True)
        tmp_df = filter_for_ontos(onto_list,mappingdf)
        if len(tmp_df) > 0:
            relevant_df = pd.concat((relevant_df,tmp_df), ignore_index = True)
        allmappinglist = [] ## Reset the mappings list
        print(i, " file dumped. Current page: ",next_page)

mappingdf = pd.DataFrame(allmappingcopy)
print(len(mappingdf))
relevant_df = filter_for_ontos(onto_list,mappingdf)        
relevant_df.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_relevant_mappings.tsv"), sep='\t', header=True)

### Download mappings for specific nodes

In [None]:
## Get list of measTechOnly terms
script_path = os.getcwd()
parent_path = os.path.abspath(os.path.join(script_path, os.pardir))
measTechOnlyFile = os.path.join(parent_path,"measTechNet","results","measTechOnly_mapped_triples.tsv")
measTechOnlyDF = pd.read_csv(measTechOnlyFile)
measTechNodes = list(set(measTechOnlyDF['subject_id'].unique().tolist()).union(set(measTechOnlyDF['object_id'].unique().tolist())))

mappings_df = pd.DataFrame(columns=["source_ontology","source_id","map_method","target_id"])
for eachonto in onto_list:
    allmappinglist = []
    if eachonto == "EDAM":
        classlist = [x for x in measTechNodes if "edamontology.org/topic" in x]
    else:
        classlist = [x for x in measTechNodes if eachonto in x]
        
    for eachclass in classlist:
        classmap = f"https://data.bioontology.org/ontologies/{eachonto}/classes/{eachclass}/mappings"
        r = get_json(ontologymap,apikey)
        page = starting_page
        tmpmapping = get_mappings(each_onto, page)
        allmappinglist.extend(tmpmapping)
        
    mappingdf = pd.DataFrame(allmappinglist)
    relevant_df = filter_for_ontos(onto_list,mappingdf)
    relevant_df.to_csv(os.path.join(result_path,"class-based_mappings",f"{each_onto}_mappings.tsv"), sep='\t', header=True)    

## Filtering general ontology mapping files

In [1]:
import requests
import json
import os
import pandas as pd
import time

In [2]:
script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
result_path = os.path.join(script_path,'result')
map_path = os.path.join(result_path,'mappings')

In [5]:
## Provide the list of ontologies to map
onto_list = ["BAO","OBI","EFO","NCIT","EDAM","MMO","CHMO"]

In [12]:
## load mapping files
onto_2_map = onto_list[2]
onto_file_list = os.listdir(map_path)
onto_only_list = [x for x in onto_file_list if onto_2_map in x]

allmappinglist = pd.read_csv(os.path.join(map_path,onto_only_list[0]),delimiter='\t',header=0,index_col=0)
for eachfile in onto_only_list:
    mappingdf = pd.read_csv(os.path.join(map_path,eachfile),delimiter='\t',header=0,index_col=0)
    try:
        tmpdf = filter_for_ontos(onto_list,mappingdf)
        if len(tmpdf)>0:
            allmappinglist = pd.concat((allmappinglist,tmpdf),ignore_index=True)
    except:
        print(eachfile)
clean_mappings = filter_for_ontos(onto_list,allmappinglist)
clean_mappings.drop_duplicates(keep="first",inplace=True)
print(len(clean_mappings))
clean_mappings.to_csv(os.path.join(map_path,f'{onto_2_map}_mappings.tsv'),sep='\t',header=True)

EFO_progress(2024-10-20).json
8177


In [None]:
tmpmapping = get_mappings(each_onto, page)
allmappinglist.extend(tmpmapping)
allmappingcopy = [x for x in allmappinglist]
    mappingdf = pd.DataFrame(allmappinglist)
    mappingdf.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_all_mappings_{str(i)}.tsv"), sep='\t', header=True)
    #tmp_df = filter_for_ontos(onto_list,mappingdf)

## Figuring out the data structure of a result from the API

The organization of the mappings appear to be as follows:
A result from the API ontology/mappings endpoint:
* r.keys:  dict_keys(['page', 'pageCount', 'totalCount', 'prevPage', 'nextPage', 'links', 'collection'])
  * r['collection'].keys():  dict_keys(['id', 'source', 'classes', 'process', '@id', '@type'])
    * r['collection'][0]['classes'].keys:  dict_keys(['@id', '@type', 'links', '@context'])

Where each pair of mapped terms appear to be listed under 'classes' with classes[0] being the term in the source ontology and classes[1] being the term from a different ontology

The source is how two terms were mapped

In [None]:
## Test to see that an API request is working
REST_URL = "http://data.bioontology.org"
term = "survey"
r = get_json(REST_URL + "/search?q=" + term,apikey)["collection"]
#print(r)

In [None]:
## Testing the class mapping endpoint
ontology_shorthand = 'BRO'
classurl = 'http%3A%2F%2Fbioontology.org%2Fontologies%2FBiomedicalResourceOntology.owl%23Ontology_Development_and_Management'
classmap = f"https://data.bioontology.org/ontologies/{ontology_shorthand}/classes/{classurl}/mappings"
r = get_json(classmap,apikey)
print('r[0].keys: ', r[0].keys())
print('r[0][classes][0].keys: ', r[0]['classes'][0].keys())
print('r[0][classes][0][links]: ', r[0]['classes'][0]['links'].keys())
print(r[0]['classes'][0]['links']['descendants'])

In [None]:
## Test the ontology mapping end point
ontology_shorthand = 'MMO'
classurl = 'http://purl.obolibrary.org/obo/MMO_0000000'
ontologymap = f"https://data.bioontology.org/ontologies/{ontology_shorthand}/mappings"
r = get_json(ontologymap,apikey)

print("r.keys: ",r.keys())
print("r['links']: ", r['links'])
print("r['page']: ", r['page'])
print("r['collection'].keys(): ", r['collection'][0].keys())
print("r['collection'][0]['classes'].keys: ", r['collection'][0]['classes'][0].keys())
print("collection id: ",r['collection'][0]["@id"])
print("class id: ", r['collection'][0]['classes'][1]["@id"])
print("class type: ", r['collection'][0]['classes'][1]["@type"])
print("class context: ", r['collection'][0]['classes'][1]["@context"])
print("number of classes: ",len(r['collection'][0]['classes']))

print(r['pageCount'])
for eachcollection in r['collection'][0:3]:
    #print(len(eachcollection['classes']))
    print(eachcollection['classes'][0]['@id'],eachcollection['source'],eachcollection['classes'][1]['@id'])
    print(eachcollection['classes'][0]['@context'],eachcollection['classes'][1]['@context'])

In [None]:
## Test the use of pagination
page = get_json(r['links']["nextPage"],apikey)
allmappinglist = []
# Iterate over the available pages adding labels from all classes
# When we hit the last page, the while loop will exit
next_page = page
while next_page:
    next_page = page["links"]["nextPage"]
    tmpmapping = get_mappings("MMO", page)
    allmappinglist.extend(tmpmapping)
    if next_page:
        page = get_json(next_page,apikey)

In [None]:
mappingdf = pd.DataFrame(mappinglist)
print(mappingdf.head(n=2))

In [None]:
print(len(mappingdf))

## Test the use of pageCount and baseurls


In [None]:
## Download extensive mappings
each_onto = "EFO"#onto_list[4]
i = 0
allmappinglist = []
relevant_df = pd.DataFrame(columns=["source_ontology","source_id","map_method","target_id"])
ontologymap = f"https://data.bioontology.org/ontologies/{each_onto}/mappings"
r = get_json(ontologymap,apikey)
pageNumber = 0
pageCount = r["pageCount"]
tmpmapping = get_mappings(each_onto, r)
allmappinglist.extend(tmpmapping)
print(pageCount)
if pageNumber == 0:
    page = get_json(r['links']["nextPage"],apikey)
    tmpmapping = get_mappings(each_onto, page)
    allmappinglist.extend(tmpmapping)
    pageNumber = pageNumber + 1
    next_page = f"https://data.bioontology.org/ontologies/{each_onto}/mappings?page={pageNumber}"
else:
    baseurl = f"https://data.bioontology.org/ontologies/{each_onto}/mappings?page={pageNumber}"
    page = get_json(baseurl,apikey)
    tmpmapping = get_mappings(each_onto, page)
    allmappinglist.extend(tmpmapping)
    pageNumber = pageNumber + 1
    next_page = baseurl

In [13]:
## Save current progress to resume later
from datetime import datetime
import json
current_progress = {"pageNumber": pageNumber,
                    "pageCount": pageCount,
                    "next_page": next_page,
                    "i":i,
                    "each_onto":each_onto,
                    "allmappinglist": allmappinglist}

progress_date = datetime.strftime(datetime.now(),"%Y-%m-%d")

with open(os.path.join(result_path,"mappings",f"{each_onto}_progress({progress_date}).json"), "w") as outwrite:
    outwrite.write(json.dumps(current_progress))

In [10]:
## Resume progress (if saved)
from datetime import datetime
import json
each_onto = "EFO" #onto_list[4]
progress_date = "2024-10-19"
with open(os.path.join(result_path,"mappings",f"{each_onto}_progress({progress_date}).json"), "r") as infile:
    parameter_dict = json.load(infile)

next_page = parameter_dict["next_page"]
pageNumber = parameter_dict["pageNumber"]
pageCount = parameter_dict["pageCount"]
i = parameter_dict["i"]
each_onto = parameter_dict["each_onto"]
allmappinglist = parameter_dict["allmappinglist"]

In [11]:
print(pageNumber, pageCount)

10128 10292


In [None]:
## Resume progress (if not saved)
from datetime import datetime
import json
each_onto = "EFO" #onto_list[4]
pageNumber = 4403
i = 47
next_page = next_page = f"https://data.bioontology.org/ontologies/{each_onto}/mappings?page={pageNumber}"
allmappinglist = []

In [None]:
pageNumber=4403
next_page = "https://data.bioontology.org/ontologies/EFO/mappings?page=4403"
i=47
print(pageNumber, pageCount)

In [None]:
relevant_df = tmp_df.copy()
print(len(relevant_df))

In [12]:
while pageNumber < pageCount:
    page = get_json(next_page,apikey)
    tmpmapping = get_mappings(each_onto, page)
    allmappinglist.extend(tmpmapping)
    allmappingcopy = [x for x in allmappinglist]
    pageNumber = pageNumber+1
    next_page = f"https://data.bioontology.org/ontologies/{each_onto}/mappings?page={pageNumber}"
    if len(allmappinglist) >= 5000: ## Export when the number of mappings exceeds 1000
        i=i+1
        mappingdf = pd.DataFrame(allmappinglist)
        mappingdf.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_all_mappings_{str(i)}.tsv"), sep='\t', header=True)
        #tmp_df = filter_for_ontos(onto_list,mappingdf)
        #if len(tmp_df) > 0:
        #    relevant_df = pd.concat((relevant_df,tmp_df), ignore_index = True)
        allmappinglist = [] ## Reset the mappings list
        print(i, " file dumped. Current page: ",next_page)

mappingdf = pd.DataFrame(allmappinglist)
mappingdf.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_all_mappings_{str(i)}.tsv"), sep='\t', header=True)
#mappingdf = pd.DataFrame(allmappingcopy)
#print(len(mappingdf))
#relevant_df = filter_for_ontos(onto_list,mappingdf)        
#relevant_df.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_relevant_mappings.tsv"), sep='\t', header=True)