In [1]:
# Physical detection method BAO_0000035
# assay bioassay component > bioassay BAO_0000015
# assay method component > assay method BAO_0003028
# BAO_0000248 assay kit
# secondary assay BAO_0000032
## molecular entity BAO_0003043

## Use BioPortal API to get descendants for branches of interest and save to file

In [2]:
import urllib.request, urllib.error, urllib.parse
import json
import pandas as pd

In [3]:
REST_URL = "http://data.bioontology.org"
encoded_url = "http%3A%2F%2Fwww.bioassayontology.org%2Fbao%23"

with open('bioportal_api_key.txt') as f:
    API_KEY = f.read()

In [4]:
def get_json(url):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    return json.loads(opener.open(url).read())

In [5]:
def get_children(term):
    request = REST_URL + '/ontologies/BAO/classes/' + encoded_url + term + '/children?pagesize=200'
    result = get_json(request)['collection']
    
    for i in result:
        id = i['@id']
        pref_name = i['prefLabel']
        search_results.append((id, pref_name))
        
        if 'bioassay' in id:
            term_id = i['@id'].split('#')[-1]
            get_children(term_id)

In [6]:
def get_descendants(term):
    request = REST_URL + '/ontologies/BAO/classes/' + encoded_url + term + '/descendants?pagesize=200'
    result = get_json(request)['collection']
    
    for i in result:
        id = i['@id']
        pref_name = i['prefLabel']
        search_results.append((id, pref_name))

In [7]:
def get_pref_name(term):
    request = REST_URL + '/ontologies/BAO/classes/' + encoded_url + term
    result = get_json(request)['prefLabel']
    return result

In [8]:
dfs = []
for bao_id in ['BAO_0000032', 'BAO_0000035', 'BAO_0000015', 'BAO_0003028', 'BAO_0000248', 'BAO_0003043']:    
    search_results = []
    get_children(bao_id)
    dfs.append(search_results)

In [9]:
len(dfs)

6

In [10]:
list_of_term_dfs = []

for df, bao_id in zip(dfs, ['BAO_0000032', 'BAO_0000035', 'BAO_0000015', 'BAO_0003028', 'BAO_0000248', 'BAO_0003043']):
    terms_df = pd.DataFrame(df)
    terms_df.columns = ['link', 'term']
    terms_df['branch_id'] = bao_id
    pref_name = get_pref_name(bao_id)
    terms_df['branch_pref_name'] = pref_name

    list_of_term_dfs.append(terms_df)

In [11]:
combined_df = pd.concat(list_of_term_dfs)
combined_df['id'] = combined_df['link'].apply(lambda x: x.split('#')[-1] if 'bao#' in x else x.split('/')[-1])

In [12]:
combined_df.tail()

Unnamed: 0,link,term,branch_id,branch_pref_name,id
406,http://www.bioassayontology.org/bao#BAO_0000796,ketoconazole,BAO_0003043,molecular entity,BAO_0000796
407,http://www.bioassayontology.org/bao#BAO_0000844,alkaline phosphatase-cAMP,BAO_0003043,molecular entity,BAO_0000844
408,http://www.bioassayontology.org/bao#BAO_0000846,7-amino-4-methylcoumarin,BAO_0003043,molecular entity,BAO_0000846
409,http://www.bioassayontology.org/bao#BAO_0000840,7-amino-4-trifluoromethylcoumarin,BAO_0003043,molecular entity,BAO_0000840
410,http://www.bioassayontology.org/bao#BAO_0000848,angiotensin II,BAO_0003043,molecular entity,BAO_0000848


In [13]:
combined_df.to_csv('bao_branches_of_interest_descendants.csv', index=False)

## Load previously obtained branches + descendents

In [3]:
combined_df = pd.read_csv('bao_branches_of_interest_descendants.csv')

In [4]:
id_dict = combined_df.groupby('branch_id').agg({'id': set, 'branch_pref_name': 'first'}).to_dict(orient='index')

In [5]:
id_dict.keys()

dict_keys(['BAO_0000015', 'BAO_0000032', 'BAO_0000035', 'BAO_0000248', 'BAO_0003028', 'BAO_0003043'])

In [6]:
[(key, id_dict[key]['branch_pref_name']) for key in id_dict.keys()]

[('BAO_0000015', 'bioassay'),
 ('BAO_0000032', 'secondary assay'),
 ('BAO_0000035', 'physical detection method'),
 ('BAO_0000248', 'assay kit'),
 ('BAO_0003028', 'assay method'),
 ('BAO_0003043', 'molecular entity')]

## Analyse branches in the gold standard

In [7]:
# done on 12 Feb
gold_standard = pd.read_csv('../data/BAO_linking_gold_standard.csv')

In [8]:
gold_standard_bao = gold_standard.loc[~gold_standard['bao_id'].isnull()]

In [9]:
gold_standard_bao.head()

Unnamed: 0,assay_id,assay_type,description,method,bao_preferred_term,bao_id
0,1870082,B,Agonist activity at GAL4 DNA binding domain fu...,FRET-based assay,fluorescence resonance energy transfer,BAO_0000001
1,1991217,B,Agonist activity at human FXR expressed in hum...,luciferase reporter gene assay,luciferase reporter gene assay,BAO_0002661
2,2168604,B,Antagonist activity at ERbeta (unknown origin)...,LanthaScreen TR-FRET assay,time resolved fluorescence resonance energy tr...,BAO_0000004
3,643730,B,Antagonist activity at mouse cloned Smo recept...,luciferase reporter gene assay,luciferase reporter gene assay,BAO_0002661
4,835167,B,Inhibition of FGFR2 by TR-FRET analysis,TR-FRET analysis,time resolved fluorescence resonance energy tr...,BAO_0000004


In [10]:
def find_branch(x):
    for key in id_dict:
        if x in id_dict[key]['id']:
            return key

In [11]:
def find_branch_name(x):
    for key in id_dict:
        if x in id_dict[key]['id']:
            return id_dict[key]['branch_pref_name']

In [12]:
gold_standard_bao['branch_id'] = gold_standard_bao['bao_id'].apply(lambda x: find_branch(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gold_standard_bao['branch_id'] = gold_standard_bao['bao_id'].apply(lambda x: find_branch(x))


In [13]:
gold_standard_bao['branch_pref_name'] = gold_standard_bao['bao_id'].apply(lambda x: find_branch_name(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gold_standard_bao['branch_pref_name'] = gold_standard_bao['bao_id'].apply(lambda x: find_branch_name(x))


In [14]:
gold_standard_bao.head()

Unnamed: 0,assay_id,assay_type,description,method,bao_preferred_term,bao_id,branch_id,branch_pref_name
0,1870082,B,Agonist activity at GAL4 DNA binding domain fu...,FRET-based assay,fluorescence resonance energy transfer,BAO_0000001,BAO_0000035,physical detection method
1,1991217,B,Agonist activity at human FXR expressed in hum...,luciferase reporter gene assay,luciferase reporter gene assay,BAO_0002661,BAO_0000015,bioassay
2,2168604,B,Antagonist activity at ERbeta (unknown origin)...,LanthaScreen TR-FRET assay,time resolved fluorescence resonance energy tr...,BAO_0000004,BAO_0000035,physical detection method
3,643730,B,Antagonist activity at mouse cloned Smo recept...,luciferase reporter gene assay,luciferase reporter gene assay,BAO_0002661,BAO_0000015,bioassay
4,835167,B,Inhibition of FGFR2 by TR-FRET analysis,TR-FRET analysis,time resolved fluorescence resonance energy tr...,BAO_0000004,BAO_0000035,physical detection method


In [15]:
gold_standard_bao['branch_id'].drop_duplicates()

0      BAO_0000035
1      BAO_0000015
15     BAO_0000248
24     BAO_0003028
34            None
346    BAO_0003043
Name: branch_id, dtype: object

In [16]:
gold_standard_bao.loc[gold_standard_bao['branch_id'].isnull()]

Unnamed: 0,assay_id,assay_type,description,method,bao_preferred_term,bao_id,branch_id,branch_pref_name
34,1588202,B,Binding affinity to His6-tagged human recombin...,VP-ITC microcalorimetry,VP-ITC Isothermal Titration Calorimeter,BAO_0000708,,
113,2074736,B,Displacement of [3H]CP55940 from human CB1R tr...,Microbeta TriLux based luminescence analysis,MicroBeta TriLux,BAO_0003094,,
220,2052717,B,Binding affinity at Escherichia coli pBR322 DN...,agarose gel based electrophoresis,agarose gel,BAO_0010020,,
227,588548,B,Binding affinity to Escherichia coli pUC19 DNA...,agarose gel electrophoresis,agarose gel,BAO_0010020,,
460,1907990,F,Induction of YqiG protein expression in Bacill...,35S-methionine pulse labeling based 2D-PAGE an...,PAGE,BAO_0010019,,
509,830179,F,Antimigraine activity in marmosets assessed as...,laser doppler flowmetry,laser,BAO_0150033,,
654,1651980,B,Poison activity at recombinant human topoisome...,ethidium bromide staining based agarose gel el...,agarose gel,BAO_0010020,,
674,2112257,B,Inhibition of human TDO expressed in HEK293-EB...,HPLC analysis,HPLC System,BAO_0002733,,
675,1621810,B,Inhibition of OGT (unknown origin) assessed as...,HPLC method,HPLC System,BAO_0002733,,
741,1724006,B,Inhibition of human SMS2 expressed in HEK293T ...,RapidFire/mass spectrometry assay,RapidFire Mass Spec,BAO_0002577,,


In [None]:
# Additional branches in the gold standard
# BAO_0000708, BAO_0003094, BAO_0150033, BAO_0002733, BAO_0002577 - assay method component > instrument BAO_0003118
# BAO_0010020, BAO_0010019 - assay bioassay component > bioassay specification BAO_0000026

In [None]:
# Expected branches:
# Physical detection method BAO_0000035
# assay bioassay component > bioassay BAO_0000015
# assay method component > assay method BAO_0003028
# BAO_0000248 assay kit
(# secondary assay BAO_0000032) not used in gold standard
## molecular entity BAO_0003043