## measurementTechnique extraction and standardization review

#### Analyze the extraction of measurementTechniques by ChatGPT
Approach:
- Select 10 records at random from each of the sampled repositories
- For each record, evaluate the techniques pulled by ChatGPT as True or False positive based on description
- Calculate precision and recall for each record
- If there are terms which are the same, but appear different, pull those out for standardization analysis

#### Analyse the extraction of extraneous generic 'stop word' terms
Approach:
- Split all ChatGPT generated terms into words (split by space)
- Generate frequency table of terms
- Identify top set of generic terms for use as permutations in standardization

#### Analyze the standardization of measurementTechnique terms
Approach:
- Create list of terms based based on mapping to multiple ontologies
- Create permutations in term list
- Run the standardization approach and calculate precision, recall F relative to original term

#### Determine threshold for cut off to favor false negatives over false positives
Approach:
- For each threshhold cutoff
  - Select 25 terms and determine how well they matched



In [1]:
import requests
import json
import os
import pandas as pd

In [None]:
script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
result_path = os.path.join(script_path,'result')

### Analyze ChatGPT's extraction

In [None]:
## Load the file
gpt_results = pd.read_csv(os.path.join(data_path,'GPT Measurement Techniques results.tsv'),delimiter='\t',header=0)
print(gpt_results.head(n=2))

## select 10 random records
ransamps = gpt_results.groupby('Data Repository').sample(10)

def ifenumed(rowdata):
    numlist = ['1.','2.','3.','4.','5.','6.','7.','8.','9.','10.','11.','12.','13.','14.','15.']
    for eachnum in numlist:
        if eachnum in rowdata:
            rowdata.replace(eachnum," - ")
        else:
            break
    return rowdata

## format results from records
def clean_predictions(row):
    rowdata = row['Predictions']
    if '1. ' in rowdata:
        rowdata = ifenumed(rowdata)
    tmpdata = rowdata.replace(" - ","|")
    tmplist = tmpdata.split("|")
    cleanlist = [x.replace("- ","").strip() for x in tmplist]
    return cleanlist 

ransamps['clean_pred'] = ransamps.apply(lambda row: clean_predictions(row), axis=1)
exploded_df = ransamps.explode('clean_pred')
#print(ransamps.head(n=2))
exploded_df.drop(columns=['Model','Predictions'],inplace=True)
print(exploded_df.head(n=2))

## Export results for manual evaluation
exploded_df.to_csv(os.path.join(result_path,'GPT_sample.tsv'),sep='\t',header=True)


### Analyse the extraction of extraneous generic 'stop word' terms

In [None]:
## Pull the terms from Dylan's list (since the source is not needed)
t2t_results = pd.read_csv(os.path.join(data_path,'Measurement Techniques mapped.tsv'),delimiter='\t',header=2)

## Pull the list of techniques
techniques = t2t_results['Technique'].tolist()

## process the terms
cleanlist = []
for eachterm in techniques:
    tmpterm = eachterm.lower()
    tmpterm.replace("-"," ").replace(":"," ")
    tmplist = tmpterm.split(" ")
    cleantmp = [x.replace("(","").replace(")","").strip() for x in tmplist]
    cleanlist.extend(cleantmp)

termseries = pd.Series(cleanlist)
termdf = termseries.to_frame('technique')
termfreq = termdf.groupby('technique').size().reset_index(name='counts')
termfreq.sort_values('counts',ascending=False,inplace=True)
print(termfreq.loc[termfreq['counts']>5])
termfreq.to_csv(os.path.join(result_path,'stopword_freq.tsv'),sep='\t',header=0)

### Analyze the standardization of measurementTechnique terms

#### Get measurementTechnique terms to test

* mappings endpoint documentation: https://data.bioontology.org/documentation#Mapping
* sample code: https://gist.github.com/callahantiff/a28fb3160782f42f104e9ec41553af0d
* NCBO sample code: https://github.com/ncbo/ncbo_rest_sample_code

In [None]:
import urllib.request, urllib.error, urllib.parse

## Load the API key
with open(os.path.join(script_path,'config.json'),'rb') as keyfile:
    keyinfo = json.load(keyfile)
    apikey = keyinfo['apikey']

## Format the apikey for the header
def get_json(url, apikey):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + apikey)]
    return json.loads(opener.open(url).read())

## Provide the list of ontologies to map
onto_list = ["BAO","OBI","EFO","NCIT","EDAMT","MMO","CHMO"]

## Pull mapped pairs out of a paginated dictionary
def get_mappings(onto_source,page_dict):
    mappinglist = []
    for eachcollection in page_dict['collection']:
        tmpdict = {'source_ontology': onto_source,
                   'source_id': eachcollection['classes'][0]['@id'],
                   'map_method': eachcollection['source'],
                   'target_id': eachcollection['classes'][1]['@id']}
        mappinglist.append(tmpdict)
    return mappinglist

In [None]:
## filter the results to only mappings within ontologies of interest
def filter_for_ontos(onto_list,mappingdf):
    relevant_df = pd.DataFrame(columns=mappingdf.columns.tolist())
    source_ont = mappingdf.iloc[0]['source_ontology']
    target_list = [x for x in onto_list if x!=source_ont]
    for eachtarget in target_list:
        target_subset = mappingdf.loc[mappingdf['target_id'].str.contains(eachtarget)]
        relevant_df = pd.concat(([relevant_df,target_subset]),ignore_index=True)
    return relevant_df

def download_mappings(apikey,onto_list,starting_page):
    for each_onto in onto_list:
        print("now downloading mappings from: ",each_onto)
        allmappinglist = []
        ontologymap = f"https://data.bioontology.org/ontologies/{each_onto}/mappings"
        r = get_json(ontologymap,apikey)
        if starting_page == 0:
            page = get_json(r['links']["nextPage"],apikey)
        else:
            page = starting_page
        next_page = page
        while next_page:
            next_page = page["links"]["nextPage"]
            tmpmapping = get_mappings(each_onto, page)
            allmappinglist.extend(tmpmapping)
            if next_page:
                page = get_json(next_page,apikey)
                print(page["links"]["nextPage"])
        mappingdf = pd.DataFrame(allmappinglist)
        relevant_df = filter_for_ontos(onto_list,mappingdf)
        relevant_df.to_csv(os.path.join(result_path,"mappings",f"{each_onto}_mappings.tsv"), sep='\t', header=True)


In [None]:
%%time

## Download all mappings from ontologies of interest
download_mappings(apikey,onto_list,0)


### Figuring out the data structure of a result from the API

The organization of the mappings appear to be as follows:
A result from the API ontology/mappings endpoint:
* r.keys:  dict_keys(['page', 'pageCount', 'totalCount', 'prevPage', 'nextPage', 'links', 'collection'])
  * r['collection'].keys():  dict_keys(['id', 'source', 'classes', 'process', '@id', '@type'])
    * r['collection'][0]['classes'].keys:  dict_keys(['@id', '@type', 'links', '@context'])

Where each pair of mapped terms appear to be listed under 'classes' with classes[0] being the term in the source ontology and classes[1] being the term from a different ontology

The source is how two terms were mapped

In [None]:
## Test to see that an API request is working
REST_URL = "http://data.bioontology.org"
term = "survey"
r = get_json(REST_URL + "/search?q=" + term,apikey)["collection"]
#print(r)

In [None]:
## Testing the class mapping endpoint
ontology_shorthand = 'BRO'
classurl = 'http%3A%2F%2Fbioontology.org%2Fontologies%2FBiomedicalResourceOntology.owl%23Ontology_Development_and_Management'
classmap = f"https://data.bioontology.org/ontologies/{ontology_shorthand}/classes/{classurl}/mappings"
r = get_json(classmap,apikey)
print('r[0].keys: ', r[0].keys())
print('r[0][classes][0].keys: ', r[0]['classes'][0].keys())
print('r[0][classes][0][links]: ', r[0]['classes'][0]['links'].keys())
print(r[0]['classes'][0]['links']['descendants'])

In [None]:
## Test the ontology mapping end point
ontology_shorthand = 'MMO'
classurl = 'http://purl.obolibrary.org/obo/MMO_0000000'
ontologymap = f"https://data.bioontology.org/ontologies/{ontology_shorthand}/mappings"
r = get_json(ontologymap,apikey)

print("r.keys: ",r.keys())
print("r['links']: ", r['links'])
print("r['page']: ", r['page'])
print("r['collection'].keys(): ", r['collection'][0].keys())
print("r['collection'][0]['classes'].keys: ", r['collection'][0]['classes'][0].keys())
print("collection id: ",r['collection'][0]["@id"])
print("class id: ", r['collection'][0]['classes'][1]["@id"])
print("class type: ", r['collection'][0]['classes'][1]["@type"])
print("class context: ", r['collection'][0]['classes'][1]["@context"])
print("number of classes: ",len(r['collection'][0]['classes']))

print(r['pageCount'])
for eachcollection in r['collection'][0:3]:
    #print(len(eachcollection['classes']))
    print(eachcollection['classes'][0]['@id'],eachcollection['source'],eachcollection['classes'][1]['@id'])
    print(eachcollection['classes'][0]['@context'],eachcollection['classes'][1]['@context'])

In [None]:
## Test the use of pagination
page = get_json(r['links']["nextPage"],apikey)
allmappinglist = []
# Iterate over the available pages adding labels from all classes
# When we hit the last page, the while loop will exit
next_page = page
while next_page:
    next_page = page["links"]["nextPage"]
    tmpmapping = get_mappings("MMO", page)
    allmappinglist.extend(tmpmapping)
    if next_page:
        page = get_json(next_page,apikey)

In [None]:
mappingdf = pd.DataFrame(mappinglist)
print(mappingdf.head(n=2))

In [None]:
print(len(mappingdf))

## Pulling terms that are mapped between measTech Ontologies

MMO and CHMO are more technique-focused ontologies, so focusing on the mappings between these ontologies and others is a fast way to obtain measurementTechnique terms. 

### Pull terms for mapping via T2T pipeline

In [None]:
import os
import pandas as pd
import random

script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
result_path = os.path.join(script_path,'result')
onto_path = os.path.join(data_path,'ontology_files')
map_path = os.path.join(result_path,'mappings')

In [None]:
MMO_mapping = pd.read_csv(os.path.join(map_path,'MMO_mappings.tsv'),delimiter='\t',header=0,index_col=0)
CHMO_mapping = pd.read_csv(os.path.join(map_path,'CHMO_mappings.tsv'),delimiter='\t',header=0,index_col=0) 
mapping_list = [MMO_mapping,CHMO_mapping]

In [None]:
## Pull synonymous terms based on the mappings

In [None]:
MMO_onto = pd.read_csv(os.path.join(onto_path,'MMO.csv'),delimiter=',',header=0, usecols=['Class ID','Preferred Label','Synonyms','has_exact_synonym'])
MMO_onto['alternative term'] = 'N/A'
CHMO_onto = pd.read_csv(os.path.join(onto_path,'CHMO.csv'),delimiter=',',header=0, usecols=['Class ID','Preferred Label','Synonyms','has_exact_synonym'])
CHMO_onto['alternative term'] = 'N/A'
EFO_onto = pd.read_csv(os.path.join(onto_path,'EFO.csv'),delimiter=',',header=0, usecols=['Class ID','Preferred Label','Synonyms','has_exact_synonym'])
EFO_onto['alternative term'] = 'N/A'
OBI_onto = pd.read_csv(os.path.join(onto_path,'OBI.csv'),delimiter=',',header=0, usecols=['Class ID','Preferred Label','Synonyms','alternative term'])
OBI_onto['has_exact_synonym'] = 'N/A'
BAO_onto = pd.read_csv(os.path.join(onto_path,'BAO.csv'),delimiter=',',header=0, usecols=['Class ID','Preferred Label','Synonyms','alternative term'])
BAO_onto['has_exact_synonym'] = 'N/A'

In [None]:
print(MMO_mapping.head(n=2))

ontodict = {'MMO':MMO_onto, 'EFO':EFO_onto, 'CHMO':CHMO_onto, 'BAO':BAO_onto, 'OBI':OBI_onto}
classlist = []
for eachmapping in mapping_list:
    sourcelist = eachmapping['source_id'].unique().tolist()
    targetlist = MMO_mapping['target_id'].unique().tolist()
    classlist.extend(sourcelist)
    classlist.extend(targetlist)
    classlist = list(set(classlist))

print(len(classlist), classlist[0:1])

In [None]:
termdf = pd.DataFrame(columns = ['Class ID','Preferred Label','Synonyms','alternative term','has_exact_synonym'])

namespace_list = ['MMO','CHMO','BAO','OBI','EFO']

for eachnamespace in namespace_list:
    tmpclasslist = [x for x in classlist if 'eachnamespace' in x]
    eachdf = ontodict[eachnamespace]
    tmpdf = eachdf.loc[eachdf['Class ID'].isin(classlist)]
    termdf = pd.concat((termdf,tmpdf),ignore_index=True)

print(termdf.head(n=2))

In [None]:
def split_synonyms(a_synonym):
    try:
        syns = a_synonym.split('|')
    except:
        syns = 'N/A'
    return syns

termdf['synlist'] = termdf.apply(lambda row: split_synonyms(row['Synonyms']),axis=1)
expanded_terms = termdf.explode('synlist')

print(expanded_terms.head(n=2))

In [None]:
preferred_terms = expanded_terms['Preferred Label'].unique().tolist()
synterms = expanded_terms['synlist'].unique().tolist()
all_terms = list(set(preferred_terms).union(set(synterms)))
print(len(all_terms),all_terms[0:2])

In [None]:
## mutate the terms

In [None]:
stopword_list = ['analysis','study','testing','sampling','assessment','diagnostic','tests',
                 'design','review','detection','identification','administration','system',
                 'surveillance','process','approach','method']

def add_stopword(original_term,stopword_list):
    stopword_len = len(stopword_list)
    doublestop = 2*stopword_len
    tmp_num = random.randrange(0,doublestop)
    if tmp_num < stopword_len:
        tmpterm = original_term +" "+ stopword_list[tmp_num]
    else:
        tmpterm = original_term
    return tmpterm

all_term_df = pd.DataFrame(all_terms)
all_term_df.rename(columns={0:'Original term'},inplace=True)
all_term_df['Syn1'] = all_term_df.apply(lambda row: add_stopword(row['Original term'],stopword_list),axis=1)
all_term_df['Syn2'] = all_term_df.apply(lambda row: add_stopword(row['Original term'],stopword_list),axis=1)
all_term_df['Syn3'] = all_term_df.apply(lambda row: add_stopword(row['Original term'],stopword_list),axis=1)
print(all_term_df.head(n=5))

In [None]:
all_term_df.to_csv(os.path.join(result_path,'mmo_chmo_terms.tsv'),sep='\t',header=True)
expanded_terms.to_csv(os.path.join(result_path,'mmo_chmo_terms_mapped.tsv'),sep='\t',header=True)

In [None]:
terms_2_check = list(set(all_terms).union(set(all_term_df['Syn1'].unique().tolist()).union(set(all_term_df['Syn2']).union(set(all_term_df['Syn3'].unique().tolist())))))
with open(os.path.join(result_path,'terms_2_test.txt'),'w') as outwrite:
    for eachterm in terms_2_check:
        outwrite.write(eachterm+'\n')

### Format the results of the terms mapped via T2T pipeline


In [None]:
import os
import pandas as pd

script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
result_path = os.path.join(script_path,'result')

In [None]:
t2t_result = pd.read_csv(os.path.join(data_path,'measTech_T2T_test_result.tsv'), delimiter='\t',header=0)
t2t_result.rename(columns={'Term.1':'found_term'},inplace=True)
print(t2t_result.head(n=2))
print(len(t2t_result))

In [None]:
original_term_df = pd.read_csv(os.path.join(result_path,'mmo_chmo_terms.tsv'),delimiter='\t',header=0,index_col=0)
print(original_term_df.head(n=2))
original_term_map = pd.read_csv(os.path.join(result_path,'mmo_chmo_terms_mapped.tsv'),delimiter='\t',header=0,index_col=0)
print(original_term_map.head(n=2))

In [None]:
syn1 = original_term_df[['Original term','Syn1']].copy()
syn1.rename(columns = {'Syn1':'Term'},inplace=True)
syn1_merged = syn1.merge(t2t_result, on='Term',how='inner')
syn2 = original_term_df[['Original term','Syn2']].copy()
syn2.rename(columns = {'Syn2':'Term'},inplace=True)
syn2_merged = syn2.merge(t2t_result, on='Term',how='inner')
syn3 = original_term_df[['Original term','Syn3']].copy()
syn3.rename(columns = {'Syn3':'Term'},inplace=True)
syn3_merged = syn3.merge(t2t_result, on='Term',how='inner')

#print(len(syn1_merged), syn1_merged.head(n=2))
#print(len(syn2_merged), syn2_merged.head(n=2))
#print(len(syn3_merged), syn3_merged)

all_terms = pd.concat((syn1_merged,syn2_merged,syn3_merged),ignore_index=True)
all_terms.drop_duplicates(keep='first')
print(len(all_terms))
print(all_terms.head(n=2))

In [None]:
preferred_term = original_term_map[['Class ID','Preferred Label']].copy()
preferred_term.rename(columns={'Preferred Label':'Original term'},inplace=True)

synonym_term = original_term_map[['Class ID','synlist']].copy()
synonym_term.rename(columns={'synlist':'Original term'},inplace=True)

preferred_merge = preferred_term.merge(all_terms,on='Original term',how='inner')
print(len(preferred_merge))
print(preferred_merge.head(n=2))
print(preferred_merge.iloc[0]['Term'],preferred_merge.iloc[1]['Term'] )
synonym_merge = synonym_term.merge(all_terms,on='Original term',how='inner')
print(len(synonym_merge))

results_to_analyze = pd.concat((preferred_merge,synonym_merge),ignore_index=True)
#results_to_analyze.to_csv(os.path.join(result_path,'T2T_results_formatted.tsv'),sep='\t',header=True)

In [None]:
## Inspect number of original (preferred label) terms tested
preferred_terms_tested = preferred_merge.groupby('Original term')
print(len(preferred_terms_tested))

synonym_term_tested  = synonym_merge.groupby('Original term')
print(len(synonym_term_tested))

terms_tested = results_to_analyze.groupby('Term')
print(len(terms_tested))

## Analyze the formatted results of the T2T pipeline

In [31]:
import os
import pandas as pd

script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
result_path = os.path.join(script_path,'result')

results_to_analyze = pd.read_csv(os.path.join(result_path,'T2T_results_formatted.tsv'),delimiter='\t',header=0,index_col=0)

In [2]:
def clean_up_urls(bao_url):
    if "bao" in bao_url:
        if "ebi" in bao_url:
            clean_url = bao_url.replace("https://www.ebi.ac.uk/ols4/ontologies/bao/classes/http%253A%252F%252Fwww.bioassayontology.org%252Fbao%2523","http://www.bioassayontology.org/bao#")
        else:
            clean_url = bao_url
    else:
        clean_url = bao_url
    return clean_url

In [33]:
results_to_analyze['TermID'] = results_to_analyze.apply(lambda row: clean_up_urls(row['TermID']), axis=1)
print(results_to_analyze.head(n=2))
exact_matches = results_to_analyze.loc[results_to_analyze['Class ID'] == results_to_analyze['TermID']].copy()
exact_matches.drop_duplicates(keep="first",inplace=True)
print(len(exact_matches))
#exact_matches.to_csv(os.path.join(result_path,'T2T_exact_matches.tsv'),sep='\t',header=True)

                                     Class ID  \
0  http://purl.obolibrary.org/obo/MMO_0000380   
1  http://purl.obolibrary.org/obo/MMO_0000380   

                                     Original term  \
0  graphite furnace atomic absorption spectrometry   
1  graphite furnace atomic absorption spectrometry   

                                                Term  T2T best score Ontology  \
0  graphite furnace atomic absorption spectrometr...           0.834      mmo   
1  graphite furnace atomic absorption spectrometr...           0.883      mmo   

                                        found_term  \
0  graphite furnace atomic absorption spectrometry   
1  graphite furnace atomic absorption spectrometry   

                                       TermID  
0  http://purl.obolibrary.org/obo/MMO_0000380  
1  http://purl.obolibrary.org/obo/MMO_0000380  
307


#### Handler for BAO Class ID's since the TermID may use an OLS version of BAO

* Example of BAO ID from NCBO BioPortal Mappings: http://www.bioassayontology.org/bao#BAO_0000415
* Example of BAO ID from T2T results: https://www.ebi.ac.uk/ols4/ontologies/bao/classes/http%253A%252F%252Fwww.bioassayontology.org%252Fbao%2523BAO_0000453


In [5]:
## Exact matches based on identifier
score_bins = [0,0.5,0.6,0.7,0.8,0.9,1.0]
groups = exact_matches.groupby(['Ontology', pd.cut(exact_matches['T2T best score'], score_bins)],observed=False)
print(groups.size().unstack())

T2T best score  (0.0, 0.5]  (0.5, 0.6]  (0.6, 0.7]  (0.7, 0.8]  (0.8, 0.9]  \
Ontology                                                                     
bao                      0           0           0           7           8   
chmo                     0           0           5          24          58   
efo                      0           0           0           1           0   
mmo                      0           1           3          21          33   
obi                      0           0           1           6           7   

T2T best score  (0.9, 1.0]  
Ontology                    
bao                      7  
chmo                    72  
efo                      2  
mmo                     41  
obi                     10  


In [6]:
## Exact matches of manipulated terms
original_term_exact = exact_matches.loc[exact_matches['Original term'] == exact_matches['Term']]
#original_term_exact = exact_matches.loc[exact_matches['Original term'] == exact_matches['Term']]['Original term'].unique()
#manipulated_term_match = exact_matches.loc[exact_matches['Original term'] != exact_matches['Term']]
manipulated_term_match = exact_matches.loc[exact_matches['Original term'] != exact_matches['Term']]['Original term'].unique()
print(len(original_term_exact))
print(len(manipulated_term_match))


137
106


In [35]:
mismatched = results_to_analyze.loc[results_to_analyze['Class ID'] != results_to_analyze['TermID']].copy()
mismatched.drop_duplicates(keep='first',inplace=True)

In [36]:
## of exact matches by term
exact_term_matches = mismatched.loc[mismatched['Original term'] == mismatched['found_term']].copy()
exact_term_matches.drop_duplicates(keep='first',inplace=True)
#exact_term_matches.to_csv(os.path.join(result_path,'T2T_term_matches.tsv'),sep='\t',header=True)
original_exact = exact_term_matches.loc[exact_term_matches['Original term'] == exact_term_matches['Term']]

print(len(exact_term_matches))
print(len(original_exact))
print(len(original_exact['Term'].unique().tolist()))
print(len(exact_term_matches['Term'].unique().tolist()))

score_bins = [0,0.5,0.6,0.7,0.8,0.9,1.0]
groups = exact_term_matches.groupby(['Ontology', pd.cut(exact_term_matches['T2T best score'], score_bins)],observed=False)
print(groups.size().unstack())

154
59
52
137
T2T best score  (0.0, 0.5]  (0.5, 0.6]  (0.6, 0.7]  (0.7, 0.8]  (0.8, 0.9]  \
Ontology                                                                     
bao                      0           0           0          18          30   
chmo                     0           0           0           0           3   
efo                      0           0           0           3           5   
mmo                      0           0           2           4          22   
obi                      0           0           0           0           4   

T2T best score  (0.9, 1.0]  
Ontology                    
bao                     33  
chmo                     2  
efo                      9  
mmo                     17  
obi                      2  


In [16]:
## True mismatches by ID and term
true_mismatch = mismatched.loc[mismatched['Original term'] != mismatched['found_term']].copy()
original_true_mismatch = true_mismatch.loc[true_mismatch['Original term'] == true_mismatch['Term']]
true_mismatch.drop_duplicates(keep='first',inplace=True)

print(len(true_mismatch))
print(len(original_true_mismatch))
print(len(original_true_mismatch['Term'].unique().tolist()))
print(len(true_mismatch['Term'].unique().tolist()))

score_bins = [0,0.5,0.6,0.7,0.8,0.9,1.0]
groups = true_mismatch.groupby(['Ontology', pd.cut(true_mismatch['T2T best score'], score_bins)],observed=False)
print(groups.size().unstack())

1280
295
257
785
T2T best score  (0.0, 0.5]  (0.5, 0.6]  (0.6, 0.7]  (0.7, 0.8]  (0.8, 0.9]  \
Ontology                                                                     
bao                      2           1          20          18          15   
chmo                     0          14          18          12          17   
edam                     0           0           0          13          19   
efo                      0           0           1           2          10   
mmo                      0           0           3          14          18   
ncit                     8          35         102         190         533   
obi                      0           2           5           7           8   

T2T best score  (0.9, 1.0]  
Ontology                    
bao                     14  
chmo                    13  
edam                    15  
efo                      5  
mmo                     16  
ncit                   126  
obi                      4  


In [17]:
## matches of manipulated terms
original_term_mismatch = true_mismatch.loc[true_mismatch['Original term'] == true_mismatch['Term']]
manipulated_term_mismatch = true_mismatch.loc[true_mismatch['Original term'] != true_mismatch['Term']]
print(len(original_term_mismatch))
print(len(manipulated_term_mismatch))

original_term_mismatch = true_mismatch.loc[true_mismatch['Original term'] == true_mismatch['Term']]['Original term'].unique()
manipulated_term_mismatch = true_mismatch.loc[true_mismatch['Original term'] != true_mismatch['Term']]['Original term'].unique()
print(len(original_term_mismatch))
print(len(manipulated_term_mismatch))

295
985
257
309


In [19]:
## Review matches of manipulated terms by ontology
ontolist = ['bao','chmo','edam','efo','mmo','ncit','obi']

for eachonto in ontolist:
    tmpdf = true_mismatch.loc[true_mismatch['Ontology']==eachonto].copy()
    tmpdf.drop_duplicates(keep='first',inplace=True)
    tmpdf.to_csv(os.path.join(result_path,'to_review',f'mismatch_{eachonto}.tsv'),sep='\t',header=True)

## Analyze the T2T terms that did not match via ID or exact word matching

In [2]:
import os
import pandas as pd

script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
result_path = os.path.join(script_path,'result')
file_path = os.path.join(data_path,'reviewed_mismatched_terms')


In [3]:
ontolist = ['bao','chmo','edamt','efo','mmo','ncit','obi']

basedf = pd.DataFrame(columns=['Class ID','Original term','Term','T2T best score','TermID','Ontology','found_term','match_eval'])

for eachonto in ontolist:
    tmp = pd.read_excel(os.path.join(file_path,'measTech_T2T_test.xlsx'), eachonto, header=0,index_col=0,engine='openpyxl')
    basedf = pd.concat((basedf,tmp),ignore_index=True)
    
print(basedf.head(n=2))

exact_matches = pd.read_csv(os.path.join(result_path,'T2T_exact_matches.tsv'),delimiter='\t',header=0)
exact_term_matches = pd.read_csv(os.path.join(result_path,'T2T_term_matches.tsv'),delimiter='\t',header=0)

exact_matches['match_eval'] = 'good'
exact_term_matches['match_eval'] = 'good'

basedf = pd.concat((basedf,exact_matches,exact_term_matches),ignore_index=True)
print(len(basedf))

  basedf = pd.concat((basedf,tmp),ignore_index=True)


                                     Class ID Original term  \
0  http://purl.obolibrary.org/obo/MMO_0000496     histology   
1  http://purl.obolibrary.org/obo/MMO_0000496     histology   

                Term  T2T best score  \
0  histology testing           0.747   
1          histology           0.996   

                                       TermID Ontology found_term match_eval  
0  http://purl.obolibrary.org/obo/NCIT_C16681      bao  Histology       good  
1  http://purl.obolibrary.org/obo/NCIT_C16681      bao  Histology       good  
1741


In [4]:
cleandf = basedf.drop_duplicates(keep='first')
print(len(basedf),len(cleandf))

1741 1741


In [6]:
meanevaldf = cleandf.groupby(['Ontology','match_eval'])['T2T best score'].mean().reset_index(name='avg T2T score')
medevaldf = cleandf.groupby(['Ontology','match_eval'])['T2T best score'].mean().reset_index(name='median T2T score')
countevaldf = cleandf.groupby(['Ontology','match_eval']).size().reset_index(name='counts')
mineval = cleandf.groupby(['Ontology','match_eval'])['T2T best score'].min().reset_index(name='lowest T2T score')
stdeveval = cleandf.groupby(['Ontology','match_eval'])['T2T best score'].std().reset_index(name='std_dev')
totalevaldf = meanevaldf.merge(medevaldf.merge(countevaldf.merge(mineval.merge(stdeveval,on=['Ontology','match_eval'],how='outer'),on=['Ontology','match_eval'],how='outer'),on=['Ontology','match_eval'],how='outer'),on=['Ontology','match_eval'],how='outer')
print(totalevaldf)

totalevaldf.to_csv(os.path.join(result_path,'T2T_results_evaluated.tsv'),sep='\t',header=True)

   Ontology match_eval  avg T2T score  median T2T score  counts  \
0       bao       good       0.862667          0.862667     138   
1       bao         ok       0.723222          0.723222       9   
2       bao       poor       0.710385          0.710385      26   
3      chmo       good       0.864873          0.864873     205   
4      chmo         ok       0.905800          0.905800       5   
5      chmo       poor       0.633214          0.633214      28   
6      edam       good       0.898435          0.898435      23   
7      edam         ok       0.853417          0.853417      24   
8       efo       good       0.888406          0.888406      32   
9       efo         ok       0.673000          0.673000       1   
10      efo       poor       0.900000          0.900000       5   
11      mmo       good       0.867343          0.867343     175   
12      mmo       poor       0.816150          0.816150      20   
13     ncit       good       0.881945          0.881945     21