# Measurement Techniques Graph - Preparing the data

Measurement Techniques are currently scattered across multiple different ontologies. In order to improve the mapping of measTech terms to ontology terms, the measurement techniques across multiple ontologies need to be combined to reduce the likelihood of treating synonymous terms as multiple separate entities

To do this, we will:

1. Convert measurement technique ontology branches into subject (parent) predicate (has subclass) object (child) triples.
2. Use NCBO BioPortals to map synonymous terms between ontologies to de-duplicate nodes
3. Use term similarity and shared nodes to identify potential synonymous terms for de-duplication
4. iterate on 3 until the graph is relatively unique

This notebook covers the data processing for the conversion to triples and the mapping. Note that it depends on the mappings generated during the initial analysis of measurementTechniques (see measTechAnalysis)

In [6]:
import os
import requests
import pandas as pd

In [7]:
script_path = os.getcwd()
parent_path = os.path.abspath(os.path.join(script_path, os.pardir))
raw_path = os.path.join(script_path,'raw_files')
map_path = os.path.join(parent_path,'measTechAnalysis','result','mappings')
raw_file_list = os.listdir(raw_path)
map_file_list = [x for x in os.listdir(map_path) if '.tsv' in x]
result_path = os.path.join(script_path,'results')

## Create ordered mapping file

In [3]:
## NCIT is too big to fetch the mappings for, so generate it from the other mappings
def generate_NCIT_mappings(map_path,map_file_list):
    ncit = pd.DataFrame(columns=["source_id","map_method","target_id"])
    for eachmap in map_file_list:
        df = pd.read_csv(os.path.join(map_path,eachmap), delimiter='\t',header=0,index_col=0)
        ncit_df = df.loc[df['target_id'].astype(str).str.contains('NCIT')]
        ncit = pd.concat((ncit,ncit_df),ignore_index=True)
    ncit.rename(columns = {"target_id":"subject","source_id":"object"}, inplace=True)
    ncit.rename(columns = {"subject":"source_id", "object":"target_id"}, inplace=True)
    ncit.to_csv(os.path.join(map_path,'NCIT_mappings.tsv'),sep='\t', header=True)

In [4]:
generate_NCIT_mappings(map_path,map_file_list)

In [8]:
#### Filter mappings to just the relevant ones
def create_mapping_df(map_path, mapping_ontos):
    mapping_df = pd.DataFrame(columns=["source_ontology","source_id","map_method","target_id"])
    for eachmap in mapping_ontos:
        tmpdf = pd.read_csv(os.path.join(map_path,f"{eachmap}_mappings.tsv"),delimiter="\t",index_col=0,header=0)
        ### get rid of same URIs
        unique_map = tmpdf.loc[tmpdf['map_method'].astype(str).str.strip()!="SAME_URI"]
        ### limit maps to ones within ontology range
        no_other_ontos = unique_map.loc[unique_map['target_id'].astype(str).str.contains("edamontology") | 
                                         unique_map['target_id'].astype(str).str.contains("EFO") |
                                         unique_map['target_id'].astype(str).str.contains("CHMO") |
                                         unique_map['target_id'].astype(str).str.contains("MMO") |
                                         unique_map['target_id'].astype(str).str.contains("OBI") |
                                         unique_map['target_id'].astype(str).str.contains("BAO") |
                                         unique_map['target_id'].astype(str).str.contains("NCIT")
                                        ]
        mapping_df = pd.concat((mapping_df,no_other_ontos),ignore_index=True)
    no_dups = mapping_df.drop_duplicates(keep='first')
    return no_dups                   

In [9]:
mapping_ontos = ["NCIT","EDAM","EFO","BAO","OBI","CHMO","MMO"]
mapping_df = create_mapping_df(map_path, mapping_ontos)
clean_mapping_df = mapping_df.drop(columns='source_ontology')
clean_mapping_df.to_csv(os.path.join(map_path,'all_mappings.tsv'),sep='\t',header=True)

In [10]:
clean_mapping_df = pd.read_csv(os.path.join(map_path,'all_mappings.tsv'),delimiter='\t',header=0,index_col=0)

In [11]:
print(len(clean_mapping_df))

17120


## Create iri/label dictionaries:

In [12]:
def update_dictionaries(raw_path,raw_file_list):
    vocab = pd.DataFrame([{'id':'https://www.w3.org/2002/07/owl#sameAs','name':'sameAs'},
                          {'id':'http://www.w3.org/2000/01/rdf-schema#subClassOf','name':'subClassOf'}])
    for eachfile in raw_file_list:
        df = pd.read_csv(os.path.join(raw_path,eachfile),header=0, usecols=['Class ID','Preferred Label'])
        if eachfile == 'NCIT.csv':
            df['Class ID'] = df['Class ID'].apply(lambda x: x.replace('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#','http://purl.obolibrary.org/obo/NCIT_'))
        df.rename(columns={'Preferred Label':'name','Class ID':'id'},inplace=True)
        vocab = pd.concat((vocab,df),ignore_index=True)
    clean_vocab = vocab.drop_duplicates(keep='first')
    return clean_vocab

In [13]:
name_map = update_dictionaries(raw_path,raw_file_list)
name_map.to_csv(os.path.join(result_path,'name_iri_map.tsv'),sep='\t',header=True)
print(name_map.head(n=2))
print(len(name_map))

                                                id        name
0            https://www.w3.org/2002/07/owl#sameAs      sameAs
1  http://www.w3.org/2000/01/rdf-schema#subClassOf  subClassOf
255404


## Create triples

In [14]:
df = pd.read_csv(os.path.join(map_path,map_file_list[0]),delimiter='\t',header=0,index_col=0)
df = pd.read_csv(os.path.join(raw_path,raw_file_list[0]),header=0, usecols=['Class ID','Parents'])

print(df.head(n=2))
print(len(df))

                                      Class ID  \
0   http://purl.obolibrary.org/obo/CHEBI_50444   
1  http://purl.obolibrary.org/obo/CHEBI_131787   

                                      Parents  
0  http://purl.obolibrary.org/obo/CHEBI_50218  
1  http://purl.obolibrary.org/obo/CHEBI_48706  
7773


In [15]:
#### Address subclass of many triples delineated by "|"
## These will only be in the object_id field and is a reason why there are missing object labels
def split_multi_parents(objectvalue):
    if "|" in objectvalue:
        parents = objectvalue.split("|")
    else:
        parents = [objectvalue]
    return parents

In [16]:
def convert_to_triples(df,reverse_mapping=False):
    if 'map_method' in list(df.keys()):
        ## This is a mapping file
        no_self_dups = df.loc[df['map_method'].astype(str).str.strip()!="SAME_URI"]
        triple_df = no_self_dups[['source_id','target_id']].copy()
        triple_df.rename(columns={'source_id':'subject','target_id':'object'},inplace=True)
        triple_df['predicate']='https://www.w3.org/2002/07/owl#sameAs'
        ## Generate a reverse mapping
        if reverse_mapping==True:
            tmpdf = no_self_dups[['source_id','target_id']].copy()
            tmpdf.rename(columns={'source_id':'object','target_id':'subject'},inplace=True)
            tmpdf['predicate']='https://www.w3.org/2002/07/owl#sameAs'
            ## assemble together
            triple_df = pd.concat((triple_df,tmpdf),ignore_index=True)
        triple_df.drop_duplicates(keep='first',inplace=True)
    if 'Parents' in list(df.keys()):
        temp = df.loc[~df['Parents'].isna()].copy()
        temp['object'] = temp.apply(lambda row: split_multi_parents(row['Parents']),axis=1)
        triple_df = temp.explode('object')
        triple_df.drop(columns='Parents',inplace=True)
        triple_df.rename(columns={'Class ID':'subject'},inplace=True)
        triple_df['predicate'] = 'http://www.w3.org/2000/01/rdf-schema#subClassOf'
    return triple_df    

In [17]:
def generate_triples(raw_path, map_path, raw_file_list, map_file_list):
    triple_df = pd.DataFrame(columns=['subject','predicate','object'])
    for eachfile in raw_file_list:
        df = pd.read_csv(os.path.join(raw_path,eachfile),header=0, usecols=['Class ID','Parents'])
        if eachfile == 'NCIT.csv':
            df['Class ID'] = df['Class ID'].apply(lambda x: x.replace('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#','http://purl.obolibrary.org/obo/NCIT_'))
            df['Parents'] = df['Parents'].apply(lambda x: x.replace('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#','http://purl.obolibrary.org/obo/NCIT_'))
        clean_df = convert_to_triples(df,False)
        triple_df = pd.concat((triple_df,clean_df),ignore_index=True)
    map_files = [x for x in map_file_list if x!='all_mappings.tsv']
    for eachmap in map_files:
        df = pd.read_csv(os.path.join(map_path,eachmap),delimiter='\t',header=0,index_col=0)
        clean_df = convert_to_triples(df)
        triple_df = pd.concat((triple_df,clean_df),ignore_index=True)
    clean_triples = triple_df.drop_duplicates(keep='first',inplace=False)
    return clean_triples

In [18]:
clean_triples = generate_triples(raw_path, map_path, raw_file_list, map_file_list)
clean_triples.to_csv(os.path.join(result_path,'all_iri_triples.tsv'),sep='\t',header=True)
print(len(clean_triples))

325371


In [19]:
def map_triples(clean_triples,name_map):
    clean_triples.rename(columns={'subject':'subject_id','predicate':'predicate_id','object':'object_id'},inplace=True)
    subject_map = name_map.copy()
    subject_map.rename(columns={'name':'subject','id':'subject_id'},inplace=True)
    predicate_map = name_map.copy()
    predicate_map.rename(columns={'name':'predicate','id':'predicate_id'},inplace=True)
    object_map = name_map.copy()
    object_map.rename(columns={'name':'object','id':'object_id'},inplace=True)
    tmpdf = clean_triples.merge(subject_map,on='subject_id',how='left')
    tmp2df = tmpdf.merge(object_map,on='object_id',how='left')
    tmp3df = tmp2df.merge(predicate_map,on='predicate_id',how='left')
    mapped_triples = tmp3df.drop_duplicates(keep='first')
    return mapped_triples

In [20]:
mapped_triples = map_triples(clean_triples,name_map)
print(len(mapped_triples))
mapped_triples.to_csv(os.path.join(result_path,'all_ontologies_mapped.tsv'),sep='\t',header=True)

325988


### filtering the triples to measTech only

In [21]:
### Get the base parent inclusion list
parent_inclusion_list = []
with open(os.path.join(result_path,'parent_inclusion_list.txt'),'r') as infile:
    for line in infile:
        parent_inclusion_list.append(line.strip())

print(len(parent_inclusion_list))
print(parent_inclusion_list[0])

41
http://purl.obolibrary.org/obo/NCIT_C20368


In [22]:
#### Iterate until all the descendants of the parent_inclusion_list are reached
def generate_lineage_list(parent_inclusion_list, mapped_triples):
    i = 0
    all_parent_list = [x for x in parent_inclusion_list]
    old_all_parent_list = len(set(all_parent_list))
    tmpdf = mapped_triples.loc[mapped_triples['object_id'].isin(parent_inclusion_list)]
    new_parent = tmpdf['subject_id'].unique().tolist()
    all_parent_list.extend(new_parent)
    while len(set(all_parent_list)) != old_all_parent_list:
        tmpdf = mapped_triples.loc[mapped_triples['object_id'].isin(new_parent)]
        new_parent = tmpdf['subject_id'].unique().tolist()
        old_all_parent_list = len(set(all_parent_list))
        all_parent_list.extend(new_parent)
        i = i+1

    all_unique_parents = list(set(all_parent_list))
    print("iterations run: ",i," iterative # of records included: ",len(all_parent_list)," unique records included: ",len(all_unique_parents))
    return all_unique_parents

In [23]:
### generate the measTech inclusion list
all_unique_parents = generate_lineage_list(parent_inclusion_list, mapped_triples)

iterations run:  13  iterative # of records included:  270458  unique records included:  58238


In [25]:
all_measTech_triples = mapped_triples.loc[mapped_triples['subject_id'].isin(all_unique_parents)]
## Filter out any ontologies (like CHEBI) which may have been mapped due to ingestion via NCIT
onto_keys = ["NCIT","topics","EFO","BAO","OBI","CHMO","MMO"]
measTechOnly = pd.DataFrame(columns=["subject_id","predicate_id","object_id","subject","object","predicate"])
for eachonto in onto_keys:
    tmpdf = all_measTech_triples.loc[all_measTech_triples['subject_id'].astype(str).str.contains(eachonto) | 
                                     all_measTech_triples['object_id'].astype(str).str.contains(eachonto)]
    measTechOnly = pd.concat((measTechOnly,tmpdf),ignore_index=True)

measTechOnly.drop_duplicates(keep="first",inplace=True)    
measTechOnly.to_csv(os.path.join(result_path,'measTechOnly_mapped_triples.tsv'),sep='\t',header=True)
print(len(measTechOnly))

68217


In [27]:
print(measTechOnly.head(n=2))

                                    subject_id  \
0  http://purl.obolibrary.org/obo/NCIT_C101294   
1   http://purl.obolibrary.org/obo/NCIT_C16681   

                                      predicate_id  \
0  http://www.w3.org/2000/01/rdf-schema#subClassOf   
1  http://www.w3.org/2000/01/rdf-schema#subClassOf   

                                         object_id                  subject  \
0  http://www.bioassayontology.org/bao#BAO_0002445  Whole Genome Sequencing   
1  http://www.bioassayontology.org/bao#BAO_0000448                Histology   

                         object   predicate  
0             genotyping method  subClassOf  
1  morphology assessment method  subClassOf  


In [28]:
print(len(all_measTech_triples.loc[all_measTech_triples['predicate']=='sameAs']))

1960


## Troubleshooting

### Investigating missing NCIT mappings

In [None]:
print(map_file_list)
mapping_ontos = ["NCIT","topic","EFO","BAO","OBI","CHMO","MMO"]

In [None]:
NCIT_map = pd.read_csv(os.path.join(map_path,map_file_list[6]),delimiter='\t',header=0,index_col=0)

relevant_df = pd.DataFrame(columns=['source_id','map_method','target_id','source_ontology'])
for eachonto in mapping_ontos:
    tmpdf = NCIT_map.loc[NCIT_map['target_id'].astype(str).str.contains(eachonto)]
    tmp2df = tmpdf.loc[tmpdf['map_method']!="SAME_URI"]
    relevant_df = pd.concat((relevant_df,tmp2df),ignore_index=True)

relevant_df.drop(columns=["source_ontology"],inplace=True)

relevant_df.to_csv(os.path.join(map_path,'NCIT_relevant_mappings.tsv'))

In [None]:
all_measTech_triples = pd.read_csv(os.path.join(result_path,'measTechOnly_mapped_triples.tsv'),delimiter='\t',header=0,index_col=0)
print(len(all_measTech_triples))
print(all_measTech_triples.head(n=2))
ncit_only_triples = all_measTech_triples.loc[all_measTech_triples['subject_id'].astype(str).str.contains("NCIT")]
print(len(ncit_only_triples))

#### The number of NCIT triples is excessively low -- need to investigate why

In [30]:
ncit_onto = pd.read_csv(os.path.join(raw_path,'NCIT.csv'),header=0, usecols=['Class ID','Parents'])
print(len(ncit_onto))

184165


In [32]:
print(ncit_onto.head(n=2))
print(ncit_onto.iloc[0]['Parents'])

                                            Class ID  \
0  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   
1  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   

                                             Parents  
0  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
1  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C173890


#### The issue is that the urls are formatted differently -- need to convert purl version with ncicb version

In [33]:
i = 0
ncit_list = [x for x in parent_inclusion_list if 'NCIT' in x]
ncit_parent_list = [x.replace('http://purl.obolibrary.org/obo/NCIT_','http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#') for x in ncit_list]
old_ncit_parent = len(set(ncit_parent_list))
tmpdf = ncit_onto.loc[ncit_onto['Parents'].isin(ncit_parent_list)]
tmplist = tmpdf['Class ID'].unique().tolist()
new_ncit_parent = list(set(ncit_parent_list).union(set(tmplist)))
while len(set(new_ncit_parent)) != old_ncit_parent:
    tmpdf = ncit_onto.loc[ncit_onto['Parents'].isin(new_ncit_parent)]
    tmplist = tmpdf['Class ID'].unique().tolist()  
    old_ncit_parent = len(set(new_ncit_parent))
    new_ncit_parent = list(set(new_ncit_parent).union(set(tmplist)))
    print("iterations run: ",i," ncit_parent_list: ",len(new_ncit_parent)," old_parent: ",old_ncit_parent)
    i=i+1

iterations run:  0  ncit_parent_list:  4066  old_parent:  1068
iterations run:  1  ncit_parent_list:  7459  old_parent:  4066
iterations run:  2  ncit_parent_list:  11815  old_parent:  7459
iterations run:  3  ncit_parent_list:  25438  old_parent:  11815
iterations run:  4  ncit_parent_list:  27345  old_parent:  25438
iterations run:  5  ncit_parent_list:  28363  old_parent:  27345
iterations run:  6  ncit_parent_list:  28792  old_parent:  28363
iterations run:  7  ncit_parent_list:  28917  old_parent:  28792
iterations run:  8  ncit_parent_list:  28955  old_parent:  28917
iterations run:  9  ncit_parent_list:  28957  old_parent:  28955
iterations run:  10  ncit_parent_list:  28957  old_parent:  28957


In [None]:
### Test a solution before implementing it

df = ncit_onto.copy()
df['Class ID'] = df['Class ID'].apply(lambda x: x.replace('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#','http://purl.obolibrary.org/obo/NCIT_'))
print(df.head(n=2))

## Test bits of code

In [None]:
i = 0
all_parent_list = [x for x in parent_inclusion_list]
old_all_parent_list = len(set(all_parent_list))
tmpdf = mapped_triples.loc[mapped_triples['object_id'].isin(parent_inclusion_list)]
new_parent = tmpdf['subject_id'].unique().tolist()
all_parent_list.extend(new_parent)
while len(set(all_parent_list)) != old_all_parent_list:
    tmpdf = mapped_triples.loc[mapped_triples['object_id'].isin(new_parent)]
    new_parent = tmpdf['subject_id'].unique().tolist()
    old_all_parent_list = len(set(all_parent_list))
    all_parent_list.extend(new_parent)
    i = i+1
    print("iteration: ",i," old parents",old_all_parent_list," all parents: ",len(set(all_parent_list)))