## Ontology subsetter
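Sometimes only a subset of an ontology contains suitable values for a property. In those cases it is more efficient to work with the subset of interest than with the entire ontology. This notebook extracts such a subset: starting from one or more parent classes, it collects all of their descendants (children, grandchildren, and so on), optionally leaving out specified classes and the branches reached only through them.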



In [2]:
import os
import pandas as pd

# Locate the ontology export relative to the directory the notebook is run from.
script_path = os.getcwd()
data_path = 'data'
ontology_file = 'NCIT.csv'
ontology_path = os.path.join(script_path, data_path, ontology_file)

# Only these columns are read from the CSV; everything else is skipped.
headerlist = [
    'Class ID',
    'Preferred Label',
    'Synonyms',
    'Definitions',
    'Obsolete',
    'Semantic Types',
    'Parents',
]
ontology = pd.read_csv(ontology_path, usecols=headerlist, header=0)

# Quick look at the first two rows and at one untruncated Class ID value.
print(ontology.head(2))
print(ontology['Class ID'].iloc[0])

                                            Class ID  \
0  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   
1  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   

                                   Preferred Label  \
0  Child Disobeys Rules or Gets in Trouble at Home   
1               DNA Repair Protein RAD51 Homolog 4   

                                            Synonyms  \
0  Child Disobeys Rules or Gets in Trouble at Hom...   
1  DNA Repair Protein RAD51 Homolog 4|RAD51-Like ...   

                                         Definitions  Obsolete  \
0  A question about whether a child disobeys rule...     False   
1  DNA repair protein RAD51 homolog 4 (328 aa, ~3...     False   

   Semantic Types                                            Parents  
0             NaN  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
1             NaN  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C183258


In [10]:
def generate_id_df(baseurl,df):
    """Strip base-URL prefixes from the 'Class ID' and 'Parents' columns.

    Parameters
    ----------
    baseurl : str or iterable of str
        URL prefix(es) to remove. A single string is treated as one prefix
        rather than being iterated character by character.
    df : pandas.DataFrame
        Frame with string-valued 'Class ID' and 'Parents' columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in. The two columns are
        overwritten in place, so the caller's frame is modified as well.
    """
    if isinstance(baseurl, str):
        baseurl = [baseurl]
    for eachurl in baseurl:
        for column in ('Class ID', 'Parents'):
            # regex=False: the prefix is literal text. Interpreted as a regular
            # expression, characters such as '.', '+' or '?' in a URL would
            # match the wrong thing (and the default differs across pandas versions).
            df[column] = df[column].str.replace(eachurl, '', regex=False)
    return df

def check_urls(df):
    """Return the rows whose 'Parents' or 'Class ID' still contains 'http'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string-valued 'Class ID' and 'Parents' columns; missing
        values are allowed.

    Returns
    -------
    pandas.DataFrame
        Outer merge (on all shared columns) of the rows flagged through
        'Parents' and the rows flagged through 'Class ID'.
    """
    # na=False: a missing value cannot contain a URL. Without it the mask holds
    # NaN and boolean indexing with .loc raises a ValueError.
    parent_urls = df.loc[df['Parents'].str.contains('http', regex=False, na=False)]
    id_urls = df.loc[df['Class ID'].str.contains('http', regex=False, na=False)]
    urls_df = parent_urls.merge(id_urls,how='outer')
    return urls_df

In [13]:
# URL prefixes to strip from the 'Class ID' and 'Parents' columns.
baseurl = [
    "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#",
    "http://www.w3.org/2002/07/owl#",
]

# Class IDs whose descendants are collected, and class IDs to leave out.
include_parent = ['C19160']
exclude_parent = ['C25193']

# generate_id_df returns the frame it was given, so `clean_ontology` and
# `ontology` refer to the same (now modified) object.
clean_ontology = generate_id_df(baseurl, ontology)

# Rows that still contain a URL after stripping the known prefixes.
remaining_urls = check_urls(clean_ontology)


In [19]:
## Grow the parent list
# Breadth-first walk down the hierarchy: start from the classes in
# `include_parent`, collect their children, then the children of those, and so
# on until a level adds nothing new. Classes listed in `exclude_parent` are
# dropped and are not expanded further.

# One row per (class, parent) link.
# NOTE(review): assumes a class with several parents stores them '|'-separated
# in 'Parents' (the delimiter the Synonyms column uses) - confirm against the
# export. If every cell holds a single ID the split is a no-op.
parent_links = (
    clean_ontology[['Class ID', 'Parents']]
    .assign(Parents=clean_ontology['Parents'].str.split('|'))
    .explode('Parents')
)

excluded_ids = set(exclude_parent)
# Work on a copy so `include_parent` keeps its configured value and this cell
# gives the same result every time it is re-run.
frontier = list(include_parent)
seen_ids = set()
desired_subset = []
while frontier:
    print(len(desired_subset))
    is_child = parent_links['Parents'].isin(frontier)
    child_ids = parent_links.loc[is_child, 'Class ID'].unique().tolist()
    # Skip excluded classes and anything already collected; the latter keeps
    # the list duplicate-free and guarantees termination even on cyclic data.
    frontier = [
        class_id for class_id in child_ids
        if class_id not in excluded_ids and class_id not in seen_ids
    ]
    seen_ids.update(frontier)
    desired_subset.extend(frontier)
    

0
202
316
364
368


In [21]:
# Keep only the ontology rows whose Class ID was collected in `desired_subset`.
in_subset = clean_ontology['Class ID'].isin(desired_subset)
desired_subset_df = clean_ontology.loc[in_subset]
print(desired_subset_df)

# Save the subset as a tab-separated file next to the input data.
subset_tsv_path = os.path.join(script_path, data_path, 'NCIT_disciplines.tsv')
desired_subset_df.to_csv(subset_tsv_path, sep='\t', header=True)

       Class ID                          Preferred Label  \
427      C16627                         Medical Genetics   
429      C16628                      Population Genetics   
431      C16626                     Biochemical Genetics   
436      C16625                      Behavioral Genetics   
885      C18867                     Mathematical Biology   
...         ...                                      ...   
181413   C16497                                Dentistry   
183181  C201963  Developmental and Behavioral Pediatrics   
183864   C19164                   Experimental Pathology   
183868   C19165                         Cancer Histology   
183962   C18700                   Clinical Endocrinology   

                                                 Synonyms  \
427                    Medical Genetics|Genetics, Medical   
429              Genetics, Population|Population Genetics   
431                                  Biochemical Genetics   
436     Genetic Determinants of Beh