In [1]:
import requests
import time
import re
import pandas as pd

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name
from OntologyConversorSCAE import OntologyConversorSCAE
from OntologyConversorHCA import OntologyConversorHCA

# 1. Getting SCEA disease terms and URIs

First we will define a function with which we will obtain all the terms of the project of SCEA. In this case we will use this function to obtain all the diseases of a project.

In [2]:
def get_terms_from_project(experiment_id, name_column):
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
    time.sleep(1) # It's a good practice to wait every time we make a petition
    
    # If project doesnt have the expected column, finish
    if name_column not in df.columns:
        return []
    
    project_term_names = df[name_column].unique()

    return list(project_term_names)

Now we will loop over all projects of SCEA so we get all the disease terms used. Using the url https://www.ebi.ac.uk/gxa/sc/json/experiments we obtain all projects.

In [3]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [4]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [5]:
answer = requests.get(seed_url, headers=headers)

And we can iterate over `experiments` and get the terms of each project.

In [6]:
disease_names_SCEA = []
avoid_collections = ["Human Cell Atlas"] # We dont want to get the URIs of these collections

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    # If project is from a collection we are not interested in, we skip it
    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    # Get the terms of the project
    experiment_id = experiment['experimentAccession']
    disease_names_SCEA += get_terms_from_project(experiment_id=experiment_id, 
                                            name_column='Sample Characteristic[disease]')
    
    clear_output(wait=True)

# Remove the duplicated term names
disease_names_SCEA = list(set(disease_names_SCEA))
disease_names_SCEA

['idiopathic pulmonary fibrosis',
 'type I diabetes mellitus',
 'prostate carcinoma',
 'myelodysplastic syndrome',
 'brain glioblastoma',
 "Crohn's disease",
 'squamous cell lung carcinoma',
 'bronchioalveolar carcinoma; non-small cell lung cancer',
 'lung adenocarcinoma',
 'renal cell carcinoma',
 'hypocellular myelodysplastic syndrome',
 'glioblastoma',
 'refractory anemia with excess blasts',
 'type II diabetes mellitus',
 'ovarian carcinoma',
 'small intestine neuroendocrine tumor',
 'fetal growth restriction',
 'tonsilitis',
 'large cell lung carcinoma',
 'HIV infection',
 'wet macular degeneration',
 'hepatitis C infection',
 'rectal adenocarcinoma',
 'metastatic breast cancer',
 'colorectal cancer',
 'chronic obstructive pulmonary disease',
 'melanoma',
 'multiple myeloma',
 'normal',
 'obstructive sleep apnea syndrome',
 'fibrosis',
 'B cell acute lymphoblastic leukemia',
 'head and neck squamous cell carcinoma',
 "Parkinson's disease",
 'lung carcinoma',
 'not applicable',
 'i

The next step is to search the URIs of each term. We define the function `get_URIs_from_term` to fulfill that goal, using the function `get_URI_from_name` to search the URI in a set of ontologies.

In [7]:
def get_URIs_from_term(term_name, ontologies):
    if term_name is None:
        return []
    
    print(f'Searching URIs for "{term_name}"')
    terms = []
    
    # First get the URIs of all ontologies
    for ontology in ontologies:
        URI = get_URI_from_name(term_name, ontology)
        time.sleep(1)

        if URI is not None:
            terms.append({'name': term_name, 'URI': URI})
    
    # If we dont find any URI with the ontologies, we search for a URI in ALL ontologies
    if not terms:
        URI = get_URI_from_name(term_name)
        
        if URI is not None:
            terms.append({'name': term_name, 'URI': URI})
    
    
    splitted = re.split(' and |, |; ', term_name)
    
    # If the term_name is a list of terms, we iterate over them
    if not terms and len(splitted) > 1:
        for new_name in splitted:
            terms += get_URIs_from_term(new_name, ontologies)
            return terms
    
    # If none URI has been found, we add a None URI so we know that
    if not terms:
        terms.append({
            'name': term_name,
            'URI': None
        })
    
    return terms

The ontologies we have considered important are `PATO`, `MONDO`, `DOID` and `HP`.

In [8]:
ontologies = ['PATO', 'MONDO', 'DOID', 'HP']

In [9]:
get_URIs_from_term('bronchioalveolar carcinoma; non-small cell lung cancer', ontologies)

Searching URIs for "bronchioalveolar carcinoma; non-small cell lung cancer"
Searching URIs for "bronchioalveolar carcinoma"


[{'name': 'bronchioalveolar carcinoma',
  'URI': 'http://purl.obolibrary.org/obo/MONDO_0000503'},
 {'name': 'bronchioalveolar carcinoma',
  'URI': 'http://purl.obolibrary.org/obo/DOID_0050870'}]

Now, we can get the URIs of each term name and create a dataframe with the URIs and the names.

In [10]:
terms_URIs_SCEA = []
for name in disease_names_SCEA:
    terms_URIs_SCEA += get_URIs_from_term(name, ontologies)

Searching URIs for "idiopathic pulmonary fibrosis"
Searching URIs for "type I diabetes mellitus"
Searching URIs for "prostate carcinoma"
Searching URIs for "myelodysplastic syndrome"
Searching URIs for "brain glioblastoma"
Searching URIs for "Crohn's disease"
Searching URIs for "squamous cell lung carcinoma"
Searching URIs for "bronchioalveolar carcinoma; non-small cell lung cancer"
Searching URIs for "bronchioalveolar carcinoma"
Searching URIs for "lung adenocarcinoma"
Searching URIs for "renal cell carcinoma"
Searching URIs for "hypocellular myelodysplastic syndrome"
Searching URIs for "glioblastoma"
Searching URIs for "refractory anemia with excess blasts"
Searching URIs for "type II diabetes mellitus"
Searching URIs for "ovarian carcinoma"
Searching URIs for "small intestine neuroendocrine tumor"
Searching URIs for "fetal growth restriction"
Searching URIs for "tonsilitis"
Searching URIs for "large cell lung carcinoma"
Searching URIs for "HIV infection"
Searching URIs for "wet macu

In [11]:
SCEA_diseases = pd.DataFrame(terms_URIs_SCEA)
SCEA_diseases

Unnamed: 0,name,URI
0,idiopathic pulmonary fibrosis,http://purl.obolibrary.org/obo/MONDO_0008345
1,idiopathic pulmonary fibrosis,http://purl.obolibrary.org/obo/DOID_0050156
2,type I diabetes mellitus,http://purl.obolibrary.org/obo/MONDO_0005147
3,type I diabetes mellitus,http://purl.obolibrary.org/obo/DOID_9744
4,type I diabetes mellitus,http://purl.obolibrary.org/obo/HP_0100651
...,...,...
95,myxoid liposarcoma,http://purl.obolibrary.org/obo/MONDO_0013280
96,myxoid liposarcoma,http://purl.obolibrary.org/obo/DOID_5363
97,myxoid liposarcoma,http://purl.obolibrary.org/obo/HP_0012268
98,cecum adenocarcinoma,http://purl.obolibrary.org/obo/MONDO_0006028


Finally, we have to map the diseases names to ontology nomenglature. We can do that using the method `parse_word`  from the class `OntologyConversorSCAE`.

In [12]:
conversor_SCEA = OntologyConversorSCAE()

SCEA_diseases['name'] = SCEA_diseases['name'].apply(conversor_SCEA.parse_word)
SCEA_diseases

Unnamed: 0,name,URI
0,IdiopathicPulmonaryFibrosis,http://purl.obolibrary.org/obo/MONDO_0008345
1,IdiopathicPulmonaryFibrosis,http://purl.obolibrary.org/obo/DOID_0050156
2,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0005147
3,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/DOID_9744
4,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/HP_0100651
...,...,...
95,MyxoidLiposarcoma,http://purl.obolibrary.org/obo/MONDO_0013280
96,MyxoidLiposarcoma,http://purl.obolibrary.org/obo/DOID_5363
97,MyxoidLiposarcoma,http://purl.obolibrary.org/obo/HP_0012268
98,CecumAdenocarcinoma,http://purl.obolibrary.org/obo/MONDO_0006028


# 2. Getting HCA disease terms and URIs

We can use the HCA API-REST (https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1) to obtain all projects info. With that info, we will get the all the terms names.

In [13]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

Iterating over the projects we can get all the terms.

In [14]:
disease_names_HCA = []

for n, hit in enumerate(project_hits):    
    for item in hit['donorOrganisms']:
        disease_names_HCA += item['disease']
            
disease_names_HCA = list(set(disease_names_HCA))
disease_names_HCA

['acute kidney tubular necrosis',
 'adrenal cortex adenoma',
 'ulcerative colitis (disease)',
 'hyperlipidemia (disease)',
 'reversible cerebral vasoconstriction syndrome',
 'Lyme disease',
 'prostate cancer',
 'end stage renal failure',
 'hiatus hernia (disease)',
 'syndromic dyslipidemia',
 'cataract (disease)',
 None,
 'melanoma (disease)',
 'asymptomatic dengue',
 'non-alcoholic fatty liver disease',
 'pericardial effusion (disease)',
 'Enterococcus faecalis infection',
 'benign prostatic hyperplasia (disease)',
 'kidney cancer',
 'cardiac arrest',
 'normal',
 'obstructive sleep apnea syndrome',
 'acquired aneurysmal subarachnoid hemorrhage',
 'depressive disorder',
 'essential hypertension',
 'anxiety disorder',
 'pure autonomic failure',
 'hereditary hemochromatosis',
 'osteoarthritis, hip',
 'ventricular tachycardia',
 'diverticulitis',
 'type 2 diabetes mellitus',
 'acoustic neuroma',
 'hemolytic-uremic syndrome',
 'irritable bowel syndrome',
 'orofaciodigital syndrome VIII',
 

Now, we use the same ontologies and function we have used for SCEA so we can get the URIs of HCA disease terms.

In [15]:
terms_URIs_HCA = []
for name in disease_names_HCA:
    terms_URIs_HCA += get_URIs_from_term(name, ontologies)

Searching URIs for "acute kidney tubular necrosis"
Searching URIs for "adrenal cortex adenoma"
Searching URIs for "ulcerative colitis (disease)"
Searching URIs for "hyperlipidemia (disease)"
Searching URIs for "reversible cerebral vasoconstriction syndrome"
Searching URIs for "Lyme disease"
Searching URIs for "prostate cancer"
Searching URIs for "end stage renal failure"
Searching URIs for "hiatus hernia (disease)"
Searching URIs for "syndromic dyslipidemia"
Searching URIs for "cataract (disease)"
Searching URIs for "melanoma (disease)"
Searching URIs for "asymptomatic dengue"
Searching URIs for "non-alcoholic fatty liver disease"
Searching URIs for "pericardial effusion (disease)"
Searching URIs for "Enterococcus faecalis infection"
Searching URIs for "benign prostatic hyperplasia (disease)"
Searching URIs for "kidney cancer"
Searching URIs for "cardiac arrest"
Searching URIs for "normal"
Searching URIs for "obstructive sleep apnea syndrome"
Searching URIs for "acquired aneurysmal sub

Finally, we create a dataframe and map the term names to the names in the ontology.

In [16]:
HCA_diseases = pd.DataFrame(terms_URIs_HCA)
HCA_diseases

Unnamed: 0,name,URI
0,acute kidney tubular necrosis,http://purl.obolibrary.org/obo/MONDO_0006637
1,acute kidney tubular necrosis,http://purl.obolibrary.org/obo/DOID_12556
2,adrenal cortex adenoma,http://purl.obolibrary.org/obo/MONDO_0003924
3,ulcerative colitis (disease),http://purl.obolibrary.org/obo/MONDO_0005101
4,hyperlipidemia (disease),http://purl.obolibrary.org/obo/MONDO_0021187
...,...,...
73,arthritis,http://purl.obolibrary.org/obo/SYMP_0019169
74,arthritis,http://purl.obolibrary.org/obo/HP_0001369
75,gastroesophageal reflux disease,http://purl.obolibrary.org/obo/MONDO_0007186
76,gastroesophageal reflux disease,http://purl.obolibrary.org/obo/DOID_8534


In [17]:
conversor_HCA = OntologyConversorHCA()

HCA_diseases['name'] = SCEA_diseases['name'].apply(conversor_HCA.parse_word)
HCA_diseases

Unnamed: 0,name,URI
0,IdiopathicPulmonaryFibrosis,http://purl.obolibrary.org/obo/MONDO_0006637
1,IdiopathicPulmonaryFibrosis,http://purl.obolibrary.org/obo/DOID_12556
2,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0003924
3,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0005101
4,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0021187
...,...,...
73,HeadAndNeckSquamousCellCarcinoma,http://purl.obolibrary.org/obo/SYMP_0019169
74,ParkinsonsDisease,http://purl.obolibrary.org/obo/HP_0001369
75,ParkinsonsDisease,http://purl.obolibrary.org/obo/MONDO_0007186
76,LungCarcinoma,http://purl.obolibrary.org/obo/DOID_8534


# 3. Join SCEA and HCA disease terms

The last thing we have to do is join terms from HCA and SCEA.

In [18]:
df = SCEA_diseases.append(HCA_diseases, ignore_index=True).drop_duplicates()
df

Unnamed: 0,name,URI
0,IdiopathicPulmonaryFibrosis,http://purl.obolibrary.org/obo/MONDO_0008345
1,IdiopathicPulmonaryFibrosis,http://purl.obolibrary.org/obo/DOID_0050156
2,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0005147
3,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/DOID_9744
4,Type1DiabetesMellitus,http://purl.obolibrary.org/obo/HP_0100651
...,...,...
173,HeadAndNeckSquamousCellCarcinoma,http://purl.obolibrary.org/obo/SYMP_0019169
174,ParkinsonsDisease,http://purl.obolibrary.org/obo/HP_0001369
175,ParkinsonsDisease,http://purl.obolibrary.org/obo/MONDO_0007186
176,LungCarcinoma,http://purl.obolibrary.org/obo/DOID_8534


In [20]:
df.to_csv('../../SingleCell-Files/diseases_ontology.csv', index=False, header=False, sep='\t')