In [15]:
import requests
import time
import re
import pandas as pd

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name
from OntologyConversorSCAE import OntologyConversorSCAE
from OntologyConversorHCA import OntologyConversorHCA

# 1. Getting SCEA disease terms and URIs

First we will define a function with which we will obtain all the terms of the project of SCEA. In this case we will use this function to obtain all the diseases of a project.

In [3]:
def get_terms_from_project(experiment_id, name_column):
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
    time.sleep(1) # It's a good practice to wait every time we make a petition
    
    # If project doesnt have the expected column, finish
    if name_column not in df.columns:
        return []
    
    project_term_names = df[name_column].unique()

    return list(project_term_names)

Now we will loop over all projects of SCEA so we get all the disease terms used. Using the url https://www.ebi.ac.uk/gxa/sc/json/experiments we obtain all projects.

In [4]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [5]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [6]:
answer = requests.get(seed_url, headers=headers)

And we can iterate over `experiments` and get the terms of each project.

In [7]:
disease_names = []
avoid_collections = ["Human Cell Atlas"] # We dont want to get the URIs of these collections

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    # If project is from a collection we are not interested in, we skip it
    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    # Get the terms of the project
    experiment_id = experiment['experimentAccession']
    disease_names += get_terms_from_project(experiment_id=experiment_id, 
                                            name_column='Sample Characteristic[disease]')
    
    clear_output(wait=True)

# Remove the duplicated term names
disease_names_SCEA = list(set(disease_names_SCEA))
disease_names_SCEA

['renal cell carcinoma',
 'bronchioalveolar carcinoma; non-small cell lung cancer',
 'lung adenocarcinoma',
 'normal',
 'normal',
 "Crohn's disease",
 'lung adenocarcinoma',
 'non-small cell lung carcinoma',
 'COVID-19',
 'normal',
 'myelodysplastic syndrome',
 'refractory anemia with excess blasts',
 'hypocellular myelodysplastic syndrome',
 'normal',
 'normal',
 'glioblastoma',
 'melanoma',
 'type II diabetes mellitus',
 'normal',
 'multiple myeloma',
 'normal',
 'HIV infection',
 'normal',
 'normal',
 'normal',
 'small intestine neuroendocrine tumor',
 'cecum adenocarcinoma',
 'rectal adenocarcinoma',
 'chronic obstructive pulmonary disease',
 'normal',
 'normal',
 'normal',
 'normal',
 'wet macular degeneration',
 'normal',
 'head and neck squamous cell carcinoma',
 'normal',
 'COVID-19',
 'influenza',
 'normal',
 'COVID-19',
 'normal',
 'obstructive sleep apnea syndrome',
 'normal',
 'metastatic breast cancer',
 'breast carcinoma',
 'normal',
 'chronic phase chronic myeloid leukem

The next step is to search the URIs of each term. We define the function `get_URIs_from_term` to fulfill that goal, using the function `get_URI_from_name` to search the URI in a set of ontologies.

In [23]:
def get_URIs_from_term(term_name, ontologies):
    if term_name is None:
        return []
    
    print(f'Searching URIs for "{term_name}"')
    terms = []
    
    # First get the URIs of all ontologies
    for ontology in ontologies:
        URI = get_URI_from_name(term_name, ontology)
        time.sleep(1)

        if URI is not None:
            terms.append({'name': term_name, 'URI': URI})
    
    # If we dont find any URI with the ontologies, we search for a URI in ALL ontologies
    if not terms:
        URI = get_URI_from_name(term_name)
        
        if URI is not None:
            terms.append({'name': term_name, 'URI': URI})
    
    
    splitted = re.split(' and |, |; ', term_name)
    
    # If the term_name is a list of terms, we iterate over them
    if not terms and len(splitted) > 1:
        for new_name in splitted:
            terms += get_URIs_from_term(new_name, ontologies)
            return terms
    
    # If none URI has been found, we add a None URI so we know that
    if not terms:
        terms.append({
            'name': term_name,
            'URI': None
        })
    
    return terms

The ontologies we have considered important are `PATO`, `MONDO`, `DOID` and `HP`.

In [10]:
ontologies = ['PATO', 'MONDO', 'DOID', 'HP']

In [11]:
get_URIs_from_term('bronchioalveolar carcinoma; non-small cell lung cancer', ontologies)

Searching URIs for "bronchioalveolar carcinoma; non-small cell lung cancer"
Searching URIs for "bronchioalveolar carcinoma"
Searching URIs for "non-small cell lung cancer"


[{'name': 'bronchioalveolar carcinoma',
  'URI': 'http://purl.obolibrary.org/obo/MONDO_0000503'},
 {'name': 'bronchioalveolar carcinoma',
  'URI': 'http://purl.obolibrary.org/obo/DOID_0050870'},
 {'name': 'non-small cell lung cancer',
  'URI': 'http://purl.obolibrary.org/obo/MONDO_0005233'},
 {'name': 'non-small cell lung cancer',
  'URI': 'http://purl.obolibrary.org/obo/DOID_3908'},
 {'name': 'non-small cell lung cancer',
  'URI': 'http://purl.obolibrary.org/obo/HP_0030358'}]

Now, we can get the URIs of each term name and create a dataframe with the URIs and the names.

In [12]:
terms_URIs_SCEA = []
for name in disease_names_SCEA:
    terms_URIs_SCEA += get_URIs_from_term(name, ontologies)

Searching URIs for "multiple myeloma"
Searching URIs for "pancreatic neoplasm"
Searching URIs for "type II diabetes mellitus"
Searching URIs for "wet macular degeneration"
Searching URIs for "metastatic breast cancer"
Searching URIs for "type I diabetes mellitus"
Searching URIs for "hypocellular myelodysplastic syndrome"
Searching URIs for "Parkinson's disease"
Searching URIs for "tonsilitis"
Searching URIs for "rectal adenocarcinoma"
Searching URIs for "HIV infection"
Searching URIs for "myelodysplastic syndrome"
Searching URIs for "prostate carcinoma"
Searching URIs for "hepatitis C infection"
Searching URIs for "squamous cell lung carcinoma"
Searching URIs for "fibrosis"
Searching URIs for "renal cell carcinoma"
Searching URIs for "bronchioalveolar carcinoma; non-small cell lung cancer"
Searching URIs for "bronchioalveolar carcinoma"
Searching URIs for "non-small cell lung cancer"
Searching URIs for "breast carcinoma"
Searching URIs for "chronic obstructive pulmonary disease"
Search

In [13]:
SCEA_diseases = pd.DataFrame(terms_URIs_SCEA)
SCEA_diseases

Unnamed: 0,name,URI
0,multiple myeloma,http://purl.obolibrary.org/obo/MONDO_0009693
1,multiple myeloma,http://purl.obolibrary.org/obo/DOID_9538
2,multiple myeloma,http://purl.obolibrary.org/obo/HP_0006775
3,pancreatic neoplasm,http://purl.obolibrary.org/obo/MONDO_0021040
4,pancreatic neoplasm,http://purl.obolibrary.org/obo/DOID_1793
...,...,...
98,cecum adenocarcinoma,http://purl.obolibrary.org/obo/MONDO_0006028
99,cecum adenocarcinoma,http://purl.obolibrary.org/obo/DOID_3039
100,large cell lung carcinoma,http://purl.obolibrary.org/obo/MONDO_0003050
101,large cell lung carcinoma,http://purl.obolibrary.org/obo/DOID_4556


Finally, we have to map the diseases names to ontology nomenglature. We can do that using the method `parse_word`  from the class `OntologyConversorSCAE`.

In [14]:
conversor_SCEA = OntologyConversorSCAE()

SCEA_diseases['name'] = SCEA_diseases['name'].apply(conversor_SCEA.parse_word)
SCEA_diseases

Unnamed: 0,name,URI
0,MultipleMyeloma,http://purl.obolibrary.org/obo/MONDO_0009693
1,MultipleMyeloma,http://purl.obolibrary.org/obo/DOID_9538
2,MultipleMyeloma,http://purl.obolibrary.org/obo/HP_0006775
3,PancreaticCancer,http://purl.obolibrary.org/obo/MONDO_0021040
4,PancreaticCancer,http://purl.obolibrary.org/obo/DOID_1793
...,...,...
98,CecumAdenocarcinoma,http://purl.obolibrary.org/obo/MONDO_0006028
99,CecumAdenocarcinoma,http://purl.obolibrary.org/obo/DOID_3039
100,LungCarcinoma,http://purl.obolibrary.org/obo/MONDO_0003050
101,LungCarcinoma,http://purl.obolibrary.org/obo/DOID_4556


# 2. Getting HCA disease terms and URIs

We can use the HCA API-REST (https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1) to obtain all projects info. With that info, we will get the all the terms names.

In [16]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

Iterating over the projects we can get all the terms.

In [17]:
disease_names_HCA = []

for n, hit in enumerate(project_hits):    
    for item in hit['donorOrganisms']:
        disease_names_HCA += item['disease']
            
disease_names_HCA = list(set(disease_names_HCA))
disease_names_HCA

['hyperlipidemia (disease)',
 'osteoarthritis, hip',
 'depressive disorder',
 'Enterococcus faecalis infection',
 'acoustic neuroma',
 'stroke disorder',
 'adrenal cortex adenoma',
 'type 2 diabetes mellitus',
 'reversible cerebral vasoconstriction syndrome',
 'diverticulitis',
 'arthritis',
 'prostate cancer',
 None,
 'melanoma (disease)',
 'cardiac arrest',
 'non-alcoholic fatty liver disease',
 'syndromic dyslipidemia',
 'pericardial effusion (disease)',
 'irritable bowel syndrome',
 'kidney cancer',
 'obstructive sleep apnea syndrome',
 'normal',
 'end stage renal failure',
 'hereditary hemochromatosis',
 'hiatus hernia (disease)',
 'cataract (disease)',
 'acquired aneurysmal subarachnoid hemorrhage',
 'acute kidney tubular necrosis',
 'colitis (disease)',
 'hypertension',
 'ulcerative colitis (disease)',
 'anxiety disorder',
 'pure autonomic failure',
 'orofaciodigital syndrome VIII',
 'gastroesophageal reflux disease',
 'essential hypertension',
 'Lyme disease',
 'ventricular tac

In [20]:
terms_URIs_HCA = []
for name in disease_names_HCA:
    terms_URIs_HCA += get_URIs_from_term(name, ontologies)

Searching URIs for "hyperlipidemia (disease)"
Searching URIs for "osteoarthritis, hip"
Searching URIs for "depressive disorder"
Searching URIs for "Enterococcus faecalis infection"
Searching URIs for "acoustic neuroma"
Searching URIs for "stroke disorder"
Searching URIs for "adrenal cortex adenoma"
Searching URIs for "type 2 diabetes mellitus"
Searching URIs for "reversible cerebral vasoconstriction syndrome"
Searching URIs for "diverticulitis"
Searching URIs for "arthritis"
Searching URIs for "prostate cancer"
Searching URIs for "melanoma (disease)"
Searching URIs for "cardiac arrest"
Searching URIs for "non-alcoholic fatty liver disease"
Searching URIs for "syndromic dyslipidemia"
Searching URIs for "pericardial effusion (disease)"
Searching URIs for "irritable bowel syndrome"
Searching URIs for "kidney cancer"
Searching URIs for "obstructive sleep apnea syndrome"
Searching URIs for "normal"
Searching URIs for "end stage renal failure"
Searching URIs for "hereditary hemochromatosis"


In [21]:
HCA_diseases = pd.DataFrame(terms_URIs_HCA)
HCA_diseases

Unnamed: 0,name,URI
0,hyperlipidemia (disease),http://purl.obolibrary.org/obo/MONDO_0021187
1,"osteoarthritis, hip",http://purl.obolibrary.org/obo/MONDO_0006629
2,depressive disorder,http://purl.obolibrary.org/obo/MONDO_0002050
3,depressive disorder,http://purl.obolibrary.org/obo/DOID_12294
4,depressive disorder,http://purl.obolibrary.org/obo/NBO_0000515
...,...,...
73,asymptomatic dengue,http://purl.obolibrary.org/obo/DOID_0050143
74,benign prostatic hyperplasia (disease),http://purl.obolibrary.org/obo/MONDO_0010811
75,hemolytic-uremic syndrome,http://purl.obolibrary.org/obo/MONDO_0001549
76,hemolytic-uremic syndrome,http://purl.obolibrary.org/obo/DOID_12554


In [22]:
conversor_HCA = OntologyConversorHCA()

HCA_diseases['name'] = SCEA_diseases['name'].apply(conversor_HCA.parse_word)
HCA_diseases

Unnamed: 0,name,URI
0,MultipleMyeloma,http://purl.obolibrary.org/obo/MONDO_0021187
1,MultipleMyeloma,http://purl.obolibrary.org/obo/MONDO_0006629
2,MultipleMyeloma,http://purl.obolibrary.org/obo/MONDO_0002050
3,PancreaticCancer,http://purl.obolibrary.org/obo/DOID_12294
4,PancreaticCancer,http://purl.obolibrary.org/obo/NBO_0000515
...,...,...
73,Glioblastoma,http://purl.obolibrary.org/obo/DOID_0050143
74,LungCarcinoma,http://purl.obolibrary.org/obo/MONDO_0010811
75,LungCarcinoma,http://purl.obolibrary.org/obo/MONDO_0001549
76,LungCarcinoma,http://purl.obolibrary.org/obo/DOID_12554
