In [23]:
import requests
import time
import re
import pandas as pd

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name

# 1. Getting SCEA disease terms and URIs

First we will define a function with which we will obtain all the terms of the project of SCEA. In this case we will use this function to obtain all the diseases of a project.

In [15]:
def get_terms_from_project(experiment_id, name_column):
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
    
    # If project doesnt have the expected column, finish
    if name_column not in df.columns:
        return []
    
    project_term_names = df[name_column].unique()

    return list(project_term_names)

Now we will loop over all projects of SCEA so we get all the disease terms used. Using the url https://www.ebi.ac.uk/gxa/sc/json/experiments we obtain all projects.

In [1]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [2]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [4]:
answer = requests.get(seed_url, headers=headers)

And we can iterate over `experiments` and get the terms of each project.

In [16]:
diseases = []
disease_names = []
avoid_collections = ["Human Cell Atlas"]

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    disease_names_pro = get_terms_from_project(experiment_id=experiment_id, 
                                               name_column='Sample Characteristic[disease]')

    disease_names += disease_names_pro
    
    time.sleep(1)
    clear_output(wait=True)

disease_names

['renal cell carcinoma',
 'bronchioalveolar carcinoma; non-small cell lung cancer',
 'lung adenocarcinoma',
 'normal',
 'normal',
 "Crohn's disease",
 'lung adenocarcinoma',
 'non-small cell lung carcinoma',
 'COVID-19',
 'normal',
 'myelodysplastic syndrome',
 'refractory anemia with excess blasts',
 'hypocellular myelodysplastic syndrome',
 'normal',
 'normal',
 'glioblastoma',
 'melanoma',
 'type II diabetes mellitus',
 'normal',
 'multiple myeloma',
 'normal',
 'HIV infection',
 'normal',
 'normal',
 'normal',
 'small intestine neuroendocrine tumor',
 'cecum adenocarcinoma',
 'rectal adenocarcinoma',
 'chronic obstructive pulmonary disease',
 'normal',
 'normal',
 'normal',
 'normal',
 'wet macular degeneration',
 'normal',
 'head and neck squamous cell carcinoma',
 'normal',
 'COVID-19',
 'influenza',
 'normal',
 'COVID-19',
 'normal',
 'obstructive sleep apnea syndrome',
 'normal',
 'metastatic breast cancer',
 'breast carcinoma',
 'normal',
 'chronic phase chronic myeloid leukem

As we can see, there are so many terms repeated. We can delete the duplicated ones so they are useless.

In [17]:
disease_names_uniques = list(set(disease_names))
disease_names_uniques

['idiopathic pulmonary fibrosis',
 'large cell lung carcinoma',
 'small intestine neuroendocrine tumor',
 'obstructive sleep apnea syndrome',
 'squamous cell lung carcinoma',
 'brain glioblastoma',
 'pancreatic neoplasm',
 'ovarian serous adenocarcinoma',
 "Parkinson's disease",
 "Crohn's disease",
 'non-small cell lung carcinoma',
 'hepatitis C infection',
 'myxoid liposarcoma',
 'ovarian carcinoma',
 'myelodysplastic syndrome',
 'melanoma',
 'HIV infection',
 'lung carcinoma',
 'not applicable',
 'metastatic breast cancer',
 'hypocellular myelodysplastic syndrome',
 'B cell acute lymphoblastic leukemia',
 'renal cell carcinoma',
 'influenza',
 'chronic obstructive pulmonary disease',
 'breast carcinoma',
 'lung adenocarcinoma',
 'normal',
 'tonsilitis',
 'colorectal cancer',
 'cecum adenocarcinoma',
 'head and neck squamous cell carcinoma',
 'bronchioalveolar carcinoma; non-small cell lung cancer',
 'type II diabetes mellitus',
 'multiple myeloma',
 'prostate carcinoma',
 'fibrosis',

The next step is to search the URIs of each term. We define the function `get_URIs_from_term` to fulfill that goal, using the function `get_URI_from_name` to search the URI in a set of ontologies.

In [56]:
def get_URIs_from_term(term_name, ontologies):
    print(f'Searching URIs for "{term_name}"')
    terms = []
    
    # First get the URIs of all ontologies
    for ontology in ontologies:
        URI = get_URI_from_name(term_name, ontology)
        time.sleep(1)

        if URI is not None:
            terms.append({'name': term_name, 'URI': URI})
    
    # If we dont find any URI with the ontologies, we search for a URI in ALL ontologies
    if not terms:
        URI = get_URI_from_name(term_name)
        
        if URI is not None:
            terms.append({'name': term_name, 'URI': URI})
    
    
    splitted = re.split(' and |, |; ', term_name)
    
    # If the term_name is a list of terms, we iterate over them
    if not terms and len(splitted) > 1:
        for new_name in splitted:
            terms += get_URIs_from_term(new_name, ontologies)
        
    return terms

The ontologies we have considered important are `PATO`, `MONDO`, `DOID` and `HP`.

In [57]:
ontologies = ['PATO', 'MONDO', 'DOID', 'HP']

In [58]:
get_URIs_from_term('bronchioalveolar carcinoma; non-small cell lung cancer', ontologies)

Searching URIs for "bronchioalveolar carcinoma; non-small cell lung cancer"
Searching URIs for "bronchioalveolar carcinoma"
Searching URIs for "non-small cell lung cancer"


[{'name': 'bronchioalveolar carcinoma',
  'URI': 'http://purl.obolibrary.org/obo/MONDO_0000503'},
 {'name': 'bronchioalveolar carcinoma',
  'URI': 'http://purl.obolibrary.org/obo/DOID_0050870'},
 {'name': 'non-small cell lung cancer',
  'URI': 'http://purl.obolibrary.org/obo/MONDO_0005233'},
 {'name': 'non-small cell lung cancer',
  'URI': 'http://purl.obolibrary.org/obo/DOID_3908'},
 {'name': 'non-small cell lung cancer',
  'URI': 'http://purl.obolibrary.org/obo/HP_0030358'}]

Now, we can get the URIs of each term name and create a dataframe with the URIs and the names.

In [59]:
terms_URIs = []
for name in disease_names_uniques:
    term_URIs = get_URIs_from_term(name, ontologies)
    
    terms_URIs += term_URIs

Searching URIs for "idiopathic pulmonary fibrosis"
Searching URIs for "large cell lung carcinoma"
Searching URIs for "small intestine neuroendocrine tumor"
Searching URIs for "obstructive sleep apnea syndrome"
Searching URIs for "squamous cell lung carcinoma"
Searching URIs for "brain glioblastoma"
Searching URIs for "pancreatic neoplasm"
Searching URIs for "ovarian serous adenocarcinoma"
Searching URIs for "Parkinson's disease"
Searching URIs for "Crohn's disease"
Searching URIs for "non-small cell lung carcinoma"
Searching URIs for "hepatitis C infection"
Searching URIs for "myxoid liposarcoma"
Searching URIs for "ovarian carcinoma"
Searching URIs for "myelodysplastic syndrome"
Searching URIs for "melanoma"
Searching URIs for "HIV infection"
Searching URIs for "lung carcinoma"
Searching URIs for "not applicable"
Searching URIs for "metastatic breast cancer"
Searching URIs for "hypocellular myelodysplastic syndrome"
Searching URIs for "B cell acute lymphoblastic leukemia"
Searching UR

In [65]:
SCEA_diseases = pd.DataFrame(terms_URIs)
SCEA_diseases

Unnamed: 0,name,URI
0,idiopathic pulmonary fibrosis,http://purl.obolibrary.org/obo/MONDO_0008345
1,idiopathic pulmonary fibrosis,http://purl.obolibrary.org/obo/DOID_0050156
2,large cell lung carcinoma,http://purl.obolibrary.org/obo/MONDO_0003050
3,large cell lung carcinoma,http://purl.obolibrary.org/obo/DOID_4556
4,large cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030360
...,...,...
98,glioblastoma,http://purl.obolibrary.org/obo/HP_0012174
99,fetal growth restriction,http://purl.obolibrary.org/obo/MONDO_0005030
100,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096
101,COVID-19,http://purl.obolibrary.org/obo/DOID_0080600


In [67]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor = OntologyConversorSCAE()

SyntaxError: invalid syntax (<ipython-input-67-398f37f70398>, line 1)

# 2. Getting HCA disease terms and URIs

We can use the HCA API-REST (https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1) to obtain all projects info. With that info, we will get the all the terms names.

In [None]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [None]:
projects = []
accessing_error = []

n_projects = len(project_hits)

disease_names_HCA = []

for n, hit in enumerate(project_hits):    
    for item in hit['donorOrganisms']:
        disease_names_HCA += item['disease']
        
    clear_output(wait=True)
    
disease_names_HCA = list(set(disease_names_HCA))