In [1]:
import requests
import pandas as pd
import urllib.parse
import json
import time
import re

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

# SCEA diseases

In [2]:
# JSON endpoint that is requested below; its 'experiments' entry is iterated later on
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [3]:
# HTTP headers sent with the request: a browser-like User-Agent string
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [4]:
# Download the experiment listing. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() surfaces an HTTP
# error here instead of later as a confusing JSON decoding failure.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

In [5]:
# Ontology identifiers passed one by one, in this order, to get_URI_from_name
# when a term has to be looked up by name
ontologies = ['PATO', 'MONDO', 'DOID', 'HP']

In [12]:
def _search_term_URIs(name):
    """Look up `name` in the configured ontologies and return every URI found.

    Each entry of the module-level `ontologies` list is queried in turn
    through `get_URI_from_name`. Only when none of them returns a URI is a
    last query made without naming an ontology. A one-second pause follows
    every query.

    Args:
        name: term name to look up.

    Returns:
        list: the URIs found, in query order; empty when every query
        returned None.
    """
    found = []

    for ontology in ontologies:
        URI = get_URI_from_name(name, ontology)
        time.sleep(1)

        if URI is not None:
            found.append(URI)

    # Decide on the fallback from *all* the answers, not just the last one
    if not found:
        URI = get_URI_from_name(name)
        time.sleep(1)

        if URI is not None:
            found.append(URI)

    return found


def get_terms_from_project(experiment_id, name_column, ontology_column):
    """Collect the (name, URI) terms of one experiment's design file.

    The experiment-design table of `experiment_id` is downloaded and the
    distinct (name, URI) combinations of the two requested columns are read
    row-wise, so every name keeps the URI written next to it. Names without
    a URI in the file are looked up by name; if nothing is found and the
    name is a compound ('a and b', 'a, b', 'a; b'), each part is looked up
    separately.

    Args:
        experiment_id: accession inserted into the download URL.
        name_column: column holding the term names.
        ontology_column: column holding the term URIs. When the table lacks
            this column every name is treated as having no URI.

    Returns:
        tuple: `(terms, terms_names)`. `terms` is a list of
        `{'name': ..., 'URI': ...}` dicts; a name (or part of a compound
        name) for which no URI was found is kept with `'URI': None` so it
        can be spotted afterwards. `terms_names` lists every name that was
        processed. Both are empty when `name_column` is missing.
    """
    terms = []
    terms_names = []

    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="

    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)

    display(df)

    # If the project doesn't have the expected column, finish
    if name_column not in df.columns:
        return terms, terms_names

    # Take names and URIs from the same rows; taking the unique values of
    # each column separately would pair a name with another row's URI.
    if ontology_column in df.columns:
        pairs = df[[name_column, ontology_column]].drop_duplicates()
        project_term_names = pairs[name_column].tolist()
        project_term_URIs = pairs[ontology_column].tolist()
    else:
        project_term_names = df[name_column].drop_duplicates().tolist()
        project_term_URIs = [None] * len(project_term_names)

    print(project_term_names)
    print(project_term_URIs)

    # First pass: terms whose URI is given by the file itself
    for name, URI in zip(project_term_names, project_term_URIs):
        if pd.isna(name) or pd.isna(URI):
            continue

        print(name)

        terms.append({
            'name': name,
            'URI': URI
        })

        if name not in terms_names:
            terms_names.append(name)

    # Second pass: names that never came with a URI have to be searched
    for name in project_term_names:
        # Skip empty cells and names that have already been coded
        if pd.isna(name) or name in terms_names:
            continue

        print(name)
        terms_names.append(name)

        found = _search_term_URIs(name)
        if found:
            terms.extend({'name': name, 'URI': URI} for URI in found)
            continue

        # Nothing found: keep the name with an empty URI so it stays visible
        terms.append({
            'name': name,
            'URI': None
        })

        # If there are two or more terms in the name, try each one on its own
        new_l = re.split(' and |, |; ', name)
        if len(new_l) < 2:
            # No separator: the split is the name itself, already searched
            continue

        for new_name in new_l:
            if not new_name or new_name in terms_names:
                continue

            terms_names.append(new_name)

            found = _search_term_URIs(new_name)
            if found:
                terms.extend({'name': new_name, 'URI': URI} for URI in found)
            else:
                terms.append({
                    'name': new_name,
                    'URI': None
                })

    return terms, terms_names

In [17]:
# Walk over every experiment of the listing, skipping the excluded
# collections, and accumulate the disease terms found in each design file.
avoid_collections = ["Human Cell Atlas"]
diseases, disease_names = [], []

experiments = answer.json()['experiments']
n_experiments = len(experiments)

for n, experiment in enumerate(experiments):
    print(f"{n+1}/{n_experiments}")

    # Experiments belonging to an excluded collection are not processed
    if any(project in avoid_collections for project in experiment["experimentProjects"]):
        clear_output(wait=True)
        continue

    experiment_id = experiment['experimentAccession']
    diseases_pro, disease_names_pro = get_terms_from_project(
        experiment_id=experiment_id,
        name_column='Sample Characteristic[disease]',
        ontology_column='Sample Characteristic Ontology Term[disease]',
    )

    diseases.extend(diseases_pro)
    disease_names.extend(disease_names_pro)

    clear_output(wait=True)

# One row per collected {'name', 'URI'} record
df = pd.DataFrame(diseases)

181/181
normal
COVID-19


In [18]:
# Drop rows that are exact repeats of an earlier row
df = df.drop_duplicates()

In [19]:
# List the names of the rows that contain at least one null value
df[df.isnull().any(axis=1)].name.tolist()

['bronchioalveolar carcinoma; non-small cell lung cancer']

In [20]:
# Show the rows recorded under this exact term name
df[df['name'] == 'bronchioalveolar carcinoma']

Unnamed: 0,name,URI
2,bronchioalveolar carcinoma,http://purl.obolibrary.org/obo/MONDO_0000503
3,bronchioalveolar carcinoma,http://purl.obolibrary.org/obo/DOID_0050870


In [21]:
from OntologyConversorSCAE import OntologyConversorSCAE

# Converter instance whose parse_word method is applied to the term names below
conversor = OntologyConversorSCAE()

In [22]:
# Replace every term name by the result of conversor.parse_word, then show the table.
# NOTE(review): this overwrites the 'name' column in place, so re-running the cell
# applies parse_word to already-converted names.
df['name'] = df['name'].apply(conversor.parse_word)
df

Unnamed: 0,name,URI
0,RenalCellCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000681
1,BronchioalveolarCarcinoma,
2,BronchioalveolarCarcinoma,http://purl.obolibrary.org/obo/MONDO_0000503
3,BronchioalveolarCarcinoma,http://purl.obolibrary.org/obo/DOID_0050870
5,NonSmallCellLungCancer,http://www.ebi.ac.uk/efo/EFO_0003060
6,NonSmallCellLungCancer,http://purl.obolibrary.org/obo/MONDO_0005233
7,NonSmallCellLungCancer,http://purl.obolibrary.org/obo/DOID_3908
8,NonSmallCellLungCancer,http://purl.obolibrary.org/obo/HP_0030358
9,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000571
10,Normal,http://purl.obolibrary.org/obo/PATO_0000461


# HCA diseases

In [3]:
# Projects index endpoint requested below (up to 999 entries, catalog 'dcp1');
# its JSON 'hits' entry is read afterwards. Reuses the name `seed_url` from the SCEA section.
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [4]:
# User-Agent taken from the course material.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [5]:
# Download the project index. The timeout avoids hanging on a stalled
# connection, and raise_for_status() reports an HTTP error here rather than
# as a missing "hits" key or a JSON decoding failure.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()
project_hits = answer.json()["hits"]

In [10]:
# Gather the disease labels listed under the donor organisms of every project hit
projects = []
accessing_error = []

n_projects = len(project_hits)

diseases = []

for n, hit in enumerate(project_hits):
    for donor in hit['donorOrganisms']:
        diseases.extend(donor['disease'])

    clear_output(wait=True)

# Keep each label only once
diseases = list(set(diseases))

In [11]:
# Discard the None entry (if any) left among the collected labels
diseases = list(filter(lambda label: label is not None, diseases))
diseases

['arthritis',
 'irritable bowel syndrome',
 'melanoma (disease)',
 'asymptomatic dengue',
 'hemolytic-uremic syndrome',
 'hereditary hemochromatosis',
 'ventricular tachycardia',
 'prostate cancer',
 'depressive disorder',
 'pure autonomic failure',
 'acoustic neuroma',
 'hyperlipidemia (disease)',
 'obstructive sleep apnea syndrome',
 'reversible cerebral vasoconstriction syndrome',
 'essential hypertension',
 'cardiac arrest',
 'gastroesophageal reflux disease',
 'benign prostatic hyperplasia (disease)',
 'acquired aneurysmal subarachnoid hemorrhage',
 'hiatus hernia (disease)',
 'adrenal cortex adenoma',
 'kidney cancer',
 'pericardial effusion (disease)',
 'colitis (disease)',
 'anxiety disorder',
 'osteoarthritis, hip',
 'Enterococcus faecalis infection',
 'ulcerative colitis (disease)',
 'cataract (disease)',
 'hypertension',
 'type 2 diabetes mellitus',
 'orofaciodigital syndrome VIII',
 'stroke disorder',
 'diverticulitis',
 'syndromic dyslipidemia',
 'end stage renal failure',

In [28]:
# Look up one URI per disease label, keeping the order of `diseases`
diseases_URIs = [get_URI_from_name(label) for label in diseases]
diseases_URIs

['http://purl.obolibrary.org/obo/MONDO_0001549',
 'http://purl.obolibrary.org/obo/MONDO_0005101',
 'http://purl.obolibrary.org/obo/MONDO_0010336',
 'http://purl.obolibrary.org/obo/MONDO_0004375',
 'http://purl.obolibrary.org/obo/MONDO_0005292',
 'http://purl.obolibrary.org/obo/MONDO_0005148',
 'http://purl.obolibrary.org/obo/MPATH_458']

In [29]:
from OntologyConversorHCA import OntologyConversorHCA

# Converter instance whose parse_word method is applied to the disease labels below.
# Rebinds the name `conversor` used in the SCEA section.
conversor = OntologyConversorHCA()

In [30]:
# Convert every disease label with conversor.parse_word, keeping the order
diseases = [conversor.parse_word(label) for label in diseases]
diseases

['HemolyticUremicSyndrome',
 'UlcerativeColitis',
 'OrofaciodigitalSyndromeVIII',
 'EndStageRenalFailure',
 'UlcerativeColitis',
 'Type2DiabetesMellitus',
 'Control']

In [31]:
# Build a two-column table pairing each converted name with the URI at the same position.
# Relies on `diseases` and `diseases_URIs` still being in the same order.
rows = pd.DataFrame({'name': diseases, 'URI': diseases_URIs})
rows

Unnamed: 0,name,URI
0,HemolyticUremicSyndrome,http://purl.obolibrary.org/obo/MONDO_0001549
1,UlcerativeColitis,http://purl.obolibrary.org/obo/MONDO_0005101
2,OrofaciodigitalSyndromeVIII,http://purl.obolibrary.org/obo/MONDO_0010336
3,EndStageRenalFailure,http://purl.obolibrary.org/obo/MONDO_0004375
4,UlcerativeColitis,http://purl.obolibrary.org/obo/MONDO_0005292
5,Type2DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0005148
6,Control,http://purl.obolibrary.org/obo/MPATH_458


In [32]:
# Stack the HCA rows under the SCEA table and drop exact repeats.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
df = pd.concat([df, rows], ignore_index=True).drop_duplicates()
df

Unnamed: 0,name,URI
0,RenalCellCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000681
1,BronchioalveolarCarcinoma,
2,BronchioalveolarCarcinoma,http://purl.obolibrary.org/obo/MONDO_0000503
3,BronchioalveolarCarcinoma,http://purl.obolibrary.org/obo/DOID_0050870
4,NonSmallCellLungCancer,http://www.ebi.ac.uk/efo/EFO_0003060
...,...,...
62,OrofaciodigitalSyndromeVIII,http://purl.obolibrary.org/obo/MONDO_0010336
63,EndStageRenalFailure,http://purl.obolibrary.org/obo/MONDO_0004375
64,UlcerativeColitis,http://purl.obolibrary.org/obo/MONDO_0005292
65,Type2DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0005148


In [33]:
# Write the table as tab-separated text, without the index and without a header row
df.to_csv('../SingleCell-Files/diseases_ontology.csv', index=False, header=False, sep='\t')

# Tests

In [13]:
# Manual check: run the extraction on a single experiment and inspect what it returns
diseases_pro, disease_names_pro = get_terms_from_project(experiment_id='E-CURD-55', 
                                                         name_column='Sample Characteristic[disease]', 
                                                         ontology_column='Sample Characteristic Ontology Term[disease]')

Unnamed: 0,Assay,Sample Characteristic[organism],Sample Characteristic Ontology Term[organism],Sample Characteristic[developmental stage],Sample Characteristic Ontology Term[developmental stage],Sample Characteristic[individual],Sample Characteristic Ontology Term[individual],Sample Characteristic[disease],Sample Characteristic Ontology Term[disease],Sample Characteristic[disease staging],Sample Characteristic Ontology Term[disease staging],Sample Characteristic[sampling time point],Sample Characteristic Ontology Term[sampling time point],Sample Characteristic[organism part],Sample Characteristic Ontology Term[organism part],Factor Value[disease],Factor Value Ontology Term[disease],Factor Value[disease staging],Factor Value Ontology Term[disease staging]
0,SAMC150711-AAAAACGACGCTCTTC,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,ERS1,,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,,less than 7 days after negative nucleic acid b...,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,
1,SAMC150711-AAAACGACGCTCTTCC,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,ERS1,,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,,less than 7 days after negative nucleic acid b...,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,
2,SAMC150711-AAACAAGACGCTCTTC,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,ERS1,,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,,less than 7 days after negative nucleic acid b...,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,
3,SAMC150711-AAACACGACGCTCTTC,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,ERS1,,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,,less than 7 days after negative nucleic acid b...,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,
4,SAMC150711-AAACCTGCAAATTGCC,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,ERS1,,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,,less than 7 days after negative nucleic acid b...,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096,early recovery stage,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97583,SAMC191986-TTTGTCATCAAACCGT,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,Healthy_Control_2,,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461,,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461
97584,SAMC191986-TTTGTCATCCAATGGT,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,Healthy_Control_2,,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461,,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461
97585,SAMC191986-TTTGTCATCCTTTCGG,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,Healthy_Control_2,,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461,,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461
97586,SAMC191986-TTTGTCATCGTCGTTC,Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606,adult,http://www.ebi.ac.uk/efo/EFO_0001272,Healthy_Control_2,,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461,,,blood,http://purl.obolibrary.org/obo/UBERON_0000178,normal,http://purl.obolibrary.org/obo/PATO_0000461,healthy control,http://purl.obolibrary.org/obo/PATO_0000461


['COVID-19' 'normal']
['http://purl.obolibrary.org/obo/MONDO_0100096'
 'http://purl.obolibrary.org/obo/PATO_0000461']
COVID-19
normal


In [14]:
# Names returned by the call above
disease_names_pro

['COVID-19', 'normal']