In [1]:
import requests
import pandas as pd
import urllib.parse
import json
import time

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

# SCEA diseases

In [2]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [3]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [4]:
answer = requests.get(seed_url, headers=headers)

In [5]:
diseases = []
disease_names = []
avoid_collections = ["Human Cell Atlas"]

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
    
    if 'Sample Characteristic[disease]' in df.columns:
        diseases_names = df['Sample Characteristic[disease]'].unique()
        diseases_URIs = df['Sample Characteristic Ontology Term[disease]'].unique()
    
        diseases_URIs = list(diseases_URIs)
        diseases_URIs += [None] * (len(diseases_names) - len(diseases_URIs))

        for name, URI in zip(diseases_names, diseases_URIs):
            if name in disease_names:
                continue
            
            print(name)
            
            if URI is None or str(URI) == 'nan':
                URI = get_URI_from_name(name)
            
            disease_names.append(name)
            
            
            diseases.append({
                'name': name,
                'URI': URI
            })    
           
    time.sleep(1)
    
    clear_output(wait=True)

df = pd.DataFrame(diseases)

181/181


In [6]:
df

Unnamed: 0,name,URI
0,renal cell carcinoma,http://www.ebi.ac.uk/efo/EFO_0000681
1,bronchioalveolar carcinoma; non-small cell lun...,
2,lung adenocarcinoma,http://www.ebi.ac.uk/efo/EFO_0000571
3,normal,http://purl.obolibrary.org/obo/PATO_0000461
4,Crohn's disease,http://www.ebi.ac.uk/efo/EFO_0000384
5,non-small cell lung carcinoma,http://www.ebi.ac.uk/efo/EFO_0003060
6,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096
7,myelodysplastic syndrome,http://www.ebi.ac.uk/efo/EFO_0000198
8,refractory anemia with excess blasts,http://www.ebi.ac.uk/efo/EFO_0003811
9,hypocellular myelodysplastic syndrome,http://purl.obolibrary.org/obo/PATO_0000461


In [7]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor = OntologyConversorSCAE()

In [8]:
df['name'] = df['name'].apply(conversor.parse_word)
df

Unnamed: 0,name,URI
0,RenalCellCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000681
1,BronchioalveolarCarcinoma,
2,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000571
3,Normal,http://purl.obolibrary.org/obo/PATO_0000461
4,CrohnsDisease,http://www.ebi.ac.uk/efo/EFO_0000384
5,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0003060
6,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096
7,MyelodysplasticSyndrome,http://www.ebi.ac.uk/efo/EFO_0000198
8,RefractoryAnemiaWithExcessBlasts,http://www.ebi.ac.uk/efo/EFO_0003811
9,MyelodysplasticSyndrome,http://purl.obolibrary.org/obo/PATO_0000461


# HCA diseases

In [9]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [10]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [11]:
answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [12]:
projects = []
accessing_error = []

n_projects = len(project_hits)

diseases = []

for n, hit in enumerate(project_hits):
    print(n)
    
    for item in hit['specimens']:
        diseases += item['disease']
        
    clear_output(wait=True)
    
diseases = list(set(diseases))

27


In [13]:
diseases = [x for x in diseases if x is not None]
diseases

['normal',
 'orofaciodigital syndrome VIII',
 'hemolytic-uremic syndrome',
 'type 2 diabetes mellitus',
 'ulcerative colitis (disease)',
 'colitis (disease)',
 'end stage renal failure']

In [14]:
diseases_URIs = list(map(get_URI_from_name, diseases))
diseases_URIs

['http://purl.obolibrary.org/obo/MPATH_458',
 'http://purl.obolibrary.org/obo/MONDO_0010336',
 'http://purl.obolibrary.org/obo/MONDO_0001549',
 'http://purl.obolibrary.org/obo/MONDO_0005148',
 'http://purl.obolibrary.org/obo/MONDO_0005101',
 'http://purl.obolibrary.org/obo/MONDO_0005292',
 'http://purl.obolibrary.org/obo/MONDO_0004375']

In [15]:
from OntologyConversorHCA import OntologyConversorHCA

conversor = OntologyConversorHCA()

In [16]:
diseases = list(map(conversor.parse_word, diseases))
diseases

['Control',
 'OrofaciodigitalSyndromeVIII',
 'HemolyticUremicSyndrome',
 'Type2DiabetesMellitus',
 'UlcerativeColitis',
 'UlcerativeColitis',
 'EndStageRenalFailure']

In [17]:
rows = pd.DataFrame({'name': diseases, 'URI': diseases_URIs})
rows

Unnamed: 0,name,URI
0,Control,http://purl.obolibrary.org/obo/MPATH_458
1,OrofaciodigitalSyndromeVIII,http://purl.obolibrary.org/obo/MONDO_0010336
2,HemolyticUremicSyndrome,http://purl.obolibrary.org/obo/MONDO_0001549
3,Type2DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0005148
4,UlcerativeColitis,http://purl.obolibrary.org/obo/MONDO_0005101
5,UlcerativeColitis,http://purl.obolibrary.org/obo/MONDO_0005292
6,EndStageRenalFailure,http://purl.obolibrary.org/obo/MONDO_0004375


In [18]:
df = df.append(rows, ignore_index=True).drop_duplicates().dropna()
df

Unnamed: 0,name,URI
0,RenalCellCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000681
2,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000571
3,Normal,http://purl.obolibrary.org/obo/PATO_0000461
4,CrohnsDisease,http://www.ebi.ac.uk/efo/EFO_0000384
5,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0003060
6,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096
7,MyelodysplasticSyndrome,http://www.ebi.ac.uk/efo/EFO_0000198
8,RefractoryAnemiaWithExcessBlasts,http://www.ebi.ac.uk/efo/EFO_0003811
9,MyelodysplasticSyndrome,http://purl.obolibrary.org/obo/PATO_0000461
10,Glioblastoma,http://www.ebi.ac.uk/efo/EFO_0000519


In [19]:
df.to_csv('../SingleCell-Files/diseases_ontology.csv', index=False, header=False, sep='\t')