In [28]:
import requests
import pandas as pd
import urllib.parse
import json
import time
import re

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

# SCEA diseases

In [29]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [30]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [31]:
answer = requests.get(seed_url, headers=headers)

In [35]:
def get_terms_from_project(experiment_id, name_column, ontology_column):
    terms = []
    terms_names = []
    
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
            
    if name_column in df.columns:
        project_term_names = df[name_column].unique()
        project_term_URIs = df[ontology_column].unique()
    
        # Balance names and URIs
        project_term_URIs = list(project_term_URIs)
        project_term_URIs += [None] * (len(project_term_names) - len(project_term_URIs))
        
        for name, URI in zip(project_term_names, project_term_URIs):
            # If we have already code the cell_type we skip it
            if name in terms_names:
                continue
            
            print(name)
            
            # If we have no info of URI we search for it
            if URI is None or str(URI) == 'nan':
                URI = get_URI_from_name(name)
                time.sleep(1)
            
            # If there are two cell types in the name
            if URI is None:
                new_l = re.split(' and |, |; ', name)
                for new_name in new_l:
                    URI = get_URI_from_name(new_name)
                    time.sleep(1)
                    
                    terms_names.append(new_name)

                    terms.append({
                        'name': new_name,
                        'URI': URI
                    })
            # Otherwise   
            else:                    
                terms_names.append(name)

                terms.append({
                    'name': name,
                    'URI': URI
                })
    return terms, terms_names

In [36]:
diseases = []
disease_names = []
avoid_collections = ["Human Cell Atlas"]

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    diseases_pro, disease_names_pro = get_terms_from_project(experiment_id=experiment_id, 
                                                             name_column='Sample Characteristic[disease]', 
                                                             ontology_column='Sample Characteristic Ontology Term[disease]')
    
    diseases += diseases_pro
    disease_names += disease_names_pro
               
    clear_output(wait=True)

df = pd.DataFrame(diseases)

181/181
normal
COVID-19


In [38]:
df[df.isnull().any(axis=1)].name.tolist()

['metastatic breast cancer', 'chronic phase chronic myeloid leukemia']

In [39]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor = OntologyConversorSCAE()

In [40]:
df['name'] = df['name'].apply(conversor.parse_word)
df

Unnamed: 0,name,URI
0,RenalCellCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000681
1,BronchioalveolarCarcinoma,http://purl.obolibrary.org/obo/DOID_0050870
2,Non-SmallCellLungCancer,http://www.ebi.ac.uk/efo/EFO_0003060
3,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000571
4,Normal,http://purl.obolibrary.org/obo/PATO_0000461
...,...,...
111,PancreaticCancer,http://www.ebi.ac.uk/efo/EFO_0003860
112,Normal,http://purl.obolibrary.org/obo/PATO_0000461
113,OvarianCarcinoma,http://www.ebi.ac.uk/efo/EFO_0001075
114,Normal,http://purl.obolibrary.org/obo/PATO_0000461


# HCA diseases

In [41]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [42]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [43]:
answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [44]:
projects = []
accessing_error = []

n_projects = len(project_hits)

diseases = []

for n, hit in enumerate(project_hits):
    print(n)
    
    for item in hit['specimens']:
        diseases += item['disease']
        
    clear_output(wait=True)
    
diseases = list(set(diseases))

27


In [45]:
diseases = [x for x in diseases if x is not None]
diseases

['normal',
 'orofaciodigital syndrome VIII',
 'hemolytic-uremic syndrome',
 'type 2 diabetes mellitus',
 'ulcerative colitis (disease)',
 'colitis (disease)',
 'end stage renal failure']

In [46]:
diseases_URIs = list(map(get_URI_from_name, diseases))
diseases_URIs

['http://purl.obolibrary.org/obo/MPATH_458',
 'http://purl.obolibrary.org/obo/MONDO_0010336',
 'http://purl.obolibrary.org/obo/MONDO_0001549',
 'http://purl.obolibrary.org/obo/MONDO_0005148',
 'http://purl.obolibrary.org/obo/MONDO_0005101',
 'http://purl.obolibrary.org/obo/MONDO_0005292',
 'http://purl.obolibrary.org/obo/MONDO_0004375']

In [47]:
from OntologyConversorHCA import OntologyConversorHCA

conversor = OntologyConversorHCA()

In [48]:
diseases = list(map(conversor.parse_word, diseases))
diseases

['Control',
 'OrofaciodigitalSyndromeVIII',
 'HemolyticUremicSyndrome',
 'Type2DiabetesMellitus',
 'UlcerativeColitis',
 'UlcerativeColitis',
 'EndStageRenalFailure']

In [49]:
rows = pd.DataFrame({'name': diseases, 'URI': diseases_URIs})
rows

Unnamed: 0,name,URI
0,Control,http://purl.obolibrary.org/obo/MPATH_458
1,OrofaciodigitalSyndromeVIII,http://purl.obolibrary.org/obo/MONDO_0010336
2,HemolyticUremicSyndrome,http://purl.obolibrary.org/obo/MONDO_0001549
3,Type2DiabetesMellitus,http://purl.obolibrary.org/obo/MONDO_0005148
4,UlcerativeColitis,http://purl.obolibrary.org/obo/MONDO_0005101
5,UlcerativeColitis,http://purl.obolibrary.org/obo/MONDO_0005292
6,EndStageRenalFailure,http://purl.obolibrary.org/obo/MONDO_0004375


In [50]:
df = df.append(rows, ignore_index=True).drop_duplicates().dropna()
df

Unnamed: 0,name,URI
0,RenalCellCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000681
1,BronchioalveolarCarcinoma,http://purl.obolibrary.org/obo/DOID_0050870
2,Non-SmallCellLungCancer,http://www.ebi.ac.uk/efo/EFO_0003060
3,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0000571
4,Normal,http://purl.obolibrary.org/obo/PATO_0000461
6,CrohnsDisease,http://www.ebi.ac.uk/efo/EFO_0000384
8,LungCarcinoma,http://www.ebi.ac.uk/efo/EFO_0003060
9,COVID-19,http://purl.obolibrary.org/obo/MONDO_0100096
11,MyelodysplasticSyndrome,http://www.ebi.ac.uk/efo/EFO_0000198
12,RefractoryAnemiaWithExcessBlasts,http://www.ebi.ac.uk/efo/EFO_0003811


In [51]:
df.to_csv('../SingleCell-Files/diseases_ontology.csv', index=False, header=False, sep='\t')