In [8]:
import requests
import pandas as pd
import urllib.parse
import json
import time
import re

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

# SCEA diseases

In [9]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [10]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [11]:
answer = requests.get(seed_url, headers=headers)

In [12]:
ontologies = ['EFO', 'PATO', 'MONDO', 'DOID', 'HP']

In [13]:
def get_terms_from_project(experiment_id, name_column, ontology_column):
    terms = []
    terms_names = []
    
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
            
    if name_column in df.columns:
        project_term_names = df[name_column].unique()
        project_term_URIs = df[ontology_column].unique()
    
        # Balance names and URIs
        project_term_URIs = list(project_term_URIs)
        project_term_URIs += [None] * (len(project_term_names) - len(project_term_URIs))
        
        for name, URI in zip(project_term_names, project_term_URIs):
            # If we have already code the cell_type we skip it
            if name in terms_names:
                continue
            
            print(name)
            
            # If we have no info of URI we search for it
            if URI is None or str(URI) == 'nan':
                for ontology in ontologies:
                    
                    URI = get_URI_from_name(name, ontology)
                    time.sleep(1)
                    
                    if URI is not None:
                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                
                if URI is None:
                    URI = get_URI_from_name(name)
                    time.sleep(1)
                    
                    terms.append({
                        'name': name,
                        'URI': URI
                    })
                    
            # Otherwise   
            else:                    
                terms_names.append(name)

                terms.append({
                    'name': name,
                    'URI': URI
                })
                
            # If there are two or more cell types in the name
            if URI is None:
                new_l = re.split(' and |, |; ', name)
                for new_name in new_l:
                    
                    for ontology in ontologies:
                        
                        URI = get_URI_from_name(new_name, ontology)
                        time.sleep(1)
                        
                        if URI is not None:

                            terms.append({
                                'name': new_name,
                                'URI': URI
                            })
                    
                    if URI is None:
                        URI = get_URI_from_name(name)
                        time.sleep(1)

                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                        
                    terms_names.append(new_name)
            

    return terms, terms_names

In [14]:
diseases = []
disease_names = []
avoid_collections = ["Human Cell Atlas"]

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    diseases_pro, disease_names_pro = get_terms_from_project(experiment_id=experiment_id, 
                                                             name_column='Sample Characteristic[disease]', 
                                                             ontology_column='Sample Characteristic Ontology Term[disease]')
    
    diseases += diseases_pro
    disease_names += disease_names_pro
               
    clear_output(wait=True)

df = pd.DataFrame(diseases)

2/181
bronchioalveolar carcinoma; non-small cell lung cancer


ParserError: Document is empty

In [None]:
df = df.drop_duplicates()

In [None]:
df[df.isnull().any(axis=1)].name.tolist()

In [None]:
df[df['name'] == 'bronchioalveolar carcinoma']

In [None]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor = OntologyConversorSCAE()

In [None]:
df['name'] = df['name'].apply(conversor.parse_word)
df

# HCA diseases

In [None]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [None]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [None]:
answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [None]:
projects = []
accessing_error = []

n_projects = len(project_hits)

diseases = []

for n, hit in enumerate(project_hits):
    print(n)
    
    for item in hit['specimens']:
        diseases += item['disease']
        
    clear_output(wait=True)
    
diseases = list(set(diseases))

In [None]:
diseases = [x for x in diseases if x is not None]
diseases

In [None]:
diseases_URIs = list(map(get_URI_from_name, diseases))
diseases_URIs

In [None]:
from OntologyConversorHCA import OntologyConversorHCA

conversor = OntologyConversorHCA()

In [None]:
diseases = list(map(conversor.parse_word, diseases))
diseases

In [None]:
rows = pd.DataFrame({'name': diseases, 'URI': diseases_URIs})
rows

In [None]:
df = df.append(rows, ignore_index=True).drop_duplicates()
df

In [None]:
df.to_csv('../SingleCell-Files/diseases_ontology.csv', index=False, header=False, sep='\t')