In [1]:
import requests
import pandas as pd
import urllib.parse
import json
import time
import re

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

# SCEA organism parts

In [2]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [3]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [4]:
answer = requests.get(seed_url, headers=headers)

In [5]:
ontologies = ['EFO', 'UBERON', 'CL', 'PO', 'ZFA']

In [6]:
def get_terms_from_project(experiment_id, name_column, ontology_column):
    terms = []
    terms_names = []
    
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
            
    if name_column in df.columns:
        project_term_names = df[name_column].unique()
        project_term_URIs = df[ontology_column].unique()
    
        # Balance names and URIs
        project_term_URIs = list(project_term_URIs)
        project_term_URIs += [None] * (len(project_term_names) - len(project_term_URIs))
        
        for name, URI in zip(project_term_names, project_term_URIs):
            # If we have already code the cell_type we skip it
            if name in terms_names:
                continue
            
            print(name)
            
            # If we have no info of URI we search for it
            if URI is None or str(URI) == 'nan':
                for ontology in ontologies:
                    
                    URI = get_URI_from_name(name, ontology)
                    time.sleep(1)
                    
                    if URI is not None:
                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                
                if URI is None:
                    URI = get_URI_from_name(name)
                    time.sleep(1)
                    
                    terms.append({
                        'name': name,
                        'URI': URI
                    })
                    
            # Otherwise   
            else:                    
                terms_names.append(name)

                terms.append({
                    'name': name,
                    'URI': URI
                })
                
            # If there are two or more cell types in the name
            if URI is None:
                new_l = re.split(' and |, |; ', name)
                for new_name in new_l:
                    
                    for ontology in ontologies:
                        
                        URI = get_URI_from_name(new_name, ontology)
                        time.sleep(1)
                        
                        if URI is not None:

                            terms.append({
                                'name': new_name,
                                'URI': URI
                            })
                    
                    if URI is None:
                        URI = get_URI_from_name(name)
                        time.sleep(1)

                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                        
                    terms_names.append(new_name)
            

    return terms, terms_names

In [7]:
organism_parts = []
organism_part_names = []
avoid_collections = ["Human Cell Atlas"]

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    organism_parts_pro, organism_part_names_pro = get_terms_from_project(experiment_id=experiment_id, 
                                                             name_column='Sample Characteristic[organism part]', 
                                                             ontology_column='Sample Characteristic Ontology Term[organism part]')
    
    organism_parts += organism_parts_pro
    organism_part_names += organism_part_names_pro
    
    clear_output(wait=True)

df_SCEA = pd.DataFrame(organism_parts)

181/181
blood


In [8]:
df_SCEA = df_SCEA.drop_duplicates()

In [9]:
df_SCEA[df_SCEA.isnull().any(axis=1)].name.tolist()

['smooth muscle, peri-urethral mesenchyme and urethral epithelium',
 'dorsal medial ganglionic eminence',
 'pigmented layer of retina and optic choroid',
 'primary visual area, layer 1, layer 2/3 and layer 4',
 'primary visual area, layer 1',
 'brain without olfactory bulb']

In [10]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor_SCAE = OntologyConversorSCAE()

In [11]:
df = df_SCEA
df['name'] = df['name'].apply(conversor_SCAE.parse_word)
df

Unnamed: 0,name,URI
0,Kidney,http://purl.obolibrary.org/obo/UBERON_0002113
1,Bronchiole,http://purl.obolibrary.org/obo/UBERON_0002186
2,MiddleLobeOfRightLung,http://purl.obolibrary.org/obo/UBERON_0002174
3,Lung,http://purl.obolibrary.org/obo/UBERON_0002048
4,PedalDigit,http://purl.obolibrary.org/obo/UBERON_0006051
...,...,...
452,Caecum,http://purl.obolibrary.org/obo/UBERON_0001153
453,RectoSigmoidJunction,http://purl.obolibrary.org/obo/UBERON_0036214
454,SigmoidColon,http://purl.obolibrary.org/obo/UBERON_0001159
456,AscendingColon,http://purl.obolibrary.org/obo/UBERON_0001156


# HCA diseases

In [12]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [13]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [14]:
answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [15]:
n_projects = len(project_hits)

organism_parts = []

for n, hit in enumerate(project_hits):
    print(n)
    
    for item in hit['specimens']:
        organism_parts += item['organ']
        organism_parts += item['organPart']
        
    clear_output(wait=True)
    
organism_parts = list(set(organism_parts))

27


In [16]:
organism_parts = [x for x in organism_parts if x is not None]
organism_parts

['caudate lobe',
 'esophagus',
 'renal medulla',
 'parietal endoderm',
 'lymph node',
 'lung parenchyma',
 'lower lobe of left lung',
 'cortex',
 'peripheral blood mononuclear cell',
 'mammary gland',
 'presumptive gut',
 'lower lobe of right lung',
 'hematopoietic system',
 'umbilical cord blood',
 'venous blood',
 'skin of body',
 'Left lateral basal bronchopulmonary segment',
 'decidua',
 'placenta',
 'mediastinal lymph node',
 'kidney',
 'trachea',
 'blood',
 'blastocyst',
 'ureter',
 'bladder organ',
 'cortex of kidney',
 'islet of Langerhans',
 'immune system',
 'retina',
 'visceral endoderm',
 'lung',
 'bone tissue',
 'heart',
 'tongue',
 'muscle organ',
 'diaphragm',
 'definitive endoderm',
 'pancreas',
 'retinal neural layer',
 'renal pelvis',
 'large intestine',
 'eye',
 'bone marrow',
 'endoderm',
 'esophagus mucosa',
 'brain',
 'spleen',
 'colon',
 'thymus',
 'fovea centralis',
 'liver',
 'yolk sac endoderm',
 'lamina propria of mucosa of colon',
 'embryo',
 'adipose tissue

In [17]:
organism_parts_URIs = list(map(get_URI_from_name, organism_parts))
organism_parts_URIs

['http://purl.obolibrary.org/obo/EHDA_10432',
 'http://purl.obolibrary.org/obo/UBERON_0001043',
 'http://purl.obolibrary.org/obo/UBERON_0000362',
 'http://purl.obolibrary.org/obo/UBERON_0008800',
 'http://purl.obolibrary.org/obo/BTO_0000784',
 'http://purl.obolibrary.org/obo/UBERON_0008946',
 'http://purl.obolibrary.org/obo/UBERON_0008953',
 'http://purl.obolibrary.org/obo/PO_0005708',
 'http://purl.obolibrary.org/obo/CL_2000001',
 'http://purl.obolibrary.org/obo/UBERON_0001911',
 'http://purl.obolibrary.org/obo/UBERON_0007026',
 'http://purl.obolibrary.org/obo/UBERON_0002171',
 'http://purl.obolibrary.org/obo/UBERON_0002390',
 'http://purl.obolibrary.org/obo/UBERON_0012168',
 'http://purl.obolibrary.org/obo/UBERON_0013756',
 'http://purl.obolibrary.org/obo/UBERON_0002097',
 'http://purl.obolibrary.org/obo/FMA_7379',
 'http://purl.obolibrary.org/obo/UBERON_0002450',
 'http://purl.obolibrary.org/obo/BTO_0001078',
 'http://purl.obolibrary.org/obo/UBERON_0002524',
 'http://purl.obolibrary

In [18]:
from OntologyConversorHCA import OntologyConversorHCA

conversor_HCA = OntologyConversorHCA()

In [19]:
organism_parts = list(map(conversor_HCA.parse_word, organism_parts))
organism_parts

['CaudateLobe',
 'Esophagus',
 'RenalMedulla',
 'ParietalEndoderm',
 'LymphNode',
 'LungParenchyma',
 'LowerLobeOfLeftLung',
 'Cortex',
 'PeripheralBloodMononuclearCell',
 'MammaryGland',
 'presumptiveGut',
 'LowerLobeOfRightLung',
 'HematopoieticSystem',
 'UmbilicalCordBlood',
 'VenousBlood',
 'Skin',
 'LeftLateralBasalBronchopulmonarySegment',
 'Decidua',
 'Placenta',
 'MediastinalLymphNode',
 'Kidney',
 'Trachea',
 'Blood',
 'Blastocyst',
 'Ureter',
 'Bladder',
 'CortexOfKidney',
 'IsletOfLangerhans',
 'ImmuneSystem',
 'Retina',
 'VisceralEndoderm',
 'Lung',
 'BoneTissue',
 'Heart',
 'Tongue',
 'Muscle',
 'Diaphragm',
 'DefinitiveEndoderm',
 'Pancreas',
 'RetinalNeuralLayer',
 'RenalPelvis',
 'LargeIntestine',
 'Eye',
 'BoneMarrow',
 'Endoderm',
 'EsophagusMucosa',
 'Brain',
 'Spleen',
 'Colon',
 'Thymus',
 'FoveaCentralis',
 'Liver',
 'YolkSacEndoderm',
 'LaminaPropriaOfMucosaOfColon',
 'Embryo',
 'AdiposeTissue',
 'Tumor',
 'Epidermis']

In [20]:
rows = pd.DataFrame({'name': organism_parts, 'URI': organism_parts_URIs})
rows

Unnamed: 0,name,URI
0,CaudateLobe,http://purl.obolibrary.org/obo/EHDA_10432
1,Esophagus,http://purl.obolibrary.org/obo/UBERON_0001043
2,RenalMedulla,http://purl.obolibrary.org/obo/UBERON_0000362
3,ParietalEndoderm,http://purl.obolibrary.org/obo/UBERON_0008800
4,LymphNode,http://purl.obolibrary.org/obo/BTO_0000784
5,LungParenchyma,http://purl.obolibrary.org/obo/UBERON_0008946
6,LowerLobeOfLeftLung,http://purl.obolibrary.org/obo/UBERON_0008953
7,Cortex,http://purl.obolibrary.org/obo/PO_0005708
8,PeripheralBloodMononuclearCell,http://purl.obolibrary.org/obo/CL_2000001
9,MammaryGland,http://purl.obolibrary.org/obo/UBERON_0001911


In [21]:
df = df.append(rows, ignore_index=True).drop_duplicates()
df

Unnamed: 0,name,URI
0,Kidney,http://purl.obolibrary.org/obo/UBERON_0002113
1,Bronchiole,http://purl.obolibrary.org/obo/UBERON_0002186
2,MiddleLobeOfRightLung,http://purl.obolibrary.org/obo/UBERON_0002174
3,Lung,http://purl.obolibrary.org/obo/UBERON_0002048
4,PedalDigit,http://purl.obolibrary.org/obo/UBERON_0006051
...,...,...
204,FoveaCentralis,http://purl.obolibrary.org/obo/UBERON_0001786
206,YolkSacEndoderm,http://purl.obolibrary.org/obo/UBERON_0003257
207,LaminaPropriaOfMucosaOfColon,http://purl.obolibrary.org/obo/UBERON_0007177
208,Embryo,http://purl.obolibrary.org/obo/IDOMAL_0000646


In [22]:
df.to_csv('../SingleCell-Files/organism_parts_ontology.csv', index=False, header=False, sep='\t')