In [20]:
import requests
import pandas as pd
import urllib.parse
import json
import time
import re

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

# SCEA organism parts

In [23]:
# Single Cell Expression Atlas endpoint that returns the experiment listing as JSON
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [24]:
# Request headers: a browser-like User-Agent string sent with the HTTP request
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [25]:
# Download the experiment listing. The timeout keeps the notebook from hanging
# forever on a stalled connection, and raise_for_status() reports an HTTP error
# here instead of as an obscure JSON/KeyError in a later cell.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

In [26]:
def get_terms_from_project(experiment_id, name_column, ontology_column):
    """Collect the (name, ontology URI) terms annotated in one SCEA experiment.

    Downloads the experiment-design table of ``experiment_id`` and returns one
    term per distinct value of ``name_column``. The URI of a term is read from
    ``ontology_column`` on the same row as the name; when the table has no URI
    for a name it is looked up with ``get_URI_from_name``. A name that still
    has no URI is split on ``' and '``, ``', '`` and ``'; '`` and every
    fragment is looked up on its own.

    Parameters
    ----------
    experiment_id : str
        Experiment accession, interpolated into the download URL.
    name_column : str
        Column of the design table holding the term names.
    ontology_column : str
        Column of the design table holding the URI annotated for each name.
        It may be absent, in which case every name is looked up.

    Returns
    -------
    tuple[list[dict], list]
        ``terms`` - dicts with the keys ``'name'`` and ``'URI'`` (``URI`` is
        ``None`` when nothing was found) - and ``terms_names``, the names in
        the same order. Both are empty when ``name_column`` is absent.
    """
    terms = []
    terms_names = []

    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="

    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)

    if name_column not in df.columns:
        return terms, terms_names

    # Keep each name together with the URI found on the SAME row. Taking the
    # unique values of both columns separately and zipping them by position
    # pairs a name with another name's URI as soon as two names share a URI.
    has_URIs = ontology_column in df.columns
    pairs = df[[name_column, ontology_column]] if has_URIs else df[[name_column]]
    pairs = pairs.drop_duplicates()
    names = pairs[name_column]
    URIs = pairs[ontology_column] if has_URIs else [None] * len(pairs)

    # name -> annotated URI (None when missing), in order of first appearance
    annotated = {}
    for name, URI in zip(names, URIs):
        if pd.isna(name):
            # Rows without a value carry no term at all
            continue
        if pd.isna(URI):
            annotated.setdefault(name, None)
        elif annotated.get(name) is None:
            annotated[name] = URI

    def _record(term_name, term_URI):
        # Append one term to both result lists, keeping them aligned
        terms_names.append(term_name)
        terms.append({
            'name': term_name,
            'URI': term_URI
        })

    for name, URI in annotated.items():
        # The name may already be recorded as a fragment of a compound name
        if name in terms_names:
            continue

        print(name)

        # If the table has no URI for the name we search for it
        if URI is None:
            URI = get_URI_from_name(name)
            time.sleep(1)  # stay polite with the lookup service

        if URI is not None:
            _record(name, URI)
            continue

        # Still nothing: the name may list several terms at once
        fragments = re.split(' and |, |; ', str(name))
        if len(fragments) == 1:
            # Nothing to split; a second lookup of the same name is pointless
            _record(name, None)
            continue

        for new_name in fragments:
            if not new_name or new_name in terms_names:
                continue
            URI = get_URI_from_name(new_name)
            time.sleep(1)
            _record(new_name, URI)

    return terms, terms_names

In [29]:
organism_parts = []
organism_part_names = []
# Experiments belonging to any of these collections are not harvested here
avoid_collections = ["Human Cell Atlas"]

# Decode the JSON payload once instead of re-parsing it for the count and the loop
experiments = answer.json()['experiments']
n_experiments = len(experiments)

for n, experiment in enumerate(experiments):
    print(f"{n+1}/{n_experiments}")

    if any(project in avoid_collections for project in experiment["experimentProjects"]):
        clear_output(wait=True)
        continue

    experiment_id = experiment['experimentAccession']
    organism_parts_pro, organism_part_names_pro = get_terms_from_project(
        experiment_id=experiment_id,
        name_column='Sample Characteristic[organism part]',
        ontology_column='Sample Characteristic Ontology Term[organism part]',
    )

    organism_parts += organism_parts_pro
    organism_part_names += organism_part_names_pro

    # Only the progress of the current experiment stays visible
    clear_output(wait=True)

# Explicit columns keep 'name' and 'URI' present even when nothing was collected
df_SCEA = pd.DataFrame(organism_parts, columns=['name', 'URI'])

181/181
blood


In [31]:
# Names of the terms whose row has at least one missing value
df_SCEA.loc[df_SCEA.isnull().any(axis=1), 'name'].tolist()

['peri-urethral mesenchyme',
 'dorsal medial ganglionic eminence',
 'brain without olfactory bulb',
 'brain without olfactory bulb']

In [32]:
# Project-local converter; its parse_word method is applied to the SCEA term names
from OntologyConversorSCAE import OntologyConversorSCAE

conversor_SCAE = OntologyConversorSCAE()

In [33]:
# Derive a new frame instead of aliasing df_SCEA: df_SCEA keeps the raw names,
# so re-running this cell never converts already-converted names again.
df = df_SCEA.assign(name=df_SCEA['name'].apply(conversor_SCAE.parse_word))
df

Unnamed: 0,name,URI
0,Kidney,http://purl.obolibrary.org/obo/UBERON_0002113
1,Bronchiole,http://purl.obolibrary.org/obo/UBERON_0002186
2,MiddleLobeOfRightLung,http://purl.obolibrary.org/obo/UBERON_0002174
3,Lung,http://purl.obolibrary.org/obo/UBERON_0002048
4,PedalDigit,http://purl.obolibrary.org/obo/UBERON_0006051
...,...,...
232,Pancreas,http://purl.obolibrary.org/obo/UBERON_0001264
233,PeritonealFluid,http://purl.obolibrary.org/obo/UBERON_0001268
234,Hemolymph,http://purl.obolibrary.org/obo/FBbt_00001683
235,Heart,http://purl.obolibrary.org/obo/UBERON_0000948


# HCA organism parts

In [34]:
# Human Cell Atlas Azul service: project index, requested with size=999 for the dcp1 catalog
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [35]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [36]:
# Download the project index. The timeout prevents an endless wait on a stalled
# connection, and raise_for_status() stops on an HTTP error before the body is
# decoded as JSON.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()
project_hits = answer.json()["hits"]

In [37]:
n_projects = len(project_hits)

organism_parts = []

# Gather every organ and organ part listed by the specimens of each project
for n, hit in enumerate(project_hits):
    print(n)

    for item in hit['specimens']:
        organism_parts += item['organ']
        organism_parts += item['organPart']

    clear_output(wait=True)

# De-duplicate while keeping first-seen order. A set would reorder the strings
# differently on every kernel start, making the exported file non-reproducible.
organism_parts = list(dict.fromkeys(organism_parts))

27


In [38]:
# Discard the None entries before the names are looked up
organism_parts = list(filter(lambda part: part is not None, organism_parts))
organism_parts

['bone marrow',
 'diaphragm',
 'bladder organ',
 'umbilical cord blood',
 'thymus',
 'ureter',
 'Left lateral basal bronchopulmonary segment',
 'renal pelvis',
 'definitive endoderm',
 'parietal endoderm',
 'adipose tissue',
 'mammary gland',
 'fovea centralis',
 'visceral endoderm',
 'blood',
 'islet of Langerhans',
 'lamina propria of mucosa of colon',
 'bone tissue',
 'cortex of kidney',
 'renal medulla',
 'brain',
 'pancreas',
 'lower lobe of right lung',
 'caudate lobe',
 'liver',
 'spleen',
 'skin epidermis',
 'cortex',
 'decidua',
 'placenta',
 'lower lobe of left lung',
 'presumptive gut',
 'blastocyst',
 'skin of body',
 'lung',
 'esophagus',
 'tongue',
 'heart',
 'retinal neural layer',
 'immune system',
 'embryo',
 'esophagus mucosa',
 'lung parenchyma',
 'large intestine',
 'tumor',
 'venous blood',
 'colon',
 'mediastinal lymph node',
 'kidney',
 'eye',
 'endoderm',
 'peripheral blood mononuclear cell',
 'muscle organ',
 'retina',
 'trachea',
 'lymph node',
 'hematopoietic

In [39]:
# One URI lookup per name, in the same order as organism_parts
organism_parts_URIs = [get_URI_from_name(part) for part in organism_parts]
organism_parts_URIs

['http://purl.obolibrary.org/obo/UBERON_0002371',
 'http://purl.obolibrary.org/obo/UBERON_0001103',
 'http://purl.obolibrary.org/obo/UBERON_0018707',
 'http://purl.obolibrary.org/obo/UBERON_0012168',
 'http://purl.obolibrary.org/obo/UBERON_0002370',
 'http://purl.obolibrary.org/obo/UBERON_0000056',
 'http://purl.obolibrary.org/obo/FMA_7379',
 'http://purl.obolibrary.org/obo/UBERON_0001224',
 'http://purl.obolibrary.org/obo/UBERON_0005439',
 'http://purl.obolibrary.org/obo/UBERON_0008800',
 'http://purl.obolibrary.org/obo/UBERON_0001013',
 'http://purl.obolibrary.org/obo/UBERON_0001911',
 'http://purl.obolibrary.org/obo/UBERON_0001786',
 'http://purl.obolibrary.org/obo/UBERON_0004877',
 'http://purl.obolibrary.org/obo/NCRO_0000105',
 'http://purl.obolibrary.org/obo/UBERON_0000006',
 'http://purl.obolibrary.org/obo/UBERON_0007177',
 'http://purl.obolibrary.org/obo/UBERON_0002481',
 'http://purl.obolibrary.org/obo/UBERON_0001225',
 'http://purl.obolibrary.org/obo/UBERON_0000362',
 'http:/

In [40]:
# Project-local converter; its parse_word method is applied to the HCA names
from OntologyConversorHCA import OntologyConversorHCA

conversor_HCA = OntologyConversorHCA()

In [41]:
# Replace each raw name by the form returned by the HCA converter
organism_parts = [conversor_HCA.parse_word(part) for part in organism_parts]
organism_parts

['BoneMarrow',
 'Diaphragm',
 'Bladder',
 'UmbilicalCordBlood',
 'Thymus',
 'Ureter',
 'LeftLateralBasalBronchopulmonarySegment',
 'RenalPelvis',
 'DefinitiveEndoderm',
 'ParietalEndoderm',
 'AdiposeTissue',
 'MammaryGland',
 'FoveaCentralis',
 'VisceralEndoderm',
 'Blood',
 'IsletOfLangerhans',
 'LaminaPropriaOfMucosaOfColon',
 'BoneTissue',
 'CortexOfKidney',
 'RenalMedulla',
 'Brain',
 'Pancreas',
 'LowerLobeOfRightLung',
 'CaudateLobe',
 'Liver',
 'Spleen',
 'Epidermis',
 'Cortex',
 'Decidua',
 'Placenta',
 'LowerLobeOfLeftLung',
 'presumptiveGut',
 'Blastocyst',
 'Skin',
 'Lung',
 'Esophagus',
 'Tongue',
 'Heart',
 'RetinalNeuralLayer',
 'ImmuneSystem',
 'Embryo',
 'EsophagusMucosa',
 'LungParenchyma',
 'LargeIntestine',
 'Tumor',
 'VenousBlood',
 'Colon',
 'MediastinalLymphNode',
 'Kidney',
 'Eye',
 'Endoderm',
 'PeripheralBloodMononuclearCell',
 'Muscle',
 'Retina',
 'Trachea',
 'LymphNode',
 'HematopoieticSystem',
 'YolkSacEndoderm']

In [42]:
# Two-column frame pairing every converted name with its looked-up URI
rows = pd.DataFrame(dict(name=organism_parts, URI=organism_parts_URIs))
rows

Unnamed: 0,name,URI
0,BoneMarrow,http://purl.obolibrary.org/obo/UBERON_0002371
1,Diaphragm,http://purl.obolibrary.org/obo/UBERON_0001103
2,Bladder,http://purl.obolibrary.org/obo/UBERON_0018707
3,UmbilicalCordBlood,http://purl.obolibrary.org/obo/UBERON_0012168
4,Thymus,http://purl.obolibrary.org/obo/UBERON_0002370
5,Ureter,http://purl.obolibrary.org/obo/UBERON_0000056
6,LeftLateralBasalBronchopulmonarySegment,http://purl.obolibrary.org/obo/FMA_7379
7,RenalPelvis,http://purl.obolibrary.org/obo/UBERON_0001224
8,DefinitiveEndoderm,http://purl.obolibrary.org/obo/UBERON_0005439
9,ParietalEndoderm,http://purl.obolibrary.org/obo/UBERON_0008800


In [43]:
# Stack the SCEA and HCA terms, then drop repeated rows and rows with a missing
# value. pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([df, rows], ignore_index=True).drop_duplicates().dropna()
df

Unnamed: 0,name,URI
0,Kidney,http://purl.obolibrary.org/obo/UBERON_0002113
1,Bronchiole,http://purl.obolibrary.org/obo/UBERON_0002186
2,MiddleLobeOfRightLung,http://purl.obolibrary.org/obo/UBERON_0002174
3,Lung,http://purl.obolibrary.org/obo/UBERON_0002048
4,PedalDigit,http://purl.obolibrary.org/obo/UBERON_0006051
...,...,...
289,Muscle,http://purl.obolibrary.org/obo/UBERON_0001630
291,Trachea,http://purl.obolibrary.org/obo/UBERON_0003126
292,LymphNode,http://purl.obolibrary.org/obo/BTO_0000784
293,HematopoieticSystem,http://purl.obolibrary.org/obo/UBERON_0002390


In [44]:
# Export the merged terms as a tab-separated file without header or index column
df.to_csv(
    '../SingleCell-Files/organism_parts_ontology.csv',
    sep='\t',
    header=False,
    index=False,
)