In [1]:
import requests
import pandas as pd
import urllib.parse
import json
import time

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

In [2]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [3]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [4]:
answer = requests.get(seed_url, headers=headers)

In [None]:
def get_terms_from_project(experiment_id, name_column, ontology_column):
    terms = []
    terms_names = []
    
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
            
    if name_column in df.columns:
        project_term_names = df[name_column].unique()
        project_term_URIs = df[ontology_column].unique()
    
        # Balance names and URIs
        project_term_URIs = list(project_term_URIs)
        project_term_URIs += [None] * (len(project_term_names) - len(project_term_URIs))
        
        for name, URI in zip(project_term_names, project_term_URIs):
            # If we have already code the cell_type we skip it
            if name in terms_names:
                continue
            
            print(name)
            
            # If we have no info of URI we search for it
            if URI is None or str(URI) == 'nan':
                URI = get_URI_from_name(name)
                time.sleep(1)
            
            # If there are two cell types in the name
            if URI is None:
                new_l = re.split(' and |, ', name)
                for new_name in new_l:
                    URI = get_URI_from_name(new_name)
                    time.sleep(1)
                    
                    terms_names.append(new_name)

                    terms.append({
                        'name': new_name,
                        'URI': URI
                    })
            # Otherwise   
            else:                    
                terms_names.append(name)

                terms.append({
                    'name': name,
                    'URI': URI
                })
    return terms, terms_names

In [None]:
avoid_collections = ["Human Cell Atlas"]

cell_types = []
cell_types_names = []

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    cell_types_pro, cell_types_names_pro = get_terms_from_project(experiment_id=experiment_id, 
                                                                  name_column='Sample Characteristic[cell type]', 
                                                                  ontology_column='Sample Characteristic Ontology Term[cell type]')
    
    cell_types += cell_types_pro
    cell_types_names += cell_types_names_pro
    
    clear_output(wait=True)

df_SCEA = pd.DataFrame(cell_types)

166/181


In [7]:
df_SCEA[df_SCEA.isnull().any(axis=1)]

Unnamed: 0,name,URI
13,olfactory projection neuron innvervating DC2 g...,
14,olfactory projection neuron innervating VM2 gl...,
20,multi-lymphoid progenitor,
27,marrow-derived B cell,
39,"neuronal, glial",
40,vascular cells,
44,Un-cryopreserved peripheral blood mononuclear ...,
55,circulating tumor cell,
59,"megakaryocyte-erythroid progenitor cell, commo...",
67,dormant hematopoietic stem cell,


In [55]:
df_SCEA[df_SCEA['name'] == 'common lymphoid progenitor']

Unnamed: 0,name,URI


In [9]:
df_SCEA.iloc[59]['name']

'megakaryocyte-erythroid progenitor cell, common myeloid progenitor'

In [38]:
df_SCEA['name'].tolist()

['epithelial cell',
 'not applicable',
 'protoplast',
 'hemocyte',
 'bone marrow cell',
 'stem cell',
 'myoepithelial cell of mammary gland',
 'luminal epithelial cell of mammary gland',
 'plant protoplast',
 'neuron',
 'olfactory projection neuron',
 'olfactory projection neuron innervating DA1, VA1d or DC3 glomerulus',
 'astrocyte',
 'olfactory projection neuron innvervating DC2 glomerulus',
 'olfactory projection neuron innervating VM2 glomerulus',
 'long term hematopoietic stem cell',
 'hematopoietic multipotent progenitor cell',
 'short term hematopoietic stem cell',
 'granulocyte macrophage progenitor',
 'lymphoid-primed multipotent progenitor',
 'multi-lymphoid progenitor',
 'hematopoietic stem cell',
 'thrombocyte',
 'thymic T cell',
 'mature T cell',
 'neutrophil',
 'myeloid cell',
 'marrow-derived B cell',
 'spheroplast',
 'microglial cell',
 'basophil mast progenitor cell',
 'granulocyte monocyte progenitor cell',
 'neural crest cell',
 'embryonic stem cell',
 'plasma cell',

In [9]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor = OntologyConversorSCAE()

In [10]:
df = df_SCEA

df['name'] = df['name'].apply(conversor.parse_word)
df

Unnamed: 0,name,URI
0,EpithelialCell,http://purl.obolibrary.org/obo/CL_0000066
3,Protoplast,http://purl.obolibrary.org/obo/CL_0000371
4,Hemocyte,http://purl.obolibrary.org/obo/CL_0000387
9,BoneMarrowCell,http://purl.obolibrary.org/obo/CL_0002092
12,StemCell,http://purl.obolibrary.org/obo/CL_0000034
...,...,...
494,NeoplasticCell,http://purl.obolibrary.org/obo/CL_0001063
497,CD4+CD25+AlphaBetaRegulatoryTcell,http://purl.obolibrary.org/obo/CL_0000792
504,CD4+AlphaBetaMemoryTCell,http://purl.obolibrary.org/obo/CL_0000897
509,LungMacrophage,http://purl.obolibrary.org/obo/CL_1001603


# HCA cell types

In [11]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [12]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [13]:
answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [14]:
n_projects = len(project_hits)

cell_types = []

for n, hit in enumerate(project_hits):
    entry_id = hit["entryId"]
    seed_url = "https://service.explore.data.humancellatlas.org/repository/projects/" + entry_id
    
    print(n)
    
    for item in hit['cellSuspensions']:
        cell_types += item['selectedCellType']
        
    clear_output(wait=True)
    
cell_types = list(set(cell_types))

27


In [15]:
cell_types = [x for x in cell_types if x is not None]
cell_types

In [16]:
cell_types_URIs = list(map(get_URI_from_name, cell_types))
cell_types_URIs

In [17]:
from OntologyConversorHCA import OntologyConversorHCA

conversor = OntologyConversorHCA()

In [18]:
cell_types = list(map(conversor.parse_word, cell_types))
cell_types

['BoneMarrowHematopoieticCell',
 'Leukocyte',
 'Fibroblast',
 'CD8+AlphaBetaTcell',
 'NaturalKillerCell',
 'DendriticCell',
 'EndodermalCell',
 'MyofibroblastCell',
 'EpithelialCellOfEsophagus',
 'CD11b+CD11c+DC',
 'PlasmaCell',
 'CD45-',
 'Epcam+',
 'CD11c+DC',
 'CD34+CD38-HematopoieticStemCell',
 'Bcell',
 'InhibitoryInterneuron',
 'MyeloidCell',
 'PeripheralBloodMononuclearCell',
 'KidneyCell',
 'CD11b+Macrophages/Monocytes',
 'CordBloodHematopoieticStemCell',
 'Live',
 'Tcell',
 'Monocyte',
 'Tcell',
 'Splenocyte',
 'EmbryonicFibroblast',
 'InnateLymphoidCell',
 'Neuron',
 'HLAG+',
 'Cancer-associatedFibroblasts(CAFs)',
 'EpithelialCell',
 'EndothelialCell',
 'StromalCell',
 'EffectorMemoryCD8+AlphaBetaTCellTerminallyDifferentiated',
 'MononuclearCell',
 'PancreaticPPCell',
 'NeuralCell',
 'CD31+Endothelial']

In [None]:
rows = pd.DataFrame({'name': cell_types, 'URI': cell_types_URIs})
rows

In [19]:
df = df.append(rows, ignore_index=True).drop_duplicates().dropna()
df

Unnamed: 0,name,URI
0,EpithelialCell,http://purl.obolibrary.org/obo/CL_0000066
1,Protoplast,http://purl.obolibrary.org/obo/CL_0000371
2,Hemocyte,http://purl.obolibrary.org/obo/CL_0000387
3,BoneMarrowCell,http://purl.obolibrary.org/obo/CL_0002092
4,StemCell,http://purl.obolibrary.org/obo/CL_0000034
...,...,...
112,StromalCell,http://purl.obolibrary.org/obo/TAO_0009226
113,EffectorMemoryCD8+AlphaBetaTCellTerminallyDiff...,http://purl.obolibrary.org/obo/CL_0001062
114,MononuclearCell,http://purl.obolibrary.org/obo/CL_0000842
115,PancreaticPPCell,http://purl.obolibrary.org/obo/CL_0002275


In [20]:
df.to_csv('../SingleCell-Files/cell_types_ontology.csv', index=False, header=False, sep='\t')

# Test

In [23]:
experiment_id = "E-MTAB-8810"
experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
df

Unnamed: 0,Assay,Sample Characteristic[organism],Sample Characteristic Ontology Term[organism],Sample Characteristic[strain],Sample Characteristic Ontology Term[strain],Sample Characteristic[age],Sample Characteristic Ontology Term[age],Sample Characteristic[developmental stage],Sample Characteristic Ontology Term[developmental stage],Sample Characteristic[sex],...,Sample Characteristic[genotype],Sample Characteristic Ontology Term[genotype],Sample Characteristic[organism part],Sample Characteristic Ontology Term[organism part],Sample Characteristic[cell type],Sample Characteristic Ontology Term[cell type],Factor Value[compound],Factor Value Ontology Term[compound],Factor Value[dose],Factor Value Ontology Term[dose]
0,ERR3971663-AAACCTGAGAAAGTGG,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
1,ERR3971663-AAACCTGAGCCGGTAA,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
2,ERR3971663-AAACCTGAGGCAGGTT,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
3,ERR3971663-AAACCTGCAATAAGCA,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
4,ERR3971663-AAACCTGCAATAGAGT,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39681,ERR3971670-TTTGTCAGTTGTCGCG,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,
39682,ERR3971670-TTTGTCATCGAGGTAG,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,
39683,ERR3971670-TTTGTCATCGGTCTAA,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,
39684,ERR3971670-TTTGTCATCTTCCTTC,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,


In [28]:
if 'Sample Characteristic[cell type]' in df.columns:
    cell_type_names = df['Sample Characteristic[cell type]'].unique()
    cell_type_URIs = df['Sample Characteristic Ontology Term[cell type]'].unique()

    new_l = []
    
    for x in cell_type_names:
        if " and " in x:
            print(x.split(" and "))
            new_l += x.split(" and ")
        else:
            new_l += [x]

    cell_type_names = new_l
    
    cell_type_URIs = list(cell_type_URIs)
    cell_type_URIs += [None] * (len(cell_type_names) - len(cell_type_URIs))
    
    for name, URI in zip(cell_type_names, cell_type_URIs):
        
        if URI is None or str(URI) == 'nan':
            URI = get_URI_from_name(name)
        print(name, URI)
        

['cardiac non-myocyte', 'cardiomyocyte']
cardiac non-myocyte None
cardiomyocyte http://purl.obolibrary.org/obo/OARCS_0000013


In [None]:
get_URI_from_name('neoplastic cell')

In [None]:
l = ['cardiac non-myocyte and cardiomyocyte']
new_l = []
for x in l:
    if " and " in x:
        new_l += x.split(" and ")

new_l

In [28]:
import re

string = 'megakaryocyte-erythroid progenitor cell'
re.split(' and |, ', string)

['megakaryocyte-erythroid progenitor cell']

In [30]:
get_cell_types_from_project('E-GEOD-81682')

Unnamed: 0,Assay,Sample Characteristic[organism],Sample Characteristic Ontology Term[organism],Sample Characteristic[strain],Sample Characteristic Ontology Term[strain],Sample Characteristic[sex],Sample Characteristic Ontology Term[sex],Sample Characteristic[age],Sample Characteristic Ontology Term[age],Sample Characteristic[organism part],Sample Characteristic Ontology Term[organism part],Sample Characteristic[cell type],Sample Characteristic Ontology Term[cell type],Sample Characteristic[facs marker],Sample Characteristic Ontology Term[facs marker],Factor Value[single cell identifier],Factor Value Ontology Term[single cell identifier],Factor Value[cell type],Factor Value Ontology Term[cell type]
0,HSPC_001,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,hematopoietic stem cell and hematopoietic mult...,,Lin- Sca1+ c-Kit+,,HSPC_001,,hematopoietic stem cell and hematopoietic mult...,
1,HSPC_002,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,hematopoietic stem cell and hematopoietic mult...,,Lin- Sca1+ c-Kit+,,HSPC_002,,hematopoietic stem cell and hematopoietic mult...,
2,HSPC_003,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,hematopoietic stem cell and hematopoietic mult...,,Lin- Sca1+ c-Kit+,,HSPC_003,,hematopoietic stem cell and hematopoietic mult...,
3,HSPC_004,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,hematopoietic stem cell and hematopoietic mult...,,Lin- Sca1+ c-Kit+,,HSPC_004,,hematopoietic stem cell and hematopoietic mult...,
4,HSPC_005,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,hematopoietic stem cell and hematopoietic mult...,,Lin- Sca1+ c-Kit+,,HSPC_005,,hematopoietic stem cell and hematopoietic mult...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,Prog_848,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,"megakaryocyte-erythroid progenitor cell, commo...",,Lin- Sca1- c-Kit+,,Prog_848,,"megakaryocyte-erythroid progenitor cell, commo...",
1916,Prog_849,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,"megakaryocyte-erythroid progenitor cell, commo...",,Lin- Sca1- c-Kit+,,Prog_849,,"megakaryocyte-erythroid progenitor cell, commo...",
1917,Prog_850,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,"megakaryocyte-erythroid progenitor cell, commo...",,Lin- Sca1- c-Kit+,,Prog_850,,"megakaryocyte-erythroid progenitor cell, commo...",
1918,Prog_851,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6,http://www.ebi.ac.uk/efo/EFO_0004472,female,http://purl.obolibrary.org/obo/PATO_0000383,8 to 12 week,,bone marrow,http://purl.obolibrary.org/obo/UBERON_0002371,"megakaryocyte-erythroid progenitor cell, commo...",,Lin- Sca1- c-Kit+,,Prog_851,,"megakaryocyte-erythroid progenitor cell, commo...",


hematopoietic stem cell and hematopoietic multipotent progenitor cell
long term hematopoietic stem cell
megakaryocyte-erythroid progenitor cell, common myeloid progenitor and granulocyte monocyte progenitor cell


[{'name': 'hematopoietic stem cell',
  'URI': 'http://purl.obolibrary.org/obo/CL_0000037'},
 {'name': 'hematopoietic multipotent progenitor cell',
  'URI': 'http://purl.obolibrary.org/obo/CL_0000837'},
 {'name': 'long term hematopoietic stem cell',
  'URI': 'http://purl.obolibrary.org/obo/CL_0002034'},
 {'name': 'megakaryocyte-erythroid progenitor cell',
  'URI': 'http://purl.obolibrary.org/obo/CL_0000050'},
 {'name': 'common myeloid progenitor',
  'URI': 'http://purl.obolibrary.org/obo/CL_0000049'},
 {'name': 'granulocyte monocyte progenitor cell',
  'URI': 'http://purl.obolibrary.org/obo/CL_0000557'}]

In [24]:
', ' in 'megakaryocyte-erythroid progenitor cell, common myeloid progenitor'

True