In [1]:
import requests
import pandas as pd
import urllib.parse
import json
import time
import re

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

In [2]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [3]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [4]:
answer = requests.get(seed_url, headers=headers)

In [5]:
ontologies = ['CL', 'EFO', 'UBERON', 'NCIT', 'PO']

In [6]:
def get_terms_from_project(experiment_id, name_column, ontology_column):
    terms = []
    terms_names = []
    
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
            
    if name_column in df.columns:
        project_term_names = df[name_column].unique()
        project_term_URIs = df[ontology_column].unique()
    
        # Balance names and URIs
        project_term_URIs = list(project_term_URIs)
        project_term_URIs += [None] * (len(project_term_names) - len(project_term_URIs))
        
        for name, URI in zip(project_term_names, project_term_URIs):
            # If we have already code the cell_type we skip it
            if name in terms_names:
                continue
            
            print(name)
            
            # If we have no info of URI we search for it
            if URI is None or str(URI) == 'nan':
                for ontology in ontologies:
                    
                    URI = get_URI_from_name(name, ontology)
                    time.sleep(1)
                    
                    if URI is not None:
                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                
                if URI is None:
                    URI = get_URI_from_name(name)
                    time.sleep(1)
                    
                    terms.append({
                        'name': name,
                        'URI': URI
                    })
                    
            # Otherwise   
            else:                    
                terms_names.append(name)

                terms.append({
                    'name': name,
                    'URI': URI
                })
                
            # If there are two or more cell types in the name
            if URI is None:
                new_l = re.split(' and |, ', name)
                for new_name in new_l:
                    
                    for ontology in ontologies:
                        
                        URI = get_URI_from_name(new_name, ontology)
                        time.sleep(1)
                        
                        if URI is not None:

                            terms.append({
                                'name': new_name,
                                'URI': URI
                            })
                    
                    if URI is None:
                        URI = get_URI_from_name(name)
                        time.sleep(1)

                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                        
                    terms_names.append(new_name)
            

    return terms, terms_names

In [7]:
avoid_collections = ["Human Cell Atlas"]

cell_types = []
cell_types_names = []

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    # We skip projects from the avoid collections
    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    cell_types_pro, cell_types_names_pro = get_terms_from_project(experiment_id=experiment_id, 
                                                                  name_column='Sample Characteristic[cell type]', 
                                                                  ontology_column='Sample Characteristic Ontology Term[cell type]')
    
    cell_types += cell_types_pro
    cell_types_names += cell_types_names_pro
    
    clear_output(wait=True)

df_SCEA = pd.DataFrame(cell_types)

181/181


In [10]:
df_SCEA = df_SCEA.drop_duplicates()

In [11]:
df_SCEA[df_SCEA.isnull().any(axis=1)].name.tolist()

['olfactory projection neuron innvervating DC2 glomerulus',
 'olfactory projection neuron innervating VM2 glomerulus',
 'multi-lymphoid progenitor',
 'hematopoietic stem cell and thrombocyte',
 'neutrophil and myeloid cell',
 'marrow-derived B cell',
 'neuronal, glial and vascular cells',
 'Un-cryopreserved peripheral blood mononuclear cells (PBMCs)',
 'hematopoietic stem cell and hematopoietic multipotent progenitor cell',
 'megakaryocyte-erythroid progenitor cell, common myeloid progenitor and granulocyte monocyte progenitor cell',
 'dormant hematopoietic stem cell',
 'mixed cell types',
 'cardiac non-myocyte',
 'induced neural border stem cell',
 'embryonic neural border stem cell',
 'extra thymic aire-expressing cells',
 'mix of stromal fibroblasts and epithelial tumour cells',
 'cardiac non-myocyte and cardiomyocyte']

In [12]:
df_SCEA[df_SCEA['name'] == 'common lymphoid progenitor']

Unnamed: 0,name,URI


In [13]:
df_SCEA['name'].tolist()

['epithelial cell',
 'not applicable',
 'protoplast',
 'hemocyte',
 'bone marrow cell',
 'stem cell',
 'myoepithelial cell of mammary gland',
 'luminal epithelial cell of mammary gland',
 'plant protoplast',
 'neuron',
 'olfactory projection neuron',
 'olfactory projection neuron innervating DA1, VA1d or DC3 glomerulus',
 'astrocyte',
 'astrocyte',
 'olfactory projection neuron innvervating DC2 glomerulus',
 'olfactory projection neuron innervating VM2 glomerulus',
 'long term hematopoietic stem cell',
 'hematopoietic multipotent progenitor cell',
 'short term hematopoietic stem cell',
 'granulocyte macrophage progenitor',
 'lymphoid-primed multipotent progenitor',
 'multi-lymphoid progenitor',
 'hematopoietic stem cell and thrombocyte',
 'hematopoietic stem cell',
 'hematopoietic stem cell',
 'thrombocyte',
 'thrombocyte',
 'thrombocyte',
 'thymic T cell',
 'mature T cell',
 'neutrophil and myeloid cell',
 'neutrophil',
 'neutrophil',
 'myeloid cell',
 'myeloid cell',
 'marrow-derived

In [14]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor = OntologyConversorSCAE()

In [15]:
df = df_SCEA

df['name'] = df['name'].apply(conversor.parse_word)
df

Unnamed: 0,name,URI
0,EpithelialCell,http://purl.obolibrary.org/obo/CL_0000066
1,NotApplicable,http://purl.obolibrary.org/obo/NCIT_C48660
2,Protoplast,http://purl.obolibrary.org/obo/CL_0000371
4,Hemocyte,http://purl.obolibrary.org/obo/CL_0000387
6,BoneMarrowCell,http://purl.obolibrary.org/obo/CL_0002092
...,...,...
243,Monocyte,http://purl.obolibrary.org/obo/CL_0000860
254,"[Fibroblast, EpithelialTumorCell]",
258,"[CardiacNonMyocyte, Cardiomyocyte]",
260,Cardiomyocyte,http://purl.obolibrary.org/obo/CL_0000746


# HCA cell types

In [16]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [17]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [18]:
answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [19]:
n_projects = len(project_hits)

cell_types = []

for n, hit in enumerate(project_hits):
    entry_id = hit["entryId"]
    seed_url = "https://service.explore.data.humancellatlas.org/repository/projects/" + entry_id
    
    print(n)
    
    for item in hit['cellSuspensions']:
        cell_types += item['selectedCellType']
        
    clear_output(wait=True)
    
cell_types = list(set(cell_types))

27


In [20]:
cell_types = [x for x in cell_types if x is not None]
cell_types

['CD31+ endothelial',
 'dendritic cell',
 'T cell',
 'Epcam+',
 'CD11b+ Macrophages/monocytes',
 'inhibitory interneuron',
 'myofibroblast cell',
 'stromal cell',
 'B cell',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',
 'monocyte',
 'endodermal cell',
 'Plasma cell',
 'neuron',
 'CD4+ T cell',
 'myeloid cell',
 'peripheral blood mononuclear cell',
 'CD11c+ DC',
 'leukocyte',
 'fibroblast',
 'epithelial cell of esophagus',
 'neural cell',
 'CD8-positive, alpha-beta T cell',
 'embryonic fibroblast',
 'endothelial cell',
 'splenocyte',
 'pancreatic PP cell',
 'CD11b+CD11c+DC',
 'CD45-',
 'CAFs',
 'cord blood hematopoietic stem cell',
 'HLAG+',
 'epithelial cell',
 'bone marrow hematopoietic cell',
 'natural killer cell',
 'kidney cell',
 'CD34-positive, CD38-negative hematopoietic stem cell',
 'innate lymphoid cell',
 'live',
 'mononuclear cell']

In [21]:
cell_types_URIs = list(map(get_URI_from_name, cell_types))
cell_types_URIs

[None,
 'http://purl.obolibrary.org/obo/CL_0000451',
 'http://purl.obolibrary.org/obo/CL_0000084',
 'http://purl.obolibrary.org/obo/OMIT_0033324',
 None,
 'http://purl.obolibrary.org/obo/CL_0000498',
 'http://purl.obolibrary.org/obo/CL_0000186',
 'http://purl.obolibrary.org/obo/TAO_0009226',
 'http://purl.obolibrary.org/obo/CL_0000236',
 'http://purl.obolibrary.org/obo/CL_0001062',
 'http://purl.obolibrary.org/obo/CL_0000576',
 'http://purl.obolibrary.org/obo/CL_0000223',
 'http://purl.obolibrary.org/obo/CL_0000786',
 'http://purl.obolibrary.org/obo/CL_0000540',
 None,
 'http://purl.obolibrary.org/obo/CL_0000763',
 'http://purl.obolibrary.org/obo/CL_2000001',
 None,
 'http://purl.obolibrary.org/obo/CL_0000738',
 'http://purl.obolibrary.org/obo/CL_0000057',
 'http://purl.obolibrary.org/obo/CL_0002252',
 'http://purl.obolibrary.org/obo/CL_0002319',
 'http://purl.obolibrary.org/obo/CL_0000625',
 'http://purl.obolibrary.org/obo/BTO_0004725',
 'http://purl.obolibrary.org/obo/CL_0000115',
 '

In [22]:
df_HCA = pd.DataFrame({'name': cell_types, 'URI': cell_types_URIs})
df_HCA[df_HCA.isnull().any(axis=1)].name.tolist()

['CD31+ endothelial',
 'CD11b+ Macrophages/monocytes',
 'CD4+ T cell',
 'CD11c+ DC',
 'CD11b+CD11c+DC',
 'CD45-']

In [23]:
len(cell_types)

40

In [24]:
from OntologyConversorHCA import OntologyConversorHCA

conversor = OntologyConversorHCA()

In [25]:
cell_types = list(map(conversor.parse_word, cell_types))
cell_types

['CD31+Endothelial',
 'DendriticCell',
 'Tcell',
 'Epcam+',
 'CD11b+Macrophages/Monocytes',
 'InhibitoryInterneuron',
 'MyofibroblastCell',
 'StromalCell',
 'Bcell',
 'EffectorMemoryCD8+AlphaBetaTCellTerminallyDifferentiated',
 'Monocyte',
 'EndodermalCell',
 'PlasmaCell',
 'Neuron',
 'Tcell',
 'MyeloidCell',
 'PeripheralBloodMononuclearCell',
 'CD11c+DC',
 'Leukocyte',
 'Fibroblast',
 'EpithelialCellOfEsophagus',
 'NeuralCell',
 'CD8+AlphaBetaTcell',
 'EmbryonicFibroblast',
 'EndothelialCell',
 'Splenocyte',
 'PancreaticPPCell',
 'CD11b+CD11c+DC',
 'CD45-',
 'Cancer-associatedFibroblasts(CAFs)',
 'CordBloodHematopoieticStemCell',
 'HLAG+',
 'EpithelialCell',
 'BoneMarrowHematopoieticCell',
 'NaturalKillerCell',
 'KidneyCell',
 'CD34+CD38-HematopoieticStemCell',
 'InnateLymphoidCell',
 'Live',
 'MononuclearCell']

In [26]:
rows = pd.DataFrame({'name': cell_types, 'URI': cell_types_URIs})
rows

Unnamed: 0,name,URI
0,CD31+Endothelial,
1,DendriticCell,http://purl.obolibrary.org/obo/CL_0000451
2,Tcell,http://purl.obolibrary.org/obo/CL_0000084
3,Epcam+,http://purl.obolibrary.org/obo/OMIT_0033324
4,CD11b+Macrophages/Monocytes,
5,InhibitoryInterneuron,http://purl.obolibrary.org/obo/CL_0000498
6,MyofibroblastCell,http://purl.obolibrary.org/obo/CL_0000186
7,StromalCell,http://purl.obolibrary.org/obo/TAO_0009226
8,Bcell,http://purl.obolibrary.org/obo/CL_0000236
9,EffectorMemoryCD8+AlphaBetaTCellTerminallyDiff...,http://purl.obolibrary.org/obo/CL_0001062


In [32]:
df = df.append(rows, ignore_index=True).dropna().drop_duplicates()
df

Unnamed: 0,name,URI
0,EpithelialCell,http://purl.obolibrary.org/obo/CL_0000066
1,NotApplicable,http://purl.obolibrary.org/obo/NCIT_C48660
2,Protoplast,http://purl.obolibrary.org/obo/CL_0000371
3,Hemocyte,http://purl.obolibrary.org/obo/CL_0000387
4,BoneMarrowCell,http://purl.obolibrary.org/obo/CL_0002092
...,...,...
144,KidneyCell,http://purl.obolibrary.org/obo/CL_1000497
145,CD34+CD38-HematopoieticStemCell,http://purl.obolibrary.org/obo/CL_0001024
146,InnateLymphoidCell,http://purl.obolibrary.org/obo/CL_0001065
147,Live,http://www.ebi.ac.uk/swo/maturity/SWO_9000065


In [33]:
df.to_csv('../SingleCell-Files/cell_types_ontology.csv', index=False, header=False, sep='\t')

# Test

In [23]:
experiment_id = "E-MTAB-8810"
experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="

df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
df

Unnamed: 0,Assay,Sample Characteristic[organism],Sample Characteristic Ontology Term[organism],Sample Characteristic[strain],Sample Characteristic Ontology Term[strain],Sample Characteristic[age],Sample Characteristic Ontology Term[age],Sample Characteristic[developmental stage],Sample Characteristic Ontology Term[developmental stage],Sample Characteristic[sex],...,Sample Characteristic[genotype],Sample Characteristic Ontology Term[genotype],Sample Characteristic[organism part],Sample Characteristic Ontology Term[organism part],Sample Characteristic[cell type],Sample Characteristic Ontology Term[cell type],Factor Value[compound],Factor Value Ontology Term[compound],Factor Value[dose],Factor Value Ontology Term[dose]
0,ERR3971663-AAACCTGAGAAAGTGG,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
1,ERR3971663-AAACCTGAGCCGGTAA,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
2,ERR3971663-AAACCTGAGGCAGGTT,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
3,ERR3971663-AAACCTGCAATAAGCA,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
4,ERR3971663-AAACCTGCAATAGAGT,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,angiotensin II 1.5 milligram per kilogram per day,http://purl.obolibrary.org/obo/CHEBI_48432,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39681,ERR3971670-TTTGTCAGTTGTCGCG,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,
39682,ERR3971670-TTTGTCATCGAGGTAG,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,
39683,ERR3971670-TTTGTCATCGGTCTAA,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,
39684,ERR3971670-TTTGTCATCTTCCTTC,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,C57BL/6J,http://www.ebi.ac.uk/efo/EFO_0000606,10 week,,adult,http://www.ebi.ac.uk/efo/EFO_0001272,mixed,...,wild type genotype,,heart,http://purl.obolibrary.org/obo/UBERON_0000948,cardiac non-myocyte and cardiomyocyte,,none,http://www.ebi.ac.uk/efo/EFO_0001461,,


In [28]:
if 'Sample Characteristic[cell type]' in df.columns:
    cell_type_names = df['Sample Characteristic[cell type]'].unique()
    cell_type_URIs = df['Sample Characteristic Ontology Term[cell type]'].unique()

    new_l = []
    
    for x in cell_type_names:
        if " and " in x:
            print(x.split(" and "))
            new_l += x.split(" and ")
        else:
            new_l += [x]

    cell_type_names = new_l
    
    cell_type_URIs = list(cell_type_URIs)
    cell_type_URIs += [None] * (len(cell_type_names) - len(cell_type_URIs))
    
    for name, URI in zip(cell_type_names, cell_type_URIs):
        
        if URI is None or str(URI) == 'nan':
            URI = get_URI_from_name(name)
        print(name, URI)
        

['cardiac non-myocyte', 'cardiomyocyte']
cardiac non-myocyte None
cardiomyocyte http://purl.obolibrary.org/obo/OARCS_0000013


In [None]:
get_URI_from_name('neoplastic cell')

In [None]:
l = ['cardiac non-myocyte and cardiomyocyte']
new_l = []
for x in l:
    if " and " in x:
        new_l += x.split(" and ")

new_l

In [28]:
import re

string = 'megakaryocyte-erythroid progenitor cell'
re.split(' and |, ', string)

['megakaryocyte-erythroid progenitor cell']

In [21]:
get_terms_from_project('E-GEOD-100618', name_column='Sample Characteristic[cell type]', 
                                                                  ontology_column='Sample Characteristic Ontology Term[cell type]')

granulocyte macrophage progenitor
lymphoid-primed multipotent progenitor
multi-lymphoid progenitor


([{'name': 'granulocyte macrophage progenitor',
   'URI': 'http://purl.obolibrary.org/obo/CL_0000557'},
  {'name': 'lymphoid-primed multipotent progenitor',
   'URI': 'http://purl.obolibrary.org/obo/CL_0000936'},
  {'name': 'multi-lymphoid progenitor', 'URI': None},
  {'name': 'multi-lymphoid progenitor', 'URI': None}],
 ['granulocyte macrophage progenitor',
  'lymphoid-primed multipotent progenitor',
  'multi-lymphoid progenitor'])