In [1]:
import requests
import pandas as pd
import urllib.parse
import json
import time
import re

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

In [2]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [3]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [4]:
answer = requests.get(seed_url, headers=headers)

In [5]:
ontologies = ['CL', 'EFO', 'UBERON', 'NCIT', 'PO']

In [6]:
def get_terms_from_project(experiment_id, name_column, ontology_column):
    terms = []
    terms_names = []
    
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
            
    if name_column in df.columns:
        project_term_names = df[name_column].unique()
        project_term_URIs = df[ontology_column].unique()
    
        # Balance names and URIs
        project_term_URIs = list(project_term_URIs)
        project_term_URIs += [None] * (len(project_term_names) - len(project_term_URIs))
        
        for name, URI in zip(project_term_names, project_term_URIs):
            # If we have already code the cell_type we skip it
            if name in terms_names:
                continue
            
            print(name)
            
            # If we have no info of URI we search for it
            if URI is None or str(URI) == 'nan':
                for ontology in ontologies:
                    
                    URI = get_URI_from_name(name, ontology)
                    time.sleep(1)
                    
                    if URI is not None:
                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                
                if URI is None:
                    URI = get_URI_from_name(name)
                    time.sleep(1)
                    
                    terms.append({
                        'name': name,
                        'URI': URI
                    })
                    
            # Otherwise   
            else:                    
                terms_names.append(name)

                terms.append({
                    'name': name,
                    'URI': URI
                })
                
            # If there are two or more cell types in the name
            if URI is None:
                new_l = re.split(' and |, ', name)
                for new_name in new_l:
                    
                    for ontology in ontologies:
                        
                        URI = get_URI_from_name(new_name, ontology)
                        time.sleep(1)
                        
                        if URI is not None:

                            terms.append({
                                'name': new_name,
                                'URI': URI
                            })
                    
                    if URI is None:
                        URI = get_URI_from_name(name)
                        time.sleep(1)

                        terms.append({
                            'name': name,
                            'URI': URI
                        })
                        
                    terms_names.append(new_name)
            

    return terms, terms_names

In [7]:
avoid_collections = ["Human Cell Atlas"]

cell_types = []
cell_types_names = []

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    # We skip projects from the avoid collections
    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    experiment_id = experiment['experimentAccession']
    cell_types_pro, cell_types_names_pro = get_terms_from_project(experiment_id=experiment_id, 
                                                                  name_column='Sample Characteristic[cell type]', 
                                                                  ontology_column='Sample Characteristic Ontology Term[cell type]')
    
    cell_types += cell_types_pro
    cell_types_names += cell_types_names_pro
    
    clear_output(wait=True)

df_SCEA = pd.DataFrame(cell_types)

181/181


In [8]:
df_SCEA = df_SCEA.drop_duplicates()

In [9]:
df_SCEA[df_SCEA.isnull().any(axis=1)].name.tolist()

['olfactory projection neuron innvervating DC2 glomerulus',
 'olfactory projection neuron innervating VM2 glomerulus',
 'multi-lymphoid progenitor',
 'hematopoietic stem cell and thrombocyte',
 'neutrophil and myeloid cell',
 'marrow-derived B cell',
 'neuronal, glial and vascular cells',
 'Un-cryopreserved peripheral blood mononuclear cells (PBMCs)',
 'hematopoietic stem cell and hematopoietic multipotent progenitor cell',
 'megakaryocyte-erythroid progenitor cell, common myeloid progenitor and granulocyte monocyte progenitor cell',
 'dormant hematopoietic stem cell',
 'mixed cell types',
 'cardiac non-myocyte',
 'induced neural border stem cell',
 'embryonic neural border stem cell',
 'extra thymic aire-expressing cells',
 'mix of stromal fibroblasts and epithelial tumour cells',
 'cardiac non-myocyte and cardiomyocyte']

In [10]:
df_SCEA[df_SCEA['name'] == 'Neuron']

Unnamed: 0,name,URI


In [11]:
df_SCEA['name'].tolist()

['epithelial cell',
 'not applicable',
 'protoplast',
 'hemocyte',
 'bone marrow cell',
 'stem cell',
 'myoepithelial cell of mammary gland',
 'luminal epithelial cell of mammary gland',
 'plant protoplast',
 'neuron',
 'olfactory projection neuron',
 'olfactory projection neuron innervating DA1, VA1d or DC3 glomerulus',
 'astrocyte',
 'astrocyte',
 'olfactory projection neuron innvervating DC2 glomerulus',
 'olfactory projection neuron innervating VM2 glomerulus',
 'long term hematopoietic stem cell',
 'hematopoietic multipotent progenitor cell',
 'short term hematopoietic stem cell',
 'granulocyte macrophage progenitor',
 'lymphoid-primed multipotent progenitor',
 'multi-lymphoid progenitor',
 'hematopoietic stem cell and thrombocyte',
 'hematopoietic stem cell',
 'hematopoietic stem cell',
 'thrombocyte',
 'thrombocyte',
 'thrombocyte',
 'thymic T cell',
 'mature T cell',
 'neutrophil and myeloid cell',
 'neutrophil',
 'neutrophil',
 'myeloid cell',
 'myeloid cell',
 'marrow-derived

In [12]:
from OntologyConversorSCAE import OntologyConversorSCAE

conversor = OntologyConversorSCAE()

In [13]:
df = df_SCEA

df['name'] = df['name'].apply(conversor.parse_word)
df

Unnamed: 0,name,URI
0,EpithelialCell,http://purl.obolibrary.org/obo/CL_0000066
1,NotApplicable,http://purl.obolibrary.org/obo/NCIT_C48660
2,Protoplast,http://purl.obolibrary.org/obo/CL_0000371
4,Hemocyte,http://purl.obolibrary.org/obo/CL_0000387
6,BoneMarrowCell,http://purl.obolibrary.org/obo/CL_0002092
...,...,...
243,Monocyte,http://purl.obolibrary.org/obo/CL_0000860
254,"[Fibroblast, EpithelialTumorCell]",
258,"[CardiacNonMyocyte, Cardiomyocyte]",
260,Cardiomyocyte,http://purl.obolibrary.org/obo/CL_0000746


# HCA cell types

In [14]:
seed_url = "https://service.azul.data.humancellatlas.org/index/projects?size=999&catalog=dcp1"

In [15]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [16]:
answer = requests.get(seed_url, headers=headers)
project_hits = answer.json()["hits"]

In [17]:
n_projects = len(project_hits)

cell_types = []

for n, hit in enumerate(project_hits):
    entry_id = hit["entryId"]
    seed_url = "https://service.explore.data.humancellatlas.org/repository/projects/" + entry_id
    
    print(n)
    
    for item in hit['cellSuspensions']:
        cell_types += item['selectedCellType']
        
    clear_output(wait=True)
    
cell_types = list(set(cell_types))

27


In [18]:
cell_types = [x for x in cell_types if x is not None]
cell_types

['kidney cell',
 'bone marrow hematopoietic cell',
 'monocyte',
 'CD31+ endothelial',
 'peripheral blood mononuclear cell',
 'CD45-',
 'natural killer cell',
 'cord blood hematopoietic stem cell',
 'myofibroblast cell',
 'embryonic fibroblast',
 'inhibitory interneuron',
 'pancreatic PP cell',
 'splenocyte',
 'Epcam+',
 'live',
 'endodermal cell',
 'neuron',
 'CD34-positive, CD38-negative hematopoietic stem cell',
 'myeloid cell',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',
 'epithelial cell of esophagus',
 'mononuclear cell',
 'dendritic cell',
 'leukocyte',
 'CAFs',
 'endothelial cell',
 'HLAG+',
 'stromal cell',
 'Plasma cell',
 'CD11b+ Macrophages/monocytes',
 'innate lymphoid cell',
 'epithelial cell',
 'CD8-positive, alpha-beta T cell',
 'neural cell',
 'CD11c+ DC',
 'fibroblast',
 'T cell',
 'B cell',
 'CD4+ T cell',
 'CD11b+CD11c+DC']

In [19]:
cell_types_URIs = list(map(get_URI_from_name, cell_types))
cell_types_URIs

['http://purl.obolibrary.org/obo/CL_1000497',
 'http://purl.obolibrary.org/obo/CL_1001610',
 'http://purl.obolibrary.org/obo/CL_0000576',
 None,
 'http://purl.obolibrary.org/obo/CL_2000001',
 None,
 'http://purl.obolibrary.org/obo/CL_0000623',
 'http://purl.obolibrary.org/obo/CL_2000095',
 'http://purl.obolibrary.org/obo/CL_0000186',
 'http://purl.obolibrary.org/obo/BTO_0004725',
 'http://purl.obolibrary.org/obo/CL_0000498',
 'http://purl.obolibrary.org/obo/CL_0002275',
 'http://purl.obolibrary.org/obo/BTO_0001598',
 'http://purl.obolibrary.org/obo/OMIT_0033324',
 'http://www.ebi.ac.uk/swo/maturity/SWO_9000065',
 'http://purl.obolibrary.org/obo/CL_0000223',
 'http://purl.obolibrary.org/obo/CL_0000540',
 'http://purl.obolibrary.org/obo/CL_0001024',
 'http://purl.obolibrary.org/obo/CL_0000763',
 'http://purl.obolibrary.org/obo/CL_0001062',
 'http://purl.obolibrary.org/obo/CL_0002252',
 'http://purl.obolibrary.org/obo/CL_0000842',
 'http://purl.obolibrary.org/obo/CL_0000451',
 'http://pur

In [20]:
df_HCA = pd.DataFrame({'name': cell_types, 'URI': cell_types_URIs})
df_HCA[df_HCA.isnull().any(axis=1)].name.tolist()

['CD31+ endothelial',
 'CD45-',
 'CD11b+ Macrophages/monocytes',
 'CD11c+ DC',
 'CD4+ T cell',
 'CD11b+CD11c+DC']

In [21]:
len(cell_types)

40

In [22]:
from OntologyConversorHCA import OntologyConversorHCA

conversor = OntologyConversorHCA()

In [23]:
cell_types = list(map(conversor.parse_word, cell_types))
cell_types

['KidneyCell',
 'BoneMarrowHematopoieticCell',
 'Monocyte',
 'CD31+Endothelial',
 'PeripheralBloodMononuclearCell',
 'CD45-',
 'NaturalKillerCell',
 'CordBloodHematopoieticStemCell',
 'MyofibroblastCell',
 'EmbryonicFibroblast',
 'InhibitoryInterneuron',
 'PancreaticPPCell',
 'Splenocyte',
 'Epcam+',
 'Live',
 'EndodermalCell',
 'Neuron',
 'CD34+CD38-HematopoieticStemCell',
 'MyeloidCell',
 'EffectorMemoryCD8+AlphaBetaTCellTerminallyDifferentiated',
 'EpithelialCellOfEsophagus',
 'MononuclearCell',
 'DendriticCell',
 'Leukocyte',
 'Cancer-associatedFibroblasts(CAFs)',
 'EndothelialCell',
 'HLAG+',
 'StromalCell',
 'PlasmaCell',
 'CD11b+Macrophages/Monocytes',
 'InnateLymphoidCell',
 'EpithelialCell',
 'CD8+AlphaBetaTcell',
 'NeuralCell',
 'CD11c+DC',
 'Fibroblast',
 'Tcell',
 'Bcell',
 'Tcell',
 'CD11b+CD11c+DC']

In [24]:
rows = pd.DataFrame({'name': cell_types, 'URI': cell_types_URIs})
rows

Unnamed: 0,name,URI
0,KidneyCell,http://purl.obolibrary.org/obo/CL_1000497
1,BoneMarrowHematopoieticCell,http://purl.obolibrary.org/obo/CL_1001610
2,Monocyte,http://purl.obolibrary.org/obo/CL_0000576
3,CD31+Endothelial,
4,PeripheralBloodMononuclearCell,http://purl.obolibrary.org/obo/CL_2000001
5,CD45-,
6,NaturalKillerCell,http://purl.obolibrary.org/obo/CL_0000623
7,CordBloodHematopoieticStemCell,http://purl.obolibrary.org/obo/CL_2000095
8,MyofibroblastCell,http://purl.obolibrary.org/obo/CL_0000186
9,EmbryonicFibroblast,http://purl.obolibrary.org/obo/BTO_0004725


In [27]:
df = df.append(rows, ignore_index=True)
df

Unnamed: 0,name,URI
0,EpithelialCell,http://purl.obolibrary.org/obo/CL_0000066
1,NotApplicable,http://purl.obolibrary.org/obo/NCIT_C48660
2,Protoplast,http://purl.obolibrary.org/obo/CL_0000371
3,Hemocyte,http://purl.obolibrary.org/obo/CL_0000387
4,BoneMarrowCell,http://purl.obolibrary.org/obo/CL_0002092
...,...,...
168,Fibroblast,http://purl.obolibrary.org/obo/CL_0000057
169,Tcell,http://purl.obolibrary.org/obo/CL_0000084
170,Bcell,http://purl.obolibrary.org/obo/CL_0000236
171,Tcell,


In [28]:
df.to_csv('../SingleCell-Files/cell_types_ontology.csv', index=False, header=False, sep='\t')

# Test

In [None]:
experiment_id = "E-MTAB-8810"
experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="

df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
df

In [None]:
if 'Sample Characteristic[cell type]' in df.columns:
    cell_type_names = df['Sample Characteristic[cell type]'].unique()
    cell_type_URIs = df['Sample Characteristic Ontology Term[cell type]'].unique()

    new_l = []
    
    for x in cell_type_names:
        if " and " in x:
            print(x.split(" and "))
            new_l += x.split(" and ")
        else:
            new_l += [x]

    cell_type_names = new_l
    
    cell_type_URIs = list(cell_type_URIs)
    cell_type_URIs += [None] * (len(cell_type_names) - len(cell_type_URIs))
    
    for name, URI in zip(cell_type_names, cell_type_URIs):
        
        if URI is None or str(URI) == 'nan':
            URI = get_URI_from_name(name)
        print(name, URI)
        

In [None]:
get_URI_from_name('neoplastic cell')

In [None]:
l = ['cardiac non-myocyte and cardiomyocyte']
new_l = []
for x in l:
    if " and " in x:
        new_l += x.split(" and ")

new_l

In [None]:
import re

string = 'megakaryocyte-erythroid progenitor cell'
re.split(' and |, ', string)

In [None]:
get_terms_from_project('E-GEOD-100618', name_column='Sample Characteristic[cell type]', 
                                                                  ontology_column='Sample Characteristic Ontology Term[cell type]')