In [1]:
import requests
import pandas as pd
import urllib.parse
import json
import time

from IPython.display import clear_output
from GetTermsOntobee import get_URI_from_name, get_name_from_URI

In [2]:
def get_first(l):
    if len(l) > 0:
        return l[0]
    
    return None

In [3]:
def get_cell_info(cell_URI):
    
    print(cell_URI)
    
    if cell_URI is None or cell_URI == 'http://purl.obolibrary.org/obo/NCIT_C48660':
        return None, None, None, None
    
    query = f'''
    DEFINE sql:describe-mode "CBD"
    DESCRIBE <{cell_URI}>
    FROM <http://purl.obolibrary.org/obo/merged/CL>
    '''
    query_parsed = urllib.parse.quote(query)
    
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    }
    url = f"http://sparql.hegroup.org/sparql/?default-graph-uri=&query={query_parsed}&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on"
    answer = requests.get(url, headers=headers)
    
    time.sleep(2)
    
    columns = answer.json()["head"]["vars"]
    results = answer.json()["results"]

    rows = []

    for result in answer.json()["results"]["bindings"]:
        result_dict = {}
        for header in columns:
            result_dict[header] = result[header]['value']

        rows.append(result_dict)

    df = pd.DataFrame(rows)
    if df.empty:
        return None, None, None, None
    
    df['p'] = df['p'].apply(lambda x: x.split('#')[1] if len(x.split('#')) > 1 else x)
    
    synonym = get_first(df[df['p'] == 'hasExactSynonym']['o'].tolist())
    comment = get_first(df[df['p'] == 'comment']['o'].tolist())
    super_classes = [x for x in df[df['p'] == 'subClassOf']['o'].tolist() if x.startswith('http')]
    asserts = df[df['p'] == 'someValuesFrom']['o'].tolist()
    
    return synonym, comment, super_classes, asserts

In [4]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [5]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [6]:
answer = requests.get(seed_url, headers=headers)

In [11]:
avoid_collections = ["Human Cell Atlas"]

cell_types = []
cell_types_names = []
relations = []

n_experiments = len(answer.json()['experiments'])

for n, experiment in enumerate(answer.json()['experiments']):
    print(f"{n+1}/{n_experiments}")

    experiment_id = experiment['experimentAccession']
    experiment_metadata = f"https://www.ebi.ac.uk/gxa/sc/experiment/{experiment_id}/download?fileType=experiment-design&accessKey="
    
    df = pd.read_csv(experiment_metadata, sep='\t', low_memory=False)
    
    if 'Sample Characteristic[cell type]' in df.columns:
        cell_type_names = df['Sample Characteristic[cell type]'].unique()
        cell_type_URIs = df['Sample Characteristic Ontology Term[cell type]'].unique()
    
    
        for name, URI in zip(cell_type_names, cell_type_URIs):
            if name in cell_types_names:
                continue
            
            if URI is None:
                URI = get_URI_from_name(name, 'CL')
            
            if URI is None:
                continue
            
            synonym, comment, superclasses, asserts = get_cell_info(URI)
            cell_types_names.append(name)
            
            if superclasses:
            
                for superclass in superclasses:
                    cell_types.append({
                        'name': name,
                        'URI': URI,
                        'synonym': synonym,
                        'comment': comment,
                        'superclass': superclass,
                        'asserts': None
                    })

                    superclass_URI = get_URI_from_name(superclass, 'CL')
                    relations.append({
                        'subclass': name,
                        'subclass_URI': URI,
                        'superclass': superclass,
                        'superclass_URI': superclass_URI
                    })
            
            if asserts:

                for a in asserts:
                    cell_types.append({
                        'name': name,
                        'URI': URI,
                        'synonym': synonym,
                        'comment': comment,
                        'superclass': None,
                        'asserts': a
                    })

    time.sleep(2)
    
    clear_output(wait=True)

df_ct = pd.DataFrame(cell_types)
df_relations = pd.DataFrame(relations)

181/181


In [12]:
df_ct

Unnamed: 0,name,URI,synonym,comment,superclass,asserts
0,epithelial cell,http://purl.obolibrary.org/obo/CL_0000066,epitheliocyte,,http://purl.obolibrary.org/obo/CL_0000548,
1,epithelial cell,http://purl.obolibrary.org/obo/CL_0000066,epitheliocyte,,,http://purl.obolibrary.org/obo/UBERON_0000483
2,epithelial cell,http://purl.obolibrary.org/obo/CL_0000066,epitheliocyte,,,http://purl.obolibrary.org/obo/UBERON_0000483
3,protoplast,http://purl.obolibrary.org/obo/CL_0000371,,,http://purl.obolibrary.org/obo/CL_0000578,
4,hemocyte,http://purl.obolibrary.org/obo/CL_0000387,,,http://purl.obolibrary.org/obo/CL_0000519,
...,...,...,...,...,...,...
445,macrophage,http://purl.obolibrary.org/obo/CL_0000235,histiocyte,"Morphology: Diameter 30_M-80 _M, abundant cyto...",,http://purl.obolibrary.org/obo/GO_0031268
446,macrophage,http://purl.obolibrary.org/obo/CL_0000235,histiocyte,"Morphology: Diameter 30_M-80 _M, abundant cyto...",,http://purl.obolibrary.org/obo/CL_0000576
447,macrophage,http://purl.obolibrary.org/obo/CL_0000235,histiocyte,"Morphology: Diameter 30_M-80 _M, abundant cyto...",,http://purl.obolibrary.org/obo/GO_0006909
448,macrophage,http://purl.obolibrary.org/obo/CL_0000235,histiocyte,"Morphology: Diameter 30_M-80 _M, abundant cyto...",,http://purl.obolibrary.org/obo/GO_0031268


In [13]:
df_relations

Unnamed: 0,subclass,subclass_URI,superclass,superclass_URI
0,epithelial cell,http://purl.obolibrary.org/obo/CL_0000066,http://purl.obolibrary.org/obo/CL_0000548,http://purl.obolibrary.org/obo/CL_0000548
1,protoplast,http://purl.obolibrary.org/obo/CL_0000371,http://purl.obolibrary.org/obo/CL_0000578,http://purl.obolibrary.org/obo/CL_0000578
2,hemocyte,http://purl.obolibrary.org/obo/CL_0000387,http://purl.obolibrary.org/obo/CL_0000519,http://purl.obolibrary.org/obo/CL_0000519
3,hemocyte,http://purl.obolibrary.org/obo/CL_0000387,http://purl.obolibrary.org/obo/CL_0000548,http://purl.obolibrary.org/obo/CL_0000548
4,bone marrow cell,http://purl.obolibrary.org/obo/CL_0002092,http://purl.obolibrary.org/obo/CL_0001035,http://purl.obolibrary.org/obo/CL_0001035
...,...,...,...,...
97,adipocyte,http://purl.obolibrary.org/obo/CL_0000136,http://purl.obolibrary.org/obo/CL_0002320,http://purl.obolibrary.org/obo/CL_0002320
98,preadipocyte,http://purl.obolibrary.org/obo/CL_0002334,http://purl.obolibrary.org/obo/CL_0000057,http://purl.obolibrary.org/obo/CL_0000057
99,macrophage,http://purl.obolibrary.org/obo/CL_0000235,http://purl.obolibrary.org/obo/CL_0000145,http://purl.obolibrary.org/obo/CL_0000145
100,macrophage,http://purl.obolibrary.org/obo/CL_0000235,http://purl.obolibrary.org/obo/CL_0000766,http://purl.obolibrary.org/obo/CL_0000766


In [15]:
df_ct.to_csv('../SingleCell-Files/cell_types.csv', index=False)
df_relations.to_csv('../SingleCell-Files/cell_types_relations.csv', index=False)

In [2]:
get_name_from_URI('http://purl.obolibrary.org/obo/CL_0000548', 'CL')

'animal cell'

In [3]:
df_relations = pd.read_csv('../SingleCell-Files/cell_types_relations.csv')

In [4]:
df_relations['superclass'] = df_relations['superclass'].apply(get_name_from_URI)

TypeError: get_name_from_URI() missing 1 required positional argument: 'ontology'