In [1]:
import pandas as pd
import re
import yaml

In [2]:
# Read the ABC metadata tables used by the rest of the notebook into dataframes.
dpath = '../data/WMB_taxonomy_20230830/'
cluster_annotation_term, cluster_annotation_term_set = (
    pd.read_csv(dpath + csv_name, sep=',')
    for csv_name in ('cluster_annotation_term.csv', 'cluster_annotation_term_set.csv')
)
# Not loaded at present:
#cluster = pd.read_csv(dpath + 'cluster.csv', sep=',')
#cluster_to_cluster_annotation_membership = pd.read_csv(dpath + 'cluster_to_cluster_annotation_membership.csv', sep=',')

In [3]:
# Peek at the first two rows of the term table.
cluster_annotation_term.head(2)

Unnamed: 0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet
0,CS20230722_NEUR_Glut,Glut,CCN20230722_NEUR,,,0,0,neurotransmitter,#2B93DF
1,CS20230722_NEUR_NA,,CCN20230722_NEUR,,,0,1,neurotransmitter,#666666


In [4]:
# Peek at the first two rows of the term-set table.
cluster_annotation_term_set.head(2)

Unnamed: 0,label,name,description,order
0,CCN20230722_NEUR,neurotransmitter,Clusters are assigned based on the average exp...,0
1,CCN20230722_CLAS,class,The top level of cell type definition in the m...,1


In [5]:
# Dataframes are converted to nested dicts so rows can be fetched quickly by key.

def df2dict(df: pd.DataFrame, key_column: str):
    """Convert ``df`` into a ``{key: {column: value}}`` lookup.

    ``key_column`` becomes the outer dict key; every other column becomes an
    entry of the inner dict for that row.

    Missing values (NaN/None) are replaced by the empty string, which is
    friendlier for lookups and truthiness checks but could cause trouble in
    columns that are meant to be numeric.
    """
    keyed = df.set_index(key_column)
    without_nan = keyed.fillna('')
    return without_nan.to_dict(orient='index')

# Both tables are keyed on their 'label' column.
cluster_annotation_term_dict, cluster_annotation_term_set_dict = (
    df2dict(frame, 'label')
    for frame in (cluster_annotation_term, cluster_annotation_term_set)
)

In [6]:
# Old taxonomy - used as the source of accessions for now.
# NOTE(review): the original note went on "These accessions will match" and was
# cut off mid-sentence; what they are expected to match is not recorded here.
old_taxonomy = pd.read_csv(
    '../data/BDSO_taxonomy_templates/nomenclature_table_CS202212150_annotated.csv',
    sep=',',
)
old_taxonomy.head(2)

Unnamed: 0,cell_set_accession,original_label,cell_set_label,cell_set_preferred_alias,cell_set_aligned_alias,cell_set_additional_aliases,cell_set_structure,cell_set_ontology_tag,cell_set_alias_assignee,cell_set_alias_citation,taxonomy_id,child_cell_set_accessions,NT,MBA,projection,layer,CL
0,CS202212150_1,,WMB 00001,0001 Car3 Glut_1,,,Brain,UBERON:0000955,Zizhen Yao,,CCN202212150,,GO:0061535,,,,
1,CS202212150_10,,WMB 00010,0010 IT EP-CLA Glut_1,,,Brain,UBERON:0000955,Zizhen Yao,,CCN202212150,,GO:0061535,http://purl.obolibrary.org/obo/MBA_942 | http:...,PATO:0070034,,


In [47]:
import requests
# Download the CL ontology as JSON from the OBO PURL.
# A timeout is set because requests otherwise waits indefinitely on a stalled
# connection, and raise_for_status() makes an HTTP error fail here rather than
# surfacing later as a confusing JSON parsing error.
clget = requests.get("http://purl.obolibrary.org/obo/cl/cl.json", timeout=60)
clget.raise_for_status()
clget

<Response [200]>

In [95]:
import json

# Parse the downloaded ontology and keep the first graph it contains.
cl = json.loads(clget.content)['graphs'][0]

# Lookup from node ID to label. The OBO IRI prefix is rewritten to the "CL:" CURIE
# form; IDs that do not contain that prefix are kept as they are. Nodes without an
# 'lbl' entry are left out.
cl_lookup = dict(
    (node['id'].replace("http://purl.obolibrary.org/obo/CL_", "CL:"), node['lbl'])
    for node in cl['nodes']
    if 'lbl' in node
)


In [7]:
# Old taxonomy rows, looked up by their preferred alias (the cell set name).
old_taxonomy_dict = df2dict(df=old_taxonomy, key_column='cell_set_preferred_alias')

In [61]:
# old_taxonomy carries CL mappings we can reuse; show only the rows that have one.
old_taxonomy.loc[:, ['cell_set_accession', 'cell_set_preferred_alias', 'CL']].dropna()

Unnamed: 0,cell_set_accession,cell_set_preferred_alias,CL
4414,CS202212150_4972,4972 CB Granule Glut_1,CL:0000120
4415,CS202212150_4973,4973 CB Granule Glut_1,CL:0000120
4416,CS202212150_4974,4974 CB Granule Glut_1,CL:0000120
4417,CS202212150_4975,4975 CB Granule Glut_2,CL:0000120
4418,CS202212150_4976,4976 CB Granule Glut_2,CL:0000120
...,...,...,...
5972,CS202212150_6374,Pallium glutamatergic,CL:0000679
5973,CS202212150_6375,Subpallium GABAergic,CL:0000617
5974,CS202212150_6376,PAL-sAMY-TH-HY-MB-HB neuronal,CL:0000540
5975,CS202212150_6377,CBX-MOB-other neuronal,CL:0000540


In [8]:
# A second source of CL mappings: the taxonomy details YAML.
with open('../data/BDSO_taxonomy_templates/taxonomy_details.yaml', 'r') as taxd_file:
    # safe_load accepts the open file object directly.
    taxd = yaml.safe_load(taxd_file)

In [10]:
# Node accession -> CL term ID, from the 'Root_nodes' list of the first YAML entry.
root_node_CL_mappings = dict(
    (root_node['Node'], root_node['Cell_type'])
    for root_node in taxd[0]['Root_nodes']
)

In [9]:
# First two root-node records, to show their structure.
taxd[0]['Root_nodes'][:2]

[{'Node': 'CS202212150_6347',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'},
 {'Node': 'CS202212150_6348',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'}]

In [44]:
# CL terms present in both sources (root-node map and old taxonomy)
set(root_node_CL_mappings.values()) & set(old_taxonomy['CL'].dropna())

{'CL:0000125', 'CL:0000617', 'CL:0000679'}

In [45]:
# CL terms that appear only in the root-node map
set(root_node_CL_mappings.values()) - set(old_taxonomy['CL'].dropna())

{'CL:0000700',
 'CL:0000738',
 'CL:0000850',
 'CL:4023063',
 'CL:4023064',
 'CL:4023072'}

In [62]:
# Number of CL terms that appear only in the old taxonomy
len(set(old_taxonomy['CL'].dropna()) - set(root_node_CL_mappings.values()))

17

In [96]:
# Combine the CL terms from both sources into one lookup keyed by cell set accession.

# Start from the old-taxonomy rows that have a CL term ...
CL_mapping = df2dict(old_taxonomy[['cell_set_accession', 'CL']].dropna(), 'cell_set_accession')
# ... then add the root-node mappings, which replace any existing entry for the same accession.
CL_mapping.update(
    (root_node['Node'], {'CL': root_node['Cell_type']})
    for root_node in taxd[0]['Root_nodes']
)

In [97]:
# Neurotransmitter labels seen in the annotations:
#[{'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Glut'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'GABA'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Dopa'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Glut-GABA'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Chol'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Hist'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'GABA-Glyc'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Sero'},
# {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Nora'},

# Neurotransmitter (NT) label -> CL term describing that neuron type.
NT = { 'Glut': {'ID': 'CL:0000679', 'label': 'glutamatergic neuron'},
        'GABA': {'ID': 'CL:0000617', 'label': 'GABAergic neuron'},
        'Sero': {'ID': 'CL:0000850', 'label': 'serotonergic neuron'},
        'Dopa': {'ID': 'CL:0000700', 'label': 'dopaminergic neuron'}
      }
       ##  TODO ADD missing NTs

# Computed once, outside the loop: the CL labels that are covered by NT.
nt_labels = {nt_term['label'] for nt_term in NT.values()}

# Entries whose CL term is a plain NT type are dropped from CL_mapping;
# every other entry gets the CL label added.
# Removal is deferred because keys cannot be deleted while iterating over the dict.
to_remove = []
for accession, mapping in CL_mapping.items():
    cl_label = cl_lookup[mapping['CL']]
    if cl_label in nt_labels:
        to_remove.append(accession)
    else:
        mapping['label'] = cl_label

for accession in to_remove:
    CL_mapping.pop(accession)

CL_mapping



{'CS202212150_4972': {'CL': 'CL:0000120', 'label': 'granule cell'},
 'CS202212150_4973': {'CL': 'CL:0000120', 'label': 'granule cell'},
 'CS202212150_4974': {'CL': 'CL:0000120', 'label': 'granule cell'},
 'CS202212150_4975': {'CL': 'CL:0000120', 'label': 'granule cell'},
 'CS202212150_4976': {'CL': 'CL:0000120', 'label': 'granule cell'},
 'CS202212150_5099': {'CL': 'CL:0000121', 'label': 'Purkinje cell'},
 'CS202212150_5102': {'CL': 'CL:0000644', 'label': 'Bergmann glial cell'},
 'CS202212150_5103': {'CL': 'CL:0000127', 'label': 'astrocyte'},
 'CS202212150_5104': {'CL': 'CL:0000127', 'label': 'astrocyte'},
 'CS202212150_5105': {'CL': 'CL:0000127', 'label': 'astrocyte'},
 'CS202212150_5106': {'CL': 'CL:0000127', 'label': 'astrocyte'},
 'CS202212150_5107': {'CL': 'CL:0000127', 'label': 'astrocyte'},
 'CS202212150_5108': {'CL': 'CL:0000127', 'label': 'astrocyte'},
 'CS202212150_5109': {'CL': 'CL:0000127', 'label': 'astrocyte'},
 'CS202212150_5110': {'CL': 'CL:0000127', 'label': 'astrocyte

In [11]:
# One record per term set: its name, its description and, except for the
# neurotransmitter set, a rank derived from the set's order.
annotation_keys = []
orders = sorted(cluster_annotation_term_set['order'])
max_order = orders[-1]
for term_set in cluster_annotation_term_set_dict.values():
    key = {
        'annotation_key': term_set['name'],
        'description': term_set['description'],
    }
    if term_set['name'] != 'neurotransmitter':
        # Inverts the order: the set with the highest order gets rank 0.
        key['rank'] = max_order - term_set['order']
    annotation_keys.append(key)
annotation_keys[0:2]

[{'annotation_key': 'neurotransmitter',
  'description': 'Clusters are assigned based on the average expression of both neurotransmitter transporter genes and key neurotransmitter synthesizing enzyme genes.'},
 {'annotation_key': 'class',
  'description': 'The top level of cell type definition in the mouse whole brain taxonomy. It is primarily determined by broad brain region and neurotransmitter type. All cells within a subclass belong to the same class. Class provides a broader categorization of cell types.',
  'rank': 3}]

In [14]:
## Testing matches when the leading number is stripped from the new term name

# The pattern is a raw string: in a plain string "\d" is an invalid escape
# sequence, which newer Python versions warn about.
old_name_new_name_interesction = set(old_taxonomy_dict.keys()).intersection(
    {re.sub(r"^\d+ (.+)", r"\1", v['name'])
     for v in cluster_annotation_term_dict.values()})
print (len(old_name_new_name_interesction))
print (list(old_name_new_name_interesction)[0:3])

# These names can be used to look up accessions in the old taxonomy

716
['COAa-PAA-MEA Barhl2 Glut', 'IC Tfap2d Maf Glut_1', 'PRP-NI-PRNc-GRN Otp Glut']


In [60]:
# How many old names have an accession that maps to a CL ID in the root_node_CL_mappings

for name in old_name_new_name_interesction:
    accession = old_taxonomy_dict[name]['cell_set_accession']
    if accession in root_node_CL_mappings:
        # Named cl_id rather than id, so the builtin id() stays usable in later cells.
        cl_id = root_node_CL_mappings[accession]
        print(' ; '.join([cl_id, cl_lookup[cl_id]]))


CL:4023072 ; brain vascular cell
CL:0000679 ; glutamatergic neuron
CL:0000700 ; dopaminergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000617 ; GABAergic neuron
CL:0000617 ; GABAergic neuron
CL:0000617 ; GABAergic neuron
CL:0000617 ; GABAergic neuron
CL:0000617 ; GABAergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000617 ; GABAergic neuron
CL:0000617 ; GABAergic neuron
CL:0000617 ; GABAergic neuron
CL:0000679 ; glutamatergic neuron
CL:0000738 ; leukocyte


In [24]:
# Testing parent cell set lookup: which parent labels are not themselves term labels?
# Missing values are mapped to '' on both sides.

parents = set(cluster_annotation_term['parent_term_label'].fillna(''))
labels = set(cluster_annotation_term['label'].fillna(''))
parents - labels

{''}

Conclusion - all parent labels resolve internally

In [25]:
from collections import Counter

# Term names that occur more than once in the new taxonomy.
names = [term['name'] for term in cluster_annotation_term_dict.values()]
duplicate_names = [
    term_name
    for term_name, occurrences in Counter(names).items()
    if occurrences > 1
]

## TODO

* Make a more complete set of links to CL, including labels - but strip the NT (neurotransmitter) terms, which should go on the NT cell sets instead.

In [94]:
import re
annotations = []

# In the old taxonomy only clusters have leading numbers.

def newname_2_old(name:str, old_names):
    """Return the old-taxonomy form of ``name`` if there is one.

    A leading "<digits><space>" prefix is stripped from ``name``. If the
    stripped name is in ``old_names`` it is returned; otherwise ``name`` is
    returned unchanged.

    ``old_names`` can be any container that supports ``in``.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    old_style_name = re.sub(r"^\d+ (.+)", r"\1", name)
    if old_style_name in old_names:
        return old_style_name
    return name

# Build one annotation record per term of the new taxonomy.
#  - Neurotransmitter terms get a key, a label and (when known) a CL term from NT.
#  - Other terms are only annotated when their name resolves to an entry of the
#    old taxonomy, which supplies the accession, parent and CL information.
for v in cluster_annotation_term_dict.values():
    annotation = {}
    name = newname_2_old(v['name'], old_taxonomy_dict.keys())

    if v['cluster_annotation_term_set_name'] == 'neurotransmitter':
        # Ideally mixed NTs would be split.  Do that later
        annotation['cell_annotation_key'] = v['cluster_annotation_term_set_name']
        annotation['cell_label'] = name
        if name in NT:
            annotation['cell_type'] = NT[name]['label']
            annotation['cell_type_ontology_term_id'] = NT[name]['ID']
        # The record is complete: store it and move on to the next term.
        # (A bare `next` is a no-op expression in Python; `continue` is what skips.)
        annotations.append(annotation)
        continue

    if name in duplicate_names:
        # Skip terms whose name occurs more than once.
        continue

    if name not in old_taxonomy_dict:
        # No old-taxonomy entry, so nothing to annotate.
        continue

    annotation['cell_annotation_key'] = v['cluster_annotation_term_set_name']
    annotation['cell_label'] = name
    cell_set_accession = old_taxonomy_dict[name]['cell_set_accession']
    annotation['cell_set_accession'] = cell_set_accession

    ## Now we need to look up parent cell set accessions. For that we need the stripped name
    ### Lookup name of parent
    parent_term_label = v['parent_term_label']
    if parent_term_label and parent_term_label in cluster_annotation_term_dict:
        parent_name = cluster_annotation_term_dict[parent_term_label]['name']
        parent_cell_set_name = newname_2_old(parent_name, old_taxonomy_dict.keys())
        if parent_cell_set_name in old_taxonomy_dict:
            annotation['parent_cell_set_label'] = parent_cell_set_name
            annotation['parent_cell_set_accession'] = old_taxonomy_dict[parent_cell_set_name]['cell_set_accession']
        else:
            print("No mapping for parent: " + parent_term_label + ' ' + parent_name)

    if cell_set_accession in CL_mapping:
        annotation['cell_type'] = CL_mapping[cell_set_accession]['label']
        annotation['cell_type_ontology_term_id'] = CL_mapping[cell_set_accession]['CL']

    annotations.append(annotation)
annotations

No mapping for parent: CS20230722_CLAS_03 03 OB-CR Glut
No mapping for parent: CS20230722_CLAS_04 04 DG-IMN Glut
No mapping for parent: CS20230722_CLAS_04 04 DG-IMN Glut
No mapping for parent: CS20230722_CLAS_06 06 CTX-CGE GABA
No mapping for parent: CS20230722_CLAS_06 06 CTX-CGE GABA
No mapping for parent: CS20230722_CLAS_06 06 CTX-CGE GABA
No mapping for parent: CS20230722_CLAS_06 06 CTX-CGE GABA
No mapping for parent: CS20230722_CLAS_07 07 CTX-MGE GABA
No mapping for parent: CS20230722_CLAS_07 07 CTX-MGE GABA
No mapping for parent: CS20230722_CLAS_07 07 CTX-MGE GABA
No mapping for parent: CS20230722_CLAS_07 07 CTX-MGE GABA
No mapping for parent: CS20230722_CLAS_08 08 CNU-MGE GABA
No mapping for parent: CS20230722_CLAS_08 08 CNU-MGE GABA
No mapping for parent: CS20230722_CLAS_08 08 CNU-MGE GABA
No mapping for parent: CS20230722_CLAS_08 08 CNU-MGE GABA
No mapping for parent: CS20230722_CLAS_09 09 CNU-LGE GABA
No mapping for parent: CS20230722_CLAS_09 09 CNU-LGE GABA
No mapping for par

[{'cell_annotation_key': 'neurotransmitter',
  'cell_label': 'Glut',
  'cell_type': 'glutamatergic neuron',
  'cell_type_ontology_term_id': 'CL:0000679'},
 {'cell_annotation_key': 'neurotransmitter', 'cell_label': ''},
 {'cell_annotation_key': 'neurotransmitter',
  'cell_label': 'GABA',
  'cell_type': 'GABAergic neuron',
  'cell_type_ontology_term_id': 'CL:0000617'},
 {'cell_annotation_key': 'neurotransmitter',
  'cell_label': 'Dopa',
  'cell_type': 'dopaminergic neuron',
  'cell_type_ontology_term_id': 'CL:0000700'},
 {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Glut-GABA'},
 {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Chol'},
 {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'Hist'},
 {'cell_annotation_key': 'neurotransmitter', 'cell_label': 'GABA-Glyc'},
 {'cell_annotation_key': 'neurotransmitter',
  'cell_label': 'Sero',
  'cell_type': 'serotonergic neuron',
  'cell_type_ontology_term_id': 'CL:0000850'},
 {'cell_annotation_key': 'neurotransmitt

In [99]:
# Where are the links to NT added? It does not look like it is here:
# the distinct parent term set labels do not include the NT set.
cluster_annotation_term.loc[:, 'parent_term_set_label'].drop_duplicates()

0                    NaN
44      CCN20230722_CLAS
382     CCN20230722_SUBC
1583    CCN20230722_SUPT
Name: parent_term_set_label, dtype: object