In [1]:
import requests
import json
import yaml
import pandas as pd

# Aim

Return a lookup from ABC accession to CL term.
Do this based on harvesting old CL mappings and mapping forwards on names.

There are 3 sources of mappings

- old root node mappings
- cell set mappings
- NT mappings

NT mappings should supersede any existing mappings to NT.

In [2]:
## NT mappings (using label names from ABC atlas)

# Keys are the neurotransmitter abbreviations; each value carries the CL term
# ID and its label. The labels are compared against CL labels further down
# the notebook, so they must be spelled exactly as in CL.
NT =  { 'Glut': {'ID': 'CL:0000679', 'label': 'glutamatergic neuron'},
        'GABA': {'ID': 'CL:0000617', 'label': 'GABAergic neuron'},
        'Sero': {'ID': 'CL:0000850', 'label': 'serotonergic neuron'},
        'Dopa': {'ID': 'CL:0000700', 'label': 'dopaminergic neuron'},
        'Chol': {'ID': 'CL:0000108', 'label': 'cholinergic neuron'},
        'Hist': {'ID': 'CL:0011110', 'label': 'histaminergic neuron'},
        # Fixed spelling: was 'noradrendergic neuron', which can never equal a CL label.
        'Nora': {'ID': 'CL:0008025', 'label': 'noradrenergic neuron'}
      }


In [3]:
# Make a CL ID to label lookup
# Download the CL JSON release and index the labelled nodes of its first graph.
# A timeout stops a stalled connection from hanging the kernel forever, and
# raise_for_status() reports an HTTP failure here rather than as a confusing
# JSON decoding / KeyError problem on the next line.
clget = requests.get("http://purl.obolibrary.org/obo/cl/cl.json", timeout=120)
clget.raise_for_status()
cl = json.loads(clget.content)['graphs'][0]
# Node ids are rewritten from the CL IRI prefix to the 'CL:' CURIE prefix; ids
# that do not start with that IRI prefix are left as they are. Nodes without
# an 'lbl' key are skipped.
cl_lookup = { n['id'].replace("http://purl.obolibrary.org/obo/CL_", "CL:") : n['lbl']
          for n in cl['nodes']
          if 'lbl' in n }

In [4]:
# Helper for building quick dictionary lookups from dataframes (used here instead of pandas joins)
def df2dict(df: pd.DataFrame, key_column: str):
    """Turn a dataframe into a nested dict keyed on one of its columns.

    Args:
        df: The dataframe to convert.
        key_column: Name of the column whose values become the outer keys.

    Returns:
        A dict of the form {key_value: {other_column: cell_value, ...}}, with
        missing values replaced by the empty string.
    """
    indexed = df.set_index(key_column)
    blanks_filled = indexed.fillna('')
    return blanks_filled.to_dict(orient='index')

In [5]:
# The old taxonomy table carries both the CL mappings and the names we map forward on.
old_taxonomy = pd.read_csv(
    '../data/BDSO_taxonomy_templates/nomenclature_table_CS202212150_annotated.csv',
    sep=',',
)
# Preview the first three rows that actually have a CL mapping.
old_taxonomy[['cell_set_preferred_alias', 'CL']].dropna().head(3)

Unnamed: 0,cell_set_preferred_alias,CL
4414,4972 CB Granule Glut_1,CL:0000120
4415,4973 CB Granule Glut_1,CL:0000120
4416,4974 CB Granule Glut_1,CL:0000120


In [6]:
# Index the old taxonomy rows two ways: by accession and by preferred alias.
old_taxonomy_accession_dict = df2dict(df=old_taxonomy, key_column='cell_set_accession')
old_taxonomy_name_dict = df2dict(df=old_taxonomy, key_column='cell_set_preferred_alias')


In [7]:

# Keep only the alias/CL pairs that are fully populated, then key them on the alias.
alias_to_cl = old_taxonomy[['cell_set_preferred_alias', 'CL']].dropna()
CL_mapping = df2dict(alias_to_cl, 'cell_set_preferred_alias')

In [8]:
# Load the taxonomy details YAML; safe_load accepts the open file handle directly.
with open('../data/BDSO_taxonomy_templates/taxonomy_details.yaml', 'r') as taxd_file:
    taxd = yaml.safe_load(taxd_file)
# Peek at the first three root-node records of the first entry.
taxd[0]['Root_nodes'][:3]

[{'Node': 'CS202212150_6347',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'},
 {'Node': 'CS202212150_6348',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'},
 {'Node': 'CS202212150_6349',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'}]

In [9]:
# The root nodes need mapping too. Use the same alias-keyed structure as
# CL_mapping so the two can be combined easily.

root_node_CL_mappings = {}
for root_node in taxd[0]['Root_nodes']:
    accession = root_node['Node']
    # Translate the node accession into its preferred alias via the old taxonomy.
    alias = old_taxonomy_accession_dict[accession]['cell_set_preferred_alias']
    root_node_CL_mappings[alias] = {
        'CL': root_node['Cell_type'],
        'cell_set_accession': accession,
    }

In [10]:
# Merge the root node mappings into CL_mapping, modifying CL_mapping in place.
# Where an alias is present in both dicts, the root node entry replaces the
# existing one (dict.update semantics).
CL_mapping.update(root_node_CL_mappings)

In [11]:
# Preview the first three (alias, mapping) pairs of the merged lookup.
list(CL_mapping.items())[:3]


[('4972 CB Granule Glut_1', {'CL': 'CL:0000120'}),
 ('4973 CB Granule Glut_1', {'CL': 'CL:0000120'}),
 ('4974 CB Granule Glut_1', {'CL': 'CL:0000120'})]

In [12]:
# remove NTs. We only want to use the mapping of these directly to cluster in the NT field in ABC.

# Match on CL IDs rather than on labels: going through cl_lookup[...] raises a
# KeyError for any ID missing from the lookup, and a label comparison silently
# fails if a label in NT is not spelled exactly as in CL. The ID set is built
# once instead of rebuilding a label list on every iteration.
nt_ids = {n['ID'] for n in NT.values()}
to_remove = [k for k, v in CL_mapping.items() if v['CL'] in nt_ids]

# Pop after collecting the keys (a dict cannot be resized while iterating over
# it); the resulting list of removed entries is displayed as the cell output.
[CL_mapping.pop(key) for key in to_remove]


[{'CL': 'CL:0000679'},
 {'CL': 'CL:0000617'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6347'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6348'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6349'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6352'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6353'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6354'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6355'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6356'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6357'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_5468'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6359'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6361'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6362'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6363'},
 {'CL': 'CL:0000850', 'cell_set_accession': 

In [13]:
# Load the relevant ABC metadata file into a dataframe.
dpath = '../data/WMB_taxonomy_20230830/'
cluster_annotation_term_path = dpath + 'cluster_annotation_term.csv'
cluster_annotation_term = pd.read_csv(cluster_annotation_term_path, sep=',')

In [14]:
import re
def newname_2_old(name:str, old_names):
    """Map a new-style name back to an old-style name where one exists.

    A leading run of digits followed by a single space (e.g. the '001 ' in
    '001 CLA-EPd-CTX Car3 Glut') is stripped from `name`. If the stripped
    form is found in `old_names` it is returned; otherwise `name` is
    returned unchanged.

    Args:
        name: The name to convert.
        old_names: Container of known old names, used only for `in` tests.

    Returns:
        The stripped name when it is a known old name, else the original name.
    """
    # Raw string: in a plain string literal '\d' is an invalid escape sequence,
    # which Python reports as a DeprecationWarning / SyntaxWarning.
    old_style_name = re.sub(r"^\d+ (.+)", r"\1", name)
    if old_style_name in old_names:
        return old_style_name
    else:
        return name


In [15]:
# Key every ABC annotation term on its 'label' column.
cluster_annotation_term_dict = df2dict(df=cluster_annotation_term, key_column='label')

In [16]:
# Build the lookup from ABC term label to CL ID by mapping names forward.
known_old_names = old_taxonomy_name_dict.keys()
out = {}
for term_label, term in cluster_annotation_term_dict.items():
    candidate = newname_2_old(term['name'], known_old_names)
    mapping = CL_mapping.get(candidate)
    # Terms whose (converted) name has no CL mapping are left out.
    if mapping is not None:
        out[term_label] = mapping['CL']

out
    

{'CS20230722_CLAS_33': 'CL:4023072',
 'CS20230722_CLAS_34': 'CL:0000738',
 'CS20230722_SUBC_327': 'CL:0000128',
 'CS20230722_SUBC_330': 'CL:4023051',
 'CS20230722_SUBC_331': 'CL:2000043',
 'CS20230722_SUBC_332': 'CL:0000192',
 'CS20230722_SUBC_333': 'CL:0000115',
 'CS20230722_SUBC_338': 'CL:0000542',
 'CS20230722_SUPT_1154': 'CL:0000120',
 'CS20230722_SUPT_1159': 'CL:0000127',
 'CS20230722_SUPT_1163': 'CL:0000127',
 'CS20230722_SUPT_1166': 'CL:0000127',
 'CS20230722_SUPT_1172': 'CL:0002085',
 'CS20230722_SUPT_1175': 'CL:0000065',
 'CS20230722_SUPT_1179': 'CL:0002453',
 'CS20230722_SUPT_1187': 'CL:4023051',
 'CS20230722_CLUS_5197': 'CL:0000120',
 'CS20230722_CLUS_5198': 'CL:0000120',
 'CS20230722_CLUS_5199': 'CL:0000120',
 'CS20230722_CLUS_5208': 'CL:0000127',
 'CS20230722_CLUS_5209': 'CL:0000127',
 'CS20230722_CLUS_5210': 'CL:0000127',
 'CS20230722_CLUS_5211': 'CL:0000127',
 'CS20230722_CLUS_5212': 'CL:0000127',
 'CS20230722_CLUS_5213': 'CL:0000127',
 'CS20230722_CLUS_5224': 'CL:000012

In [23]:
# Distinct CL terms that were mapped, paired with their CL labels.
{(term_id, cl_lookup[term_id]) for term_id in set(out.values())}

{('CL:0000065', 'ependymal cell'),
 ('CL:0000115', 'endothelial cell'),
 ('CL:0000120', 'granule cell'),
 ('CL:0000127', 'astrocyte'),
 ('CL:0000128', 'oligodendrocyte'),
 ('CL:0000192', 'smooth muscle cell'),
 ('CL:0000542', 'lymphocyte'),
 ('CL:0000738', 'leukocyte'),
 ('CL:0002085', 'tanycyte'),
 ('CL:0002453', 'oligodendrocyte precursor cell'),
 ('CL:2000043', 'brain pericyte'),
 ('CL:4023051', 'vascular leptomeningeal cell'),
 ('CL:4023072', 'brain vascular cell')}

## Hmmmm

That's a lot of work for a very small number of CL term mappings...!  And no neurons!!!!  It may be that we have lost a lot to name changes.

With this number it might be better to re-review the latest taxonomies by hand!

In [24]:
# This looks like a good level for manual mappings

# Select with a boolean mask instead of a Python-level iterrows() loop; the
# result is the same list of 'name' values for the 'subclass' term set, in
# row order.
is_subclass = cluster_annotation_term['cluster_annotation_term_set_name'] == 'subclass'
cluster_annotation_term.loc[is_subclass, 'name'].tolist()

['001 CLA-EPd-CTX Car3 Glut',
 '002 IT EP-CLA Glut',
 '003 L5/6 IT TPE-ENT Glut',
 '004 L6 IT CTX Glut',
 '005 L5 IT CTX Glut',
 '006 L4/5 IT CTX Glut',
 '007 L2/3 IT CTX Glut',
 '008 L2/3 IT ENT Glut',
 '009 L2/3 IT PIR-ENTl Glut',
 '010 IT AON-TT-DP Glut',
 '011 L2 IT ENT-po Glut',
 '012 MEA Slc17a7 Glut',
 '013 COAp Grxcr2 Glut',
 '014 LA-BLA-BMA-PA Glut',
 '015 ENTmv-PA-COAp Glut',
 '016 CA1-ProS Glut',
 '017 CA3 Glut',
 '018 L2 IT PPP-APr Glut',
 '019 L2/3 IT PPP Glut',
 '020 L2/3 IT RSP Glut',
 '021 L4 RSP-ACA Glut',
 '022 L5 ET CTX Glut',
 '023 SUB-ProS Glut',
 '024 L5 PPP Glut',
 '025 CA2-FC-IG Glut',
 '026 NLOT Rho Glut',
 '027 L6b EPd Glut',
 '028 L6b/CT ENT Glut',
 '029 L6b CTX Glut',
 '030 L6 CT CTX Glut',
 '031 CT SUB Glut',
 '032 L5 NP CTX Glut',
 '033 NP SUB Glut',
 '034 NP PPP Glut',
 '035 OB Eomes Ms4a15 Glut',
 '036 HPF CR Glut',
 '037 DG Glut',
 '038 DG-PIR Ex IMN',
 '039 OB Meis2 Thsd7b Gaba',
 '040 OB Trdn Gaba',
 '041 OB-in Frmd7 Gaba',
 '042 OB-out Frmd7 Gaba',
 