In [66]:
import requests
import json
import yaml
import pandas as pd

# Aim

Return a lookup from ABC accession to CL term.
Do this based on harvesting old CL mappings and mapping forwards on names.

There are 3 sources of mappings

- old root node mappings
- cell set mappings
- NT mappings

NT mappings should supersede any existing mappings to NT.

In [85]:
## NT mappings (using label names from ABC atlas)
# Keyed by neurotransmitter abbreviation; each value carries the CL term 'ID' (CURIE) and its 'label'.
# The 'label' strings are intended to be the exact CL term labels for the given IDs.

NT =  { 'Glut': {'ID': 'CL:0000679', 'label': 'glutamatergic neuron'},
        'GABA': {'ID': 'CL:0000617', 'label': 'GABAergic neuron'},
        'Sero': {'ID': 'CL:0000850', 'label': 'serotonergic neuron'},
        'Dopa': {'ID': 'CL:0000700', 'label': 'dopaminergic neuron'},
        'Chol': {'ID': 'CL:0000108', 'label': 'cholinergic neuron'},
        'Hist': {'ID': 'CL:0011110', 'label': 'histaminergic neuron'},
        'Nora': {'ID': 'CL:0008025', 'label': 'noradrenergic neuron'}
      }


In [86]:
# Make a CL ID to label lookup.
# Keys are CURIEs ('CL:0000679') for nodes whose IRI starts with the CL prefix; nodes from other
# namespaces keep their original id. Nodes without an 'lbl' are skipped.
# The timeout stops a stalled connection from hanging the kernel, and raise_for_status() turns an
# HTTP error into a clear exception instead of a confusing JSON/KeyError further down.
clget = requests.get("http://purl.obolibrary.org/obo/cl/cl.json", timeout=60)
clget.raise_for_status()
cl = json.loads(clget.content)['graphs'][0]
cl_lookup = { n['id'].replace("http://purl.obolibrary.org/obo/CL_", "CL:") : n['lbl']
          for n in cl['nodes']
          if 'lbl' in n }

In [87]:
# Helper for generating quick dictionary lookups from dataframes - cos I suck at pandas joins
def df2dict(df: pd.DataFrame, key_column: str):
    """Turn a dataframe into a nested dict keyed by the values of one column.

    Args:
        df: Source dataframe (not modified).
        key_column: Column whose values become the outer dict keys.

    Returns:
        ``{key_value: {other_column: cell_value, ...}, ...}`` with missing
        values replaced by the empty string.
    """
    keyed = df.set_index(key_column)
    without_nans = keyed.fillna('')
    return without_nans.to_dict(orient='index')

In [88]:
# The old taxonomy carries the CL mappings, plus the names we will use to map them forward.
old_taxonomy = pd.read_csv(
    '../data/BDSO_taxonomy_templates/nomenclature_table_CS202212150_annotated.csv',
    sep=',',
)
# Preview the first three rows that have both an alias and a CL term.
old_taxonomy[['cell_set_preferred_alias', 'CL']].dropna().head(3)

Unnamed: 0,cell_set_preferred_alias,CL
4414,4972 CB Granule Glut_1,CL:0000120
4415,4973 CB Granule Glut_1,CL:0000120
4416,4974 CB Granule Glut_1,CL:0000120


In [89]:
# Two views of the old taxonomy: one keyed by preferred alias, one keyed by accession.
old_taxonomy_name_dict = df2dict(df=old_taxonomy, key_column='cell_set_preferred_alias')
old_taxonomy_accession_dict = df2dict(df=old_taxonomy, key_column='cell_set_accession')


In [90]:

# Alias -> {'CL': ...}, restricted to rows where both the alias and the CL term are present.
alias_to_cl = old_taxonomy[['cell_set_preferred_alias', 'CL']].dropna()
CL_mapping = df2dict(alias_to_cl, key_column='cell_set_preferred_alias')

In [91]:
# Load the taxonomy details; safe_load accepts the open file handle directly.
with open('../data/BDSO_taxonomy_templates/taxonomy_details.yaml', 'r') as taxd_file:
    taxd = yaml.safe_load(taxd_file)
# Preview the first three root-node records.
taxd[0]['Root_nodes'][:3]

[{'Node': 'CS202212150_6347',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'},
 {'Node': 'CS202212150_6348',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'},
 {'Node': 'CS202212150_6349',
  'Cell_type': 'CL:0000679',
  'Location_relation': 'has_soma_location'}]

In [92]:
# Root nodes need mapping too. To make combining easy, build the same shape as CL_mapping:
# preferred alias -> {'CL': ..., 'cell_set_accession': ...}.

root_node_CL_mappings = {}
for root in taxd[0]['Root_nodes']:
    # Root nodes are recorded by accession; look up the alias via the old taxonomy.
    alias = old_taxonomy_accession_dict[root['Node']]['cell_set_preferred_alias']
    root_node_CL_mappings[alias] = {'CL': root['Cell_type'], 'cell_set_accession': root['Node']}

In [93]:
# Merge into one mapping. update() mutates CL_mapping in place; for any alias present in both
# dicts, the root-node entry replaces the existing one.
CL_mapping.update(root_node_CL_mappings)

In [94]:
# Preview the first three (alias, mapping) pairs of the merged lookup.
list(CL_mapping.items())[:3]


[('4972 CB Granule Glut_1', {'CL': 'CL:0000120'}),
 ('4973 CB Granule Glut_1', {'CL': 'CL:0000120'}),
 ('4974 CB Granule Glut_1', {'CL': 'CL:0000120'})]

In [95]:
# Remove NT-level mappings. We only want to use the mapping of these directly to cluster in the NT field in ABC.
# Match on CL ID rather than on label text: IDs are stable, whereas a label comparison silently
# misses a term whenever a hand-typed label differs from the ontology's label, and fails with a
# KeyError for any ID absent from the label lookup.
nt_ids = {nt['ID'] for nt in NT.values()}

# Collect the keys first: popping while iterating over CL_mapping would raise RuntimeError.
to_remove = [alias for alias, mapping in CL_mapping.items() if mapping['CL'] in nt_ids]

# Last expression of the cell, so the dropped mappings are displayed.
[CL_mapping.pop(key) for key in to_remove]

[{'CL': 'CL:0000679'},
 {'CL': 'CL:0000617'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6347'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6348'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6349'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6352'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6353'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6354'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6355'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6356'},
 {'CL': 'CL:0000617', 'cell_set_accession': 'CS202212150_6357'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_5468'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6359'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6361'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6362'},
 {'CL': 'CL:0000679', 'cell_set_accession': 'CS202212150_6363'},
 {'CL': 'CL:0000850', 'cell_set_accession': 

In [96]:
# Load the relevant ABC metadata file into a dataframe.
dpath = '../data/WMB_taxonomy_20230830/'
cluster_annotation_term_path = dpath + 'cluster_annotation_term.csv'
cluster_annotation_term = pd.read_csv(cluster_annotation_term_path, sep=',')

In [97]:
import re
def newname_2_old(name:str, old_names):
    """Map a new-style name back to its old-style form when that form is known.

    A leading run of digits followed by a single space (e.g. ``"4972 "``) is stripped
    from ``name``. The stripped form is returned only if it occurs in ``old_names``;
    otherwise ``name`` is returned unchanged.

    Args:
        name: Name to convert.
        old_names: Container of known old-style names (anything supporting ``in``).

    Returns:
        The stripped name if it is in ``old_names``, else the original ``name``.
    """
    # Raw string: in a plain string literal "\d" is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    old_style_name = re.sub(r"^\d+ (.+)", r"\1", name)
    return old_style_name if old_style_name in old_names else name


In [98]:
# ABC annotation terms keyed by their 'label' column.
cluster_annotation_term_dict = df2dict(df=cluster_annotation_term, key_column='label')

In [104]:
# Build the final lookup: ABC term key -> CL ID, for every term whose (old-style) name has a CL mapping.
known_old_names = old_taxonomy_name_dict.keys()
out = {}
for term_key, term in cluster_annotation_term_dict.items():
    candidate_name = newname_2_old(term['name'], known_old_names)
    mapping = CL_mapping.get(candidate_name)
    if mapping is not None:
        out[term_key] = mapping['CL']

out
    

{'CS20230722_CLAS_33': 'CL:4023072',
 'CS20230722_CLAS_34': 'CL:0000738',
 'CS20230722_SUBC_327': 'CL:0000128',
 'CS20230722_SUBC_330': 'CL:4023051',
 'CS20230722_SUBC_331': 'CL:2000043',
 'CS20230722_SUBC_332': 'CL:0000192',
 'CS20230722_SUBC_333': 'CL:0000115',
 'CS20230722_SUBC_338': 'CL:0000542',
 'CS20230722_SUPT_1154': 'CL:0000120',
 'CS20230722_SUPT_1159': 'CL:0000127',
 'CS20230722_SUPT_1163': 'CL:0000127',
 'CS20230722_SUPT_1166': 'CL:0000127',
 'CS20230722_SUPT_1172': 'CL:0002085',
 'CS20230722_SUPT_1175': 'CL:0000065',
 'CS20230722_SUPT_1179': 'CL:0002453',
 'CS20230722_SUPT_1187': 'CL:4023051',
 'CS20230722_CLUS_5197': 'CL:0000120',
 'CS20230722_CLUS_5198': 'CL:0000120',
 'CS20230722_CLUS_5199': 'CL:0000120',
 'CS20230722_CLUS_5208': 'CL:0000127',
 'CS20230722_CLUS_5209': 'CL:0000127',
 'CS20230722_CLUS_5210': 'CL:0000127',
 'CS20230722_CLUS_5211': 'CL:0000127',
 'CS20230722_CLUS_5212': 'CL:0000127',
 'CS20230722_CLUS_5213': 'CL:0000127',
 'CS20230722_CLUS_5224': 'CL:000012

In [103]:
# Distinct CL terms that were mapped. `out` maps each key straight to a CL ID string, so collect
# the values directly (indexing a str value with ['CL'] raises TypeError).
set(out.values())

{'CL:0000065',
 'CL:0000115',
 'CL:0000120',
 'CL:0000127',
 'CL:0000128',
 'CL:0000192',
 'CL:0000542',
 'CL:0000738',
 'CL:0002085',
 'CL:0002453',
 'CL:2000043',
 'CL:4023051',
 'CL:4023072'}

## Hmmmm

That's a lot of work for a very small number of CL term mappings...!