In [1]:
import json

#  Open files

with open('./intermediatte_files/Siletti_from_abc.json', 'r') as f:
    Siletti_abc_cas = json.loads(f.read())

with open('./cas_source/CS202210140_non_neuronal.json', 'r') as f:
    nn = json.loads(f.read())
with open('./cas_source/CS202210140_neurons.json', 'r') as f:
    neurons = json.loads(f.read())

#Generate lookups for accessions for nn and neuron clusters respectively
neuron_clusters = { a['cell_set_accession'] for a in neurons['annotations'] 
                    if a['labelset'] == 'Cluster'}
nn_clusters = { a['cell_set_accession'] for a in nn['annotations'] 
                    if a['labelset'] == 'Cluster'}

# Make lists of subclusters from ABC that have a parent cluster in nn & neurons respectively
nn_subclusters = [a for a in Siletti_abc_cas['annotations']
                  if (a['labelset'] == 'subcluster') 
                  and (a['parent_cell_set_accession'] in nn_clusters)]
neuron_subclusters = [a for a in Siletti_abc_cas['annotations']
                  if (a['labelset'] == 'subcluster') 
                  and (a['parent_cell_set_accession'] in neuron_clusters)]

# Merge in subclusters

nn['annotations'].extend(nn_subclusters)
neurons['annotations'].extend(neuron_subclusters)

In [2]:
#Commenting as depends on path to anndata file - not stored in repo
#%%bash
# Generate file of CAS files from subclusters in Siletti anndata files
#cas anndata2cas --anndata Siletti_all_non_neuronal_cells.h5ad --labelsets subcluster_id --output intermediatte_files/Siletti_subclusters_from_anndata_cas.json

In [3]:
# Generate lookup for subcluster cell_ids, keyed on subclusters_id

# Open files
import json
with open('./intermediatte_files/Siletti_subclusters_from_anndata_cas.json', 'r') as f:
    Siletti_subclusters_from_ad = json.loads(f.read())

subcluster_cell_id_lookup = { a['cell_label'] : a['cell_ids'] for a in Siletti_subclusters_from_ad['annotations']}

In [4]:
# Check number of matches on both sides
len(subcluster_cell_id_lookup.keys())

297

In [5]:
len(Siletti_subclusters_from_ad['annotations'])

297

In [6]:
len(neuron_subclusters)

3016

In [7]:
neuron_subclusters[0]

{'labelset': 'subcluster',
 'cell_label': 'URL_297_0',
 'cell_set_accession': 'CS202210140_494',
 'parent_cell_set_accession': 'CS202210140_298'}

In [8]:
import re
# Testing regex pattern
m = re.match('.+_(\d+)', 'URL_297_0')
m.group(1)

'0'

In [9]:
# Use regex matching ID -> label to attach cell_ids to subclusters
for c in neuron_subclusters:
    m = re.match('.+_(\d+)', c['cell_label'])
    if int(m.group(1)) in subcluster_cell_id_lookup.keys():
        c['cell_ids'] = subcluster_cell_id_lookup[int(m.group(1))]

In [10]:
# Test addition cell_Ids has worked
neuron_subclusters[0]['cell_ids'][0:5]

KeyError: 'cell_ids'

In [None]:
import pandas as pd
AT = pd.read_csv('./Annotation_transfer/Clusters_AT_from_MTG.tsv', sep='\t')
MTG_annotations = pd.read_csv('./MTG_cas/CCN20240304_annotations.tsv', sep='\t')
MTG_annotations[0:5]

In [None]:
# Aim 
# Look up transfered MTG names (in MT) in MTG cell_label var - MT - & link Siletti Cell label to MTG annotation object

# 1. Make template for annotation transfer objects
# 2. Iterate over annotatoin transfer labels, matching to cluster labels & using this to generate annotation transfer objects.
# 3. Attach annotation transfer objects to annotatinons

In [None]:
MTG_annotation_lookup = MTG_annotations.set_index("cell_label").to_dict(orient='index')

In [None]:
# What matches?
set(MTG_annotation_lookup.keys()).intersection(set(AT['Transferred MTG Label']))

In [None]:
# What doesn't match?  - Can we match this by hand?
(set(AT['Transferred MTG Label']))-set(MTG_annotation_lookup.keys())

In [None]:
# Dict comp => Dict keyed on Siletti Cluster_name (=cell_label) with value = MTG annotation transfer object
# Iterate over Transferred annotations (AT)
out = {}
comment = "We performed PCA (50 components) on our full dataset, trained a random forest classifier (scikit-learn, class_ weight=‘balanced’, max_depth=50) on the MTG labels, and then predicted labels for all cells. We labeled each cluster with the mode of its constituent cells if two conditions were met: more than 0.8 of predicted labels matched the mode, and the mean probability of these pre- dictions was greater than 0.8."
for i,r in AT.iterrows(): 
    if r['Transferred MTG Label'] in MTG_annotation_lookup.keys():
        MTG_ann =  MTG_annotation_lookup[r['Transferred MTG Label']]
        out[r['Cluster name']]= { "transferred_cell_label" : r['Transferred MTG Label'], 
                                  "source_node_accession": MTG_ann['labelset'] + '_' + MTG_ann['cell_set_accession'],
                                  "source_taxonomy": "https://purl.brain-bican.org/taxonomy/AIT_MTG/AIT_MTG.json",
                                  "comment": comment }
# Test
out['Amex_175']

In [None]:
# Attach Annotation transfers to CAS for nn & neurons matching on names

for a in neurons['annotations']:
    if a['cell_label'] in out.keys():
        a['annotation_transfer'] = [out[a['cell_label']]]
        # Testing
[a['annotation_transfer'] for a in neurons['annotations'] if 'annotation_transfer' in a.keys()][0]

In [None]:
# TODO - repeat for Neurons. Save and test CAS.json