**MAKE SURE THE PICKLE FILE USED HERE IS UP TO DATE!**

The pickle file is created in density3_test_total_neuron_count_pickle.ipynb

The current validation script is here: 
- https://bbpgitlab.epfl.ch/molsys/personal/gene_to_ccfv3/-/blob/main/cell_atlas/validation.py?ref_type=heads

Currently it is checking for **cell, neuron, glia, astrocyte, microglia, oligodendrocyte, gad, pv, sst, vip, generic_excitatory, generic_inhibitory** numbers

In the **cerebellum, isocortex, fiber_tracts_ids, hippocampus, hippocampal_formation, thalamus, striatum, VPL, LGd, VPM, MOB** regions

Files to create:
   - cell_density.nrrd
   - neuron_density.nrrd
   - glia_density.nrrd
   - astrocyte_density.nrrd
   - microglia_density.nrrd
   - oligodendrocyte_density.nrrd
   - gad67+_density.nrrd
   - pv+_density.nrrd
   - sst+_density.nrrd
   - vip+_density.nrrd
   - Generic_Excitatory_Neuron_MType_Generic_Excitatory_Neuron_EType.nrrd
   - Generic_Inhibitory_Neuron_MType_Generic_Inhibitory_Neuron_EType.nrrd

Reconciling region ids:
- parcellation_term_label = 1.json / AllenCCF-Annotation-2020-
- parcellation_label = AllenCCF-Annotation-2020

In [1]:
import pickle
import pandas as pd
import json
from voxcell import RegionMap
import nrrd
import numpy as np
import os
import copy

In [9]:

# Directory that holds the results of the density calculations.
path = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/'

# Read the pickled dictionary of DataFrames (its keys are used as region
# acronyms further down in this notebook).
pickle_file = f'{path}total_cells.pickle'
with open(pickle_file, mode='rb') as handle:
    total_cells = pickle.load(handle)


In [3]:
# Parcellation-to-term membership table (one row per parcellation label / term pair,
# judging by the file name -- TODO confirm).
file = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/parcellation_to_parcellation_term_membership_extend.csv'
parcellation_annotation = pd.read_csv(filepath_or_buffer=file)
# Optional restriction to the 'substructure' term set (currently disabled):
#parcellation_annotation = parcellation_annotation[parcellation_annotation['parcellation_term_set_name'] == 'substructure']

In [4]:
# Every key of the pickled dictionary is a region acronym.
regions = list(total_cells)
print(f'Regions _covered_ in the brain / total cells dict of df: {len(regions)}')

# The two "unassigned" pseudo-regions have no counterpart in the 1.json
# hierarchy, so they are left out of the region list.
for unplaceable in ('unassigned', 'brain-unassigned'):
    if unplaceable in regions:
        regions.remove(unplaceable)
        print(f"For now we are removing the region {unplaceable} as we cannot place it in the 1.json system")

print(f"Remaining regions: {len(regions)}")

# Annotation rows whose term acronym is one of the remaining regions.
covered_mask = parcellation_annotation['parcellation_term_acronym'].isin(regions)
region_info = parcellation_annotation[covered_mask]
region_info.shape

Regions _covered_ in the brain / total cells dict of df: 703
For now we are removing the region unassigned as we cannot place it in the 1.json system
For now we are removing the region brain-unassigned as we cannot place it in the 1.json system
Remaining regions: 701


(959, 19)

In [5]:
# Annotation rows whose acronym has no entry in the total-cells dictionary.
is_covered = parcellation_annotation['parcellation_term_acronym'].isin(regions)
missing_regions = parcellation_annotation[~is_covered]

# Of those, keep only the rows that belong to the 'substructure' term set.
is_substructure = missing_regions['parcellation_term_set_name'] == 'substructure'
missin_substructures = missing_regions[is_substructure]

uncovered_names = missin_substructures['parcellation_term_name'].values
print(f"All substructures not covered by merfish slices: \n1. {uncovered_names}")

All substructures not covered by merfish slices: 
1. ['Frontal pole, layer 1' 'Retrosplenial area, dorsal part, layer 4'
 'Lobule II' 'Lobule III' 'Lobules IV-V' 'Crus 1' 'Crus 2'
 'cuneate fascicle' 'pyramidal decussation' 'direct tectospinal pathway'
 'unassigned' 'brain, unassigned' 'Field CA1, unassigned'
 'Field CA3, unassigned' 'Bed nucleus of the anterior commissure'
 'Gracile nucleus' 'Lingula (I)' 'Declive (VI)'
 'Folium-tuber vermis (VII)' 'Pyramus (VIII)' 'Uvula (IX)' 'Nodulus (X)'
 'Simple lobule' 'Paramedian lobule' 'Copula pyramidis' 'Paraflocculus'
 'Flocculus' 'vomeronasal nerve' 'central canal, spinal cord/medulla']


In [6]:
# Sanity check: nothing is printed when every region acronym has at least one
# row in `region_info`.
# NOTE(review): `region_info` is built from all term sets, not only
# 'substructure', so the printed message is stricter than what is actually
# checked -- confirm the intent.
known_acronyms = set(region_info['parcellation_term_acronym'])
for region in regions:
    if region not in known_acronyms:
        print(f"{region} is not a substructure! ")

## Region parcellation based on validation code

We check which regions are covered by the current validation code (collected in __registered_regions__) and which are not (collected in __missing_reg__).
- `strings_to_check`: regions for which literature data is available.
- `second_list`: the remaining areas of the brain.


In [7]:
# Previously used hierarchy file, kept for reference:
#hierarchy_json = '/gpfs/bbp.cscs.ch/home/veraszto/bbp_prod_files/1.json'
hierarchy_json = '/gpfs/bbp.cscs.ch/data/project/proj84/atlas_pipeline_runs/2024-05-15T22:44:26+02:00/hierarchy_ccfv3_l23split_barrelsplit.json'
region_map = RegionMap.load_json(hierarchy_json)

# Strings searched for in the ascendant *names* of each region.
# NOTE(review): 'VPL', 'LGd', 'VPM' and 'MOB' are acronyms but are compared
# against region names here -- confirm that this is intended.
strings_to_check = ['Cerebellum', 'Isocortex', 'fiber tracts', 'ventricular systems', 'Hippocampus', 
                    'Hippocampal formation', 'Thalamus', 'Striatum', 'VPL', 'LGd', 'VPM', 'MOB']

# Fallback strings, consulted only when nothing from `strings_to_check` matched.
second_list = ['Brain stem', 'Olfactory areas', 'Cortical subplate', 'Cerebral nuclei', ]
missing_reg = []
registered_regions = []

substring = "AllenCCF-Annotation-2020-"
for region in regions:
    # The numeric region id is whatever follows the prefix in the first matching label.
    long_id = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = long_id.iloc[0]
    region_id = first_label[first_label.find(substring) + len(substring):]

    ascendants = region_map.get(int(region_id), "name", with_ascendants=True)

    if any(s in a for s in strings_to_check for a in ascendants):
        registered_regions.append(region_id)
    elif any(s in a for s in second_list for a in ascendants):
        missing_reg.append(region_id)
    else:
        # Neither list matched: show the offending id and stop the scan.
        print(region_id)
        break

## Calculate total cells and density in a region: Isocortex

Deal with regions missing from the total cells

In [8]:
# Every lookup below searches by region name and also collects the ids of all
# descendants of the matched region.
by_name_with_children = dict(attr="name", with_descendants=True)

# Composite areas: the union of several named regions.
isocortex = (
    region_map.find("Isocortex", **by_name_with_children)
    | region_map.find("Entorhinal area", **by_name_with_children)
    | region_map.find("Piriform area", **by_name_with_children)
)
cerebellum = (
    region_map.find("Cerebellum", **by_name_with_children)
    | region_map.find("arbor vitae", **by_name_with_children)
)
fiber_tracts_ids = (
    region_map.find("fiber tracts", **by_name_with_children)
    | region_map.find("grooves", **by_name_with_children)
    | region_map.find("ventricular systems", **by_name_with_children)
)

# Areas that map to a single named region.
hippocampus = region_map.find("Hippocampal region", **by_name_with_children)
hippocampal_formation = region_map.find("Hippocampal formation", **by_name_with_children)
thalamus = region_map.find("Thalamus", **by_name_with_children)
striatum = region_map.find("Striatum", **by_name_with_children)
VPL = region_map.find("Ventral posterolateral nucleus of the thalamus", **by_name_with_children)
LGd = region_map.find("Dorsal part of the lateral geniculate complex", **by_name_with_children)
VPM = region_map.find("Ventral posteromedial nucleus of the thalamus", **by_name_with_children)
MOB = region_map.find("Main olfactory bulb", **by_name_with_children)

In [9]:
#This cell will complement the two cells above
Hindbrain = (
    region_map.find("Hindbrain", attr="name", with_descendants=True)
)
Midbrain = (
    region_map.find("Midbrain", attr="name", with_descendants=True)
)
Interbrain = (
    region_map.find("Interbrain", attr="name", with_descendants=True)
)
Olfactoryareas = (
    region_map.find("Olfactory areas", attr="name", with_descendants=True)
)
Corticalsubplate = (
    region_map.find("Cortical subplate", attr="name", with_descendants=True)
)
Cerebralnuclei = (
    region_map.find("Cerebral nuclei", attr="name", with_descendants=True)
)
cerebral_cortex = (
    region_map.find("Cerebral cortex", attr="name", with_descendants=True)
)
wholebrain = (
    region_map.find("root", attr="name", with_descendants=True)
)

areas = {
    'wholebrain': wholebrain, 'Cerebral cortex': cerebral_cortex,
    'isocortex': isocortex, 'cerebellum': cerebellum, 'fiber_tracts_ids': fiber_tracts_ids, 
    'hippocampus': hippocampus, 'hippocampal_formation':hippocampal_formation, 
    'thalamus': thalamus, 'striatum': striatum, 'VPL': VPL, 'LGd': LGd, 'VPM': VPM, 'MOB': MOB,
    'Olfactory areas': Olfactoryareas, 'Cortical subplate': Corticalsubplate, 
    'Cerebral nuclei': Cerebralnuclei, 'Interbrain': Interbrain , 'Midbrain': Midbrain, 
    'Interbrain': Interbrain
}

In [13]:
# Work on a copy so the dictionary loaded from the pickle is never modified.
total_cells_copy = copy.deepcopy(total_cells)

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    # Per-region DataFrames (all leaf regions and their cell types) inside this area.
    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]} different cell types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 5272 different cell types
Total sum of cells: 64,089,207
The brain area Cerebral cortex has 3095 different cell types
Total sum of cells: 28,146,467
The brain area isocortex has 946 different cell types
Total sum of cells: 17,188,892
The brain area cerebellum has 861 different cell types
Total sum of cells: 6,072,302
The brain area fiber_tracts_ids has 3174 different cell types
Total sum of cells: 7,977,437
The brain area hippocampus has 590 different cell types
Total sum of cells: 4,086,949
The brain area hippocampal_formation has 1743 different cell types
Total sum of cells: 6,008,031
The brain area thalamus has 1682 different cell types
Total sum of cells: 2,823,808
The brain area striatum has 1379 different cell types
Total sum of cells: 8,114,890
The brain area VPL has 81 different cell types
Total sum of cells: 117,011
The brain area LGd has 63 different cell types
Total sum of cells: 107,728
The brain area VPM has 58 different cell types
Total sum o

In [14]:
# Cell metadata with cluster annotation; only the taxonomy columns are kept.
meta_path = "/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/WMB-10X/20231215/views/cell_metadata_with_cluster_annotation.csv"
taxonomy_columns = ['class', 'subclass', 'cluster']
metadata = pd.read_csv(meta_path, dtype={'cell_label':str})[taxonomy_columns]

# Taxonomy classes treated as neuronal.
n_classes = ['01 IT-ET Glut', '02 NP-CT-L6b Glut', '03 OB-CR Glut',
       '04 DG-IMN Glut', '05 OB-IMN GABA', '06 CTX-CGE GABA',
       '07 CTX-MGE GABA', '08 CNU-MGE GABA', '09 CNU-LGE GABA',
       '10 LSX GABA', '11 CNU-HYa GABA', '12 HY GABA', '13 CNU-HYa Glut',
       '14 HY Glut', '15 HY Gnrh1 Glut', '16 HY MM Glut', '17 MH-LH Glut',
       '18 TH Glut', '19 MB Glut', '20 MB GABA', '21 MB Dopa',
       '22 MB-HB Sero', '23 P Glut', '24 MY Glut', '25 Pineal Glut',
       '26 P GABA', '27 MY GABA', '28 CB GABA', '29 CB Glut',]

# Taxonomy classes treated as non-neuronal.
nn_classes = ['30 Astro-Epen', '31 OPC-Oligo', '32 OEC', '33 Vascular',
       '34 Immune']

# Split of the neuronal classes into excitatory, inhibitory and other.
exc = ['01 IT-ET Glut', '02 NP-CT-L6b Glut', '03 OB-CR Glut',
      '04 DG-IMN Glut', '13 CNU-HYa Glut', '14 HY Glut', '15 HY Gnrh1 Glut', '16 HY MM Glut', '17 MH-LH Glut',
      '18 TH Glut', '19 MB Glut', '23 P Glut', '24 MY Glut', '25 Pineal Glut', '29 CB Glut',]
inh = ['05 OB-IMN GABA', '06 CTX-CGE GABA', '07 CTX-MGE GABA', '08 CNU-MGE GABA', '09 CNU-LGE GABA',
      '10 LSX GABA', '11 CNU-HYa GABA', '12 HY GABA', '20 MB GABA', '26 P GABA', '27 MY GABA', '28 CB GABA', ]
other = ['21 MB Dopa', '22 MB-HB Sero', ]
exci_inhib_su = exc + inh

# Cluster labels counted as astrocytes.
astrotypes = ['5206 Bergmann NN_1', '5207 Astro-CB NN_1', '5208 Astro-NT NN_1',
       '5209 Astro-NT NN_1', '5210 Astro-NT NN_1', '5211 Astro-NT NN_1',
       '5212 Astro-NT NN_1', '5213 Astro-NT NN_1', '5214 Astro-NT NN_2',
       '5215 Astro-NT NN_2', '5216 Astro-NT NN_2', '5217 Astro-NT NN_2',
       '5218 Astro-TE NN_1', '5219 Astro-TE NN_1', '5220 Astro-TE NN_1',
       '5221 Astro-TE NN_1', '5222 Astro-TE NN_2', '5223 Astro-TE NN_2',
       '5224 Astro-TE NN_3', '5225 Astro-TE NN_3', '5226 Astro-TE NN_3',
       '5227 Astro-TE NN_3', '5228 Astro-TE NN_4', '5229 Astro-TE NN_5',
       '5230 Astro-TE NN_5', '5231 Astro-OLF NN_1', '5232 Astro-OLF NN_1',
       '5233 Astro-OLF NN_2', '5234 Astro-OLF NN_2',
       '5235 Astro-OLF NN_3', '5236 Astro-OLF NN_3',]

# Cluster label counted as microglia.
microglia = ['5312 Microglia NN_1']

# Cluster labels counted as oligodendrocytes.
oligos = ['5266 OPC NN_1', '5267 OPC NN_1',
       '5268 OPC NN_1', '5269 OPC NN_1', '5270 OPC NN_1', '5271 OPC NN_2',
       '5272 COP NN_1', '5273 COP NN_1', '5274 COP NN_1', '5275 COP NN_1',
       '5276 COP NN_1', '5277 COP NN_1', '5278 NFOL NN_2',
       '5279 NFOL NN_2', '5280 NFOL NN_2', '5281 NFOL NN_2',
       '5282 MFOL NN_3', '5283 MFOL NN_3', '5284 MOL NN_4',
       '5285 MOL NN_4', '5286 MOL NN_4', '5287 MOL NN_4', '5288 MOL NN_4',]

# Glia is the union of the three cluster lists.
glia = astrotypes + microglia + oligos


def _clusters_in_classes(class_names):
    """Return the sorted unique cluster labels of the cells whose class is in `class_names`."""
    in_classes = metadata['class'].isin(class_names)
    return np.unique(metadata.loc[in_classes, 'cluster'].values)


neurontypes = _clusters_in_classes(n_classes)
nonneurontypes = _clusters_in_classes(nn_classes)
exctypes = _clusters_in_classes(exc)
inhtypes = _clusters_in_classes(inh)
othertypes = _clusters_in_classes(other)
exci_inhib_sum = _clusters_in_classes(exci_inhib_su)

In [15]:
# Keep only the non-neuronal cell types of every region. Boolean-mask indexing
# builds new DataFrames, so `total_cells` itself is left untouched and no deep
# copy is needed.
total_cells_copy = {
    region: counts[counts.index.isin(nonneurontypes)]
    for region, counts in total_cells.items()
}

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]}/{len(nonneurontypes)} different nonneuron types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 117/117 different nonneuron types
Total sum of these cells: 27,598,694
The brain area Cerebral cortex has 106/117 different nonneuron types
Total sum of these cells: 9,357,603
The brain area isocortex has 76/117 different nonneuron types
Total sum of these cells: 5,951,939
The brain area cerebellum has 80/117 different nonneuron types
Total sum of these cells: 2,572,198
The brain area fiber_tracts_ids has 114/117 different nonneuron types
Total sum of these cells: 6,168,942
The brain area hippocampus has 82/117 different nonneuron types
Total sum of these cells: 1,762,772
The brain area hippocampal_formation has 90/117 different nonneuron types
Total sum of these cells: 2,556,155
The brain area thalamus has 88/117 different nonneuron types
Total sum of these cells: 1,230,337
The brain area striatum has 82/117 different nonneuron types
Total sum of these cells: 2,225,040
The brain area VPL has 40/117 different nonneuron types
Total sum of these cells: 65,08

In [16]:
# Keep only the neuronal cell types of every region. Boolean-mask indexing
# builds new DataFrames, so `total_cells` itself is left untouched and no deep
# copy is needed.
total_cells_copy = {
    region: counts[counts.index.isin(neurontypes)]
    for region, counts in total_cells.items()
}

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]} different neuron types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 5155 different neuron types
Total sum of these cells: 36,490,512
The brain area Cerebral cortex has 2989 different neuron types
Total sum of these cells: 18,788,865
The brain area isocortex has 870 different neuron types
Total sum of these cells: 11,236,952
The brain area cerebellum has 781 different neuron types
Total sum of these cells: 3,500,104
The brain area fiber_tracts_ids has 3060 different neuron types
Total sum of these cells: 1,808,495
The brain area hippocampus has 508 different neuron types
Total sum of these cells: 2,324,177
The brain area hippocampal_formation has 1653 different neuron types
Total sum of these cells: 3,451,875
The brain area thalamus has 1594 different neuron types
Total sum of these cells: 1,593,471
The brain area striatum has 1297 different neuron types
Total sum of these cells: 5,889,850
The brain area VPL has 41 different neuron types
Total sum of these cells: 51,931
The brain area LGd has 27 different neuron types
Total

In [17]:
# Keep only the excitatory neuron types of every region. Boolean-mask indexing
# builds new DataFrames, so `total_cells` itself is left untouched and no deep
# copy is needed.
total_cells_copy = {
    region: counts[counts.index.isin(exctypes)]
    for region, counts in total_cells.items()
}

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]} different EXC neuron types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 2581 different EXC neuron types
Total sum of these cells: 19,805,749
The brain area Cerebral cortex has 1608 different EXC neuron types
Total sum of these cells: 13,028,099
The brain area isocortex has 492 different EXC neuron types
Total sum of these cells: 9,625,281
The brain area cerebellum has 395 different EXC neuron types
Total sum of these cells: 2,118,033
The brain area fiber_tracts_ids has 1425 different EXC neuron types
Total sum of these cells: 852,360
The brain area hippocampus has 253 different EXC neuron types
Total sum of these cells: 2,063,011
The brain area hippocampal_formation has 894 different EXC neuron types
Total sum of these cells: 2,989,335
The brain area thalamus has 801 different EXC neuron types
Total sum of these cells: 1,266,290
The brain area striatum has 367 different EXC neuron types
Total sum of these cells: 107,910
The brain area VPL has 11 different EXC neuron types
Total sum of these cells: 49,316
The brain area LGd has

In [18]:
# Keep only the inhibitory neuron types of every region. Boolean-mask indexing
# builds new DataFrames, so `total_cells` itself is left untouched and no deep
# copy is needed.
total_cells_copy = {
    region: counts[counts.index.isin(inhtypes)]
    for region, counts in total_cells.items()
}

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]} different INH neuron types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 2499 different INH neuron types
Total sum of these cells: 16,587,731
The brain area Cerebral cortex has 1336 different INH neuron types
Total sum of these cells: 5,760,765
The brain area isocortex has 378 different INH neuron types
Total sum of these cells: 1,611,671
The brain area cerebellum has 375 different INH neuron types
Total sum of these cells: 1,382,071
The brain area fiber_tracts_ids has 1573 different INH neuron types
Total sum of these cells: 944,724
The brain area hippocampus has 255 different INH neuron types
Total sum of these cells: 261,166
The brain area hippocampal_formation has 739 different INH neuron types
Total sum of these cells: 462,540
The brain area thalamus has 787 different INH neuron types
Total sum of these cells: 327,064
The brain area striatum has 930 different INH neuron types
Total sum of these cells: 5,781,940
The brain area VPL has 30 different INH neuron types
Total sum of these cells: 2,615
The brain area LGd has 12 di

In [19]:
# Keep only the "other" neuron types (clusters of the classes listed in
# `other`) of every region. Boolean-mask indexing builds new DataFrames, so
# `total_cells` itself is left untouched and no deep copy is needed.
total_cells_copy = {
    region: counts[counts.index.isin(othertypes)]
    for region, counts in total_cells.items()
}

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]} different OTHER neuron types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 75 different OTHER neuron types
Total sum of these cells: 97,032
The brain area Cerebral cortex has 45 different OTHER neuron types
Total sum of these cells: 0
The brain area isocortex has 0 different OTHER neuron types
Total sum of these cells: 0
The brain area cerebellum has 11 different OTHER neuron types
Total sum of these cells: 0
The brain area fiber_tracts_ids has 62 different OTHER neuron types
Total sum of these cells: 11,411
The brain area hippocampus has 0 different OTHER neuron types
Total sum of these cells: 0
The brain area hippocampal_formation has 20 different OTHER neuron types
Total sum of these cells: 0
The brain area thalamus has 6 different OTHER neuron types
Total sum of these cells: 118
The brain area striatum has 0 different OTHER neuron types
Total sum of these cells: 0
The brain area VPL has 0 different OTHER neuron types
Total sum of these cells: 0
The brain area LGd has 0 different OTHER neuron types
Total sum of these cells: 0


In [20]:
# Glia: keep only the cluster labels listed in `glia` for every region.
# Boolean-mask indexing builds new DataFrames, so `total_cells` itself is left
# untouched and no deep copy is needed.
total_cells_copy = {
    region: counts[counts.index.isin(glia)]
    for region, counts in total_cells.items()
}

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]} different GLIA types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 55 different GLIA types
Total sum of these cells: 18,981,784
The brain area Cerebral cortex has 53 different GLIA types
Total sum of these cells: 6,142,648
The brain area isocortex has 43 different GLIA types
Total sum of these cells: 3,821,421
The brain area cerebellum has 37 different GLIA types
Total sum of these cells: 1,510,384
The brain area fiber_tracts_ids has 53 different GLIA types
Total sum of these cells: 4,313,006
The brain area hippocampus has 43 different GLIA types
Total sum of these cells: 1,306,825
The brain area hippocampal_formation has 47 different GLIA types
Total sum of these cells: 1,820,891
The brain area thalamus has 47 different GLIA types
Total sum of these cells: 890,791
The brain area striatum has 45 different GLIA types
Total sum of these cells: 1,546,694
The brain area VPL has 29 different GLIA types
Total sum of these cells: 51,141
The brain area LGd has 25 different GLIA types
Total sum of these cells: 27,700
The brain are

In [21]:
# Astrocytes: keep only the cluster labels listed in `astrotypes` for every
# region. Boolean-mask indexing builds new DataFrames, so `total_cells` itself
# is left untouched and no deep copy is needed.
total_cells_copy = {
    region: counts[counts.index.isin(astrotypes)]
    for region, counts in total_cells.items()
}

# Resolve every region acronym to its integer region id once: the lookup does
# not depend on the brain area, so it is done outside the area loop.
label_prefix = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    labels = region_info.loc[region_info['parcellation_term_acronym'] == region, 'parcellation_label']
    first_label = labels.iloc[0]
    region_ids[region] = int(first_label[first_label.find(label_prefix) + len(label_prefix):])

for area_name, area_ids in areas.items():
    # Skip the two unplaceable pseudo-regions and every region outside this area.
    keys_to_drop = {'unassigned', 'brain-unassigned'}
    keys_to_drop.update(region for region in regions if region_ids[region] not in area_ids)

    total_cells_filtered = {
        region: counts for region, counts in total_cells_copy.items() if region not in keys_to_drop
    }

    # Stack the per-region tables, then sum the rows that share an index label
    # (the cell type) so each cell type appears once per area.
    concatenated_df = pd.concat(total_cells_filtered.values())
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()

    print(f"The brain area {area_name} has {summed_df.shape[0]} different astrocyte types")
    # `.iloc[0]` selects the first column total by position; plain `[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 31 different astrocyte types
Total sum of these cells: 8,630,984
The brain area Cerebral cortex has 29 different astrocyte types
Total sum of these cells: 3,843,344
The brain area isocortex has 20 different astrocyte types
Total sum of these cells: 2,266,469
The brain area cerebellum has 15 different astrocyte types
Total sum of these cells: 635,483
The brain area fiber_tracts_ids has 29 different astrocyte types
Total sum of these cells: 1,269,625
The brain area hippocampus has 20 different astrocyte types
Total sum of these cells: 903,670
The brain area hippocampal_formation has 23 different astrocyte types
Total sum of these cells: 1,273,266
The brain area thalamus has 23 different astrocyte types
Total sum of these cells: 368,049
The brain area striatum has 21 different astrocyte types
Total sum of these cells: 715,038
The brain area VPL has 10 different astrocyte types
Total sum of these cells: 12,039
The brain area LGd has 9 different astrocyte types

In [22]:
#microglia
# Per-area microglia totals: every region table is restricted to the clusters
# listed in `microglia`; then, for each area in `areas`, the tables of the
# regions whose annotation id belongs to that area are stacked and summed per
# cluster.

# Boolean indexing returns new DataFrames, so `total_cells` is left untouched
# without deep-copying the whole dictionary first.
total_cells_copy = {region: df[df.index.isin(microglia)] for region, df in total_cells.items()}

# Numeric annotation id of every region: the text that follows
# "AllenCCF-Annotation-2020-" in its parcellation_label. It does not depend on
# the area, so it is parsed once here instead of once per area.
substring = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    long_id = region_info[region_info['parcellation_term_acronym'] == region]['parcellation_label']
    label = long_id.iloc[0]
    region_ids[region] = int(label[label.find(substring) + len(substring):])

for key, value in areas.items():

    # The two catch-all entries are always excluded, together with every
    # region whose id is not a member of the current area (`value`).
    keys_to_drop = ['unassigned', 'brain-unassigned']
    keys_to_drop.extend(region for region in regions if region_ids[region] not in value)

    # Create a new dictionary without the specified keys
    total_cells_filtered = {region: df for region, df in total_cells_copy.items() if region not in keys_to_drop}

    # Concatenate all DataFrames into a single DataFrame
    concatenated_df = pd.concat(total_cells_filtered.values())

    # Rows sharing the same index label (cluster) are summed across regions
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()
    print(f"The brain area {key} has {summed_df.shape[0]} different microglia types")
    # The total of the first column is taken by position with .iloc[0]; a plain
    # [0] on the Series returned by .sum() depends on a deprecated positional
    # fallback. Assumes the first column holds the cell counts, as the
    # original positional lookup did.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 1 different microglia types
Total sum of these cells: 870,149
The brain area Cerebral cortex has 1 different microglia types
Total sum of these cells: 437,210
The brain area isocortex has 1 different microglia types
Total sum of these cells: 303,015
The brain area cerebellum has 1 different microglia types
Total sum of these cells: 45,264
The brain area fiber_tracts_ids has 1 different microglia types
Total sum of these cells: 104,695
The brain area hippocampus has 1 different microglia types
Total sum of these cells: 71,552
The brain area hippocampal_formation has 1 different microglia types
Total sum of these cells: 107,617
The brain area thalamus has 1 different microglia types
Total sum of these cells: 24,582
The brain area striatum has 1 different microglia types
Total sum of these cells: 103,996
The brain area VPL has 1 different microglia types
Total sum of these cells: 962
The brain area LGd has 1 different microglia types
Total sum of these cells:

In [23]:
#oligos
# Per-area totals for the clusters listed in `oligos`: every region table is
# restricted to those clusters; then, for each area in `areas`, the tables of
# the regions whose annotation id belongs to that area are stacked and summed
# per cluster.

# Boolean indexing returns new DataFrames, so `total_cells` is left untouched
# without deep-copying the whole dictionary first.
total_cells_copy = {region: df[df.index.isin(oligos)] for region, df in total_cells.items()}

# Numeric annotation id of every region: the text that follows
# "AllenCCF-Annotation-2020-" in its parcellation_label. It does not depend on
# the area, so it is parsed once here instead of once per area.
substring = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    long_id = region_info[region_info['parcellation_term_acronym'] == region]['parcellation_label']
    label = long_id.iloc[0]
    region_ids[region] = int(label[label.find(substring) + len(substring):])

for key, value in areas.items():

    # The two catch-all entries are always excluded, together with every
    # region whose id is not a member of the current area (`value`).
    keys_to_drop = ['unassigned', 'brain-unassigned']
    keys_to_drop.extend(region for region in regions if region_ids[region] not in value)

    # Create a new dictionary without the specified keys
    total_cells_filtered = {region: df for region, df in total_cells_copy.items() if region not in keys_to_drop}

    # Concatenate all DataFrames into a single DataFrame
    concatenated_df = pd.concat(total_cells_filtered.values())

    # Rows sharing the same index label (cluster) are summed across regions
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()
    print(f"The brain area {key} has {summed_df.shape[0]} different oligocyte types")
    # The total of the first column is taken by position with .iloc[0]; a plain
    # [0] on the Series returned by .sum() depends on a deprecated positional
    # fallback. Assumes the first column holds the cell counts, as the
    # original positional lookup did.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 23 different oligocyte types
Total sum of these cells: 9,480,651
The brain area Cerebral cortex has 23 different oligocyte types
Total sum of these cells: 1,862,094
The brain area isocortex has 22 different oligocyte types
Total sum of these cells: 1,251,937
The brain area cerebellum has 21 different oligocyte types
Total sum of these cells: 829,636
The brain area fiber_tracts_ids has 23 different oligocyte types
Total sum of these cells: 2,938,685
The brain area hippocampus has 22 different oligocyte types
Total sum of these cells: 331,604
The brain area hippocampal_formation has 23 different oligocyte types
Total sum of these cells: 440,007
The brain area thalamus has 23 different oligocyte types
Total sum of these cells: 498,160
The brain area striatum has 23 different oligocyte types
Total sum of these cells: 727,660
The brain area VPL has 18 different oligocyte types
Total sum of these cells: 38,140
The brain area LGd has 15 different oligocyte types


In [24]:
#exci_inhib_sum
# Per-area totals for the clusters listed in `exci_inhib_sum`: every region
# table is restricted to those clusters; then, for each area in `areas`, the
# tables of the regions whose annotation id belongs to that area are stacked
# and summed per cluster.

# Boolean indexing returns new DataFrames, so `total_cells` is left untouched
# without deep-copying the whole dictionary first.
total_cells_copy = {region: df[df.index.isin(exci_inhib_sum)] for region, df in total_cells.items()}

# Numeric annotation id of every region: the text that follows
# "AllenCCF-Annotation-2020-" in its parcellation_label. It does not depend on
# the area, so it is parsed once here instead of once per area.
substring = "AllenCCF-Annotation-2020-"
region_ids = {}
for region in regions:
    long_id = region_info[region_info['parcellation_term_acronym'] == region]['parcellation_label']
    label = long_id.iloc[0]
    region_ids[region] = int(label[label.find(substring) + len(substring):])

for key, value in areas.items():

    # The two catch-all entries are always excluded, together with every
    # region whose id is not a member of the current area (`value`).
    keys_to_drop = ['unassigned', 'brain-unassigned']
    keys_to_drop.extend(region for region in regions if region_ids[region] not in value)

    # Create a new dictionary without the specified keys
    total_cells_filtered = {region: df for region, df in total_cells_copy.items() if region not in keys_to_drop}

    # Concatenate all DataFrames into a single DataFrame
    concatenated_df = pd.concat(total_cells_filtered.values())

    # Rows sharing the same index label (cluster) are summed across regions
    summed_df = concatenated_df.groupby(concatenated_df.index).sum()
    print(f"The brain area {key} has {summed_df.shape[0]} different INH+EXC(Sum) types")
    # The total of the first column is taken by position with .iloc[0]; a plain
    # [0] on the Series returned by .sum() depends on a deprecated positional
    # fallback. Assumes the first column holds the cell counts, as the
    # original positional lookup did.
    print("Total sum of these cells:", '{:,.0f}'.format(summed_df.sum().iloc[0]))

The brain area wholebrain has 5080 different INH+EXC(Sum) types
Total sum of these cells: 36,393,481
The brain area Cerebral cortex has 2944 different INH+EXC(Sum) types
Total sum of these cells: 18,788,865
The brain area isocortex has 870 different INH+EXC(Sum) types
Total sum of these cells: 11,236,952
The brain area cerebellum has 770 different INH+EXC(Sum) types
Total sum of these cells: 3,500,104
The brain area fiber_tracts_ids has 2998 different INH+EXC(Sum) types
Total sum of these cells: 1,797,083
The brain area hippocampus has 508 different INH+EXC(Sum) types
Total sum of these cells: 2,324,177
The brain area hippocampal_formation has 1633 different INH+EXC(Sum) types
Total sum of these cells: 3,451,875
The brain area thalamus has 1588 different INH+EXC(Sum) types
Total sum of these cells: 1,593,353
The brain area striatum has 1297 different INH+EXC(Sum) types
Total sum of these cells: 5,889,850
The brain area VPL has 41 different INH+EXC(Sum) types
Total sum of these cells: 5

# Test

In [33]:
# Load the CCFv3 annotation volume from the shared atlas folder and show its
# array shape as the cell output.
data_folder = "/gpfs/bbp.cscs.ch/project/proj84/piluso/share/general/warped_augmented_CCFv3/"
annotation_file = f'{data_folder}annotation_25_2022_CCFv3_0.nrrd'
# nrrd.read returns (data, header); the header is not used here.
CCFv3_0, _ = nrrd.read(annotation_file)
# Alternative annotation file from an atlas pipeline run (currently disabled):
# CCFv3_0, _ = nrrd.read("/gpfs/bbp.cscs.ch/data/project/proj84/atlas_pipeline_runs/2024-05-15T22:44:26+02:00/annotation_ccfv3_l23split_barrelsplit_validated.nrrd")
CCFv3_0.shape

(528, 320, 456)

In [36]:
# Folder holding the density calculation results.
path = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/'
pickle_file = f'{path}total_cells.pickle'
# Unpickle the per-region dictionary of DataFrames.
# NOTE(review): pickle.load executes whatever the file contains - only use a
# pickle produced by a trusted run of the companion notebook.
with open(pickle_file, 'rb') as handle:
    combined_result_dataframes = pickle.load(handle)

In [37]:
def get_all_filenames(folder_path):
    """Return the names of the regular files directly inside ``folder_path``.

    Sub-directories are skipped. Names (not full paths) are returned in the
    order ``os.listdir`` yields them.
    """
    return [
        entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    ]

def get_csv_filenames(folder_path):
    """Return the sorted names of the ``.csv`` files directly inside ``folder_path``.

    Only regular files are considered; a directory whose name ends in
    ``.csv`` is ignored.
    """
    def is_csv_file(entry):
        # A name qualifies when it has the .csv suffix and is a regular file.
        return entry.endswith('.csv') and os.path.isfile(os.path.join(folder_path, entry))

    return sorted(filter(is_csv_file, os.listdir(folder_path)))

def extract_prefix_from_filenames(csv_filenames):
    """Return, for each filename, the text that precedes its first underscore.

    A name without any underscore is returned unchanged. The output list has
    the same length and order as ``csv_filenames``.
    """
    return [name.partition('_')[0] for name in csv_filenames]

def extract_regions_from_column_names(folder_path, file_list):
    """Collect the first column name of every CSV file in ``file_list``.

    Each file is looked up inside ``folder_path``. The result is the sorted
    list of distinct first-column names.
    """
    first_columns = set()
    for name in file_list:
        # A single data row is enough: only the header is of interest.
        header_frame = pd.read_csv(os.path.join(folder_path, name), nrows=1)
        first_columns.add(header_frame.columns[0])

    return sorted(first_columns)

def read_and_concat_csv_files(filenames, unique_prefixes, folder_path):
    """Read the CSV files of each prefix and stack them into one DataFrame per prefix.

    A file belongs to a prefix when the part of its name before the first
    underscore equals that prefix exactly (the same token that
    ``extract_prefix_from_filenames`` produces). A plain ``startswith`` test
    would also pull in files of longer prefixes - e.g. prefix ``V`` would
    match ``VPL_...`` - and mix other regions' rows into the table.

    Parameters
    ----------
    filenames : iterable of str
        CSV file names located in ``folder_path``.
    unique_prefixes : iterable of str
        Prefixes to build a table for.
    folder_path : str
        Directory containing the files.

    Returns
    -------
    dict
        Maps each prefix that has at least one file to the row-wise
        concatenation of its files, indexed by the ``cluster`` column. When a
        file has an ``average_density`` column (checked first) or a
        ``density`` column, only that column is kept and it is renamed to
        ``concatenated_density``; otherwise the file's columns are kept as
        they are.
    """
    # Bucket the files by prefix token once, preserving their input order.
    files_by_prefix = {}
    for filename in filenames:
        files_by_prefix.setdefault(filename.split('_')[0], []).append(filename)

    result_dataframes = {}

    for prefix in unique_prefixes:
        matching_files = files_by_prefix.get(prefix, [])
        if not matching_files:
            # Prefixes without any file get no entry in the result.
            continue

        dfs = []
        for filename in matching_files:
            df = pd.read_csv(os.path.join(folder_path, filename), index_col='cluster')

            # Normalise the density column name; 'average_density' takes
            # precedence over 'density' when both are present.
            if 'average_density' in df.columns:
                df = df[['average_density']].rename(columns={'average_density': 'concatenated_density'})
            elif 'density' in df.columns:
                df = df[['density']].rename(columns={'density': 'concatenated_density'})

            dfs.append(df)

        result_dataframes[prefix] = pd.concat(dfs, ignore_index=False)

    return result_dataframes

def combine_rows_and_calculate_average(result_dataframes):
    """Average the rows that share an index label, separately for each table.

    ``result_dataframes`` maps a key to a DataFrame; the returned dict has the
    same keys, each mapped to the mean over the first index level of its
    DataFrame.
    """
    return {
        name: frame.groupby(level=0).mean()
        for name, frame in result_dataframes.items()
    }

In [39]:
%%time

# Get all regional density data
folder_path = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/csv/'
filenames = get_all_filenames(folder_path)
csv_filenames = get_csv_filenames(folder_path)
prefixes = extract_prefix_from_filenames(csv_filenames)
unique_prefixes = sorted(list(set(prefixes)))

# The filename prefixes differ from the Allen region names, which are read
# from the first column name of each CSV file. The prefix -> region link is
# recorded file by file rather than by pairing two independently sorted lists
# by position, which would mislabel tables silently whenever the two sort
# orders or lengths differ.
region_by_prefix = {}
for filename, prefix in zip(csv_filenames, prefixes):
    region = extract_regions_from_column_names(folder_path, [filename])[0]
    if region_by_prefix.setdefault(prefix, region) != region:
        raise ValueError(
            f"Files with prefix {prefix!r} name different regions: "
            f"{region_by_prefix[prefix]!r} and {region!r}"
        )
if len(set(region_by_prefix.values())) != len(region_by_prefix):
    # Two prefixes sharing one region name would overwrite each other below.
    raise ValueError("Several filename prefixes map to the same region name")

# Sorted distinct region names over all CSV files.
allen_regions = sorted(set(region_by_prefix.values()))
#Next we remove the "/" sign from the layer 2/3 regions to get layer 23 like in ['parcellation_term_acronym'] (currently disabled)
#allen_regions = allen_regions_without_slash = [region.translate(str.maketrans('', '', '/')) for region in allen_regions]

result_dataframes = read_and_concat_csv_files(csv_filenames, unique_prefixes, folder_path)

# Re-key the tables from filename prefix to Allen region name.
result_dataframes = {region_by_prefix[prefix]: df for prefix, df in result_dataframes.items()}


CPU times: user 2.26 s, sys: 140 ms, total: 2.4 s
Wall time: 6.38 s


In [40]:
def read_and_test_csv_files(filenames, unique_prefixes, folder_path):
    """Print a message for every CSV file of the given prefixes that loads as an empty DataFrame.

    A file belongs to a prefix when the part of its name before the first
    underscore equals that prefix exactly. A plain ``startswith`` test would
    also match files of longer prefixes (prefix ``V`` matching ``VPL_...``),
    so such a file would be read and reported once per matching prefix.

    Parameters
    ----------
    filenames : iterable of str
        CSV file names located in ``folder_path``.
    unique_prefixes : iterable of str
        Prefixes whose files are checked, in this order.
    folder_path : str
        Directory containing the files.

    Returns
    -------
    None
        The function only prints; nothing is returned.
    """
    # Bucket the files by prefix token once, preserving their input order.
    files_by_prefix = {}
    for filename in filenames:
        files_by_prefix.setdefault(filename.split('_')[0], []).append(filename)

    for prefix in unique_prefixes:
        for filename in files_by_prefix.get(prefix, []):
            df = pd.read_csv(os.path.join(folder_path, filename), index_col='cluster')
            if df.empty:
                print(f"The DataFrame from {filename} is empty")
                    
# Report every density CSV in folder_path that loads as an empty DataFrame.
read_and_test_csv_files(csv_filenames, unique_prefixes, folder_path)