In [1]:
!which python
import pandas as pd
print(pd.__version__)
import sys
print(sys.version)
import inspect
print(inspect.getfile(pd))

/gpfs/bbp.cscs.ch/ssd/apps/bsd/2024-02-01/stage_externals/install_gcc-12.3.0-skylake/python-3.11.6-bj4i6m/bin/python
2.2.1
3.12.2 | packaged by Anaconda, Inc. | (main, Feb 27 2024, 17:35:02) [GCC 11.2.0]
/gpfs/bbp.cscs.ch/data/project/proj84/csaba/git/cellden/lib/python3.12/site-packages/pandas/__init__.py


In [2]:
import os, sys, copy
import pandas as pd
from voxcell.nexus.voxelbrain import RegionMap
import numpy as np
import pickle
import nrrd

import re
import multiprocessing as mp

sys.path.append('/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/notebooks/scripts/')

from helper_functions import get_all_filenames, get_csv_filenames, extract_prefix_from_filenames, read_and_concat_csv_files_new, combine_rows_and_calculate_average, create_combined_dataframe

# Start 

This notebook starts after data was created from MERFISH slices and stored as csv and pickle files. 
We examine scaling individual cell groups based on literature: https://www.pnas.org/doi/full/10.1073/pnas.0604911103
- Total cells (M): 108.69 ± 16.25
- Total neurons (M): 70.89 ± 10.41
- CCTX: 17.8 ± 3.4% of neurons (this is not the isocortex)
- CRB: 59.0 ± 5.0% of neurons

    - "Cerebral cortex" 688 = "Cortical plate" 695 + "Cortical subplate" 703
        - "Cerebral nuclei" is not part of the cerebral cortex
        - Cerebral cortex is not the Cerebellar cortex (the Cerebellar cortex is part of the Cerebellum)
     
Note that we cannot scale glia separately because we don't have information on them; only the total number of all non-neuronal cells is known.
     
From test_tutorial_cerebellum.ipynb we decided to only scale exc cells (ie granular layer) in the CRB. 

In [3]:
# Literature totals for the whole mouse brain, expressed as absolute cell counts.
total_cells = 108.69 * 1_000_000
total_neurons = 70.89 * 1_000_000

# Neuron budget per compartment: 17.8% cerebral cortex, 59.0% cerebellum,
# and whatever is left for the rest of the brain.
total_ctx = total_neurons * 17.8 / 100
total_crb = total_neurons * 59.0 / 100
total_rest = total_neurons * (100 - 59.0 - 17.8) / 100

# Everything that is not a neuron.
total_nonneuron = total_cells - total_neurons

print("Neurons only: ")
print(
    f"CTX: {total_ctx:,.0f}",
    f"CRB: {total_crb:,.0f}",
    f"REST: {total_rest:,.0f}",
    f"Non-NEURON: {total_nonneuron:,.0f}",
)

Neurons only: 
CTX: 12,618,420 CRB: 41,825,100 REST: 16,446,480 Non-NEURON: 37,800,000


In [4]:
## Where did these come from: density1.2_print_info.ipynb using unscaled numbers (if you created the pickle file)
## density1.3_print_info_unscaled.ipynb

#New (See ipynb for up-to-date estimation)
'''
#CCFv3a: annotation_25_2022_CCFv3a.nrrd
The brain area wholebrain has 5272 different cell types
Total sum of cells: 92,231,205
The brain area Cerebral cortex has 3095 different cell types
Total sum of cells: 43,858,738
The brain area isocortex has 946 different cell types
Total sum of cells: 28,535,494
The brain area cerebellum has 861 different cell types
Total sum of cells: 6,224,671

The brain area wholebrain has 5155 different neuron types
Total sum of these cells: 52,040,591
The brain area Cerebral cortex has 2989 different neuron types
Total sum of these cells: 29,125,071
The brain area isocortex has 870 different neuron types
Total sum of these cells: 19,101,627
The brain area cerebellum has 781 different neuron types
Total sum of these cells: 3,543,172

The brain area wholebrain has 117/117 different nonneuron types
Total sum of these cells: 40,190,614
The brain area Cerebral cortex has 106/117 different nonneuron types
Total sum of these cells: 14,733,667
The brain area isocortex has 76/117 different nonneuron types
Total sum of these cells: 9,433,867
The brain area cerebellum has 80/117 different nonneuron types
Total sum of these cells: 2,681,499
The brain area fiber_tracts_ids has 114/117 different nonneuron types
Total sum of these cells: 7,881,000
'''

#Cerebral cortex, cerebellum, REST
mfish_ctx = 29_125_071
mfish_crb = 3_543_172
mfish_rest = 52_040_591 - mfish_ctx - mfish_crb
mfish_nonneuron = 40_190_614

In [5]:
#Scaling factors (overview only)
1/(mfish_ctx/total_ctx)*100, 1/(mfish_crb/total_crb)*100, 1/(mfish_rest/total_rest)*100, 1/(mfish_nonneuron/total_nonneuron)*100

(43.3249415941338, 1180.4422703724235, 84.89667850278138, 94.05181020623372)

In [28]:
%%time

#Load metadata of cell type information. Using classes we can easily group all 5k+ ctypes into larger groups
meta_path = "/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/WMB-10X/20231215/views/cell_metadata_with_cluster_annotation.csv"
columns_to_read = ['class', 'subclass', 'cluster']
#metadata = pd.read_csv(meta_path, dtype={'cell_label':str}, low_memory=False)
metadata = pd.read_csv(meta_path, usecols=columns_to_read, )

n_classes = ['01 IT-ET Glut', '02 NP-CT-L6b Glut', '03 OB-CR Glut',
       '04 DG-IMN Glut', '05 OB-IMN GABA', '06 CTX-CGE GABA',
       '07 CTX-MGE GABA', '08 CNU-MGE GABA', '09 CNU-LGE GABA',
       '10 LSX GABA', '11 CNU-HYa GABA', '12 HY GABA', '13 CNU-HYa Glut',
       '14 HY Glut', '15 HY Gnrh1 Glut', '16 HY MM Glut', '17 MH-LH Glut',
       '18 TH Glut', '19 MB Glut', '20 MB GABA', '21 MB Dopa',
       '22 MB-HB Sero', '23 P Glut', '24 MY Glut', '25 Pineal Glut',
       '26 P GABA', '27 MY GABA', '28 CB GABA', '29 CB Glut',]

nn_classes = ['30 Astro-Epen', '31 OPC-Oligo', '32 OEC', '33 Vascular',
       '34 Immune']

exc = ['01 IT-ET Glut', '02 NP-CT-L6b Glut', '03 OB-CR Glut',
      '04 DG-IMN Glut', '13 CNU-HYa Glut', '14 HY Glut', '15 HY Gnrh1 Glut', '16 HY MM Glut', '17 MH-LH Glut',
      '18 TH Glut', '19 MB Glut', '23 P Glut', '24 MY Glut', '25 Pineal Glut', '29 CB Glut',]
inh = ['05 OB-IMN GABA', '06 CTX-CGE GABA', '07 CTX-MGE GABA', '08 CNU-MGE GABA', '09 CNU-LGE GABA',
      '10 LSX GABA', '11 CNU-HYa GABA', '12 HY GABA', '20 MB GABA', '26 P GABA', '27 MY GABA', '28 CB GABA', ]
other = ['21 MB Dopa', '22 MB-HB Sero', ]
exci_inhib_su = exc + inh

astrotypes = ['5206 Bergmann NN_1', '5207 Astro-CB NN_1', '5208 Astro-NT NN_1',
       '5209 Astro-NT NN_1', '5210 Astro-NT NN_1', '5211 Astro-NT NN_1',
       '5212 Astro-NT NN_1', '5213 Astro-NT NN_1', '5214 Astro-NT NN_2',
       '5215 Astro-NT NN_2', '5216 Astro-NT NN_2', '5217 Astro-NT NN_2',
       '5218 Astro-TE NN_1', '5219 Astro-TE NN_1', '5220 Astro-TE NN_1',
       '5221 Astro-TE NN_1', '5222 Astro-TE NN_2', '5223 Astro-TE NN_2',
       '5224 Astro-TE NN_3', '5225 Astro-TE NN_3', '5226 Astro-TE NN_3',
       '5227 Astro-TE NN_3', '5228 Astro-TE NN_4', '5229 Astro-TE NN_5',
       '5230 Astro-TE NN_5', '5231 Astro-OLF NN_1', '5232 Astro-OLF NN_1',
       '5233 Astro-OLF NN_2', '5234 Astro-OLF NN_2',
       '5235 Astro-OLF NN_3', '5236 Astro-OLF NN_3',]

microglia = ['5312 Microglia NN_1']

oligos = ['5266 OPC NN_1', '5267 OPC NN_1',
       '5268 OPC NN_1', '5269 OPC NN_1', '5270 OPC NN_1', '5271 OPC NN_2',
       '5272 COP NN_1', '5273 COP NN_1', '5274 COP NN_1', '5275 COP NN_1',
       '5276 COP NN_1', '5277 COP NN_1', '5278 NFOL NN_2',
       '5279 NFOL NN_2', '5280 NFOL NN_2', '5281 NFOL NN_2',
       '5282 MFOL NN_3', '5283 MFOL NN_3', '5284 MOL NN_4',
       '5285 MOL NN_4', '5286 MOL NN_4', '5287 MOL NN_4', '5288 MOL NN_4',]

glia = astrotypes + microglia + oligos

def _clusters_in_classes(class_names):
    """Return the sorted unique 'cluster' labels of metadata rows whose 'class' is in class_names."""
    in_group = metadata['class'].isin(class_names)
    return np.unique(metadata.loc[in_group, 'cluster'].values)


# Cluster (t-type) labels per coarse group, derived from the class lists defined above.
neurontypes = _clusters_in_classes(n_classes)
nonneurontypes = _clusters_in_classes(nn_classes)
exctypes = _clusters_in_classes(exc)
inhtypes = _clusters_in_classes(inh)
othertypes = _clusters_in_classes(other)
exci_inhib_sum = _clusters_in_classes(exci_inhib_su)

# Every cluster present in the metadata, regardless of class.
celltypes = np.unique(metadata['cluster'].values)

CPU times: user 16.3 s, sys: 455 ms, total: 16.8 s
Wall time: 16.8 s


## Load estimated densities from (density1 step) + external data (parcellation meta extended, hierarchy file)

In [7]:
%%time 

download_base = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/'
root_folder = f"{download_base}results/density_calculations/"

# Get all regional density data
folder_path = f"{root_folder}csv/"
filenames = get_all_filenames(folder_path)
csv_filenames = get_csv_filenames(folder_path)
prefixes = extract_prefix_from_filenames(csv_filenames)
unique_prefixes = sorted(list(set(prefixes)))

#Create a dict of df, each containing a cell type's occurence in all regions and its densities in all regions
result_dataframes = read_and_concat_csv_files_new(csv_filenames, unique_prefixes, folder_path)
combined_result_dataframes = combine_rows_and_calculate_average(result_dataframes)
#shuffled_combined_dataframes = create_combined_dataframe(combined_result_dataframes)

file = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/parcellation_to_parcellation_term_membership_extend.csv'
parcellation_annotation = pd.read_csv(file)

#hierarchy_json = '/gpfs/bbp.cscs.ch/home/veraszto/bbp_prod_files/1.json'
hierarchy_json = '/gpfs/bbp.cscs.ch/data/project/proj84/atlas_pipeline_runs/2024-05-15T22:44:26+02:00/hierarchy_ccfv3_l23split_barrelsplit.json'

CPU times: user 1.32 s, sys: 124 ms, total: 1.44 s
Wall time: 5.46 s


In [8]:
# Check if 'UVU' exists
if 'UVU' not in combined_result_dataframes:
    print("Non-leaf regions of the Cerebellum were removed")
else:
    print("'UVU' key exists in combined_result_dataframes")

Non-leaf regions of the Cerebellum were removed


You can combine and save the unscaled csv files (dict of dfs) into a pickle file

In [9]:
# Write the unscaled densities to a pickle file in the density_calculations folder.
folder = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/'
# The object dumped here is combined_result_dataframes, the dict of DataFrames built above
# (not the scalar total_cells defined at the top of the notebook).
with open(f'{folder}non_scaled_densities_new.pickle', 'wb') as f:
    pickle.dump(combined_result_dataframes, f)

# Only the folder is printed; the file name is non_scaled_densities_new.pickle.
print(f"Pickle file saved here: {folder}")

Pickle file saved here: /gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/


Select a whole region (e.g. CBX, CTX, HPF) of csv files / regions in the hierarchy

In [10]:
region_map = RegionMap.load_json(hierarchy_json)

cerebellum = region_map.find(
        "Cerebellum", attr="name", with_descendants=True
    ) | region_map.find("arbor vitae", attr="name", with_descendants=True)
# isocortex = (
#     region_map.find("Isocortex", attr="name", with_descendants=True)
#     | region_map.find("Entorhinal area", attr="name", with_descendants=True)
#     | region_map.find("Piriform area", attr="name", with_descendants=True)
# )
cerebral_cortex = (
    region_map.find("Cerebral cortex", attr="name", with_descendants=True)
)   

In [11]:
# For every region id under "Cerebral cortex", look for its single 'substructure' entry in
# the parcellation table. Ids with exactly one entry contribute their file-name key to
# ctx_leaves; all other ids are reported.
ctx_leaves = []
for leaf in cerebral_cortex:
    is_substructure_of_leaf = (
        (parcellation_annotation['label_numbers'] == leaf)
        & (parcellation_annotation['parcellation_term_set_name'] == 'substructure')
    )
    substructure_rows = parcellation_annotation[is_substructure_of_leaf]
    if len(substructure_rows) != 1:
        region_name = region_map.get(leaf, "name", with_ascendants=False)
        print(f"{leaf} {region_name} has seemingly no coverage.")
        continue
    ctx_leaves.append(substructure_rows['cluster_as_filename'].iloc[0])

3562104832 Primary somatosensory area, trunk, layer 2 has seemingly no coverage.
3215542274 Primary somatosensory area, barrel field, B4 barrel layer 6b has seemingly no coverage.
1945434117 Primary somatosensory area, barrel field, C1 barrel layer 6a has seemingly no coverage.
1643593739 Primary somatosensory area, barrel field, E5 barrel layer 2/3 has seemingly no coverage.
2546495501 Somatomotor areas, layer 2 has seemingly no coverage.
1720700944 Rostrolateral lateral visual area, layer 2 has seemingly no coverage.
16 Layer 6b, isocortex has seemingly no coverage.
2208057363 Primary somatosensory area, unassigned, layer 3 has seemingly no coverage.
2078623765 Infralimbic area, layer 3 has seemingly no coverage.
22 Posterior parietal association areas has seemingly no coverage.
3581771805 Primary somatosensory area, barrel field, C3 barrel layer 3 has seemingly no coverage.
31 Anterior cingulate area has seemingly no coverage.
1896413216 Laterolateral anterior visual area, layer 3 h

In [12]:
# Same sanity check for the cerebellum set: keep the file-name key of every region id that
# has exactly one 'substructure' entry, and print the id and name of every other region.
crb_leaves = []
for leaf in cerebellum:
    is_substructure_of_leaf = (
        (parcellation_annotation['label_numbers'] == leaf)
        & (parcellation_annotation['parcellation_term_set_name'] == 'substructure')
    )
    substructure_rows = parcellation_annotation[is_substructure_of_leaf]
    if len(substructure_rows) != 1:
        region_name = region_map.get(leaf, "name", with_ascendants=False)
        print(f"{leaf} {region_name}")
        continue
    crb_leaves.append(substructure_rows['cluster_as_filename'].iloc[0])



519 Cerebellar nuclei
528 Cerebellar cortex
1073 Hemispheric regions
1143 Cerebellar cortex, granular layer
1144 Cerebellar cortex, molecular layer
1145 Cerebellar cortex, Purkinje layer
645 Vermal regions
920 Central lobule
928 Culmen
3092369320 Cerebellum: Other
10714 Lobule IV, granular layer
10715 Lobule IV, Purkinje layer
10716 Lobule IV, molecular layer
10717 Lobule V, granular layer
10718 Lobule V, Purkinje layer
992 Lobule IV
10719 Lobule V, molecular layer
1001 Lobule V
1017 Ansiform lobule


```Python

519 Cerebellar nuclei
528 Cerebellar cortex
1073 Hemispheric regions
1143 Cerebellar cortex, granular layer
1144 Cerebellar cortex, molecular layer
1145 Cerebellar cortex, Purkinje layer
645 Vermal regions
920 Central lobule
928 Culmen
10714 Lobule IV, granular layer
10715 Lobule IV, Purkinje layer
10716 Lobule IV, molecular layer
10717 Lobule V, granular layer
10718 Lobule V, Purkinje layer
992 Lobule IV
10719 Lobule V, molecular layer
1001 Lobule V
1017 Ansiform lobule

These regions are either not leaf regions, or they are leaf regions that are not present in CCFv3a/0.


In [13]:
#no CTX or CRB keys should be printed. Only regions with multiple solutions, which is taken care of elsewhere

# Sort every density key into cerebral cortex / cerebellum / everything else, based on the
# annotation id of its unique 'substructure' row in the parcellation table.
ctx_keys, crb_keys, else_keys = [], [], []

substructure_rows = parcellation_annotation[
    parcellation_annotation['parcellation_term_set_name'] == 'substructure'
]

for key in combined_result_dataframes:
    cluster = substructure_rows[substructure_rows['cluster_as_filename'] == key]

    # Ambiguous keys (zero or several substructure rows) are printed and, when rows exist,
    # filed under "else" once per matching row.
    if len(cluster) != 1:
        print(key )
        else_keys.extend(cluster['cluster_as_filename'].values)
        continue

    region_id = cluster['label_numbers'].iloc[0]
    region_key = cluster['cluster_as_filename'].iloc[0]
    if region_id in cerebral_cortex:
        ctx_keys.append(region_key)
    elif region_id in cerebellum:
        crb_keys.append(region_key)
    else:
        else_keys.append(region_key)

# Drop duplicates while keeping first-seen order (the "else" list does contain some).
else_keys = list(dict.fromkeys(else_keys))
crb_keys = list(dict.fromkeys(crb_keys))
ctx_keys = list(dict.fromkeys(ctx_keys))

print(len(ctx_keys), len(crb_keys), len(else_keys), )

fa
icp
sV
scp
st
309 54 340


In [14]:
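#Only a subset of the hierarchy ids has density data: compare the key counts printed by the previous cell with the region-set sizes below. (The figures quoted here before, 308 regions out of 567 and 22 out of 88, do not match the displayed outputs and appear to come from an earlier run.)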
len(cerebral_cortex), len(cerebellum), len(neurontypes), len(exctypes), len(inhtypes)

(989, 89, 5205, 2619, 2511)

### GLOBAL Scaling will take place. 
- We scale down CTX/ELSE uniformly, while we only scale up CRB excitatory cells
#- We scale down CTX/ELSE uniformly, while we only scale up CRB neuron + glia cells
- We scale ALL NON-NEURONS across all REGIONS uniformly. 

In [15]:
%%time
# Scale a deep copy so that combined_result_dataframes keeps the unscaled densities.
combined_result_dataframes_copy = copy.deepcopy(combined_result_dataframes)
scaled_combined_result_dataframes = {}

# One entry per region group, checked in this order:
#   (keys of the group, cluster labels that get scaled, measured/expected ratio)
# CTX and REST scale all neurons; CRB scales excitatory clusters only.
# NOTE(review): the CRB ratio is built from mfish_crb, the count of all cerebellar neurons,
# but is applied to excitatory clusters only - confirm this is the intended factor.
neuron_scaling_by_group = (
    (ctx_keys, neurontypes, mfish_ctx / total_ctx),
    (crb_keys, exctypes, mfish_crb / total_crb),
    (else_keys, neurontypes, mfish_rest / total_rest),
)
# Non-neuronal clusters share a single ratio in every region.
nonneuron_ratio = mfish_nonneuron / total_nonneuron

for key, df in combined_result_dataframes_copy.items():
    for group_keys, scaled_types, neuron_ratio in neuron_scaling_by_group:
        if key in group_keys:
            break
    else:
        # The key belongs to none of the three groups: report it and leave it out.
        print(key, "is an Issue!")
        continue

    # Densities are divided by measured/expected, i.e. multiplied by expected/measured.
    df.loc[df.index.isin(scaled_types), 'density_mm3'] /= neuron_ratio
    df.loc[df.index.isin(nonneurontypes), 'density_mm3'] /= nonneuron_ratio
    scaled_combined_result_dataframes[key] = df

del combined_result_dataframes_copy

CPU times: user 956 ms, sys: 9.17 ms, total: 965 ms
Wall time: 956 ms


ACAd1 is a CTX region, AAA is from the rest of the brain, and ANcr1 and UVU are from the CRB.

In [16]:
#Rest
scaled_combined_result_dataframes['AAA'], combined_result_dataframes['AAA']

(                               density_mm3
 cluster                                   
 0023 L5/6 IT TPE-ENT Glut_1     392.495046
 0093 L4/5 IT CTX Glut_4         249.769575
 0150 L2/3 IT PIR-ENTl Glut_3     35.681368
 0205 MEA Slc17a7 Glut_2        2604.739852
 0207 MEA Slc17a7 Glut_2          35.681368
 ...                                    ...
 5309 Endo NN_1                 4190.094516
 5310 Endo NN_1                11463.466129
 5311 Endo NN_1                 1067.288226
 5312 Microglia NN_1            3241.393871
 5314 BAM NN_1                    79.058387
 
 [412 rows x 1 columns],
                                density_mm3
 cluster                                   
 0023 L5/6 IT TPE-ENT Glut_1     462.320851
 0093 L4/5 IT CTX Glut_4         294.204178
 0150 L2/3 IT PIR-ENTl Glut_3     42.029168
 0205 MEA Slc17a7 Glut_2        3068.129282
 0207 MEA Slc17a7 Glut_2          42.029168
 ...                                    ...
 5309 Endo NN_1                 4455.091834
 5310

In [17]:
#CTX
scaled_combined_result_dataframes['ACAd1'], combined_result_dataframes['ACAd1']

(                             density_mm3
 cluster                                 
 0023 L5/6 IT TPE-ENT Glut_1    19.171176
 0077 L4/5 IT CTX Glut_2        19.171176
 0090 L4/5 IT CTX Glut_4        19.171176
 0094 L4/5 IT CTX Glut_4        19.171176
 0105 L2/3 IT CTX Glut_1      3757.550579
 ...                                  ...
 5311 Endo NN_1               2372.208143
 5312 Microglia NN_1          5410.299273
 5314 BAM NN_1                 291.323807
 5318 DC NN_1                   41.617687
 5319 B cells NN_1              41.617687
 
 [110 rows x 1 columns],
                              density_mm3
 cluster                                 
 0023 L5/6 IT TPE-ENT Glut_1    44.249746
 0077 L4/5 IT CTX Glut_2        44.249746
 0090 L4/5 IT CTX Glut_4        44.249746
 0094 L4/5 IT CTX Glut_4        44.249746
 0105 L2/3 IT CTX Glut_1      8672.950131
 ...                                  ...
 5311 Endo NN_1               2522.235497
 5312 Microglia NN_1          5752.466923
 5314 B

In [18]:
#Cerebellum
scaled_combined_result_dataframes['ANcr1molecularlayer'], combined_result_dataframes['ANcr1molecularlayer']

(                          density_mm3
 cluster                              
 5000 CBN Dmbx1 Gaba_1        7.720368
 5175 DCO Il22 Gly-Gaba_3     3.860184
 5177 CB PLI Gly-Gaba_1     208.449943
 5178 CB PLI Gly-Gaba_1     741.155353
 5179 CB PLI Gly-Gaba_1      54.042578
 ...                               ...
 5316 DC NN_1                 3.630573
 5317 DC NN_1                 3.630573
 5318 DC NN_1                 3.630573
 5319 B cells NN_1           21.783438
 5322 T cells NN_4            3.630573
 
 [75 rows x 1 columns],
                           density_mm3
 cluster                              
 5000 CBN Dmbx1 Gaba_1        7.720368
 5175 DCO Il22 Gly-Gaba_3     3.860184
 5177 CB PLI Gly-Gaba_1     208.449943
 5178 CB PLI Gly-Gaba_1     741.155353
 5179 CB PLI Gly-Gaba_1      54.042578
 ...                               ...
 5316 DC NN_1                 3.860184
 5317 DC NN_1                 3.860184
 5318 DC NN_1                 3.860184
 5319 B cells NN_1           23.161105

In [19]:
len(scaled_combined_result_dataframes.keys()), len(combined_result_dataframes.keys())

(703, 703)

In [20]:
# Dump scaled_combined_result_dataframes (the dict of scaled DataFrames, keyed by region)
# into the folder defined when the unscaled pickle was written.
with open(f'{folder}scaled_densities_global_only.pickle', 'wb') as f:
    pickle.dump(scaled_combined_result_dataframes, f)

# Only the folder is printed; the file name is scaled_densities_global_only.pickle.
print(f"Saved here: {folder}")

Saved here: /gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/


The file is a scaled version of combined_result_dataframes (normal name in other parts of the pipeline)

In [21]:
#We can't use csv files with densities as those are not scaled
root_folder = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/'

file = os.path.join( root_folder, 'scaled_densities_global_only.pickle' )
# Reload the globally scaled densities (dict of DataFrames keyed by region) written by the
# save cell above. pickle.load executes arbitrary code, so only open files you produced.
with open(file, 'rb') as pickle_file:
    scaled_combined_result_dataframes = pickle.load(pickle_file)
print("Loaded pickle file.")


Loaded pickle file.


In [22]:
#You can save the data as  1 df /t-type as a dictionary

shuffled_combined_dataframes = create_combined_dataframe(scaled_combined_result_dataframes)

# Reorder the dataframes alphabetically by key
sorted_data = {k: shuffled_combined_dataframes[k] for k in sorted(shuffled_combined_dataframes)}

In [23]:
# Dump sorted_data: the scaled densities regrouped by create_combined_dataframe and
# re-ordered alphabetically by key in the previous cell.
with open(f'{folder}scaled_densities_global_only_t_types_as_keys.pickle', 'wb') as f:
    pickle.dump(sorted_data, f)

# Only the folder is printed; the file name is scaled_densities_global_only_t_types_as_keys.pickle.
print(f"Saved here: {folder}")

Saved here: /gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/


In [24]:
sorted_keys = sorted(shuffled_combined_dataframes)
last_key = sorted_keys[-1]
last_key

'5322 T cells NN_4'

In [25]:
pd.options.display.float_format = '{:.2f}'.format
df_sorted = scaled_combined_result_dataframes['UVUgranularlayer'].sort_values(by='density_mm3', ascending=False)
df_sorted

Unnamed: 0_level_0,density_mm3
cluster,Unnamed: 1_level_1
5201 CB Granule Glut_2,1509904.50
5200 CB Granule Glut_2,119838.67
5202 DCO UBC Glut_1,7215.42
5198 CB Granule Glut_1,7215.42
5310 Endo NN_1,7148.62
...,...
5281 NFOL NN_2,25.00
5275 COP NN_1,25.00
5272 COP NN_1,25.00
5279 NFOL NN_2,25.00


In [26]:
# For every unscaled region whose key contains 'granularlayer', report the cluster with
# the highest density and that density.
for key, df in combined_result_dataframes.items():
    if 'granularlayer' not in key:
        continue
    densities = df['density_mm3']
    max_density_index = densities.idxmax()
    max_density = densities.loc[max_density_index]
    print(f"Highest density for {key} at index {max_density_index}: {max_density}")


Highest density for ANcr1granularlayer at index 5201 CB Granule Glut_2: 66541.85404643802
Highest density for ANcr2granularlayer at index 5201 CB Granule Glut_2: 45524.712402293146
Highest density for CENT2granularlayer at index 5201 CB Granule Glut_2: 159963.9810137208
Highest density for CENT3granularlayer at index 5201 CB Granule Glut_2: 131770.58939066585
Highest density for COPYgranularlayer at index 5201 CB Granule Glut_2: 51712.158808933
Highest density for CUL45granularlayer at index 5201 CB Granule Glut_2: 90003.01003139034
Highest density for DECgranularlayer at index 5201 CB Granule Glut_2: 46980.34586977127
Highest density for FLgranularlayer at index 5201 CB Granule Glut_2: 42846.53895355265
Highest density for FOTUgranularlayer at index 5201 CB Granule Glut_2: 46405.02354788069
Highest density for LINGgranularlayer at index 5201 CB Granule Glut_2: 51953.56738391846
Highest density for NODgranularlayer at index 5201 CB Granule Glut_2: 83100.52138014486
Highest density for 

In [30]:
#Helper function to create total cell count values for a 3D brain

# import re
# import pandas as pd
# import numpy as np

def nrrd_for_validation(df, parcellation_annotation, CCFv3):
    """Paint per-region densities onto an annotation volume.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by region name (matched against the ``cluster_as_filename`` column of
        ``parcellation_annotation``) and holding a ``density_mm3`` column.
    parcellation_annotation : pandas.DataFrame
        Must provide the columns ``cluster_as_filename`` and ``parcellation_label``.
        The trailing digits of ``parcellation_label`` are used as the annotation id.
    CCFv3 : numpy.ndarray
        Volume of annotation ids. It is read only and never modified.

    Returns
    -------
    numpy.ndarray
        float64 array with the shape of ``CCFv3``. Each voxel holds the density of the
        region its annotation id maps to; voxels with id 0 or with an id that has no
        density are 0. The result is always floating point so that fractional densities
        are not truncated when the annotation volume has an integer dtype.

    Raises
    ------
    ValueError
        If a matching ``parcellation_label`` does not end in digits.
    """
    trailing_id = re.compile(r'\d+$')
    region_names = parcellation_annotation['cluster_as_filename']
    labels = parcellation_annotation['parcellation_label']

    # annotation id -> density. When an id is reached through several regions, the
    # region that comes last in df wins.
    density_by_id = {}
    for regionname, density in df['density_mm3'].items():
        for label in labels[region_names == regionname].values:
            match = trailing_id.search(label)
            if match is None:
                raise ValueError(
                    f"parcellation_label {label!r} of region {regionname!r} "
                    "does not end with a numeric annotation id"
                )
            density_by_id[int(match.group())] = float(density)

    # Id 0 always maps to density 0, whatever the table says.
    density_by_id[0] = 0.0

    # Sorted lookup table so the whole volume is resolved in one vectorised pass instead
    # of one full-volume scan per region id.
    ids = np.array(sorted(density_by_id), dtype=np.int64)
    values = np.array([density_by_id[i] for i in ids.tolist()], dtype=np.float64)

    flat_ids = np.asarray(CCFv3).ravel()
    slot = np.searchsorted(ids, flat_ids)
    slot[slot == len(ids)] = 0  # keep indices in range; such voxels fail the check below
    known = ids[slot] == flat_ids

    # Voxels whose id is not in the table keep the default density of 0.
    volume = np.zeros(flat_ids.shape, dtype=np.float64)
    volume[known] = values[slot[known]]
    return volume.reshape(np.shape(CCFv3))

In [None]:
%%time

import re

data_folder = "/gpfs/bbp.cscs.ch/project/proj84/piluso/share/general/warped_augmented_CCFv3/"
CCFv3, _ = nrrd.read(f'{data_folder}annotation_25_2022_CCFv3a.nrrd')


#from multiprocessing import Pool

def process_type(types, file_name):
    """Build the total-density volume for one group of cell types.

    Reads the notebook-level ``shuffled_combined_dataframes``, ``parcellation_annotation``
    and ``CCFv3``.

    Parameters
    ----------
    types : collection of str
        Keys of ``shuffled_combined_dataframes`` to include in the total.
    file_name : str
        Passed through unchanged so the caller can name the output file.

    Returns
    -------
    tuple
        ``(volume, file_name)``, where ``volume`` is the result of ``nrrd_for_validation``
        applied to the densities summed per index label.
    """
    # Keep only the frames whose key belongs to the requested group.
    selected_frames = [
        frame
        for type_name, frame in shuffled_combined_dataframes.items()
        if type_name in types
    ]

    # Stack them and add up the rows that share an index label.
    stacked = pd.concat(selected_frames)
    summed_per_index = stacked.groupby(stacked.index).sum()

    volume = nrrd_for_validation(summed_per_index, parcellation_annotation, CCFv3)
    return (volume, file_name)

def main():
    """Compute one total-density volume per cell-type group and write each to an .nrrd file.

    The volumes are computed in parallel with ``process_type`` and then written one after
    another into ``<root_folder>total_nrrd_global_only/``. The path of every written file
    is printed.
    """
    # (cluster labels to sum, output file name without extension)
    tasks = [
        (neurontypes, "scaled_total_neuron_densities"),
        (exctypes, "scaled_total_excitatory_densities"),
        (inhtypes, "scaled_total_inhibitory_densities"),
        (astrotypes, "scaled_total_astrotypes_densities"),
        (microglia, "scaled_total_microglia_densities"),
        (oligos, "scaled_total_oligocyte_densities"),
        (glia, "scaled_total_glia_densities"),
        (exci_inhib_sum, "scaled_total_excinh_densities"),
        (celltypes, "scaled_total_celltypes_densities"),
        (nonneurontypes, "scaled_total_nonneuron_densities")
    ]

    # Create the output folder up front so a missing directory cannot fail the run after
    # all volumes have already been computed.
    # NOTE(review): a later cell looks for these .nrrd files directly under the
    # density_calculations folder - confirm which location is intended.
    output_dir = f"{root_folder}total_nrrd_global_only/"
    os.makedirs(output_dir, exist_ok=True)

    # There is no point in starting more workers than there are tasks.
    n_workers = max(1, min(mp.cpu_count(), len(tasks)))
    with mp.Pool(processes=n_workers) as pool:
        results = pool.starmap(process_type, tasks)

    # Sequentially write the .nrrd files to avoid concurrent writes
    for result, file_name in results:
        output_path = f"{output_dir}{file_name}.nrrd"
        nrrd.write(output_path, result)
        # Report the path that was actually written.
        print(output_path)
        
if __name__ == "__main__":
    main()

# PART 2 - Transplant (if needed)

In this section we specifically edit a few regions based on inputs from other colleagues / literature values

- First, we create a dataframe with the expected density values.
- Second, we compare this ground truth (or expected values) with the densities originating from transcriptomic data and calculate their ratio: exc_mm3_ratio	inh_mm3_ratio	neurons_mm3_ratio (for this we can use the total densities nrrd files for simplicity)
- Third, we adjust the transcriptomic densities (by multiplying with the ratio) in the generated pickle file.

This way any future input can be added to the original df: df_expected, a ratio can be calculated, and transcriptomic densities can be updated. 
Note that the pickle files are derived from the csv files (i.e. they hold the values after these have been collected and updated). The nrrd files are in turn derived from these pickle files as per our logic, and they only serve as "projections" of the actual data. 


In [27]:
# Create a dictionary from the expected data:
data = {
    "id": [391, 399, 407, 415, 431, 438, 446, 454, 471, 486, 495, 504, 718, 733, 1020],
    "acronym": [
        "CA1slm", "CA1so", "CA1sp", "CA1sr", "CA2slm", "CA2so", 
        "CA2sp", "CA2sr", "CA3slm", "CA3so", "CA3sp", "CA3sr", "VPL", "VPM", "PO"
    ],
    "Region": [
        "Field CA1, stratum lacunosum-moleculare", "Field CA1, stratum oriens",
        "Field CA1, stratum pyramidale", "Field CA1, stratum radiatum",
        "Field CA2, stratum lacunosum-moleculare", "Field CA2, stratum oriens",
        "Field CA2, stratum pyramidale", "Field CA2, stratum radiatum",
        "Field CA3, stratum lacunosum-moleculare", "Field CA3, stratum oriens",
        "Field CA3, stratum pyramidale", "Field CA3, stratum radiatum",
        "Ventral posterolateral nucleus of the thalamus", "Ventral posteromedial nucleus of the thalamus",
        "Posterior complex of the thalamus"
    ],
    "Expected Inhibitory Dorsal": [
        8420, 6480, 13520, 6480, 8420, 6480, 13520, 3240, 5780, 3170, 7560, 7890, "", "", ""
    ],
    "Expected Inhibitory Ventral": [
        8820, 10500, 10130, 10500, 8820, 10500, 10130, 4260, 7200, 5460, 10540, 6590, "", "", ""
    ],
    "Expected Neurons Dorsal": [
        "", "", 447500, "", "", "", 447500, "", "", "", "", "", "", "", ""
    ],
    "Expected Neurons Ventral": [
        "", "", 180500, "", "", "", 180500, "", "", "", "", "", "", "", ""
    ],
    "exc_mm3": [
        "", "", "", "", "", "", "", "", "", "", "", "", 55340.64, 67940.58, ""
    ],
    "inh_mm3": [
        8620, 8490, 11825, 8490, 8620, 8490, 11825, 3750, 6490, 4315, 9050, 7240, 2126.28, 2978.61, 161.08663
    ],
    "neurons_mm3": [
        "", "", 314000, "", "", "", 314000, "", "", "", 172400, "", 57466.92, 70919.19, ""
    ]
}

# Tabulate the expected densities collected in the `data` dictionary
df_e = pd.DataFrame(data)


# Extra regions (olfactory bulb layers and striatum) for which only a total
# neuron density is given; every other expected-value column stays blank ("").
blank_columns = (
    "Expected Inhibitory Dorsal",
    "Expected Inhibitory Ventral",
    "Expected Neurons Dorsal",
    "Expected Neurons Ventral",
    "exc_mm3",
    "inh_mm3",
)
new_data = {
    "id": [212, 220, 228, 236, 244, 477],
    "acronym": [
        "MOBglomerularlayer", "MOBgr", "MOBipl", "MOBmi", "MOBopl", "STR"
    ],
    "Region": [
        "Main olfactory bulb, glomerular layer", "Main olfactory bulb, granule layer",
        "Main olfactory bulb, inner plexiform layer", "Main olfactory bulb, mitral layer",
        "Main olfactory bulb, outer plexiform layer", "Striatum"
    ],
    **{column: [""] * 6 for column in blank_columns},
    "neurons_mm3": [
        630000, 710000, 150000, 350000, 80000, 78560
    ],
}
df_new = pd.DataFrame(new_data)

# Stack both tables into one frame and keep a CSV copy of it
df_expected = pd.concat([df_e, df_new], ignore_index=True)
df_expected.to_csv('/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/expected_data.csv', index=False)

# Attach the child ids and acronym lists of every expected region so that
# changes to non-leaf regions can be propagated to their children.
file = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/df_hierarchy_ccfv3_l23split_barrelsplit.csv'
hierarchy = pd.read_csv(file, index_col=0)

# Left merge on 'id': every expected row is kept, hierarchy columns are added
hierarchy_columns = hierarchy[['id', 'children', 'acr_list']]
df_expected = df_expected.merge(hierarchy_columns, on='id', how='left')

# The 'children' values come from the CSV as text; parse them into Python objects
import ast
df_expected.loc[:, 'children'] = df_expected['children'].apply(ast.literal_eval)

# Placeholder columns for the expected / measured ratios (filled in later)
for ratio_column in ('exc_mm3_ratio', 'inh_mm3_ratio', 'neurons_mm3_ratio'):
    df_expected[ratio_column] = np.nan

In [28]:
# These are the affected regions (acronyms of all rows in df_expected)
df_expected['acronym'].values

array(['CA1slm', 'CA1so', 'CA1sp', 'CA1sr', 'CA2slm', 'CA2so', 'CA2sp',
       'CA2sr', 'CA3slm', 'CA3so', 'CA3sp', 'CA3sr', 'VPL', 'VPM', 'PO',
       'MOBglomerularlayer', 'MOBgr', 'MOBipl', 'MOBmi', 'MOBopl', 'STR'],
      dtype=object)

In [29]:
# Add the scaling parameter to df_expected as well: load the annotation volume
# and the scaled total density volumes in which the corresponding values are
# looked up.

data_folder = "/gpfs/bbp.cscs.ch/project/proj84/piluso/share/general/warped_augmented_CCFv3/"
CCFv3a, _ = nrrd.read(f'{data_folder}annotation_25_2022_CCFv3a.nrrd')
# CCFv3a, _ = nrrd.read("/gpfs/bbp.cscs.ch/data/project/proj84/atlas_pipeline_runs/2024-05-15T22:44:26+02:00/annotation_ccfv3_l23split_barrelsplit_validated.nrrd")

results_folder = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/'


def _read_density_volume(folder, filename):
    """Read one density volume from ``folder`` and return its data array.

    Parameters
    ----------
    folder : str
        Directory that contains the nrrd file.
    filename : str
        Name of the nrrd file inside ``folder``.

    Returns
    -------
    The data array returned by ``nrrd.read`` (the header is discarded).

    Raises
    ------
    FileNotFoundError
        If the file does not exist. Failing here, instead of silently skipping
        the read, prevents later cells from running with an undefined variable
        or with a stale array left over from an earlier run.
    """
    full_path = os.path.join(folder, filename)
    if not os.path.isfile(full_path):
        raise FileNotFoundError(f"Density volume not found: {full_path}")
    volume, _header = nrrd.read(full_path)
    return volume


# NEURON
neuron = _read_density_volume(results_folder, "scaled_total_neuron_densities.nrrd")
# EXC
exc = _read_density_volume(results_folder, "scaled_total_excitatory_densities.nrrd")
# INH
inh = _read_density_volume(results_folder, "scaled_total_inhibitory_densities.nrrd")


The next step calculates, for each region, the ratio between the expected density and the mean transcriptomic density, and stores the ratios in `df_expected`.

In [30]:
%%time 

'''We divide by: mean expected / transcriptomic mean densities..'''

# Arrays to process.
# Each key is the name of an expected-density column of df_expected; each value
# is the density volume (read with nrrd.read in the previous cell) from which
# the measured mean of a region is taken in process_row.
# NOTE: `exc` and `inh` are rebound to lists of class labels in the metadata
# cell further down (In [34]); this cell therefore has to run before that one.
arrays = {
    'neurons_mm3': neuron,
    'inh_mm3': inh,
    'exc_mm3': exc,
}

# Worker function to process each row
def process_row(row):
    """Fill the ``*_ratio`` entries of one ``df_expected`` row.

    For each of the columns ``exc_mm3``, ``inh_mm3`` and ``neurons_mm3`` the
    expected value stored in the row is divided by the mean of the matching
    volume in the module-level ``arrays`` dict, taken over all voxels of the
    module-level annotation volume ``CCFv3a`` whose id is the row's ``id`` or
    one of its ``children``.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``id``, ``children`` (an iterable of region ids) and the
        three expected-density columns named above.

    Returns
    -------
    pandas.Series
        The same ``row`` object with ``<column>_ratio`` set wherever both a
        numeric expected value and at least one annotated voxel exist. All
        other ratio entries are left untouched.
    """
    # Work on a copy so the list stored in the row's 'children' cell is not
    # modified (the original id is added to the children for the mask only).
    region_ids = list(row['children'])
    region_ids.append(row['id'])

    # Voxels that belong to the region or any of its children
    mask = np.isin(CCFv3a, region_ids)

    # Mean of every density volume inside the region; None marks a region that
    # has no voxel in CCFv3a.
    mean_values = {}
    for column, volume in arrays.items():
        voxels = volume[mask]
        mean_values[column] = voxels.mean() if voxels.size > 0 else None

    # Blank cells hold "" and are skipped; numpy scalars count as numbers too.
    numeric_types = (int, float, np.integer, np.floating)
    for column in ('exc_mm3', 'inh_mm3', 'neurons_mm3'):
        measured_mean = mean_values[column]
        expected = row[column]
        # A region without voxels has no measured mean: leave its ratio as is
        # instead of dividing by None.
        if measured_mean is not None and isinstance(expected, numeric_types):
            row[f'{column}_ratio'] = expected / measured_mean

    return row
    
    
# Compute the ratios of all rows in parallel, one task per row of df_expected
if __name__ == '__main__':
    worker_count = mp.cpu_count()
    with mp.Pool(processes=worker_count) as pool:
        row_series = [record for _index, record in df_expected.iterrows()]
        new_data = pool.map(process_row, row_series)

    # Rebuild df_expected from the processed rows (pool.map keeps their order)
    df_expected = pd.DataFrame(new_data)



CPU times: user 190 ms, sys: 780 ms, total: 970 ms
Wall time: 5.2 s


In [31]:
# The csv files hold unscaled densities, so they cannot be used here; the
# pickle file with the scaled densities is loaded instead.
root_folder = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/'
file = os.path.join(root_folder, 'scaled_densities_global_only.pickle')

# NOTE: pickle.load executes code embedded in the file; only load trusted files.
with open(file, 'rb') as pickle_file:
    scaled_combined_result_dataframe_from_file = pickle.load(pickle_file)

print("Loaded pickle file.")

Loaded pickle file.


Prepare the scaling. We again have to reconcile the regional differences between the transcriptomic data and the expected densities. We search for expected regions that are not covered by the transcriptomic data and break them up into their leaf regions. The final list of (leaf) regions is stored in one concatenated array.

In [32]:
import ast
# Note that we will scale the already scaled data: scaled_combined_result_dataframes
combined_result_dataframes_copy = copy.deepcopy(scaled_combined_result_dataframe_from_file)

# Regions for which transcriptomic data exists (keys of the loaded dictionary)
transcriptomic_regions = set(combined_result_dataframes_copy.keys())
expected_acronyms = df_expected['acronym'].values

# Start from the expected acronyms ONCE, before the loop. Re-creating the array
# inside the loop would throw away the leaves collected for an earlier region,
# so only an unrecognised region in the very last row would be kept.
concatenated_result = np.array(expected_acronyms)

for reg in expected_acronyms:
    if reg not in transcriptomic_regions:
        print(reg, " is not a region recognised by transcriptomics data")
        # 'acr_list' holds the stringified list of acronyms below this region
        acr_list_str = df_expected[df_expected['acronym'] == reg]['acr_list'].iat[0]
        acr_list = ast.literal_eval(acr_list_str)

        # Append the acronyms of the unrecognised region so that the secondary
        # scaling covers its full area (expected regions plus these leaves).
        concatenated_result = np.concatenate((concatenated_result, acr_list))
        print(len(acr_list), len(expected_acronyms), len(concatenated_result))

STR  is not a region recognised by transcriptomics data
30 21 51


In [33]:
string_to_find = "AAA"
# Boolean frame: True wherever the text form of a cell contains the substring
matches = df_expected.map(lambda cell: string_to_find in str(cell))
# Keep every row that has at least one matching cell
row_has_match = matches.any(axis=1)
rows_with_substring = df_expected[row_has_match]

In [34]:
%%time

# Load the cell type metadata. The 'class' column lets us group the 5k+ cell
# types into larger groups; only the three columns needed here are read.
meta_path = "/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/WMB-10X/20231215/views/cell_metadata_with_cluster_annotation.csv"
columns_to_read = ['class', 'subclass', 'cluster']
metadata = pd.read_csv(meta_path, usecols=columns_to_read)

# Class labels whose clusters are collected into `neurontypes` below
n_classes = ['01 IT-ET Glut', '02 NP-CT-L6b Glut', '03 OB-CR Glut',
       '04 DG-IMN Glut', '05 OB-IMN GABA', '06 CTX-CGE GABA',
       '07 CTX-MGE GABA', '08 CNU-MGE GABA', '09 CNU-LGE GABA',
       '10 LSX GABA', '11 CNU-HYa GABA', '12 HY GABA', '13 CNU-HYa Glut',
       '14 HY Glut', '15 HY Gnrh1 Glut', '16 HY MM Glut', '17 MH-LH Glut',
       '18 TH Glut', '19 MB Glut', '20 MB GABA', '21 MB Dopa',
       '22 MB-HB Sero', '23 P Glut', '24 MY Glut', '25 Pineal Glut',
       '26 P GABA', '27 MY GABA', '28 CB GABA', '29 CB Glut',]

# Class labels whose clusters are collected into `nonneurontypes` below
nn_classes = ['30 Astro-Epen', '31 OPC-Oligo', '32 OEC', '33 Vascular',
       '34 Immune']

# Subsets of n_classes: the "Glut" labels (exc), the "GABA" labels (inh) and the
# two remaining labels (other).
# NOTE: `exc` and `inh` were bound to the density volumes read with nrrd.read in
# an earlier cell; from here on they are lists of class labels instead.
exc = ['01 IT-ET Glut', '02 NP-CT-L6b Glut', '03 OB-CR Glut',
      '04 DG-IMN Glut', '13 CNU-HYa Glut', '14 HY Glut', '15 HY Gnrh1 Glut', '16 HY MM Glut', '17 MH-LH Glut',
      '18 TH Glut', '19 MB Glut', '23 P Glut', '24 MY Glut', '25 Pineal Glut', '29 CB Glut',]
inh = ['05 OB-IMN GABA', '06 CTX-CGE GABA', '07 CTX-MGE GABA', '08 CNU-MGE GABA', '09 CNU-LGE GABA',
      '10 LSX GABA', '11 CNU-HYa GABA', '12 HY GABA', '20 MB GABA', '26 P GABA', '27 MY GABA', '28 CB GABA', ]
other = ['21 MB Dopa', '22 MB-HB Sero', ]
# Excitatory followed by inhibitory class labels (n_classes without `other`)
exci_inhib_su = exc + inh

# Hand-picked labels grouped by glial cell kind.
# NOTE(review): these look like values of the metadata 'cluster' column - confirm.
astrotypes = ['5206 Bergmann NN_1', '5207 Astro-CB NN_1', '5208 Astro-NT NN_1',
       '5209 Astro-NT NN_1', '5210 Astro-NT NN_1', '5211 Astro-NT NN_1',
       '5212 Astro-NT NN_1', '5213 Astro-NT NN_1', '5214 Astro-NT NN_2',
       '5215 Astro-NT NN_2', '5216 Astro-NT NN_2', '5217 Astro-NT NN_2',
       '5218 Astro-TE NN_1', '5219 Astro-TE NN_1', '5220 Astro-TE NN_1',
       '5221 Astro-TE NN_1', '5222 Astro-TE NN_2', '5223 Astro-TE NN_2',
       '5224 Astro-TE NN_3', '5225 Astro-TE NN_3', '5226 Astro-TE NN_3',
       '5227 Astro-TE NN_3', '5228 Astro-TE NN_4', '5229 Astro-TE NN_5',
       '5230 Astro-TE NN_5', '5231 Astro-OLF NN_1', '5232 Astro-OLF NN_1',
       '5233 Astro-OLF NN_2', '5234 Astro-OLF NN_2',
       '5235 Astro-OLF NN_3', '5236 Astro-OLF NN_3',]

microglia = ['5312 Microglia NN_1']

oligos = ['5266 OPC NN_1', '5267 OPC NN_1',
       '5268 OPC NN_1', '5269 OPC NN_1', '5270 OPC NN_1', '5271 OPC NN_2',
       '5272 COP NN_1', '5273 COP NN_1', '5274 COP NN_1', '5275 COP NN_1',
       '5276 COP NN_1', '5277 COP NN_1', '5278 NFOL NN_2',
       '5279 NFOL NN_2', '5280 NFOL NN_2', '5281 NFOL NN_2',
       '5282 MFOL NN_3', '5283 MFOL NN_3', '5284 MOL NN_4',
       '5285 MOL NN_4', '5286 MOL NN_4', '5287 MOL NN_4', '5288 MOL NN_4',]

# All glia labels: astrocytes, then microglia, then the oligodendrocyte lineage
glia = astrotypes + microglia + oligos

def _clusters_of(class_labels):
    """Return the sorted unique 'cluster' values of all metadata rows whose
    'class' is one of ``class_labels``."""
    in_group = metadata['class'].isin(class_labels)
    return np.unique(metadata.loc[in_group, 'cluster'].values)


# Cluster names per group of classes
neurontypes = _clusters_of(n_classes)
nonneurontypes = _clusters_of(nn_classes)
exctypes = _clusters_of(exc)
inhtypes = _clusters_of(inh)
othertypes = _clusters_of(other)
exci_inhib_sum = _clusters_of(exci_inhib_su)

CPU times: user 13.6 s, sys: 401 ms, total: 14 s
Wall time: 14 s


Do the scaling: we can scale according to the input inhibitory / excitatory / neuron count.

In [35]:
%%time

import ast

# Work on a deep copy: the DataFrames are modified in place below and the
# loaded dictionary must stay untouched.
combined_result_dataframes_copy = copy.deepcopy(scaled_combined_result_dataframe_from_file)
# We create the dict of dfs anew!
scaled_combined_result_dataframes = {}

# Regions that receive the secondary scaling (hashed once for fast lookup)
regions_to_scale = set(concatenated_result)

# Map every region acronym to the df_expected row that holds its ratios:
#  - exact_row_lookup: the region is itself listed in df_expected['acronym']
#  - leaf_row_lookup:  the region appears in the 'acr_list' of a listed row
# Exact, whole-acronym matching replaces a substring search over all cells,
# which could pick an unrelated row for short acronyms. The first row wins in
# both lookups.
exact_row_lookup = {}
leaf_row_lookup = {}
for _, expected_row in df_expected.iterrows():
    exact_row_lookup.setdefault(expected_row['acronym'], expected_row)
    leaf_acronyms = expected_row['acr_list']
    if isinstance(leaf_acronyms, str):
        # 'acr_list' is stored as the text form of a Python list
        leaf_acronyms = ast.literal_eval(leaf_acronyms)
    elif not isinstance(leaf_acronyms, (list, tuple, np.ndarray)):
        # Missing value (e.g. NaN): no leaves known for this row
        leaf_acronyms = []
    for leaf_acronym in leaf_acronyms:
        leaf_row_lookup.setdefault(leaf_acronym, expected_row)

# Ratio column -> cell types it applies to, in the order they are applied.
# NOTE(review): the excitatory and inhibitory classes are also part of
# n_classes, so a row that has both an exc/inh ratio and a neuron ratio scales
# those clusters twice. Behaviour kept as is - confirm this is intended.
ratio_targets = (
    ('exc_mm3_ratio', exctypes),
    ('inh_mm3_ratio', inhtypes),
    ('neurons_mm3_ratio', neurontypes),
)

# Do the 2ndary scaling: check every transcriptomic region and scale its cell
# types if needed.
for key, df in combined_result_dataframes_copy.items():
    if key in regions_to_scale:
        print(key, " is a leaf region which will be scaled.")
        expected_row = exact_row_lookup.get(key)
        if expected_row is None:
            expected_row = leaf_row_lookup.get(key)
        if expected_row is not None:
            for ratio_column, cell_types in ratio_targets:
                ratio = expected_row[ratio_column]
                # Only a non-NaN ratio triggers a multiplication
                if pd.notna(ratio):
                    mask = df.index.isin(cell_types)
                    df.loc[mask, 'density_mm3'] *= ratio
    # Always keep the region, scaled or not. Storing it only when a ratio was
    # applied would silently drop regions that have no usable ratio.
    scaled_combined_result_dataframes[key] = df

AAA  is a leaf region which will be scaled.
ACB  is a leaf region which will be scaled.
BA  is a leaf region which will be scaled.
CA1slm  is a leaf region which will be scaled.
CA1so  is a leaf region which will be scaled.
CA1sp  is a leaf region which will be scaled.
CA1sr  is a leaf region which will be scaled.
CA2slm  is a leaf region which will be scaled.
CA2so  is a leaf region which will be scaled.
CA2sp  is a leaf region which will be scaled.
CA2sr  is a leaf region which will be scaled.
CA3slm  is a leaf region which will be scaled.
CA3so  is a leaf region which will be scaled.
CA3sp  is a leaf region which will be scaled.
CA3sr  is a leaf region which will be scaled.
CEAc  is a leaf region which will be scaled.
CEAl  is a leaf region which will be scaled.
CEAm  is a leaf region which will be scaled.
CP  is a leaf region which will be scaled.
FS  is a leaf region which will be scaled.
IA  is a leaf region which will be scaled.
LSc  is a leaf region which will be scaled.
LSr  i

In [36]:
# Sanity check: number of region DataFrames stored after the secondary scaling
len(scaled_combined_result_dataframes)

703

In [37]:
folder = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/'
# Persist the dictionary of secondarily scaled per-region DataFrames
output_file = f'{folder}scaled_densities_global_only_plus_transplant.pickle'
with open(output_file, 'wb') as pickle_out:
    pickle.dump(scaled_combined_result_dataframes, pickle_out)

print(f"Saved here: {folder}")

Saved here: /gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/density_calculations/
