In this notebook we loop through all MERFISH cells and add a new metadata to them: their region information and its estimated volume in the slice. 
So if a cell sits in the MOB we calculate the intersected volume of the MOB in the respective slice (with slice position)

In [2]:
import pandas as pd
import os
import numpy as np

Load all cells from the 53 MERFISH slices

In [3]:
view_directory = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/MERFISH-C57BL6J-638850-CCF/20231215/views/'
file = os.path.join( view_directory, 'cell_metadata_with_parcellation_annotation.csv')
#Load all spatial data into 1 dataframe
cell_joined = pd.read_csv(file)
cell_joined.set_index('cell_label',inplace=True)
cell_joined.shape

(3739961, 37)

Remove those 15 cells with no x y z CCF coordinates and no parcellation assigments

In [4]:
cell_joined = cell_joined[~((cell_joined['x_ccf'] == 0) & (cell_joined['y_ccf'] == 0) & (cell_joined['z_ccf'] == 0))]
cell_joined.shape

(3739946, 37)

Create a new smaller df to add additional metadata to the cells

In [5]:
#To avoid altering the original DataFrame, create a copy first
cells = pd.DataFrame()
cells = cell_joined[['brain_section_label', 'x_reconstructed', 'y_reconstructed', 'z_reconstructed']].copy()


In [6]:
# Perform element-wise division TO calculate template number accordingo to Allen tutorial
cells.loc[:, 'template_nr'] = cells.loc[:, 'z_reconstructed'].values / 0.2

In [7]:
# Round the 'template_nr' column to the nearest integer
cells.loc[:, 'template_nr'] = cells['template_nr'].round()

In [8]:
cells.shape, cell_joined.shape

((3739946, 5), (3739946, 37))

In [9]:
cells.loc[:, 'parcellation_substructure'] = cell_joined.loc[:, 'parcellation_substructure']


Template number is the way to slice the warped volume. 

In [10]:
# file = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/Allen-CCF-2020/20230630/parcellation_to_parcellation_term_membership.csv'
file = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/metadata/parcellation_to_parcellation_term_membership_extend.csv'

parcellation_annotation = pd.read_csv(file)
parcellation_indexes = list(np.unique(cell_joined['parcellation_index']))
description_of_all_indexes = parcellation_annotation[parcellation_annotation['parcellation_index'].isin(parcellation_indexes)]
substructure_info = description_of_all_indexes[description_of_all_indexes['parcellation_term_set_name'] == 'substructure'] 

We can look up the region_id of each parcellation substructure (for each cell) which is hidden in the 'parcellation_label'.
This cell will run 3M times, so it's a bit slow. 

In [11]:
%%time

def get_region_id(substructure):
    region_id = int(substructure_info[substructure_info['parcellation_term_acronym'] == substructure]['parcellation_label'].values[0].split('-')[-1])
    return region_id


#Create region ids for every parcellation substructure for every cell
cells['region_id'] = cells['parcellation_substructure'].apply(lambda x: get_region_id(x))

view_directory = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/nmi_scores/'
file = os.path.join( view_directory, 'temp.csv' )
cells.to_csv(file)

CPU times: user 22min 9s, sys: 340 ms, total: 22min 10s
Wall time: 22min 10s


In [12]:
#If you don't want to run the previous cell
view_directory = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/nmi_scores/'
file = os.path.join( view_directory, 'temp.csv' )
cells = pd.read_csv( file, index_col='cell_label' )

In [13]:
# Reset the index of cells before merging
cells.reset_index(drop=False, inplace=True)

In [14]:
# Merge cells with substructure_info based on 'parcellation_substructure' in cells and 'parcellation_term_acronym' in substructure_info
cells = cells.merge(substructure_info[['parcellation_term_acronym', 'parcellation_term_name']], 
                        left_on='parcellation_substructure', 
                        right_on='parcellation_term_acronym', 
                        how='left')

# Drop the redundant column 'parcellation_term_acronym'
cells.drop(columns=['parcellation_term_acronym'], inplace=True)

# Set back the original index of cells
cells.set_index('cell_label', inplace=True)  # Replace 'original_index_column_name' with the name of the original index column

In [15]:
cells.shape

(3791571, 8)

In [16]:
# Round the values in the 'template_nr' column to integers and assign back to the column
cells['template_nr'] = np.round(cells['template_nr']).astype(int)

In [17]:
import pickle
# Load the pickle file with the volumes of each region id at every slice (dict of dict)
file_directory = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/nmi_scores/'
file = os.path.join( file_directory, 'template_with_regids.pickle' )

#Load region id volumes from volume_calc_from_template.ipynb
with open(file, 'rb') as pickle_file:
    template_with_regids = pickle.load(pickle_file)

print("Loaded pickle file.")

Loaded pickle file.


Placeholder for fixing key error problem

Solutions: these come from parcellation_annotation[parcellation_annotation['parcellation_term_acronym'] has multiple solutions
- 229 > 794
- 326 > (812, 850,) 866
- 956 > 579

In [18]:
key_errors = []
templ_errors = []

def get_voxel_vol(row):
    template_nr = row['template_nr']
    region_id = row['region_id']
    try:
        return template_with_regids[template_nr][region_id]
    except KeyError:

        key_errors.append(region_id)
        templ_errors.append(template_nr)
        # Define mappings for swapping region_id values
        mappings = {229: 794, 326: [812, 850, 866], 956: 579}
        # Attempt swapping the region_id
        if region_id in mappings:
            new_region_ids = mappings[region_id]
            if isinstance(new_region_ids, list):  # Handle multiple mappings
                for new_region_id in new_region_ids:
                    try:
                        return template_with_regids[template_nr][new_region_id]
                    except KeyError:
                        #print(f"Key combination not found: template {template_nr}, region_id {region_id}, or new_region_id {new_region_id}")
                        pass  # Continue to the next mapping if the current one fails
            else:
                try:
                    return template_with_regids[template_nr][new_region_ids]
                except KeyError:
                    #print(f"Multi Key combination not found: template {template_nr}, region_id {region_id}, or new_region_id {new_region_ids}")
                    pass  # Continue to the next mapping if the current one fails
        
        # If all mappings fail, return None
        print(f"Multi Key combination not found: template {template_nr}, region_id {region_id}, or new_region_id {new_region_ids}")
        return None # Return None if key combination not found

# Apply the function to create the new column 'voxel_vol'
cells['voxel_vol'] = cells.apply(get_voxel_vol, axis=1)


#Now there shold not be any np.nan values in the cell voxel vol columns
# Check if 'voxel_vol' column has any NaN values
has_nan = cells['voxel_vol'].isna().any()

if has_nan:
    print("The 'voxel_vol' column contains NaN values.")
else:
    print("The 'voxel_vol' column does not contain any NaN values.")


The 'voxel_vol' column does not contain any NaN values.


### Region ids and region id volumes on slices
Next to Template ids we can add best position along the cortical axis give the nmi estimation between annotation volumes. In positions we have every / more slices, not just the ones which overcame Quality Control. 

Move this up in the notebook otherwise it won't be saved (not essential) - we don't need this information in the end.

In [19]:
import ast
csv_file_path = "/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/nmi_scores/average_template_array/output.csv"
positions = pd.read_csv(csv_file_path)

# Initialize an empty NumPy array to store the first numbers
first_numbers_array = np.array([], dtype=float)

# Iterate over each string in the list
for s in positions['10um_cortical_pos']:
    # Parse the string into a list
    numbers = ast.literal_eval(s)
    # Get the first number
    first_number = numbers[0]
    # Append the first number to the NumPy array
    first_numbers_array = np.append(first_numbers_array, first_number)
    
positions.loc[:, 'best_pos'] = first_numbers_array
positions.loc[:, 'template_nr'] = [string.split('_')[3] for string in positions['X_as_first_dim']]
#np.sort(np.unique(positions.template_nr))
positions.loc[:, 'template_nr'] = positions.loc[:, 'template_nr'].astype(float)

In [20]:
# Create a dictionary from the 'best_pos' and 'template_nr' columns in the positions DataFrame
pos_dict = dict(zip(positions['template_nr'], positions['best_pos']))

# Initialize an array to store the corresponding 'best_pos' values for each 'template_nr' value in cells
best_pos_array = []

# Loop through all values of cells.loc[:, 'template_nr']
for template_nr in cells.loc[:, 'template_nr']:
    # Look up the 'best_pos' value for the current 'template_nr' value in the dictionary
    best_pos_value = pos_dict.get(template_nr)
    # Add the 'best_pos' value to the array
    best_pos_array.append(best_pos_value)
    #print(best_pos_value)

# Convert the array to a numpy array if needed
best_pos_array = np.array(best_pos_array)

# Overwrite the 'best_position' column in the cells DataFrame with the array
cells.loc[:, 'best_position'] = best_pos_array

In [21]:
view_directory = '/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/results/nmi_scores/'
file = os.path.join( view_directory, 'cells_in_respective_volumes.csv')
cells.to_csv(file)