# This notebook must be run with the Vizgen_2 conda environment within the Vizgen.sif singularity container

In [1]:
import Mapping
import os
import string

import cv2
import geopandas as gpd
import igraph as ig
import leidenalg
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns
import tifffile
import umap
from anndata import AnnData as ad
from matplotlib import patches as mpatches
from matplotlib_scalebar.scalebar import ScaleBar
import scanpy as sc
import anndata
from shapely.affinity import translate
from shapely.geometry import Polygon, MultiPolygon, box, shape
from skimage import img_as_bool, img_as_ubyte
from skimage.measure import label, find_contours, regionprops, regionprops_table
from skimage.morphology import skeletonize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import geojson
from adjustText import adjust_text

  from .autonotebook import tqdm as notebook_tqdm
2025-01-10 15:03:26.819776: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-10 15:03:27.618013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-10 15:03:27.874207: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-10 15:03:27.965456: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-10 15:03:28.4

In [2]:
def find_parquet_file(experiment_path):
    region_types = ['region_0', 'region_1']
    for region in region_types:
        file_path = f'{experiment_path}baysor/{region}_6-5_micron_polygons.parquet'
        if os.path.exists(file_path):
            return file_path
    return None

def clean_df(dataframe, sc_obj):
    
    rows_to_keep = sc_obj.obs.Name.unique().tolist()
    
    new_df = dataframe[dataframe.Name.isin(rows_to_keep)]
    new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
    new_df = new_df.drop_duplicates(subset='Name', keep='first')
    df_single_label = new_df[new_df['Label_pixels'].apply(lambda x: len(x) == 1)]
    single_label_values = df_single_label['Label_pixels'].apply(lambda x: x[0])
    label_counts = single_label_values.value_counts()
    unique_labels = label_counts[label_counts == 1].index
    df_filtered = df_single_label[single_label_values.isin(unique_labels)]
    df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
    
    return df_filtered

def subset_label_image(label_image, filt_df):
    label_pixels_to_keep = filt_df['Label_pixels'].tolist()
    mask = np.isin(test_im, label_pixels_to_keep)

    # Subset the label image
    subset_label_image = np.where(mask, label_image, 0)
    return subset_label_image

def morph_reduction(morph_df):
    # dimensional analysis on the morphology stuff
    # Select columns to exclude from PCA
    columns_to_exclude = ['cell_label','local_centroid-0','local_centroid-1']

    # Select only the numeric columns excluding specified columns
    data_for_pca = features_mic.select_dtypes(include=[np.number]).drop(columns=columns_to_exclude, errors='ignore')

    # Standardize the data
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_for_pca)
    
    pca = PCA(n_components=None)  # None will choose the minimum number of components that explain all the variance

    # Fit and transform the scaled data
    principal_components = pca.fit_transform(data_scaled)

    # Create a DataFrame with the principal components
    pca_df = pd.DataFrame(data=principal_components, columns=['PC' + str(i+1) for i in range(principal_components.shape[1])])

    # Join the PCA DataFrame back with the excluded columns for full analysis
    return pd.concat([features_mic['cell_label'].reset_index(drop=True), pca_df], axis=1)

def morph_umap(pca_df):
    pca_components = pca_df[['PC1', 'PC2', 'PC3','PC4', 'PC5','PC6']].values
    
    reducer = umap.UMAP(n_components=2, random_state=42)
    umap_results = reducer.fit_transform(pca_components)
    pca_df['geoUMAP-1'] = umap_results[:, 0]
    pca_df['geoUMAP-2'] = umap_results[:, 1]
    
    return pca_df

In [3]:
# Function to skeletonize segmented cells
def skeletonize_cells(labeled_cells):
    # Convert labels to boolean
    binary_cells = labeled_cells > 0
    # Skeletonize the binary image
    skeleton = skeletonize(binary_cells)
    return skeleton

# Function to analyze skeleton features
def analyze_skeleton(skeleton):
    # Compute the medial axis (skeleton with distance transform)
    skel, distance = medial_axis(skeleton, return_distance=True)
    # Distance to the background
    dist_on_skel = distance * skel
    return skel, dist_on_skel

def skeleton_to_graph(skeleton):
    graph = nx.Graph()
    rows, cols = skeleton.shape
    for index, value in np.ndenumerate(skeleton):
        if value:
            x, y = index
            for dx, dy in [(0, 1), (1, 0), (1, 1), (1, -1)]:
                neighbor_x, neighbor_y = x + dx, y + dy
                if 0 <= neighbor_x < rows and 0 <= neighbor_y < cols:
                    neighbor = (neighbor_x, neighbor_y)
                    if skeleton[neighbor]:
                        if not graph.has_edge(index, neighbor):
                            weight = np.linalg.norm(np.array(index) - np.array(neighbor))
                            graph.add_edge(index, neighbor, weight=weight)
    return graph

def extract_sub_image_with_padding(image, bbox, padding=10):
    min_row, min_col, max_row, max_col = bbox
    min_row = max(min_row - padding, 0)
    min_col = max(min_col - padding, 0)
    max_row = min(max_row + padding, image.shape[0])
    max_col = min(max_col + padding, image.shape[1])
    return image[min_row:max_row, min_col:max_col], (min_row, min_col)

def extract_features_to_dataframe(labeled_cells):
    data = []
    regions = regionprops(labeled_cells)
    
    # Initialize the progress bar
    for region in tqdm(regions, desc="Processing cells"):
        isolated_cell, offset = extract_sub_image_with_padding(labeled_cells, region.bbox)
        isolated_cell = isolated_cell == region.label
        isolated_cell = isolated_cell.astype(int)  # ensure it's an integer label image
        
        skeleton = skeletonize(isolated_cell)
        
        graph = skeleton_to_graph(skeleton)
        
        branch_points = list(nx.articulation_points(graph))
        endpoints = [x for x in graph.nodes() if graph.degree(x) == 1]
        total_length = np.sum(skeleton)
        num_branches = len(branch_points)
        branch_lengths = [graph[u][v]['weight'] for u, v in graph.edges()]
        tortuosity = [graph[u][v]['weight'] / np.linalg.norm(np.array(u) - np.array(v)) for u, v in graph.edges()]

        # Extract geometric properties
        properties = regionprops_table(isolated_cell, properties=[
            'area', 'convex_area', 'eccentricity', 'equivalent_diameter',
            'extent', 'filled_area', 'major_axis_length', 'minor_axis_length',
            'orientation', 'perimeter', 'solidity', 'bbox_area', 'feret_diameter_max',
            'local_centroid', 'moments_hu', 'euler_number'
        ])
        properties_df = pd.DataFrame(properties)

        # Combine data into one dictionary per region
        cell_data = {
            'cell_label': region.label,
            'total_length': total_length,
            'number_of_branches': num_branches,
            'branch_lengths': branch_lengths,
            'num_endpoints': len(endpoints),
            'branch_points': len(branch_points),
            'tortuosity': tortuosity
        }

        # Add region props features
        for key, value in properties_df.iloc[0].items():
            cell_data[key] = value

        data.append(cell_data)

    return pd.DataFrame(data)

def calculate_ramification_features(df):
    # Calculate ramification metrics and add them to the DataFrame
    df['branch_point_ratio'] = df['branch_points'] / df['total_length']
    df['endpoint_to_branch_point_ratio'] = df['num_endpoints'] / df['branch_points']
    df['average_branch_length'] = df['total_length'] / df['number_of_branches']
    df['ramification_index'] = (df['perimeter']/df['area'])/(2*np.sqrt(np.pi/df['area']))

    # Handle cases where there are no branches to avoid division by zero
    df['endpoint_to_branch_point_ratio'] = df['endpoint_to_branch_point_ratio'].fillna(0)
    df['average_branch_length'] = df['average_branch_length'].fillna(0)

    return df

def scale_measurements(df, micron_per_pixel):
    # Define which columns to scale and by what factor
    pixel_measurements = ['perimeter', 'equivalent_diameter', 'major_axis_length', 'minor_axis_length']
    area_measurements = ['area', 'convex_area']

    # Scale measurements that are linear dimensions
    for column in pixel_measurements:
        df[column] *= micron_per_pixel

    # Scale measurements that are area dimensions
    for column in area_measurements:
        df[column] *= (micron_per_pixel ** 2)

    return df

In [4]:
ad_parent = sc.read_h5ad('../02_annotation/ABC_cleaned.h5ad')
ad_parent = ad_parent[ad_parent.obs.subclass_label_transfer == 'Microglia NN']

# accept the base path
experiment_base_paths = ['/hpc/projects/group.quake/doug/Shapes_Spatial/3-mo-male-1/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/3-mo-male-2/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/3-mo-male-3-rev2/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/3-mo-female-1-rev2/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/3-mo-female-2/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/3-mo-female-3/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/24-mo-male-1/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/24-mo-male-2/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/24-mo-male-4-rev2/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/24-mo-female-1/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/24-mo-female-3/',
                         '/hpc/projects/group.quake/doug/Shapes_Spatial/24-mo-female-5/']

ad_bulk = []

for experiment in experiment_base_paths:
    batch = experiment.split('/')[-2]
    
    # Okay now we load our image from this base path
    test_im = Mapping.load_tiff_image(experiment + 'labeled_image.tif')
    print(f"Labeled Image Loaded for {batch}")
    # We also want to load in the geometry file
    parquet_file = find_parquet_file(experiment)
    boundaries = gpd.read_parquet(parquet_file)
    # then we want to take a subset from our adata which corresponds to the file path
    ad_viz = ad_parent[ad_parent.obs.batchID == batch]
    
    data = Mapping.extract_label_pixel_values_baysor(boundaries, test_im)    
    filtered_df = clean_df(pd.DataFrame(data),ad_viz)
    
    sub_im = subset_label_image(test_im, filtered_df)
    
    features_df = extract_features_to_dataframe(sub_im)
    
    features_mic = calculate_ramification_features(features_df)

    #final_pca_df = morph_reduction(features_mic)
    
    #final_pca_df = morph_umap(final_pca_df)
    
    merged_df = pd.merge(filtered_df, features_mic, left_on='Label_pixels', right_on='cell_label', how='inner')
    merged_df = merged_df.drop(columns=['branch_lengths', 'tortuosity', 'local_centroid-0', 'local_centroid-1'])
    
    rows_to_keep = merged_df.Name.unique().tolist()
    
    test_ad = ad_viz[ad_viz.obs.Name.isin(rows_to_keep)]
    
    test_ad.obs = pd.merge(test_ad.obs,merged_df,left_on='Name', right_on='Name', how='inner')
    
    ad_bulk.append(test_ad)
concatenated_data = sc.concat(ad_bulk, join='outer')
concatenated_data.obs_names_make_unique()
concatenated_data.obs = concatenated_data.obs.drop('Percentage', axis=1)
concatenated_data.write_h5ad('Microglia_Shapespace_baysor_500.h5ad')

Labeled Image Loaded for 3-mo-male-1


Extracting pixel values: 100%|██████████| 68546/68546 [00:13<00:00, 4988.40it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 704/704 [00:28<00:00, 24.92it/s]


Labeled Image Loaded for 3-mo-male-2


Extracting pixel values: 100%|██████████| 81120/81120 [00:17<00:00, 4748.87it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 687/687 [00:34<00:00, 19.66it/s]


Labeled Image Loaded for 3-mo-male-3-rev2


Extracting pixel values: 100%|██████████| 103945/103945 [00:19<00:00, 5209.57it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 827/827 [01:26<00:00,  9.59it/s]


Labeled Image Loaded for 3-mo-female-1-rev2


Extracting pixel values: 100%|██████████| 91296/91296 [00:17<00:00, 5091.09it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 493/493 [00:21<00:00, 22.61it/s]


Labeled Image Loaded for 3-mo-female-2


Extracting pixel values: 100%|██████████| 140611/140611 [00:31<00:00, 4480.70it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 894/894 [00:50<00:00, 17.53it/s]


Labeled Image Loaded for 3-mo-female-3


Extracting pixel values: 100%|██████████| 139457/139457 [00:27<00:00, 5032.37it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 1201/1201 [00:58<00:00, 20.62it/s]


Labeled Image Loaded for 24-mo-male-1


Extracting pixel values: 100%|██████████| 83921/83921 [00:18<00:00, 4430.58it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 597/597 [00:17<00:00, 34.09it/s]


Labeled Image Loaded for 24-mo-male-2


Extracting pixel values: 100%|██████████| 118133/118133 [00:23<00:00, 4998.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 1193/1193 [01:14<00:00, 15.93it/s]


Labeled Image Loaded for 24-mo-male-4-rev2


Extracting pixel values: 100%|██████████| 89561/89561 [00:21<00:00, 4163.13it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 275/275 [00:26<00:00, 10.54it/s]


Labeled Image Loaded for 24-mo-female-1


Extracting pixel values: 100%|██████████| 112490/112490 [00:23<00:00, 4817.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 1098/1098 [01:18<00:00, 14.04it/s]


Labeled Image Loaded for 24-mo-female-3


Extracting pixel values: 100%|██████████| 156523/156523 [00:30<00:00, 5115.89it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 2061/2061 [01:41<00:00, 20.30it/s]


Labeled Image Loaded for 24-mo-female-5


Extracting pixel values: 100%|██████████| 146268/146268 [00:27<00:00, 5225.45it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Label_pixels'] = new_df['Label_pixels'].apply(lambda x: [i for i in x if i != 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Label_pixels'] = df_filtered['Label_pixels'].apply(lambda x: x[0])
Processing cells: 100%|██████████| 1386/1386 [01:02<00:00, 22.33it/s]
  utils.warn_names_duplicates("obs")
