In [None]:
import multiprocessing as mp
import os
import string
from functools import partial

import anndata
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from adjustText import adjust_text
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import spearmanr
from shapely.geometry import Point
from tqdm import tqdm

import Mapping

In [None]:
def calculate_spearman_corr(adata, gene_columns, morpho_columns):
    """
    Calculate Spearman correlation between gene expression and morphological features.
    
    Parameters:
    - adata: AnnData object containing gene expression (in adata.X) and morphological features (in adata.obs).
    - gene_columns: List of gene names in adata.var_names.
    - morpho_columns: List of morphological feature names in adata.obs.
    
    Returns:
    - A float DataFrame (genes as rows, morphological features as columns) with the
      Spearman correlation coefficient of every gene-feature pair. An entry is NaN
      wherever spearmanr itself returns NaN.
    """
    gene_columns = list(gene_columns)
    morpho_columns = list(morpho_columns)

    # Expression values come from adata.X. X may be a sparse matrix, which
    # pd.DataFrame / column slicing below cannot consume directly, so densify it.
    expr = adata[:, gene_columns].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray()
    expr = np.asarray(expr)

    # Pull each morphological feature out of adata.obs once, not once per gene
    morpho_values = [adata.obs[morpho].to_numpy() for morpho in morpho_columns]

    # Fill a plain float array; avoids an object-dtype frame and per-cell .at writes
    corr_values = np.full((len(gene_columns), len(morpho_columns)), np.nan, dtype=float)
    for gene_pos in range(len(gene_columns)):
        gene_values = expr[:, gene_pos]
        for morpho_pos, feature_values in enumerate(morpho_values):
            corr, _ = spearmanr(gene_values, feature_values)
            corr_values[gene_pos, morpho_pos] = corr

    return pd.DataFrame(corr_values, index=gene_columns, columns=morpho_columns)

def cluster_morphological_features(corr_matrix):
    """
    Cluster the morphological features based on their correlation with genes
    and show the resulting dendrogram.
    
    Parameters:
    - corr_matrix: DataFrame containing correlation coefficients between genes (rows)
      and morphological features (columns).
    
    Returns:
    - The linkage matrix produced by Ward hierarchical clustering of the features
      (the columns of corr_matrix). The dendrogram is displayed as a side effect.
    """
    # Features are the columns, so cluster the transposed matrix
    linkage_matrix = linkage(corr_matrix.T, method='ward')
    
    # Plot dendrogram. Labels are passed as a plain list: a pandas Index has no
    # unambiguous truth value, which some SciPy versions evaluate on `labels`.
    plt.figure(figsize=(10, 6))
    dendrogram(linkage_matrix, labels=corr_matrix.columns.tolist())
    plt.title('Clustering of Morphological Features Based on Gene Correlations')
    plt.xlabel('Morphological Features')
    plt.ylabel('Distance')
    plt.show()

    return linkage_matrix

def rank_genes_by_contribution(corr_matrix, linkage_matrix, n_clusters=5):
    """
    Score every gene by its mean correlation with each feature cluster and rank the genes.
    
    Parameters:
    - corr_matrix: DataFrame of Spearman correlation coefficients (genes as rows, features as columns).
    - linkage_matrix: Linkage matrix from hierarchical clustering of the features.
    - n_clusters: Maximum number of feature clusters to cut the tree into.
    
    Returns:
    - A Series indexed by gene, sorted in descending order. Each value is the mean,
      over feature clusters, of the gene's mean (signed) correlation within that cluster.
    """
    # Cut the tree into at most n_clusters flat clusters (one label per feature)
    cluster_ids = fcluster(linkage_matrix, n_clusters, criterion='maxclust')

    # Feature columns belonging to each cluster label, in ascending label order
    features_by_cluster = [
        corr_matrix.columns[cluster_ids == label] for label in np.unique(cluster_ids)
    ]

    scores = pd.Series(index=corr_matrix.index, dtype=float)
    for gene_name in corr_matrix.index:
        within_cluster_means = [
            corr_matrix.loc[gene_name, cols].mean() for cols in features_by_cluster
        ]
        # Every cluster weighs the same, regardless of how many features it holds
        scores[gene_name] = np.mean(within_cluster_means)

    return scores.sort_values(ascending=False)

def plot_correlation_heatmap(corr_matrix):
    """
    Plot a heatmap of the Spearman correlation matrix.
    
    Parameters:
    - corr_matrix: DataFrame containing Spearman correlation coefficients
      (genes as rows, morphological features as columns).
    
    Returns:
    - None. Displays a heatmap.
    """
    # Create the heatmap using seaborn
    plt.figure(figsize=(12, 6))
    sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, cbar=True)
    
    # Customize plot labels
    plt.title('Spearman Correlation (Genes - Morphological Features)')
    plt.xlabel('Morphological Features')
    plt.ylabel('Genes')
    
    plt.tight_layout()
    # Must be called: a bare `plt.show` only references the function
    plt.show()
    
def top_genes_per_morphology(corr_matrix, top_n=20):
    """
    Collect, for every morphology feature, the genes with the highest Spearman correlation.
    
    Parameters:
    - corr_matrix: DataFrame containing Spearman correlation values (genes as rows, morphological features as columns).
    - top_n: How many leading genes to keep per morphology feature (default: 20).
    
    Returns:
    - A dictionary keyed by morphological feature. Each value is a DataFrame with columns
      'Gene' and 'Spearman_Correlation', ordered from highest to lowest correlation.
    """
    def _leading_genes(feature):
        # Highest correlations first, then truncate to the requested count
        ranked = corr_matrix[feature].sort_values(ascending=False).head(top_n)
        return pd.DataFrame({
            'Gene': ranked.index,
            'Spearman_Correlation': ranked.values
        })

    return {feature: _leading_genes(feature) for feature in corr_matrix.columns}

def top_5_genes_correlation_matrix(corr_matrix, top_n=5):
    """
    Create a new correlation matrix restricted to the top genes of each morphology feature.
    
    Parameters:
    - corr_matrix: DataFrame containing Spearman correlation values (genes as rows, morphological features as columns).
    - top_n: Number of highest-correlated genes to take per morphology feature (default: 5).
    
    Returns:
    - A new DataFrame holding the rows of corr_matrix for the union of every feature's
      top_n genes. Rows appear in first-seen order (features left to right, genes by
      descending correlation), so the result is the same on every run.
    """
    # A dict is used as an ordered set: it removes duplicates but, unlike set(),
    # keeps insertion order, so the row order does not depend on string hashing.
    selected_genes = {}

    for morpho in corr_matrix.columns:
        top_genes = corr_matrix[morpho].sort_values(ascending=False).head(top_n).index
        selected_genes.update(dict.fromkeys(top_genes))

    return corr_matrix.loc[list(selected_genes)]

In [None]:
# Load the AnnData object used by the cells below: expression values are read
# from .X, morphological measurements and grouping labels from .obs.
new_ad = sc.read_h5ad('Transciptomic_labels_and_morphology_labels_full.h5ad')

In [None]:
# Position of 'Cell Area' in obs; the slices below start AT this column
# (inclusive) - no +1 offset is applied.
start_index = list(new_ad.obs.columns).index('Cell Area')
# NOTE(review): columns_after_specific is not referenced by any later cell shown here.
columns_after_specific = new_ad.obs.iloc[:, start_index:].values

# All obs column names from 'Cell Area' to the end
morphological_columns = new_ad.obs.columns[start_index:]

# Drop the last 12 obs columns.
# NOTE(review): presumably these trailing columns are not morphology measurements - confirm.
features = morphological_columns[:-12].tolist()

# Exclude two features from the correlation analysis
features = [i for i in features if i not in ['Radius of Influence','Ramification Index']]

In [None]:
# Spearman correlation of every gene in var_names against every selected
# morphological feature (genes as rows, features as columns), then a heatmap of it.
corr_matrix = calculate_spearman_corr(new_ad, new_ad.var_names.tolist(), features)
plot_correlation_heatmap(corr_matrix)

In [None]:
# Per-feature tables of the 6 highest-correlated genes.
# NOTE: this uses top_n=6, while the reduced matrix below keeps the top 5 per feature.
top_genes_dict = top_genes_per_morphology(corr_matrix, top_n=6)

# Rows of corr_matrix restricted to the union of each feature's top-5 genes
new_corr_matrix = top_5_genes_correlation_matrix(corr_matrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch

def plot_ordered_clustermap(corr_matrix,output_pdf = None):
    """
    Plot a clustermap with rows and columns ordered to achieve a diagonal structure.
    
    Parameters:
    - corr_matrix: DataFrame containing the correlation matrix.
    - output_pdf: Where to save the figure. A string / path-like is used as the
      output path; any other truthy value (e.g. True) saves to
      'figures/morph_to_gene.pdf'; None or another falsy value skips saving.
    
    Returns:
    - None. Displays a seaborn cluster map plot (and optionally writes it as a PDF).
    """
    # Perform hierarchical clustering with optimal leaf ordering
    row_linkage = sch.linkage(corr_matrix, method='ward', optimal_ordering=True)
    col_linkage = sch.linkage(corr_matrix.T, method='ward', optimal_ordering=True)
    
    # Plot the cluster map. Both linkages are supplied, so no clustering `method`
    # is passed to seaborn.
    grid = sns.clustermap(corr_matrix, row_linkage=row_linkage, col_linkage=col_linkage,
                          cmap="coolwarm", linewidths=0.5, figsize=(15, 15))
    
    # Title the whole figure: after clustermap the pyplot "current axes" is not
    # the heatmap, so plt.title would attach the title to the wrong panel.
    fig = grid.ax_heatmap.figure
    fig.suptitle('Cluster Map with Optimal Leaf Ordering', y=1.02)

    if output_pdf:
        if isinstance(output_pdf, (str, os.PathLike)):
            pdf_path = output_pdf
        else:
            # Backward compatible: output_pdf=True keeps the original fixed location
            pdf_path = 'figures/morph_to_gene.pdf'
        fig.savefig(pdf_path, format='pdf', bbox_inches='tight')
    plt.show()
    
# Transposed so that morphological features are the rows and the selected genes
# the columns; output_pdf=True saves the figure to figures/morph_to_gene.pdf.
plot_ordered_clustermap(new_corr_matrix.T,output_pdf=True) 

In [None]:
# Hand-ordered gene list used for the z-score heatmaps and the per-region correlations
ordered = ['Slc1a2', 'Gria2','Pink1', 'Ank2','Gria1','Slc6a11','Aldoc','Gnas','Mfn2','Atp2a2','Mobp','Pcdh9','Gfap','Aqp4','Sdc3','Cisd1','Sst','Spp1','Apoe','Axl','Pcdhb8','Cxcl3','Rbm3','Cemip2','Cirbp','P2ry12','Abca9','Tmem119',
           'Stab1','Cd33','Mki67','Ifi207','Mrc1','Il10ra','Csf3r','Cx3cr1','Csf1r']

# Hand-ordered morphological features (obs column names). Each name must appear
# only once: a repeated name makes obs[ord_feat] return duplicate columns, which
# breaks the per-feature correlation and draws the same heatmap column twice.
ord_feat = ['Cell Area', 'Convex Hull Perimeter', 'Convex Hull Area','Cell Perimeter','Number of Branches','Skeleton Length','Cell Roughness','Number of Terminal Points',
           'Number of Branching Points','Soma Ratio','Path Distance','Fractal Dimension','Euclidean Distance','Tortuosity','Branching Index','Dendritic Maximum','Soma Area',
           'Soma Perimeter','Eccentricity','Mean Branch Length','Convex Hull Span Ratio','Lacunarity','Soma Circularity','Convex Hull Circularity','Cell Convexity',
           'Cell Circularity','Cell Solidity']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import zscore
from scipy.cluster.hierarchy import linkage, leaves_list

def plot_clustered_zscore_heatmap(adata, gene_list, group_col='sub_mic',cmap="coolwarm",output_pdf=None):
    """
    Plot a z-score heatmap for a set of genes, with cells averaged per group and
    the groups (rows) ordered by hierarchical clustering.
    
    Parameters:
    - adata: AnnData object containing gene expression data.
    - gene_list: List of genes to include in the heatmap; columns keep this order.
    - group_col: Column name in obs to group by (default: 'sub_mic').
    - cmap: Colormap passed to seaborn.heatmap.
    - output_pdf: Optional path; when given, the figure is also saved there as a PDF.
    
    Returns:
    - DataFrame of per-group mean expression, z-scored per gene across groups,
      with rows in the clustered order that is plotted.
    """
    # Extract gene expression data for the selected genes. X may be stored as a
    # sparse matrix, which pd.DataFrame cannot take directly, so densify it first.
    expr = adata[:, gene_list].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray()
    gene_expr_df = pd.DataFrame(expr, columns=gene_list, index=adata.obs_names)
    
    # Add the grouping column to the gene expression DataFrame
    gene_expr_df[group_col] = adata.obs[group_col].values
    
    # Mean expression of each gene within each group
    grouped_expr = gene_expr_df.groupby(group_col).mean()

    # Z-score each gene (column) across the groups
    zscore_expr = grouped_expr.apply(zscore, axis=0)

    # Only the rows are reordered; columns stay in gene_list order, so a single
    # row linkage is all that is needed.
    row_order = leaves_list(linkage(zscore_expr, method='ward'))
    zscore_expr = zscore_expr.iloc[row_order]
    print(row_order)

    # Label with the column actually used for grouping
    group_label = group_col[:1].upper() + group_col[1:]

    # Create a heatmap without clustering the columns
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.heatmap(zscore_expr, cmap=cmap, linewidths=0.5, annot=False, ax=ax)
    ax.set_title(f'Clustered Z-Score Heatmap of Genes Grouped by {group_label}')
    ax.set_xlabel('Genes')
    ax.set_ylabel(f'{group_label} Groups')

    fig.tight_layout()
    if output_pdf:
        fig.savefig(output_pdf, format='pdf', bbox_inches='tight')
    plt.show()

    return zscore_expr
    
def plot_clustered_heatmap_with_features(adata, feature_list, group_col='sub_mic',cmap="coolwarm",output_pdf=None):
    """
    Plot a z-score heatmap for morphological features (from adata.obs), with cells
    averaged per group and the groups (rows) ordered by hierarchical clustering.
    
    Parameters:
    - adata: AnnData object containing morphological features in obs.
    - feature_list: List of feature columns in adata.obs to include; columns keep this order.
    - group_col: Column name in obs to group by (default: 'sub_mic').
    - cmap: Colormap passed to seaborn.heatmap.
    - output_pdf: Optional path; when given, the figure is also saved there as a PDF.
    
    Returns:
    - DataFrame of per-group feature means, z-scored per feature across groups,
      with rows in the clustered order that is plotted.
    """
    # Work on a copy: adding a column to a slice of adata.obs triggers
    # SettingWithCopyWarning and must not write through to adata.obs.
    feature_df = adata.obs[feature_list].copy()
    
    # Add the grouping column to the morphological features DataFrame
    feature_df[group_col] = adata.obs[group_col].values
    
    # Mean value of each feature within each group
    grouped_features = feature_df.groupby(group_col).mean()

    # Z-score each feature (column) across the groups
    zscore_features = grouped_features.apply(zscore, axis=0)

    # Only the rows are reordered; columns stay in feature_list order, so a
    # single row linkage is all that is needed.
    row_order = leaves_list(linkage(zscore_features, method='ward'))
    zscore_features = zscore_features.iloc[row_order]
    print(row_order)

    # Label with the column actually used for grouping
    group_label = group_col[:1].upper() + group_col[1:]

    # Create a heatmap without clustering the columns
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.heatmap(zscore_features, cmap=cmap, linewidths=0.5, annot=False, ax=ax)
    ax.set_title(f'Clustered Z-Score Heatmap of Morphological Features Grouped by {group_label}')
    ax.set_xlabel('Morphological Features')
    ax.set_ylabel(f'{group_label} Groups')

    fig.tight_layout()
    if output_pdf:
        fig.savefig(output_pdf, format='pdf', bbox_inches='tight')
    plt.show()

    return zscore_features

In [None]:
gene_list = new_corr_matrix.index.tolist()  # NOTE(review): gene_list is not passed below; the call uses the hand-ordered `ordered` list
# Z-scored mean expression of the `ordered` genes per 'sub_mic' group, saved as PDF
plot_clustered_zscore_heatmap(new_ad, ordered,'sub_mic','afmhot',output_pdf='figures/cluster_mic_by_correlating_genes.pdf')

In [None]:
gene_list = new_corr_matrix.index.tolist()  # NOTE(review): gene_list is not passed below; the call uses the hand-ordered `ordered` list
# Same gene heatmap, grouped by the 'ordered_morph' obs column instead of 'sub_mic'
plot_clustered_zscore_heatmap(new_ad, ordered,group_col='ordered_morph',cmap='GnBu',output_pdf='figures/cluster_morph_by_correlating_genes.pdf')

In [None]:
# Z-scored mean of the hand-ordered morphological features per 'sub_mic' group
plot_clustered_heatmap_with_features(new_ad, ord_feat, group_col='sub_mic',cmap='afmhot',output_pdf='figures/cluster_mic_by_morph_features.pdf')

In [None]:
# Grouped by 'ordered_morph'. NOTE: this call passes `features` (the list derived
# from the obs columns), not the hand-ordered `ord_feat` used in the previous cell.
plot_clustered_heatmap_with_features(new_ad, features, group_col='ordered_morph',cmap='GnBu',output_pdf='figures/cluster_morph_by_features.pdf')

In [None]:
from adjustText import adjust_text
import gseapy as gp
def plot_morphology_correlation(corr_matrix, morphology_feature, top_n=10, output_pdf=None):
    """
    Plot the correlation of genes with a specific morphological feature, labeling the top N genes,
    then run an Enrichr query on those top genes and plot the enriched terms.
    
    Parameters:
    - corr_matrix: DataFrame containing Spearman correlation values (genes as rows, morphological features as columns).
    - morphology_feature: The specific morphology feature to plot correlations for.
    - top_n: The number of top genes to label (default is 10). If it exceeds the number
      of genes in corr_matrix, every gene is labeled.
    - output_pdf: Optional path prefix; when given, the correlation plot is saved to
      f'{output_pdf}{morphology_feature}_correlation.pdf'.
    
    Returns:
    - List of the top N genes by correlation.
    
    Raises:
    - ValueError: if morphology_feature is not a column of corr_matrix.
    
    Side effects:
    - Shows the correlation plot.
    - Calls gp.enrichr with the top genes and writes the enrichment bar plot to
      f'figures/{morphology_feature}_compartment.pdf' regardless of output_pdf.
    """
    # Ensure the morphology feature exists in the correlation matrix
    if morphology_feature not in corr_matrix.columns:
        raise ValueError(f"{morphology_feature} not found in corr_matrix columns.")
    
    # Sort the genes by their correlation with the selected morphological feature
    sorted_genes = corr_matrix[morphology_feature].sort_values(ascending=False)
    
    # Get the top N genes
    top_genes = sorted_genes.head(top_n).index.tolist()
    
    # Plot the correlation values
    plt.figure(figsize=(12, 5))
    plt.plot(sorted_genes.values, color='black', linewidth=2)
    
    # Label the top N genes. The count is clamped so that a top_n larger than the
    # number of genes does not index past the end of sorted_genes.
    n_labelled = min(top_n, len(sorted_genes))
    texts = [
        plt.text(rank, sorted_genes.iloc[rank], sorted_genes.index[rank],
                 ha='center', va='bottom', fontsize=10, color='black')
        for rank in range(n_labelled)
    ]
    
    # Adjust the labels to avoid overlap (nothing to adjust when no label was drawn)
    if texts:
        adjust_text(texts, only_move={'points':'y', 'text':'y'}, arrowprops=dict(arrowstyle="->", color='gray', lw=0.5))
    
    # Customize the plot
    plt.xlabel("Genes")
    plt.ylabel(r"$r_s$")
    plt.title(f"Correlation with {morphology_feature}")
    plt.xticks([])  # Remove x-ticks
    
    # Show the plot
    plt.tight_layout()
    if output_pdf:
        plt.savefig(f'{output_pdf}{morphology_feature}_correlation.pdf',format='pdf')
    plt.show()
    
    # Enrichment of the top genes against the cellular-component gene-set library
    enrichr_results = gp.enrichr(
        gene_list=top_genes,
        gene_sets='GO_Cellular_Component_2023',  # Database for cellular compartments
        organism='Mouse',  # Organism (can be 'Mouse', 'Human', etc.)
    )

    gp.barplot(enrichr_results.res2d, title='GO Enrichment', cutoff=0.05, figsize=(6,5),ofname=f'figures/{morphology_feature}_compartment.pdf')
    
    # Return the top N genes
    return top_genes

In [None]:
# Genes ranked by correlation with 'Fractal Dimension' (top 10 labeled, plot saved
# under figures/), followed by the Enrichr query on those top genes.
plot_morphology_correlation(corr_matrix, 'Fractal Dimension', top_n=10,output_pdf='figures/')

In [None]:
# Genes ranked by correlation with 'Cell Solidity' (top 10 labeled, plot saved
# under figures/), followed by the Enrichr query on those top genes.
plot_morphology_correlation(corr_matrix, 'Cell Solidity', top_n=10,output_pdf='figures/')

In [None]:
# Genes ranked by correlation with 'Soma Area' (top 10 labeled, plot saved
# under figures/), followed by the Enrichr query on those top genes.
plot_morphology_correlation(corr_matrix, 'Soma Area', top_n=10,output_pdf='figures/')

In [None]:
# region dependent effect
def calculate_region_spearman_corr(adata, gene_columns, morpho_columns, region_column):
    """
    Calculate Spearman correlation between gene expression and morphological features
    for each brain region separately.
    
    Parameters:
    - adata: AnnData object containing gene expression (in adata.X) and morphological features (in adata.obs).
    - gene_columns: List of gene names in adata.var_names.
    - morpho_columns: List of morphological feature names in adata.obs. Repeated
      names are collapsed to their first occurrence.
    - region_column: Column in adata.obs that contains the brain region information (e.g., 'Brain_Region').
    
    Returns:
    - A dictionary where each key is a brain region (missing region labels are skipped)
      and the value is a float DataFrame (genes as rows, features as columns) with the
      Spearman correlation coefficients computed from that region's cells only.
    """
    gene_columns = list(gene_columns)
    # A repeated feature name would make obs[...] return several columns for one
    # label, so the pair could not be correlated; keep each name once, in order.
    morpho_columns = list(dict.fromkeys(morpho_columns))

    region_labels = adata.obs[region_column]

    # Dictionary to store correlation matrices for each region
    region_corr_matrices = {}

    # A missing label never compares equal to anything, so it would only yield
    # an empty subset; skip it.
    for region in region_labels.dropna().unique():
        # Subset the data for the current region
        region_idx = region_labels == region
        region_adata = adata[region_idx]

        # Expression for the region; X may be sparse or already dense
        expr = region_adata[:, gene_columns].X
        if hasattr(expr, "toarray"):
            expr = expr.toarray()
        expr = np.asarray(expr)

        # The same boolean mask selects the matching obs rows, so features and
        # expression stay aligned row for row. Extracted once per region.
        region_obs = adata.obs.loc[region_idx, morpho_columns]
        morpho_values = [region_obs[morpho].to_numpy() for morpho in morpho_columns]

        # Calculate Spearman correlation for each gene-morphological feature pair
        corr_values = np.full((len(gene_columns), len(morpho_columns)), np.nan, dtype=float)
        for gene_pos in range(len(gene_columns)):
            gene_values = expr[:, gene_pos]
            for morpho_pos, feature_values in enumerate(morpho_values):
                corr, _ = spearmanr(gene_values, feature_values)
                corr_values[gene_pos, morpho_pos] = corr

        # Store the correlation matrix for the region
        region_corr_matrices[region] = pd.DataFrame(corr_values, index=gene_columns, columns=morpho_columns)

    return region_corr_matrices
# One gene x feature correlation matrix per value of obs['Brain_Region'],
# restricted to the hand-ordered gene and feature lists.
region_corrs = calculate_region_spearman_corr(new_ad, gene_columns=ordered, morpho_columns=ord_feat, region_column='Brain_Region')

In [None]:
def top_genes_per_region(region_corrs, regions, morphologies, top_n=5):
    """
    Pick, per region, the genes whose correlation averaged over the chosen morphology features is highest.
    
    Parameters:
    - region_corrs: Dictionary mapping a brain region to its correlation matrix (genes as rows, features as columns).
    - regions: Regions to rank; each must be a key of region_corrs.
    - morphologies: Morphological feature names (columns) to average over.
    - top_n: How many genes to keep per region (default is 5).
    
    Returns:
    - A dictionary keyed by region. Each value is the frame obtained by resetting the index of
      the top-N averaged correlations: the former gene index becomes a column (named 'index'
      when the matrix index is unnamed) next to 'Avg_Correlation', highest first.
    
    Raises:
    - ValueError: when a requested region is missing from region_corrs.
    """
    ranked_by_region = {}

    for region in regions:
        # Guard clause: an unknown region aborts the whole ranking
        if region not in region_corrs:
            raise ValueError(f"Region '{region}' not found in correlation dictionary.")

        ranked_by_region[region] = (
            region_corrs[region]
            .loc[:, morphologies]            # keep only the requested features
            .mean(axis=1)                    # average per gene across those features
            .sort_values(ascending=False)
            .head(top_n)
            .reset_index()
            .rename(columns={0: 'Avg_Correlation'})
        )

    return ranked_by_region

def plot_gene_region_count(top_genes_by_region,output_pdf=None):
    """
    Draw a bar plot of how many regions list each gene among their top genes.
    
    Parameters:
    - top_genes_by_region: Dictionary keyed by region; each value is a DataFrame whose
      'index' column holds that region's top gene names.
    - output_pdf: Optional path; when given, the figure is also saved there as a PDF.
    
    Returns:
    - None. Shows a bar plot with genes sorted by descending region count.
    """
    # Pool the gene names of every region; a gene appears once per region that lists it
    pooled_genes = [
        gene
        for region_frame in top_genes_by_region.values()
        for gene in region_frame['index']
    ]

    # Occurrences per gene == number of regions in which the gene was selected
    tally = pd.Series(pooled_genes).value_counts().reset_index()
    tally.columns = ['Gene', 'Region_Count']
    tally = tally.sort_values(by='Region_Count', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.bar(tally['Gene'], tally['Region_Count'], color='skyblue')
    plt.xlabel('Genes')
    plt.ylabel('Number of Regions')
    plt.title('Number of Regions Each Gene Appears In (Top Genes)')
    plt.xticks(rotation=90, ha='right')
    plt.tight_layout()

    # Save before showing so the written file contains the drawn figure
    if output_pdf:
        plt.savefig(output_pdf,format='pdf')

    plt.show()

In [None]:
# NOTE(review): `regions` and `cluster_1_feat` are not defined in any cell shown in
# this notebook, so this cell raises NameError after Restart & Run All. They must
# be defined before this cell (`regions` presumably holds keys of region_corrs and
# `cluster_1_feat` a subset of the morphological feature names - confirm).
top_genes_by_region = top_genes_per_region(
    region_corrs=region_corrs,
    regions=regions,
    morphologies=cluster_1_feat,
    top_n=5
)

# Bar plot of how many regions each top gene appears in, saved as PDF
plot_gene_region_count(top_genes_by_region,'figures/region_count_branch_structures.pdf')

In [None]:
# Gene list submitted to the Enrichr query in the next cell.
# NOTE(review): presumably read off the region-count bar plot above - confirm how it was selected.
genes_branch = ['Slc1a2','Slc6a11','Gria2','Atp2a2','Gria1','Pcdh9','Apoe','Mobp','Mfn2','Pink1','Aqp4','Aldoc','Gfap','Ank2','Gnas','Csf3r']

In [None]:
# Enrichr query for the branch-associated genes against the cellular-component library
enrichr_results = gp.enrichr(
    gene_list=genes_branch, 
    gene_sets='GO_Cellular_Component_2023',  # Database for cellular compartments
    organism='Mouse',  # Organism (can be 'Mouse', 'Human', etc.)
    )

# Keep a handle on the results table.
# NOTE(review): df_results is not used by any later cell shown here.
df_results = enrichr_results.results
# Bar plot of the enriched terms (cutoff 0.05), written to figures/branches_compartment.pdf
gp.barplot(enrichr_results.res2d, title='GO Enrichment', cutoff=0.05, figsize=(6,5),ofname=f'figures/branches_compartment.pdf')