In [1]:
import anndata
import matplotlib.colors as clr
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import scanpy as sc
import sys
from collections import defaultdict
from sklearn.cluster import KMeans, MiniBatchKMeans

import warnings
# Silence Python warnings and scanpy logging so cell outputs stay compact.
warnings.filterwarnings("ignore")
sc.settings.verbosity = 0

# Seed the Python and NumPy global RNGs with one shared value.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it here does not
# change string hashing (and therefore set ordering) inside the already-running kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)

In [2]:
# Colors
# Discrete palette: a plain list of hex colours (despite the `_dct` suffix).
color_dct = [
    "#F56867", "#FEB915", "#C798EE", "#59BE86", "#7495D3",
    "#6D1A9C", "#15821E", "#3A84E6", "#997273", "#787878",
    "#DB4C6C", "#9E7A7A", "#554236", "#AF5F3C", "#93796C",
    "#F9BD3F", "#DAB370", "#877F6C", "#268785",
]

# Continuous colormap with 256 levels interpolated between the anchor colours below.
magma_anchor_colors = ["#000003", "#3B0F6F", "#8C2980", "#F66E5B", "#FD9F6C", "#FBFCBF"]
color_cts = clr.LinearSegmentedColormap.from_list("magma", magma_anchor_colors, N=256)

In [3]:
# Import model
# The project modules `utils` and `model` are expected in the parent of the working directory.
current_dir = os.getcwd()
model_dir = os.path.abspath(os.path.join(current_dir, ".."))

# Expose the project root only for the duration of the imports. The try/finally restores
# sys.path even when an import fails, so a failed cell does not leave the path modified.
sys.path.insert(0, model_dir)
try:
    from utils import *
    from model import mcDETECT, spot_granule
finally:
    # remove() drops the first occurrence, i.e. the entry inserted above. Being a statement
    # (not a trailing `sys.path.pop(0)` expression) it also avoids echoing the absolute
    # local path as cell output.
    if model_dir in sys.path:
        sys.path.remove(model_dir)

'/Users/chenyang/Library/CloudStorage/Dropbox/mcDETECT'

In [4]:
# -------------------- Read data -------------------- #

# Cells
adata = sc.read_h5ad("data/adata.h5ad")

# Neurons: cells annotated as glutamatergic or GABAergic
neuron_cell_types = ["Glutamatergic", "GABAergic"]
is_neuron = adata.obs["cell_type"].isin(neuron_cell_types)
adata_neuron = adata[is_neuron].copy()

# Transcripts
transcripts = pd.read_parquet("data/transcripts.parquet")

# Genes: first column of the gene table, as a plain list
genes = list(pd.read_csv("data/genes.csv").iloc[:, 0])

# Negative control markers: the "Gene" column, as a plain list
nc_genes = list(pd.read_csv("data/negative_controls.csv")["Gene"])

# Spots
spots = sc.read_h5ad("data/spots.h5ad")

In [5]:
# Markers
# Marker genes handed to mcDETECT as `syn_genes`.
syn_genes = [
    "Camk2a", "Cplx2", "Slc17a7", "Ddn", "Syp",
    "Map1a", "Shank1", "Syn1", "Gria1", "Gria2",
    "Cyfip2", "Vamp2", "Bsn", "Slc32a1", "Nfasc",
    "Syt1", "Tubb3", "Nav1", "Shank3", "Mapt",
]
len(syn_genes)

20

In [6]:
# Initialize mcDETECT
# All hyper-parameters are passed by keyword; their meaning is defined by model.mcDETECT.
mc = mcDETECT(
    type="MERSCOPE",
    transcripts=transcripts,
    syn_genes=syn_genes,
    nc_genes=nc_genes,
    eps=1.5,
    minspl=3,
    grid_len=1,
    cutoff_prob=0.95,
    alpha=10,
    low_bound=3,
    size_thr=4,
    in_nucleus_thr=(0.1, 0.9),
    l=1,
    rho=0.1,
    s=1,
    nc_top=15,
    nc_thr=0.1,
)

In [7]:
# Read granules
# Load the previously saved granule table (one row per detected granule) and preview it.
granules = pd.read_csv("output/all_granules.csv")
granules.head()

Unnamed: 0,sphere_x,sphere_y,sphere_z,layer_z,sphere_r,size,comp,in_nucleus,gene,brain_area,global_y_new,global_x_new
0,841.70602,2111.386,0.0,0,1.099598,5.0,2.0,0,Camk2a,FT,2225.470019,462.280283
1,839.2096,2117.4731,0.0,0,1.160026,7.0,3.0,0,Camk2a,FT,2231.031143,458.764775
2,839.05195,2129.9065,0.0,0,0.684313,3.0,1.0,0,Camk2a,FT,2243.248276,456.450483
3,842.6394,2129.926,0.0,0,1.56093,7.0,1.0,0,Camk2a,FT,2243.890434,459.980046
4,832.27235,2146.3137,0.0,0,0.829829,4.0,2.0,0,Camk2a,FT,2258.228949,446.9248


In [None]:
# Granule expression profile
# Profile every granule over the gene list `genes`, save the raw result, and display it.
# NOTE(review): the returned object exposes write_h5ad, so it is presumably an AnnData — confirm in model.py.
granule_adata = mc.profile(granules, genes = genes)
granule_adata.write_h5ad("output/granule_adata_raw.h5ad")
granule_adata

In [None]:
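# # t-SNE embedding (disabled: run once, together with the commented-out write cell below, to create output/granule_adata_tsne.h5ad, which a later cell loads)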
# sc.pp.normalize_total(granule_adata, target_sum=1e4)
# sc.pp.log1p(granule_adata)
# sc.tl.pca(granule_adata, n_comps=10, svd_solver="auto")
# sc.tl.tsne(granule_adata, n_pcs=10)

In [None]:
# granule_adata.write_h5ad("output/granule_adata_tsne.h5ad")

In [None]:
# Load the cached granule expression profile (output/granule_adata_tsne.h5ad)
granule_adata = sc.read_h5ad("output/granule_adata_tsne.h5ad")

# Mirror the global_y_new coordinate about `cutoff` (new value = cutoff - old value).
cutoff = 6250
flipped_y = cutoff - granule_adata.obs["global_y_new"]
granule_adata.obs["global_y_new"] = flipped_y

granule_adata

In [None]:
# Use self-defined genes (should be the same as 1_gene_ranking_raw)
genes_syn_pre = ["Bsn", "Gap43", "Nrxn1", "Slc17a6", "Slc17a7", "Slc32a1", "Snap25", "Stx1a", "Syn1", "Syp", "Syt1", "Vamp2", "Cplx2"]
genes_syn_post = ["Camk2a", "Dlg3", "Dlg4", "Gphn", "Gria1", "Gria2", "Homer1", "Homer2", "Nlgn1", "Nlgn2", "Nlgn3", "Shank1", "Shank3"]
genes_axon = ["Ank3", "Nav1", "Sptnb4", "Nfasc", "Mapt", "Tubb3"]
genes_dendrite = ["Actb", "Cyfip2", "Ddn", "Dlg4", "Map1a", "Map2"]

# Reference genes = union of the four lists, restricted to genes present in the panel.
# dict.fromkeys de-duplicates while keeping first-seen order, so the gene (column) order is
# identical in every kernel session. list(set(...)) ordering of strings depends on the
# per-process hash seed, which would make the feature order used for clustering
# non-reproducible.
panel_genes = set(genes)
ref_candidates = genes_syn_pre + genes_syn_post + genes_axon + genes_dendrite
ref_genes = [gene for gene in dict.fromkeys(ref_candidates) if gene in panel_genes]
len(ref_genes)

In [None]:
# Subset data
# Keep only the reference-gene columns; .copy() detaches the subset from granule_adata.
granule_adata_subset = granule_adata[:, ref_genes].copy()
granule_adata_subset

In [None]:
# K-Means clustering
# Cluster granules on the reference-gene matrix; a non-ndarray X is densified first.
data = granule_adata_subset.X.copy()
data = data if isinstance(data, np.ndarray) else data.toarray()

n_clusters = 15
# kmeans = KMeans(n_clusters = n_clusters, random_state = 42, n_init = 25)
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=5000, random_state=123).fit(data)

# Store the labels as an ordered categorical of strings "0" .. str(n_clusters - 1).
desired_order = [str(cluster_id) for cluster_id in range(n_clusters)]
granule_adata.obs["granule_subtype_kmeans"] = pd.Categorical(
    kmeans.labels_.astype(str),
    categories=desired_order,
    ordered=True,
)

In [None]:
# Reference gene expression
# For every K-Means cluster, accumulate the expression of each marker group and convert the
# four group totals into fractions that sum to 1 per cluster.
marker_genes = {"pre-syn": genes_syn_pre, "post-syn": genes_syn_post, "dendrites": genes_dendrite, "axons": genes_axon}

# Densify X when it is not an ndarray, matching the handling in the K-Means cell.
expression_matrix = granule_adata.X
if not isinstance(expression_matrix, np.ndarray):
    expression_matrix = expression_matrix.toarray()
expression_data = pd.DataFrame(expression_matrix, columns=granule_adata.var_names, index=granule_adata.obs_names)
kmeans_labels = granule_adata.obs["granule_subtype_kmeans"]
cluster_marker_avg = defaultdict(lambda: defaultdict(int))

# The loop variables are deliberately NOT called `genes`: that name holds the gene panel
# loaded in the data cell and must not be overwritten by this loop.
for marker_group, group_genes in marker_genes.items():
    # The divisor is the full list length, so genes missing from the data count as zero.
    num_genes = len(group_genes)
    for gene in group_genes:
        if gene in expression_data.columns:
            # observed=False keeps every cluster category in the result (the current pandas
            # default, stated explicitly because that default is deprecated).
            gene_expression = expression_data[gene].groupby(kmeans_labels, observed=False).sum()
            for kmeans_cluster, expr_sum in gene_expression.items():
                cluster_marker_avg[kmeans_cluster][marker_group] += expr_sum / num_genes

cluster_percentages = {}
for kmeans_cluster, group_totals in cluster_marker_avg.items():
    total_expression = sum(group_totals.values())
    if total_expression > 0:
        cluster_percentages[kmeans_cluster] = {group: (total / total_expression) for group, total in group_totals.items()}
    else:
        # No marker expression at all in this cluster: report zeros instead of dividing by 0.
        cluster_percentages[kmeans_cluster] = {group: 0 for group in group_totals.keys()}

In [None]:
# Stacked bar chart: marker-group fractions per K-Means cluster, clusters in numeric order.
cluster_labels = list(cluster_percentages.keys())
marker_clusters = list(marker_genes.keys())

# Rows = clusters (dictionary order), columns = marker groups.
plot_data = np.array([
    [cluster_percentages[label].get(group, 0) for group in marker_clusters]
    for label in cluster_labels
])

# Cluster labels are numeric strings; sort them by integer value.
cluster_labels_int = [int(label) for label in cluster_labels]
sorted_indices = np.argsort(cluster_labels_int)
cluster_labels_sorted = [cluster_labels[idx] for idx in sorted_indices]
plot_data_sorted = plot_data[sorted_indices]

fig, ax = plt.subplots(figsize=(8, 6))
bottom = np.zeros(len(cluster_labels_sorted))

# Stack one bar segment per marker group on top of the running total.
for group_idx, marker_cluster in enumerate(marker_clusters):
    segment = plot_data_sorted[:, group_idx]
    ax.bar(cluster_labels_sorted, segment, bottom=bottom, label=marker_cluster)
    bottom += segment

ax.set_xlabel("mcDETECT Clusters")
ax.set_ylabel("Percentage of Marker Genes")
ax.grid(False)
ax.legend(title="Type", bbox_to_anchor=(1.25, 1), loc="upper right")
plt.tight_layout()
plt.show()

In [None]:
# Plot granule subtype heatmap
# Row order for the heatmap, grouped as pre-synaptic, post-synaptic, dendritic, axonal.
# NOTE(review): compared with the reference lists, "Snap25", "Sptnb4" and "Actb" are absent
# here and "Dlg4" appears twice — confirm this is intentional.
ref_genes_sorted = ["Bsn", "Gap43", "Nrxn1", "Slc17a6", "Slc17a7", "Slc32a1", "Stx1a", "Syn1", "Syp", "Syt1", "Vamp2", "Cplx2", "Camk2a", "Dlg3", "Dlg4", "Gphn", "Gria1", "Gria2", "Homer1", "Homer2", "Nlgn1", "Nlgn2", "Nlgn3", "Shank1", "Shank3", "Cyfip2", "Ddn", "Dlg4", "Map1a", "Map2", "Ank3", "Nav1", "Nfasc", "Mapt", "Tubb3"]
# ref_genes was intersected with the gene panel, so the subset may lack some of the genes
# listed above; drop those to avoid a KeyError in sc.pl.heatmap.
ref_genes_sorted = [gene for gene in ref_genes_sorted if gene in granule_adata_subset.var_names]

# Copy the cluster labels onto the subset (positionally) with numerically sorted categories.
granule_adata_subset.obs["granule_subtype_kmeans"] = pd.Categorical(granule_adata.obs["granule_subtype_kmeans"], categories=cluster_labels_sorted, ordered=True)

ax = sc.pl.heatmap(granule_adata_subset, var_names = ref_genes_sorted, groupby = "granule_subtype_kmeans", cmap = "Reds", standard_scale = "var", dendrogram = False, swap_axes = True, show = False, figsize = (10, 6))
plt.show()

In [None]:
# Manual subtyping
# Hand-curated assignment of K-Means cluster ids (as strings) to granule subtypes.
pre_list = ["4", "9"]
post_list = ["0", "1", "3"]
den_list = []
axon_list = []
pre_post_list = ["8"]
pre_den_list = ["13"]
post_den_list = ["6", "11", "12", "14"]
pre_post_den_list = ["5", "7", "10"]
others_list = ["2"]

subtype_dict = {"pre-syn": pre_list, "post-syn": post_list, "pre & post": pre_post_list, "pre & den": pre_den_list, "post & den": post_den_list, "pre & post & den": pre_post_den_list, "dendrites": den_list, "axons": axon_list, "others": others_list}
subtype_order = ["pre-syn", "post-syn", "pre & post", "pre & den", "post & den", "pre & post & den", "dendrites", "axons", "others"]

# Invert the dictionary (cluster id -> subtype) and map it over the cluster labels in one
# step. Clusters not listed anywhere stay NaN. This replaces creating a float NaN column and
# writing strings into it, which pandas deprecates as an incompatible-dtype assignment.
cluster_to_subtype = {cluster_id: subtype for subtype, cluster_ids in subtype_dict.items() for cluster_id in cluster_ids}
subtype_labels = granule_adata.obs["granule_subtype_kmeans"].astype(str).map(cluster_to_subtype).to_numpy()

# granules receives the labels positionally and keeps the full category list.
# NOTE(review): assumes granules rows are in the same order as granule_adata.obs — confirm.
granules["granule_subtype"] = pd.Categorical(subtype_labels, categories=subtype_order, ordered=True)
# The AnnData column additionally drops categories that no granule uses.
granule_adata.obs["granule_subtype"] = pd.Categorical(subtype_labels, categories=subtype_order, ordered=True).remove_unused_categories()

In [None]:
# Share of granules per subtype, in percent, rounded to two decimals.
subtype_share = granule_adata.obs["granule_subtype"].value_counts(normalize=True).sort_index()
pd.DataFrame({"Proportion": (subtype_share * 100).round(2)})

In [None]:
# Attach a colour palette to each categorical column that is plotted below.
for palette_key, palette_name in (("granule_subtype", "tab10"), ("granule_subtype_kmeans", "tab20")):
    granule_adata = assign_palette_to_adata(granule_adata, obs_key=palette_key, cmap_name=palette_name)

In [None]:
# Plot granule subtypes t-SNE
# One figure per colouring; both share the same bare-axes styling and are saved, not shown.
sc.set_figure_params(figsize=(8, 8))
frame_width = 1.5
tsne_panels = [
    ("granule_subtype_kmeans", "output/granule_subtype_kmeans_tsne.jpeg"),
    ("granule_subtype", "output/granule_subtype_tsne.jpeg"),
]

for color_key, out_path in tsne_panels:
    ax = sc.pl.tsne(granule_adata, color=color_key, size=2, show=False)
    # Strip grid, ticks, labels and title; keep only a thin frame.
    ax.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title("")
    for spine in ax.spines.values():
        spine.set_linewidth(frame_width)
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.close()

In [None]:
# Plot granule subtypes scatterplot
# Spatial scatter (x = global_y_new, y = global_x_new), once per colouring; saved, not shown.
frame_width = 1.5
scatter_panels = [
    ("granule_subtype_kmeans", "output/granule_subtype_kmeans_scatter.jpeg"),
    ("granule_subtype", "output/granule_subtype_scatter.jpeg"),
]

for color_key, out_path in scatter_panels:
    sc.set_figure_params(figsize=(6, 9))
    ax = sc.pl.scatter(granule_adata, x="global_y_new", y="global_x_new", color=color_key, size=1, show=False)
    # Strip grid, ticks, labels and title; keep only a thin frame.
    ax.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title("")
    for spine in ax.spines.values():
        spine.set_linewidth(frame_width)
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.close()

In [None]:
# Plot granule subtype heatmap
# Order the cluster columns by manual subtype: concatenate the per-subtype cluster lists.
subtype_cluster_lists = (pre_list, post_list, pre_post_list, pre_den_list, post_den_list, pre_post_den_list, den_list, axon_list, others_list)
group_order = [cluster_id for cluster_list in subtype_cluster_lists for cluster_id in cluster_list]
granule_adata_subset.obs["granule_subtype_kmeans"] = pd.Categorical(
    granule_adata_subset.obs["granule_subtype_kmeans"],
    categories=group_order,
    ordered=True,
)

ax = sc.pl.heatmap(
    granule_adata_subset,
    var_names=ref_genes_sorted,
    groupby="granule_subtype_kmeans",
    cmap="Reds",
    standard_scale="var",
    dendrogram=False,
    swap_axes=True,
    show=False,
    figsize=(10, 7),
)
plt.savefig("output/granule_subtype_kmeans_heatmap.jpeg", dpi=300, bbox_inches="tight")
plt.close()

In [None]:
# Save the annotated granule AnnData (cluster and subtype columns included) for later reuse.
granule_adata.write_h5ad("output/granule_adata.h5ad")

In [8]:
# Reload the annotated granule AnnData from disk, so the cells above need not be re-run.
granule_adata = sc.read_h5ad("output/granule_adata.h5ad")

In [9]:
# Copy the subtype labels onto the granule table (positionally), with the full category list.
# NOTE(review): assumes granules rows are in the same order as granule_adata.obs — confirm.
subtype_categories = ["pre-syn", "post-syn", "pre & post", "pre & den", "post & den", "pre & post & den", "dendrites", "axons", "others"]
granules["granule_subtype"] = pd.Categorical(
    granule_adata.obs["granule_subtype"],
    categories=subtype_categories,
    ordered=True,
)

In [10]:
# Synapses
# Granules whose subtype is one of `syn_types` are treated as synapses and exported.
syn_types = ["pre-syn", "post-syn", "pre & post", "others"]
is_synapse = granules["granule_subtype"].isin(syn_types)
synapses = granules[is_synapse]
synapses.to_csv("output/synapses.csv", index=False)
synapses.shape

(613140, 13)

In [None]:
# Plot synapses
# Spatial scatter of the synapse granules, coloured by brain area; saved, not shown.
is_synapse_obs = granule_adata.obs["granule_subtype"].isin(syn_types)
syn_adata = granule_adata[is_synapse_obs].copy()
brain_area_order = ["CTXsp", "FT", "HPF-CA", "HPF-DG", "HPF-SR", "Isocortex", "MB", "OLF", "TH"]
syn_adata.obs["brain_area"] = pd.Categorical(syn_adata.obs["brain_area"], categories=brain_area_order, ordered=True)

sc.set_figure_params(scanpy=True, figsize=(6, 9))
ax = sc.pl.scatter(
    syn_adata,
    alpha=1,
    x="global_y_new",
    y="global_x_new",
    color="brain_area",
    palette=color_dct,
    size=1,
    title=" ",
    show=False,
)
# Strip grid, ticks and labels; keep only a thin frame.
ax.grid(False)
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel("")
ax.set_ylabel("")
frame_width = 1.5
for spine in ax.spines.values():
    spine.set_linewidth(frame_width)
plt.savefig("output/synapses.jpeg", dpi=300, bbox_inches="tight")
plt.close()

In [None]:
# Subtype count in brain regions
# Count pre- and post-synaptic granules per brain area and export the table.
area_list = ["Isocortex", "OLF", "HPF-CA", "HPF-DG", "HPF-SR", "CTXsp", "TH", "MB"]
synapse_subtypes = ["pre-syn", "post-syn"]

in_selected_area = synapses["brain_area"].isin(area_list)
is_selected_subtype = synapses["granule_subtype"].isin(synapse_subtypes)
filtered_df = synapses[in_selected_area & is_selected_subtype]

# granule_subtype is categorical. observed=False is passed explicitly so every category is
# emitted (zero counts included) regardless of the pandas version's default; the implicit
# default is deprecated and scheduled to change.
count_df = filtered_df.groupby(["brain_area", "granule_subtype"], observed=False).size().reset_index(name="count")
# Drop the rows of the unused categories that observed=False produced.
count_df = count_df[count_df["granule_subtype"].isin(synapse_subtypes)]

count_df.to_csv("output/synapse_subtype_by_brain_area.csv", index=False)

In [11]:
# Spot-level synapse metadata
# Combine the synapse table with the spot grid; the displayed result carries per-spot
# gnl_count / gnl_radius / gnl_size / gnl_score columns in .obs.
spot_synapse_all = spot_granule(granule = synapses, spot = spots)
spot_synapse_all

AnnData object with n_obs × n_vars = 17667 × 290
    obs: 'spot_id', 'global_x', 'global_y', 'global_y_new', 'global_x_new', 'region_labels', 'brain_area', 'indicator', 'gnl_count', 'gnl_radius', 'gnl_size', 'gnl_score'
    var: 'genes'

In [12]:
# Visualize spot-level synapse count
# log2(count + 1) maps zero-count spots to exactly 0, which the filter below removes.
spot_synapse_all.obs["log_gnl_count"] = np.log2(spot_synapse_all.obs["gnl_count"].to_numpy() + 1)
nonzero_spots = spot_synapse_all[spot_synapse_all.obs["log_gnl_count"] != 0].copy()

sc.set_figure_params(scanpy=True, figsize=(6, 9))
ax = sc.pl.scatter(
    nonzero_spots,
    alpha=1,
    x="global_y_new",
    y="global_x_new",
    color="log_gnl_count",
    color_map=color_cts,
    title=" ",
    size=55,
    show=False,
)
ax.grid(False)
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel("")
ax.set_ylabel("")
# Zero line width hides the frame.
for spine in ax.spines.values():
    spine.set_linewidth(0)
plt.savefig("output/synapse_density.jpeg", dpi=300, bbox_inches="tight")
plt.close()

In [None]:
# Split the synapse table into independent pre- and post-synaptic copies.
synapse_subtype = synapses["granule_subtype"]
synapses_pre = synapses[synapse_subtype == "pre-syn"].copy()
synapses_post = synapses[synapse_subtype == "post-syn"].copy()
synapses_pre.shape, synapses_post.shape

In [None]:
# Spot-level aggregation, separately for pre- and post-synaptic granules.
spot_synapse_pre = spot_granule(granule=synapses_pre, spot=spots).copy()
spot_synapse_post = spot_granule(granule=synapses_post, spot=spots).copy()

# Shift the post-synaptic spots by 6000 along global_y_new before stacking both sets.
# NOTE(review): presumably so the two maps sit side by side in one figure — confirm.
spot_synapse_post.obs["global_y_new"] = spot_synapse_post.obs["global_y_new"] + 6000
spot_synapse_pre_post = anndata.concat(
    [spot_synapse_pre, spot_synapse_post],
    axis=0,
    merge="same",
)

In [None]:
# Pre-/post-synaptic spot density on a black background; saved, not shown.
spot_synapse_pre_post.obs["log_gnl_count"] = [np.log2(i + 1) for i in list(spot_synapse_pre_post.obs["gnl_count"])]
df = spot_synapse_pre_post.obs.copy()

fig, ax = plt.subplots(figsize = (12, 9))
ax.set_facecolor("black")

# The scatter handle must NOT be bound to `sc`: that name is the scanpy module alias used
# throughout the notebook, and overwriting it breaks every later (or re-run) `sc.` call.
density_points = ax.scatter(df["global_y_new"], df["global_x_new"], c = df["log_gnl_count"], cmap = color_cts, s = 4, alpha = 1)
ax.grid(False)
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel(" ")
ax.set_ylabel(" ")
for spine in ax.spines.values():
    spine.set_linewidth(1.5)
plt.savefig("output/synapse_density_pre_post.jpeg", dpi=300, bbox_inches="tight", facecolor="black")
plt.close()

In [None]:
# Synapse density
# Per-brain-area densities from three sources: mcDETECT granule counts, raw marker
# transcript counts, and a hard-coded ground-truth vector.


def _mean_per_area(values, areas, area_order):
    """Return, for each area in `area_order`, the mean of `values` over spots labelled with it."""
    means = []
    for area in area_order:
        in_area = values[areas == area]
        means.append(np.sum(in_area) / len(in_area))
    return means


area_list = ["Isocortex", "OLF", "HPF-CA", "HPF-DG", "HPF-SR", "CTXsp", "TH", "MB", "FT"]
spot_brain_area = np.array(spot_synapse_all.obs["brain_area"])

# mcDETECT: mean granule count per spot
spot_gnl_count = np.array(spot_synapse_all.obs["gnl_count"])
density1 = _mean_per_area(spot_gnl_count, spot_brain_area, area_list)
density1_scaled = scale(density1)

# Transcript baseline: mean summed expression of the marker genes per spot.
# X is densified when it is not an ndarray (np.array() on a sparse matrix does not yield a
# numeric 2-D array), matching the handling in the K-Means cell.
marker_expr = spot_synapse_all[:, spot_synapse_all.var["genes"].isin(syn_genes)].X
if not isinstance(marker_expr, np.ndarray):
    marker_expr = marker_expr.toarray()
spot_marker_total = np.array(marker_expr).sum(axis = 1)
density2 = _mean_per_area(spot_marker_total, spot_brain_area, area_list)
density2_scaled = scale(density2)

# Ground truth, in area_list order.
# NOTE(review): 99 at the HPF-SR position looks like a placeholder; that position is dropped
# before the correlation is computed — confirm.
density4 = [1.71603565, 1.964351308, 2.052720791, 1.139278326, 99, 1.678527951, 1.082904337, 0.444031185, 0.0199885]
density4_scaled = scale(density4)

density_comparison = pd.DataFrame({"area_list": area_list, "transcript": density2, "mcDETECT": density1, "ground_truth": density4})
density_comparison.to_csv("output/synapse_density_comparison.csv", index = False)

In [None]:
# Correlation with ground truth
# Weight every brain area by its number of spots.
weights = [np.sum(spot_synapse_all.obs["brain_area"] == area) for area in area_list]

# HPF-SR (index 4 in area_list) is excluded from the comparison.
# NOTE(review): presumably because its ground-truth entry is a placeholder — confirm.
excluded_idx = area_list.index("HPF-SR")

# The trimmed vectors are rebuilt from the unscaled densities on every run. Deleting in place
# from the already-trimmed *_scaled arrays would remove one more area each time the cell is
# re-executed.
density1_scaled = np.delete(scale(density1), excluded_idx)
density2_scaled = np.delete(scale(density2), excluded_idx)
density4_scaled = np.delete(scale(density4), excluded_idx)
weights.pop(excluded_idx)

print(weighted_corr(density1_scaled, density4_scaled, weights), weighted_corr(density2_scaled, density4_scaled, weights))
print(weighted_spearmanr(density1_scaled, density4_scaled, weights), weighted_spearmanr(density2_scaled, density4_scaled, weights))