In [None]:
from skimage import io
import numpy as np
import os
import scanpy as sc
import squidpy as sq
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import os
import gzip
import numpy as np
import celltypist
from celltypist import models


# Global matplotlib defaults for every figure in this notebook:
# higher-resolution output, serif typeface, and a uniform 12 pt text size.
plt.rcParams.update(
    {
        'figure.dpi': 150,
        'font.family': ['serif'],
        'font.size': 12,
        'axes.labelsize': 12,
        'axes.titlesize': 12,
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
    }
)

In [None]:
def calcQCmat(andata):
    """Annotate QC gene groups on ``andata`` and compute QC metrics in place.

    Adds three boolean columns to ``andata.var`` -- ``"mt"``, ``"ribo"`` and
    ``"hb"`` -- and then calls ``sc.pp.calculate_qc_metrics`` with those
    columns as ``qc_vars``.

    Gene symbols are matched case-insensitively. The original patterns mixed
    a lower-case ``"mt-"`` prefix with upper-case ``"RPS"``/``"RPL"``/``"HB"``
    patterns, so on a dataset whose symbols are title-case (e.g. ``Rps3``,
    ``Hba-a1``) the ribosomal and hemoglobin flags were never set.

    Parameters
    ----------
    andata
        Object exposing ``var_names``, ``var`` and ``var_names_make_unique``
        (an AnnData as returned by ``sc.read_visium``). Modified in place.

    Returns
    -------
    The same ``andata`` object that was passed in.
    """
    andata.var_names_make_unique()
    # Normalize case on the symbol side so one pattern covers both
    # upper-case and title-case gene naming conventions.
    andata.var["mt"] = andata.var_names.str.lower().str.startswith("mt-")
    andata.var["ribo"] = andata.var_names.str.upper().str.startswith(("RPS", "RPL"))
    # Same regex as before, now case-insensitive: "HB" followed by any
    # character other than "(", "P" or ")".
    andata.var["hb"] = andata.var_names.str.contains("^HB[^(P)]", case=False)
    sc.pp.calculate_qc_metrics(andata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
    return andata
# Load the 16 um binned Visium HD output, annotate QC metrics, and drop
# low-content observations.
path_016 = "/data/kanferg/Sptial_Omics/playGround/Data/Visium_HD_Mouse_Brain_square_example/square_016um"
andata016_ = sc.read_visium(path=path_016)
# calcQCmat modifies its argument and returns it, so `andata016` and
# `andata016_` refer to the same object.
andata016 = calcQCmat(andata016_)
# Read the shape from the AnnData object directly; converting the sparse
# matrix to dense just to print its dimensions allocates the full matrix.
print(f"{andata016.shape}")
# Keep observations with at least 50 counts and at least 80 detected genes.
sc.pp.filter_cells(andata016, min_counts=50)
sc.pp.filter_cells(andata016, min_genes=80)

In [None]:
plt.rcParams['font.size'] = 12
# Per-gene values of the `n_cells_by_counts` QC column, plotted twice:
# the full distribution (left) and only the genes below 50 (right).
n_cells_per_gene = andata016.var['n_cells_by_counts']
fig, axs = plt.subplots(1, 2, figsize=(7, 3))
axs[0].set_title('Number of Cells with Detected \n Gene Expression')
# The right panel is a zoom on the low end; give it its own title so the two
# panels are distinguishable (both previously carried the same text).
axs[1].set_title('Zoom: Genes Detected \n in Fewer Than 50 Cells')
sns.histplot(n_cells_per_gene, kde=False, ax=axs[0], bins=50)
sns.histplot(n_cells_per_gene[n_cells_per_gene < 50], kde=False, ax=axs[1])
plt.subplots_adjust(wspace=0.5)
plt.suptitle("Number of Cells with Detected Gene Expression", y=1.10)

In [None]:
# Keep only genes whose `n_cells_by_counts` exceeds 50, then show the
# resulting object summary.
gene_is_kept = andata016.var["n_cells_by_counts"] > 50
andata016 = andata016[:, gene_is_kept]
print(andata016)

In [None]:
# Report the matrix dimensions before and after removing observations with
# 20% or more mitochondrial counts. `.shape` is read from the AnnData object
# so the sparse matrix is not converted to dense just for printing.
print(f"{andata016.shape}")
# `.copy()` turns the subset into a standalone object rather than a view,
# so the in-place preprocessing steps that follow operate on real data.
andata016 = andata016[andata016.obs["pct_counts_mt"] < 20].copy()
print(f"{andata016.shape}")

In [None]:
# Library-size normalization followed by log1p transform.
sc.pp.normalize_total(andata016)
sc.pp.log1p(andata016)
# Freeze the log-normalized values in `.raw` BEFORE scaling. `X` is z-scored
# below, and scanpy's differential-expression and dot-plot functions default
# to `.raw` when it is present; without this they would operate on the
# scaled matrix instead of log-normalized expression.
andata016.raw = andata016
# Dense snapshot of the log-normalized matrix, kept under its original name
# in case other cells use it. NOTE(review): nothing in the visible notebook
# reads `log1p_data`; consider dropping it to save memory.
log1p_data = andata016.X.todense()
sc.pp.highly_variable_genes(andata016)
sc.pp.scale(andata016)
# Store the spatial coordinates as float64.
andata016.obsm['spatial'] = np.array(andata016.obsm['spatial'], dtype=np.float64)
# Dimensionality reduction, neighbor graph, embedding, and Leiden clustering
# written to obs["clusters"].
sc.pp.pca(andata016, n_comps=20)
sc.pp.neighbors(andata016)
sc.tl.umap(andata016)
sc.tl.leiden(andata016, key_added="clusters", flavor="igraph", directed=False, n_iterations=2)

In [None]:
from matplotlib.colors import ListedColormap

# Concatenate three 20-colour qualitative palettes into one long colour list
# so that a large number of clusters can each get their own colour.
palette = [
    colour
    for palette_name in ("tab20", "tab20b", "tab20c")
    for colour in sns.color_palette(palette_name)
]

# Wrap the colour list in a colormap object for the plotting call.
listed_cmap = ListedColormap(palette)

# One small panel for the spatial map.
fig, ax = plt.subplots(figsize=(4, 3))

# Colour each observation by its "clusters" label at its spatial position.
sq.pl.spatial_scatter(andata016, color="clusters", palette=listed_cmap, ax=ax)

# From cluster differentially expressed genes to cluster annotation

In [None]:
# Rank genes per "clusters" group with the Wilcoxon method; the results are
# stored under the "dea_clusters" key.
sc.tl.rank_genes_groups(
    andata016,
    groupby="clusters",
    method="wilcoxon",
    key_added="dea_clusters",
)

In [None]:
# Dot plot of the top 5 ranked genes per cluster from the "dea_clusters"
# results, with each gene's values standardized across groups.
dotplot_options = dict(
    groupby="clusters",
    standard_scale="var",
    n_genes=5,
    key="dea_clusters",
)
sc.pl.rank_genes_groups_dotplot(andata016, **dotplot_options)

## Refining Differential Gene Expression Analysis with Filter Parameters 

In Scanpy, the sc.tl.filter_rank_genes_groups function is used to filter the results of differential expression analysis. The parameters min_in_group_fraction and max_out_group_fraction help refine the selection of marker genes based on their expression patterns across groups.

Here's what these parameters mean:

min_in_group_fraction: This parameter specifies the minimum fraction of cells within a group (cluster) that must express a gene for it to be considered as a marker gene for that group. For example, if min_in_group_fraction=0.2, it means that at least 20% of the cells in the group must express the gene for it to be considered.

max_out_group_fraction: This parameter specifies the maximum fraction of cells outside the group that can express the gene for it to be considered a marker gene. For example, if max_out_group_fraction=0.2, it means that no more than 20% of the cells outside the group can express the gene for it to be considered specific to the group.

The same idea, phrased as a short summary suitable for a presentation slide:

"Refining Differential Gene Expression Analysis with Filter Parameters"

When performing differential gene expression analysis, we can apply additional filters to refine our selection of marker genes:

- `min_in_group_fraction`: Ensures that a gene is expressed in a minimum fraction of cells within the target cluster (e.g., at least 20%).
- `max_out_group_fraction`: Limits the fraction of cells outside the target cluster that can express the gene (e.g., no more than 20%).

This approach helps in identifying more specific and relevant marker genes for each cluster, leading to more accurate biological insights.

In [None]:
# Filter the "dea_clusters" ranking by in-group / out-group expression
# fraction and store the filtered ranking under "dea_clusters_filtered".
filter_options = dict(
    min_in_group_fraction=0.01,
    max_out_group_fraction=0.01,
    key="dea_clusters",
    key_added="dea_clusters_filtered",
)
sc.tl.filter_rank_genes_groups(andata016, **filter_options)

In [None]:
# Display the "logfoldchanges" entry of the unfiltered "dea_clusters" results.
andata016.uns["dea_clusters"]["logfoldchanges"]

In [None]:
# Same dot plot as for the unfiltered ranking, but reading the genes from the
# filtered results ("dea_clusters_filtered").
filtered_dotplot_options = dict(
    groupby="clusters",
    standard_scale="var",
    n_genes=5,
    key="dea_clusters_filtered",
)
sc.pl.rank_genes_groups_dotplot(andata016, **filtered_dotplot_options)

## Optimizing Cluster Granularity 

To reduce the number of clusters in your Leiden clustering results, you can adjust the resolution parameter in the sc.tl.leiden function. The resolution parameter controls the granularity of the clustering: higher values lead to more clusters, and lower values lead to fewer clusters.

You can experiment with different values of the resolution parameter to find the optimal number of clusters for your analysis. For example, you might try resolution=0.1 for fewer clusters or resolution=1.0 for more clusters. Adjust the resolution until you achieve a satisfactory number of clusters.

In [None]:
from matplotlib.colors import ListedColormap

# Build one long colour list out of three 20-colour qualitative palettes.
palette = sum(
    (list(sns.color_palette(palette_name)) for palette_name in ("tab20", "tab20b", "tab20c")),
    [],
)

# Colormap object handed to the spatial plots in the cells below.
listed_cmap = ListedColormap(palette)


In [None]:
sns.set_context("paper", font_scale=1)

# Candidate Leiden resolutions; this cell runs the first (lowest) one.
resolutions = [0.1, 0.5, 1.0, 2.0]
res = resolutions[0]
cluster_key = f'clusters_res_{res}'

fig, ax = plt.subplots(figsize=(4, 3))

# Cluster at this resolution, store the labels under `cluster_key`, and draw
# them on the spatial coordinates.
sc.tl.leiden(
    andata016,
    resolution=res,
    key_added=cluster_key,
    flavor="igraph",
    directed=False,
    n_iterations=2,
)
sq.pl.spatial_scatter(andata016, color=cluster_key, palette=listed_cmap, ax=ax)
ax.set_title(f'Leiden Clustering (resolution={res})')


In [None]:
sns.set_context("paper", font_scale=1)

# Candidate Leiden resolutions; this cell runs the second one.
resolutions = [0.1, 0.5, 1.0, 2.0]
res = resolutions[1]
cluster_key = f'clusters_res_{res}'

fig, ax = plt.subplots(figsize=(4, 3))

# Cluster at this resolution, store the labels under `cluster_key`, and draw
# them on the spatial coordinates.
sc.tl.leiden(
    andata016,
    resolution=res,
    key_added=cluster_key,
    flavor="igraph",
    directed=False,
    n_iterations=2,
)
sq.pl.spatial_scatter(andata016, color=cluster_key, palette=listed_cmap, ax=ax)
ax.set_title(f'Leiden Clustering (resolution={res})')


In [None]:
sns.set_context("paper", font_scale=1)

# Candidate Leiden resolutions; this cell runs the third one.
resolutions = [0.1, 0.5, 1.0, 2.0]
res = resolutions[2]
cluster_key = f'clusters_res_{res}'

fig, ax = plt.subplots(figsize=(4, 3))

# Cluster at this resolution, store the labels under `cluster_key`, and draw
# them on the spatial coordinates.
sc.tl.leiden(
    andata016,
    resolution=res,
    key_added=cluster_key,
    flavor="igraph",
    directed=False,
    n_iterations=2,
)
sq.pl.spatial_scatter(andata016, color=cluster_key, palette=listed_cmap, ax=ax)
ax.set_title(f'Leiden Clustering (resolution={res})')

The selected Leiden resolution is 0.5 (i.e. `resolutions[1]`).

In [None]:
sns.set_context("paper", font_scale=1)

# Re-run the clustering at the selected resolution (second entry in the list).
resolutions = [0.1, 0.5, 1.0, 2.0]
res = resolutions[1]
cluster_key = f'clusters_res_{res}'

fig, ax = plt.subplots(figsize=(4, 3))

# Store the labels under `cluster_key` and draw them on the spatial
# coordinates.
sc.tl.leiden(
    andata016,
    resolution=res,
    key_added=cluster_key,
    flavor="igraph",
    directed=False,
    n_iterations=2,
)
sq.pl.spatial_scatter(andata016, color=cluster_key, palette=listed_cmap, ax=ax)
ax.set_title(f'Leiden Clustering (resolution={res})')


To incorporate spatial information into the clustering analysis and test different parameters for building spatial neighbors, use Squidpy's `sq.gr.spatial_neighbors` function. This makes it possible to leverage spatial proximity in addition to gene expression data.



In [None]:
# Dot plot of the top 2 genes per group from the "dea_clusters" results, with
# expression summarized over the resolution-0.5 cluster labels.
# NOTE(review): "dea_clusters" was computed with groupby="clusters", not
# "clusters_res_0.5" -- confirm that mixing the two groupings is intended.
res05_dotplot_options = dict(
    groupby="clusters_res_0.5",
    standard_scale="var",
    n_genes=2,
    key="dea_clusters",
)
sc.pl.rank_genes_groups_dotplot(andata016, **res05_dotplot_options)

In [None]:
# Display the spatial coordinate array stored under obsm["spatial"].
andata016.obsm["spatial"]

In [None]:
# Build the spatial neighbor graph on grid coordinates; results are keyed by
# the 'spatial_neighbors' prefix.
sq.gr.spatial_neighbors(
    andata016,
    coord_type="grid",
    key_added='spatial_neighbors',
)

In [None]:
# Peek at the spatial cluster labels if they have already been computed.
# `.get` returns None (so nothing is displayed) instead of raising KeyError
# when the column is missing: 'clusters_spatial' is only created by the
# Leiden call in the following cell, so on a fresh top-to-bottom run it does
# not exist yet at this point.
andata016.obs.get('clusters_spatial')

In [None]:
# Leiden clustering on the spatial neighbor graph, labels written to
# obs['clusters_spatial'].
# The connectivity matrix must be passed through `adjacency`; `neighbors_key`
# expects the *name* of a stored neighbors entry, not the matrix itself.
sc.tl.leiden(
    andata016,
    key_added='clusters_spatial',
    adjacency=andata016.obsp['spatial_neighbors_connectivities'],
)

In [None]:
# Spatial map coloured by the spatial-graph cluster labels; the axis title is
# set to an empty string.
fig, ax = plt.subplots(figsize=(4, 3))
sq.pl.spatial_scatter(
    andata016,
    color='clusters_spatial',
    palette=listed_cmap,
    ax=ax,
)
ax.set_title('')

In [None]:
# Make sure the spatial coordinates are a float64 array.
spatial_coords = np.array(andata016.obsm['spatial'], dtype=np.float64)
andata016.obsm['spatial'] = spatial_coords

# Rebuild the grid-based spatial graph with 6 neighbors and 8 rings.
sq.gr.spatial_neighbors(
    andata016,
    coord_type="grid",
    n_neighs=6,
    n_rings=8,
    key_added='spatial_neighbors',
)

# Run Leiden directly on the spatial connectivity matrix and store the labels
# under 'clusters_spatial'.
spatial_connectivities = andata016.obsp['spatial_neighbors_connectivities']
sc.tl.leiden(andata016, key_added='clusters_spatial', adjacency=spatial_connectivities)

# Spatial map of the resulting clusters, with an empty axis title.
fig, ax = plt.subplots(figsize=(4, 3))
sq.pl.spatial_scatter(andata016, color='clusters_spatial', palette=listed_cmap, ax=ax)
ax.set_title('')


In [None]:
def calcQCmat(andata):
    """Annotate QC gene groups on ``andata`` and compute QC metrics in place.

    Adds three boolean columns to ``andata.var`` -- ``"mt"``, ``"ribo"`` and
    ``"hb"`` -- and then calls ``sc.pp.calculate_qc_metrics`` with those
    columns as ``qc_vars``.

    Gene symbols are matched case-insensitively. The original patterns mixed
    a lower-case ``"mt-"`` prefix with upper-case ``"RPS"``/``"RPL"``/``"HB"``
    patterns, so on a dataset whose symbols are title-case (e.g. ``Rps3``,
    ``Hba-a1``) the ribosomal and hemoglobin flags were never set.

    Parameters
    ----------
    andata
        Object exposing ``var_names``, ``var`` and ``var_names_make_unique``
        (an AnnData as returned by ``sc.read_visium``). Modified in place.

    Returns
    -------
    The same ``andata`` object that was passed in.
    """
    andata.var_names_make_unique()
    # Normalize case on the symbol side so one pattern covers both
    # upper-case and title-case gene naming conventions.
    andata.var["mt"] = andata.var_names.str.lower().str.startswith("mt-")
    andata.var["ribo"] = andata.var_names.str.upper().str.startswith(("RPS", "RPL"))
    # Same regex as before, now case-insensitive: "HB" followed by any
    # character other than "(", "P" or ")".
    andata.var["hb"] = andata.var_names.str.contains("^HB[^(P)]", case=False)
    sc.pp.calculate_qc_metrics(andata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
    return andata
# Load the 16 um binned Visium HD output, annotate QC metrics, and drop
# low-content observations.
# NOTE(review): this rebinds `andata016` to freshly loaded data, replacing
# whatever processed object held that name before this cell ran.
path_016 = "/data/kanferg/Sptial_Omics/playGround/Data/Visium_HD_Mouse_Brain_square_example/square_016um"
andata016_ = sc.read_visium(path=path_016)
# calcQCmat modifies its argument and returns it, so `andata016` and
# `andata016_` refer to the same object.
andata016 = calcQCmat(andata016_)
# Read the shape from the AnnData object directly; converting the sparse
# matrix to dense just to print its dimensions allocates the full matrix.
print(f"{andata016.shape}")
# Keep observations with at least 50 counts and at least 80 detected genes.
sc.pp.filter_cells(andata016, min_counts=50)
sc.pp.filter_cells(andata016, min_genes=80)