In [None]:
import scimap as sm
import pandas as pd
import sys
import os
import scanpy as sc
import seaborn as sns; sns.set(color_codes=True)
import anndata
# Load the AnnData object that every later cell reads and mutates in place.
# NOTE(review): the save cell near the end of this notebook writes back to this
# exact path, so after one full run the input already contains the relabelled
# 'Lvl4' column and a second run starts from different data — confirm intended.
adata = anndata.read_h5ad('/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad')

In [None]:
# Limit the native thread pools to a single thread.
# `!export VAR=...` runs in a throw-away subshell and never changes the kernel's
# own environment, so the variables are set on os.environ instead.
# NOTE(review): libraries that were already imported above may have read these
# variables at import time; move this cell before the imports if the limit must
# apply to them as well.
for _thread_var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"):
    os.environ[_thread_var] = "1"

In [None]:
# List the per-cell annotation columns available in adata.obs.
obs_columns = adata.obs.columns
print(obs_columns)

In [None]:
# Distinct labels currently stored in the 'Lvl5' annotation column.
lvl5_labels = adata.obs['Lvl5'].unique().tolist()
print(lvl5_labels)

In [None]:
# Distinct sample identifiers present in the dataset.
sample_ids = adata.obs['sample'].unique().tolist()
print(sample_ids)

In [None]:
# Number of cells carrying each of the 'Lvl4' labels that are sub-clustered below.
lvl4_labels_to_count = ['Th', 'Tc', 'T.NK.Mix', 'Lymph.Mix.MKI67hi']
lvl4_label_mask = adata.obs['Lvl4'].isin(lvl4_labels_to_count)
category_counts = adata.obs.loc[lvl4_label_mask, 'Lvl4'].value_counts()

print(category_counts)

In [None]:
import numpy as np
import scanpy as sc

# Select the cells whose 'Lvl4' label is one of the four listed values and work
# on an independent copy so the preprocessing below does not touch `adata`.
t_cells = adata.obs['Lvl4'].isin(['Th', 'Tc', 'T.NK.Mix', 'Lymph.Mix.MKI67hi'])
t_subset = adata[t_cells].copy()

# Seed NumPy's global RNG, then normalise each cell to 1e4 total counts and
# log1p-transform the copy.
np.random.seed(42)
sc.pp.normalize_total(t_subset, target_sum=1e4)
sc.pp.log1p(t_subset)

# Neighbourhood graph computed directly on the expression matrix (use_rep='X'),
# followed by Leiden clustering on that graph.
sc.pp.neighbors(t_subset, n_neighbors=10, use_rep='X', random_state=42)
sc.tl.leiden(t_subset, resolution=1.1, random_state=42)

# Write the cluster ids back to the full object as 'T-<id>'; cells outside the
# subset keep the placeholder string 'NaN'.
adata.obs['T_leiden'] = 'NaN'
prefixed_clusters = t_subset.obs['leiden'].apply(lambda cluster: f'T-{cluster}')
adata.obs.loc[t_subset.obs_names, 'T_leiden'] = prefixed_clusters

In [None]:
# Embed the sub-clustered cells with UMAP (uses the neighbour graph stored on
# t_subset) and colour the embedding by Leiden cluster.
sc.tl.umap(t_subset)
sc.pl.umap(t_subset, color='leiden')


In [None]:
# Work on an object-dtype column so arbitrary new label strings can be written.
adata.obs['T_leiden'] = adata.obs['T_leiden'].astype('object')

# Cells that were NOT part of the sub-clustering (their 'Lvl4' label is none of
# the four listed values) ...
subclustered_lvl4_labels = ['Th', 'Tc', 'T.NK.Mix', 'Lymph.Mix.MKI67hi']
condition = ~adata.obs['Lvl4'].isin(subclustered_lvl4_labels)

# ... simply inherit their existing 'Lvl4' label in 'T_leiden'.
adata.obs.loc[condition, 'T_leiden'] = adata.obs.loc[condition, 'Lvl4']

# 'Lvl4' itself is left untouched here.

# Store the combined labels as a categorical column again.
adata.obs['T_leiden'] = pd.Categorical(adata.obs['T_leiden'])

In [None]:
# Size of every Leiden cluster in the first sub-clustering, plus the cluster count.
t_leiden_ids = t_subset.obs['leiden']
print(t_leiden_ids.value_counts())
print(f"Number of unique clusters: {t_leiden_ids.nunique()}")


In [None]:
import os

import matplotlib.pyplot as plt  # was commented out, leaving `plt` undefined on a fresh kernel
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Spatial overview of one sample: one panel per label in 'T_leiden' that starts
# with "T" or "NK". In each panel the cells of that label are drawn in red on top
# of every other cell of the sample in grey. The figure is shown inline and
# written as a single page to `pdf_filename`.

rotation_angle = 30  # degrees; rotation applied to the centroid coordinates

save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# NOTE(review): the later Lvl4 spatial-map cell targets this same file name;
# rename one of them if both PDFs need to be kept.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
# Target for the optional TIFF export (the savefig call below stays disabled).
tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

# Restrict to the sample being plotted.
adata_sample = adata[adata.obs['sample'] == 's2r2_HV184']
sample_labels = adata_sample.obs['T_leiden']

unique_clusters = sample_labels[
    sample_labels.str.startswith("T") | sample_labels.str.startswith("NK")
].unique()

# Alphabetical panel order (substitute a custom list here if needed).
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(
        "No 'T_leiden' label starting with 'T' or 'NK' found in sample s2r2_HV184"
    )

# Fixed number of rows; as many columns as needed to fit every cluster.
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the centroid coordinates by `rotation_angle`.
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
angle = np.deg2rad(rotation_angle)
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# Value passed to Axes.set_aspect for every panel.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

with PdfPages(pdf_filename) as pdf:
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]
        ax.set_facecolor('white')

        # Remove the panel outline.
        for spine in ax.spines.values():
            spine.set_visible(False)

        # One boolean mask per cluster, reused for both layers.
        in_cluster = (sample_labels == cluster).to_numpy()

        # Background layer: every cell that is not in the current cluster.
        # rasterized=True keeps the PDF size manageable for large point clouds.
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3,
            rasterized=True,
        )

        # Foreground layer: the current cluster.
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9,
            rasterized=True,
        )

        ax.set_aspect(aspect_ratio)

        # Cluster name near the bottom edge of the panel.
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the panels that have no cluster assigned.
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(
        left=0.05,
        right=0.95,
        top=0.95,
        bottom=0.05,
        wspace=0.1,
        hspace=0.05
    )

    # The title used to be white on the white figure background, i.e. invisible.
    fig.suptitle('All Clusters in s2r2_HV184', color='black', fontsize=20, weight='bold', y=1.02)

    # Optional TIFF export:
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    # Actually write the page: PdfPages was opened before but never saved to.
    # bbox_inches='tight' keeps the title (placed above the canvas at y=1.02) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)


In [None]:
import numpy as np
import scanpy as sc

# Marker-gene dotplot for the 'T_leiden' groups whose label starts with "T".

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Keep the cells whose 'T_leiden' label starts with "T" (independent copy)
adata_subset = adata[(adata.obs['T_leiden'].str.startswith("T"))].copy()

# Step 3: Log-transform the copy.
# NOTE(review): this is applied unconditionally — confirm adata.X is not already log-scaled.
sc.pp.log1p(adata_subset)

# Step 4: Wilcoxon rank-sum differential expression, one group per 'T_leiden' label
sc.tl.rank_genes_groups(adata_subset, 'T_leiden', method='wilcoxon', use_raw=False)

# Step 5: Top 20 ranked genes for each group
top_genes_per_cluster = {
    cluster: adata_subset.uns['rank_genes_groups']['names'][cluster][:20]
    for cluster in adata_subset.obs['T_leiden'].cat.categories
}

# Step 6: Dendrogram over the groups, used for ordering
sc.tl.dendrogram(adata_subset, groupby='T_leiden')

# Step 7: Group order taken from the dendrogram
cluster_order = adata_subset.uns['dendrogram_T_leiden']['categories_ordered']

# Step 8: Genes detected (value > 0) in more than 20% of the cells of at least one group
filtered_genes = set()
for cluster in cluster_order:
    cluster_data = adata_subset[adata_subset.obs['T_leiden'] == cluster]
    # np.asarray(...).ravel() flattens the column means for both sparse matrices
    # (which return np.matrix) and dense arrays; `.A1` only exists on np.matrix.
    gene_expression_fraction = np.asarray((cluster_data.X > 0).mean(axis=0)).ravel()
    genes_above_threshold = cluster_data.var_names[gene_expression_fraction > 0.2]
    filtered_genes.update(genes_above_threshold)

# Step 9: Per group (in dendrogram order) take up to 10 of its top genes that pass
# the expression filter and were not already claimed by an earlier group.
max_genes_per_cluster = 10
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster not in top_genes_per_cluster:
        continue
    genes_for_this_cluster = []
    for gene in top_genes_per_cluster[cluster]:
        if len(genes_for_this_cluster) >= max_genes_per_cluster:
            break
        if gene in filtered_genes and gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_this_cluster.append(gene)
    genes_for_plot.extend(genes_for_this_cluster)

# Step 10: Dotplot of the selected genes (genes on the y axis because of swap_axes)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='T_leiden',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)


In [None]:
# Manual annotation of the first sub-clustering: Leiden cluster id -> label.
replacement_dict = {
    'T-0': 'Th_proximity_to_B',
    'T-4': 'Th_proximity_to_B',
    'T-1': 'Th',
    'T-2': 'Tc',
    'T-3': 'T.IL7Rhi',
    'T-5': 'T.IE',
    'T-6': 'Oth.CCL5hi',
    'T-7': 'T_proximity_to_Fib',
    'T-8': 'Oth.GZMKhi',
    'T-9': 'Treg',
    'T-10':'NK',
    'T-11': 'Oth.GZMAhi',
    'T-12': 'Oth.CCR7hi',
    'T-13': 'Oth.CD8Ahi',
    'T-14': 'Oth.ILRAhi',
    'T-15': 'T_proximity_to_BV',

}

# Cast 'T_leiden' to plain strings and swap only the labels listed above;
# every other value is kept as it is.
adata.obs['T_leiden'] = adata.obs['T_leiden'].astype(str).replace(replacement_dict)

# Show the resulting set of 'T_leiden' labels.
print(adata.obs['T_leiden'].unique())


In [None]:
import scanpy as sc

# Dotplot of a hand-picked gene panel across the annotated 'T_leiden' groups
# whose label starts with "T".

# Make sure observation names are unique before subsetting.
adata.obs_names_make_unique()

# Independent copy of the cells whose 'T_leiden' label starts with "T".
t_label_mask = adata.obs['T_leiden'].str.startswith("T")
adata_subset = adata[t_label_mask].copy()

# Log-transform the copy (applied unconditionally).
sc.pp.log1p(adata_subset)

# Wilcoxon differential expression per 'T_leiden' group.
sc.tl.rank_genes_groups(adata_subset, 'T_leiden', method='wilcoxon', use_raw=False)

# Top 10 ranked genes per group (kept for reference; not used for the plot below).
top_genes_per_cluster = {
    cluster: adata_subset.uns['rank_genes_groups']['names'][cluster][:10]
    for cluster in adata_subset.obs['T_leiden'].cat.categories
}

# Dendrogram over the groups and the group order it yields.
sc.tl.dendrogram(adata_subset, groupby='T_leiden')
cluster_order = adata_subset.uns['dendrogram_T_leiden']['categories_ordered']

# Hand-picked gene panel to display.
custom_genes = ['ADAM28', 'ANXA1', 'APOLD1', 'AQP3', 'BCL2L11', 'C1orf162', 'CCL3L1', 'CCL5', 'CD2', 'CD27', 'CD28', 'CD3D', 
    'CD4', 'CD69', 'CD8A', 'CD83', 'CD247', 'CCR6', 'CCR7', 'CYTIP', 'CTLA4', 'CTSC', 'DUSP2', 'FAS', 'FCGR3A', 
    'FGFBP2', 'FGL2', 'FKBP11', 'FOXP3', 'FXYD2', 'GEM', 'GLIPR1', 'GNLY', 'GPR183', 'GZMA', 'GZMB', 'GZMK', 
    'HAVCR2', 'ICAM1', 'IL1R2', 'IL17A', 'IL17F', 'IL23R', 'IL2RA', 'IL7R', 
    'IRF8', 'ITGA1', 'ITGB2', 'KLRB1', 'KLRC1', 'KLRD1', 'KIT', 'LAG3', 'LYST', 'MDM2', 'MKI67', 'NKG7', 'NLRC5', 
    'PDCD1', 'PLCG2', 'PPARG', 'PRF1', 'RGS16', 'RTKN2', 'SELL', 'SH2D3C', 'SLAMF1', 
    'SLAMF7', 'SMYD2', 'SOX4', 'STAT3', 'TNFRSF9', 'TRAC']

# Drop panel genes that are absent from the subset's var_names, keeping panel order.
genes_for_plot = []
for gene in custom_genes:
    if gene in adata_subset.var_names:
        genes_for_plot.append(gene)

# Dotplot of the panel (genes on the y axis because of swap_axes).
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='T_leiden',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)


In [None]:
import numpy as np
import scanpy as sc

# Second round of sub-clustering, restricted to the cells annotated 'Th' in
# 'T_leiden'. An independent copy is used so `adata` is not preprocessed.
t_cells2 = adata.obs['T_leiden'].isin(['Th'])
t_subset2 = adata[t_cells2].copy()

# Seed NumPy's global RNG, then normalise each cell to 1e4 total counts and
# log1p-transform the copy.
np.random.seed(42)
sc.pp.normalize_total(t_subset2, target_sum=1e4)
sc.pp.log1p(t_subset2)

# Neighbourhood graph on the expression matrix (use_rep='X'), then Leiden.
sc.pp.neighbors(t_subset2, n_neighbors=10, use_rep='X', random_state=42)
sc.tl.leiden(t_subset2, resolution=0.6, random_state=42)

# Write the cluster ids back as 'T-<id>' into 'T_leiden2'; every cell outside
# the subset keeps the placeholder string 'NaN'.
adata.obs['T_leiden2'] = 'NaN'
prefixed_clusters2 = t_subset2.obs['leiden'].apply(lambda cluster: f'T-{cluster}')
adata.obs.loc[t_subset2.obs_names, 'T_leiden2'] = prefixed_clusters2

In [None]:
# UMAP embedding of the 'Th' sub-clustering (uses the neighbour graph stored on
# t_subset2), coloured by Leiden cluster.
sc.tl.umap(t_subset2)
sc.pl.umap(t_subset2, color='leiden')


In [None]:
# Work on an object-dtype 'T_leiden2' so arbitrary label strings can be written.
adata.obs['T_leiden2'] = adata.obs['T_leiden2'].astype('object')

# Cells whose 'Lvl4' label is none of the four listed values ...
subclustered_lvl4_labels = ['Th', 'Tc', 'T.NK.Mix', 'Lymph.Mix.MKI67hi']
condition = ~adata.obs['Lvl4'].isin(subclustered_lvl4_labels)

# ... inherit their 'Lvl4' label in 'T_leiden2'.
adata.obs.loc[condition, 'T_leiden2'] = adata.obs.loc[condition, 'Lvl4']

# 'Lvl4' itself is left untouched here.

# Store 'T_leiden2' as a categorical column again.
adata.obs['T_leiden2'] = pd.Categorical(adata.obs['T_leiden2'])

In [None]:
import scanpy as sc

# Marker-gene dotplot for the 'T-<id>' groups of the second sub-clustering.

# Make sure observation names are unique before subsetting.
adata.obs_names_make_unique()

# Independent copy of the cells whose 'T_leiden2' label starts with "T-".
t2_label_mask = adata.obs['T_leiden2'].str.startswith("T-")
adata_subset = adata[t2_label_mask].copy()

# Log-transform the copy (applied unconditionally).
sc.pp.log1p(adata_subset)

# Wilcoxon differential expression per 'T_leiden2' group.
sc.tl.rank_genes_groups(adata_subset, 'T_leiden2', method='wilcoxon', use_raw=False)

# Top 10 ranked genes for each group.
top_genes_per_cluster = {
    cluster: adata_subset.uns['rank_genes_groups']['names'][cluster][:10]
    for cluster in adata_subset.obs['T_leiden2'].cat.categories
}

# Dendrogram over the groups and the group order it yields.
sc.tl.dendrogram(adata_subset, groupby='T_leiden2')
cluster_order = adata_subset.uns['dendrogram_T_leiden2']['categories_ordered']

# Walk the groups in dendrogram order and collect, per group, up to 20 of its
# top genes that no earlier group has already contributed.
max_genes_per_cluster = 20
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster not in top_genes_per_cluster:
        continue
    genes_for_this_cluster = []
    for gene in top_genes_per_cluster[cluster]:
        if len(genes_for_this_cluster) >= max_genes_per_cluster:
            break
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_this_cluster.append(gene)
    genes_for_plot.extend(genes_for_this_cluster)

# Dotplot of the collected genes (genes on the y axis because of swap_axes).
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='T_leiden2',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
# Manual annotation of the second sub-clustering: only cluster 'T-2' is renamed.
replacement_dict = {
    'T-2': 'Treg',

}

# Cast 'T_leiden2' to plain strings and swap only the label listed above.
adata.obs['T_leiden2'] = adata.obs['T_leiden2'].astype(str).replace(replacement_dict)

# Show the resulting set of 'T_leiden2' labels.
print(adata.obs['T_leiden2'].unique())


In [None]:
# Use an object-dtype 'T_leiden' so the label can be overwritten freely.
adata.obs['T_leiden'] = adata.obs['T_leiden'].astype('object')

# Cells labelled 'Treg' by the second sub-clustering ('T_leiden2') ...
condition = adata.obs['T_leiden2'].isin(['Treg'])

# ... take that label over into 'T_leiden'.
treg_labels = adata.obs.loc[condition, 'T_leiden2']
adata.obs.loc[condition, 'T_leiden'] = treg_labels


In [None]:
# Cluster sizes and cluster count of the first sub-clustering.
first_round_ids = t_subset.obs['leiden']
print(first_round_ids.value_counts())
print(f"Number of unique clusters: {first_round_ids.nunique()}")

In [None]:
# Cluster sizes and cluster count of the second ('Th') sub-clustering.
second_round_ids = t_subset2.obs['leiden']
print(second_round_ids.value_counts())
print(f"Number of unique clusters: {second_round_ids.nunique()}")

In [None]:
# Distinct labels now present in 'T_leiden'.
t_leiden_labels_now = adata.obs['T_leiden'].unique()
print(t_leiden_labels_now)

In [None]:
# Number of cells per annotated label in 'T_leiden'.
annotated_t_labels = ['Treg', 'Th_proximity_to_B', 'Oth.GZMKhi', 'Th', 'T.IL7Rhi', 'T.IE', 
    'Oth.ILRAhi', 'Oth.CCR7hi', 'NK', 'Tc', 'Oth.CCL5hi', 'T_proximity_to_Fib',
    'Oth.CD8Ahi', 'Oth.GZMAhi', 'T_proximity_to_BV']
annotated_mask = adata.obs['T_leiden'].isin(annotated_t_labels)
category_counts = adata.obs.loc[annotated_mask, 'T_leiden'].value_counts()

print(category_counts)

In [None]:
# Keep the current 'Lvl4' labels under 'Lvl4_thesis' before 'Lvl4' is
# overwritten with the 'T_leiden' annotations further down.
# NOTE(review): re-running this cell after 'Lvl4' has been modified replaces the
# snapshot with the modified labels.
adata.obs['Lvl4_thesis'] = adata.obs['Lvl4']

In [None]:
import pandas as pd

# Copy the annotated 'T_leiden' labels listed below into 'Lvl4'; rows whose
# 'T_leiden' value is not in the list keep their current 'Lvl4' label.
values_to_transfer = [
    'Treg', 'Th_proximity_to_B', 'Oth.GZMKhi', 'Th', 'T.IL7Rhi', 'T.IE', 
    'Oth.ILRAhi', 'Oth.CCR7hi', 'NK', 'Tc', 'Oth.CCL5hi', 'T_proximity_to_Fib',
    'Oth.CD8Ahi', 'Oth.GZMAhi', 'T_proximity_to_BV'
]

# 'Lvl4' is handled as a categorical column.
# isinstance(..., pd.CategoricalDtype) replaces the deprecated
# pd.api.types.is_categorical_dtype.
if not isinstance(adata.obs['Lvl4'].dtype, pd.CategoricalDtype):
    adata.obs['Lvl4'] = adata.obs['Lvl4'].astype('category')

# A categorical column only accepts values that are registered categories, so
# add the labels it does not know yet. The result is assigned back because
# `add_categories(..., inplace=True)` was removed in pandas 2.0.
known_categories = set(adata.obs['Lvl4'].cat.categories)
new_categories = [label for label in values_to_transfer if label not in known_categories]
if new_categories:
    adata.obs['Lvl4'] = adata.obs['Lvl4'].cat.add_categories(new_categories)

# Overwrite 'Lvl4' only where 'T_leiden' holds one of the listed labels.
# The right-hand side is cast to object so the assignment also works when
# 'T_leiden' is itself categorical with a different category set.
transfer_mask = adata.obs['T_leiden'].isin(values_to_transfer)
adata.obs.loc[transfer_mask, 'Lvl4'] = adata.obs.loc[transfer_mask, 'T_leiden'].astype(object)

In [None]:
# Number of cells per transferred label, now read from 'Lvl4'.
transferred_labels = ['Treg', 'Th_proximity_to_B', 'Oth.GZMKhi', 'Th', 'T.IL7Rhi', 'T.IE', 
    'Oth.ILRAhi', 'Oth.CCR7hi', 'NK', 'Tc', 'Oth.CCL5hi', 'T_proximity_to_Fib',
    'Oth.CD8Ahi', 'Oth.GZMAhi', 'T_proximity_to_BV']
transferred_mask = adata.obs['Lvl4'].isin(transferred_labels)
category_counts = adata.obs.loc[transferred_mask, 'Lvl4'].value_counts()

print(category_counts)

In [None]:
import numpy as np
import scanpy as sc

# Marker-gene dotplot for the transferred T/NK labels in 'Lvl4' plus 'Fib.1'.

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Keep the cells carrying one of the listed 'Lvl4' labels (independent copy)
adata_subset = adata[(adata.obs['Lvl4'].isin(['Treg', 'Th_proximity_to_B', 'Oth.GZMKhi', 'Th', 'T.IL7Rhi', 'T.IE', 
    'Oth.ILRAhi', 'Oth.CCR7hi', 'NK', 'Tc', 'Oth.CCL5hi', 'T_proximity_to_Fib',
    'Oth.CD8Ahi', 'Oth.GZMAhi', 'T_proximity_to_BV', 'Fib.1']))].copy()

# Step 3: Log-transform the copy.
# NOTE(review): this is applied unconditionally — confirm adata.X is not already log-scaled.
sc.pp.log1p(adata_subset)

# Step 4: Wilcoxon rank-sum differential expression, one group per 'Lvl4' label
sc.tl.rank_genes_groups(adata_subset, 'Lvl4', method='wilcoxon', use_raw=False)

# Step 5: Top 20 ranked genes for each group
top_genes_per_cluster = {
    cluster: adata_subset.uns['rank_genes_groups']['names'][cluster][:20]
    for cluster in adata_subset.obs['Lvl4'].cat.categories
}

# Step 6: Dendrogram over the groups, used for ordering
sc.tl.dendrogram(adata_subset, groupby='Lvl4')

# Step 7: Group order taken from the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4']['categories_ordered']

# Step 8: Genes detected (value > 0) in more than 20% of the cells of at least one group
filtered_genes = set()
for cluster in cluster_order:
    cluster_data = adata_subset[adata_subset.obs['Lvl4'] == cluster]
    # np.asarray(...).ravel() flattens the column means for both sparse matrices
    # (which return np.matrix) and dense arrays; `.A1` only exists on np.matrix.
    gene_expression_fraction = np.asarray((cluster_data.X > 0).mean(axis=0)).ravel()
    genes_above_threshold = cluster_data.var_names[gene_expression_fraction > 0.2]
    filtered_genes.update(genes_above_threshold)

# Step 9: Per group (in dendrogram order) take up to 5 of its top genes that pass
# the expression filter and were not already claimed by an earlier group.
max_genes_per_cluster = 5
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster not in top_genes_per_cluster:
        continue
    genes_for_this_cluster = []
    for gene in top_genes_per_cluster[cluster]:
        if len(genes_for_this_cluster) >= max_genes_per_cluster:
            break
        if gene in filtered_genes and gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_this_cluster.append(gene)
    genes_for_plot.extend(genes_for_this_cluster)

# Step 10: Dotplot of the selected genes (genes on the y axis because of swap_axes)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl4',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)


In [None]:
# Start 'Lvl5' as a copy of the current 'Lvl4' labels; the two columns are
# renamed independently in the cells that follow.
adata.obs['Lvl5'] = adata.obs['Lvl4']

In [None]:
# Rename labels in 'Lvl5': the listed 'Oth.*' clusters get a 'T.' prefix.
replacement_dict = {
    'Oth.CCL5hi': 'T.CCL5hi',
    'Oth.GZMKhi':'T.GZMKhi',
    'Oth.GZMAhi':'T.GZMAhi',
    'Oth.CD8Ahi' : 'T.CD8Ahi',
    'Oth.CCR7hi': 'T.CCR7hi',
    # Same spelling correction the 'Lvl4' rename cell applies; without it 'Lvl5'
    # keeps the misspelt 'Oth.ILRAhi' while 'Lvl4' carries 'Oth.IL2RAhi'.
    'Oth.ILRAhi': 'Oth.IL2RAhi',
}

# Cast 'Lvl5' to plain strings and swap only the labels listed above.
adata.obs['Lvl5'] = adata.obs['Lvl5'].astype(str).replace(replacement_dict)

# Show the resulting set of 'Lvl5' labels.
print(adata.obs['Lvl5'].unique())


In [None]:
# Coarsen 'Lvl4': fix the 'ILRA' spelling and merge the fine-grained 'Oth.*'
# clusters into broader labels.
replacement_dict = {
    'Oth.ILRAhi': 'Oth.IL2RAhi',
    # In 'Lvl4' the CCR7-high cluster is still called 'Oth.CCR7hi' (the 'T.'
    # prefixed name only exists in 'Lvl5'), so the original 'T.CCR7hi' key never
    # matched and the merge silently did nothing. Both spellings are mapped.
    'Oth.CCR7hi': 'T.IL7Rhi',
    'T.CCR7hi': 'T.IL7Rhi',
    'Oth.CCL5hi': 'Tc',
    'Oth.GZMKhi':'Tc',
    'Oth.GZMAhi':'Tc',
    'Oth.CD8Ahi': 'Tc',    
}

# Cast 'Lvl4' to plain strings and swap only the labels listed above.
adata.obs['Lvl4'] = adata.obs['Lvl4'].astype(str).replace(replacement_dict)

# Show the resulting set of 'Lvl4' labels.
print(adata.obs['Lvl4'].unique())


In [None]:
# Ensure 'T_leiden' is a string type
adata.obs['Lvl4'] = adata.obs['Lvl4'].astype(str)

# Define the replacement dictionary
replacement_dict = {
    'T.IL7Rhi': 'T.naive'  
}

# Replace only the specified categories in 'T_leiden2'
adata.obs['Lvl4'] = adata.obs['Lvl4'].astype(str)  # Ensure it's string
adata.obs['Lvl4'] = adata.obs['Lvl4'].replace(replacement_dict)

# Check the unique values in 'T_leiden2' to confirm
print(adata.obs['Lvl4'].unique())


In [None]:
import numpy as np
import scanpy as sc

# Marker-gene dotplot for the merged T/NK labels in 'Lvl4'.

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Keep the cells carrying one of the listed 'Lvl4' labels (independent copy).
# 'T.naive' is listed because the rename cell above turns 'T.IL7Rhi' into
# 'T.naive'; with only the old name those cells were dropped from the plot.
# The old name is kept so the cell also works before that rename has run.
adata_subset = adata[(adata.obs['Lvl4'].isin(['Treg', 'Th_proximity_to_B', 'Th', 'T.IL7Rhi', 'T.naive', 'T.IE', 
    'NK', 'Tc', 'T_proximity_to_Fib', 'T_proximity_to_BV']))].copy()

# Step 3: Log-transform the copy.
# NOTE(review): this is applied unconditionally — confirm adata.X is not already log-scaled.
sc.pp.log1p(adata_subset)

# Step 4: Wilcoxon rank-sum differential expression, one group per 'Lvl4' label
sc.tl.rank_genes_groups(adata_subset, 'Lvl4', method='wilcoxon', use_raw=False)

# Step 5: Top 20 ranked genes for each group
top_genes_per_cluster = {
    cluster: adata_subset.uns['rank_genes_groups']['names'][cluster][:20]
    for cluster in adata_subset.obs['Lvl4'].cat.categories
}

# Step 6: Dendrogram over the groups, used for ordering
sc.tl.dendrogram(adata_subset, groupby='Lvl4')

# Step 7: Group order taken from the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4']['categories_ordered']

# Step 8: Genes detected (value > 0) in more than 20% of the cells of at least one group
filtered_genes = set()
for cluster in cluster_order:
    cluster_data = adata_subset[adata_subset.obs['Lvl4'] == cluster]
    # np.asarray(...).ravel() flattens the column means for both sparse matrices
    # (which return np.matrix) and dense arrays; `.A1` only exists on np.matrix.
    gene_expression_fraction = np.asarray((cluster_data.X > 0).mean(axis=0)).ravel()
    genes_above_threshold = cluster_data.var_names[gene_expression_fraction > 0.2]
    filtered_genes.update(genes_above_threshold)

# Step 9: Per group (in dendrogram order) take up to 10 of its top genes that pass
# the expression filter and were not already claimed by an earlier group.
max_genes_per_cluster = 10
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster not in top_genes_per_cluster:
        continue
    genes_for_this_cluster = []
    for gene in top_genes_per_cluster[cluster]:
        if len(genes_for_this_cluster) >= max_genes_per_cluster:
            break
        if gene in filtered_genes and gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_this_cluster.append(gene)
    genes_for_plot.extend(genes_for_this_cluster)

# Step 10: Dotplot of the selected genes (genes on the y axis because of swap_axes)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl4',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)


In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Spatial overview of one sample: one panel per T/NK label in 'Lvl4'. In each
# panel the cells of that label are drawn in red on top of every other cell of
# the sample in grey. The figure is shown inline and written as a single page
# to `pdf_filename`.

rotation_angle = 30  # degrees; rotation applied to the centroid coordinates

save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# NOTE(review): the earlier 'T_leiden' spatial-map cell targets this same file
# name; rename one of them if both PDFs need to be kept.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
# Target for the optional TIFF export (the savefig call below stays disabled).
tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

# Restrict to the sample being plotted.
adata_sample = adata[adata.obs['sample'] == 's2r2_HV184']
sample_labels = adata_sample.obs['Lvl4']

# Labels to draw. 'T.naive' is listed because the rename cell above turns
# 'T.IL7Rhi' into 'T.naive'; with only the old name that cluster got no panel.
# The old name is kept so the cell also works before that rename has run.
labels_to_plot = [
    'Treg', 'Th_proximity_to_B', 'Th', 'T.IL7Rhi', 'T.naive', 'T.IE',
    'NK', 'Tc', 'T_proximity_to_Fib', 'T_proximity_to_BV'
]
unique_clusters = sample_labels[sample_labels.isin(labels_to_plot)].unique()

print(unique_clusters)

# Alphabetical panel order (substitute a custom list here if needed).
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError("None of the requested 'Lvl4' labels occur in sample s2r2_HV184")

# Fixed number of rows; as many columns as needed to fit every cluster.
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the centroid coordinates by `rotation_angle`.
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
angle = np.deg2rad(rotation_angle)
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# Value passed to Axes.set_aspect for every panel.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

with PdfPages(pdf_filename) as pdf:
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]
        ax.set_facecolor('white')

        # Remove the panel outline.
        for spine in ax.spines.values():
            spine.set_visible(False)

        # One boolean mask per cluster, reused for both layers.
        in_cluster = (sample_labels == cluster).to_numpy()

        # Background layer: every cell that is not in the current cluster.
        # rasterized=True keeps the PDF size manageable for large point clouds.
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3,
            rasterized=True,
        )

        # Foreground layer: the current cluster.
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9,
            rasterized=True,
        )

        ax.set_aspect(aspect_ratio)

        # Cluster name near the bottom edge of the panel.
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the panels that have no cluster assigned.
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(
        left=0.05,
        right=0.95,
        top=0.95,
        bottom=0.05,
        wspace=0.1,
        hspace=0.05
    )

    # The title used to be white on the white figure background, i.e. invisible.
    fig.suptitle('All Clusters in s2r2_HV184', color='black', fontsize=20, weight='bold', y=1.02)

    # Optional TIFF export:
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    # Actually write the page: PdfPages was opened before but never saved to.
    # bbox_inches='tight' keeps the title (placed above the canvas at y=1.02) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)


In [None]:
# List of desired categories
desired_categories = [
    'Treg', 'Th_proximity_to_B', 'Th', 'T.IL7Rhi', 'T.IE', 
    'Oth.IL2RAhi', 'NK', 'Tc',  'T_proximity_to_Fib',
     'T_proximity_to_BV', 
]

# Filter for only the desired categories and count, then sort in descending order
category_counts = (
    adata.obs.loc[adata.obs['Lvl4'].isin(desired_categories), 'Lvl4']
    .value_counts()
    .reindex(desired_categories, fill_value=0)  # Include all desired categories
    .sort_values(ascending=False)              # Sort by descending count
)

# Print the counts with categories in descending order
print(category_counts)


In [None]:
# Let pandas print every row instead of truncating long outputs.
# (This changes the display option for the rest of the session.)
import pandas as pd
pd.set_option('display.max_rows', None)

# Count the cells per 'Lvl4_thesis' label that starts with 'Oth', largest first.
oth_mask = adata.obs['Lvl4_thesis'].str.startswith('Oth')
oth_labels = adata.obs[oth_mask]['Lvl4_thesis']
category_counts = oth_labels.value_counts().sort_values(ascending=False)

print(category_counts)


In [None]:
# Persist the object after the T-cell clustering and relabelling.
# NOTE(review): this is the same path the notebook loads `adata` from at the top,
# so the input file is overwritten in place. A later Restart & Run All then
# starts from the already-relabelled 'Lvl4' column — consider writing to a new
# file name instead.
adata.write_h5ad("/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad")

In [None]:
import scanpy as sc

# Make cell names unique on the full object before subsetting
adata.obs_names_make_unique()

# Cells annotated with one of the vascular / mural 'Lvl4' labels
vascular_lvl4 = ['LEC', 'VEC.1', 'VEC.3', 'VEC.2', 'VEC.prol', 'SMC.1', 'SMC.2']
adata_subset = adata[adata.obs['Lvl4'].isin(vascular_lvl4)].copy()

# log1p is applied to the copy only; `adata` is not modified.
# NOTE(review): no normalize_total here, unlike the Leiden clustering cells -
# confirm whether adata.X holds raw counts at this point.
sc.pp.log1p(adata_subset)

# Wilcoxon rank-sum markers per 'Lvl4' group, computed on .X (use_raw=False)
sc.tl.rank_genes_groups(adata_subset, 'Lvl4', method='wilcoxon', use_raw=False)

# First 20 ranked genes for every group
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:20]
    for cluster in adata_subset.obs['Lvl4'].cat.categories
}

# Dendrogram over the groups; its leaf order drives the gene order below
sc.tl.dendrogram(adata_subset, groupby='Lvl4')
cluster_order = adata_subset.uns['dendrogram_Lvl4']['categories_ordered']

# Walk the groups in dendrogram order and collect, per group, up to 20 genes
# that no earlier group has already contributed (each gene is plotted once).
unique_genes = set()
genes_for_plot = []
for cluster in cluster_order:
    candidates = top_genes_per_cluster.get(cluster, ())
    fresh_genes = list(dict.fromkeys(g for g in candidates if g not in unique_genes))[:20]
    unique_genes.update(fresh_genes)
    genes_for_plot.extend(fresh_genes)

# Dot plot with genes on the rows (swap_axes=True), each gene scaled to [0, 1]
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl4',
    use_raw=False,
    dendrogram=True,
    standard_scale='var',
    cmap="vlag",
    swap_axes=True,
)

In [None]:
import numpy as np
import scanpy as sc

# Vascular / mural cells according to 'Lvl4'
vascular_lvl4 = ['VEC.1', 'VEC.3', 'VEC.2', 'VEC.prol', 'SMC.1', 'SMC.2', 'LEC']
v_cells = adata.obs['Lvl4'].isin(vascular_lvl4)
v_subset = adata[v_cells].copy()  # independent copy: preprocessing below must not touch `adata`

# Library-size normalisation to 1e4 counts per cell, then log1p
np.random.seed(42)
sc.pp.normalize_total(v_subset, target_sum=1e4)
sc.pp.log1p(v_subset)

# kNN graph built directly on .X (use_rep='X'), then Leiden clustering; both seeded
sc.pp.neighbors(v_subset, n_neighbors=10, use_rep='X', random_state=42)
sc.tl.leiden(v_subset, resolution=0.8, random_state=42)

# Write the cluster ids back to the full object as 'BV-<id>'.
# Cells outside the subset keep the literal string 'NaN' (a placeholder, not a missing value).
adata.obs['V_leiden'] = 'NaN'
adata.obs.loc[v_subset.obs_names, 'V_leiden'] = 'BV-' + v_subset.obs['leiden'].astype(str)

In [None]:
import scanpy as sc

# Make cell names unique on the full object before subsetting
adata.obs_names_make_unique()

# Cells that received a 'BV-<id>' Leiden label
is_bv_cluster = adata.obs['V_leiden'].str.startswith("BV")
adata_subset = adata[is_bv_cluster].copy()

# log1p is applied to the copy only; `adata` is not modified.
# NOTE(review): no normalize_total here, unlike the Leiden clustering cells -
# confirm whether adata.X holds raw counts at this point.
sc.pp.log1p(adata_subset)

# Wilcoxon rank-sum markers per 'V_leiden' cluster, computed on .X (use_raw=False)
sc.tl.rank_genes_groups(adata_subset, 'V_leiden', method='wilcoxon', use_raw=False)

# First 30 ranked genes for every cluster
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:30]
    for cluster in adata_subset.obs['V_leiden'].cat.categories
}

# Dendrogram over the clusters; its leaf order drives the gene order below
sc.tl.dendrogram(adata_subset, groupby='V_leiden')
cluster_order = adata_subset.uns['dendrogram_V_leiden']['categories_ordered']

# Walk the clusters in dendrogram order and collect, per cluster, up to 20 of
# its 30 candidates that no earlier cluster has already contributed.
unique_genes = set()
genes_for_plot = []
for cluster in cluster_order:
    candidates = top_genes_per_cluster.get(cluster, ())
    fresh_genes = list(dict.fromkeys(g for g in candidates if g not in unique_genes))[:20]
    unique_genes.update(fresh_genes)
    genes_for_plot.extend(fresh_genes)

# Dot plot with genes on the rows (swap_axes=True), each gene scaled to [0, 1]
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='V_leiden',
    use_raw=False,
    dendrogram=True,
    standard_scale='var',
    cmap="vlag",
    swap_axes=True,
)

In [None]:
# Size of each vascular Leiden cluster, and how many clusters there are
v_leiden_ids = v_subset.obs['leiden']
print(v_leiden_ids.value_counts())
print(f"Number of unique clusters: {v_leiden_ids.nunique()}")

In [None]:
import os

import matplotlib.pyplot as plt  # was commented out although plt.* is used below
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# --- Parameters -----------------------------------------------------------
sample_id = 's2r2_HV184'
cluster_col = 'V_leiden'
rotation_angle = 30  # degrees; rotation applied to this sample's centroids
num_rows = 6

# Output PDF. The name is specific to this figure so the other spatial-panel
# cells (which previously all wrote 'all_clusters_4rows.pdf') cannot clobber it.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
pdf_filename = os.path.join(save_directory, f'{sample_id}_BV_leiden_clusters.pdf')

# --- Data -----------------------------------------------------------------
adata_sample = adata[adata.obs['sample'] == sample_id]
cluster_labels = adata_sample.obs[cluster_col]

# 'BV-<id>' Leiden clusters present in this sample, in sorted label order
unique_clusters = cluster_labels[cluster_labels.str.startswith("BV", na=False)].unique()
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f"No 'BV*' clusters found in '{cluster_col}' for sample {sample_id}")

num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by `rotation_angle`
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# Ratio of the rotated extents, passed to set_aspect() below.
# NOTE(review): a numeric aspect scales y-units relative to x-units; use
# 'equal' instead if undistorted tissue geometry is wanted - confirm intent.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

# --- Figure: one panel per cluster (cluster in red over all other cells in grey)
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cluster_labels == cluster  # computed once per panel

        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Background: every cell not in the current cluster
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Foreground: cells of the current cluster
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Cluster name near the bottom edge of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the grid positions that have no cluster
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Title in black: the previous white text was invisible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Actually write the page; without this call the PdfPages file contains no figure.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)


In [None]:
# Annotation assigned to each vascular Leiden cluster ('BV-<id>' -> cell type name)
replacement_dict = {
    'BV-0': 'VEC.vein1',
    'BV-1': 'VEC.vein2',
    'BV-2': 'VEC.art2',
    'BV-3': 'VEC.art1',
    'BV-4': 'SMC',
    'BV-5': 'LEC',
    'BV-6': 'VEC-B.Mix',
    'BV-7': 'VEC-Epi.Mix'
}

# Cast 'V_leiden' to plain strings and rename the clusters listed above;
# any label that is not a key of the dict is left as it is.
adata.obs['V_leiden'] = adata.obs['V_leiden'].astype(str).replace(replacement_dict)

# Confirm the labels now present in 'V_leiden'
print(adata.obs['V_leiden'].unique())


In [None]:
import numpy as np
import scanpy as sc

# Cells annotated 'VEC.art1' in 'V_leiden' are sub-clustered on their own
v_cells2 = adata.obs['V_leiden'].isin(['VEC.art1'])
v_subset2 = adata[v_cells2].copy()  # independent copy: preprocessing below must not touch `adata`

# Library-size normalisation to 1e4 counts per cell, then log1p
np.random.seed(42)
sc.pp.normalize_total(v_subset2, target_sum=1e4)
sc.pp.log1p(v_subset2)

# kNN graph built directly on .X (use_rep='X'), then Leiden clustering; both seeded
sc.pp.neighbors(v_subset2, n_neighbors=10, use_rep='X', random_state=42)
sc.tl.leiden(v_subset2, resolution=0.8, random_state=42)

# Write the sub-cluster ids back to the full object as 'Art-cap-<id>'.
# Cells outside the subset keep the literal string 'NaN' (a placeholder, not a missing value).
adata.obs['V_leiden2'] = 'NaN'
adata.obs.loc[v_subset2.obs_names, 'V_leiden2'] = 'Art-cap-' + v_subset2.obs['leiden'].astype(str)

In [None]:
import scanpy as sc

# Make cell names unique on the full object before subsetting
adata.obs_names_make_unique()

# Cells in sub-clusters 'Art-cap-0' .. 'Art-cap-5'
# (any sub-cluster with a higher id is not part of this selection)
art_cap_clusters = ['Art-cap-0', 'Art-cap-1', 'Art-cap-2', 'Art-cap-3', 'Art-cap-4', 'Art-cap-5']
adata_subset = adata[adata.obs['V_leiden2'].isin(art_cap_clusters)].copy()

# log1p is applied to the copy only; `adata` is not modified.
# NOTE(review): no normalize_total here, unlike the Leiden clustering cells -
# confirm whether adata.X holds raw counts at this point.
sc.pp.log1p(adata_subset)

# Wilcoxon rank-sum markers per 'V_leiden2' sub-cluster, computed on .X (use_raw=False)
sc.tl.rank_genes_groups(adata_subset, 'V_leiden2', method='wilcoxon', use_raw=False)

# First 30 ranked genes for every sub-cluster
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:30]
    for cluster in adata_subset.obs['V_leiden2'].cat.categories
}

# Dendrogram over the sub-clusters; its leaf order drives the gene order below
sc.tl.dendrogram(adata_subset, groupby='V_leiden2')
cluster_order = adata_subset.uns['dendrogram_V_leiden2']['categories_ordered']

# Walk the sub-clusters in dendrogram order and collect, per sub-cluster, up to
# 20 of its 30 candidates that no earlier sub-cluster has already contributed.
unique_genes = set()
genes_for_plot = []
for cluster in cluster_order:
    candidates = top_genes_per_cluster.get(cluster, ())
    fresh_genes = list(dict.fromkeys(g for g in candidates if g not in unique_genes))[:20]
    unique_genes.update(fresh_genes)
    genes_for_plot.extend(fresh_genes)

# Dot plot with genes on the rows (swap_axes=True), each gene scaled to [0, 1]
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='V_leiden2',
    use_raw=False,
    dendrogram=True,
    standard_scale='var',
    cmap="vlag",
    swap_axes=True,
)

In [None]:
import os

import matplotlib.pyplot as plt  # was commented out although plt.* is used below
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# --- Parameters -----------------------------------------------------------
sample_id = 's2r2_HV184'
cluster_col = 'V_leiden2'
rotation_angle = 30  # degrees; rotation applied to this sample's centroids
num_rows = 3

# Output PDF. The name is specific to this figure so the other spatial-panel
# cells (which previously all wrote 'all_clusters_4rows.pdf') cannot clobber it.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
pdf_filename = os.path.join(save_directory, f'{sample_id}_Art_cap_leiden_clusters.pdf')

# --- Data -----------------------------------------------------------------
adata_sample = adata[adata.obs['sample'] == sample_id]
cluster_labels = adata_sample.obs[cluster_col]

# Labels starting with 'Art' that are present in this sample, in sorted order
unique_clusters = cluster_labels[cluster_labels.str.startswith("Art", na=False)].unique()
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f"No 'Art*' clusters found in '{cluster_col}' for sample {sample_id}")

num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by `rotation_angle`
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# Ratio of the rotated extents, passed to set_aspect() below.
# NOTE(review): a numeric aspect scales y-units relative to x-units; use
# 'equal' instead if undistorted tissue geometry is wanted - confirm intent.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

# --- Figure: one panel per cluster (cluster in red over all other cells in grey)
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 16), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cluster_labels == cluster  # computed once per panel

        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Background: every cell not in the current cluster
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Foreground: cells of the current cluster
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Cluster name near the bottom edge of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the grid positions that have no cluster
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Title in black: the previous white text was invisible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Actually write the page; without this call the PdfPages file contains no figure.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)


In [None]:
# Size of each 'VEC.art1' sub-cluster, and how many sub-clusters there are
art_leiden_ids = v_subset2.obs['leiden']
print(art_leiden_ids.value_counts())
print(f"Number of unique clusters: {art_leiden_ids.nunique()}")

In [None]:
# Annotation for selected 'Art-cap-<id>' sub-clusters; three of them are merged into 'VEC.cap'
replacement_dict = {
    'Art-cap-3': 'VEC.art3',
    'Art-cap-0': 'VEC.cap',
    'Art-cap-1': 'VEC.cap',
    'Art-cap-5': 'VEC.cap'
}

# Cast 'V_leiden2' to plain strings and rename the sub-clusters listed above;
# any label that is not a key of the dict is left as it is.
adata.obs['V_leiden2'] = adata.obs['V_leiden2'].astype(str).replace(replacement_dict)

# Confirm the labels now present in 'V_leiden2'
print(adata.obs['V_leiden2'].unique())


In [None]:
# Use a plain object column for 'V_leiden' so arbitrary label strings can be written into it
adata.obs['V_leiden'] = adata.obs['V_leiden'].astype('object')

# Cells whose 'V_leiden2' label is one of the refined annotations
condition = adata.obs['V_leiden2'].isin(['VEC.cap', 'VEC.art3'])

# For those cells, overwrite the 'V_leiden' label with the 'V_leiden2' label
refined_labels = adata.obs.loc[condition, 'V_leiden2']
adata.obs.loc[condition, 'V_leiden'] = refined_labels


In [None]:
# Confirm the labels present in 'V_leiden' after merging in the 'V_leiden2' annotations
v_leiden_labels = adata.obs['V_leiden'].unique()
print(v_leiden_labels)

In [None]:
import os

import matplotlib.pyplot as plt  # was commented out although plt.* is used below
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# --- Parameters -----------------------------------------------------------
sample_id = 's2r2_HV184'
cluster_col = 'V_leiden'
rotation_angle = 30  # degrees; rotation applied to this sample's centroids
num_rows = 6
# Annotated vascular labels to draw (the '*.Mix' labels are not in this list)
labels_to_plot = ['VEC.vein1', 'VEC.vein2', 'VEC.art2', 'VEC.cap', 'SMC', 'LEC', 'VEC.art1', 'VEC.art3']

# Output PDF. The name is specific to this figure so the other spatial-panel
# cells (which previously all wrote 'all_clusters_4rows.pdf') cannot clobber it.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
pdf_filename = os.path.join(save_directory, f'{sample_id}_vascular_annotated_clusters.pdf')

# --- Data -----------------------------------------------------------------
adata_sample = adata[adata.obs['sample'] == sample_id]
cluster_labels = adata_sample.obs[cluster_col]

# Requested labels that are present in this sample, in sorted order
unique_clusters = cluster_labels[cluster_labels.isin(labels_to_plot)].unique()
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f"None of the requested labels found in '{cluster_col}' for sample {sample_id}")

num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by `rotation_angle`
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# Ratio of the rotated extents, passed to set_aspect() below.
# NOTE(review): a numeric aspect scales y-units relative to x-units; use
# 'equal' instead if undistorted tissue geometry is wanted - confirm intent.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

# --- Figure: one panel per label (label in red over all other cells in grey)
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cluster_labels == cluster  # computed once per panel

        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Background: every cell not carrying the current label
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Foreground: cells carrying the current label
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Label name near the bottom edge of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the grid positions that have no label
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Title in black: the previous white text was invisible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Actually write the page; without this call the PdfPages file contains no figure.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)


In [None]:
import scanpy as sc

# Make cell names unique on the full object before subsetting
adata.obs_names_make_unique()

# Cells carrying one of the annotated vascular labels in 'V_leiden'
vascular_labels = ['VEC.vein1', 'VEC.vein2', 'VEC.art2', 'VEC.cap', 'SMC', 'LEC', 'VEC.art1', 'VEC.art3']
adata_subset = adata[adata.obs['V_leiden'].isin(vascular_labels)].copy()

# log1p is applied to the copy only; `adata` is not modified.
# NOTE(review): no normalize_total here, unlike the Leiden clustering cells -
# confirm whether adata.X holds raw counts at this point.
sc.pp.log1p(adata_subset)

# Wilcoxon rank-sum markers per 'V_leiden' label, computed on .X (use_raw=False)
sc.tl.rank_genes_groups(adata_subset, 'V_leiden', method='wilcoxon', use_raw=False)

# First 30 ranked genes for every label
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:30]
    for cluster in adata_subset.obs['V_leiden'].cat.categories
}

# Dendrogram over the labels; its leaf order drives the gene order below
sc.tl.dendrogram(adata_subset, groupby='V_leiden')
cluster_order = adata_subset.uns['dendrogram_V_leiden']['categories_ordered']

# Walk the labels in dendrogram order and collect, per label, up to 20 of its
# 30 candidates that no earlier label has already contributed.
unique_genes = set()
genes_for_plot = []
for cluster in cluster_order:
    candidates = top_genes_per_cluster.get(cluster, ())
    fresh_genes = list(dict.fromkeys(g for g in candidates if g not in unique_genes))[:20]
    unique_genes.update(fresh_genes)
    genes_for_plot.extend(fresh_genes)

# Dot plot with genes on the rows (swap_axes=True), each gene scaled to [0, 1]
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='V_leiden',
    use_raw=False,
    dendrogram=True,
    standard_scale='var',
    cmap="vlag",
    swap_axes=True,
)

In [None]:
# Number of cells per vascular annotation in 'V_leiden' (including the two '*.Mix' labels)
vascular_labels = [
    'VEC.vein1', 'VEC.vein2', 'VEC.art2', 'VEC.cap', 'SMC', 'LEC', 'VEC.art1',
    'VEC.art3', 'VEC-B.Mix', 'VEC-Epi.Mix',
]
has_vascular_label = adata.obs['V_leiden'].isin(vascular_labels)
category_counts = adata.obs.loc[has_vascular_label, 'V_leiden'].value_counts()

print(category_counts)

In [None]:
import pandas as pd

# Vascular annotations to copy from 'V_leiden' into 'Lvl4'
values_to_transfer = [
    'VEC.vein1', 'VEC.vein2', 'VEC.art2', 'VEC.cap', 'SMC', 'LEC', 'VEC.art1',
    'VEC.art3', 'VEC-B.Mix', 'VEC-Epi.Mix'
]

# Make sure 'Lvl4' is categorical.
# (isinstance on the dtype replaces the deprecated pd.api.types.is_categorical_dtype.)
if not isinstance(adata.obs['Lvl4'].dtype, pd.CategoricalDtype):
    adata.obs['Lvl4'] = adata.obs['Lvl4'].astype('category')

# Register the labels 'Lvl4' does not know yet; writing an unknown label into
# a categorical column would otherwise fail.
# - add_categories() returns a new Series; its `inplace=` argument no longer
#   exists in pandas >= 2.0, so the result is assigned back explicitly.
# - A list (in the order of values_to_transfer) keeps the category order
#   deterministic, unlike the set difference used before.
existing_categories = set(adata.obs['Lvl4'].cat.categories)
new_categories = [label for label in values_to_transfer if label not in existing_categories]
if new_categories:
    adata.obs['Lvl4'] = adata.obs['Lvl4'].cat.add_categories(new_categories)

# Copy the label for exactly those cells whose 'V_leiden' is one of the listed
# values; every other cell keeps its current 'Lvl4'.
# astype(str) makes the right-hand side plain strings, so the assignment also
# works when 'V_leiden' is itself categorical with a different category set.
transfer_mask = adata.obs['V_leiden'].isin(values_to_transfer)
adata.obs.loc[transfer_mask, 'Lvl4'] = adata.obs.loc[transfer_mask, 'V_leiden'].astype(str)

In [None]:
# Number of cells per vascular annotation in 'Lvl4' (check after the transfer from 'V_leiden')
vascular_labels = [
    'VEC.vein1', 'VEC.vein2', 'VEC.art2', 'VEC.cap', 'SMC', 'LEC', 'VEC.art1',
    'VEC.art3', 'VEC-B.Mix', 'VEC-Epi.Mix',
]
has_vascular_label = adata.obs['Lvl4'].isin(vascular_labels)
category_counts = adata.obs.loc[has_vascular_label, 'Lvl4'].value_counts()

print(category_counts)

In [None]:
# Save after the vascular annotations were transferred into 'Lvl4'.
# NOTE: this is the same path `adata` was read from at the top of the notebook,
# so the input file is overwritten in place.
processed_h5ad_path = "/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad"
adata.write_h5ad(processed_h5ad_path)

In [None]:
import scanpy as sc

# Make cell names unique on the full object before subsetting
adata.obs_names_make_unique()

# Cells annotated with one of the fibroblast / myofibroblast 'Lvl5' labels
fibroblast_lvl5 = ['Fib.1', 'Fib.2', 'Fib.5', 'Fib.3', 'Fib.4', 'MyoF']
adata_subset = adata[adata.obs['Lvl5'].isin(fibroblast_lvl5)].copy()

# log1p is applied to the copy only; `adata` is not modified.
# NOTE(review): no normalize_total here, unlike the Leiden clustering cells -
# confirm whether adata.X holds raw counts at this point.
sc.pp.log1p(adata_subset)

# Wilcoxon rank-sum markers per 'Lvl5' group, computed on .X (use_raw=False)
sc.tl.rank_genes_groups(adata_subset, 'Lvl5', method='wilcoxon', use_raw=False)

# First 20 ranked genes for every group
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:20]
    for cluster in adata_subset.obs['Lvl5'].cat.categories
}

# Dendrogram over the groups; its leaf order drives the gene order below
sc.tl.dendrogram(adata_subset, groupby='Lvl5')
cluster_order = adata_subset.uns['dendrogram_Lvl5']['categories_ordered']

# Walk the groups in dendrogram order and collect, per group, up to 20 genes
# that no earlier group has already contributed (each gene is plotted once).
unique_genes = set()
genes_for_plot = []
for cluster in cluster_order:
    candidates = top_genes_per_cluster.get(cluster, ())
    fresh_genes = list(dict.fromkeys(g for g in candidates if g not in unique_genes))[:20]
    unique_genes.update(fresh_genes)
    genes_for_plot.extend(fresh_genes)

# Dot plot with genes on the rows (swap_axes=True), each gene scaled to [0, 1]
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl5',
    use_raw=False,
    dendrogram=True,
    standard_scale='var',
    cmap="vlag",
    swap_axes=True,
)

In [None]:
import os

import matplotlib.pyplot as plt  # was commented out although plt.* is used below
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# --- Parameters -----------------------------------------------------------
sample_id = 's2r2_HV184'
cluster_col = 'Lvl5'
rotation_angle = 30  # degrees; rotation applied to this sample's centroids
num_rows = 3
# Fibroblast labels to draw.
# NOTE(review): 'Fib.5' is included in the Lvl5 marker dot plot cell but not
# here - confirm whether the omission is intentional.
labels_to_plot = ['Fib.1', 'Fib.2', 'Fib.3', 'Fib.4', 'MyoF']

# Output PDF. The name is specific to this figure so the other spatial-panel
# cells (which previously all wrote 'all_clusters_4rows.pdf') cannot clobber it.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
pdf_filename = os.path.join(save_directory, f'{sample_id}_fibroblast_Lvl5_clusters.pdf')

# --- Data -----------------------------------------------------------------
adata_sample = adata[adata.obs['sample'] == sample_id]
cluster_labels = adata_sample.obs[cluster_col]

# Requested labels that are present in this sample, in sorted order
unique_clusters = cluster_labels[cluster_labels.isin(labels_to_plot)].unique()
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f"None of the requested labels found in '{cluster_col}' for sample {sample_id}")

num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

# Rotate the cell centroids by `rotation_angle`
angle = np.deg2rad(rotation_angle)
x_coords = adata_sample.obs['x_centroid']
y_coords = adata_sample.obs['y_centroid']
new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

# Ratio of the rotated extents, passed to set_aspect() below.
# NOTE(review): a numeric aspect scales y-units relative to x-units; use
# 'equal' instead if undistorted tissue geometry is wanted - confirm intent.
x_range = new_x_coords.max() - new_x_coords.min()
y_range = new_y_coords.max() - new_y_coords.min()
aspect_ratio = x_range / y_range

# --- Figure: one panel per label (label in red over all other cells in grey)
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 16), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    for ax, cluster in zip(axes, ordered_clusters):
        in_cluster = cluster_labels == cluster  # computed once per panel

        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Background: every cell not carrying the current label
        ax.scatter(x=new_x_coords[~in_cluster], y=new_y_coords[~in_cluster], c='#C0C0C0', s=3)
        # Foreground: cells carrying the current label
        ax.scatter(x=new_x_coords[in_cluster], y=new_y_coords[in_cluster], c='red', s=9)

        ax.set_aspect(aspect_ratio)

        # Label name near the bottom edge of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the grid positions that have no label
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.1, hspace=0.05)

    # Title in black: the previous white text was invisible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Actually write the page; without this call the PdfPages file contains no figure.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the saved page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)


In [None]:
import numpy as np
import scanpy as sc

# Cells labelled 'Fib.2' are sub-clustered on their own.
# NOTE(review): the selection reads 'Lvl4', while the fibroblast dot plot and
# spatial cells read the 'Fib.*' labels from 'Lvl5' - confirm the intended column.
f_cells = adata.obs['Lvl4'].isin(['Fib.2'])
f_subset = adata[f_cells].copy()  # independent copy: preprocessing below must not touch `adata`

# Library-size normalisation to 1e4 counts per cell, then log1p
np.random.seed(42)
sc.pp.normalize_total(f_subset, target_sum=1e4)
sc.pp.log1p(f_subset)

# kNN graph built directly on .X (use_rep='X'), then Leiden clustering; both seeded
sc.pp.neighbors(f_subset, n_neighbors=10, use_rep='X', random_state=42)
sc.tl.leiden(f_subset, resolution=0.5, random_state=42)

# Write the sub-cluster ids back to the full object as 'Fib2-<id>'.
# Cells outside the subset keep the literal string 'NaN' (a placeholder, not a missing value).
adata.obs['F_leiden'] = 'NaN'
adata.obs.loc[f_subset.obs_names, 'F_leiden'] = 'Fib2-' + f_subset.obs['leiden'].astype(str)

In [None]:
import numpy as np
import scanpy as sc

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Keep only cells that received a Fib.2 sub-cluster label. Cells outside
# the Fib.2 subset carry the placeholder string 'NaN', which does not start with "F".
adata_subset = adata[adata.obs['F_leiden'].str.startswith("F")].copy()

# Step 3: Log-transform the copy.
# NOTE(review): assumes adata.X has not already been log-transformed — confirm.
sc.pp.log1p(adata_subset)

# Step 4: Rank genes per F_leiden cluster with the Wilcoxon method (on .X, not .raw)
sc.tl.rank_genes_groups(adata_subset, 'F_leiden', method='wilcoxon', use_raw=False)

# Step 5: Keep the 50 best-ranked gene names per cluster as candidates
# (at most 20 of them are plotted per cluster, see Step 9).
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster][:50]
    for cluster in adata_subset.obs['F_leiden'].cat.categories
}

# Step 6: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='F_leiden')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_F_leiden']['categories_ordered']

# Step 8: Collect every gene detected (value > 0) in more than 20% of the cells
# of at least one cluster.
filtered_genes = set()
for cluster in cluster_order:
    cluster_data = adata_subset[adata_subset.obs['F_leiden'] == cluster]
    # np.asarray(...).ravel() flattens the per-gene fraction whether .mean()
    # returned an np.matrix (scipy sparse matrix) or a plain ndarray (dense X /
    # sparse array); the previous `.A1` only worked in the np.matrix case.
    gene_expression_fraction = np.asarray((cluster_data.X > 0).mean(axis=0)).ravel()
    genes_above_threshold = cluster_data.var_names[gene_expression_fraction > 0.2]
    filtered_genes.update(genes_above_threshold)

# Step 9: Walk the clusters in dendrogram order and take, per cluster, up to 20
# candidate genes that pass the expression filter and were not already claimed
# by an earlier cluster.
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= 20:
                break
            if gene in filtered_genes and gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 10: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='F_leiden',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)


In [None]:
import pandas as pd

# Fib.2 sub-cluster labels (from 'F_leiden') that should overwrite 'Lvl5'.
values_to_transfer = [
    'Fib2-1', 'Fib2-2', 'Fib2-0'
]

# Ensure 'Lvl5' is categorical. The isinstance check replaces the deprecated
# pd.api.types.is_categorical_dtype.
if not isinstance(adata.obs['Lvl5'].dtype, pd.CategoricalDtype):
    adata.obs['Lvl5'] = adata.obs['Lvl5'].astype('category')

# Register labels that 'Lvl5' does not know yet. add_categories returns a new
# Series (its `inplace` argument no longer exists in pandas >= 2.0), so the
# result is assigned back. A list keeps the category order deterministic, and
# dict.fromkeys drops accidental duplicates, which add_categories would reject.
existing_categories = set(adata.obs['Lvl5'].cat.categories)
new_categories = [
    label for label in dict.fromkeys(values_to_transfer)
    if label not in existing_categories
]
if new_categories:
    adata.obs['Lvl5'] = adata.obs['Lvl5'].cat.add_categories(new_categories)

# Copy 'F_leiden' into 'Lvl5' only for the rows carrying one of the listed labels.
# astype(object) hands plain strings to the categorical column, so the assignment
# also works if 'F_leiden' is itself categorical with a different category set.
transfer_mask = adata.obs['F_leiden'].isin(values_to_transfer)
adata.obs.loc[transfer_mask, 'Lvl5'] = adata.obs.loc[transfer_mask, 'F_leiden'].astype(object)


In [None]:
import scanpy as sc

# Guarantee unique cell identifiers before slicing.
adata.obs_names_make_unique()

# Independent copy restricted to the fibroblast-type Lvl5 labels, including the
# provisional 'Fib2-*' sub-clusters.
adata_subset = adata[
    adata.obs['Lvl5'].isin(['Fib.1', 'Fib2-0', 'Fib2-1', 'Fib2-2', 'Fib.3', 'Fib.4', 'MyoF'])
].copy()

# log1p is applied to the copy only.
# NOTE(review): assumes adata.X has not been log-transformed already — confirm.
sc.pp.log1p(adata_subset)

# Rank genes for each Lvl5 group with the Wilcoxon method, using .X (use_raw=False).
sc.tl.rank_genes_groups(adata_subset, groupby='Lvl5', use_raw=False, method='wilcoxon')

# Keep the 20 best-ranked gene names of every group.
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    group: ranked_names[group][:20]
    for group in adata_subset.obs['Lvl5'].cat.categories
}

# Dendrogram over the groups; its leaf order drives the gene order below.
sc.tl.dendrogram(adata_subset, groupby='Lvl5')
cluster_order = adata_subset.uns['dendrogram_Lvl5']['categories_ordered']

# Concatenate each group's top genes in dendrogram order, skipping genes that an
# earlier group already contributed. With 20 candidates per group, no group can
# add more than 20 genes.
unique_genes = set()
genes_for_plot = []
for group in cluster_order:
    for gene in top_genes_per_cluster.get(group, ()):
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_plot.append(gene)

# Dot plot of the collected genes; swap_axes=True puts genes on the rows.
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl5',
    use_raw=False,
    standard_scale='var',
    cmap="vlag",
    dendrogram=True,
    swap_axes=True,
)

In [None]:
# Collapse the provisional Fib.2 sub-cluster labels stored in 'Lvl5':
# 'Fib2-1' and 'Fib2-2' go back to 'Fib.2', 'Fib2-0' becomes 'Fib.5'.
replacement_dict = {
    'Fib2-0': 'Fib.5',
    'Fib2-1': 'Fib.2',
    'Fib2-2': 'Fib.2',
}

# Cast to plain strings, then substitute exact label matches; every other label
# is left as it is.
adata.obs['Lvl5'] = adata.obs['Lvl5'].astype(str).replace(replacement_dict)

# Show the resulting label set as a sanity check.
print(adata.obs['Lvl5'].unique())


In [None]:
# matplotlib is imported here (it was commented out) because this cell uses `plt`
# and must not depend on an earlier cell having imported it.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


rotation_angle = 30  # Rotation (degrees) applied to the centroid coordinates of this sample

# Output location of the PDF.
# NOTE(review): the file name says "4rows" while num_rows is 3, and other plotting
# cells in this notebook write to the same path — confirm the intended name.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')

# Filter the data for the specific sample
adata_sample = adata[adata.obs['sample'] == 's2r2_HV184']

# Fibroblast-type Lvl5 labels that actually occur in this sample
unique_clusters = adata_sample.obs.loc[
    adata_sample.obs['Lvl5'].isin(['Fib.1', 'Fib.2', 'Fib.3', 'Fib.4', 'Fib.5', 'MyoF']),
    'Lvl5'
].unique()


# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)

# Grid layout: fixed number of rows, as many columns as needed. At least one
# column is kept so plt.subplots does not fail when no cluster is present.
num_rows = 3
num_cols = max(1, int(np.ceil(len(ordered_clusters) / num_rows)))

# Open the PDF; the figure is added as a page just before it is shown.
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array of axes, so flatten() is safe
    # for any grid size.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 16), squeeze=False)
    fig.patch.set_facecolor('white')

    # Flatten the axes array for easier indexing
    axes = axes.flatten()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Rotate the coordinates by `rotation_angle` around the origin
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Ratio of the rotated x extent to the rotated y extent.
    # NOTE(review): passing this ratio to set_aspect looks like it yields square
    # panels rather than equal data units on both axes — confirm this is intended.
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # One panel per cluster: all other cells in grey, the cluster's cells in red
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]

        # Set white background for the subplot
        ax.set_facecolor('white')

        # Remove the outline
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Membership mask computed once per panel instead of four times
        in_cluster = adata_sample.obs['Lvl5'] == cluster

        # Grey dots: every cell that is not in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red dots: the current cluster, drawn on top
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Cluster name near the bottom edge of the panel (axes coordinates)
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids
        ax.grid(False)

        # Remove ticks and their labels
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # File name for the optional TIFF export (the export itself is disabled below)
    tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    plt.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05    # Height space between rows
    )

    # Overall figure title.
    # NOTE(review): white text on a white figure, placed at y=1.02 — the title is
    # effectively invisible; confirm whether it should be shown.
    plt.suptitle('All Clusters in s2r2_HV184', color='white', fontsize=20, weight='bold', y=1.02)

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    # Add the figure as a page of the PDF. Without this call no figure ever
    # reached the PdfPages object opened above.
    pdf.savefig(fig)

    plt.show()
    plt.close(fig)


In [None]:
import scanpy as sc

# Guarantee unique cell identifiers before slicing.
adata.obs_names_make_unique()

# Independent copy restricted to the final fibroblast-type Lvl5 labels.
adata_subset = adata[
    adata.obs['Lvl5'].isin(['Fib.1', 'Fib.2', 'Fib.5', 'Fib.3', 'Fib.4', 'MyoF'])
].copy()

# log1p is applied to the copy only.
# NOTE(review): assumes adata.X has not been log-transformed already — confirm.
sc.pp.log1p(adata_subset)

# Rank genes for each Lvl5 group with the Wilcoxon method, using .X (use_raw=False).
sc.tl.rank_genes_groups(adata_subset, groupby='Lvl5', use_raw=False, method='wilcoxon')

# Keep the 20 best-ranked gene names of every group.
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    group: ranked_names[group][:20]
    for group in adata_subset.obs['Lvl5'].cat.categories
}

# Dendrogram over the groups; its leaf order drives the gene order below.
sc.tl.dendrogram(adata_subset, groupby='Lvl5')
cluster_order = adata_subset.uns['dendrogram_Lvl5']['categories_ordered']

# Concatenate each group's top genes in dendrogram order, skipping genes that an
# earlier group already contributed (at most 20 genes per group).
unique_genes = set()
genes_for_plot = []
for group in cluster_order:
    for gene in top_genes_per_cluster.get(group, ()):
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_plot.append(gene)

# Dot plot of the collected genes; swap_axes=True puts genes on the rows.
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl5',
    use_raw=False,
    standard_scale='var',
    cmap="vlag",
    dendrogram=True,
    swap_axes=True,
)

In [None]:
# Persist the annotated object after the fibroblast relabelling of 'Lvl5' above.
# NOTE(review): this overwrites the same .h5ad file that the notebook loads in its
# first cell — confirm that overwriting the input in place is intended.
adata.write_h5ad("/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad")

In [None]:
import scanpy as sc

# Guarantee unique cell identifiers before slicing.
adata.obs_names_make_unique()

# Independent copy restricted to the DC / macrophage / Langerhans-type Lvl5 labels.
adata_subset = adata[
    adata.obs['Lvl5'].isin([
        'DC.1', 'DC.2', 'Mac.1', 'Mac.2', 'Mac.3', 'Mac.Neut.Mix',
        'Lang', 'Lang.prol', 'Oth.IRF8hi-MZB1hi',
    ])
].copy()

# log1p is applied to the copy only.
# NOTE(review): assumes adata.X has not been log-transformed already — confirm.
sc.pp.log1p(adata_subset)

# Rank genes for each Lvl5 group with the Wilcoxon method, using .X (use_raw=False).
sc.tl.rank_genes_groups(adata_subset, groupby='Lvl5', use_raw=False, method='wilcoxon')

# Keep the 20 best-ranked gene names of every group.
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    group: ranked_names[group][:20]
    for group in adata_subset.obs['Lvl5'].cat.categories
}

# Dendrogram over the groups; its leaf order drives the gene order below.
sc.tl.dendrogram(adata_subset, groupby='Lvl5')
cluster_order = adata_subset.uns['dendrogram_Lvl5']['categories_ordered']

# Concatenate each group's top genes in dendrogram order, skipping genes that an
# earlier group already contributed (at most 20 genes per group).
unique_genes = set()
genes_for_plot = []
for group in cluster_order:
    for gene in top_genes_per_cluster.get(group, ()):
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_plot.append(gene)

# Dot plot of the collected genes; swap_axes=True puts genes on the rows.
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl5',
    use_raw=False,
    standard_scale='var',
    cmap="vlag",
    dendrogram=True,
    swap_axes=True,
)

In [None]:
# Rename the provisional myeloid labels in 'Lvl5' to their final names.
replacement_dict = {
    'DC.1': 'mregDC',
    'DC.2': 'cDC2',
    'Mac.1': 'cDC1',
    'Mac.2': 'Mac',
    'Mac.3': 'Mono',
    'Oth.IRF8hi-MZB1hi': 'pDC',
}

# Cast to plain strings, then substitute exact label matches; labels that are not
# keys of the dictionary stay unchanged.
adata.obs['Lvl5'] = adata.obs['Lvl5'].astype(str).replace(replacement_dict)

# Show the resulting label set as a sanity check.
print(adata.obs['Lvl5'].unique())


In [None]:
# Persist the annotated object after the myeloid relabelling of 'Lvl5' above.
# NOTE(review): this overwrites the same .h5ad file that the notebook loads in its
# first cell — confirm that overwriting the input in place is intended.
adata.write_h5ad("/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad")

In [None]:
import scanpy as sc

# Guarantee unique cell identifiers before slicing.
adata.obs_names_make_unique()

# Independent copy restricted to the plasma ('Pl.*') and 'B' labels of Lvl4.
adata_subset = adata[
    adata.obs['Lvl4'].isin(['Pl.3', 'Pl.1', 'Pl.prol', 'Pl.2', 'Pl.4', 'B'])
].copy()

# log1p is applied to the copy only.
# NOTE(review): assumes adata.X has not been log-transformed already — confirm.
sc.pp.log1p(adata_subset)

# Rank genes for each Lvl4 group with the Wilcoxon method, using .X (use_raw=False).
sc.tl.rank_genes_groups(adata_subset, groupby='Lvl4', use_raw=False, method='wilcoxon')

# Keep the 20 best-ranked gene names of every group.
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    group: ranked_names[group][:20]
    for group in adata_subset.obs['Lvl4'].cat.categories
}

# Dendrogram over the groups; its leaf order drives the gene order below.
sc.tl.dendrogram(adata_subset, groupby='Lvl4')
cluster_order = adata_subset.uns['dendrogram_Lvl4']['categories_ordered']

# Concatenate each group's top genes in dendrogram order, skipping genes that an
# earlier group already contributed (at most 20 genes per group).
unique_genes = set()
genes_for_plot = []
for group in cluster_order:
    for gene in top_genes_per_cluster.get(group, ()):
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_plot.append(gene)

# Dot plot of the collected genes; swap_axes=True puts genes on the rows.
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl4',
    use_raw=False,
    standard_scale='var',
    cmap="vlag",
    dendrogram=True,
    swap_axes=True,
)

In [None]:
# Lvl4 labels whose cell counts should be reported.
desired_categories = ['Pl.3', 'Pl.1', 'Pl.prol', 'Pl.2', 'Pl.4', 'B']

# Count cells per label among the rows carrying one of the desired labels.
is_desired = adata.obs['Lvl4'].isin(desired_categories)
counts_per_label = adata.obs.loc[is_desired, 'Lvl4'].value_counts()

# Restrict the table to the desired labels (a label with no cells is reported as
# 0), then order it from most to least frequent.
category_counts = (
    counts_per_label
    .reindex(desired_categories, fill_value=0)
    .sort_values(ascending=False)
)

print(category_counts)


In [None]:
import numpy as np
import scanpy as sc

# --- Sub-cluster the Lvl4 plasma / B populations with Leiden -----------------
# Boolean selector over all cells. 'Pl.prol' is not part of this selection.
b_cells = adata.obs['Lvl4'].isin(['Pl.3', 'Pl.1', 'Pl.2', 'Pl.4', 'B'])
# Independent copy, so the preprocessing below never touches `adata` itself.
b_subset = adata[b_cells].copy()

# Preprocess the copy: per-cell total-count normalisation, then log1p.
np.random.seed(42)
sc.pp.normalize_total(b_subset, target_sum=1e4)
sc.pp.log1p(b_subset)

# Neighbourhood graph computed directly on the expression matrix (use_rep='X'),
# followed by Leiden community detection on that graph.
sc.pp.neighbors(b_subset, use_rep='X', n_neighbors=10, random_state=42)
sc.tl.leiden(b_subset, random_state=42, resolution=0.8)

# Copy the labels back to the full object as 'B-plasma-<cluster>'. Every cell
# that was not part of the subset keeps the literal string 'NaN' as a placeholder.
b_plasma_labels = b_subset.obs['leiden'].apply(lambda label: 'B-plasma-' + str(label))
adata.obs['B_leiden'] = 'NaN'
adata.obs.loc[b_subset.obs_names, 'B_leiden'] = b_plasma_labels

In [None]:
import scanpy as sc

# Guarantee unique cell identifiers before slicing.
adata.obs_names_make_unique()

# Independent copy of the cells whose B_leiden label starts with "B-pl"; the
# 'NaN' placeholder used for all other cells does not match.
adata_subset = adata[adata.obs['B_leiden'].str.startswith("B-pl")].copy()

# log1p is applied to the copy only.
# NOTE(review): assumes adata.X has not been log-transformed already — confirm.
sc.pp.log1p(adata_subset)

# Rank genes for each B_leiden cluster with the Wilcoxon method, using .X (use_raw=False).
sc.tl.rank_genes_groups(adata_subset, groupby='B_leiden', use_raw=False, method='wilcoxon')

# Keep the 20 best-ranked gene names of every cluster.
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    group: ranked_names[group][:20]
    for group in adata_subset.obs['B_leiden'].cat.categories
}

# Dendrogram over the clusters; its leaf order drives the gene order below.
sc.tl.dendrogram(adata_subset, groupby='B_leiden')
cluster_order = adata_subset.uns['dendrogram_B_leiden']['categories_ordered']

# Concatenate each cluster's top genes in dendrogram order, skipping genes that
# an earlier cluster already contributed (at most 20 genes per cluster).
unique_genes = set()
genes_for_plot = []
for group in cluster_order:
    for gene in top_genes_per_cluster.get(group, ()):
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_plot.append(gene)

# Dot plot of the collected genes; swap_axes=True puts genes on the rows.
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='B_leiden',
    use_raw=False,
    standard_scale='var',
    cmap="vlag",
    dendrogram=True,
    swap_axes=True,
)

In [None]:
# B_leiden cluster labels whose cell counts should be reported.
# NOTE(review): the hard-coded ids 0-5 presumably mirror the Leiden result of the
# clustering cell — verify if the clustering is re-run.
desired_categories = [
    'B-plasma-1', 'B-plasma-2', 'B-plasma-3', 'B-plasma-4', 'B-plasma-0', 'B-plasma-5'
]

# Count cells per label among the rows carrying one of the desired labels.
is_desired = adata.obs['B_leiden'].isin(desired_categories)
counts_per_label = adata.obs.loc[is_desired, 'B_leiden'].value_counts()

# Restrict the table to the desired labels (a label with no cells is reported as
# 0), then order it from most to least frequent.
category_counts = (
    counts_per_label
    .reindex(desired_categories, fill_value=0)
    .sort_values(ascending=False)
)

print(category_counts)


In [None]:
import pandas as pd

# Report how many 'B-plasma-3' cells fall into each 'status' group.
# Both columns must exist in adata.obs; otherwise only a hint is printed.
if "B_leiden" in adata.obs.columns and "status" in adata.obs.columns:
    # Rows of the obs table whose B_leiden label is 'B-plasma-3'
    filtered_data = adata.obs[adata.obs["B_leiden"] == "B-plasma-3"]

    # Number of those rows per 'status' value
    counts_by_condition = filtered_data.groupby("status").size()

    # Two-column table ('status', 'counts') for readable printing
    counts_df = counts_by_condition.reset_index(name="counts")
    print(counts_df)
else:
    # The message now names the column that is actually checked ('status'); it
    # previously referred to a 'conditionID' column this cell never uses.
    print("Ensure 'B_leiden' and 'status' exist in adata.obs.")


In [None]:
# First relabelling pass on 'B_leiden': map the Leiden cluster ids to working
# names. Clusters 1, 2 and 5 are merged under 'Pl.1'.
replacement_dict = {
    'B-plasma-0': 'PB',
    'B-plasma-3': 'Pl.2',
    'B-plasma-4': 'B',
    'B-plasma-1': 'Pl.1',
    'B-plasma-2': 'Pl.1',
    'B-plasma-5': 'Pl.1',
}

# Cast to plain strings, then substitute exact label matches; other values (such
# as the 'NaN' placeholder) stay unchanged.
adata.obs['B_leiden'] = adata.obs['B_leiden'].astype(str).replace(replacement_dict)

# Show the resulting label set as a sanity check.
print(adata.obs['B_leiden'].unique())


In [None]:
# Second relabelling pass on 'B_leiden': turn the working names into final ones.
# NOTE(review): 'Pl.1' is both a key and a target value here, so running this
# cell a second time would presumably move the former 'PB' cells on to 'Pl.2.1'
# — run it exactly once after the first pass.
replacement_dict = {
    'Pl.2': 'Pl.2.2',
    'Pl.1': 'Pl.2.1',
    'PB': 'Pl.1',
}

# Cast to plain strings, then substitute exact label matches.
adata.obs['B_leiden'] = adata.obs['B_leiden'].astype(str).replace(replacement_dict)

# Show the resulting label set as a sanity check.
print(adata.obs['B_leiden'].unique())


In [None]:
import pandas as pd

# Plasma / B labels (from 'B_leiden') that should overwrite 'Lvl5'.
values_to_transfer = [
    'Pl.1', 'Pl.2.2', 'Pl.2.1', 'B'
]

# Ensure 'Lvl5' is categorical. The isinstance check replaces the deprecated
# pd.api.types.is_categorical_dtype.
if not isinstance(adata.obs['Lvl5'].dtype, pd.CategoricalDtype):
    adata.obs['Lvl5'] = adata.obs['Lvl5'].astype('category')

# Register labels that 'Lvl5' does not know yet. add_categories returns a new
# Series (its `inplace` argument no longer exists in pandas >= 2.0), so the
# result is assigned back. A list keeps the category order deterministic, and
# dict.fromkeys drops accidental duplicates, which add_categories would reject.
existing_categories = set(adata.obs['Lvl5'].cat.categories)
new_categories = [
    label for label in dict.fromkeys(values_to_transfer)
    if label not in existing_categories
]
if new_categories:
    adata.obs['Lvl5'] = adata.obs['Lvl5'].cat.add_categories(new_categories)

# Copy 'B_leiden' into 'Lvl5' only for the rows carrying one of the listed labels.
# astype(object) hands plain strings to the categorical column, so the assignment
# also works if 'B_leiden' is itself categorical with a different category set.
transfer_mask = adata.obs['B_leiden'].isin(values_to_transfer)
adata.obs.loc[transfer_mask, 'Lvl5'] = adata.obs.loc[transfer_mask, 'B_leiden'].astype(object)

In [None]:
# Rename the 'Pl.prol' label of 'Lvl5' to 'PB'.
replacement_dict = {'Pl.prol': 'PB'}

# Cast to plain strings, then substitute exact label matches; every other label
# is left as it is.
adata.obs['Lvl5'] = adata.obs['Lvl5'].astype(str).replace(replacement_dict)

# Show the resulting label set as a sanity check.
print(adata.obs['Lvl5'].unique())


In [None]:
import scanpy as sc

# Guarantee unique cell identifiers before slicing.
adata.obs_names_make_unique()

# Independent copy restricted to the final plasma / plasmablast / B labels of Lvl5.
adata_subset = adata[
    adata.obs['Lvl5'].isin(['Pl.1', 'Pl.2.2', 'PB', 'B', 'Pl.2.1'])
].copy()

# log1p is applied to the copy only.
# NOTE(review): assumes adata.X has not been log-transformed already — confirm.
sc.pp.log1p(adata_subset)

# Rank genes for each Lvl5 group with the Wilcoxon method, using .X (use_raw=False).
sc.tl.rank_genes_groups(adata_subset, groupby='Lvl5', use_raw=False, method='wilcoxon')

# Keep the 20 best-ranked gene names of every group.
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    group: ranked_names[group][:20]
    for group in adata_subset.obs['Lvl5'].cat.categories
}

# Dendrogram over the groups; its leaf order drives the gene order below.
sc.tl.dendrogram(adata_subset, groupby='Lvl5')
cluster_order = adata_subset.uns['dendrogram_Lvl5']['categories_ordered']

# Concatenate each group's top genes in dendrogram order, skipping genes that an
# earlier group already contributed (at most 20 genes per group).
unique_genes = set()
genes_for_plot = []
for group in cluster_order:
    for gene in top_genes_per_cluster.get(group, ()):
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_plot.append(gene)

# Dot plot of the collected genes; swap_axes=True puts genes on the rows.
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl5',
    use_raw=False,
    standard_scale='var',
    cmap="vlag",
    dendrogram=True,
    swap_axes=True,
)

In [None]:
# Rename 'Pl' to 'Pl.2' and 'B-Pl' to 'Pl.1' in 'Lvl5'.
# NOTE(review): the preceding cells assign 'Pl.1', 'Pl.2.1', 'Pl.2.2', 'PB' and
# 'B' to Lvl5, never 'Pl' or 'B-Pl'; this replacement may be a leftover no-op —
# confirm before relying on it.
replacement_dict = {
    'B-Pl': 'Pl.1',
    'Pl': 'Pl.2',
}

# Cast to plain strings, then substitute exact label matches; every other label
# is left as it is.
adata.obs['Lvl5'] = adata.obs['Lvl5'].astype(str).replace(replacement_dict)

# Show the resulting label set as a sanity check.
print(adata.obs['Lvl5'].unique())

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


rotation_angle = 30  # Rotation (degrees) applied to the centroid coordinates of this sample

# Output location of the PDF.
# NOTE(review): the file name says "4rows" while num_rows is 3, and other plotting
# cells in this notebook write to the same path — confirm the intended name.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')

# Filter the data for the specific sample
adata_sample = adata[adata.obs['sample'] == 's2r2_HV184']

# Lvl5 labels of this sample that start with "P" or "B"
unique_clusters = adata_sample.obs.loc[
    (adata_sample.obs['Lvl5'].str.startswith("P"))|
    (adata_sample.obs['Lvl5'].str.startswith("B")),
    'Lvl5'
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)

# Grid layout: fixed number of rows, as many columns as needed. At least one
# column is kept so plt.subplots does not fail when no cluster is present.
num_rows = 3
num_cols = max(1, int(np.ceil(len(ordered_clusters) / num_rows)))

# Open the PDF; the figure is added as a page just before it is shown.
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array of axes, so flatten() is safe
    # for any grid size.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 26), squeeze=False)
    fig.patch.set_facecolor('white')

    # Flatten the axes array for easier indexing
    axes = axes.flatten()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Rotate the coordinates by `rotation_angle` around the origin
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Ratio of the rotated x extent to the rotated y extent.
    # NOTE(review): passing this ratio to set_aspect looks like it yields square
    # panels rather than equal data units on both axes — confirm this is intended.
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # One panel per cluster: all other cells in grey, the cluster's cells in red
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]

        # Set white background for the subplot
        ax.set_facecolor('white')

        # Remove the outline
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Membership mask computed once per panel instead of four times
        in_cluster = adata_sample.obs['Lvl5'] == cluster

        # Grey dots: every cell that is not in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red dots: the current cluster, drawn on top
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Cluster name near the bottom edge of the panel (axes coordinates)
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids
        ax.grid(False)

        # Remove ticks and their labels
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # File name for the optional TIFF export (the export itself is disabled below)
    tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    plt.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05    # Height space between rows
    )

    # Overall figure title.
    # NOTE(review): white text on a white figure, placed at y=1.02 — the title is
    # effectively invisible; confirm whether it should be shown.
    plt.suptitle('All Clusters in s2r2_HV184', color='white', fontsize=20, weight='bold', y=1.02)

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    # Add the figure as a page of the PDF. Without this call no figure ever
    # reached the PdfPages object opened above.
    pdf.savefig(fig)

    plt.show()
    plt.close(fig)

In [None]:
# Persist the annotated object after the plasma / B relabelling of 'Lvl5' above.
# NOTE(review): this overwrites the same .h5ad file that the notebook loads in its
# first cell — confirm that overwriting the input in place is intended.
adata.write_h5ad("/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad")

In [None]:
import numpy as np
import scanpy as sc

# --- Sub-cluster the Lvl4 'NeuroEpi' population with Leiden ------------------
# Boolean selector over all cells; only rows whose Lvl4 label is 'NeuroEpi' pass.
Mel_cells = adata.obs['Lvl4'].isin(['NeuroEpi'])
# Independent copy, so the preprocessing below never touches `adata` itself.
Mel_subset = adata[Mel_cells].copy()

# Preprocess the copy: per-cell total-count normalisation, then log1p.
np.random.seed(42)
sc.pp.normalize_total(Mel_subset, target_sum=1e4)
sc.pp.log1p(Mel_subset)

# Neighbourhood graph computed directly on the expression matrix (use_rep='X'),
# followed by Leiden community detection on that graph.
sc.pp.neighbors(Mel_subset, use_rep='X', n_neighbors=10, random_state=42)
sc.tl.leiden(Mel_subset, random_state=42, resolution=0.3)

# Copy the labels back to the full object as 'NeuroEpi-<cluster>'. Every cell
# that was not part of the subset keeps the literal string 'NaN' as a placeholder.
neuroepi_labels = Mel_subset.obs['leiden'].apply(lambda label: 'NeuroEpi-' + str(label))
adata.obs['Mel_leiden'] = 'NaN'
adata.obs.loc[Mel_subset.obs_names, 'Mel_leiden'] = neuroepi_labels

In [None]:
import scanpy as sc

# Guarantee unique cell identifiers before slicing.
adata.obs_names_make_unique()

# Independent copy of the cells whose Mel_leiden label starts with "Ne"; the
# 'NaN' placeholder used for all other cells does not match.
adata_subset = adata[adata.obs['Mel_leiden'].str.startswith("Ne")].copy()

# log1p is applied to the copy only.
# NOTE(review): assumes adata.X has not been log-transformed already — confirm.
sc.pp.log1p(adata_subset)

# Rank genes for each Mel_leiden cluster with the Wilcoxon method, using .X (use_raw=False).
sc.tl.rank_genes_groups(adata_subset, groupby='Mel_leiden', use_raw=False, method='wilcoxon')

# Keep the 20 best-ranked gene names of every cluster.
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    group: ranked_names[group][:20]
    for group in adata_subset.obs['Mel_leiden'].cat.categories
}

# Dendrogram over the clusters; its leaf order drives the gene order below.
sc.tl.dendrogram(adata_subset, groupby='Mel_leiden')
cluster_order = adata_subset.uns['dendrogram_Mel_leiden']['categories_ordered']

# Concatenate each cluster's top genes in dendrogram order, skipping genes that
# an earlier cluster already contributed (at most 20 genes per cluster).
unique_genes = set()
genes_for_plot = []
for group in cluster_order:
    for gene in top_genes_per_cluster.get(group, ()):
        if gene in unique_genes:
            continue
        unique_genes.add(gene)
        genes_for_plot.append(gene)

# Dot plot of the collected genes; swap_axes=True puts genes on the rows.
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Mel_leiden',
    use_raw=False,
    standard_scale='var',
    cmap="vlag",
    dendrogram=True,
    swap_axes=True,
)

In [None]:
# Store 'Lvl5' as a categorical column.
adata.obs['Lvl5'] = pd.Categorical(adata.obs['Lvl5'])

# Names given to the two NeuroEpi Leiden clusters.
replacement_dict = {
    'NeuroEpi-1': 'Mel',
    'NeuroEpi-0': 'Neur',
}

# Cast 'Mel_leiden' to plain strings, then substitute exact label matches; other
# values (such as the 'NaN' placeholder) stay unchanged.
adata.obs['Mel_leiden'] = adata.obs['Mel_leiden'].astype(str).replace(replacement_dict)
print(adata.obs['Mel_leiden'].unique().tolist())

In [None]:
import pandas as pd

# Labels in 'Mel_leiden' that should overwrite the 'Lvl5' annotation
values_to_transfer = [
    'Neur', 'Mel'
]

# Ensure 'Lvl5' is categorical (dtype check that works on current pandas;
# `is_categorical_dtype` is deprecated)
if not isinstance(adata.obs['Lvl5'].dtype, pd.CategoricalDtype):
    adata.obs['Lvl5'] = adata.obs['Lvl5'].astype('category')

# Add the labels that 'Lvl5' does not know yet. `add_categories` returns a new
# object (the `inplace` argument no longer exists in pandas >= 2.0), so the
# result has to be assigned back. Sorting keeps the category order deterministic.
new_categories = set(values_to_transfer) - set(adata.obs['Lvl5'].cat.categories)
if new_categories:
    adata.obs['Lvl5'] = adata.obs['Lvl5'].cat.add_categories(sorted(new_categories))

# Transfer values: for rows whose 'Mel_leiden' is one of the selected labels,
# copy that label into 'Lvl5'
transfer_mask = adata.obs['Mel_leiden'].isin(values_to_transfer)
adata.obs.loc[transfer_mask, 'Lvl5'] = adata.obs.loc[transfer_mask, 'Mel_leiden']

In [None]:
import scanpy as sc
from sklearn.cluster import KMeans

# Step 1: Isolate the cells annotated as 'NeuroEpi' in the 'Lvl4' column
neuroepi_mask = adata.obs['Lvl4'] == 'NeuroEpi'
Mel_cells = adata[neuroepi_mask].copy()  # Copy so the full dataset is not touched

# Step 2: Keep only the gene(s) used to split the population
genes_of_interest = ['MLANA']  # Replace with your list of genes
# .copy() so that scaling below works on real data instead of an AnnData view
Mel_subset = Mel_cells[:, genes_of_interest].copy()

# Step 3: Scale the expression values before clustering
sc.pp.scale(Mel_subset)

# Step 4: Perform k-means clustering
n_clusters = 2  # Number of groups to split the NeuroEpi cells into
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # Set n_init explicitly
kmeans_labels = kmeans.fit_predict(Mel_subset.X)  # One integer label per NeuroEpi cell

# Step 5: Save the labels in 'Lvl4_kmeans_Mel'; all other cells keep the
# placeholder string 'NaN'. The boolean mask selects rows in the same order as
# the subset, so the label list lines up row by row.
# NOTE(review): the 'B-' prefix is kept because the following cell selects
# 'B-0' / 'B-1'; it looks inherited from a B-cell cell - confirm.
adata.obs['Lvl4_kmeans_Mel'] = 'NaN'
adata.obs.loc[neuroepi_mask, 'Lvl4_kmeans_Mel'] = [f'B-{i}' for i in kmeans_labels]

# Step 6: Run PCA on the full dataset for visualization (optional)
sc.tl.pca(adata)

# Step 7: Visualize the k-means labels in PCA space (optional)
sc.pl.pca(adata, color='Lvl4_kmeans_Mel')

In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to the cells labelled 'B-0' / 'B-1' in 'Lvl4_kmeans_Mel'
desired_clusters = [  
'B-0', 'B-1'
]
adata_subset = adata[adata.obs['Lvl4_kmeans_Mel'].isin(desired_clusters)].copy()  # Make a copy to avoid modifying a view

# Step 3: Log-transform the subset.
# NOTE(review): this runs unconditionally - confirm adata.X is not already log-transformed.
sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans_Mel', method='wilcoxon', use_raw=False)

# Step 5: Collect the full ranked gene list of every cluster (no truncation here;
# the per-cluster cap is applied in Step 8)
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans_Mel'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans_Mel')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans_Mel']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to 10 ranked genes
# per cluster, skipping genes already taken by an earlier cluster
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= 10:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Plot the dotplot of the selected genes (genes on rows via swap_axes)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans_Mel', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
import scanpy as sc

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the data to the cells whose 'B_leiden' label starts with "B-"
adata_subset = adata[(adata.obs['B_leiden'].str.startswith("B-"))].copy()

# Step 3: Log-transform the subset.
# NOTE(review): this runs unconditionally - confirm adata.X is not already log-transformed.
sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'B_leiden', method='wilcoxon', use_raw=False)

# Step 5: Extract top marker genes for each cluster (reference only; not used by the plot below)
top_genes_per_cluster = {}
for cluster in adata_subset.obs['B_leiden'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster][:10]  # Top 10 genes

# Step 6: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='B_leiden')

# Step 7: Retrieve the cluster order based on the dendrogram (not used by the plot below)
cluster_order = adata_subset.uns['dendrogram_B_leiden']['categories_ordered']

# Step 8: Define your custom list of genes for plotting
custom_genes = [   'ADAM28', 'APOLD1', 'AQP3', 'BANK1', 'BASP1', 'BCL2L11', 'BTNL9', 'CAV1', 'CCL5', 'CCNB2',
    'CCR6', 'CCR7', 'CD19', 'CD27', 'CD69', 'CD70', 'CD79A', 'CD83', 'CD86', 'CLECL1', 'COCH',
    'CXCR4', 'CYTIP', 'DERL3', 'DPEP1', 'DUSP2', 'DNAAF1', 'ESR1', 'FAS', 'FGL2', 'FKBP11', 
    'GLIPR1', 'GPR183', 'HLA-DMB', 'HLA-DQA2', 'HLA-DRB5', 'ICAM1', 'IGF1', 'IGHA1', 'IRF7', 
    'IRF8', 'ITGB2', 'KCNMA1', 'LY86', 'LYST', 'MDM2', 'MEF2C', 'MKI67', 'MPEG1', 'MYC', 'MZB1',
    'NLRC5', 'PCNA', 'PECAM1', 'PLCG2', 'PRDM1', 'PTPRC', 'SDC1', 'SEC11C', 'SELL', 'SERPINB9', 
    'SLAMF1', 'SLAMF7', 'SMYD2', 'SPI1', 'SPIB', 'STAT3', 'TCF4', 'TCL1A', 'THAP2', 'TENT5C',
    'TNFRSF13B', 'TNFRSF17', 'TOP2A', 'TRAC']  # Replace with your gene list

# Keep only the genes that exist in the subset; missing genes are dropped silently
genes_for_plot = [gene for gene in custom_genes if gene in adata_subset.var_names]

# Step 9: Plot the dotplot with your chosen genes
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='B_leiden',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)


In [None]:
import scanpy as sc

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Log-transform the shared `adata` in place, but only once.
# scanpy records a 'log1p' entry in `adata.uns` when it log-transforms, so the
# guard keeps this cell idempotent: re-running it no longer logs the data again.
if 'log1p' not in adata.uns:
    sc.pp.log1p(adata)

# Step 3: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata, 'niche_cc14', method='wilcoxon', use_raw=False)

# Step 4: Extract top marker genes for each niche (reference only; not used by the plot below)
top_genes_per_cluster = {}
for cluster in adata.obs['niche_cc14'].cat.categories:
    top_genes_per_cluster[cluster] = adata.uns['rank_genes_groups']['names'][cluster][:10]  # Top 10 genes

# Step 5: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata, groupby='niche_cc14')

# Step 6: Retrieve the cluster order based on the dendrogram
cluster_order = adata.uns['dendrogram_niche_cc14']['categories_ordered']

# Step 7: Define your custom list of genes for plotting
custom_genes = ['CCL19', 'CCR7', 'CXCL1']  # Replace with your gene list

# Keep only the genes present in the dataset; missing genes are dropped silently
genes_for_plot = [gene for gene in custom_genes if gene in adata.var_names]

# Step 8: Plot the dotplot with your chosen genes
sc.pl.dotplot(
    adata,
    var_names=genes_for_plot,
    groupby='niche_cc14',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)


In [None]:
# Report the size of each Leiden cluster in `b_subset` and the number of clusters.
# NOTE(review): `b_subset` is assigned by a cell further down in this notebook;
# confirm it is also defined before this point, otherwise this cell fails on a fresh kernel.
print(b_subset.obs['leiden'].value_counts())
print(f"Number of unique clusters: {b_subset.obs['leiden'].nunique()}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


rotation_angle = 30  # Rotation (degrees) applied to this sample's coordinates

# Directory and file the multi-panel PDF is written to.
# NOTE(review): another plotting cell in this notebook writes to the same file
# name; whichever runs last wins.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
pdf_filename = os.path.join(save_directory, f'all_clusters_4rows.pdf')

# Filter the data for the specific sample
sample_name = 's2r2_HV184'
adata_sample = adata[adata.obs['sample'] == sample_name]

# Clusters of this sample whose 'B_leiden' label starts with "B-pl"
unique_clusters = adata_sample.obs.loc[
    (adata_sample.obs['B_leiden'].str.startswith("B-pl")),
    'B_leiden'
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)

# Grid layout: fixed number of rows, as many columns as needed.
# At least one column so that an empty cluster list does not crash plt.subplots.
num_rows = 3
num_cols = max(1, int(np.ceil(len(ordered_clusters) / num_rows)))

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array of axes, so flatten() is safe
    # for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 26), squeeze=False)
    fig.patch.set_facecolor('white')

    # Flatten the axes array for easier indexing
    axes = axes.flatten()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Rotate the coordinates counter-clockwise by `rotation_angle`
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Ratio of the rotated x extent to the rotated y extent, passed to set_aspect below
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # One panel per cluster: the cluster in red on top of all other cells in grey
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]

        # Set white background for the subplot
        ax.set_facecolor('white')

        # Remove the outline
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Membership mask computed once per panel
        in_cluster = adata_sample.obs['B_leiden'] == cluster

        # Grey dots: every cell that is not in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red dots: the current cluster
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Add the cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids
        ax.grid(False)

        # Remove ticks and their labels
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # File name for the optional TIFF export (see the commented-out savefig below)
    tiff_filename = os.path.join(save_directory, f'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    plt.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05    # Height space between rows
    )

    # Figure title; black so it is visible on the white figure background
    plt.suptitle(f'All Clusters in {sample_name}', color='black', fontsize=20, weight='bold', y=1.02)

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    # Write the figure into the PDF. Without this call the PdfPages file stays
    # empty. Done before plt.show() because showing may close the figure.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)

In [None]:
import numpy as np
import scanpy as sc

# Step 1: Select the cells labelled 'B-plasma-3' in 'B_leiden'
# Use .isin() to filter for multiple values in a column
b_cells2 = adata.obs['B_leiden'].isin(['B-plasma-3'])
b_subset2 = adata[b_cells2].copy()  # Always use .copy() to avoid modifying views

# Step 2: Preprocess the data
np.random.seed(42)  # Ensure reproducibility
sc.pp.normalize_total(b_subset2, target_sum=1e4)  # Normalize total counts per cell
sc.pp.log1p(b_subset2)  # Log-transform the data

# Step 3: Compute neighbors and perform Leiden clustering
sc.pp.neighbors(b_subset2, n_neighbors=10, use_rep='X', random_state=42)  # Compute the neighborhood graph
sc.tl.leiden(b_subset2, resolution=0.5, random_state=42)  # Perform Leiden clustering

# Step 4: Save the sub-cluster labels in the full dataset as 'B_leiden2'
adata.obs['B_leiden2'] = 'NaN'  # Placeholder string (not a real NaN) for cells outside the subset
adata.obs.loc[b_subset2.obs_names, 'B_leiden2'] = (
    b_subset2.obs['leiden'].apply(lambda cluster: f'B-plasma-{cluster}')
)

In [None]:
import scanpy as sc

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the data to the cells whose 'B_leiden2' label starts with "B-pl"
adata_subset = adata[(adata.obs['B_leiden2'].str.startswith("B-pl"))].copy()

# Step 3: Log-transform the subset.
# NOTE(review): this runs unconditionally - confirm adata.X is not already log-transformed.
sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'B_leiden2', method='wilcoxon', use_raw=False)

# Step 5: Extract top marker genes for each cluster
top_genes_per_cluster = {}
for cluster in adata_subset.obs['B_leiden2'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster][:10]  # Top 10 genes

# Step 6: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='B_leiden2')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_B_leiden2']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and keep each cluster's top
# genes, skipping genes already taken by an earlier cluster. The cap of 20 is
# never reached here because Step 5 keeps only 10 genes per cluster.
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= 20:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Plot the dotplot of the selected genes (genes on rows via swap_axes)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='B_leiden2',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


rotation_angle = 30  # Rotation (degrees) applied to this sample's coordinates

# Directory and file the multi-panel PDF is written to.
# NOTE(review): another plotting cell in this notebook writes to the same file
# name; whichever runs last wins.
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
pdf_filename = os.path.join(save_directory, f'all_clusters_4rows.pdf')

# Filter the data for the specific sample
sample_name = 's1r2'
adata_sample = adata[adata.obs['sample'] == sample_name]

# Clusters of this sample whose 'B_leiden2' label starts with "B-pl"
unique_clusters = adata_sample.obs.loc[
    (adata_sample.obs['B_leiden2'].str.startswith("B-pl")),
    'B_leiden2'
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)

# Grid layout: fixed number of rows, as many columns as needed.
# At least one column so that an empty cluster list does not crash plt.subplots.
num_rows = 6
num_cols = max(1, int(np.ceil(len(ordered_clusters) / num_rows)))

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array of axes, so flatten() is safe
    # for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')

    # Flatten the axes array for easier indexing
    axes = axes.flatten()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Rotate the coordinates counter-clockwise by `rotation_angle`
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Ratio of the rotated x extent to the rotated y extent, passed to set_aspect below
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # One panel per cluster: the cluster in red on top of all other cells in grey
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]

        # Set white background for the subplot
        ax.set_facecolor('white')

        # Remove the outline
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Membership mask computed once per panel
        in_cluster = adata_sample.obs['B_leiden2'] == cluster

        # Grey dots: every cell that is not in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red dots: the current cluster
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Add the cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids
        ax.grid(False)

        # Remove ticks and their labels
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # File name for the optional TIFF export (see the commented-out savefig below)
    tiff_filename = os.path.join(save_directory, f'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    plt.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05    # Height space between rows
    )

    # Figure title built from the sample actually plotted (it previously named
    # a different sample); black so it is visible on the white background
    plt.suptitle(f'All Clusters in {sample_name}', color='black', fontsize=20, weight='bold', y=1.02)

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    # Write the figure into the PDF. Without this call the PdfPages file stays
    # empty. Done before plt.show() because showing may close the figure.
    # bbox_inches='tight' keeps the title (placed at y=1.02) inside the page.
    pdf.savefig(fig, bbox_inches='tight')

    plt.show()
    plt.close(fig)

In [None]:
import numpy as np
import scanpy as sc

# Step 1: 'Lvl4' annotations that make up the B / plasma compartment to recluster
b_cells = [ 'Pl.3', 'Pl.1', 'Pl.prol', 'Pl.2', 'Pl.4', 'B']

# Step 2: Gene panel the clustering is restricted to
genes_of_interest = [
    'ADAM28', 'APOLD1', 'AQP3', 'BANK1', 'BASP1', 'BCL2L11', 'BTNL9', 'CAV1', 'CCL5', 'CCNB2',
    'CCR6', 'CCR7', 'CD19', 'CD27', 'CD69', 'CD70', 'CD79A', 'CD83', 'CD86', 'CLECL1', 'COCH',
    'CXCR4', 'CYTIP', 'DERL3', 'DPEP1', 'DUSP2', 'DNAAF1', 'ESR1', 'FAS', 'FGL2', 'FKBP11', 
    'GLIPR1', 'GPR183', 'HLA-DMB', 'HLA-DQA2', 'HLA-DRB5', 'ICAM1', 'IGF1', 'IGHA1', 'IRF7', 
    'IRF8', 'ITGB2', 'KCNMA1', 'LY86', 'LYST', 'MDM2', 'MEF2C', 'MKI67', 'MPEG1', 'MYC', 'MZB1',
    'NLRC5', 'PCNA', 'PECAM1', 'PLCG2', 'PRDM1', 'PTPRC', 'SDC1', 'SEC11C', 'SELL', 'SERPINB9', 
    'SLAMF1', 'SLAMF7', 'SMYD2', 'SPI1', 'SPIB', 'STAT3', 'TCF4', 'TCL1A', 'THAP2', 'TENT5C',
    'TNFRSF13B', 'TNFRSF17', 'TOP2A', 'TRAC'
]
# .copy() turns the AnnData view into an independent object, so the in-place
# preprocessing below neither warns nor depends on implicit view copying
b_subset = adata[adata.obs['Lvl4'].isin(b_cells), genes_of_interest].copy()

# Step 3: Preprocess data
np.random.seed(42)  # Set random seed for reproducibility
# NOTE(review): totals are computed over the gene panel only, not over all genes - confirm intended
sc.pp.normalize_total(b_subset, target_sum=1e4)  # Normalize the counts
sc.pp.log1p(b_subset)  # Log-transform the data
sc.pp.scale(b_subset)  # Scale the data for clustering

# Step 4: Leiden clustering at low resolution for coarse clusters
sc.pp.neighbors(b_subset, n_neighbors=30, use_rep='X', random_state=42)  # Set random state for neighbors calculation
sc.tl.leiden(b_subset, resolution=0.2, random_state=42)

# Step 5: Create the 'Lvl4_leiden' column; cells outside the subset keep the
# placeholder string 'NaN'
adata.obs['Lvl4_leiden'] = 'NaN'

# Step 6: Store the clusters as "B-Plasma-0", "B-Plasma-1", etc.
adata.obs.loc[b_subset.obs_names, 'Lvl4_leiden'] = [
    f'B-Plasma-{cluster}' for cluster in b_subset.obs['leiden']
]

In [None]:
import scanpy as sc

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the data to the cells that received a label in 'Lvl4_leiden'
# (everything except the placeholder string 'NaN')
adata_subset = adata[adata.obs['Lvl4_leiden'] != 'NaN'].copy()  # Avoid modifying a view

# Step 3: Log-transform the subset.
# NOTE(review): this runs unconditionally - confirm adata.X is not already log-transformed.
sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_leiden', method='wilcoxon', use_raw=False)

# Step 5: Extract top marker genes for each cluster
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_leiden'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster][:10]  # Top 10 genes

# Step 6: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='Lvl4_leiden')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_leiden']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to 5 of each
# cluster's top genes, skipping genes already taken by an earlier cluster
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= 5:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Plot the dotplot of the selected genes (genes on rows via swap_axes)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl4_leiden',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
import numpy as np
import scanpy as sc

# Step 1: 'T_leiden2' clusters to pool and recluster
t_cells3 = ['T_new-0', 'T_new-1', 'T_new-2', 'T_new-3', 'T_new-4', 'T_new-5', 'T_new-6', 'T_new-7', 'T_new-8', 'T_new-9']

# Step 2: Gene panel the clustering is restricted to
T_cell_genes = [
'ADAM28', 'ANXA1', 'APOLD1', 'AQP3', 'BCL2L11', 'C1orf162', 'CCL3L1', 'CCL5', 'CD2', 'CD27', 'CD28', 'CD3D', 
    'CD4', 'CD69', 'CD8A', 'CD83', 'CD247', 'CCR6', 'CCR7', 'CYTIP', 'CTLA4', 'CTSC', 'DUSP2', 'FAS', 'FCGR3A', 
    'FGFBP2', 'FGL2', 'FKBP11', 'FOXP3', 'FXYD2', 'GEM', 'GLIPR1', 'GNLY', 'GPR183', 'GZMA', 'GZMB', 'GZMK', 
    'HAVCR2', 'ICAM1', 'IL1R2', 'IL17A', 'IL17F', 'IL23R', 'IL2RA', 'IL7R', 
    'IRF8', 'ITGA1', 'ITGB2', 'KLRB1', 'KLRC1', 'KLRD1', 'KIT', 'LAG3', 'LYST', 'MDM2', 'MKI67', 'NKG7', 'NLRC5', 
    'PDCD1', 'PLCG2', 'PPARG', 'PRF1', 'RGS16', 'RTKN2', 'SELL', 'SH2D3C', 'SLAMF1', 
    'SLAMF7', 'SMYD2', 'SOX4', 'STAT3', 'TNFRSF9', 'TRAC'

]
# .copy() turns the AnnData view into an independent object, so the in-place
# preprocessing below neither warns nor depends on implicit view copying
t_subset3 = adata[adata.obs['T_leiden2'].isin(t_cells3), T_cell_genes].copy()

# Step 3: Preprocess data
np.random.seed(42)  # Set random seed for reproducibility
# NOTE(review): totals are computed over the gene panel only, not over all genes - confirm intended
sc.pp.normalize_total(t_subset3, target_sum=1e4)  # Normalize the counts
sc.pp.log1p(t_subset3)  # Log-transform the data
sc.pp.scale(t_subset3)  # Scale the data for clustering

# Step 4: Leiden clustering at low resolution for coarse clusters
sc.pp.neighbors(t_subset3, n_neighbors=30, use_rep='X', random_state=42)  # Set random state for neighbors calculation
sc.tl.leiden(t_subset3, resolution=0.1, random_state=42)

# Step 5: Create the 'T_leiden3' column; cells outside the subset keep the
# placeholder string 'NaN'
adata.obs['T_leiden3'] = 'NaN'

# Step 6: Store the clusters as "T_new-0", "T_new-1", etc.
adata.obs.loc[t_subset3.obs_names, 'T_leiden3'] = [
    f'T_new-{cluster}' for cluster in t_subset3.obs['leiden']
]

In [None]:
# Convert t_subset3.X to a dense array when it exposes a sparse-style API.
# Objects with .toarray() (e.g. scipy.sparse matrices) take the first branch.
if hasattr(t_subset3.X, 'toarray'):
    t_subset3.X = t_subset3.X.toarray()
# Fallback for matrix-like objects that expose `.A` but not `.toarray()`
elif hasattr(t_subset3.X, 'A'):
    t_subset3.X = t_subset3.X.A


In [None]:
# Sanity check: print a warning if the expression matrix contains any NaN.
# Nothing is printed when the matrix is clean.
if np.any(np.isnan(t_subset3.X)):
    print("Check for NaN values in the data!")

In [None]:
import scanpy as sc

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the data to the cells that received a label in 'T_leiden3'
# (everything except the placeholder string 'NaN')
adata_subset = adata[adata.obs['T_leiden3'] != 'NaN'].copy()  # Avoid modifying a view

# Step 3: Log-transform the subset.
# NOTE(review): this runs unconditionally - confirm adata.X is not already log-transformed.
sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'T_leiden3', method='wilcoxon', use_raw=False)

# Step 5: Extract top marker genes for each cluster
top_genes_per_cluster = {}
for cluster in adata_subset.obs['T_leiden3'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster][:10]  # Top 10 genes

# Step 6: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='T_leiden3')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_T_leiden3']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to 3 of each
# cluster's top genes, skipping genes already taken by an earlier cluster
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= 3:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Plot the dotplot of the selected genes (genes on rows via swap_axes)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='T_leiden3',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
# Report the size of each Leiden cluster in `t_subset3` and the number of clusters.
# Both lines read `t_subset3`; the count previously came from `t_subset`, a
# different clustering, so the two numbers could disagree.
print(t_subset3.obs['leiden'].value_counts())
print(f"Number of unique clusters: {t_subset3.obs['leiden'].nunique()}")


In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

# Make scanpy log as much as possible while exploring
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the package versions so the environment of this run is on record
sc.logging.print_versions()

In [None]:
# Step 1: Filter cells based on the 'B_cell_recluster' column to isolate B-plasma cells
B_cells = adata[adata.obs['B_cell_recluster'] == 'B-plasma'].copy()

# Step 2: Preprocess the copy.
# NOTE(review): runs unconditionally - confirm adata.X holds raw counts here.
# Normalize total counts per cell to 10,000 and log-transform
sc.pp.normalize_total(B_cells, target_sum=1e4)
sc.pp.log1p(B_cells)

# Scale the data (important for clustering)
sc.pp.scale(B_cells)

# Step 3: Compute the neighborhood graph (necessary for clustering)
sc.pp.neighbors(B_cells, n_neighbors=50)  # You can adjust n_neighbors if needed

# Step 4: Run Leiden clustering; labels are stored in B_cells.obs['leiden_0.1']
sc.tl.leiden(B_cells, key_added="leiden_0.1", resolution=0.1)  # Set your desired resolution (default is 1.0)

In [None]:
# Cluster labels of the B-plasma subset, in the row order of `B_cells`
leiden_labels = B_cells.obs["leiden_0.1"].values  # Extract cluster labels

In [None]:
# Step 5: Save the Leiden cluster labels to the 'B_cell_leiden' column in `adata.obs`.
# Only the rows with 'B_cell_recluster' == 'B-plasma' get a label; the list is
# assigned positionally, so `leiden_labels` must have one entry per such row, in the same order.
adata.obs['B_cell_leiden'] = 'NaN'  # Placeholder string (not a real NaN) for all other cells
adata.obs.loc[adata.obs['B_cell_recluster'] == 'B-plasma', 'B_cell_leiden'] = [f'B-Plasma-{i}' for i in leiden_labels]

In [None]:
import scanpy as sc

# Step 1: Ensure all observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the data to include all clusters except 'NaN' in 'B_cell_leiden'
adata_subset = adata[adata.obs['B_cell_leiden'] != 'NaN'].copy()  # Avoid modifying a view

# Step 3: Log-transform the subset.
# NOTE(review): this runs unconditionally - confirm adata.X is not already log-transformed.
sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis using the Wilcoxon method
sc.tl.rank_genes_groups(adata_subset, 'B_cell_leiden', method='wilcoxon', use_raw=False)

# Step 5: Extract top marker genes for each cluster
top_genes_per_cluster = {}
for cluster in adata_subset.obs['B_cell_leiden'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster][:10]  # Top 10 genes

# Step 6: Generate dendrogram for cluster ordering
sc.tl.dendrogram(adata_subset, groupby='B_cell_leiden')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_B_cell_leiden']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to 10 of each
# cluster's top genes, skipping genes already taken by an earlier cluster
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= 10:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Plot the dotplot of the selected genes (genes on rows via swap_axes)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='B_cell_leiden',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True
)

In [None]:
import scanpy as sc
from sklearn.cluster import KMeans

# Step 1: Isolate the T-helper cells ('Lvl4' == 'Th').
# The same mask is reused in Step 5, so the number and order of labels always
# match the rows they are written to. (The selection previously used
# 'Lvl4_kmeans' == 'B-Plasma', a different set of cells than the rows that
# received the labels.)
th_mask = adata.obs['Lvl4'] == 'Th'
th_cells = adata[th_mask].copy()  # Copy so the full dataset is not touched

# Step 2: Keep only the gene(s) used to split the population
genes_of_interest = ['FOXP3']  # Replace with your list of genes
# .copy() so that scaling below works on real data instead of an AnnData view
th_subset = th_cells[:, genes_of_interest].copy()

# Step 3: Scale the expression values before clustering
sc.pp.scale(th_subset)

# Step 4: Perform k-means clustering
n_clusters = 2  # Number of groups to split the Th cells into (Th-0, Th-1)
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # Set n_init explicitly
kmeans_labels = kmeans.fit_predict(th_subset.X)  # One integer label per Th cell

# Step 5: Save the labels in 'Lvl4_kmeans'; all other cells keep the
# placeholder string 'NaN'
adata.obs['Lvl4_kmeans'] = 'NaN'
adata.obs.loc[th_mask, 'Lvl4_kmeans'] = [f'Th-{i}' for i in kmeans_labels]

In [None]:
# Inspect which per-cell annotation columns currently exist
print(adata.obs.columns)  # List all columns in the obs DataFrame

In [None]:
# List the distinct labels present in 'Lvl4_kmeans'
print(adata.obs['Lvl4_kmeans'].unique().tolist())

In [None]:
# Fill 'Lvl4_kmeans' for every non-Th cell with its 'Lvl4' annotation, so the
# column holds the k-means labels for Th cells and the original labels elsewhere.

# Temporarily convert Lvl4_kmeans to a non-categorical type (e.g., object)
# so that arbitrary new label values can be written into it
adata.obs['Lvl4_kmeans'] = adata.obs['Lvl4_kmeans'].astype('object')

# Condition to select rows where Lvl4 is not 'Th'
condition = adata.obs['Lvl4'] != 'Th'

# Transfer values from Lvl4 to Lvl4_kmeans for rows where Lvl4 is not 'Th'
adata.obs.loc[condition, 'Lvl4_kmeans'] = adata.obs.loc[condition, 'Lvl4']

# Optionally reset the 'Lvl4' column for these rows if needed
# adata.obs.loc[condition, 'Lvl4'] = None  # Uncomment if you want to reset 'Lvl4'

# Convert Lvl4_kmeans back to categorical
adata.obs['Lvl4_kmeans'] = pd.Categorical(adata.obs['Lvl4_kmeans'])

In [None]:
# List the distinct labels present in 'Lvl4'
print(adata.obs['Lvl4'].unique().tolist())

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.transforms import Affine2D
import matplotlib.patches as patches
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
import matplotlib.font_manager as fm

# Step 1: Define the color mapping for the 'Lvl4_kmeans' labels to highlight
color_map = {
    'Th-0': 'cyan',
    'Th-1': 'red',
    'T.NK.Mix' : 'yellow'
}

# Color for every label that is not listed in color_map
default_color = 'grey'

# Step 2: Keep only the cells of sample "s2r2_HV184"
subset_obs = adata.obs[adata.obs['sample'] == 's2r2_HV184'].copy()

# Step 3: Translate labels to colors. Cast to object first: mapping a categorical
# column can yield a categorical result, on which fillna() with a value outside
# its categories fails.
subset_obs['color'] = subset_obs['Lvl4_kmeans'].astype(object).map(color_map).fillna(default_color)

# Step 4: Define the rotation angle (in degrees)
rotation_angle = 30  # Adjust this value to your desired rotation angle

# Step 5: Set up the figure
fig, ax = plt.subplots(figsize=(10, 8))

# Rotate the coordinates before they reach the data transform.
# scale(1, 1) is the identity; use e.g. scale(-1, 1) to mirror the section.
transform = Affine2D().scale(1, 1).rotate_deg(rotation_angle) + ax.transData

# Step 6: Plot the cell centroids with the transformation applied
scatter = ax.scatter(
    subset_obs['x_centroid'],   # Use x_centroid from the subset
    subset_obs['y_centroid'],   # Use y_centroid from the subset
    c=subset_obs['color'],      # Per-cell color from Step 3
    s=2,  # Size of points
    alpha=0.7,
    transform=transform  # Apply the (identity) scale and rotation
)

# Set the aspect ratio to be equal to avoid distortion
ax.set_aspect('equal', adjustable='box')

# Remove the x and y axes and labels
plt.axis('off')

# Step 7: Add a 200-micron scale bar
scale_bar_length = 200  # Length of the scale bar in data units (microns)

# Define the scale bar using AnchoredSizeBar from mpl_toolkits
fontprops = fm.FontProperties(size=10)
scalebar = AnchoredSizeBar(ax.transData,
                           scale_bar_length, '', 'lower right',
                           pad=0.1,
                           color='black',  # Visible on the white background (white would vanish)
                           frameon=False,
                           size_vertical=30,  # Thickness of the bar
                           fontproperties=fontprops)

# Add the scale bar to the plot
ax.add_artist(scalebar)

# Show the plot with a white background
plt.show()


In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to only the specific clusters
desired_clusters = [
    'Th-0', 'Th-1', 'Tc', 'T.NK.Mix'
]
adata_subset = adata[adata.obs['Lvl4_kmeans'].isin(desired_clusters)].copy()  # Copy so the full dataset stays untouched

# Step 3: Log-transform the data only if it hasn't been done already.
# scanpy records a completed log1p under .uns['log1p']; running it again would
# log-transform values that are already logged.
if 'log1p' in adata_subset.uns:
    # NOTE(review): some scanpy releases are reported to read uns['log1p']['base']
    # directly in rank_genes_groups; make sure the key exists -- confirm for your version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans', method='wilcoxon', use_raw=False)

# Step 5: Collect the ranked gene names for every cluster present in the subset
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes each, skipping genes that an
# earlier cluster already claimed so that no gene is plotted twice.
max_genes_per_cluster = 10
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= max_genes_per_cluster:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
import scanpy as sc
from sklearn.cluster import KMeans

# Step 1: Isolate the 'T.NK.Mix' cells using the 'Lvl4' annotation.
# The mask is reused in Step 5 so labels go back to exactly the selected rows.
nk_mask = adata.obs['Lvl4'] == 'T.NK.Mix'
NK_cells = adata[nk_mask].copy()  # Create a copy to avoid view modification warning

# Step 2: Restrict to the gene(s) that should drive the split
genes_of_interest = ['KRT19']  # Replace with your list of genes
NK_subset = NK_cells[:, genes_of_interest].copy()  # Real copy: sc.pp.scale rewrites X

# Step 3: Scale the selected gene(s) before clustering
sc.pp.scale(NK_subset)

# Step 4: Perform k-means clustering
n_clusters = 2  # Adjust based on how many clusters you want (e.g., 2 for T.NK.mix-0, T.NK.mix-1)
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # Set n_init explicitly
# NOTE: k-means cluster ids are arbitrary; check marker expression per cluster
# before interpreting 'T.NK.Mix-0' / 'T.NK.Mix-1'.
kmeans_labels = kmeans.fit_predict(NK_subset.X)  # One label per T.NK.Mix cell, in adata row order

# Step 5: Save the k-means labels to the 'Lvl4_kmeans2' column in `adata.obs`
adata.obs['Lvl4_kmeans2'] = 'NaN'  # String placeholder for every non-T.NK.Mix cell
adata.obs.loc[nk_mask, 'Lvl4_kmeans2'] = [f'T.NK.Mix-{i}' for i in kmeans_labels]


In [None]:
# Check which annotation columns exist after adding 'Lvl4_kmeans2'
print(adata.obs.columns)  # List all columns in the obs DataFrame

In [None]:
# Distinct labels currently stored in 'Lvl4_kmeans2'
print(adata.obs['Lvl4_kmeans2'].unique().tolist())

In [None]:
# Make sure 'Lvl4_kmeans2' is plain object dtype before new labels are assigned to it
adata.obs['Lvl4_kmeans2'] = adata.obs['Lvl4_kmeans2'].astype('object')

# Rows whose 'Lvl4_kmeans' label is anything other than 'T.NK.Mix' (cells not sub-clustered in this round)
condition = adata.obs['Lvl4_kmeans'] != 'T.NK.Mix'

# For those rows, carry the 'Lvl4_kmeans' label over into 'Lvl4_kmeans2'
adata.obs.loc[condition, 'Lvl4_kmeans2'] = adata.obs.loc[condition, 'Lvl4_kmeans']

# Optionally reset the 'Lvl4' column for these rows if needed
# adata.obs.loc[condition, 'Lvl4'] = None  # Uncomment if you want to reset 'Lvl4'

# Convert 'Lvl4_kmeans2' back to categorical now that every row holds its final label
adata.obs['Lvl4_kmeans2'] = pd.Categorical(adata.obs['Lvl4_kmeans2'])

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.transforms import Affine2D
import matplotlib.patches as patches
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
import matplotlib.font_manager as fm

# Step 1: Define the color mapping for the 'Lvl4_kmeans2' labels to highlight
color_map = {
    'Th-0': 'cyan',
    'Th-1': 'red',
    'T.NK.Mix-1' : 'yellow',
    'T.NK.Mix-0' : 'magenta'
}

# Color for every label that is not listed in color_map
default_color = 'grey'

# Step 2: Keep only the cells of sample "s2r1_HV207"
subset_obs = adata.obs[adata.obs['sample'] == 's2r1_HV207'].copy()

# Step 3: Translate labels to colors. Cast to object first: mapping a categorical
# column can yield a categorical result, on which fillna() with a value outside
# its categories fails.
subset_obs['color'] = subset_obs['Lvl4_kmeans2'].astype(object).map(color_map).fillna(default_color)

# Step 4: Define the rotation angle (in degrees)
rotation_angle = 30  # Adjust this value to your desired rotation angle

# Step 5: Set up the figure
fig, ax = plt.subplots(figsize=(10, 8))

# Rotate the coordinates before they reach the data transform.
# scale(1, 1) is the identity; use e.g. scale(-1, 1) to mirror the section.
transform = Affine2D().scale(1, 1).rotate_deg(rotation_angle) + ax.transData

# Step 6: Plot the cell centroids with the transformation applied
scatter = ax.scatter(
    subset_obs['x_centroid'],   # Use x_centroid from the subset
    subset_obs['y_centroid'],   # Use y_centroid from the subset
    c=subset_obs['color'],      # Per-cell color from Step 3
    s=2,  # Size of points
    alpha=0.7,
    transform=transform  # Apply the (identity) scale and rotation
)

# Set the aspect ratio to be equal to avoid distortion
ax.set_aspect('equal', adjustable='box')

# Remove the x and y axes and labels
plt.axis('off')

# Step 7: Add a 200-micron scale bar
scale_bar_length = 200  # Length of the scale bar in data units (microns)

# Define the scale bar using AnchoredSizeBar from mpl_toolkits
fontprops = fm.FontProperties(size=10)
scalebar = AnchoredSizeBar(ax.transData,
                           scale_bar_length, '', 'lower right',
                           pad=0.1,
                           color='black',  # Visible on the white background (white would vanish)
                           frameon=False,
                           size_vertical=30,  # Thickness of the bar
                           fontproperties=fontprops)

# Add the scale bar to the plot
ax.add_artist(scalebar)

# Show the plot with a white background
plt.show()


In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to only the specific clusters
desired_clusters = [
    'Th-0', 'Th-1', 'Tc', 'T.NK.Mix-0', 'T.NK.Mix-1', 'B', 'Pl.1', 'Pl.prol'
]
adata_subset = adata[adata.obs['Lvl4_kmeans2'].isin(desired_clusters)].copy()  # Copy so the full dataset stays untouched

# Step 3: Log-transform the data only if it hasn't been done already.
# scanpy records a completed log1p under .uns['log1p']; running it again would
# log-transform values that are already logged.
if 'log1p' in adata_subset.uns:
    # NOTE(review): some scanpy releases are reported to read uns['log1p']['base']
    # directly in rank_genes_groups; make sure the key exists -- confirm for your version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans2', method='wilcoxon', use_raw=False)

# Step 5: Collect the ranked gene names for every cluster present in the subset
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans2'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans2')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans2']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes each, skipping genes that an
# earlier cluster already claimed so that no gene is plotted twice.
max_genes_per_cluster = 5
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= max_genes_per_cluster:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans2', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to only the specific clusters
desired_clusters = [
    'NeuroEpi', 'Ep.Sp', 'Fib.1'
]
adata_subset = adata[adata.obs['Lvl4_kmeans2'].isin(desired_clusters)].copy()  # Copy so the full dataset stays untouched

# Step 3: Log-transform the data only if it hasn't been done already.
# scanpy records a completed log1p under .uns['log1p']; running it again would
# log-transform values that are already logged.
if 'log1p' in adata_subset.uns:
    # NOTE(review): some scanpy releases are reported to read uns['log1p']['base']
    # directly in rank_genes_groups; make sure the key exists -- confirm for your version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans2', method='wilcoxon', use_raw=False)

# Step 5: Collect the ranked gene names for every cluster present in the subset
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans2'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans2')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans2']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes each, skipping genes that an
# earlier cluster already claimed so that no gene is plotted twice.
max_genes_per_cluster = 20
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= max_genes_per_cluster:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans2', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
import scanpy as sc
from sklearn.cluster import KMeans

# Step 1: Isolate the 'NeuroEpi' cells using the 'Lvl4' annotation.
# The mask is reused in Step 5 so labels go back to exactly the selected rows.
mel_mask = adata.obs['Lvl4'] == 'NeuroEpi'
Mel_cells = adata[mel_mask].copy()  # Create a copy to avoid view modification warning

# Step 2: Restrict to the gene(s) that should drive the split
genes_of_interest = ['MLANA']  # Replace with your list of genes
Mel_subset = Mel_cells[:, genes_of_interest].copy()  # Real copy: sc.pp.scale rewrites X

# Step 3: Scale the selected gene(s) before clustering
sc.pp.scale(Mel_subset)

# Step 4: Perform k-means clustering
n_clusters = 2  # Adjust based on how many clusters you want (e.g., 2 for NeuroEpi-0, NeuroEpi-1)
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # Set n_init explicitly
# NOTE: k-means cluster ids are arbitrary; check marker expression per cluster
# before interpreting 'NeuroEpi-0' / 'NeuroEpi-1'.
kmeans_labels = kmeans.fit_predict(Mel_subset.X)  # One label per NeuroEpi cell, in adata row order

# Step 5: Save the k-means labels to the 'Lvl4_kmeans3' column in `adata.obs`
adata.obs['Lvl4_kmeans3'] = 'NaN'  # String placeholder for every non-NeuroEpi cell
adata.obs.loc[mel_mask, 'Lvl4_kmeans3'] = [f'NeuroEpi-{i}' for i in kmeans_labels]

# Step 6: Run PCA to compute PCA coordinates for visualization (optional).
# This runs on the full `adata` object (not the NeuroEpi subset) and stores its result there.
sc.tl.pca(adata)

# Step 7: Visualize the labels in PCA space (optional); non-NeuroEpi cells show as 'NaN'
sc.pl.pca(adata, color='Lvl4_kmeans3')


In [None]:
import scanpy as sc

# NOTE(review): this cell repeats the earlier 'Lvl4_kmeans' T-cell marker analysis
# with the same clusters and settings -- consider removing one of the two.

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to only the specific clusters
desired_clusters = [
    'Th-0', 'Th-1', 'Tc', 'T.NK.Mix'
]
adata_subset = adata[adata.obs['Lvl4_kmeans'].isin(desired_clusters)].copy()  # Copy so the full dataset stays untouched

# Step 3: Log-transform the data only if it hasn't been done already.
# scanpy records a completed log1p under .uns['log1p']; running it again would
# log-transform values that are already logged.
if 'log1p' in adata_subset.uns:
    # NOTE(review): some scanpy releases are reported to read uns['log1p']['base']
    # directly in rank_genes_groups; make sure the key exists -- confirm for your version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans', method='wilcoxon', use_raw=False)

# Step 5: Collect the ranked gene names for every cluster present in the subset
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes each, skipping genes that an
# earlier cluster already claimed so that no gene is plotted twice.
max_genes_per_cluster = 10
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= max_genes_per_cluster:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
# Check which annotation columns exist after adding 'Lvl4_kmeans3'
print(adata.obs.columns)  # List all columns in the obs DataFrame

In [None]:
# Distinct labels in 'Lvl4_kmeans3' before the other labels are merged in
print(adata.obs['Lvl4_kmeans3'].unique().tolist())

In [None]:
# Make sure 'Lvl4_kmeans3' is plain object dtype before new labels are assigned to it
adata.obs['Lvl4_kmeans3'] = adata.obs['Lvl4_kmeans3'].astype('object')

# Rows whose 'Lvl4_kmeans2' label is anything other than 'NeuroEpi' (cells not sub-clustered in this round)
condition = adata.obs['Lvl4_kmeans2'] != 'NeuroEpi'

# For those rows, carry the 'Lvl4_kmeans2' label over into 'Lvl4_kmeans3'
adata.obs.loc[condition, 'Lvl4_kmeans3'] = adata.obs.loc[condition, 'Lvl4_kmeans2']

# Optionally reset the 'Lvl4' column for these rows if needed
# adata.obs.loc[condition, 'Lvl4'] = None  # Uncomment if you want to reset 'Lvl4'

# Convert 'Lvl4_kmeans3' back to categorical now that every row holds its final label
adata.obs['Lvl4_kmeans3'] = pd.Categorical(adata.obs['Lvl4_kmeans3'])

In [None]:
# Distinct labels in 'Lvl4_kmeans3' after the merge
print(adata.obs['Lvl4_kmeans3'].unique().tolist())

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.transforms import Affine2D
import matplotlib.patches as patches
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
import matplotlib.font_manager as fm

# Step 1: Define the color mapping for the 'Lvl4_kmeans3' labels to highlight
color_map = {
    'NeuroEpi-0': 'cyan',
    'NeuroEpi-1': 'red',
    }

# Color for every label that is not listed in color_map
default_color = 'grey'

# Step 2: Keep only the cells of sample "s2r1_HV207"
subset_obs = adata.obs[adata.obs['sample'] == 's2r1_HV207'].copy()

# Step 3: Translate labels to colors. Cast to object first: mapping a categorical
# column can yield a categorical result, on which fillna() with a value outside
# its categories fails.
subset_obs['color'] = subset_obs['Lvl4_kmeans3'].astype(object).map(color_map).fillna(default_color)

# Step 4: Define the rotation angle (in degrees)
rotation_angle = 30  # Adjust this value to your desired rotation angle

# Step 5: Set up the figure
fig, ax = plt.subplots(figsize=(10, 8))

# Rotate the coordinates before they reach the data transform.
# scale(1, 1) is the identity; use e.g. scale(-1, 1) to mirror the section.
transform = Affine2D().scale(1, 1).rotate_deg(rotation_angle) + ax.transData

# Step 6: Plot the cell centroids with the transformation applied
scatter = ax.scatter(
    subset_obs['x_centroid'],   # Use x_centroid from the subset
    subset_obs['y_centroid'],   # Use y_centroid from the subset
    c=subset_obs['color'],      # Per-cell color from Step 3
    s=2,  # Size of points
    alpha=0.7,
    transform=transform  # Apply the (identity) scale and rotation
)

# Set the aspect ratio to be equal to avoid distortion
ax.set_aspect('equal', adjustable='box')

# Remove the x and y axes and labels
plt.axis('off')

# Step 7: Add a 200-micron scale bar
scale_bar_length = 200  # Length of the scale bar in data units (microns)

# Define the scale bar using AnchoredSizeBar from mpl_toolkits
fontprops = fm.FontProperties(size=10)
scalebar = AnchoredSizeBar(ax.transData,
                           scale_bar_length, '', 'lower right',
                           pad=0.1,
                           color='black',  # Visible on the white background (white would vanish)
                           frameon=False,
                           size_vertical=30,  # Thickness of the bar
                           fontproperties=fontprops)

# Add the scale bar to the plot
ax.add_artist(scalebar)

# Show the plot with a white background
plt.show()


In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to only the specific clusters
desired_clusters = [
    'NeuroEpi-0', 'NeuroEpi-1', 'Ep.B.1'
]
adata_subset = adata[adata.obs['Lvl4_kmeans3'].isin(desired_clusters)].copy()  # Copy so the full dataset stays untouched

# Step 3: Log-transform the data only if it hasn't been done already.
# scanpy records a completed log1p under .uns['log1p']; running it again would
# log-transform values that are already logged.
if 'log1p' in adata_subset.uns:
    # NOTE(review): some scanpy releases are reported to read uns['log1p']['base']
    # directly in rank_genes_groups; make sure the key exists -- confirm for your version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans3', method='wilcoxon', use_raw=False)

# Step 5: Collect the ranked gene names for every cluster present in the subset
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans3'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans3')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans3']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes each, skipping genes that an
# earlier cluster already claimed so that no gene is plotted twice.
max_genes_per_cluster = 20
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= max_genes_per_cluster:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans3', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to only the specific clusters
desired_clusters = [
    'Mac.Neut.Mix', 'DC.1', 'DC.2', 'Mac.1', 'Mac.2', 'Mast'
]
adata_subset = adata[adata.obs['Lvl4_kmeans3'].isin(desired_clusters)].copy()  # Copy so the full dataset stays untouched

# Step 3: Log-transform the data only if it hasn't been done already.
# scanpy records a completed log1p under .uns['log1p']; running it again would
# log-transform values that are already logged.
if 'log1p' in adata_subset.uns:
    # NOTE(review): some scanpy releases are reported to read uns['log1p']['base']
    # directly in rank_genes_groups; make sure the key exists -- confirm for your version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans3', method='wilcoxon', use_raw=False)

# Step 5: Collect the ranked gene names for every cluster present in the subset
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans3'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans3')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans3']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes each, skipping genes that an
# earlier cluster already claimed so that no gene is plotted twice.
max_genes_per_cluster = 10
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= max_genes_per_cluster:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans3', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
import scanpy as sc
from sklearn.cluster import KMeans

# Step 1: Isolate the 'Mac.Neut.Mix' cells using the 'Lvl4' annotation.
# The mask is reused in Step 5 so labels go back to exactly the selected rows.
neut_mask = adata.obs['Lvl4'] == 'Mac.Neut.Mix'
Neut_cells = adata[neut_mask].copy()  # Create a copy to avoid view modification warning

# Step 2: Restrict to the gene(s) that should drive the split
genes_of_interest = ['CSF3R']  # Replace with your list of genes
Neut_subset = Neut_cells[:, genes_of_interest].copy()  # Real copy: sc.pp.scale rewrites X

# Step 3: Scale the selected gene(s) before clustering
sc.pp.scale(Neut_subset)

# Step 4: Perform k-means clustering
n_clusters = 2  # Adjust based on how many clusters you want (e.g., 2 for Mac.Neut.Mix-0, Mac.Neut.Mix-1)
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # Set n_init explicitly
# NOTE: k-means cluster ids are arbitrary; check marker expression per cluster
# before interpreting 'Mac.Neut.Mix-0' / 'Mac.Neut.Mix-1'.
kmeans_labels = kmeans.fit_predict(Neut_subset.X)  # One label per Mac.Neut.Mix cell, in adata row order

# Step 5: Save the k-means labels to the 'Lvl4_kmeans4' column in `adata.obs`
adata.obs['Lvl4_kmeans4'] = 'NaN'  # String placeholder for every non-Mac.Neut.Mix cell
adata.obs.loc[neut_mask, 'Lvl4_kmeans4'] = [f'Mac.Neut.Mix-{i}' for i in kmeans_labels]

In [None]:
# Check which annotation columns exist after adding 'Lvl4_kmeans4'
print(adata.obs.columns)  # List all columns in the obs DataFrame

In [None]:
# Distinct labels in 'Lvl4_kmeans4' before the other labels are merged in
print(adata.obs['Lvl4_kmeans4'].unique().tolist())

In [None]:
# Make sure 'Lvl4_kmeans4' is plain object dtype before new labels are assigned to it
adata.obs['Lvl4_kmeans4'] = adata.obs['Lvl4_kmeans4'].astype('object')

# Rows whose 'Lvl4_kmeans3' label is anything other than 'Mac.Neut.Mix' (cells not sub-clustered in this round)
condition = adata.obs['Lvl4_kmeans3'] != 'Mac.Neut.Mix'

# For those rows, carry the 'Lvl4_kmeans3' label over into 'Lvl4_kmeans4'
adata.obs.loc[condition, 'Lvl4_kmeans4'] = adata.obs.loc[condition, 'Lvl4_kmeans3']

# Optionally reset the 'Lvl4' column for these rows if needed
# adata.obs.loc[condition, 'Lvl4'] = None  # Uncomment if you want to reset 'Lvl4'

# Convert 'Lvl4_kmeans4' back to categorical now that every row holds its final label
adata.obs['Lvl4_kmeans4'] = pd.Categorical(adata.obs['Lvl4_kmeans4'])

In [None]:
# Distinct labels in 'Lvl4_kmeans3'.
# NOTE(review): this follows the cell that builds 'Lvl4_kmeans4'; presumably
# 'Lvl4_kmeans4' was meant to be inspected here -- confirm.
print(adata.obs['Lvl4_kmeans3'].unique().tolist())

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.transforms import Affine2D
import matplotlib.patches as patches
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
import matplotlib.font_manager as fm

# Step 1: Define the color mapping for the 'Lvl4_kmeans4' labels to highlight
color_map = {
    'Mac.Neut.Mix-0': 'cyan',
    'Mac.Neut.Mix-1': 'red',
    }

# Color for every label that is not listed in color_map
default_color = 'grey'

# Step 2: Keep only the cells of sample "s2r1_HV207"
subset_obs = adata.obs[adata.obs['sample'] == 's2r1_HV207'].copy()

# Step 3: Translate labels to colors. Cast to object first: mapping a categorical
# column can yield a categorical result, on which fillna() with a value outside
# its categories fails.
subset_obs['color'] = subset_obs['Lvl4_kmeans4'].astype(object).map(color_map).fillna(default_color)

# Step 4: Define the rotation angle (in degrees)
rotation_angle = 30  # Adjust this value to your desired rotation angle

# Step 5: Set up the figure
fig, ax = plt.subplots(figsize=(10, 8))

# Rotate the coordinates before they reach the data transform.
# scale(1, 1) is the identity; use e.g. scale(-1, 1) to mirror the section.
transform = Affine2D().scale(1, 1).rotate_deg(rotation_angle) + ax.transData

# Step 6: Plot the cell centroids with the transformation applied
scatter = ax.scatter(
    subset_obs['x_centroid'],   # Use x_centroid from the subset
    subset_obs['y_centroid'],   # Use y_centroid from the subset
    c=subset_obs['color'],      # Per-cell color from Step 3
    s=2,  # Size of points
    alpha=0.7,
    transform=transform  # Apply the (identity) scale and rotation
)

# Set the aspect ratio to be equal to avoid distortion
ax.set_aspect('equal', adjustable='box')

# Remove the x and y axes and labels
plt.axis('off')

# Step 7: Add a 200-micron scale bar
scale_bar_length = 200  # Length of the scale bar in data units (microns)

# Define the scale bar using AnchoredSizeBar from mpl_toolkits
fontprops = fm.FontProperties(size=10)
scalebar = AnchoredSizeBar(ax.transData,
                           scale_bar_length, '', 'lower right',
                           pad=0.1,
                           color='black',  # Visible on the white background (white would vanish)
                           frameon=False,
                           size_vertical=30,  # Thickness of the bar
                           fontproperties=fontprops)

# Add the scale bar to the plot
ax.add_artist(scalebar)

# Show the plot with a white background
plt.show()


In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Subset the adata object to only the specific clusters
desired_clusters = [
    'Mac.Neut.Mix-0', 'Mac.Neut.Mix-1', 'DC.1', 'DC.2', 'Mac.1', 'Mac.2', 'Mast', 'Mac.3', 'Oth.IRF8hi-MZB1hi'
]
adata_subset = adata[adata.obs['Lvl4_kmeans4'].isin(desired_clusters)].copy()  # Copy so the full dataset stays untouched

# Step 3: Log-transform the data only if it hasn't been done already.
# scanpy records a completed log1p under .uns['log1p']; running it again would
# log-transform values that are already logged.
if 'log1p' in adata_subset.uns:
    # NOTE(review): some scanpy releases are reported to read uns['log1p']['base']
    # directly in rank_genes_groups; make sure the key exists -- confirm for your version.
    adata_subset.uns['log1p'].setdefault('base', None)
else:
    sc.pp.log1p(adata_subset)

# Step 4: Perform differential expression analysis (Wilcoxon) between the selected clusters
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans4', method='wilcoxon', use_raw=False)

# Step 5: Collect the ranked gene names for every cluster present in the subset
top_genes_per_cluster = {}
for cluster in adata_subset.obs['Lvl4_kmeans4'].cat.categories:
    top_genes_per_cluster[cluster] = adata_subset.uns['rank_genes_groups']['names'][cluster]

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans4')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans4']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes each, skipping genes that an
# earlier cluster already claimed so that no gene is plotted twice.
max_genes_per_cluster = 5
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    if cluster in top_genes_per_cluster:
        cluster_genes = top_genes_per_cluster[cluster]
        genes_for_this_cluster = []
        for gene in cluster_genes:
            if len(genes_for_this_cluster) >= max_genes_per_cluster:
                break
            if gene not in unique_genes:
                unique_genes.add(gene)
                genes_for_this_cluster.append(gene)
        genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes (genes on rows because swap_axes=True)
sc.pl.dotplot(adata_subset, var_names=genes_for_plot, groupby='Lvl4_kmeans4', dendrogram=True, use_raw=False, cmap="vlag", standard_scale='var', swap_axes=True)

In [None]:
import scanpy as sc
from sklearn.cluster import KMeans

# Step 1: Isolate the 'Pl.1' cells using the 'Lvl4' annotation.
# The mask is reused in Step 5 so labels go back to exactly the selected rows.
pl_mask = adata.obs['Lvl4'] == 'Pl.1'
Pl_cells = adata[pl_mask].copy()  # Create a copy to avoid view modification warning

# Step 2: Restrict to the gene(s) that should drive the split
genes_of_interest = ['IGHA1']  # Replace with your list of genes
Pl_subset = Pl_cells[:, genes_of_interest].copy()  # Real copy: sc.pp.scale rewrites X

# Step 3: Scale the selected gene(s) before clustering
sc.pp.scale(Pl_subset)

# Step 4: Perform k-means clustering
n_clusters = 2  # Adjust based on how many clusters you want (e.g., 2 for Pl.1-0, Pl.1-1)
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # Set n_init explicitly
# NOTE: k-means cluster ids are arbitrary; check marker expression per cluster
# before interpreting 'Pl.1-0' / 'Pl.1-1'.
kmeans_labels = kmeans.fit_predict(Pl_subset.X)  # One label per Pl.1 cell, in adata row order

# Step 5: Save the k-means labels to the 'Lvl4_kmeans2' column in `adata.obs`.
# NOTE(review): this re-initialises 'Lvl4_kmeans2', discarding the
# 'T.NK.Mix-0' / 'T.NK.Mix-1' labels stored there earlier in the notebook --
# confirm that reusing the column (rather than a new one) is intended.
adata.obs['Lvl4_kmeans2'] = 'NaN'  # String placeholder for every non-Pl.1 cell
adata.obs.loc[pl_mask, 'Lvl4_kmeans2'] = [f'Pl.1-{i}' for i in kmeans_labels]

# Step 6: Run PCA to compute PCA coordinates for visualization (optional).
# This runs on the full `adata` object (not the Pl.1 subset) and stores its result there.
sc.tl.pca(adata)

# Step 7: Visualize the labels in PCA space (optional); non-Pl.1 cells show as 'NaN'
sc.pl.pca(adata, color='Lvl4_kmeans2')


In [None]:
# Distinct labels in 'Lvl4_kmeans2' after the Pl.1 split, before the other labels are merged in
print(adata.obs['Lvl4_kmeans2'].unique().tolist())

In [None]:
# Make sure 'Lvl4_kmeans2' is plain object dtype before new labels are assigned to it
adata.obs['Lvl4_kmeans2'] = adata.obs['Lvl4_kmeans2'].astype('object')

# Rows whose 'Lvl4_kmeans' label is anything other than 'Pl.1' (cells not sub-clustered in this round)
condition = adata.obs['Lvl4_kmeans'] != 'Pl.1'

# For those rows, carry the 'Lvl4_kmeans' label over into 'Lvl4_kmeans2'
adata.obs.loc[condition, 'Lvl4_kmeans2'] = adata.obs.loc[condition, 'Lvl4_kmeans']

# Optionally reset the 'Lvl4' column for these rows if needed
# adata.obs.loc[condition, 'Lvl4'] = None  # Uncomment if you want to reset 'Lvl4'

# Convert 'Lvl4_kmeans2' back to categorical now that every row holds its final label
adata.obs['Lvl4_kmeans2'] = pd.Categorical(adata.obs['Lvl4_kmeans2'])

In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Keep only the cells whose 'Lvl4_kmeans2' label is one of the clusters to compare
desired_clusters = [
    'Th-0', 'Th-1', 'B', 'Pl.1-0', 'Pl.1-1', 'Fib.1', 'Tc'
]
adata_subset = adata[adata.obs['Lvl4_kmeans2'].isin(desired_clusters)].copy()  # copy, not a view

# Step 3: Log-transform only if it has not been done already.
# sc.pp.log1p records itself under .uns['log1p']; skipping when that key is
# present avoids log-transforming the same matrix twice.
if 'log1p' not in adata_subset.uns:
    sc.pp.log1p(adata_subset)

# Step 4: Differential expression between the selected clusters (Wilcoxon test on .X)
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans2', method='wilcoxon', use_raw=False)

# Step 5: Ranked gene names per cluster, as stored by rank_genes_groups
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster]
    for cluster in adata_subset.obs['Lvl4_kmeans2'].cat.categories
}

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans2')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans2']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes from each, skipping any gene that an
# earlier cluster already contributed so no gene appears twice in the plot.
max_genes_per_cluster = 10
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    genes_for_this_cluster = []
    for gene in top_genes_per_cluster.get(cluster, ()):
        if len(genes_for_this_cluster) >= max_genes_per_cluster:
            break
        if gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_this_cluster.append(gene)
    genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes per cluster (axes swapped, per-gene standard scaling)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl4_kmeans2',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True,
)

In [None]:
# Clusters whose cell counts should be reported
specific_clusters = ['Th-0', 'Th-1', 'Pl.1-1', 'Pl.1-0', 'B']

# Count cells per 'Lvl4_kmeans2' label, restricted to the clusters above.
# value_counts() on a categorical column lists every category, including the
# unselected ones with a count of 0; casting the filtered labels to object
# first keeps only the labels that are actually present.
in_specific_clusters = adata.obs['Lvl4_kmeans2'].isin(specific_clusters)
filtered_counts = (
    adata.obs.loc[in_specific_clusters, 'Lvl4_kmeans2']
    .astype(object)
    .value_counts()
)

# Display the filtered cluster counts
print(filtered_counts)

In [None]:
import scanpy as sc
from sklearn.cluster import KMeans

# Step 1: Isolate the B cells (rows where 'Lvl4' == 'B')
b_mask = adata.obs['Lvl4'] == 'B'  # boolean row mask, reused below when writing labels back
B_cells = adata[b_mask].copy()  # copy so the subset is independent of `adata`

# Step 2: Restrict to the genes used for the split.
# Slicing columns returns a view; copy it explicitly because sc.pp.scale below
# rewrites the expression matrix of the object it is given.
genes_of_interest = ['HLA-DMB', 'HLA-DRB5', 'HLA-DQA2', 'HLA-DQB2']  # Replace with your list of genes
B_subset = B_cells[:, genes_of_interest].copy()

# Step 3: Scale the selected genes before clustering
sc.pp.scale(B_subset)

# Step 4: k-means on the scaled expression of the selected genes
n_clusters = 2  # number of B-cell sub-clusters; labels become 'B-0' ... 'B-<n_clusters - 1>'
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # fixed seed for reproducible labels
kmeans_labels = kmeans.fit_predict(B_subset.X)  # one integer label per B cell

# Step 5: Store the labels in the 'Lvl4_kmeans3' column of `adata.obs`.
# Every cell first gets the placeholder string 'NaN'; only B cells are then overwritten.
# B_cells was selected with the same mask, so its rows line up with the masked rows.
adata.obs['Lvl4_kmeans3'] = 'NaN'
adata.obs.loc[b_mask, 'Lvl4_kmeans3'] = [f'B-{i}' for i in kmeans_labels]

# Step 6: Compute PCA coordinates on the full dataset (not on the gene subset used for k-means)
sc.tl.pca(adata)

# Step 7: Show all cells in PCA space, coloured by the new 'Lvl4_kmeans3' labels
sc.pl.pca(adata, color='Lvl4_kmeans3')

In [None]:
# List the distinct labels currently stored in 'Lvl4_kmeans3'
print(list(adata.obs['Lvl4_kmeans3'].unique()))

In [None]:
# Make 'Lvl4_kmeans3' a plain object column before writing labels into it
adata.obs['Lvl4_kmeans3'] = adata.obs['Lvl4_kmeans3'].astype(object)

# Rows whose 'Lvl4_kmeans2' label is anything other than 'B'
condition = adata.obs['Lvl4_kmeans2'].ne('B')

# For those rows, carry the 'Lvl4_kmeans2' label over into 'Lvl4_kmeans3';
# rows labelled 'B' keep whatever 'Lvl4_kmeans3' already holds
adata.obs.loc[condition, 'Lvl4_kmeans3'] = adata.obs.loc[condition, 'Lvl4_kmeans2']

# Store the merged labels as a categorical column again
adata.obs['Lvl4_kmeans3'] = pd.Categorical(adata.obs['Lvl4_kmeans3'])

In [None]:
import scanpy as sc

# Step 1: Ensure observation names are unique
adata.obs_names_make_unique()

# Step 2: Keep only the cells whose 'Lvl4_kmeans3' label is one of the clusters to compare
desired_clusters = [
    'Th-0', 'Th-1', 'B-1', 'B-0', 'Pl.1-0', 'Pl.1-1', 'Fib.1', 'Tc'
]
adata_subset = adata[adata.obs['Lvl4_kmeans3'].isin(desired_clusters)].copy()  # copy, not a view

# Step 3: Log-transform only if it has not been done already.
# sc.pp.log1p records itself under .uns['log1p']; skipping when that key is
# present avoids log-transforming the same matrix twice.
if 'log1p' not in adata_subset.uns:
    sc.pp.log1p(adata_subset)

# Step 4: Differential expression between the selected clusters (Wilcoxon test on .X)
sc.tl.rank_genes_groups(adata_subset, 'Lvl4_kmeans3', method='wilcoxon', use_raw=False)

# Step 5: Ranked gene names per cluster, as stored by rank_genes_groups
ranked_names = adata_subset.uns['rank_genes_groups']['names']
top_genes_per_cluster = {
    cluster: ranked_names[cluster]
    for cluster in adata_subset.obs['Lvl4_kmeans3'].cat.categories
}

# Step 6: Generate the dendrogram to get cluster order
sc.tl.dendrogram(adata_subset, groupby='Lvl4_kmeans3')

# Step 7: Retrieve the cluster order based on the dendrogram
cluster_order = adata_subset.uns['dendrogram_Lvl4_kmeans3']['categories_ordered']

# Step 8: Walk the clusters in dendrogram order and take up to
# `max_genes_per_cluster` top-ranked genes from each, skipping any gene that an
# earlier cluster already contributed so no gene appears twice in the plot.
max_genes_per_cluster = 10
unique_genes = set()
genes_for_plot = []

for cluster in cluster_order:
    genes_for_this_cluster = []
    for gene in top_genes_per_cluster.get(cluster, ()):
        if len(genes_for_this_cluster) >= max_genes_per_cluster:
            break
        if gene not in unique_genes:
            unique_genes.add(gene)
            genes_for_this_cluster.append(gene)
    genes_for_plot.extend(genes_for_this_cluster)

# Step 9: Dot plot of the selected genes per cluster (axes swapped, per-gene standard scaling)
sc.pl.dotplot(
    adata_subset,
    var_names=genes_for_plot,
    groupby='Lvl4_kmeans3',
    dendrogram=True,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
    swap_axes=True,
)