In [None]:
# Environment audit: list every package visible to the `pip` found on the shell PATH.
# NOTE(review): `!pip` is not guaranteed to be the pip of the running kernel — confirm.
!pip list

In [None]:
# Install a specific version of numpy
!pip install numpy==1.24.4

# Install a specific version of numba
!pip install numba==0.57.1

# Reinstall scanpy and squidpy
!pip uninstall -y scanpy squidpy
!pip install scanpy squidpy

In [None]:
# Report the numpy version and install location that pip currently sees.
!pip show numpy

In [None]:
# Re-pin numpy to 1.24.0.
# NOTE(review): an earlier cell installs numpy==1.24.4; this cell overrides that pin.
# Confirm which version is actually required and keep a single pin.
!pip install numpy==1.24.0

In [None]:
import scanpy as sc
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the stromal reference AnnData from the CITE-seq h5ad folder.
refStr = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/Stromal.h5ad')

In [None]:
# Inspect the per-cell metadata stored in the h5ad file.
refStr.obs

In [None]:
import pandas as pd

# Path of the stromal metadata table that accompanies the h5ad file
file_path = "/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/Stromal_metadata.csv"

# Read the CSV into a DataFrame
dfStr = pd.read_csv(file_path)

# Display the DataFrame
print(dfStr)

# Access specific columns or rows
#print(dfStr["ColumnName"])  # Access a column
#print(dfStr.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Sanity check: the metadata table must have exactly one row per cell in `refStr`.
if len(dfStr) != refStr.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): the columns below are copied positionally (`.values`), i.e. CSV row i
# is attached to cell i of `refStr` without comparing any identifier. This assumes
# the CSV rows are in the same order as `refStr.obs` — TODO confirm, e.g. by
# comparing the CSV's identifier column against `refStr.obs_names`.

# Copy every CSV column into `refStr.obs`
for column in dfStr.columns:
    refStr.obs[column] = dfStr[column].values

# Save the updated AnnData object (optional)
#refStr.write("updated_adata.h5ad")


In [None]:
# Check that the CSV columns are now part of the cell metadata.
refStr.obs

In [None]:
# Inspect the per-gene metadata.
refStr.var

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV columns, if it exists.
# `errors='ignore'` turns the drop into a no-op when the column is absent.
refStr.obs.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# Show the first rows to confirm the column is gone
print(refStr.obs.head())


In [None]:
# Standardise the label column name: 'StromalClusters' -> 'cluster'
# (the label-transfer cells further down read the column named 'cluster').
refStr.obs.rename(columns={"StromalClusters": "cluster"}, inplace=True)

In [None]:
# Check that the renamed column is present.
refStr.obs

In [None]:
# Inspect the expression matrix stored in `.X`.
refStr.X

In [None]:
# anndata treats '_index' as a reserved column name when writing h5ad files, so a
# column called '_index' in `raw.var` has to be renamed before `refStr.write(...)`.
# The `raw is not None` guard covers references that carry no `.raw` layer; without
# it, `refStr.raw.var` raises AttributeError.
if refStr.raw is not None and '_index' in refStr.raw.var.columns:
    refStr.raw.var.rename(columns={'_index': 'index_backup'}, inplace=True)


In [None]:
# Persist the annotated stromal reference.
refStr.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Stromal.h5ad")

In [None]:
refStr.obs

In [None]:
# Reload the saved reference from disk; the integration below works on this object.
ref_stromal = sc.read_h5ad("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Stromal.h5ad")

In [None]:
# Load the processed Xenium dataset.
adata = sc.read_h5ad('/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad')

In [None]:
# Keep only cells whose 'Lvl1' annotation is Fibroblast or Vascular.
# `.copy()` turns the subset into an independent object instead of a view of `adata`.
stromal_adata = adata[adata.obs['Lvl1'].isin(['Fibroblast', 'Vascular'])].copy()

In [None]:
stromal_adata.obs

In [None]:
# Add assay information to obs for clarity
stromal_adata.obs['assay'] = 'Xenium_stromal'
ref_stromal.obs['assay'] = 'citeSeq_stromal'

# Merge datasets with batch key and categories.
# `batch_key="assay"` labels every merged cell with the matching entry of
# `batch_categories` (first entry for `stromal_adata`, second for `ref_stromal`).
# NOTE(review): `AnnData.concatenate` is deprecated in favour of `anndata.concat`.
# The two differ in how obs/var columns are merged, so migrating needs a check that
# both 'cluster' and 'Lvl4' survive in the merged `.obs` — confirm before switching.
# NOTE(review): with `index_unique=None` the cell names are left untouched, so names
# shared by both datasets stay duplicated (a later cell calls
# `obs_names_make_unique()` before assigning by index).
merged_stromal = stromal_adata.concatenate(
    ref_stromal,
    batch_key="assay",
    batch_categories=["Xenium_stromal", "citeSeq_stromal"],
    index_unique=None  # Avoids adding a suffix to index entries
)

In [None]:
# Scale the merged expression matrix and compute a PCA on it.
# NOTE(review): both assays are scaled jointly; this presumes `.X` holds comparable
# (e.g. normalised) values in the Xenium and CITE-seq parts — TODO confirm.
sc.pp.scale(merged_stromal)
sc.tl.pca(merged_stromal)

In [None]:
# Batch-correct the PCA embedding across the two assays with Harmony.
# The corrected embedding is read from obsm["X_pca_harmony"] in the cells below.
sc.external.pp.harmony_integrate(merged_stromal, key="assay", max_iter_harmony=20, max_iter_kmeans=30)

In [None]:
# Visualize merged with UMAP embedding; takes a long time
# The neighbour graph is built on the Harmony-corrected embedding, not on raw PCA.
sc.pp.neighbors(merged_stromal, n_neighbors=50, use_rep="X_pca_harmony", metric="correlation")
sc.tl.umap(merged_stromal, min_dist=0.5)

In [None]:
# UMAP coloured by assay of origin (visual check of how well the assays mix).
sc.pl.umap(merged_stromal, color='assay')

In [None]:
# UMAP coloured by the 'cluster' column (set on the CITE-seq reference).
sc.pl.umap(merged_stromal, color='cluster')

In [None]:
# UMAP coloured by the 'Lvl4' column.
sc.pl.umap(merged_stromal, color='Lvl4')

In [None]:
# Checkpoint: save the integrated object so the steps above need not be re-run.
merged_stromal.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_stromal_harmony.h5ad")

In [None]:
# Resume from the checkpoint.
merged_stromal = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_stromal_harmony.h5ad')

In [None]:
merged_stromal.obs

In [None]:
# The label-transfer cells below filter on these exact assay names.
print("Unique assays in merged_stromal:", merged_stromal.obs["assay"].unique())

In [None]:
# Label transfer, CITE-seq -> Xenium: every Xenium cell receives the 'cluster' label
# of its single nearest CITE-seq neighbour in the Harmony-corrected PCA space.
is_reference = (merged_stromal.obs["assay"] == "citeSeq_stromal").to_numpy()
is_query = (merged_stromal.obs["assay"] == "Xenium_stromal").to_numpy()

train = merged_stromal[is_reference]
nn = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=16,
    weights='distance',
    metric='euclidean',
)
nn.fit(train.obsm["X_pca_harmony"], train.obs['cluster'])
labels = nn.predict(merged_stromal.obsm["X_pca_harmony"][is_query])

# Cell names must be unique before the index-aligned assignment below.
merged_stromal.obs_names_make_unique()
query_names = merged_stromal.obs.index[is_query]
merged_stromal.obs["citeSeq_to_Xenium_label"] = pd.Series(labels, index=query_names)

In [None]:
# UMAP coloured by the transferred CITE-seq label (only set for Xenium cells).
sc.pl.umap(merged_stromal, color='citeSeq_to_Xenium_label')

In [None]:
# Rename 'Xenium_stromal' to 'Xenium' and 'citeSeq_stromal' to 'citeSeq'
assay_renames = {
    'Xenium_stromal': 'Xenium',
    'citeSeq_stromal': 'citeSeq'
}
assay = merged_stromal.obs['assay']
if isinstance(assay.dtype, pd.CategoricalDtype):
    # Columns reloaded from h5ad are typically categorical. Rename the categories
    # directly: `Series.replace` on categorical data is deprecated in recent pandas.
    # Names missing from the mapping are passed through unchanged.
    merged_stromal.obs['assay'] = assay.cat.rename_categories(assay_renames)
else:
    merged_stromal.obs['assay'] = assay.replace(assay_renames)

In [None]:
# Export, for the Xenium cells only, the original 'Lvl4' annotation next to the
# label transferred from CITE-seq.
xenium_rows = merged_stromal.obs['assay'] == 'Xenium'
export_columns = ['orig.ident', 'Lvl4', 'citeSeq_to_Xenium_label']
citeSeq_to_xenium_predicted_stromal_labels = merged_stromal.obs.loc[xenium_rows, export_columns]

# Remove any '-Xenium' text from the cell names before writing the CSV
cleaned_names = citeSeq_to_xenium_predicted_stromal_labels.index.str.replace('-Xenium', '')
citeSeq_to_xenium_predicted_stromal_labels.index = cleaned_names
citeSeq_to_xenium_predicted_stromal_labels.to_csv('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/citeSeq_to_Xenium_predicted_stromal_celltype.csv')

In [None]:
# Label transfer, Xenium -> CITE-seq: every CITE-seq cell receives the 'Lvl4' label
# of its single nearest Xenium neighbour in the Harmony-corrected PCA space.
is_reference = (merged_stromal.obs["assay"] == "Xenium").to_numpy()
is_query = (merged_stromal.obs["assay"] == "citeSeq").to_numpy()

train = merged_stromal[is_reference]
nn = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=16,
    weights='distance',
    metric='euclidean',
)
nn.fit(train.obsm["X_pca_harmony"], train.obs['Lvl4'])
labels = nn.predict(merged_stromal.obsm["X_pca_harmony"][is_query])

# Index-aligned assignment: cells outside the query set are left without a value.
query_names = merged_stromal.obs.index[is_query]
merged_stromal.obs["Xenium_to_citeSeq_label"] = pd.Series(labels, index=query_names)

In [None]:
# Export, for the CITE-seq cells only, the original 'cluster' annotation next to the
# label transferred from Xenium.
citeseq_rows = merged_stromal.obs['assay'] == 'citeSeq'
export_columns = ['orig.ident', 'cluster', 'Xenium_to_citeSeq_label']
Xenium_to_citeSeq_predicted_stromal_labels = merged_stromal.obs.loc[citeseq_rows, export_columns]

# Remove any '-citeSeq' text from the cell names before writing the CSV
cleaned_names = Xenium_to_citeSeq_predicted_stromal_labels.index.str.replace('-citeSeq', '')
Xenium_to_citeSeq_predicted_stromal_labels.index = cleaned_names
Xenium_to_citeSeq_predicted_stromal_labels.to_csv('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_to_citeSeq_predicted_stromal_celltype.csv')

In [None]:
# Build two label columns that are defined for every cell of the merged object.
stromal_obs = merged_stromal.obs
from_xenium = stromal_obs['assay'] == 'Xenium'
from_citeseq = stromal_obs['assay'] == 'citeSeq'

# 'final_label_citeSeq': CITE-seq vocabulary
#   Xenium cells   -> label transferred from CITE-seq
#   CITE-seq cells -> their own 'cluster'
stromal_obs.loc[from_xenium, 'final_label_citeSeq'] = stromal_obs.loc[from_xenium, 'citeSeq_to_Xenium_label']
stromal_obs.loc[from_citeseq, 'final_label_citeSeq'] = stromal_obs.loc[from_citeseq, 'cluster']

# 'final_label_X': Xenium vocabulary (values cast to plain strings)
#   CITE-seq cells -> label transferred from Xenium
#   Xenium cells   -> their own 'Lvl4'
stromal_obs.loc[from_citeseq, 'final_label_X'] = stromal_obs.loc[from_citeseq, 'Xenium_to_citeSeq_label'].astype(str)
stromal_obs.loc[from_xenium, 'final_label_X'] = stromal_obs.loc[from_xenium, 'Lvl4'].astype(str)

# Show the combined labels, with the assay for orientation
sc.pl.umap(merged_stromal, color='final_label_citeSeq')
sc.pl.umap(merged_stromal, color='assay')
sc.pl.umap(merged_stromal, color='final_label_X')

In [None]:
# Rename the generic 'Cycling' label to 'Cycling stromal' in the combined label column
merged_stromal.obs['final_label_citeSeq'] = merged_stromal.obs['final_label_citeSeq'].replace({
    'Cycling': 'Cycling stromal',
})

In [None]:
# Apply the same 'Cycling' -> 'Cycling stromal' rename to the transferred-label column
merged_stromal.obs['citeSeq_to_Xenium_label'] = merged_stromal.obs['citeSeq_to_Xenium_label'].replace({
    'Cycling': 'Cycling stromal',
})

In [None]:
# Overwrite the integration checkpoint so it now includes the transferred labels.
merged_stromal.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_stromal_harmony.h5ad")

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os

# Spatial "small multiples" for one sample: one panel per fibroblast / endothelial
# label ('Fib*' / 'End*'), drawing that label's cells in red on top of every other
# cell of the sample in grey. The figure is saved as one page of a PDF.

sample_name = 's2r2_HV184'  # value of obs['sample'] to plot
rotation_angle = 30  # Rotation (degrees) applied to the centroids of this sample

# Output location; opening the PDF fails if the folder does not exist
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# NOTE(review): other plotting cells in this notebook write to this same file name,
# so running them overwrites this PDF — consider a per-panel file name.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

# Filter the data for the specific sample
adata_sample = merged_stromal[merged_stromal.obs['sample'] == sample_name]

# Use a plain-string copy of the labels so missing values cannot put NaN into the
# boolean masks built below.
sample_labels = adata_sample.obs['final_label_citeSeq'].astype(str)
unique_clusters = sample_labels[
    sample_labels.str.startswith("Fib") | sample_labels.str.startswith("End")
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f"No 'Fib*'/'End*' labels found for sample {sample_name!r}")

# Fixed number of rows; as many columns as needed to fit all clusters
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array of axes, whatever the grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    # Cell centroids of the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Rotate the centroids by `rotation_angle` around the origin
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Ratio of the rotated extents, applied to every panel below
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]

        # White panel without an outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Membership mask, computed once per cluster
        in_cluster = sample_labels == cluster

        # Grey background: all cells that do not carry this label
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red foreground: cells of the current cluster
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        ax.set_aspect(aspect_ratio)

        # Cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # No grid, ticks or tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the unused panels of the grid
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # Adjust the spacing between subplots
    fig.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05   # Height space between rows
    )

    # Figure title; black so that it is visible on the white background
    fig.suptitle(f'All Clusters in {sample_name}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the figure into the PDF; without this call the PDF stays empty.
    # bbox_inches='tight' keeps the title (placed above the canvas at y=1.02) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap showing, for every final (CITE-seq vocabulary) label, how its cells are
# distributed across the niches in 'niche_cc14'.

# Work on an explicit copy: `pd.DataFrame(existing_frame)` is not an independent
# copy, so the column cast below could otherwise modify `merged_stromal.obs`.
df = merged_stromal.obs.copy()
df['Lvl5'] = df['Lvl5'].astype('category')  # not used by the plot; kept so `df` is unchanged for later cells

# Cell counts per (label, niche); cells with a missing label or niche are skipped by groupby
niche_groups = df.groupby(['final_label_citeSeq', 'niche_cc14']).size().unstack(fill_value=0)

# Labels to leave out: anything containing 'Mix' or starting with 'Neuro'.
# na=False keeps the mask strictly boolean when a label is missing.
is_excluded = (
    df['final_label_citeSeq'].str.contains('Mix', na=False)
    | df['final_label_citeSeq'].str.startswith('Neuro', na=False)
)
clusters_to_remove = df.loc[is_excluded, 'final_label_citeSeq'].unique()

# Filter out these clusters from the plot
niche_groups_filtered = niche_groups[~niche_groups.index.isin(clusters_to_remove)]

# Row-normalise: each label's counts become fractions that sum to 1
# (a label without any counted cell yields an all-NaN row)
niche_groups_relative = niche_groups_filtered.div(niche_groups_filtered.sum(axis=1), axis=0)

# Plot heatmap with enhanced colormap
plt.figure(figsize=(16, 20))  # Adjusted figsize to accommodate the x-axis labels
sns.heatmap(niche_groups_relative, cmap='plasma', cbar_kws={'label': 'Relative Frequency'}, linewidths=0.05, linecolor='black')
plt.title('Relative Cell Type Distribution in Niches')
plt.xlabel('Niche')
plt.ylabel('Cell Cluster')
plt.xticks(rotation=45, ha='right')  # Adjust rotation and alignment of xticks
plt.yticks(rotation=0)
plt.tight_layout()  # Ensures all elements fit within the figure area
plt.show()


In [None]:
# Visualize xenium annotations vs transferred scRNAseq annotations
# `.copy()` makes the subset an independent object: assigning into `.obs` of an
# AnnData view would otherwise trigger an implicit copy with a warning.
xen_obj = merged_stromal[merged_stromal.obs['assay']=='Xenium'].copy()
xen_obj.obs['Lvl4'] = xen_obj.obs['Lvl4'].astype('str')

# Count cells per (Xenium 'Lvl4', transferred CITE-seq label); absent pairs count as 0
celltype_counts = xen_obj.obs.groupby(['Lvl4','citeSeq_to_Xenium_label']).size().unstack(fill_value=0)
celltype_counts.index.name = 'Xenium cell type'
celltype_counts.columns.name = 'predicted citeSeq cell type'

# Rows = predicted CITE-seq label, columns = Xenium cell type
celltype_counts = celltype_counts.T
# Row scale co-occurrence frequencies (by predicted scRNAseq cell type)
celltype_counts = celltype_counts.div(celltype_counts.sum(axis=1), axis=0) 
# Rows without any cell give 0/0 = NaN; set them to 0 before looking for maxima
celltype_counts = celltype_counts.fillna(0)
# Order the columns by the row label in which each column peaks
celltype_counts = celltype_counts.loc[:,celltype_counts.idxmax(axis=0).sort_values().index]

In [None]:
# Total (absolute) weight of every column of the co-occurrence table
column_sums = celltype_counts.abs().sum(axis=0)
print(column_sums)

# Keep only the columns whose total reaches the 0.1 threshold
passes_threshold = (column_sums >= 0.1).to_numpy()
columns_to_keep = column_sums.index[passes_threshold]
print(columns_to_keep)

celltype_counts2 = celltype_counts.loc[:, columns_to_keep]
print(celltype_counts2)

In [None]:
# Heatmap of the filtered co-occurrence table, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(26, 14))
sns.heatmap(celltype_counts2, cmap='YlGnBu', ax=ax)

In [None]:
# Load the TNK reference AnnData from the CITE-seq h5ad folder.
refTNK = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/TNK.h5ad')

In [None]:
# Inspect the per-cell metadata stored in the h5ad file.
refTNK.obs

In [None]:
# Inspect the per-gene metadata.
refTNK.var

In [None]:
import pandas as pd

# Path of the TNK metadata table that accompanies the h5ad file
file_path = "/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/TNK_metadata.csv"

# Read the CSV into a DataFrame
dfTNK = pd.read_csv(file_path)

# Display the DataFrame
print(dfTNK)

# Access specific columns or rows
#print(dfTNK["ColumnName"])  # Access a column
#print(dfTNK.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Sanity check: the metadata table must have exactly one row per cell in `refTNK`.
if len(dfTNK) != refTNK.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): the columns below are copied positionally (`.values`), i.e. CSV row i
# is attached to cell i of `refTNK` without comparing any identifier. This assumes
# the CSV rows are in the same order as `refTNK.obs` — TODO confirm, e.g. by
# comparing the CSV's identifier column against `refTNK.obs_names`.

# Copy every CSV column into `refTNK.obs`
for column in dfTNK.columns:
    refTNK.obs[column] = dfTNK[column].values

# Save the updated AnnData object (optional)
#refTNK.write("updated_adata.h5ad")


In [None]:
# Check that the CSV columns are now part of the cell metadata.
refTNK.obs

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV columns, if it exists.
# `errors='ignore'` turns the drop into a no-op when the column is absent.
refTNK.obs.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# Show the first rows to confirm the column is gone
print(refTNK.obs.head())


In [None]:
# Standardise the label column name: 'TNKClusters' -> 'cluster'.
refTNK.obs.rename(columns={"TNKClusters": "cluster"}, inplace=True)

In [None]:
refTNK.obs

In [None]:
# Load the BPlasma reference AnnData from the CITE-seq h5ad folder.
refBPl = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/BPlasma.h5ad')

In [None]:
# Inspect the per-cell metadata stored in the h5ad file.
refBPl.obs

In [None]:
# Inspect the per-gene metadata.
refBPl.var

In [None]:
import pandas as pd

# Path of the BPlasma metadata table that accompanies the h5ad file
file_path = "/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/BPlasma_metadata.csv"

# Read the CSV into a DataFrame
dfBPl = pd.read_csv(file_path)

# Display the DataFrame
print(dfBPl)

# Access specific columns or rows
#print(dfBPl["ColumnName"])  # Access a column
#print(dfBPl.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Sanity check: the metadata table must have exactly one row per cell in `refBPl`.
if len(dfBPl) != refBPl.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): the columns below are copied positionally (`.values`), i.e. CSV row i
# is attached to cell i of `refBPl` without comparing any identifier. This assumes
# the CSV rows are in the same order as `refBPl.obs` — TODO confirm, e.g. by
# comparing the CSV's identifier column against `refBPl.obs_names`.

# Copy every CSV column into `refBPl.obs`
for column in dfBPl.columns:
    refBPl.obs[column] = dfBPl[column].values

# Save the updated AnnData object (optional)
#refBPl.write("updated_adata.h5ad")


In [None]:
# Check that the CSV columns are now part of the cell metadata.
refBPl.obs

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV columns, if it exists.
# `errors='ignore'` turns the drop into a no-op when the column is absent.
refBPl.obs.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# Show the first rows to confirm the column is gone
print(refBPl.obs.head())


In [None]:
# Standardise the label column name: 'BPlasmaClusters' -> 'cluster'.
refBPl.obs.rename(columns={"BPlasmaClusters": "cluster"}, inplace=True)

In [None]:
refBPl.obs

In [None]:
# Load the Myeloid reference AnnData from the CITE-seq h5ad folder.
refMyel = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/Myeloid.h5ad')

In [None]:
# Inspect the per-cell metadata stored in the h5ad file.
refMyel.obs

In [None]:
import pandas as pd

# Path of the Myeloid metadata table that accompanies the h5ad file
file_path = "/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/Myeloid_metadata.csv"

# Read the CSV into a DataFrame
dfMyel = pd.read_csv(file_path)

# Display the DataFrame
print(dfMyel)

# Access specific columns or rows
#print(dfMyel["ColumnName"])  # Access a column
#print(dfMyel.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Sanity check: the metadata table must have exactly one row per cell in `refMyel`.
if len(dfMyel) != refMyel.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# NOTE(review): the columns below are copied positionally (`.values`), i.e. CSV row i
# is attached to cell i of `refMyel` without comparing any identifier. This assumes
# the CSV rows are in the same order as `refMyel.obs` — TODO confirm, e.g. by
# comparing the CSV's identifier column against `refMyel.obs_names`.

# Copy every CSV column into `refMyel.obs`
for column in dfMyel.columns:
    refMyel.obs[column] = dfMyel[column].values

# Save the updated AnnData object (optional)
#refMyel.write("updated_adata.h5ad")


In [None]:
# Check that the CSV columns are now part of the cell metadata.
refMyel.obs

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV columns, if it exists.
# `errors='ignore'` turns the drop into a no-op when the column is absent.
refMyel.obs.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# Show the first rows to confirm the column is gone
print(refMyel.obs.head())


In [None]:
# Standardise the label column name: 'MyeloidClusters' -> 'cluster'.
refMyel.obs.rename(columns={"MyeloidClusters": "cluster"}, inplace=True)

In [None]:
refMyel.obs

In [None]:
import anndata as ad

# Stack the three immune references (TNK, BPlasma, Myeloid) cell-wise into one object.
# join='outer' keeps the union of the variables of the three inputs.
# NOTE(review): variables missing from one input are filled in by anndata; confirm
# the fill value is acceptable for the scaling/PCA performed later.
ref_immune_combined = ad.concat([refTNK, refBPl, refMyel], join='outer')

# Save the combined AnnData object (optional)
#ref_immune_combined.write("combined_adata.h5ad")


In [None]:
# Inspect the combined per-cell metadata.
ref_immune_combined.obs

In [None]:
# Inspect the combined per-gene metadata.
ref_immune_combined.var

In [None]:
# (n_cells, n_genes) of the combined reference
print(ref_immune_combined.shape)

In [None]:
ref_immune_combined.X

In [None]:
# Persist the combined immune reference.
ref_immune_combined.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Immune.h5ad")

In [None]:
# Reload the saved reference from disk; the integration below works on this object.
ref_immune =  sc.read_h5ad("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Immune.h5ad")

In [None]:
# Reload the processed Xenium dataset (replaces the `adata` loaded earlier).
adata = sc.read_h5ad('/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad')

In [None]:
import pandas as pd

# Cross-tabulate cell counts: rows = 'Lvl1', columns = 'Lvl4'
cluster_counts = (
    adata.obs.groupby(['Lvl1', 'Lvl4']).size().unstack(fill_value=0)
)

# Print the complete table. `option_context` lifts the display limits only for this
# print and then restores the settings that were active before, even if printing
# raises (a manual set_option/reset_option pair resets to the pandas defaults
# instead and leaves the options changed on error).
with pd.option_context(
    'display.max_columns', None,  # Show all columns
    'display.max_rows', None,     # Show all rows
    'display.width', None,        # Avoid line breaks for wide DataFrames
):
    print(cluster_counts)


In [None]:
# Re-file the mixed 'Lvl4' clusters 'T.B.Mix' and 'Mac.Neut.Mix' under Lvl1 = 'Immune'.
# NOTE(review): if 'Lvl1' is categorical this presumes 'Immune' is already one of its
# categories; assigning an unknown category raises — TODO confirm.
adata.obs.loc[adata.obs['Lvl4'].isin(['T.B.Mix', 'Mac.Neut.Mix']), 'Lvl1'] = 'Immune'

In [None]:
# Re-file 'Mac.Neut.Mix' under Lvl2 = 'Myeloid' (same categorical caveat as for 'Lvl1').
adata.obs.loc[adata.obs['Lvl4'].isin(['Mac.Neut.Mix']), 'Lvl2'] = 'Myeloid'

In [None]:
import pandas as pd

# Cross-tabulate cell counts: rows = 'Lvl2', columns = 'Lvl4'
cluster_counts = (
    adata.obs.groupby(['Lvl2', 'Lvl4']).size().unstack(fill_value=0)
)

# Print the complete table. `option_context` lifts the display limits only for this
# print and then restores the settings that were active before, even if printing
# raises (a manual set_option/reset_option pair resets to the pandas defaults
# instead and leaves the options changed on error).
with pd.option_context(
    'display.max_columns', None,  # Show all columns
    'display.max_rows', None,     # Show all rows
    'display.width', None,        # Avoid line breaks for wide DataFrames
):
    print(cluster_counts)


In [None]:
import pandas as pd

# Cross-tabulate cell counts again (rows = 'Lvl1', columns = 'Lvl4') to check the
# effect of the reassignments made above
cluster_counts = (
    adata.obs.groupby(['Lvl1', 'Lvl4']).size().unstack(fill_value=0)
)

# Print the complete table. `option_context` lifts the display limits only for this
# print and then restores the settings that were active before, even if printing
# raises (a manual set_option/reset_option pair resets to the pandas defaults
# instead and leaves the options changed on error).
with pd.option_context(
    'display.max_columns', None,  # Show all columns
    'display.max_rows', None,     # Show all rows
    'display.width', None,        # Avoid line breaks for wide DataFrames
):
    print(cluster_counts)


In [None]:
#After T cell clustering save
# NOTE(review): this is the same path `adata` was read from, so the input file is
# overwritten in place with the edited 'Lvl1'/'Lvl2' columns — keep a backup if the
# original annotations are still needed.
adata.write_h5ad("/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad")

In [None]:
# Keep only cells whose 'Lvl1' annotation is 'Immune'.
# `.copy()` turns the subset into an independent object instead of a view of `adata`.
immune_adata = adata[adata.obs['Lvl1'].isin(['Immune'])].copy()

In [None]:
immune_adata.obs

In [None]:
# Add assay information to obs for clarity
immune_adata.obs['assay'] = 'Xenium'
ref_immune.obs['assay'] = 'citeSeq'

# Merge datasets with batch key and categories.
# `batch_key="assay"` labels every merged cell with the matching entry of
# `batch_categories` (first entry for `immune_adata`, second for `ref_immune`).
# NOTE(review): `AnnData.concatenate` is deprecated in favour of `anndata.concat`.
# The two differ in how obs/var columns are merged, so migrating needs a check that
# both 'cluster' and 'Lvl4' survive in the merged `.obs` — confirm before switching.
# NOTE(review): with `index_unique=None` the cell names are left untouched, so names
# shared by both datasets stay duplicated (a later cell calls
# `obs_names_make_unique()` before assigning by index).
merged_immune = immune_adata.concatenate(
    ref_immune,
    batch_key="assay",
    batch_categories=["Xenium", "citeSeq"],
    index_unique=None  # Avoids adding a suffix to index entries
)

In [None]:
# Scale the merged expression matrix and compute a PCA on it.
# NOTE(review): both assays are scaled jointly; this presumes `.X` holds comparable
# (e.g. normalised) values in the Xenium and CITE-seq parts — TODO confirm.
sc.pp.scale(merged_immune)
sc.tl.pca(merged_immune)

In [None]:
# Batch-correct the PCA embedding across the two assays with Harmony.
# The corrected embedding is read from obsm["X_pca_harmony"] in the cells below.
sc.external.pp.harmony_integrate(merged_immune, key="assay", max_iter_harmony=20, max_iter_kmeans=30)

In [None]:
# Visualize merged with UMAP embedding; takes a long time
# The neighbour graph is built on the Harmony-corrected embedding, not on raw PCA.
sc.pp.neighbors(merged_immune, n_neighbors=50, use_rep="X_pca_harmony", metric="correlation")
sc.tl.umap(merged_immune, min_dist=0.5)

In [None]:
# UMAP coloured by assay of origin (visual check of how well the assays mix).
sc.pl.umap(merged_immune, color='assay')

In [None]:
# UMAP coloured by the 'cluster' column (set on the CITE-seq references).
sc.pl.umap(merged_immune, color='cluster')

In [None]:
# UMAP coloured by the 'Lvl4' column.
sc.pl.umap(merged_immune, color='Lvl4')

In [None]:
# Checkpoint: save the integrated object so the steps above need not be re-run.
merged_immune.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_immune_harmony.h5ad")

In [None]:
# Resume from the checkpoint.
merged_immune = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_immune_harmony.h5ad')

In [None]:
merged_immune.obs

In [None]:
# Label transfer, CITE-seq -> Xenium: every Xenium cell receives the 'cluster' label
# of its single nearest CITE-seq neighbour in the Harmony-corrected PCA space.
is_reference = (merged_immune.obs["assay"] == "citeSeq").to_numpy()
is_query = (merged_immune.obs["assay"] == "Xenium").to_numpy()

train = merged_immune[is_reference]
nn = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=16,
    weights='distance',
    metric='euclidean',
)
nn.fit(train.obsm["X_pca_harmony"], train.obs['cluster'])
labels = nn.predict(merged_immune.obsm["X_pca_harmony"][is_query])

# Cell names must be unique before the index-aligned assignment below.
merged_immune.obs_names_make_unique()
query_names = merged_immune.obs.index[is_query]
merged_immune.obs["citeSeq_to_Xenium_label"] = pd.Series(labels, index=query_names)

In [None]:
# UMAP coloured by the transferred CITE-seq label (only set for Xenium cells).
sc.pl.umap(merged_immune, color='citeSeq_to_Xenium_label')

In [None]:
# Export, for the Xenium cells only, the original 'Lvl4' annotation next to the
# label transferred from CITE-seq.
xenium_rows = merged_immune.obs['assay'] == 'Xenium'
export_columns = ['orig.ident', 'Lvl4', 'citeSeq_to_Xenium_label']
citeSeq_to_xenium_predicted_immune_labels = merged_immune.obs.loc[xenium_rows, export_columns]

# Remove any '-Xenium' text from the cell names before writing the CSV
cleaned_names = citeSeq_to_xenium_predicted_immune_labels.index.str.replace('-Xenium', '')
citeSeq_to_xenium_predicted_immune_labels.index = cleaned_names
citeSeq_to_xenium_predicted_immune_labels.to_csv('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/citeSeq_to_Xenium_predicted_immune_celltype.csv')

In [None]:
# Label transfer, Xenium -> CITE-seq: every CITE-seq cell receives the 'Lvl4' label
# of its single nearest Xenium neighbour in the Harmony-corrected PCA space.
is_reference = (merged_immune.obs["assay"] == "Xenium").to_numpy()
is_query = (merged_immune.obs["assay"] == "citeSeq").to_numpy()

train = merged_immune[is_reference]
nn = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=16,
    weights='distance',
    metric='euclidean',
)
nn.fit(train.obsm["X_pca_harmony"], train.obs['Lvl4'])
labels = nn.predict(merged_immune.obsm["X_pca_harmony"][is_query])

# Index-aligned assignment: cells outside the query set are left without a value.
query_names = merged_immune.obs.index[is_query]
merged_immune.obs["Xenium_to_citeSeq_label"] = pd.Series(labels, index=query_names)

In [None]:
# Export, for the CITE-seq cells only, the original 'cluster' annotation next to the
# label transferred from Xenium.
citeseq_rows = merged_immune.obs['assay'] == 'citeSeq'
export_columns = ['orig.ident', 'cluster', 'Xenium_to_citeSeq_label']
Xenium_to_citeSeq_predicted_immune_labels = merged_immune.obs.loc[citeseq_rows, export_columns]

# Remove any '-citeSeq' text from the cell names before writing the CSV
cleaned_names = Xenium_to_citeSeq_predicted_immune_labels.index.str.replace('-citeSeq', '')
Xenium_to_citeSeq_predicted_immune_labels.index = cleaned_names
Xenium_to_citeSeq_predicted_immune_labels.to_csv('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_to_citeSeq_predicted_immune_celltype.csv')

In [None]:
# Build two label columns that are defined for every cell of the merged object.
immune_obs = merged_immune.obs
from_xenium = immune_obs['assay'] == 'Xenium'
from_citeseq = immune_obs['assay'] == 'citeSeq'

# 'final_label_citeSeq': CITE-seq vocabulary
#   Xenium cells   -> label transferred from CITE-seq
#   CITE-seq cells -> their own 'cluster'
immune_obs.loc[from_xenium, 'final_label_citeSeq'] = immune_obs.loc[from_xenium, 'citeSeq_to_Xenium_label']
immune_obs.loc[from_citeseq, 'final_label_citeSeq'] = immune_obs.loc[from_citeseq, 'cluster']

# 'final_label_X': Xenium vocabulary (values cast to plain strings)
#   CITE-seq cells -> label transferred from Xenium
#   Xenium cells   -> their own 'Lvl4'
immune_obs.loc[from_citeseq, 'final_label_X'] = immune_obs.loc[from_citeseq, 'Xenium_to_citeSeq_label'].astype(str)
immune_obs.loc[from_xenium, 'final_label_X'] = immune_obs.loc[from_xenium, 'Lvl4'].astype(str)

# Show the combined labels, with the assay for orientation
sc.pl.umap(merged_immune, color='final_label_citeSeq')
sc.pl.umap(merged_immune, color='assay')
sc.pl.umap(merged_immune, color='final_label_X')

In [None]:
# Rename the generic 'Cycling' label to 'Cycling myeloid' in the combined label column
merged_immune.obs['final_label_citeSeq'] = merged_immune.obs['final_label_citeSeq'].replace({
    'Cycling': 'Cycling myeloid',
})

In [None]:
# Apply the same 'Cycling' -> 'Cycling myeloid' rename to the transferred-label column
merged_immune.obs['citeSeq_to_Xenium_label'] = merged_immune.obs['citeSeq_to_Xenium_label'].replace({
    'Cycling': 'Cycling myeloid',
})

In [None]:
# List the distinct transferred labels to check the rename.
merged_immune.obs['citeSeq_to_Xenium_label'].unique().tolist()

In [None]:
# Overwrite the integration checkpoint so it now includes the transferred labels.
merged_immune.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_immune_harmony.h5ad")

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os

# Spatial "small multiples" for one sample: one panel per label starting with 'B'
# or 'Pl', drawing that label's cells in red on top of every other cell of the
# sample in grey. The figure is saved as one page of a PDF.

sample_name = 's2r3'  # value of obs['sample'] to plot
rotation_angle = 30  # Rotation (degrees) applied to the centroids of this sample

# Output location; opening the PDF fails if the folder does not exist
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
os.makedirs(save_directory, exist_ok=True)
# NOTE(review): other plotting cells in this notebook write to this same file name,
# so running them overwrites this PDF — consider a per-panel file name.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

# Filter the data for the specific sample
adata_sample = merged_immune[merged_immune.obs['sample'] == sample_name]

# Use a plain-string copy of the labels so missing values cannot put NaN into the
# boolean masks built below.
sample_labels = adata_sample.obs['final_label_citeSeq'].astype(str)
unique_clusters = sample_labels[
    sample_labels.str.startswith("B") | sample_labels.str.startswith("Pl")
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    raise ValueError(f"No 'B*'/'Pl*' labels found for sample {sample_name!r}")

# Fixed number of rows; as many columns as needed to fit all clusters
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))

with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D array of axes, whatever the grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.flatten()

    # Cell centroids of the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Rotate the centroids by `rotation_angle` around the origin
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Ratio of the rotated extents, applied to every panel below
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]

        # White panel without an outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Membership mask, computed once per cluster
        in_cluster = sample_labels == cluster

        # Grey background: all cells that do not carry this label
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red foreground: cells of the current cluster
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        ax.set_aspect(aspect_ratio)

        # Cluster name at the bottom of the panel
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # No grid, ticks or tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the unused panels of the grid
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # Adjust the spacing between subplots
    fig.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05   # Height space between rows
    )

    # Figure title built from the plotted sample; black so that it is visible on the
    # white background
    fig.suptitle(f'All Clusters in {sample_name}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the figure into the PDF; without this call the PDF stays empty.
    # bbox_inches='tight' keeps the title (placed above the canvas at y=1.02) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


# ---------------------------------------------------------------------------
# Spatial panels for one sample: one subplot per selected cluster, drawing that
# cluster's cells in red on top of all remaining cells of the sample in grey.
# The finished figure is written to a single-page PDF.
# ---------------------------------------------------------------------------
# NOTE(review): other cells in this notebook select samples with ids like
# 's2r2_HV184' — confirm that 's2r3' is an actual value of obs['sample'].
sample_id = 's2r3'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
# NOTE(review): the other panel cells in this notebook use this same file name,
# so each run overwrites the figure written by the previous one.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
os.makedirs(save_directory, exist_ok=True)

# Filter the data for the specific sample
adata_sample = merged_immune[merged_immune.obs['sample'] == sample_id]

# Plain-string copy of the labels: missing values become 'nan', so the prefix
# matching and the per-cluster comparisons below never have to deal with NaN.
cluster_labels = adata_sample.obs['final_label_citeSeq'].astype(str)

# Only clusters whose label starts with "CD" or "T" get a panel
unique_clusters = cluster_labels[
    cluster_labels.str.startswith("CD") | cluster_labels.str.startswith("T")
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    # plt.subplots() would otherwise fail with an obscure "columns must be positive" error
    raise ValueError(f"No matching clusters found for sample {sample_id!r}; nothing to plot")

# Define the number of rows and columns
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))  # Calculate columns based on clusters and rows

# Create a PdfPages object to save the plots
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D axes array, so ravel() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.ravel()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Apply rotation (standard 2-D rotation about the origin)
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Determine aspect ratio
    # NOTE(review): passing x_range / y_range to set_aspect stretches the data so each
    # panel's data area is square; use 'equal' if true tissue proportions are wanted.
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # Loop over each unique cluster in the ordered list
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]
        in_cluster = cluster_labels == cluster  # computed once, reused for both layers

        # White background, no outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey layer: every cell that is NOT in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red layer: cells of the current cluster, drawn last so they stay visible
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Add the cluster name near the bottom edge of the panel (axes coordinates)
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids, ticks and tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # Define the directory and file name for the TIFF file
    tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    fig.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05   # Height space between rows
    )

    # Title names the sample actually plotted; black so it is visible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the PdfPages file stays empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02, above the canvas) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


# ---------------------------------------------------------------------------
# Spatial panels for one sample: one subplot per selected cluster, drawing that
# cluster's cells in red on top of all remaining cells of the sample in grey.
# The finished figure is written to a single-page PDF.
# ---------------------------------------------------------------------------
# NOTE(review): other cells in this notebook select samples with ids like
# 's2r2_HV184' — confirm that 's2r3' is an actual value of obs['sample'].
sample_id = 's2r3'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
# NOTE(review): the other panel cells in this notebook use this same file name,
# so each run overwrites the figure written by the previous one.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
os.makedirs(save_directory, exist_ok=True)

# Filter the data for the specific sample
adata_sample = merged_immune[merged_immune.obs['sample'] == sample_id]

# Plain-string copy of the labels: missing values become 'nan', so the prefix
# matching and the per-cluster comparisons below never have to deal with NaN.
cluster_labels = adata_sample.obs['final_label_citeSeq'].astype(str)

# Only clusters whose label starts with "A" or "N" get a panel
unique_clusters = cluster_labels[
    cluster_labels.str.startswith("A") | cluster_labels.str.startswith("N")
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    # plt.subplots() would otherwise fail with an obscure "columns must be positive" error
    raise ValueError(f"No matching clusters found for sample {sample_id!r}; nothing to plot")

# Define the number of rows and columns
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))  # Calculate columns based on clusters and rows

# Create a PdfPages object to save the plots
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D axes array, so ravel() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.ravel()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Apply rotation (standard 2-D rotation about the origin)
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Determine aspect ratio
    # NOTE(review): passing x_range / y_range to set_aspect stretches the data so each
    # panel's data area is square; use 'equal' if true tissue proportions are wanted.
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # Loop over each unique cluster in the ordered list
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]
        in_cluster = cluster_labels == cluster  # computed once, reused for both layers

        # White background, no outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey layer: every cell that is NOT in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red layer: cells of the current cluster, drawn last so they stay visible
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Add the cluster name near the bottom edge of the panel (axes coordinates)
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids, ticks and tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # Define the directory and file name for the TIFF file
    tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    fig.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05   # Height space between rows
    )

    # Title names the sample actually plotted; black so it is visible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the PdfPages file stays empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02, above the canvas) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of how each cell cluster (row) is distributed across niches (column).
# Take an explicit copy of the metadata: pd.DataFrame(obs) does not copy the underlying
# data, so (depending on the pandas version) the column assignment below could
# otherwise leak back into merged_immune.obs.
df = merged_immune.obs.copy()
df['Lvl5'] = df['Lvl5'].astype('category')

# Aggregate counts: rows = cluster label, columns = niche.
# observed=False spells out the behaviour the bare groupby had (all categories listed).
niche_groups = (
    df.groupby(['final_label_citeSeq', 'niche_cc14'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Identify clusters to remove: any label containing 'Mix' or starting with 'Neuro'.
# Matching is done on a string copy so missing labels ('nan') simply never match.
labels_as_str = df['final_label_citeSeq'].astype(str)
unwanted_rows = labels_as_str.str.contains('Mix', regex=False) | labels_as_str.str.startswith('Neuro')

# Get the unique clusters to be removed
clusters_to_remove = df.loc[unwanted_rows, 'final_label_citeSeq'].unique()

# Filter out these clusters from the plot
niche_groups_filtered = niche_groups[~niche_groups.index.isin(clusters_to_remove)]

# Drop clusters that have no cell in any niche: their row total is 0 and the
# normalisation below would turn them into all-NaN (blank) heatmap rows.
niche_groups_filtered = niche_groups_filtered[niche_groups_filtered.sum(axis=1) > 0]

# Calculate relative frequencies (each row sums to 1)
niche_groups_relative = niche_groups_filtered.div(niche_groups_filtered.sum(axis=1), axis=0)

# Plot heatmap with enhanced colormap
plt.figure(figsize=(16, 20))  # Adjusted figsize to accommodate the x-axis labels
sns.heatmap(niche_groups_relative, cmap='plasma', cbar_kws={'label': 'Relative Frequency'}, linewidths=0.05, linecolor='black')
plt.title('Relative Cell Type Distribution in Niches')
plt.xlabel('Niche')
plt.ylabel('Cell Cluster')
plt.xticks(rotation=45, ha='right')  # Adjust rotation and alignment of xticks
plt.yticks(rotation=0)
plt.tight_layout()  # Ensures all elements fit within the figure area
plt.show()


In [None]:
# Visualize xenium annotations vs transferred scRNAseq annotations
# .copy() turns the AnnData view into a real object, so the write to .obs below is
# explicit instead of relying on anndata's implicit copy-on-modify (which warns).
xen_obj = merged_immune[merged_immune.obs['assay']=='Xenium'].copy()
xen_obj.obs['Lvl4'] = xen_obj.obs['Lvl4'].astype('str')

# Co-occurrence counts: rows = Xenium 'Lvl4' type, columns = label predicted from citeSeq.
# Missing combinations are filled with 0 straight away rather than left as NaN.
celltype_counts = (
    xen_obj.obs.groupby(['Lvl4', 'citeSeq_to_Xenium_label'], observed=False)
    .size()
    .unstack(fill_value=0)
)
celltype_counts.index.name = 'Xenium cell type'
celltype_counts.columns.name = 'predicted citeSeq cell type'

# After the transpose: rows = predicted citeSeq type, columns = Xenium type
celltype_counts = celltype_counts.T

# Row scale co-occurrence frequencies (by predicted scRNAseq cell type)
celltype_counts = celltype_counts.div(celltype_counts.sum(axis=1), axis=0)

# Rows with a zero total become NaN (0/0); zero them BEFORE ordering so idxmax never sees NaN
celltype_counts = celltype_counts.fillna(0)

# Order the Xenium columns by the predicted type in which each one peaks
celltype_counts = celltype_counts.loc[:, celltype_counts.idxmax(axis=0).sort_values().index]

In [None]:
# Total absolute weight of every column of the co-occurrence table
column_sums = celltype_counts.abs().sum(axis=0)
print(column_sums)

# Keep only the columns whose total reaches the 0.1 threshold
columns_to_keep = column_sums.loc[column_sums.ge(0.1)].index
print(columns_to_keep)

# Thresholded table used by the heatmap cell that follows
celltype_counts2 = celltype_counts.loc[:, columns_to_keep]
print(celltype_counts2)

In [None]:
# Heatmap of the thresholded co-occurrence table (celltype_counts2) on a wide 26x14 canvas
plt.figure(figsize = (26,14))
sns.heatmap(celltype_counts2, cmap='YlGnBu')

In [None]:
# Restrict the metadata table to Xenium cells
is_xenium_cell = merged_immune.obs['assay'] == 'Xenium'
subset = merged_immune.obs[is_xenium_cell]

# Tally how many Xenium cells carry each 'Lvl4' annotation
counts = subset['Lvl4'].value_counts()

# Display the result
print(counts)

In [None]:
# Load the epithelial citeSeq reference object from its h5ad file
refEpi = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/Epithelial.h5ad')

In [None]:
# Inspect the per-cell metadata (obs) of the epithelial reference as loaded
refEpi.obs

In [None]:
# Inspect the per-feature metadata (var) of the epithelial reference
refEpi.var

In [None]:
import pandas as pd

# Specify the file path of the epithelial metadata table that accompanies Epithelial.h5ad
file_path = "/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/NEW CITE seq h5ad files 2025.01/Epithelial_metadata.csv"

# Read the CSV into a DataFrame (default parsing: first row is the header, no index column)
dfEpi = pd.read_csv(file_path)

# Display the DataFrame
print(dfEpi)

# Access specific columns or rows
#print(dfEpi["ColumnName"])  # Access a column
#print(dfEpi.iloc[0])        # Access the first row


In [None]:
import pandas as pd
import anndata as ad

# Copy every column of the metadata CSV (dfEpi) into refEpi.obs.
# Check the dimensions: the only safeguard here is that both tables have the same row count.
if len(dfEpi) != refEpi.n_obs:
    raise ValueError("Number of rows in the CSV file does not match the number of observations in adata.obs")

# Optionally, ensure the rows align
# If the index of `csv_data` and `adata.obs` don't match, you may need to align them:
# csv_data.index = adata.obs.index

# Add the CSV columns to `adata.obs`
# `.values` drops the CSV's index, so rows are matched purely by position.
# NOTE(review): this assumes the CSV rows are in the same cell order as refEpi.obs —
# nothing in this cell verifies it; confirm against how the CSV was exported.
for column in dfEpi.columns:
    refEpi.obs[column] = dfEpi[column].values

# Save the updated AnnData object (optional)
#adata.write("updated_adata.h5ad")


In [None]:
# Inspect obs again now that the CSV columns have been attached
refEpi.obs

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV columns, when it is present
# (errors='ignore' makes this a no-op if the column does not exist)
refEpi.obs = refEpi.obs.drop(columns=['Unnamed: 0'], errors='ignore')

# Show the first rows to confirm the column is gone
print(refEpi.obs.head())


In [None]:
# Rename the annotation column to the generic name 'cluster', which the label-transfer
# cells later in this notebook read from the citeSeq part of the merged object
refEpi.obs.rename(columns={"EpithelialClusters": "cluster"}, inplace=True)

In [None]:
# Inspect obs to confirm the renamed 'cluster' column
refEpi.obs

In [None]:
# Inspect the expression matrix of the reference
refEpi.X

In [None]:
# Rename a '_index' column in raw.var to 'index_backup' before the object is written.
# NOTE(review): presumably needed because anndata reserves the '_index' name when
# writing h5ad files — confirm against the installed anndata version.
# The `raw is not None` guard avoids an AttributeError on objects that carry no .raw.
if refEpi.raw is not None and '_index' in refEpi.raw.var.columns:
    refEpi.raw.var.rename(columns={'_index': 'index_backup'}, inplace=True)


In [None]:
# Save the modified epithelial reference (CSV metadata attached, column renamed) to the Modified_citeseq_files folder
refEpi.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Epithelial.h5ad")

In [None]:
# Inspect obs of the reference that was just written
refEpi.obs

In [None]:
# Read the modified epithelial reference back from disk under the name used for integration (ref_epi)
ref_epi = sc.read_h5ad("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Epithelial.h5ad")

In [None]:
# Load the processed Xenium object; the cells below subset it by its 'Lvl1' annotation
adata = sc.read_h5ad('/data/vasileiosionat2/Xenium/Drake_outputs/ccProcessed.h5ad')

In [None]:
# Keep only cells annotated 'Epithelial' at Lvl1; .copy() makes an independent object rather than a view
epi_adata = adata[adata.obs['Lvl1'].isin(['Epithelial'])].copy()

In [None]:
# Inspect the metadata of the epithelial Xenium subset
epi_adata.obs

In [None]:
# Add assay information to obs for clarity
# NOTE(review): the concatenate call below also writes an 'assay' column
# (batch_key="assay"), so these two assignments look redundant — confirm before removing.
epi_adata.obs['assay'] = 'Xenium'
ref_epi.obs['assay'] = 'citeSeq'

# Merge datasets with batch key and categories
# The order of batch_categories follows the order of the objects: epi_adata -> "Xenium",
# ref_epi -> "citeSeq".
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in favour of
# anndata.concat — verify against the installed version before upgrading.
merged_epi = epi_adata.concatenate(
    ref_epi,
    batch_key="assay",
    batch_categories=["Xenium", "citeSeq"],
    index_unique=None  # Avoids adding a suffix to index entries
)

In [None]:
# Scale the merged expression matrix in place, then compute a PCA on it (scanpy defaults for both)
sc.pp.scale(merged_epi)
sc.tl.pca(merged_epi)

In [None]:
# Run Harmony with 'assay' as the batch variable; later cells read the corrected
# embedding from obsm['X_pca_harmony']
sc.external.pp.harmony_integrate(merged_epi, key="assay", max_iter_harmony=20, max_iter_kmeans=30)

In [None]:
# Visualize merged with UMAP embedding; takes a long time
# The neighbour graph is built on the Harmony-corrected PCs (50 neighbours, correlation
# distance); the UMAP is then computed from that graph.
sc.pp.neighbors(merged_epi, n_neighbors=50, use_rep="X_pca_harmony", metric="correlation")
sc.tl.umap(merged_epi, min_dist=0.5)

In [None]:
# UMAP coloured by assay (Xenium vs citeSeq) to judge how well the two datasets mix
sc.pl.umap(merged_epi, color='assay')

In [None]:
# UMAP coloured by 'cluster', the annotation column that comes from the citeSeq reference
sc.pl.umap(merged_epi, color='cluster')

In [None]:
# UMAP coloured by the 'Lvl3' annotation
sc.pl.umap(merged_epi, color='Lvl3')

In [None]:
# Save the integrated object (Harmony embedding, neighbour graph and UMAP from the cells above)
merged_epi.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_epithelial_harmony.h5ad")

In [None]:
# Reload the integrated object from the file written above
merged_epi = sc.read_h5ad('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_epithelial_harmony.h5ad')

In [None]:
# Inspect the metadata of the reloaded integrated object
merged_epi.obs

In [None]:
# Sanity check: list the assay values present in the merged epithelial object.
# (The message previously said "merged_stromal" although merged_epi is what is printed.)
print("Unique assays in merged_epi:", merged_epi.obs["assay"].unique())

In [None]:
# Label transfer citeSeq -> Xenium: every Xenium cell receives the 'cluster' label of its
# single nearest citeSeq cell in the Harmony-corrected PCA space.
is_citeseq_cell = (merged_epi.obs["assay"] == "citeSeq").to_numpy()
is_xenium_cell = (merged_epi.obs["assay"] == "Xenium").to_numpy()

# Reference (training) cells: the citeSeq half of the merged object
train = merged_epi[is_citeseq_cell]
nn = KNeighborsClassifier(n_neighbors=1, n_jobs=16, weights='distance', metric='euclidean')
nn.fit(train.obsm["X_pca_harmony"], train.obs['cluster'])

# Predict a label for each Xenium cell from its embedding row
labels = nn.predict(merged_epi.obsm["X_pca_harmony"][is_xenium_cell])

# Make obs names unique first: the assignment below aligns the Series on the obs index
merged_epi.obs_names_make_unique()
xenium_index = merged_epi.obs.index[is_xenium_cell]
merged_epi.obs["citeSeq_to_Xenium_label"] = pd.Series(labels, index=xenium_index)

In [None]:
# UMAP coloured by the label transferred from citeSeq (set only for Xenium cells)
sc.pl.umap(merged_epi, color='citeSeq_to_Xenium_label')

In [None]:
# Save transferred annotations to file
# One row per Xenium cell: sample identity, original Xenium 'Lvl4' label, transferred citeSeq label.
citeSeq_to_xenium_predicted_epi_labels = merged_epi.obs.loc[merged_epi.obs['assay'] == 'Xenium', ['orig.ident','Lvl4','citeSeq_to_Xenium_label']]
# Strip a '-Xenium' suffix from the cell ids.
# NOTE(review): the merge above used index_unique=None, so presumably no such suffix
# exists and this line is a no-op — confirm.
citeSeq_to_xenium_predicted_epi_labels.index = citeSeq_to_xenium_predicted_epi_labels.index.str.replace('-Xenium','')
citeSeq_to_xenium_predicted_epi_labels.to_csv('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/citeSeq_to_Xenium_predicted_epithelial_celltype.csv')

In [None]:
# Label transfer Xenium -> citeSeq: every citeSeq cell receives the 'Lvl4' label of its
# single nearest Xenium cell in the Harmony-corrected PCA space.
is_xenium_cell = (merged_epi.obs["assay"] == "Xenium").to_numpy()
is_citeseq_cell = (merged_epi.obs["assay"] == "citeSeq").to_numpy()

# Reference (training) cells: the Xenium half of the merged object
train = merged_epi[is_xenium_cell]
nn = KNeighborsClassifier(n_neighbors=1, n_jobs=16, weights='distance', metric='euclidean')
nn.fit(train.obsm["X_pca_harmony"], train.obs['Lvl4'])

# Predict a label for each citeSeq cell and store it aligned on the obs index
labels = nn.predict(merged_epi.obsm["X_pca_harmony"][is_citeseq_cell])
citeseq_index = merged_epi.obs.index[is_citeseq_cell]
merged_epi.obs["Xenium_to_citeSeq_label"] = pd.Series(labels, index=citeseq_index)

In [None]:
# Save transferred annotations to file
# One row per citeSeq cell: sample identity, original citeSeq 'cluster', transferred Xenium label.
Xenium_to_citeSeq_predicted_epi_labels = merged_epi.obs.loc[merged_epi.obs['assay'] == 'citeSeq', ['orig.ident','cluster','Xenium_to_citeSeq_label']]
# Strip a '-citeSeq' suffix from the cell ids.
# NOTE(review): the merge above used index_unique=None, so presumably no such suffix
# exists and this line is a no-op — confirm.
Xenium_to_citeSeq_predicted_epi_labels.index = Xenium_to_citeSeq_predicted_epi_labels.index.str.replace('-citeSeq','')
Xenium_to_citeSeq_predicted_epi_labels.to_csv('/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_to_citeSeq_predicted_epithelial_celltype.csv')

In [None]:
# Build two "final" label columns that cover every cell of the merged object:
#   final_label_citeSeq : citeSeq vocabulary (original 'cluster' for citeSeq cells,
#                         transferred label for Xenium cells)
#   final_label_X       : Xenium vocabulary  (original 'Lvl4' for Xenium cells,
#                         transferred label for citeSeq cells)
# Each column is assembled as a fresh string Series and assigned in one step. That keeps
# both columns consistent (plain strings), avoids chained indexing, and makes the cell
# safe to re-run even if a previous version of the column is already categorical.
epi_obs = merged_epi.obs
is_xenium_cell = (epi_obs['assay'] == 'Xenium').to_numpy()
is_citeseq_cell = (epi_obs['assay'] == 'citeSeq').to_numpy()

# Combine original labels from scSeq and transferred labels from xenium into one column
final_label_citeSeq = pd.Series(index=epi_obs.index, dtype=object)  # starts all-NaN
final_label_citeSeq[is_xenium_cell] = epi_obs.loc[is_xenium_cell, 'citeSeq_to_Xenium_label'].astype(str).to_numpy()
final_label_citeSeq[is_citeseq_cell] = epi_obs.loc[is_citeseq_cell, 'cluster'].astype(str).to_numpy()
merged_epi.obs['final_label_citeSeq'] = final_label_citeSeq

# Combine original labels from xenium and transferred labels from scSeq into one column
final_label_X = pd.Series(index=epi_obs.index, dtype=object)  # starts all-NaN
final_label_X[is_citeseq_cell] = epi_obs.loc[is_citeseq_cell, 'Xenium_to_citeSeq_label'].astype(str).to_numpy()
final_label_X[is_xenium_cell] = epi_obs.loc[is_xenium_cell, 'Lvl4'].astype(str).to_numpy()
merged_epi.obs['final_label_X'] = final_label_X


# Visualized merged labels on UMAP
sc.pl.umap(merged_epi, color='final_label_citeSeq')
sc.pl.umap(merged_epi, color='assay')
sc.pl.umap(merged_epi, color='final_label_X')

In [None]:
# Overwrite the integrated h5ad written earlier, now including the transferred and final label columns
merged_epi.write("/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_epithelial_harmony.h5ad")

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


# ---------------------------------------------------------------------------
# Spatial panels for one sample: one subplot per selected cluster, drawing that
# cluster's cells in red on top of all remaining cells of the sample in grey.
# The finished figure is written to a single-page PDF.
# ---------------------------------------------------------------------------
sample_id = 's2r2_HV184'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
# NOTE(review): the other panel cells in this notebook use this same file name,
# so each run overwrites the figure written by the previous one.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
os.makedirs(save_directory, exist_ok=True)

# Filter the data for the specific sample
adata_sample = merged_epi[merged_epi.obs['sample'] == sample_id]

# Plain-string copy of the labels: missing values become 'nan', so the prefix
# matching and the per-cluster comparisons below never have to deal with NaN.
cluster_labels = adata_sample.obs['final_label_citeSeq'].astype(str)

# Only clusters whose label starts with "Ep" or "C" get a panel
unique_clusters = cluster_labels[
    cluster_labels.str.startswith("Ep") | cluster_labels.str.startswith("C")
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    # plt.subplots() would otherwise fail with an obscure "columns must be positive" error
    raise ValueError(f"No matching clusters found for sample {sample_id!r}; nothing to plot")

# Define the number of rows and columns
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))  # Calculate columns based on clusters and rows

# Create a PdfPages object to save the plots
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D axes array, so ravel() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.ravel()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Apply rotation (standard 2-D rotation about the origin)
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Determine aspect ratio
    # NOTE(review): passing x_range / y_range to set_aspect stretches the data so each
    # panel's data area is square; use 'equal' if true tissue proportions are wanted.
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # Loop over each unique cluster in the ordered list
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]
        in_cluster = cluster_labels == cluster  # computed once, reused for both layers

        # White background, no outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey layer: every cell that is NOT in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red layer: cells of the current cluster, drawn last so they stay visible
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Add the cluster name near the bottom edge of the panel (axes coordinates)
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids, ticks and tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # Define the directory and file name for the TIFF file
    tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    fig.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05   # Height space between rows
    )

    # Title names the sample actually plotted; black so it is visible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the PdfPages file stays empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02, above the canvas) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of how each cell cluster (row) is distributed across niches (column).
# Take an explicit copy of the metadata: pd.DataFrame(obs) does not copy the underlying
# data, so (depending on the pandas version) the column assignment below could
# otherwise leak back into merged_epi.obs.
df = merged_epi.obs.copy()
df['Lvl5'] = df['Lvl5'].astype('category')

# Aggregate counts: rows = cluster label, columns = niche.
# observed=False spells out the behaviour the bare groupby had (all categories listed).
niche_groups = (
    df.groupby(['final_label_citeSeq', 'niche_cc14'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Identify clusters to remove: any label containing 'Mix' or starting with 'Neuro'.
# Matching is done on a string copy so missing labels ('nan') simply never match.
labels_as_str = df['final_label_citeSeq'].astype(str)
unwanted_rows = labels_as_str.str.contains('Mix', regex=False) | labels_as_str.str.startswith('Neuro')

# Get the unique clusters to be removed
clusters_to_remove = df.loc[unwanted_rows, 'final_label_citeSeq'].unique()

# Filter out these clusters from the plot
niche_groups_filtered = niche_groups[~niche_groups.index.isin(clusters_to_remove)]

# Drop clusters that have no cell in any niche: their row total is 0 and the
# normalisation below would turn them into all-NaN (blank) heatmap rows.
niche_groups_filtered = niche_groups_filtered[niche_groups_filtered.sum(axis=1) > 0]

# Calculate relative frequencies (each row sums to 1)
niche_groups_relative = niche_groups_filtered.div(niche_groups_filtered.sum(axis=1), axis=0)

# Plot heatmap with enhanced colormap
plt.figure(figsize=(16, 20))  # Adjusted figsize to accommodate the x-axis labels
sns.heatmap(niche_groups_relative, cmap='plasma', cbar_kws={'label': 'Relative Frequency'}, linewidths=0.05, linecolor='black')
plt.title('Relative Cell Type Distribution in Niches')
plt.xlabel('Niche')
plt.ylabel('Cell Cluster')
plt.xticks(rotation=45, ha='right')  # Adjust rotation and alignment of xticks
plt.yticks(rotation=0)
plt.tight_layout()  # Ensures all elements fit within the figure area
plt.show()


In [None]:
# Visualize xenium annotations vs transferred scRNAseq annotations
# .copy() turns the AnnData view into a real object, so the write to .obs below is
# explicit instead of relying on anndata's implicit copy-on-modify (which warns).
xen_obj = merged_epi[merged_epi.obs['assay']=='Xenium'].copy()
xen_obj.obs['Lvl4'] = xen_obj.obs['Lvl4'].astype('str')

# Co-occurrence counts: rows = Xenium 'Lvl4' type, columns = label predicted from citeSeq.
# Missing combinations are filled with 0 straight away rather than left as NaN.
celltype_counts = (
    xen_obj.obs.groupby(['Lvl4', 'citeSeq_to_Xenium_label'], observed=False)
    .size()
    .unstack(fill_value=0)
)
celltype_counts.index.name = 'Xenium cell type'
celltype_counts.columns.name = 'predicted citeSeq cell type'

# After the transpose: rows = predicted citeSeq type, columns = Xenium type
celltype_counts = celltype_counts.T

# Row scale co-occurrence frequencies (by predicted scRNAseq cell type)
celltype_counts = celltype_counts.div(celltype_counts.sum(axis=1), axis=0)

# Rows with a zero total become NaN (0/0); zero them BEFORE ordering so idxmax never sees NaN
celltype_counts = celltype_counts.fillna(0)

# Order the Xenium columns by the predicted type in which each one peaks
celltype_counts = celltype_counts.loc[:, celltype_counts.idxmax(axis=0).sort_values().index]

In [None]:
# Total absolute weight of every column of the co-occurrence table
column_sums = celltype_counts.abs().sum(axis=0)
print(column_sums)

# Keep only the columns whose total reaches the 0.1 threshold
columns_to_keep = column_sums.loc[column_sums.ge(0.1)].index
print(columns_to_keep)

# Thresholded table used by the heatmap cell that follows
celltype_counts2 = celltype_counts.loc[:, columns_to_keep]
print(celltype_counts2)

In [None]:
# Heatmap of the thresholded co-occurrence table (celltype_counts2) on a wide 26x14 canvas
plt.figure(figsize = (26,14))
sns.heatmap(celltype_counts2, cmap='YlGnBu')

In [None]:
# List the obs column names of the epithelial object (to compare with the other lineages before concatenating)
merged_epi.obs.columns.unique().tolist()

In [None]:
# List the obs column names of the stromal object
merged_stromal.obs.columns.unique().tolist()

In [None]:
# List the obs column names of the immune object
merged_immune.obs.columns.unique().tolist()

In [None]:
# Show which top-level ('Lvl1') categories exist in the full Xenium object
adata.obs['Lvl1'].unique()

In [None]:
# Keep the Xenium cells annotated 'Other' at Lvl1 as an independent object.
# NOTE(review): unlike epi_adata, no 'assay' column is set on this object and it has no
# transferred-label columns; after the outer concat below these cells will presumably
# carry missing values there and drop out of any `assay == 'Xenium'` filter — confirm
# that this is intended.
other_adata = adata[adata.obs['Lvl1'].isin(['Other'])].copy()

In [None]:
import anndata as ad

# Concatenate AnnData objects
# Stacks the three integrated lineage objects plus the 'Other' Xenium cells along the
# cell axis; join='outer' is the union join, so nothing is dropped for being absent
# from some of the inputs.
merged_total = ad.concat([merged_immune, merged_stromal, merged_epi, other_adata], join='outer')

# Save the combined AnnData object (optional)
#adata_combined.write("combined_adata.h5ad")


In [None]:
# List the obs column names of the combined object
merged_total.obs.columns.unique().tolist()

In [None]:
# Inspect the metadata of the combined object
merged_total.obs

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import os


# ---------------------------------------------------------------------------
# Spatial panels for one sample of the combined object: one subplot per selected
# cluster, drawing that cluster's cells in red on top of all remaining cells of
# the sample in grey. The finished figure is written to a single-page PDF.
# ---------------------------------------------------------------------------
sample_id = 's2r2_HV184'
rotation_angle = 30  # Define rotation for this sample (degrees)

# Specify the directory where you want to save the PDF
save_directory = '/data/vasileiosionat2/Xenium/Figures/lvl4_pdf/'
# NOTE(review): the other panel cells in this notebook use this same file name,
# so each run overwrites the figure written by the previous one.
pdf_filename = os.path.join(save_directory, 'all_clusters_4rows.pdf')
os.makedirs(save_directory, exist_ok=True)

# Filter the data for the specific sample
adata_sample = merged_total[merged_total.obs['sample'] == sample_id]

# Plain-string copy of the labels: cells without a label (e.g. those that entered the
# outer concat without this column) become 'nan', so the prefix matching and the
# per-cluster comparisons below never have to deal with NaN.
cluster_labels = adata_sample.obs['final_label_citeSeq'].astype(str)

# Only clusters whose label starts with "Ep" or "C" get a panel
# NOTE(review): the figure is titled "All Clusters" but only these prefixes are plotted — confirm.
unique_clusters = cluster_labels[
    cluster_labels.str.startswith("Ep") | cluster_labels.str.startswith("C")
].unique()

# Reorder clusters (if a custom order is provided, replace `sorted(unique_clusters)`)
ordered_clusters = sorted(unique_clusters)
if not ordered_clusters:
    # plt.subplots() would otherwise fail with an obscure "columns must be positive" error
    raise ValueError(f"No matching clusters found for sample {sample_id!r}; nothing to plot")

# Define the number of rows and columns
num_rows = 6
num_cols = int(np.ceil(len(ordered_clusters) / num_rows))  # Calculate columns based on clusters and rows

# Create a PdfPages object to save the plots
with PdfPages(pdf_filename) as pdf:
    # squeeze=False always returns a 2-D axes array, so ravel() works for any grid shape
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(26, 36), squeeze=False)
    fig.patch.set_facecolor('white')
    axes = axes.ravel()

    # Get the coordinates for the sample
    x_coords = adata_sample.obs['x_centroid']
    y_coords = adata_sample.obs['y_centroid']

    # Apply rotation (standard 2-D rotation about the origin)
    angle = np.deg2rad(rotation_angle)
    new_x_coords = x_coords * np.cos(angle) - y_coords * np.sin(angle)
    new_y_coords = x_coords * np.sin(angle) + y_coords * np.cos(angle)

    # Determine aspect ratio
    # NOTE(review): passing x_range / y_range to set_aspect stretches the data so each
    # panel's data area is square; use 'equal' if true tissue proportions are wanted.
    x_range = new_x_coords.max() - new_x_coords.min()
    y_range = new_y_coords.max() - new_y_coords.min()
    aspect_ratio = x_range / y_range

    # Loop over each unique cluster in the ordered list
    for idx, cluster in enumerate(ordered_clusters):
        ax = axes[idx]
        in_cluster = cluster_labels == cluster  # computed once, reused for both layers

        # White background, no outline
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Grey layer: every cell that is NOT in the current cluster
        ax.scatter(
            x=new_x_coords[~in_cluster],
            y=new_y_coords[~in_cluster],
            c='#C0C0C0',
            s=3  # Adjust dot size
        )

        # Red layer: cells of the current cluster, drawn last so they stay visible
        ax.scatter(
            x=new_x_coords[in_cluster],
            y=new_y_coords[in_cluster],
            c='red',
            s=9  # Adjust dot size
        )

        # Set aspect ratio for each subplot
        ax.set_aspect(aspect_ratio)

        # Add the cluster name near the bottom edge of the panel (axes coordinates)
        ax.text(
            0.5, 0.02, f'{cluster}',
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            color='black', fontsize=20, weight='bold'
        )

        # Remove grids, ticks and tick labels
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Turn off empty subplots if there are any
    for ax in axes[len(ordered_clusters):]:
        ax.set_visible(False)

    # Define the directory and file name for the TIFF file
    tiff_filename = os.path.join(save_directory, 'all_clusters_4rows.tiff')

    # Adjust the spacing between subplots
    fig.subplots_adjust(
        left=0.05,    # Space from left edge
        right=0.95,   # Space from right edge
        top=0.95,     # Space from top edge
        bottom=0.05,  # Space from bottom edge
        wspace=0.1,   # Width space between columns
        hspace=0.05   # Height space between rows
    )

    # Title names the sample actually plotted; black so it is visible on the white figure
    fig.suptitle(f'All Clusters in {sample_id}', color='black', fontsize=20, weight='bold', y=1.02)

    # Write the page into the PDF; without this call the PdfPages file stays empty.
    # bbox_inches='tight' keeps the title (placed at y=1.02, above the canvas) on the page.
    pdf.savefig(fig, bbox_inches='tight')

    # Save the current figure as a TIFF file
    #fig.savefig(tiff_filename, dpi=300, format='tiff')

    plt.show()
    plt.close(fig)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of how the cells of each cluster ('final_label_citeSeq') are spread
# over the niches ('niche_cc14'); every row is scaled to sum to 1.

# Take an explicit copy of the per-cell metadata. pd.DataFrame(obs) can share
# its storage with merged_total.obs (this depends on the pandas version), in
# which case the column assignment below would silently rewrite the AnnData.
df = merged_total.obs.copy()
# Not read anywhere in this cell; kept so `df` looks the same to later cells.
# TODO confirm whether this cast is still needed.
df['Lvl5'] = df['Lvl5'].astype('category')

# Aggregate counts: rows = cluster label, columns = niche.
# observed=False spells out the long-standing pandas default (keep every
# declared category) so the result does not change and categorical keys do not
# raise a FutureWarning.
niche_groups = (
    df.groupby(['final_label_citeSeq', 'niche_cc14'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Identify clusters to remove: labels containing 'Mix' or starting with 'Neuro'.
# The test runs on the aggregated row labels instead of the per-cell column:
# groupby drops missing keys, so the mask can never contain NaN (a NaN in a
# boolean mask makes .loc raise).
cluster_labels = niche_groups.index.astype(str)
is_excluded = (
    cluster_labels.str.contains('Mix', regex=False)
    | cluster_labels.str.startswith('Neuro')
)

# Labels of the clusters that are dropped from the plot
clusters_to_remove = niche_groups.index[is_excluded]

# Filter out these clusters from the plot
niche_groups_filtered = niche_groups.loc[~is_excluded]

# Calculate relative frequencies (each row divided by its own total).
# Rows with a total of 0 become NaN and are left blank by the heatmap.
niche_groups_relative = niche_groups_filtered.div(niche_groups_filtered.sum(axis=1), axis=0)

# Plot heatmap; the tall figure leaves room for the rotated x-axis labels
fig, ax = plt.subplots(figsize=(16, 20))
sns.heatmap(
    niche_groups_relative,
    cmap='plasma',
    cbar_kws={'label': 'Relative Frequency'},
    linewidths=0.05,
    linecolor='black',
    ax=ax,
)
ax.set_title('Relative Cell Type Distribution in Niches')
ax.set_xlabel('Niche')
ax.set_ylabel('Cell Cluster')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # Adjust rotation and alignment of xticks
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()  # Ensures all elements fit within the figure area
plt.show()


In [None]:
# Visualize xenium annotations vs transferred scRNAseq annotations

# Subsetting an AnnData with a boolean mask returns a view. Assigning to .obs
# of a view makes anndata copy the whole object behind the scenes and emit an
# ImplicitModificationWarning, so take the copy explicitly up front.
xen_obj = merged_total[merged_total.obs['assay'] == 'Xenium'].copy()
xen_obj.obs['Lvl4'] = xen_obj.obs['Lvl4'].astype('str')

# Co-occurrence counts. size().unstack() already yields flat column labels, so
# no DataFrame wrapping / droplevel round-trip is needed. After the transpose:
# rows = predicted citeSeq label, columns = Xenium Lvl4 label.
# observed=False spells out the long-standing pandas default for categorical keys.
celltype_counts = (
    xen_obj.obs.groupby(['Lvl4', 'citeSeq_to_Xenium_label'], observed=False)
    .size()
    .unstack()
    .rename_axis(index='Xenium cell type', columns='predicted citeSeq cell type')
    .T
)

# Row scale co-occurrence frequencies (by predicted scRNAseq cell type)
celltype_counts = celltype_counts.div(celltype_counts.sum(axis=1), axis=0)

# Order the columns by the label of the row in which each column peaks.
# A stable sort keeps the existing column order among ties, so the layout is
# reproducible from run to run.
column_order = celltype_counts.idxmax(axis=0).sort_values(kind='stable').index
celltype_counts = celltype_counts.loc[:, column_order]

# Combinations that never occur (and rows with a zero total) become 0
celltype_counts = celltype_counts.fillna(0)

In [None]:
# Drop the columns of celltype_counts whose summed absolute values fall below
# the cut-off, printing each intermediate result for inspection.
min_column_total = 0.1

column_sums = celltype_counts.abs().sum(axis=0)
print(column_sums)

# Labels of the columns that reach the cut-off
passes_cutoff = (column_sums >= min_column_total).to_numpy()
columns_to_keep = column_sums.index[passes_cutoff]
print(columns_to_keep)

celltype_counts2 = celltype_counts.loc[:, columns_to_keep]
print(celltype_counts2)

In [None]:
# Heatmap of the thresholded co-occurrence table, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(26, 14))
sns.heatmap(celltype_counts2, cmap='YlGnBu', ax=ax)

In [None]:
# Save merged_total to disk as an .h5ad file at a fixed location
harmony_h5ad_path = "/data/vasileiosionat2/Xenium/Integration_2025_only_stromal/Modified_citeseq_files/Xenium_citeSeq_total_harmony.h5ad"
merged_total.write_h5ad(harmony_h5ad_path)